From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 20:45:59 +0200
Subject: Adding upstream version 16.2.11+ds.

Signed-off-by: Daniel Baumann
---
 src/osd/CMakeLists.txt | 75 +
 src/osd/ClassHandler.cc | 350 +
 src/osd/ClassHandler.h | 126 +
 src/osd/DynamicPerfStats.h | 267 +
 src/osd/ECBackend.cc | 2637 ++++++
 src/osd/ECBackend.h | 686 ++
 src/osd/ECMsgTypes.cc | 393 +
 src/osd/ECMsgTypes.h | 140 +
 src/osd/ECTransaction.cc | 670 ++
 src/osd/ECTransaction.h | 200 +
 src/osd/ECUtil.cc | 248 +
 src/osd/ECUtil.h | 169 +
 src/osd/ExtentCache.cc | 245 +
 src/osd/ExtentCache.h | 489 ++
 src/osd/HitSet.cc | 256 +
 src/osd/HitSet.h | 455 +
 src/osd/MissingLoc.cc | 226 +
 src/osd/MissingLoc.h | 353 +
 src/osd/OSD.cc | 11378 ++++++++++++++++++++++++
 src/osd/OSD.h | 2152 +++++
 src/osd/OSDCap.cc | 532 ++
 src/osd/OSDCap.h | 261 +
 src/osd/OSDMap.cc | 6412 ++++++++++++++
 src/osd/OSDMap.h | 1600 ++++
 src/osd/OSDMapMapping.cc | 207 +
 src/osd/OSDMapMapping.h | 352 +
 src/osd/ObjectVersioner.h | 35 +
 src/osd/OpRequest.cc | 170 +
 src/osd/OpRequest.h | 200 +
 src/osd/PG.cc | 2753 ++++++
 src/osd/PG.h | 1341 +++
 src/osd/PGBackend.cc | 1324 +++
 src/osd/PGBackend.h | 641 ++
 src/osd/PGLog.cc | 1189 +++
 src/osd/PGLog.h | 1697 ++++
 src/osd/PGPeeringEvent.cc | 17 +
 src/osd/PGPeeringEvent.h | 220 +
 src/osd/PGStateUtils.cc | 57 +
 src/osd/PGStateUtils.h | 85 +
 src/osd/PGTransaction.h | 601 ++
 src/osd/PeeringState.cc | 7607 ++++++++++++++++
 src/osd/PeeringState.h | 2442 ++++++
 src/osd/PrimaryLogPG.cc | 15470 +++++++++++++++++++++++++++++++++
 src/osd/PrimaryLogPG.h | 1969 +++++
 src/osd/PrimaryLogScrub.cc | 589 ++
 src/osd/PrimaryLogScrub.h | 71 +
 src/osd/ReplicatedBackend.cc | 2425 ++++++
 src/osd/ReplicatedBackend.h | 437 +
 src/osd/ScrubStore.cc | 198 +
 src/osd/ScrubStore.h | 52 +
 src/osd/Session.cc | 106 +
 src/osd/Session.h | 240 +
 src/osd/SnapMapper.cc | 752 ++
 src/osd/SnapMapper.h | 338 +
 src/osd/TierAgentState.h | 128 +
 src/osd/Watch.cc | 550 ++
 src/osd/Watch.h | 291 +
 src/osd/error_code.cc | 105 +
 src/osd/error_code.h | 53 +
 src/osd/objclass.cc | 702 +
 src/osd/object_state.h | 190 +
 src/osd/osd_internal_types.h | 320 +
 src/osd/osd_op_util.cc | 263 +
 src/osd/osd_op_util.h | 83 +
 src/osd/osd_perf_counters.cc | 321 +
 src/osd/osd_perf_counters.h | 163 +
 src/osd/osd_types.cc | 7212 +++++++++++++++
 src/osd/osd_types.h | 6568 ++++++++++++++
 src/osd/pg_scrubber.cc | 2384 +++++
 src/osd/pg_scrubber.h | 821 ++
 src/osd/recovery_types.cc | 16 +
 src/osd/recovery_types.h | 95 +
 src/osd/scheduler/OpScheduler.cc | 56 +
 src/osd/scheduler/OpScheduler.h | 147 +
 src/osd/scheduler/OpSchedulerItem.cc | 259 +
 src/osd/scheduler/OpSchedulerItem.h | 629 ++
 src/osd/scheduler/mClockScheduler.cc | 514 ++
 src/osd/scheduler/mClockScheduler.h | 204 +
 src/osd/scrub_machine.cc | 534 ++
 src/osd/scrub_machine.h | 344 +
 src/osd/scrub_machine_lstnr.h | 164 +
 src/osd/scrubber_common.h | 299 +
 82 files changed, 97320 insertions(+)
 create mode 100644 src/osd/CMakeLists.txt
 create mode 100644 src/osd/ClassHandler.cc
 create mode 100644 src/osd/ClassHandler.h
 create mode 100644 src/osd/DynamicPerfStats.h
 create mode 100644 src/osd/ECBackend.cc
 create mode 100644 src/osd/ECBackend.h
 create mode 100644 src/osd/ECMsgTypes.cc
 create mode 100644 src/osd/ECMsgTypes.h
 create mode 100644 src/osd/ECTransaction.cc
 create mode 100644 src/osd/ECTransaction.h
 create mode 100644 src/osd/ECUtil.cc
 create mode 100644 src/osd/ECUtil.h
 create mode 100644 src/osd/ExtentCache.cc
 create mode 100644 src/osd/ExtentCache.h
 create mode 100644
src/osd/HitSet.cc create mode 100644 src/osd/HitSet.h create mode 100644 src/osd/MissingLoc.cc create mode 100644 src/osd/MissingLoc.h create mode 100644 src/osd/OSD.cc create mode 100644 src/osd/OSD.h create mode 100644 src/osd/OSDCap.cc create mode 100644 src/osd/OSDCap.h create mode 100644 src/osd/OSDMap.cc create mode 100644 src/osd/OSDMap.h create mode 100644 src/osd/OSDMapMapping.cc create mode 100644 src/osd/OSDMapMapping.h create mode 100644 src/osd/ObjectVersioner.h create mode 100644 src/osd/OpRequest.cc create mode 100644 src/osd/OpRequest.h create mode 100644 src/osd/PG.cc create mode 100644 src/osd/PG.h create mode 100644 src/osd/PGBackend.cc create mode 100644 src/osd/PGBackend.h create mode 100644 src/osd/PGLog.cc create mode 100644 src/osd/PGLog.h create mode 100644 src/osd/PGPeeringEvent.cc create mode 100644 src/osd/PGPeeringEvent.h create mode 100644 src/osd/PGStateUtils.cc create mode 100644 src/osd/PGStateUtils.h create mode 100644 src/osd/PGTransaction.h create mode 100644 src/osd/PeeringState.cc create mode 100644 src/osd/PeeringState.h create mode 100644 src/osd/PrimaryLogPG.cc create mode 100644 src/osd/PrimaryLogPG.h create mode 100644 src/osd/PrimaryLogScrub.cc create mode 100644 src/osd/PrimaryLogScrub.h create mode 100644 src/osd/ReplicatedBackend.cc create mode 100644 src/osd/ReplicatedBackend.h create mode 100644 src/osd/ScrubStore.cc create mode 100644 src/osd/ScrubStore.h create mode 100644 src/osd/Session.cc create mode 100644 src/osd/Session.h create mode 100644 src/osd/SnapMapper.cc create mode 100644 src/osd/SnapMapper.h create mode 100644 src/osd/TierAgentState.h create mode 100644 src/osd/Watch.cc create mode 100644 src/osd/Watch.h create mode 100644 src/osd/error_code.cc create mode 100644 src/osd/error_code.h create mode 100644 src/osd/objclass.cc create mode 100644 src/osd/object_state.h create mode 100644 src/osd/osd_internal_types.h create mode 100644 src/osd/osd_op_util.cc create mode 100644 src/osd/osd_op_util.h create mode 100644 src/osd/osd_perf_counters.cc create mode 100644 src/osd/osd_perf_counters.h create mode 100644 src/osd/osd_types.cc create mode 100644 src/osd/osd_types.h create mode 100644 src/osd/pg_scrubber.cc create mode 100644 src/osd/pg_scrubber.h create mode 100644 src/osd/recovery_types.cc create mode 100644 src/osd/recovery_types.h create mode 100644 src/osd/scheduler/OpScheduler.cc create mode 100644 src/osd/scheduler/OpScheduler.h create mode 100644 src/osd/scheduler/OpSchedulerItem.cc create mode 100644 src/osd/scheduler/OpSchedulerItem.h create mode 100644 src/osd/scheduler/mClockScheduler.cc create mode 100644 src/osd/scheduler/mClockScheduler.h create mode 100644 src/osd/scrub_machine.cc create mode 100644 src/osd/scrub_machine.h create mode 100644 src/osd/scrub_machine_lstnr.h create mode 100644 src/osd/scrubber_common.h (limited to 'src/osd') diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt new file mode 100644 index 000000000..373456fc6 --- /dev/null +++ b/src/osd/CMakeLists.txt @@ -0,0 +1,75 @@ +set(osdc_osd_srcs + ${CMAKE_SOURCE_DIR}/src/osdc/Objecter.cc + ${CMAKE_SOURCE_DIR}/src/osdc/Striper.cc) + +if(WITH_OSD_INSTRUMENT_FUNCTIONS AND CMAKE_CXX_COMPILER_ID STREQUAL GNU) + add_compile_options( + -finstrument-functions + -finstrument-functions-exclude-function-list=_mm_loadu_si128,_mm_cmpeq_epi32,_mm_movemask_epi8) + set(osd_cyg_functions_src ${CMAKE_SOURCE_DIR}/src/tracing/cyg_profile_functions.c) +endif() + +set(osd_srcs + OSD.cc + pg_scrubber.cc + scrub_machine.cc + PrimaryLogScrub.cc + Watch.cc + 
ClassHandler.cc + PG.cc + PGLog.cc + PrimaryLogPG.cc + ReplicatedBackend.cc + ECBackend.cc + ECTransaction.cc + PGBackend.cc + OSDCap.cc + Watch.cc + Session.cc + SnapMapper.cc + ScrubStore.cc + osd_types.cc + ECUtil.cc + ExtentCache.cc + scheduler/OpScheduler.cc + scheduler/OpSchedulerItem.cc + scheduler/mClockScheduler.cc + PeeringState.cc + PGStateUtils.cc + recovery_types.cc + MissingLoc.cc + osd_perf_counters.cc + ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc + ${CMAKE_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc + ${osd_cyg_functions_src} + ${osdc_osd_srcs}) +if(HAS_VTA) + set_source_files_properties(osdcap.cc + PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) +endif() +add_library(osd STATIC ${osd_srcs}) +target_link_libraries(osd + PUBLIC dmclock::dmclock Boost::MPL + PRIVATE os heap_profiler cpu_profiler fmt::fmt ${CMAKE_DL_LIBS}) +if(WITH_LTTNG) + add_dependencies(osd osd-tp pg-tp) +endif() +if(WITH_EVENTTRACE) + add_dependencies(osd eventtrace_tp) +endif() +if(WITH_OSD_INSTRUMENT_FUNCTIONS) + add_dependencies(osd cyg_profile_tp) +endif() + +# libcls_* are runtime dependencies +add_dependencies(osd cls_journal cls_hello cls_lock cls_log cls_numops + cls_refcount cls_timeindex cls_user cls_version cls_cas cls_cmpomap) +if(WITH_CEPHFS) + add_dependencies(osd cls_cephfs) +endif() +if(WITH_RBD) + add_dependencies(osd cls_rbd) +endif() +if(WITH_RADOSGW) + add_dependencies(osd cls_otp cls_rgw cls_queue cls_rgw_gc cls_2pc_queue cls_fifo) +endif() diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc new file mode 100644 index 000000000..d1e726408 --- /dev/null +++ b/src/osd/ClassHandler.cc @@ -0,0 +1,350 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "ClassHandler.h" +#include "common/errno.h" +#include "common/ceph_context.h" +#include "include/dlfcn_compat.h" + +#include + +#if defined(__FreeBSD__) +#include +#endif + +#include "common/config.h" +#include "common/debug.h" + +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +#define CLS_PREFIX "libcls_" +#define CLS_SUFFIX SHARED_LIB_SUFFIX + +using std::map; +using std::set; +using std::string; + +using ceph::bufferlist; + + +int ClassHandler::open_class(const string& cname, ClassData **pcls) +{ + std::lock_guard lock(mutex); + ClassData *cls = _get_class(cname, true); + if (!cls) + return -EPERM; + if (cls->status != ClassData::CLASS_OPEN) { + int r = _load_class(cls); + if (r) + return r; + } + *pcls = cls; + return 0; +} + +int ClassHandler::open_all_classes() +{ + ldout(cct, 10) << __func__ << dendl; + DIR *dir = ::opendir(cct->_conf->osd_class_dir.c_str()); + if (!dir) + return -errno; + + struct dirent *pde = nullptr; + int r = 0; + while ((pde = ::readdir(dir))) { + if (pde->d_name[0] == '.') + continue; + if (strlen(pde->d_name) > sizeof(CLS_PREFIX) - 1 + sizeof(CLS_SUFFIX) - 1 && + strncmp(pde->d_name, CLS_PREFIX, sizeof(CLS_PREFIX) - 1) == 0 && + strcmp(pde->d_name + strlen(pde->d_name) - (sizeof(CLS_SUFFIX) - 1), CLS_SUFFIX) == 0) { + char cname[PATH_MAX + 1]; + strncpy(cname, pde->d_name + sizeof(CLS_PREFIX) - 1, sizeof(cname) -1); + cname[strlen(cname) - (sizeof(CLS_SUFFIX) - 1)] = '\0'; + ldout(cct, 10) << __func__ << " found " << cname << dendl; + ClassData *cls; + // skip classes that aren't in 'osd class load list' + r = open_class(cname, &cls); + if (r < 0 && r != -EPERM) + goto out; + } + } + out: + closedir(dir); + return r; +} + +void ClassHandler::shutdown() +{ + for 
(auto& cls : classes) { + if (cls.second.handle) { + dlclose(cls.second.handle); + } + } + classes.clear(); +} + +/* + * Check if @cname is in the whitespace delimited list @list, or the @list + * contains the wildcard "*". + * + * This is expensive but doesn't consume memory for an index, and is performed + * only once when a class is loaded. + */ +bool ClassHandler::in_class_list(const std::string& cname, + const std::string& list) +{ + std::istringstream ss(list); + std::istream_iterator begin{ss}; + std::istream_iterator end{}; + + const std::vector targets{cname, "*"}; + + auto it = std::find_first_of(begin, end, + targets.begin(), targets.end()); + + return it != end; +} + +ClassHandler::ClassData *ClassHandler::_get_class(const string& cname, + bool check_allowed) +{ + ClassData *cls; + map::iterator iter = classes.find(cname); + + if (iter != classes.end()) { + cls = &iter->second; + } else { + if (check_allowed && !in_class_list(cname, cct->_conf->osd_class_load_list)) { + ldout(cct, 0) << "_get_class not permitted to load " << cname << dendl; + return NULL; + } + cls = &classes[cname]; + ldout(cct, 10) << "_get_class adding new class name " << cname << " " << cls << dendl; + cls->name = cname; + cls->handler = this; + cls->allowed = in_class_list(cname, cct->_conf->osd_class_default_list); + } + return cls; +} + +int ClassHandler::_load_class(ClassData *cls) +{ + // already open + if (cls->status == ClassData::CLASS_OPEN) + return 0; + + if (cls->status == ClassData::CLASS_UNKNOWN || + cls->status == ClassData::CLASS_MISSING) { + char fname[PATH_MAX]; + snprintf(fname, sizeof(fname), "%s/" CLS_PREFIX "%s" CLS_SUFFIX, + cct->_conf->osd_class_dir.c_str(), + cls->name.c_str()); + ldout(cct, 10) << "_load_class " << cls->name << " from " << fname << dendl; + + cls->handle = dlopen(fname, RTLD_NOW); + if (!cls->handle) { + struct stat st; + int r = ::stat(fname, &st); + if (r < 0) { + r = -errno; + ldout(cct, 0) << __func__ << " could not stat class " << fname + << ": " << cpp_strerror(r) << dendl; + } else { + ldout(cct, 0) << "_load_class could not open class " << fname + << " (dlopen failed): " << dlerror() << dendl; + r = -EIO; + } + cls->status = ClassData::CLASS_MISSING; + return r; + } + + cls_deps_t *(*cls_deps)(); + cls_deps = (cls_deps_t *(*)())dlsym(cls->handle, "class_deps"); + if (cls_deps) { + cls_deps_t *deps = cls_deps(); + while (deps) { + if (!deps->name) + break; + ClassData *cls_dep = _get_class(deps->name, false); + cls->dependencies.insert(cls_dep); + if (cls_dep->status != ClassData::CLASS_OPEN) + cls->missing_dependencies.insert(cls_dep); + deps++; + } + } + } + + // resolve dependencies + set::iterator p = cls->missing_dependencies.begin(); + while (p != cls->missing_dependencies.end()) { + ClassData *dc = *p; + int r = _load_class(dc); + if (r < 0) { + cls->status = ClassData::CLASS_MISSING_DEPS; + return r; + } + + ldout(cct, 10) << "_load_class " << cls->name << " satisfied dependency " << dc->name << dendl; + cls->missing_dependencies.erase(p++); + } + + // initialize + void (*cls_init)() = (void (*)())dlsym(cls->handle, "__cls_init"); + if (cls_init) { + cls->status = ClassData::CLASS_INITIALIZING; + cls_init(); + } + + ldout(cct, 10) << "_load_class " << cls->name << " success" << dendl; + cls->status = ClassData::CLASS_OPEN; + return 0; +} + + + +ClassHandler::ClassData *ClassHandler::register_class(const char *cname) +{ + ceph_assert(ceph_mutex_is_locked(mutex)); + + ClassData *cls = _get_class(cname, false); + ldout(cct, 10) << "register_class " << 
cname << " status " << cls->status << dendl; + + if (cls->status != ClassData::CLASS_INITIALIZING) { + ldout(cct, 0) << "class " << cname << " isn't loaded; is the class registering under the wrong name?" << dendl; + return NULL; + } + return cls; +} + +void ClassHandler::unregister_class(ClassHandler::ClassData *cls) +{ + /* FIXME: do we really need this one? */ +} + +ClassHandler::ClassMethod *ClassHandler::ClassData::register_method(const char *mname, + int flags, + cls_method_call_t func) +{ + /* no need for locking, called under the class_init mutex */ + if (!flags) { + lderr(handler->cct) << "register_method " << name << "." << mname + << " flags " << flags << " " << (void*)func + << " FAILED -- flags must be non-zero" << dendl; + return NULL; + } + ldout(handler->cct, 10) << "register_method " << name << "." << mname << " flags " << flags << " " << (void*)func << dendl; + [[maybe_unused]] auto [method, added] = methods_map.try_emplace(mname, mname, func, flags, this); + return &method->second; +} + +ClassHandler::ClassMethod *ClassHandler::ClassData::register_cxx_method(const char *mname, + int flags, + cls_method_cxx_call_t func) +{ + /* no need for locking, called under the class_init mutex */ + ldout(handler->cct, 10) << "register_cxx_method " << name << "." << mname << " flags " << flags << " " << (void*)func << dendl; + [[maybe_unused]] auto [method, added] = methods_map.try_emplace(mname, mname, func, flags, this); + return &method->second; +} + +ClassHandler::ClassFilter *ClassHandler::ClassData::register_cxx_filter( + const std::string &filter_name, + cls_cxx_filter_factory_t fn) +{ + ClassFilter &filter = filters_map[filter_name]; + filter.fn = fn; + filter.name = filter_name; + filter.cls = this; + return &filter; +} + +ClassHandler::ClassMethod *ClassHandler::ClassData::_get_method( + const std::string& mname) +{ + if (auto iter = methods_map.find(mname); iter != methods_map.end()) { + return &(iter->second); + } else { + return nullptr; + } +} + +int ClassHandler::ClassData::get_method_flags(const std::string& mname) +{ + std::lock_guard l(handler->mutex); + ClassMethod *method = _get_method(mname); + if (!method) + return -ENOENT; + return method->flags; +} + +void ClassHandler::ClassData::unregister_method(ClassHandler::ClassMethod *method) +{ + /* no need for locking, called under the class_init mutex */ + map::iterator iter = methods_map.find(method->name); + if (iter == methods_map.end()) + return; + methods_map.erase(iter); +} + +void ClassHandler::ClassMethod::unregister() +{ + cls->unregister_method(this); +} + +void ClassHandler::ClassData::unregister_filter(ClassHandler::ClassFilter *filter) +{ + /* no need for locking, called under the class_init mutex */ + map::iterator iter = filters_map.find(filter->name); + if (iter == filters_map.end()) + return; + filters_map.erase(iter); +} + +void ClassHandler::ClassFilter::unregister() +{ + cls->unregister_filter(this); +} + +int ClassHandler::ClassMethod::exec(cls_method_context_t ctx, bufferlist& indata, bufferlist& outdata) +{ + int ret = 0; + std::visit([&](auto method) { + using method_t = decltype(method); + if constexpr (std::is_same_v) { + // C++ call version + ret = method(ctx, &indata, &outdata); + } else if constexpr (std::is_same_v) { + // C version + char *out = nullptr; + int olen = 0; + ret = method(ctx, indata.c_str(), indata.length(), &out, &olen); + if (out) { + // assume *out was allocated via cls_alloc (which calls malloc!) 
+ ceph::buffer::ptr bp = ceph::buffer::claim_malloc(olen, out); + outdata.push_back(bp); + } + } else { + static_assert(std::is_same_v); + } + }, func); + return ret; +} + +ClassHandler& ClassHandler::get_instance() +{ +#ifdef WITH_SEASTAR + // the context is being used solely for: + // 1. random number generation (cls_gen_random_bytes) + // 2. accessing the configuration + // 3. logging + static CephContext cct; + static ClassHandler single(&cct); +#else + static ClassHandler single(g_ceph_context); +#endif // WITH_SEASTAR + return single; +} diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h new file mode 100644 index 000000000..fff61d5d2 --- /dev/null +++ b/src/osd/ClassHandler.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_CLASSHANDLER_H +#define CEPH_CLASSHANDLER_H + +#include + +#include "include/types.h" +#include "include/common_fwd.h" +#include "common/ceph_mutex.h" +#include "objclass/objclass.h" + +//forward declaration +class ClassHandler +{ +public: + CephContext *cct; + struct ClassData; + + struct ClassMethod { + const std::string name; + using func_t = std::variant; + func_t func; + int flags = 0; + ClassData *cls = nullptr; + + int exec(cls_method_context_t ctx, + ceph::bufferlist& indata, + ceph::bufferlist& outdata); + void unregister(); + + int get_flags() { + std::lock_guard l(cls->handler->mutex); + return flags; + } + ClassMethod(const char* name, func_t call, int flags, ClassData* cls) + : name{name}, func{call}, flags{flags}, cls{cls} + {} + }; + + struct ClassFilter { + ClassData *cls = nullptr; + std::string name; + cls_cxx_filter_factory_t fn = nullptr; + + void unregister(); + }; + + struct ClassData { + enum Status { + CLASS_UNKNOWN, + CLASS_MISSING, // missing + CLASS_MISSING_DEPS, // missing dependencies + CLASS_INITIALIZING, // calling init() right now + CLASS_OPEN, // initialized, usable + } status = CLASS_UNKNOWN; + + std::string name; + ClassHandler *handler = nullptr; + void *handle = nullptr; + + bool allowed = false; + + std::map methods_map; + std::map filters_map; + + std::set dependencies; /* our dependencies */ + std::set missing_dependencies; /* only missing dependencies */ + + ClassMethod *_get_method(const std::string& mname); + + ClassMethod *register_method(const char *mname, + int flags, + cls_method_call_t func); + ClassMethod *register_cxx_method(const char *mname, + int flags, + cls_method_cxx_call_t func); + void unregister_method(ClassMethod *method); + + ClassFilter *register_cxx_filter(const std::string &filter_name, + cls_cxx_filter_factory_t fn); + void unregister_filter(ClassFilter *method); + + ClassMethod *get_method(const std::string& mname) { + std::lock_guard l(handler->mutex); + return _get_method(mname); + } + int get_method_flags(const std::string& mname); + + ClassFilter *get_filter(const std::string &filter_name) { + std::lock_guard l(handler->mutex); + if (auto i = filters_map.find(filter_name); i == filters_map.end()) { + return nullptr; + } else { + return &(i->second); + } + } + }; + +private: + std::map classes; + + ClassData *_get_class(const std::string& cname, bool check_allowed); + int _load_class(ClassData *cls); + + static bool in_class_list(const std::string& cname, + const std::string& list); + + ceph::mutex mutex = ceph::make_mutex("ClassHandler"); + +public: + explicit ClassHandler(CephContext *cct) : cct(cct) {} + + int open_all_classes(); + int open_class(const std::string& cname, ClassData **pcls); + + 
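+  // called from a class's __cls_init (via cls_register) while that class is being loaded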
ClassData *register_class(const char *cname); + void unregister_class(ClassData *cls); + + void shutdown(); + + static ClassHandler& get_instance(); +}; + + +#endif diff --git a/src/osd/DynamicPerfStats.h b/src/osd/DynamicPerfStats.h new file mode 100644 index 000000000..1c6c26c71 --- /dev/null +++ b/src/osd/DynamicPerfStats.h @@ -0,0 +1,267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef DYNAMIC_PERF_STATS_H +#define DYNAMIC_PERF_STATS_H + +#include "include/random.h" +#include "messages/MOSDOp.h" +#include "mgr/OSDPerfMetricTypes.h" +#include "osd/OSD.h" +#include "osd/OpRequest.h" + +class DynamicPerfStats { +public: + DynamicPerfStats() { + } + + DynamicPerfStats(const std::list &queries) { + for (auto &query : queries) { + data[query]; + } + } + + void merge(const DynamicPerfStats &dps) { + for (auto &query_it : dps.data) { + auto &query = query_it.first; + for (auto &key_it : query_it.second) { + auto &key = key_it.first; + auto counter_it = key_it.second.begin(); + auto update_counter_fnc = + [&counter_it](const PerformanceCounterDescriptor &d, + PerformanceCounter *c) { + c->first += counter_it->first; + c->second += counter_it->second; + counter_it++; + }; + + ceph_assert(key_it.second.size() >= data[query][key].size()); + query.update_counters(update_counter_fnc, &data[query][key]); + } + } + } + + void set_queries(const std::list &queries) { + std::map> new_data; + for (auto &query : queries) { + std::swap(new_data[query], data[query]); + } + std::swap(data, new_data); + } + + bool is_enabled() { + return !data.empty(); + } + + void add(const OSDService *osd, const pg_info_t &pg_info, const OpRequest& op, + uint64_t inb, uint64_t outb, const utime_t &latency) { + + auto update_counter_fnc = + [&op, inb, outb, &latency](const PerformanceCounterDescriptor &d, + PerformanceCounter *c) { + ceph_assert(d.is_supported()); + + switch(d.type) { + case PerformanceCounterType::OPS: + c->first++; + return; + case PerformanceCounterType::WRITE_OPS: + if (op.may_write() || op.may_cache()) { + c->first++; + } + return; + case PerformanceCounterType::READ_OPS: + if (op.may_read()) { + c->first++; + } + return; + case PerformanceCounterType::BYTES: + c->first += inb + outb; + return; + case PerformanceCounterType::WRITE_BYTES: + if (op.may_write() || op.may_cache()) { + c->first += inb; + } + return; + case PerformanceCounterType::READ_BYTES: + if (op.may_read()) { + c->first += outb; + } + return; + case PerformanceCounterType::LATENCY: + c->first += latency.to_nsec(); + c->second++; + return; + case PerformanceCounterType::WRITE_LATENCY: + if (op.may_write() || op.may_cache()) { + c->first += latency.to_nsec(); + c->second++; + } + return; + case PerformanceCounterType::READ_LATENCY: + if (op.may_read()) { + c->first += latency.to_nsec(); + c->second++; + } + return; + default: + ceph_abort_msg("unknown counter type"); + } + }; + + auto get_subkey_fnc = + [&osd, &pg_info, &op](const OSDPerfMetricSubKeyDescriptor &d, + OSDPerfMetricSubKey *sub_key) { + ceph_assert(d.is_supported()); + + auto m = op.get_req(); + std::string match_string; + switch(d.type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + match_string = stringify(m->get_reqid().name); + break; + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + match_string = stringify(m->get_connection()->get_peer_addr()); + break; + case OSDPerfMetricSubKeyType::POOL_ID: + match_string = stringify(m->get_spg().pool()); + break; + case OSDPerfMetricSubKeyType::NAMESPACE: + match_string 
= m->get_hobj().nspace; + break; + case OSDPerfMetricSubKeyType::OSD_ID: + match_string = stringify(osd->get_nodeid()); + break; + case OSDPerfMetricSubKeyType::PG_ID: + match_string = stringify(pg_info.pgid); + break; + case OSDPerfMetricSubKeyType::OBJECT_NAME: + match_string = m->get_oid().name; + break; + case OSDPerfMetricSubKeyType::SNAP_ID: + match_string = stringify(m->get_snapid()); + break; + default: + ceph_abort_msg("unknown counter type"); + } + + std::smatch match; + if (!std::regex_search(match_string, match, d.regex)) { + return false; + } + if (match.size() <= 1) { + return false; + } + for (size_t i = 1; i < match.size(); i++) { + sub_key->push_back(match[i].str()); + } + return true; + }; + + for (auto &it : data) { + auto &query = it.first; + OSDPerfMetricKey key; + if (query.get_key(get_subkey_fnc, &key)) { + query.update_counters(update_counter_fnc, &it.second[key]); + } + } + } + + void add_to_reports( + const std::map &limits, + std::map *reports) { + for (auto &it : data) { + auto &query = it.first; + auto limit_it = limits.find(query); + if (limit_it == limits.end()) { + continue; + } + auto &query_limits = limit_it->second; + auto &counters = it.second; + auto &report = (*reports)[query]; + + query.get_performance_counter_descriptors( + &report.performance_counter_descriptors); + + auto &descriptors = report.performance_counter_descriptors; + ceph_assert(descriptors.size() > 0); + + if (!is_limited(query_limits, counters.size())) { + for (auto &it_counters : counters) { + auto &bl = report.group_packed_performance_counters[it_counters.first]; + query.pack_counters(it_counters.second, &bl); + } + continue; + } + + for (auto &limit : query_limits) { + size_t index = 0; + for (; index < descriptors.size(); index++) { + if (descriptors[index] == limit.order_by) { + break; + } + } + if (index == descriptors.size()) { + // should not happen + continue; + } + + // Weighted Random Sampling (Algorithm A-Chao): + // Select the first [0, max_count) samples, randomly replace + // with samples from [max_count, end) using weighted + // probability, and return [0, max_count) as the result. 
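+        // Each remaining sample replaces a uniformly chosen reservoir slot with
+        // probability roughly equal to its weight over the running weight sum,
+        // so heavily weighted counters are more likely to appear in the capped report.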
+ + ceph_assert(limit.max_count < counters.size()); + typedef std::map::iterator + Iterator; + std::vector counter_iterators; + counter_iterators.reserve(limit.max_count); + + Iterator it_counters = counters.begin(); + uint64_t wsum = 0; + for (size_t i = 0; i < limit.max_count; i++) { + wsum += it_counters->second[index].first; + counter_iterators.push_back(it_counters++); + } + for (; it_counters != counters.end(); it_counters++) { + wsum += it_counters->second[index].first; + if (ceph::util::generate_random_number(0, wsum) <= + it_counters->second[index].first) { + auto i = ceph::util::generate_random_number(0, limit.max_count - 1); + counter_iterators[i] = it_counters; + } + } + + for (auto it_counters : counter_iterators) { + auto &bl = + report.group_packed_performance_counters[it_counters->first]; + if (bl.length() == 0) { + query.pack_counters(it_counters->second, &bl); + } + } + } + } + } + +private: + static bool is_limited(const OSDPerfMetricLimits &limits, + size_t counters_size) { + if (limits.empty()) { + return false; + } + + for (auto &limit : limits) { + if (limit.max_count >= counters_size) { + return false; + } + } + + return true; + } + + std::map> data; +}; + +#endif // DYNAMIC_PERF_STATS_H diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc new file mode 100644 index 000000000..b13a99fbc --- /dev/null +++ b/src/osd/ECBackend.cc @@ -0,0 +1,2637 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include + +#include "ECBackend.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDECSubOpWrite.h" +#include "messages/MOSDECSubOpWriteReply.h" +#include "messages/MOSDECSubOpRead.h" +#include "messages/MOSDECSubOpReadReply.h" +#include "ECMsgTypes.h" + +#include "PrimaryLogPG.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +using std::dec; +using std::hex; +using std::list; +using std::make_pair; +using std::map; +using std::pair; +using std::ostream; +using std::set; +using std::string; +using std::unique_ptr; +using std::vector; + +using ceph::bufferhash; +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::ErasureCodeInterfaceRef; +using ceph::Formatter; + +static ostream& _prefix(std::ostream *_dout, ECBackend *pgb) { + return pgb->get_parent()->gen_dbg_prefix(*_dout); +} + +struct ECRecoveryHandle : public PGBackend::RecoveryHandle { + list ops; +}; + +ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs) { + switch (rhs.pipeline_state) { + case ECBackend::pipeline_state_t::CACHE_VALID: + return lhs << "CACHE_VALID"; + case ECBackend::pipeline_state_t::CACHE_INVALID: + return lhs << "CACHE_INVALID"; + default: + ceph_abort_msg("invalid pipeline state"); + } + return lhs; // unreachable +} + +static ostream &operator<<(ostream &lhs, const map &rhs) +{ + lhs << "["; + for (map::const_iterator i = rhs.begin(); + i != rhs.end(); + ++i) { + if (i != rhs.begin()) + lhs << ", "; + lhs << make_pair(i->first, i->second.length()); + } + return lhs << "]"; +} + +static ostream &operator<<(ostream &lhs, const map &rhs) +{ + lhs << "["; + for (map::const_iterator i = rhs.begin(); + i != rhs.end(); + ++i) { + if (i != rhs.begin()) + lhs << ", "; + lhs << make_pair(i->first, i->second.length()); + } + return lhs << "]"; +} + +static ostream &operator<<( + ostream &lhs, + const boost::tuple > &rhs) +{ + return lhs << "(" << rhs.get<0>() << ", " + << rhs.get<1>() << ", " << rhs.get<2>() << ")"; +} + +ostream &operator<<(ostream &lhs, const ECBackend::read_request_t &rhs) +{ + return lhs << "read_request_t(to_read=[" << rhs.to_read << "]" + << ", need=" << rhs.need + << ", want_attrs=" << rhs.want_attrs + << ")"; +} + +ostream &operator<<(ostream &lhs, const ECBackend::read_result_t &rhs) +{ + lhs << "read_result_t(r=" << rhs.r + << ", errors=" << rhs.errors; + if (rhs.attrs) { + lhs << ", attrs=" << *(rhs.attrs); + } else { + lhs << ", noattrs"; + } + return lhs << ", returned=" << rhs.returned << ")"; +} + +ostream &operator<<(ostream &lhs, const ECBackend::ReadOp &rhs) +{ + lhs << "ReadOp(tid=" << rhs.tid; + if (rhs.op && rhs.op->get_req()) { + lhs << ", op="; + rhs.op->get_req()->print(lhs); + } + return lhs << ", to_read=" << rhs.to_read + << ", complete=" << rhs.complete + << ", priority=" << rhs.priority + << ", obj_to_source=" << rhs.obj_to_source + << ", source_to_obj=" << rhs.source_to_obj + << ", in_progress=" << rhs.in_progress << ")"; +} + +void ECBackend::ReadOp::dump(Formatter *f) const +{ + f->dump_unsigned("tid", tid); + if (op && op->get_req()) { + f->dump_stream("op") << *(op->get_req()); + } + f->dump_stream("to_read") << to_read; + f->dump_stream("complete") << complete; + f->dump_int("priority", priority); + f->dump_stream("obj_to_source") << obj_to_source; + f->dump_stream("source_to_obj") << source_to_obj; + f->dump_stream("in_progress") << in_progress; +} + +ostream 
&operator<<(ostream &lhs, const ECBackend::Op &rhs) +{ + lhs << "Op(" << rhs.hoid + << " v=" << rhs.version + << " tt=" << rhs.trim_to + << " tid=" << rhs.tid + << " reqid=" << rhs.reqid; + if (rhs.client_op && rhs.client_op->get_req()) { + lhs << " client_op="; + rhs.client_op->get_req()->print(lhs); + } + lhs << " roll_forward_to=" << rhs.roll_forward_to + << " temp_added=" << rhs.temp_added + << " temp_cleared=" << rhs.temp_cleared + << " pending_read=" << rhs.pending_read + << " remote_read=" << rhs.remote_read + << " remote_read_result=" << rhs.remote_read_result + << " pending_apply=" << rhs.pending_apply + << " pending_commit=" << rhs.pending_commit + << " plan.to_read=" << rhs.plan.to_read + << " plan.will_write=" << rhs.plan.will_write + << ")"; + return lhs; +} + +ostream &operator<<(ostream &lhs, const ECBackend::RecoveryOp &rhs) +{ + return lhs << "RecoveryOp(" + << "hoid=" << rhs.hoid + << " v=" << rhs.v + << " missing_on=" << rhs.missing_on + << " missing_on_shards=" << rhs.missing_on_shards + << " recovery_info=" << rhs.recovery_info + << " recovery_progress=" << rhs.recovery_progress + << " obc refcount=" << rhs.obc.use_count() + << " state=" << ECBackend::RecoveryOp::tostr(rhs.state) + << " waiting_on_pushes=" << rhs.waiting_on_pushes + << " extent_requested=" << rhs.extent_requested + << ")"; +} + +void ECBackend::RecoveryOp::dump(Formatter *f) const +{ + f->dump_stream("hoid") << hoid; + f->dump_stream("v") << v; + f->dump_stream("missing_on") << missing_on; + f->dump_stream("missing_on_shards") << missing_on_shards; + f->dump_stream("recovery_info") << recovery_info; + f->dump_stream("recovery_progress") << recovery_progress; + f->dump_stream("state") << tostr(state); + f->dump_stream("waiting_on_pushes") << waiting_on_pushes; + f->dump_stream("extent_requested") << extent_requested; +} + +ECBackend::ECBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct, + ErasureCodeInterfaceRef ec_impl, + uint64_t stripe_width) + : PGBackend(cct, pg, store, coll, ch), + ec_impl(ec_impl), + sinfo(ec_impl->get_data_chunk_count(), stripe_width) { + ceph_assert((ec_impl->get_data_chunk_count() * + ec_impl->get_chunk_size(stripe_width)) == stripe_width); +} + +PGBackend::RecoveryHandle *ECBackend::open_recovery_op() +{ + return new ECRecoveryHandle; +} + +void ECBackend::_failed_push(const hobject_t &hoid, + pair &in) +{ + ECBackend::read_result_t &res = in.second; + dout(10) << __func__ << ": Read error " << hoid << " r=" + << res.r << " errors=" << res.errors << dendl; + dout(10) << __func__ << ": canceling recovery op for obj " << hoid + << dendl; + ceph_assert(recovery_ops.count(hoid)); + eversion_t v = recovery_ops[hoid].v; + recovery_ops.erase(hoid); + + set fl; + for (auto&& i : res.errors) { + fl.insert(i.first); + } + get_parent()->on_failed_pull(fl, hoid, v); +} + +struct OnRecoveryReadComplete : + public GenContext &> { + ECBackend *pg; + hobject_t hoid; + OnRecoveryReadComplete(ECBackend *pg, const hobject_t &hoid) + : pg(pg), hoid(hoid) {} + void finish(pair &in) override { + ECBackend::read_result_t &res = in.second; + if (!(res.r == 0 && res.errors.empty())) { + pg->_failed_push(hoid, in); + return; + } + ceph_assert(res.returned.size() == 1); + pg->handle_recovery_read_complete( + hoid, + res.returned.back(), + res.attrs, + in.first); + } +}; + +struct RecoveryMessages { + map reads; + map> want_to_read; + void read( + ECBackend *ec, + const hobject_t &hoid, uint64_t off, uint64_t len, + set 
&&_want_to_read, + const map>> &need, + bool attrs) { + list > to_read; + to_read.push_back(boost::make_tuple(off, len, 0)); + ceph_assert(!reads.count(hoid)); + want_to_read.insert(make_pair(hoid, std::move(_want_to_read))); + reads.insert( + make_pair( + hoid, + ECBackend::read_request_t( + to_read, + need, + attrs, + new OnRecoveryReadComplete( + ec, + hoid)))); + } + + map > pushes; + map > push_replies; + ObjectStore::Transaction t; + RecoveryMessages() {} + ~RecoveryMessages() {} +}; + +void ECBackend::handle_recovery_push( + const PushOp &op, + RecoveryMessages *m, + bool is_repair) +{ + if (get_parent()->check_failsafe_full()) { + dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl; + ceph_abort(); + } + + bool oneshot = op.before_progress.first && op.after_progress.data_complete; + ghobject_t tobj; + if (oneshot) { + tobj = ghobject_t(op.soid, ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard); + } else { + tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.soid, + op.version), + ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard); + if (op.before_progress.first) { + dout(10) << __func__ << ": Adding oid " + << tobj.hobj << " in the temp collection" << dendl; + add_temp_obj(tobj.hobj); + } + } + + if (op.before_progress.first) { + m->t.remove(coll, tobj); + m->t.touch(coll, tobj); + } + + if (!op.data_included.empty()) { + uint64_t start = op.data_included.range_start(); + uint64_t end = op.data_included.range_end(); + ceph_assert(op.data.length() == (end - start)); + + m->t.write( + coll, + tobj, + start, + op.data.length(), + op.data); + } else { + ceph_assert(op.data.length() == 0); + } + + if (get_parent()->pg_is_remote_backfilling()) { + get_parent()->pg_add_local_num_bytes(op.data.length()); + get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count()); + dout(10) << __func__ << " " << op.soid + << " add new actual data by " << op.data.length() + << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count() + << dendl; + } + + if (op.before_progress.first) { + ceph_assert(op.attrset.count(string("_"))); + m->t.setattrs( + coll, + tobj, + op.attrset); + } + + if (op.after_progress.data_complete && !oneshot) { + dout(10) << __func__ << ": Removing oid " + << tobj.hobj << " from the temp collection" << dendl; + clear_temp_obj(tobj.hobj); + m->t.remove(coll, ghobject_t( + op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + m->t.collection_move_rename( + coll, tobj, + coll, ghobject_t( + op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + } + if (op.after_progress.data_complete) { + if ((get_parent()->pgb_is_primary())) { + ceph_assert(recovery_ops.count(op.soid)); + ceph_assert(recovery_ops[op.soid].obc); + if (get_parent()->pg_is_repair()) + get_parent()->inc_osd_stat_repaired(); + get_parent()->on_local_recover( + op.soid, + op.recovery_info, + recovery_ops[op.soid].obc, + false, + &m->t); + } else { + // If primary told us this is a repair, bump osd_stat_t::num_objects_repaired + if (is_repair) + get_parent()->inc_osd_stat_repaired(); + get_parent()->on_local_recover( + op.soid, + op.recovery_info, + ObjectContextRef(), + false, + &m->t); + if (get_parent()->pg_is_remote_backfilling()) { + struct stat st; + int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard), &st); + if (r == 0) { + get_parent()->pg_sub_local_num_bytes(st.st_size); + // XXX: This can be way overestimated for small objects + 
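+          // (st_size is this shard's chunk on disk, so scaling by the data
+          // chunk count rounds small objects up to whole-stripe granularity)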
get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count()); + dout(10) << __func__ << " " << op.soid + << " sub actual data by " << st.st_size + << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count() + << dendl; + } + } + } + } + m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp()); + m->push_replies[get_parent()->primary_shard()].back().soid = op.soid; +} + +void ECBackend::handle_recovery_push_reply( + const PushReplyOp &op, + pg_shard_t from, + RecoveryMessages *m) +{ + if (!recovery_ops.count(op.soid)) + return; + RecoveryOp &rop = recovery_ops[op.soid]; + ceph_assert(rop.waiting_on_pushes.count(from)); + rop.waiting_on_pushes.erase(from); + continue_recovery_op(rop, m); +} + +void ECBackend::handle_recovery_read_complete( + const hobject_t &hoid, + boost::tuple > &to_read, + std::optional > attrs, + RecoveryMessages *m) +{ + dout(10) << __func__ << ": returned " << hoid << " " + << "(" << to_read.get<0>() + << ", " << to_read.get<1>() + << ", " << to_read.get<2>() + << ")" + << dendl; + ceph_assert(recovery_ops.count(hoid)); + RecoveryOp &op = recovery_ops[hoid]; + ceph_assert(op.returned_data.empty()); + map target; + for (set::iterator i = op.missing_on_shards.begin(); + i != op.missing_on_shards.end(); + ++i) { + target[*i] = &(op.returned_data[*i]); + } + map from; + for(map::iterator i = to_read.get<2>().begin(); + i != to_read.get<2>().end(); + ++i) { + from[i->first.shard] = std::move(i->second); + } + dout(10) << __func__ << ": " << from << dendl; + int r; + r = ECUtil::decode(sinfo, ec_impl, from, target); + ceph_assert(r == 0); + if (attrs) { + op.xattrs.swap(*attrs); + + if (!op.obc) { + // attrs only reference the origin bufferlist (decode from + // ECSubReadReply message) whose size is much greater than attrs + // in recovery. If obc cache it (get_obc maybe cache the attr), + // this causes the whole origin bufferlist would not be free + // until obc is evicted from obc cache. So rebuild the + // bufferlist before cache it. 
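+        // rebuild() copies each value into a fresh, right-sized contiguous buffer,
+        // dropping the reference to the large message buffer.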
+ for (map::iterator it = op.xattrs.begin(); + it != op.xattrs.end(); + ++it) { + it->second.rebuild(); + } + // Need to remove ECUtil::get_hinfo_key() since it should not leak out + // of the backend (see bug #12983) + map sanitized_attrs(op.xattrs); + sanitized_attrs.erase(ECUtil::get_hinfo_key()); + op.obc = get_parent()->get_obc(hoid, sanitized_attrs); + ceph_assert(op.obc); + op.recovery_info.size = op.obc->obs.oi.size; + op.recovery_info.oi = op.obc->obs.oi; + } + + ECUtil::HashInfo hinfo(ec_impl->get_chunk_count()); + if (op.obc->obs.oi.size > 0) { + ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key())); + auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin(); + decode(hinfo, bp); + } + op.hinfo = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo); + } + ceph_assert(op.xattrs.size()); + ceph_assert(op.obc); + continue_recovery_op(op, m); +} + +struct SendPushReplies : public Context { + PGBackend::Listener *l; + epoch_t epoch; + map replies; + SendPushReplies( + PGBackend::Listener *l, + epoch_t epoch, + map &in) : l(l), epoch(epoch) { + replies.swap(in); + } + void finish(int) override { + std::vector> messages; + messages.reserve(replies.size()); + for (map::iterator i = replies.begin(); + i != replies.end(); + ++i) { + messages.push_back(std::make_pair(i->first, i->second)); + } + if (!messages.empty()) { + l->send_message_osd_cluster(messages, epoch); + } + replies.clear(); + } + ~SendPushReplies() override { + for (map::iterator i = replies.begin(); + i != replies.end(); + ++i) { + i->second->put(); + } + replies.clear(); + } +}; + +void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority) +{ + for (map >::iterator i = m.pushes.begin(); + i != m.pushes.end(); + m.pushes.erase(i++)) { + MOSDPGPush *msg = new MOSDPGPush(); + msg->set_priority(priority); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->from = get_parent()->whoami_shard(); + msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard); + msg->pushes.swap(i->second); + msg->compute_cost(cct); + msg->is_repair = get_parent()->pg_is_repair(); + get_parent()->send_message( + i->first.osd, + msg); + } + map replies; + for (map >::iterator i = + m.push_replies.begin(); + i != m.push_replies.end(); + m.push_replies.erase(i++)) { + MOSDPGPushReply *msg = new MOSDPGPushReply(); + msg->set_priority(priority); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->from = get_parent()->whoami_shard(); + msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard); + msg->replies.swap(i->second); + msg->compute_cost(cct); + replies.insert(make_pair(i->first.osd, msg)); + } + + if (!replies.empty()) { + (m.t).register_on_complete( + get_parent()->bless_context( + new SendPushReplies( + get_parent(), + get_osdmap_epoch(), + replies))); + get_parent()->queue_transaction(std::move(m.t)); + } + + if (m.reads.empty()) + return; + start_read_op( + priority, + m.want_to_read, + m.reads, + OpRequestRef(), + false, true); +} + +void ECBackend::continue_recovery_op( + RecoveryOp &op, + RecoveryMessages *m) +{ + dout(10) << __func__ << ": continuing " << op << dendl; + while (1) { + switch (op.state) { + case RecoveryOp::IDLE: { + // start read + op.state = RecoveryOp::READING; + ceph_assert(!op.recovery_progress.data_complete); + set want(op.missing_on_shards.begin(), op.missing_on_shards.end()); + uint64_t from = op.recovery_progress.data_recovered_to; + uint64_t amount = 
get_recovery_chunk_size(); + + if (op.recovery_progress.first && op.obc) { + /* We've got the attrs and the hinfo, might as well use them */ + op.hinfo = get_hash_info(op.hoid); + if (!op.hinfo) { + derr << __func__ << ": " << op.hoid << " has inconsistent hinfo" + << dendl; + ceph_assert(recovery_ops.count(op.hoid)); + eversion_t v = recovery_ops[op.hoid].v; + recovery_ops.erase(op.hoid); + get_parent()->on_failed_pull({get_parent()->whoami_shard()}, + op.hoid, v); + return; + } + op.xattrs = op.obc->attr_cache; + encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]); + } + + map>> to_read; + int r = get_min_avail_to_read_shards( + op.hoid, want, true, false, &to_read); + if (r != 0) { + // we must have lost a recovery source + ceph_assert(!op.recovery_progress.first); + dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid + << dendl; + get_parent()->cancel_pull(op.hoid); + recovery_ops.erase(op.hoid); + return; + } + m->read( + this, + op.hoid, + op.recovery_progress.data_recovered_to, + amount, + std::move(want), + to_read, + op.recovery_progress.first && !op.obc); + op.extent_requested = make_pair( + from, + amount); + dout(10) << __func__ << ": IDLE return " << op << dendl; + return; + } + case RecoveryOp::READING: { + // read completed, start write + ceph_assert(op.xattrs.size()); + ceph_assert(op.returned_data.size()); + op.state = RecoveryOp::WRITING; + ObjectRecoveryProgress after_progress = op.recovery_progress; + after_progress.data_recovered_to += op.extent_requested.second; + after_progress.first = false; + if (after_progress.data_recovered_to >= op.obc->obs.oi.size) { + after_progress.data_recovered_to = + sinfo.logical_to_next_stripe_offset( + op.obc->obs.oi.size); + after_progress.data_complete = true; + } + for (set::iterator mi = op.missing_on.begin(); + mi != op.missing_on.end(); + ++mi) { + ceph_assert(op.returned_data.count(mi->shard)); + m->pushes[*mi].push_back(PushOp()); + PushOp &pop = m->pushes[*mi].back(); + pop.soid = op.hoid; + pop.version = op.v; + pop.data = op.returned_data[mi->shard]; + dout(10) << __func__ << ": before_progress=" << op.recovery_progress + << ", after_progress=" << after_progress + << ", pop.data.length()=" << pop.data.length() + << ", size=" << op.obc->obs.oi.size << dendl; + ceph_assert( + pop.data.length() == + sinfo.aligned_logical_offset_to_chunk_offset( + after_progress.data_recovered_to - + op.recovery_progress.data_recovered_to) + ); + if (pop.data.length()) + pop.data_included.insert( + sinfo.aligned_logical_offset_to_chunk_offset( + op.recovery_progress.data_recovered_to), + pop.data.length() + ); + if (op.recovery_progress.first) { + pop.attrset = op.xattrs; + } + pop.recovery_info = op.recovery_info; + pop.before_progress = op.recovery_progress; + pop.after_progress = after_progress; + if (*mi != get_parent()->primary_shard()) + get_parent()->begin_peer_recover( + *mi, + op.hoid); + } + op.returned_data.clear(); + op.waiting_on_pushes = op.missing_on; + op.recovery_progress = after_progress; + dout(10) << __func__ << ": READING return " << op << dendl; + return; + } + case RecoveryOp::WRITING: { + if (op.waiting_on_pushes.empty()) { + if (op.recovery_progress.data_complete) { + op.state = RecoveryOp::COMPLETE; + for (set::iterator i = op.missing_on.begin(); + i != op.missing_on.end(); + ++i) { + if (*i != get_parent()->primary_shard()) { + dout(10) << __func__ << ": on_peer_recover on " << *i + << ", obj " << op.hoid << dendl; + get_parent()->on_peer_recover( + *i, + op.hoid, + op.recovery_info); + } + } + 
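+          // pushes are complete on every missing shard; account for the
+          // recovered object and retire the recovery op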
object_stat_sum_t stat; + stat.num_bytes_recovered = op.recovery_info.size; + stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ? + stat.num_objects_recovered = 1; + if (get_parent()->pg_is_repair()) + stat.num_objects_repaired = 1; + get_parent()->on_global_recover(op.hoid, stat, false); + dout(10) << __func__ << ": WRITING return " << op << dendl; + recovery_ops.erase(op.hoid); + return; + } else { + op.state = RecoveryOp::IDLE; + dout(10) << __func__ << ": WRITING continue " << op << dendl; + continue; + } + } + return; + } + // should never be called once complete + case RecoveryOp::COMPLETE: + default: { + ceph_abort(); + }; + } + } +} + +void ECBackend::run_recovery_op( + RecoveryHandle *_h, + int priority) +{ + ECRecoveryHandle *h = static_cast(_h); + RecoveryMessages m; + for (list::iterator i = h->ops.begin(); + i != h->ops.end(); + ++i) { + dout(10) << __func__ << ": starting " << *i << dendl; + ceph_assert(!recovery_ops.count(i->hoid)); + RecoveryOp &op = recovery_ops.insert(make_pair(i->hoid, *i)).first->second; + continue_recovery_op(op, &m); + } + + dispatch_recovery_messages(m, priority); + send_recovery_deletes(priority, h->deletes); + delete _h; +} + +int ECBackend::recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *_h) +{ + ECRecoveryHandle *h = static_cast(_h); + h->ops.push_back(RecoveryOp()); + h->ops.back().v = v; + h->ops.back().hoid = hoid; + h->ops.back().obc = obc; + h->ops.back().recovery_info.soid = hoid; + h->ops.back().recovery_info.version = v; + if (obc) { + h->ops.back().recovery_info.size = obc->obs.oi.size; + h->ops.back().recovery_info.oi = obc->obs.oi; + } + if (hoid.is_snap()) { + if (obc) { + ceph_assert(obc->ssc); + h->ops.back().recovery_info.ss = obc->ssc->snapset; + } else if (head) { + ceph_assert(head->ssc); + h->ops.back().recovery_info.ss = head->ssc->snapset; + } else { + ceph_abort_msg("neither obc nor head set for a snap object"); + } + } + h->ops.back().recovery_progress.omap_complete = true; + for (set::const_iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + dout(10) << "checking " << *i << dendl; + if (get_parent()->get_shard_missing(*i).is_missing(hoid)) { + h->ops.back().missing_on.insert(*i); + h->ops.back().missing_on_shards.insert(i->shard); + } + } + dout(10) << __func__ << ": built op " << h->ops.back() << dendl; + return 0; +} + +bool ECBackend::can_handle_while_inactive( + OpRequestRef _op) +{ + return false; +} + +bool ECBackend::_handle_message( + OpRequestRef _op) +{ + dout(10) << __func__ << ": " << *_op->get_req() << dendl; + int priority = _op->get_req()->get_priority(); + switch (_op->get_req()->get_type()) { + case MSG_OSD_EC_WRITE: { + // NOTE: this is non-const because handle_sub_write modifies the embedded + // ObjectStore::Transaction in place (and then std::move's it). It does + // not conflict with ECSubWrite's operator<<. 
+ MOSDECSubOpWrite *op = static_cast( + _op->get_nonconst_req()); + parent->maybe_preempt_replica_scrub(op->op.soid); + handle_sub_write(op->op.from, _op, op->op, _op->pg_trace); + return true; + } + case MSG_OSD_EC_WRITE_REPLY: { + const MOSDECSubOpWriteReply *op = static_cast( + _op->get_req()); + handle_sub_write_reply(op->op.from, op->op, _op->pg_trace); + return true; + } + case MSG_OSD_EC_READ: { + auto op = _op->get_req(); + MOSDECSubOpReadReply *reply = new MOSDECSubOpReadReply; + reply->pgid = get_parent()->primary_spg_t(); + reply->map_epoch = get_osdmap_epoch(); + reply->min_epoch = get_parent()->get_interval_start_epoch(); + handle_sub_read(op->op.from, op->op, &(reply->op), _op->pg_trace); + reply->trace = _op->pg_trace; + get_parent()->send_message_osd_cluster( + reply, _op->get_req()->get_connection()); + return true; + } + case MSG_OSD_EC_READ_REPLY: { + // NOTE: this is non-const because handle_sub_read_reply steals resulting + // buffers. It does not conflict with ECSubReadReply operator<<. + MOSDECSubOpReadReply *op = static_cast( + _op->get_nonconst_req()); + RecoveryMessages rm; + handle_sub_read_reply(op->op.from, op->op, &rm, _op->pg_trace); + dispatch_recovery_messages(rm, priority); + return true; + } + case MSG_OSD_PG_PUSH: { + auto op = _op->get_req(); + RecoveryMessages rm; + for (vector::const_iterator i = op->pushes.begin(); + i != op->pushes.end(); + ++i) { + handle_recovery_push(*i, &rm, op->is_repair); + } + dispatch_recovery_messages(rm, priority); + return true; + } + case MSG_OSD_PG_PUSH_REPLY: { + const MOSDPGPushReply *op = static_cast( + _op->get_req()); + RecoveryMessages rm; + for (vector::const_iterator i = op->replies.begin(); + i != op->replies.end(); + ++i) { + handle_recovery_push_reply(*i, op->from, &rm); + } + dispatch_recovery_messages(rm, priority); + return true; + } + default: + return false; + } + return false; +} + +struct SubWriteCommitted : public Context { + ECBackend *pg; + OpRequestRef msg; + ceph_tid_t tid; + eversion_t version; + eversion_t last_complete; + const ZTracer::Trace trace; + SubWriteCommitted( + ECBackend *pg, + OpRequestRef msg, + ceph_tid_t tid, + eversion_t version, + eversion_t last_complete, + const ZTracer::Trace &trace) + : pg(pg), msg(msg), tid(tid), + version(version), last_complete(last_complete), trace(trace) {} + void finish(int) override { + if (msg) + msg->mark_event("sub_op_committed"); + pg->sub_write_committed(tid, version, last_complete, trace); + } +}; +void ECBackend::sub_write_committed( + ceph_tid_t tid, eversion_t version, eversion_t last_complete, + const ZTracer::Trace &trace) { + if (get_parent()->pgb_is_primary()) { + ECSubWriteReply reply; + reply.tid = tid; + reply.last_complete = last_complete; + reply.committed = true; + reply.applied = true; + reply.from = get_parent()->whoami_shard(); + handle_sub_write_reply( + get_parent()->whoami_shard(), + reply, trace); + } else { + get_parent()->update_last_complete_ondisk(last_complete); + MOSDECSubOpWriteReply *r = new MOSDECSubOpWriteReply; + r->pgid = get_parent()->primary_spg_t(); + r->map_epoch = get_osdmap_epoch(); + r->min_epoch = get_parent()->get_interval_start_epoch(); + r->op.tid = tid; + r->op.last_complete = last_complete; + r->op.committed = true; + r->op.applied = true; + r->op.from = get_parent()->whoami_shard(); + r->set_priority(CEPH_MSG_PRIO_HIGH); + r->trace = trace; + r->trace.event("sending sub op commit"); + get_parent()->send_message_osd_cluster( + get_parent()->primary_shard().osd, r, get_osdmap_epoch()); + } +} + +void 
ECBackend::handle_sub_write( + pg_shard_t from, + OpRequestRef msg, + ECSubWrite &op, + const ZTracer::Trace &trace) +{ + if (msg) + msg->mark_event("sub_op_started"); + trace.event("handle_sub_write"); +#ifdef HAVE_JAEGER + if (msg->osd_parent_span) { + auto ec_sub_trans = jaeger_tracing::child_span(__func__, msg->osd_parent_span); + } +#endif + if (!get_parent()->pgb_is_primary()) + get_parent()->update_stats(op.stats); + ObjectStore::Transaction localt; + if (!op.temp_added.empty()) { + add_temp_objs(op.temp_added); + } + if (op.backfill_or_async_recovery) { + for (set::iterator i = op.temp_removed.begin(); + i != op.temp_removed.end(); + ++i) { + dout(10) << __func__ << ": removing object " << *i + << " since we won't get the transaction" << dendl; + localt.remove( + coll, + ghobject_t( + *i, + ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard)); + } + } + clear_temp_objs(op.temp_removed); + dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl; + // flag set to true during async recovery + bool async = false; + pg_missing_tracker_t pmissing = get_parent()->get_local_missing(); + if (pmissing.is_missing(op.soid)) { + async = true; + dout(30) << __func__ << " is_missing " << pmissing.is_missing(op.soid) << dendl; + for (auto &&e: op.log_entries) { + dout(30) << " add_next_event entry " << e << dendl; + get_parent()->add_local_next_event(e); + dout(30) << " entry is_delete " << e.is_delete() << dendl; + } + } + get_parent()->log_operation( + std::move(op.log_entries), + op.updated_hit_set_history, + op.trim_to, + op.roll_forward_to, + op.roll_forward_to, + !op.backfill_or_async_recovery, + localt, + async); + + if (!get_parent()->pg_is_undersized() && + (unsigned)get_parent()->whoami_shard().shard >= + ec_impl->get_data_chunk_count()) + op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + + localt.register_on_commit( + get_parent()->bless_context( + new SubWriteCommitted( + this, msg, op.tid, + op.at_version, + get_parent()->get_info().last_complete, trace))); + vector tls; + tls.reserve(2); + tls.push_back(std::move(op.t)); + tls.push_back(std::move(localt)); + get_parent()->queue_transactions(tls, msg); + dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl; + if (op.at_version != eversion_t()) { + // dummy rollforward transaction doesn't get at_version (and doesn't advance it) + get_parent()->op_applied(op.at_version); + } +} + +void ECBackend::handle_sub_read( + pg_shard_t from, + const ECSubRead &op, + ECSubReadReply *reply, + const ZTracer::Trace &trace) +{ + trace.event("handle sub read"); + shard_id_t shard = get_parent()->whoami_shard().shard; + for(auto i = op.to_read.begin(); + i != op.to_read.end(); + ++i) { + int r = 0; + for (auto j = i->second.begin(); j != i->second.end(); ++j) { + bufferlist bl; + if ((op.subchunks.find(i->first)->second.size() == 1) && + (op.subchunks.find(i->first)->second.front().second == + ec_impl->get_sub_chunk_count())) { + dout(25) << __func__ << " case1: reading the complete chunk/shard." << dendl; + r = store->read( + ch, + ghobject_t(i->first, ghobject_t::NO_GEN, shard), + j->get<0>(), + j->get<1>(), + bl, j->get<2>()); // Allow EIO return + } else { + dout(25) << __func__ << " case2: going to do fragmented read." 
<< dendl; + int subchunk_size = + sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count(); + bool error = false; + for (int m = 0; m < (int)j->get<1>() && !error; + m += sinfo.get_chunk_size()) { + for (auto &&k:op.subchunks.find(i->first)->second) { + bufferlist bl0; + r = store->read( + ch, + ghobject_t(i->first, ghobject_t::NO_GEN, shard), + j->get<0>() + m + (k.first)*subchunk_size, + (k.second)*subchunk_size, + bl0, j->get<2>()); + if (r < 0) { + error = true; + break; + } + bl.claim_append(bl0); + } + } + } + + if (r < 0) { + // if we are doing fast reads, it's possible for one of the shard + // reads to cross paths with another update and get a (harmless) + // ENOENT. Suppress the message to the cluster log in that case. + if (r == -ENOENT && get_parent()->get_pool().fast_read) { + dout(5) << __func__ << ": Error " << r + << " reading " << i->first << ", fast read, probably ok" + << dendl; + } else { + get_parent()->clog_error() << "Error " << r + << " reading object " + << i->first; + dout(5) << __func__ << ": Error " << r + << " reading " << i->first << dendl; + } + goto error; + } else { + dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl; + reply->buffers_read[i->first].push_back( + make_pair( + j->get<0>(), + bl) + ); + } + + if (!get_parent()->get_pool().allows_ecoverwrites()) { + // This shows that we still need deep scrub because large enough files + // are read in sections, so the digest check here won't be done here. + // Do NOT check osd_read_eio_on_bad_digest here. We need to report + // the state of our chunk in case other chunks could substitute. + ECUtil::HashInfoRef hinfo; + hinfo = get_hash_info(i->first); + if (!hinfo) { + r = -EIO; + get_parent()->clog_error() << "Corruption detected: object " + << i->first + << " is missing hash_info"; + dout(5) << __func__ << ": No hinfo for " << i->first << dendl; + goto error; + } + ceph_assert(hinfo->has_chunk_hash()); + if ((bl.length() == hinfo->get_total_chunk_size()) && + (j->get<0>() == 0)) { + dout(20) << __func__ << ": Checking hash of " << i->first << dendl; + bufferhash h(-1); + h << bl; + if (h.digest() != hinfo->get_chunk_hash(shard)) { + get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec; + dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl; + r = -EIO; + goto error; + } + } + } + } + continue; +error: + // Do NOT check osd_read_eio_on_bad_digest here. We need to report + // the state of our chunk in case other chunks could substitute. + reply->buffers_read.erase(i->first); + reply->errors[i->first] = r; + } + for (set::iterator i = op.attrs_to_read.begin(); + i != op.attrs_to_read.end(); + ++i) { + dout(10) << __func__ << ": fulfilling attr request on " + << *i << dendl; + if (reply->errors.count(*i)) + continue; + int r = store->getattrs( + ch, + ghobject_t( + *i, ghobject_t::NO_GEN, shard), + reply->attrs_read[*i]); + if (r < 0) { + // If we read error, we should not return the attrs too. 
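+      // Descriptive note: on a getattrs failure we discard both the attrs
+      // and any data buffers already collected for this object and report
+      // the error instead, so the primary never sees partial results.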
+ reply->attrs_read.erase(*i); + reply->buffers_read.erase(*i); + reply->errors[*i] = r; + } + } + reply->from = get_parent()->whoami_shard(); + reply->tid = op.tid; +} + +void ECBackend::handle_sub_write_reply( + pg_shard_t from, + const ECSubWriteReply &op, + const ZTracer::Trace &trace) +{ + map::iterator i = tid_to_op_map.find(op.tid); + ceph_assert(i != tid_to_op_map.end()); + if (op.committed) { + trace.event("sub write committed"); + ceph_assert(i->second.pending_commit.count(from)); + i->second.pending_commit.erase(from); + if (from != get_parent()->whoami_shard()) { + get_parent()->update_peer_last_complete_ondisk(from, op.last_complete); + } + } + if (op.applied) { + trace.event("sub write applied"); + ceph_assert(i->second.pending_apply.count(from)); + i->second.pending_apply.erase(from); + } + + if (i->second.pending_commit.empty() && + i->second.on_all_commit && + // also wait for apply, to preserve ordering with luminous peers. + i->second.pending_apply.empty()) { + dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl; + i->second.on_all_commit->complete(0); + i->second.on_all_commit = 0; + i->second.trace.event("ec write all committed"); + } + check_ops(); +} + +void ECBackend::handle_sub_read_reply( + pg_shard_t from, + ECSubReadReply &op, + RecoveryMessages *m, + const ZTracer::Trace &trace) +{ + trace.event("ec sub read reply"); + dout(10) << __func__ << ": reply " << op << dendl; + map::iterator iter = tid_to_read_map.find(op.tid); + if (iter == tid_to_read_map.end()) { + //canceled + dout(20) << __func__ << ": dropped " << op << dendl; + return; + } + ReadOp &rop = iter->second; + for (auto i = op.buffers_read.begin(); + i != op.buffers_read.end(); + ++i) { + ceph_assert(!op.errors.count(i->first)); // If attribute error we better not have sent a buffer + if (!rop.to_read.count(i->first)) { + // We canceled this read! @see filter_read_op + dout(20) << __func__ << " to_read skipping" << dendl; + continue; + } + list >::const_iterator req_iter = + rop.to_read.find(i->first)->second.to_read.begin(); + list< + boost::tuple< + uint64_t, uint64_t, map > >::iterator riter = + rop.complete[i->first].returned.begin(); + for (list >::iterator j = i->second.begin(); + j != i->second.end(); + ++j, ++req_iter, ++riter) { + ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end()); + ceph_assert(riter != rop.complete[i->first].returned.end()); + pair adjusted = + sinfo.aligned_offset_len_to_chunk( + make_pair(req_iter->get<0>(), req_iter->get<1>())); + ceph_assert(adjusted.first == j->first); + riter->get<2>()[from] = std::move(j->second); + } + } + for (auto i = op.attrs_read.begin(); + i != op.attrs_read.end(); + ++i) { + ceph_assert(!op.errors.count(i->first)); // if read error better not have sent an attribute + if (!rop.to_read.count(i->first)) { + // We canceled this read! 
@see filter_read_op + dout(20) << __func__ << " to_read skipping" << dendl; + continue; + } + rop.complete[i->first].attrs = map(); + (*(rop.complete[i->first].attrs)).swap(i->second); + } + for (auto i = op.errors.begin(); + i != op.errors.end(); + ++i) { + rop.complete[i->first].errors.insert( + make_pair( + from, + i->second)); + dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl; + } + + map >::iterator siter = + shard_to_read_map.find(from); + ceph_assert(siter != shard_to_read_map.end()); + ceph_assert(siter->second.count(op.tid)); + siter->second.erase(op.tid); + + ceph_assert(rop.in_progress.count(from)); + rop.in_progress.erase(from); + unsigned is_complete = 0; + bool need_resend = false; + // For redundant reads check for completion as each shard comes in, + // or in a non-recovery read check for completion once all the shards read. + if (rop.do_redundant_reads || rop.in_progress.empty()) { + for (map::const_iterator iter = + rop.complete.begin(); + iter != rop.complete.end(); + ++iter) { + set have; + for (map::const_iterator j = + iter->second.returned.front().get<2>().begin(); + j != iter->second.returned.front().get<2>().end(); + ++j) { + have.insert(j->first.shard); + dout(20) << __func__ << " have shard=" << j->first.shard << dendl; + } + map>> dummy_minimum; + int err; + if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) { + dout(20) << __func__ << " minimum_to_decode failed" << dendl; + if (rop.in_progress.empty()) { + // If we don't have enough copies, try other pg_shard_ts if available. + // During recovery there may be multiple osds with copies of the same shard, + // so getting EIO from one may result in multiple passes through this code path. + if (!rop.do_redundant_reads) { + int r = send_all_remaining_reads(iter->first, rop); + if (r == 0) { + // We changed the rop's to_read and not incrementing is_complete + need_resend = true; + continue; + } + // Couldn't read any additional shards so handle as completed with errors + } + // We don't want to confuse clients / RBD with objectstore error + // values in particular ENOENT. We may have different error returns + // from different shards, so we'll return minimum_to_decode() error + // (usually EIO) to reader. It is likely an error here is due to a + // damaged pg. 
+ rop.complete[iter->first].r = err; + ++is_complete; + } + } else { + ceph_assert(rop.complete[iter->first].r == 0); + if (!rop.complete[iter->first].errors.empty()) { + if (cct->_conf->osd_read_ec_check_for_errors) { + dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl; + err = rop.complete[iter->first].errors.begin()->second; + rop.complete[iter->first].r = err; + } else { + get_parent()->clog_warn() << "Error(s) ignored for " + << iter->first << " enough copies available"; + dout(10) << __func__ << " Error(s) ignored for " << iter->first + << " enough copies available" << dendl; + rop.complete[iter->first].errors.clear(); + } + } + // avoid re-read for completed object as we may send remaining reads for uncopmpleted objects + rop.to_read.at(iter->first).need.clear(); + rop.to_read.at(iter->first).want_attrs = false; + ++is_complete; + } + } + } + if (need_resend) { + do_read_op(rop); + } else if (rop.in_progress.empty() || + is_complete == rop.complete.size()) { + dout(20) << __func__ << " Complete: " << rop << dendl; + rop.trace.event("ec read complete"); + complete_read_op(rop, m); + } else { + dout(10) << __func__ << " readop not complete: " << rop << dendl; + } +} + +void ECBackend::complete_read_op(ReadOp &rop, RecoveryMessages *m) +{ + map::iterator reqiter = + rop.to_read.begin(); + map::iterator resiter = + rop.complete.begin(); + ceph_assert(rop.to_read.size() == rop.complete.size()); + for (; reqiter != rop.to_read.end(); ++reqiter, ++resiter) { + if (reqiter->second.cb) { + pair arg( + m, resiter->second); + reqiter->second.cb->complete(arg); + reqiter->second.cb = nullptr; + } + } + // if the read op is over. clean all the data of this tid. + for (set::iterator iter = rop.in_progress.begin(); + iter != rop.in_progress.end(); + iter++) { + shard_to_read_map[*iter].erase(rop.tid); + } + rop.in_progress.clear(); + tid_to_read_map.erase(rop.tid); +} + +struct FinishReadOp : public GenContext { + ECBackend *ec; + ceph_tid_t tid; + FinishReadOp(ECBackend *ec, ceph_tid_t tid) : ec(ec), tid(tid) {} + void finish(ThreadPool::TPHandle &handle) override { + auto ropiter = ec->tid_to_read_map.find(tid); + ceph_assert(ropiter != ec->tid_to_read_map.end()); + int priority = ropiter->second.priority; + RecoveryMessages rm; + ec->complete_read_op(ropiter->second, &rm); + ec->dispatch_recovery_messages(rm, priority); + } +}; + +void ECBackend::filter_read_op( + const OSDMapRef& osdmap, + ReadOp &op) +{ + set to_cancel; + for (map >::iterator i = op.source_to_obj.begin(); + i != op.source_to_obj.end(); + ++i) { + if (osdmap->is_down(i->first.osd)) { + to_cancel.insert(i->second.begin(), i->second.end()); + op.in_progress.erase(i->first); + continue; + } + } + + if (to_cancel.empty()) + return; + + for (map >::iterator i = op.source_to_obj.begin(); + i != op.source_to_obj.end(); + ) { + for (set::iterator j = i->second.begin(); + j != i->second.end(); + ) { + if (to_cancel.count(*j)) + i->second.erase(j++); + else + ++j; + } + if (i->second.empty()) { + op.source_to_obj.erase(i++); + } else { + ceph_assert(!osdmap->is_down(i->first.osd)); + ++i; + } + } + + for (set::iterator i = to_cancel.begin(); + i != to_cancel.end(); + ++i) { + get_parent()->cancel_pull(*i); + + ceph_assert(op.to_read.count(*i)); + read_request_t &req = op.to_read.find(*i)->second; + dout(10) << __func__ << ": canceling " << req + << " for obj " << *i << dendl; + ceph_assert(req.cb); + delete req.cb; + req.cb = nullptr; + + op.to_read.erase(*i); + op.complete.erase(*i); + 
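+    // also drop any in-flight recovery state keyed on this object; the
+    // read that would have fed it has just been cancelled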
recovery_ops.erase(*i); + } + + if (op.in_progress.empty()) { + get_parent()->schedule_recovery_work( + get_parent()->bless_unlocked_gencontext( + new FinishReadOp(this, op.tid))); + } +} + +void ECBackend::check_recovery_sources(const OSDMapRef& osdmap) +{ + set tids_to_filter; + for (map >::iterator + i = shard_to_read_map.begin(); + i != shard_to_read_map.end(); + ) { + if (osdmap->is_down(i->first.osd)) { + tids_to_filter.insert(i->second.begin(), i->second.end()); + shard_to_read_map.erase(i++); + } else { + ++i; + } + } + for (set::iterator i = tids_to_filter.begin(); + i != tids_to_filter.end(); + ++i) { + map::iterator j = tid_to_read_map.find(*i); + ceph_assert(j != tid_to_read_map.end()); + filter_read_op(osdmap, j->second); + } +} + +void ECBackend::on_change() +{ + dout(10) << __func__ << dendl; + + completed_to = eversion_t(); + committed_to = eversion_t(); + pipeline_state.clear(); + waiting_reads.clear(); + waiting_state.clear(); + waiting_commit.clear(); + for (auto &&op: tid_to_op_map) { + cache.release_write_pin(op.second.pin); + } + tid_to_op_map.clear(); + + for (map::iterator i = tid_to_read_map.begin(); + i != tid_to_read_map.end(); + ++i) { + dout(10) << __func__ << ": cancelling " << i->second << dendl; + for (map::iterator j = + i->second.to_read.begin(); + j != i->second.to_read.end(); + ++j) { + delete j->second.cb; + j->second.cb = nullptr; + } + } + tid_to_read_map.clear(); + in_progress_client_reads.clear(); + shard_to_read_map.clear(); + clear_recovery_state(); +} + +void ECBackend::clear_recovery_state() +{ + recovery_ops.clear(); +} + +void ECBackend::dump_recovery_info(Formatter *f) const +{ + f->open_array_section("recovery_ops"); + for (map::const_iterator i = recovery_ops.begin(); + i != recovery_ops.end(); + ++i) { + f->open_object_section("op"); + i->second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("read_ops"); + for (map::const_iterator i = tid_to_read_map.begin(); + i != tid_to_read_map.end(); + ++i) { + f->open_object_section("read_op"); + i->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void ECBackend::submit_transaction( + const hobject_t &hoid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&t, + const eversion_t &trim_to, + const eversion_t &min_last_complete_ondisk, + vector&& log_entries, + std::optional &hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef client_op + ) +{ + ceph_assert(!tid_to_op_map.count(tid)); + Op *op = &(tid_to_op_map[tid]); + op->hoid = hoid; + op->delta_stats = delta_stats; + op->version = at_version; + op->trim_to = trim_to; + op->roll_forward_to = std::max(min_last_complete_ondisk, committed_to); + op->log_entries = log_entries; + std::swap(op->updated_hit_set_history, hset_history); + op->on_all_commit = on_all_commit; + op->tid = tid; + op->reqid = reqid; + op->client_op = client_op; + if (client_op) + op->trace = client_op->pg_trace; + +#ifdef HAVE_JAEGER + if (client_op->osd_parent_span) { + auto ec_sub_trans = jaeger_tracing::child_span("ECBackend::submit_transaction", client_op->osd_parent_span); + } +#endif + dout(10) << __func__ << ": op " << *op << " starting" << dendl; + start_rmw(op, std::move(t)); +} + +void ECBackend::call_write_ordered(std::function &&cb) { + if (!waiting_state.empty()) { + waiting_state.back().on_write.emplace_back(std::move(cb)); + } else if (!waiting_reads.empty()) { + waiting_reads.back().on_write.emplace_back(std::move(cb)); + } 
else { + // Nothing earlier in the pipeline, just call it + cb(); + } +} + +void ECBackend::get_all_avail_shards( + const hobject_t &hoid, + const set &error_shards, + set &have, + map &shards, + bool for_recovery) +{ + for (set::const_iterator i = + get_parent()->get_acting_shards().begin(); + i != get_parent()->get_acting_shards().end(); + ++i) { + dout(10) << __func__ << ": checking acting " << *i << dendl; + const pg_missing_t &missing = get_parent()->get_shard_missing(*i); + if (error_shards.find(*i) != error_shards.end()) + continue; + if (!missing.is_missing(hoid)) { + ceph_assert(!have.count(i->shard)); + have.insert(i->shard); + ceph_assert(!shards.count(i->shard)); + shards.insert(make_pair(i->shard, *i)); + } + } + + if (for_recovery) { + for (set::const_iterator i = + get_parent()->get_backfill_shards().begin(); + i != get_parent()->get_backfill_shards().end(); + ++i) { + if (error_shards.find(*i) != error_shards.end()) + continue; + if (have.count(i->shard)) { + ceph_assert(shards.count(i->shard)); + continue; + } + dout(10) << __func__ << ": checking backfill " << *i << dendl; + ceph_assert(!shards.count(i->shard)); + const pg_info_t &info = get_parent()->get_shard_info(*i); + const pg_missing_t &missing = get_parent()->get_shard_missing(*i); + if (hoid < info.last_backfill && + !missing.is_missing(hoid)) { + have.insert(i->shard); + shards.insert(make_pair(i->shard, *i)); + } + } + + map>::const_iterator miter = + get_parent()->get_missing_loc_shards().find(hoid); + if (miter != get_parent()->get_missing_loc_shards().end()) { + for (set::iterator i = miter->second.begin(); + i != miter->second.end(); + ++i) { + dout(10) << __func__ << ": checking missing_loc " << *i << dendl; + auto m = get_parent()->maybe_get_shard_missing(*i); + if (m) { + ceph_assert(!(*m).is_missing(hoid)); + } + if (error_shards.find(*i) != error_shards.end()) + continue; + have.insert(i->shard); + shards.insert(make_pair(i->shard, *i)); + } + } + } +} + +int ECBackend::get_min_avail_to_read_shards( + const hobject_t &hoid, + const set &want, + bool for_recovery, + bool do_redundant_reads, + map>> *to_read) +{ + // Make sure we don't do redundant reads for recovery + ceph_assert(!for_recovery || !do_redundant_reads); + + set have; + map shards; + set error_shards; + + get_all_avail_shards(hoid, error_shards, have, shards, for_recovery); + + map>> need; + int r = ec_impl->minimum_to_decode(want, have, &need); + if (r < 0) + return r; + + if (do_redundant_reads) { + vector> subchunks_list; + subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); + for (auto &&i: have) { + need[i] = subchunks_list; + } + } + + if (!to_read) + return 0; + + for (auto &&i:need) { + ceph_assert(shards.count(shard_id_t(i.first))); + to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second)); + } + return 0; +} + +int ECBackend::get_remaining_shards( + const hobject_t &hoid, + const set &avail, + const set &want, + const read_result_t &result, + map>> *to_read, + bool for_recovery) +{ + ceph_assert(to_read); + + set have; + map shards; + set error_shards; + for (auto &p : result.errors) { + error_shards.insert(p.first); + } + + get_all_avail_shards(hoid, error_shards, have, shards, for_recovery); + + map>> need; + int r = ec_impl->minimum_to_decode(want, have, &need); + if (r < 0) { + dout(0) << __func__ << " not enough shards left to try for " << hoid + << " read result was " << result << dendl; + return -EIO; + } + + set shards_left; + for (auto p : need) { + if (avail.find(p.first) == avail.end()) { + 
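+      // this shard is needed to decode but has not been attempted yet
+      // (avail holds the shards already read, including those that errored),
+      // so queue it for the follow-up read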
shards_left.insert(p.first); + } + } + + vector> subchunks; + subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); + for (set::iterator i = shards_left.begin(); + i != shards_left.end(); + ++i) { + ceph_assert(shards.count(shard_id_t(*i))); + ceph_assert(avail.find(*i) == avail.end()); + to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks)); + } + return 0; +} + +void ECBackend::start_read_op( + int priority, + map> &want_to_read, + map &to_read, + OpRequestRef _op, + bool do_redundant_reads, + bool for_recovery) +{ + ceph_tid_t tid = get_parent()->get_tid(); + ceph_assert(!tid_to_read_map.count(tid)); + auto &op = tid_to_read_map.emplace( + tid, + ReadOp( + priority, + tid, + do_redundant_reads, + for_recovery, + _op, + std::move(want_to_read), + std::move(to_read))).first->second; + dout(10) << __func__ << ": starting " << op << dendl; + if (_op) { + op.trace = _op->pg_trace; + op.trace.event("start ec read"); + } + do_read_op(op); +} + +void ECBackend::do_read_op(ReadOp &op) +{ + int priority = op.priority; + ceph_tid_t tid = op.tid; + + dout(10) << __func__ << ": starting read " << op << dendl; + + map messages; + for (map::iterator i = op.to_read.begin(); + i != op.to_read.end(); + ++i) { + bool need_attrs = i->second.want_attrs; + + for (auto j = i->second.need.begin(); + j != i->second.need.end(); + ++j) { + if (need_attrs) { + messages[j->first].attrs_to_read.insert(i->first); + need_attrs = false; + } + messages[j->first].subchunks[i->first] = j->second; + op.obj_to_source[i->first].insert(j->first); + op.source_to_obj[j->first].insert(i->first); + } + for (list >::const_iterator j = + i->second.to_read.begin(); + j != i->second.to_read.end(); + ++j) { + pair chunk_off_len = + sinfo.aligned_offset_len_to_chunk(make_pair(j->get<0>(), j->get<1>())); + for (auto k = i->second.need.begin(); + k != i->second.need.end(); + ++k) { + messages[k->first].to_read[i->first].push_back( + boost::make_tuple( + chunk_off_len.first, + chunk_off_len.second, + j->get<2>())); + } + ceph_assert(!need_attrs); + } + } + + std::vector> m; + m.reserve(messages.size()); + for (map::iterator i = messages.begin(); + i != messages.end(); + ++i) { + op.in_progress.insert(i->first); + shard_to_read_map[i->first].insert(op.tid); + i->second.tid = tid; + MOSDECSubOpRead *msg = new MOSDECSubOpRead; + msg->set_priority(priority); + msg->pgid = spg_t( + get_parent()->whoami_spg_t().pgid, + i->first.shard); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_interval_start_epoch(); + msg->op = i->second; + msg->op.from = get_parent()->whoami_shard(); + msg->op.tid = tid; + if (op.trace) { + // initialize a child span for this shard + msg->trace.init("ec sub read", nullptr, &op.trace); + msg->trace.keyval("shard", i->first.shard.id); + } + m.push_back(std::make_pair(i->first.osd, msg)); + } + if (!m.empty()) { + get_parent()->send_message_osd_cluster(m, get_osdmap_epoch()); + } + + dout(10) << __func__ << ": started " << op << dendl; +} + +ECUtil::HashInfoRef ECBackend::get_hash_info( + const hobject_t &hoid, bool create, const map *attrs) +{ + dout(10) << __func__ << ": Getting attr on " << hoid << dendl; + ECUtil::HashInfoRef ref = unstable_hashinfo_registry.lookup(hoid); + if (!ref) { + dout(10) << __func__ << ": not in cache " << hoid << dendl; + struct stat st; + int r = store->stat( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + &st); + ECUtil::HashInfo hinfo(ec_impl->get_chunk_count()); + if (r >= 0) { + dout(10) << __func__ << 
": found on disk, size " << st.st_size << dendl; + bufferlist bl; + if (attrs) { + map::const_iterator k = attrs->find(ECUtil::get_hinfo_key()); + if (k == attrs->end()) { + dout(5) << __func__ << " " << hoid << " missing hinfo attr" << dendl; + } else { + bl.push_back(k->second); + } + } else { + r = store->getattr( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ECUtil::get_hinfo_key(), + bl); + if (r < 0) { + dout(5) << __func__ << ": getattr failed: " << cpp_strerror(r) << dendl; + bl.clear(); // just in case + } + } + if (bl.length() > 0) { + auto bp = bl.cbegin(); + try { + decode(hinfo, bp); + } catch(...) { + dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl; + return ECUtil::HashInfoRef(); + } + if (hinfo.get_total_chunk_size() != (uint64_t)st.st_size) { + dout(0) << __func__ << ": Mismatch of total_chunk_size " + << hinfo.get_total_chunk_size() << dendl; + return ECUtil::HashInfoRef(); + } + } else if (st.st_size > 0) { // If empty object and no hinfo, create it + return ECUtil::HashInfoRef(); + } + } else if (r != -ENOENT || !create) { + derr << __func__ << ": stat " << hoid << " failed: " << cpp_strerror(r) + << dendl; + return ECUtil::HashInfoRef(); + } + ref = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo); + } + return ref; +} + +void ECBackend::start_rmw(Op *op, PGTransactionUPtr &&t) +{ + ceph_assert(op); + + op->plan = ECTransaction::get_write_plan( + sinfo, + std::move(t), + [&](const hobject_t &i) { + ECUtil::HashInfoRef ref = get_hash_info(i, true); + if (!ref) { + derr << __func__ << ": get_hash_info(" << i << ")" + << " returned a null pointer and there is no " + << " way to recover from such an error in this " + << " context" << dendl; + ceph_abort(); + } + return ref; + }, + get_parent()->get_dpp()); + + dout(10) << __func__ << ": " << *op << dendl; + + waiting_state.push_back(*op); + check_ops(); +} + +bool ECBackend::try_state_to_reads() +{ + if (waiting_state.empty()) + return false; + + Op *op = &(waiting_state.front()); + if (op->requires_rmw() && pipeline_state.cache_invalid()) { + ceph_assert(get_parent()->get_pool().allows_ecoverwrites()); + dout(20) << __func__ << ": blocking " << *op + << " because it requires an rmw and the cache is invalid " + << pipeline_state + << dendl; + return false; + } + + if (!pipeline_state.caching_enabled()) { + op->using_cache = false; + } else if (op->invalidates_cache()) { + dout(20) << __func__ << ": invalidating cache after this op" + << dendl; + pipeline_state.invalidate(); + } + + waiting_state.pop_front(); + waiting_reads.push_back(*op); + + if (op->using_cache) { + cache.open_write_pin(op->pin); + + extent_set empty; + for (auto &&hpair: op->plan.will_write) { + auto to_read_plan_iter = op->plan.to_read.find(hpair.first); + const extent_set &to_read_plan = + to_read_plan_iter == op->plan.to_read.end() ? 
+ empty : + to_read_plan_iter->second; + + extent_set remote_read = cache.reserve_extents_for_rmw( + hpair.first, + op->pin, + hpair.second, + to_read_plan); + + extent_set pending_read = to_read_plan; + pending_read.subtract(remote_read); + + if (!remote_read.empty()) { + op->remote_read[hpair.first] = std::move(remote_read); + } + if (!pending_read.empty()) { + op->pending_read[hpair.first] = std::move(pending_read); + } + } + } else { + op->remote_read = op->plan.to_read; + } + + dout(10) << __func__ << ": " << *op << dendl; + + if (!op->remote_read.empty()) { + ceph_assert(get_parent()->get_pool().allows_ecoverwrites()); + objects_read_async_no_cache( + op->remote_read, + [this, op](map > &&results) { + for (auto &&i: results) { + op->remote_read_result.emplace(i.first, i.second.second); + } + check_ops(); + }); + } + + return true; +} + +bool ECBackend::try_reads_to_commit() +{ + if (waiting_reads.empty()) + return false; + Op *op = &(waiting_reads.front()); + if (op->read_in_progress()) + return false; + waiting_reads.pop_front(); + waiting_commit.push_back(*op); + + dout(10) << __func__ << ": starting commit on " << *op << dendl; + dout(20) << __func__ << ": " << cache << dendl; + + get_parent()->apply_stats( + op->hoid, + op->delta_stats); + + if (op->using_cache) { + for (auto &&hpair: op->pending_read) { + op->remote_read_result[hpair.first].insert( + cache.get_remaining_extents_for_rmw( + hpair.first, + op->pin, + hpair.second)); + } + op->pending_read.clear(); + } else { + ceph_assert(op->pending_read.empty()); + } + + map trans; + for (set::const_iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + trans[i->shard]; + } + + op->trace.event("start ec write"); + + map written; + if (op->plan.t) { + ECTransaction::generate_transactions( + op->plan, + ec_impl, + get_parent()->get_info().pgid.pgid, + sinfo, + op->remote_read_result, + op->log_entries, + &written, + &trans, + &(op->temp_added), + &(op->temp_cleared), + get_parent()->get_dpp(), + get_osdmap()->require_osd_release); + } + + dout(20) << __func__ << ": " << cache << dendl; + dout(20) << __func__ << ": written: " << written << dendl; + dout(20) << __func__ << ": op: " << *op << dendl; + + if (!get_parent()->get_pool().allows_ecoverwrites()) { + for (auto &&i: op->log_entries) { + if (i.requires_kraken()) { + derr << __func__ << ": log entry " << i << " requires kraken" + << " but overwrites are not enabled!" 
<< dendl; + ceph_abort(); + } + } + } + + map written_set; + for (auto &&i: written) { + written_set[i.first] = i.second.get_interval_set(); + } + dout(20) << __func__ << ": written_set: " << written_set << dendl; + ceph_assert(written_set == op->plan.will_write); + + if (op->using_cache) { + for (auto &&hpair: written) { + dout(20) << __func__ << ": " << hpair << dendl; + cache.present_rmw_update(hpair.first, op->pin, hpair.second); + } + } + op->remote_read.clear(); + op->remote_read_result.clear(); + + ObjectStore::Transaction empty; + bool should_write_local = false; + ECSubWrite local_write_op; + std::vector> messages; + messages.reserve(get_parent()->get_acting_recovery_backfill_shards().size()); + set backfill_shards = get_parent()->get_backfill_shards(); + for (set::const_iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + op->pending_apply.insert(*i); + op->pending_commit.insert(*i); + map::iterator iter = + trans.find(i->shard); + ceph_assert(iter != trans.end()); + bool should_send = get_parent()->should_send_op(*i, op->hoid); + const pg_stat_t &stats = + (should_send || !backfill_shards.count(*i)) ? + get_info().stats : + parent->get_shard_info().find(*i)->second.stats; + + ECSubWrite sop( + get_parent()->whoami_shard(), + op->tid, + op->reqid, + op->hoid, + stats, + should_send ? iter->second : empty, + op->version, + op->trim_to, + op->roll_forward_to, + op->log_entries, + op->updated_hit_set_history, + op->temp_added, + op->temp_cleared, + !should_send); + + ZTracer::Trace trace; + if (op->trace) { + // initialize a child span for this shard + trace.init("ec sub write", nullptr, &op->trace); + trace.keyval("shard", i->shard.id); + } + + if (*i == get_parent()->whoami_shard()) { + should_write_local = true; + local_write_op.claim(sop); + } else { + MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop); + r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard); + r->map_epoch = get_osdmap_epoch(); + r->min_epoch = get_parent()->get_interval_start_epoch(); + r->trace = trace; + messages.push_back(std::make_pair(i->osd, r)); + } + } + +#ifdef HAVE_JAEGER + if (op->client_op->osd_parent_span) { + auto sub_write_span = jaeger_tracing::child_span("EC sub write", op->client_op->osd_parent_span); + } +#endif + if (!messages.empty()) { + get_parent()->send_message_osd_cluster(messages, get_osdmap_epoch()); + } + + if (should_write_local) { + handle_sub_write( + get_parent()->whoami_shard(), + op->client_op, + local_write_op, + op->trace); + } + + for (auto i = op->on_write.begin(); + i != op->on_write.end(); + op->on_write.erase(i++)) { + (*i)(); + } + + return true; +} + +bool ECBackend::try_finish_rmw() +{ + if (waiting_commit.empty()) + return false; + Op *op = &(waiting_commit.front()); + if (op->write_in_progress()) + return false; + waiting_commit.pop_front(); + + dout(10) << __func__ << ": " << *op << dendl; + dout(20) << __func__ << ": " << cache << dendl; + + if (op->roll_forward_to > completed_to) + completed_to = op->roll_forward_to; + if (op->version > committed_to) + committed_to = op->version; + + if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + if (op->version > get_parent()->get_log().get_can_rollback_to() && + waiting_reads.empty() && + waiting_commit.empty()) { + // submit a dummy transaction to kick the rollforward + auto tid = get_parent()->get_tid(); + Op *nop = &(tid_to_op_map[tid]); + nop->hoid = op->hoid; + nop->trim_to = op->trim_to; + 
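+      // the dummy op carries no transaction or log entries; it exists only
+      // to advance roll_forward_to to this op's version on the shards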
nop->roll_forward_to = op->version; + nop->tid = tid; + nop->reqid = op->reqid; + waiting_reads.push_back(*nop); + } + } + + if (op->using_cache) { + cache.release_write_pin(op->pin); + } + tid_to_op_map.erase(op->tid); + + if (waiting_reads.empty() && + waiting_commit.empty()) { + pipeline_state.clear(); + dout(20) << __func__ << ": clearing pipeline_state " + << pipeline_state + << dendl; + } + return true; +} + +void ECBackend::check_ops() +{ + while (try_state_to_reads() || + try_reads_to_commit() || + try_finish_rmw()); +} + +int ECBackend::objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) +{ + return -EOPNOTSUPP; +} + +void ECBackend::objects_read_async( + const hobject_t &hoid, + const list, + pair > > &to_read, + Context *on_complete, + bool fast_read) +{ + map > > + reads; + + uint32_t flags = 0; + extent_set es; + for (list, + pair > >::const_iterator i = + to_read.begin(); + i != to_read.end(); + ++i) { + pair tmp = + sinfo.offset_len_to_stripe_bounds( + make_pair(i->first.get<0>(), i->first.get<1>())); + + es.union_insert(tmp.first, tmp.second); + flags |= i->first.get<2>(); + } + + if (!es.empty()) { + auto &offsets = reads[hoid]; + for (auto j = es.begin(); + j != es.end(); + ++j) { + offsets.push_back( + boost::make_tuple( + j.get_start(), + j.get_len(), + flags)); + } + } + + struct cb { + ECBackend *ec; + hobject_t hoid; + list, + pair > > to_read; + unique_ptr on_complete; + cb(const cb&) = delete; + cb(cb &&) = default; + cb(ECBackend *ec, + const hobject_t &hoid, + const list, + pair > > &to_read, + Context *on_complete) + : ec(ec), + hoid(hoid), + to_read(to_read), + on_complete(on_complete) {} + void operator()(map > &&results) { + auto dpp = ec->get_parent()->get_dpp(); + ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results + << dendl; + ldpp_dout(dpp, 20) << "objects_read_async_cb: cache: " << ec->cache + << dendl; + + auto &got = results[hoid]; + + int r = 0; + for (auto &&read: to_read) { + if (got.first < 0) { + if (read.second.second) { + read.second.second->complete(got.first); + } + if (r == 0) + r = got.first; + } else { + ceph_assert(read.second.first); + uint64_t offset = read.first.get<0>(); + uint64_t length = read.first.get<1>(); + auto range = got.second.get_containing_range(offset, length); + ceph_assert(range.first != range.second); + ceph_assert(range.first.get_off() <= offset); + ldpp_dout(dpp, 30) << "offset: " << offset << dendl; + ldpp_dout(dpp, 30) << "range offset: " << range.first.get_off() << dendl; + ldpp_dout(dpp, 30) << "length: " << length << dendl; + ldpp_dout(dpp, 30) << "range length: " << range.first.get_len() << dendl; + ceph_assert( + (offset + length) <= + (range.first.get_off() + range.first.get_len())); + read.second.first->substr_of( + range.first.get_val(), + offset - range.first.get_off(), + length); + if (read.second.second) { + read.second.second->complete(length); + read.second.second = nullptr; + } + } + } + to_read.clear(); + if (on_complete) { + on_complete.release()->complete(r); + } + } + ~cb() { + for (auto &&i: to_read) { + delete i.second.second; + } + to_read.clear(); + } + }; + objects_read_and_reconstruct( + reads, + fast_read, + make_gen_lambda_context< + map > &&, cb>( + cb(this, + hoid, + to_read, + on_complete))); +} + +struct CallClientContexts : + public GenContext &> { + hobject_t hoid; + ECBackend *ec; + ECBackend::ClientAsyncReadStatus *status; + list > to_read; + CallClientContexts( + hobject_t hoid, + ECBackend *ec, + 
ECBackend::ClientAsyncReadStatus *status, + const list > &to_read) + : hoid(hoid), ec(ec), status(status), to_read(to_read) {} + void finish(pair &in) override { + ECBackend::read_result_t &res = in.second; + extent_map result; + if (res.r != 0) + goto out; + ceph_assert(res.returned.size() == to_read.size()); + ceph_assert(res.errors.empty()); + for (auto &&read: to_read) { + pair adjusted = + ec->sinfo.offset_len_to_stripe_bounds( + make_pair(read.get<0>(), read.get<1>())); + ceph_assert(res.returned.front().get<0>() == adjusted.first && + res.returned.front().get<1>() == adjusted.second); + map to_decode; + bufferlist bl; + for (map::iterator j = + res.returned.front().get<2>().begin(); + j != res.returned.front().get<2>().end(); + ++j) { + to_decode[j->first.shard] = std::move(j->second); + } + int r = ECUtil::decode( + ec->sinfo, + ec->ec_impl, + to_decode, + &bl); + if (r < 0) { + res.r = r; + goto out; + } + bufferlist trimmed; + trimmed.substr_of( + bl, + read.get<0>() - adjusted.first, + std::min(read.get<1>(), + bl.length() - (read.get<0>() - adjusted.first))); + result.insert( + read.get<0>(), trimmed.length(), std::move(trimmed)); + res.returned.pop_front(); + } +out: + status->complete_object(hoid, res.r, std::move(result)); + ec->kick_reads(); + } +}; + +void ECBackend::objects_read_and_reconstruct( + const map > + > &reads, + bool fast_read, + GenContextURef > &&> &&func) +{ + in_progress_client_reads.emplace_back( + reads.size(), std::move(func)); + if (!reads.size()) { + kick_reads(); + return; + } + + map> obj_want_to_read; + set want_to_read; + get_want_to_read_shards(&want_to_read); + + map for_read_op; + for (auto &&to_read: reads) { + map>> shards; + int r = get_min_avail_to_read_shards( + to_read.first, + want_to_read, + false, + fast_read, + &shards); + ceph_assert(r == 0); + + CallClientContexts *c = new CallClientContexts( + to_read.first, + this, + &(in_progress_client_reads.back()), + to_read.second); + for_read_op.insert( + make_pair( + to_read.first, + read_request_t( + to_read.second, + shards, + false, + c))); + obj_want_to_read.insert(make_pair(to_read.first, want_to_read)); + } + + start_read_op( + CEPH_MSG_PRIO_DEFAULT, + obj_want_to_read, + for_read_op, + OpRequestRef(), + fast_read, false); + return; +} + + +int ECBackend::send_all_remaining_reads( + const hobject_t &hoid, + ReadOp &rop) +{ + set already_read; + const set& ots = rop.obj_to_source[hoid]; + for (set::iterator i = ots.begin(); i != ots.end(); ++i) + already_read.insert(i->shard); + dout(10) << __func__ << " have/error shards=" << already_read << dendl; + map>> shards; + int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid], + rop.complete[hoid], &shards, rop.for_recovery); + if (r) + return r; + + list > offsets = + rop.to_read.find(hoid)->second.to_read; + GenContext &> *c = + rop.to_read.find(hoid)->second.cb; + + // (Note cuixf) If we need to read attrs and we read failed, try to read again. 
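+  // re-request attrs only if the original read wanted them and no usable
+  // (non-empty) attrs have been received so far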
+ bool want_attrs = + rop.to_read.find(hoid)->second.want_attrs && + (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty()); + if (want_attrs) { + dout(10) << __func__ << " want attrs again" << dendl; + } + + rop.to_read.erase(hoid); + rop.to_read.insert(make_pair( + hoid, + read_request_t( + offsets, + shards, + want_attrs, + c))); + return 0; +} + +int ECBackend::objects_get_attrs( + const hobject_t &hoid, + map *out) +{ + int r = store->getattrs( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + *out); + if (r < 0) + return r; + + for (map::iterator i = out->begin(); + i != out->end(); + ) { + if (ECUtil::is_hinfo_key_string(i->first)) + out->erase(i++); + else + ++i; + } + return r; +} + +void ECBackend::rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t) +{ + ceph_assert(old_size % sinfo.get_stripe_width() == 0); + t->truncate( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + sinfo.aligned_logical_offset_to_chunk_offset( + old_size)); +} + +int ECBackend::be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) +{ + dout(10) << __func__ << " " << poid << " pos " << pos << dendl; + int r; + + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + utime_t sleeptime; + sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep); + if (sleeptime != utime_t()) { + lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl; + sleeptime.sleep(); + } + + if (pos.data_pos == 0) { + pos.data_hash = bufferhash(-1); + } + + uint64_t stride = cct->_conf->osd_deep_scrub_stride; + if (stride % sinfo.get_chunk_size()) + stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size()); + + bufferlist bl; + r = store->read( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + pos.data_pos, + stride, bl, + fadvise_flags); + if (r < 0) { + dout(20) << __func__ << " " << poid << " got " + << r << " on read, read_error" << dendl; + o.read_error = true; + return 0; + } + if (bl.length() % sinfo.get_chunk_size()) { + dout(20) << __func__ << " " << poid << " got " + << r << " on read, not chunk size " << sinfo.get_chunk_size() << " aligned" + << dendl; + o.read_error = true; + return 0; + } + if (r > 0) { + pos.data_hash << bl; + } + pos.data_pos += r; + if (r == (int)stride) { + return -EINPROGRESS; + } + + ECUtil::HashInfoRef hinfo = get_hash_info(poid, false, &o.attrs); + if (!hinfo) { + dout(0) << "_scan_list " << poid << " could not retrieve hash info" << dendl; + o.read_error = true; + o.digest_present = false; + return 0; + } else { + if (!get_parent()->get_pool().allows_ecoverwrites()) { + if (!hinfo->has_chunk_hash()) { + dout(0) << "_scan_list " << poid << " got invalid hash info" << dendl; + o.ec_size_mismatch = true; + return 0; + } + if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) { + dout(0) << "_scan_list " << poid << " got incorrect size on read 0x" + << std::hex << pos + << " expected 0x" << hinfo->get_total_chunk_size() << std::dec + << dendl; + o.ec_size_mismatch = true; + return 0; + } + + if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != + pos.data_hash.digest()) { + dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x" + << std::hex << pos.data_hash.digest() << " != expected 0x" + << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) + << std::dec << dendl; + 
o.ec_hash_mismatch = true; + return 0; + } + + /* We checked above that we match our own stored hash. We cannot + * send a hash of the actual object, so instead we simply send + * our locally stored hash of shard 0 on the assumption that if + * we match our chunk hash and our recollection of the hash for + * chunk 0 matches that of our peers, there is likely no corruption. + */ + o.digest = hinfo->get_chunk_hash(0); + o.digest_present = true; + } else { + /* Hack! We must be using partial overwrites, and partial overwrites + * don't support deep-scrub yet + */ + o.digest = 0; + o.digest_present = true; + } + } + + o.omap_digest = -1; + o.omap_digest_present = true; + return 0; +} diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h new file mode 100644 index 000000000..45495376a --- /dev/null +++ b/src/osd/ECBackend.h @@ -0,0 +1,686 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef ECBACKEND_H +#define ECBACKEND_H + +#include +#include + +#include "OSD.h" +#include "PGBackend.h" +#include "erasure-code/ErasureCodeInterface.h" +#include "ECUtil.h" +#include "ECTransaction.h" +#include "ExtentCache.h" + +//forward declaration +struct ECSubWrite; +struct ECSubWriteReply; +struct ECSubRead; +struct ECSubReadReply; + +struct RecoveryMessages; +class ECBackend : public PGBackend { +public: + RecoveryHandle *open_recovery_op() override; + + void run_recovery_op( + RecoveryHandle *h, + int priority + ) override; + + int recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *h + ) override; + + bool _handle_message( + OpRequestRef op + ) override; + bool can_handle_while_inactive( + OpRequestRef op + ) override; + friend struct SubWriteApplied; + friend struct SubWriteCommitted; + void sub_write_committed( + ceph_tid_t tid, + eversion_t version, + eversion_t last_complete, + const ZTracer::Trace &trace); + void handle_sub_write( + pg_shard_t from, + OpRequestRef msg, + ECSubWrite &op, + const ZTracer::Trace &trace + ); + void handle_sub_read( + pg_shard_t from, + const ECSubRead &op, + ECSubReadReply *reply, + const ZTracer::Trace &trace + ); + void handle_sub_write_reply( + pg_shard_t from, + const ECSubWriteReply &op, + const ZTracer::Trace &trace + ); + void handle_sub_read_reply( + pg_shard_t from, + ECSubReadReply &op, + RecoveryMessages *m, + const ZTracer::Trace &trace + ); + + /// @see ReadOp below + void check_recovery_sources(const OSDMapRef& osdmap) override; + + void on_change() override; + void clear_recovery_state() override; + + void dump_recovery_info(ceph::Formatter *f) const override; + + void call_write_ordered(std::function &&cb) override; + + void submit_transaction( + const hobject_t &hoid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&t, + const eversion_t &trim_to, + const eversion_t &min_last_complete_ondisk, + std::vector&& log_entries, + std::optional &hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef op + ) override; + + int objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + 
ceph::buffer::list *bl) override; + + /** + * Async read mechanism + * + * Async reads use the same async read mechanism as does recovery. + * CallClientContexts is responsible for reconstructing the response + * buffer as well as for calling the callbacks. + * + * One tricky bit is that two reads may possibly not read from the same + * std::set of replicas. This could result in two reads completing in the + * wrong (from the interface user's point of view) order. Thus, we + * maintain a queue of in progress reads (@see in_progress_client_reads) + * to ensure that we always call the completion callback in order. + * + * Another subtly is that while we may read a degraded object, we will + * still only perform a client read from shards in the acting std::set. This + * ensures that we won't ever have to restart a client initiated read in + * check_recovery_sources. + */ + void objects_read_and_reconstruct( + const std::map > + > &reads, + bool fast_read, + GenContextURef > &&> &&func); + + friend struct CallClientContexts; + struct ClientAsyncReadStatus { + unsigned objects_to_read; + GenContextURef > &&> func; + std::map > results; + explicit ClientAsyncReadStatus( + unsigned objects_to_read, + GenContextURef > &&> &&func) + : objects_to_read(objects_to_read), func(std::move(func)) {} + void complete_object( + const hobject_t &hoid, + int err, + extent_map &&buffers) { + ceph_assert(objects_to_read); + --objects_to_read; + ceph_assert(!results.count(hoid)); + results.emplace(hoid, std::make_pair(err, std::move(buffers))); + } + bool is_complete() const { + return objects_to_read == 0; + } + void run() { + func.release()->complete(std::move(results)); + } + }; + std::list in_progress_client_reads; + void objects_read_async( + const hobject_t &hoid, + const std::list, + std::pair > > &to_read, + Context *on_complete, + bool fast_read = false) override; + + template + void objects_read_async_no_cache( + const std::map &to_read, + Func &&on_complete) { + std::map > > _to_read; + for (auto &&hpair: to_read) { + auto &l = _to_read[hpair.first]; + for (auto extent: hpair.second) { + l.emplace_back(extent.first, extent.second, 0); + } + } + objects_read_and_reconstruct( + _to_read, + false, + make_gen_lambda_context< + std::map > &&, Func>( + std::forward(on_complete))); + } + void kick_reads() { + while (in_progress_client_reads.size() && + in_progress_client_reads.front().is_complete()) { + in_progress_client_reads.front().run(); + in_progress_client_reads.pop_front(); + } + } + +private: + friend struct ECRecoveryHandle; + uint64_t get_recovery_chunk_size() const { + return round_up_to(cct->_conf->osd_recovery_max_chunk, + sinfo.get_stripe_width()); + } + + void get_want_to_read_shards(std::set *want_to_read) const { + const std::vector &chunk_mapping = ec_impl->get_chunk_mapping(); + for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) { + int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i; + want_to_read->insert(chunk); + } + } + + /** + * Recovery + * + * Recovery uses the same underlying read mechanism as client reads + * with the slight difference that recovery reads may come from non + * acting shards. Thus, check_recovery_sources may wind up calling + * cancel_pull for a read originating with RecoveryOp. + * + * The recovery process is expressed as a state machine: + * - IDLE: Nothing is currently in progress, reads will be started and + * we will transition to READING + * - READING: We are awaiting a pending read op. 
Once complete, we will + * decode the buffers and proceed to WRITING + * - WRITING: We are awaiting a completed push. Once complete, we will + * either transition to COMPLETE or to IDLE to continue. + * - COMPLETE: complete + * + * We use the existing Push and PushReply messages and structures to + * handle actually shuffling the data over to the replicas. recovery_info + * and recovery_progress are expressed in terms of the logical offset + * space except for data_included which is in terms of the chunked object + * space (to match the passed buffer). + * + * xattrs are requested on the first read and used to initialize the + * object_context if missing on completion of the first read. + * + * In order to batch up reads and writes, we batch Push, PushReply, + * Transaction, and reads in a RecoveryMessages object which is passed + * among the recovery methods. + */ + struct RecoveryOp { + hobject_t hoid; + eversion_t v; + std::set missing_on; + std::set missing_on_shards; + + ObjectRecoveryInfo recovery_info; + ObjectRecoveryProgress recovery_progress; + + enum state_t { IDLE, READING, WRITING, COMPLETE } state; + + static const char* tostr(state_t state) { + switch (state) { + case ECBackend::RecoveryOp::IDLE: + return "IDLE"; + case ECBackend::RecoveryOp::READING: + return "READING"; + case ECBackend::RecoveryOp::WRITING: + return "WRITING"; + case ECBackend::RecoveryOp::COMPLETE: + return "COMPLETE"; + default: + ceph_abort(); + return ""; + } + } + + // must be filled if state == WRITING + std::map returned_data; + std::map xattrs; + ECUtil::HashInfoRef hinfo; + ObjectContextRef obc; + std::set waiting_on_pushes; + + // valid in state READING + std::pair extent_requested; + + void dump(ceph::Formatter *f) const; + + RecoveryOp() : state(IDLE) {} + }; + friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs); + std::map recovery_ops; + + void continue_recovery_op( + RecoveryOp &op, + RecoveryMessages *m); + void dispatch_recovery_messages(RecoveryMessages &m, int priority); + friend struct OnRecoveryReadComplete; + void handle_recovery_read_complete( + const hobject_t &hoid, + boost::tuple > &to_read, + std::optional > attrs, + RecoveryMessages *m); + void handle_recovery_push( + const PushOp &op, + RecoveryMessages *m, + bool is_repair); + void handle_recovery_push_reply( + const PushReplyOp &op, + pg_shard_t from, + RecoveryMessages *m); + void get_all_avail_shards( + const hobject_t &hoid, + const std::set &error_shards, + std::set &have, + std::map &shards, + bool for_recovery); + +public: + /** + * Low level async read mechanism + * + * To avoid duplicating the logic for requesting and waiting for + * multiple object shards, there is a common async read mechanism + * taking a std::map of hobject_t->read_request_t which defines callbacks + * taking read_result_ts as arguments. + * + * tid_to_read_map gives open read ops. check_recovery_sources uses + * shard_to_read_map and ReadOp::source_to_obj to restart reads + * involving down osds. + * + * The user is responsible for specifying replicas on which to read + * and for reassembling the buffer on the other side since client + * reads require the original object buffer while recovery only needs + * the missing pieces. + * + * Rather than handling reads on the primary directly, we simply send + * ourselves a message. This avoids a dedicated primary path for that + * part. 
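+   *
+   * As a rough illustration (not normative): recovering a single missing
+   * object starts in IDLE, issues reads for the next
+   * get_recovery_chunk_size() extent and moves to READING; once the shard
+   * reads return, the buffers are decoded and pushed to the missing shards
+   * in WRITING; when all pushes are acknowledged the op either returns to
+   * IDLE for the next extent or finishes in COMPLETE.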
+ */ + struct read_result_t { + int r; + std::map errors; + std::optional > attrs; + std::list< + boost::tuple< + uint64_t, uint64_t, std::map > > returned; + read_result_t() : r(0) {} + }; + struct read_request_t { + const std::list > to_read; + std::map>> need; + bool want_attrs; + GenContext &> *cb; + read_request_t( + const std::list > &to_read, + const std::map>> &need, + bool want_attrs, + GenContext &> *cb) + : to_read(to_read), need(need), want_attrs(want_attrs), + cb(cb) {} + }; + friend ostream &operator<<(ostream &lhs, const read_request_t &rhs); + + struct ReadOp { + int priority; + ceph_tid_t tid; + OpRequestRef op; // may be null if not on behalf of a client + // True if redundant reads are issued, false otherwise, + // this is useful to tradeoff some resources (redundant ops) for + // low latency read, especially on relatively idle cluster + bool do_redundant_reads; + // True if reading for recovery which could possibly reading only a subset + // of the available shards. + bool for_recovery; + + ZTracer::Trace trace; + + std::map> want_to_read; + std::map to_read; + std::map complete; + + std::map> obj_to_source; + std::map > source_to_obj; + + void dump(ceph::Formatter *f) const; + + std::set in_progress; + + ReadOp( + int priority, + ceph_tid_t tid, + bool do_redundant_reads, + bool for_recovery, + OpRequestRef op, + std::map> &&_want_to_read, + std::map &&_to_read) + : priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads), + for_recovery(for_recovery), want_to_read(std::move(_want_to_read)), + to_read(std::move(_to_read)) { + for (auto &&hpair: to_read) { + auto &returned = complete[hpair.first].returned; + for (auto &&extent: hpair.second.to_read) { + returned.push_back( + boost::make_tuple( + extent.get<0>(), + extent.get<1>(), + std::map())); + } + } + } + ReadOp() = delete; + ReadOp(const ReadOp &) = default; + ReadOp(ReadOp &&) = default; + }; + friend struct FinishReadOp; + void filter_read_op( + const OSDMapRef& osdmap, + ReadOp &op); + void complete_read_op(ReadOp &rop, RecoveryMessages *m); + friend ostream &operator<<(ostream &lhs, const ReadOp &rhs); + std::map tid_to_read_map; + std::map > shard_to_read_map; + void start_read_op( + int priority, + std::map> &want_to_read, + std::map &to_read, + OpRequestRef op, + bool do_redundant_reads, bool for_recovery); + + void do_read_op(ReadOp &rop); + int send_all_remaining_reads( + const hobject_t &hoid, + ReadOp &rop); + + + /** + * Client writes + * + * ECTransaction is responsible for generating a transaction for + * each shard to which we need to send the write. As required + * by the PGBackend interface, the ECBackend write mechanism + * passes trim information with the write and last_complete back + * with the reply. + * + * As with client reads, there is a possibility of out-of-order + * completions. Thus, callbacks and completion are called in order + * on the writing std::list. 
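+   *
+   * Concretely, an Op moves through the waiting_state, waiting_reads and
+   * waiting_commit lists below (driven by check_ops() and the try_*()
+   * helpers), and its on_all_commit callback fires only once every shard
+   * has committed, which is what preserves that ordering.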
+ */ + struct Op : boost::intrusive::list_base_hook<> { + /// From submit_transaction caller, describes operation + hobject_t hoid; + object_stat_sum_t delta_stats; + eversion_t version; + eversion_t trim_to; + std::optional updated_hit_set_history; + std::vector log_entries; + ceph_tid_t tid; + osd_reqid_t reqid; + ZTracer::Trace trace; + + eversion_t roll_forward_to; /// Soon to be generated internally + + /// Ancillary also provided from submit_transaction caller + std::map obc_map; + + /// see call_write_ordered + std::list > on_write; + + /// Generated internally + std::set temp_added; + std::set temp_cleared; + + ECTransaction::WritePlan plan; + bool requires_rmw() const { return !plan.to_read.empty(); } + bool invalidates_cache() const { return plan.invalidates_cache; } + + // must be true if requires_rmw(), must be false if invalidates_cache() + bool using_cache = true; + + /// In progress read state; + std::map pending_read; // subset already being read + std::map remote_read; // subset we must read + std::map remote_read_result; + bool read_in_progress() const { + return !remote_read.empty() && remote_read_result.empty(); + } + + /// In progress write state. + std::set pending_commit; + // we need pending_apply for pre-mimic peers so that we don't issue a + // read on a remote shard before it has applied a previous write. We can + // remove this after nautilus. + std::set pending_apply; + bool write_in_progress() const { + return !pending_commit.empty() || !pending_apply.empty(); + } + + /// optional, may be null, for tracking purposes + OpRequestRef client_op; + + /// pin for cache + ExtentCache::write_pin pin; + + /// Callbacks + Context *on_all_commit = nullptr; + ~Op() { + delete on_all_commit; + } + }; + using op_list = boost::intrusive::list; + friend ostream &operator<<(ostream &lhs, const Op &rhs); + + ExtentCache cache; + std::map tid_to_op_map; /// Owns Op structure + + /** + * We model the possible rmw states as a std::set of waitlists. + * All writes at this time complete in order, so a write blocked + * at waiting_state blocks all writes behind it as well (same for + * other states). + * + * Future work: We can break this up into a per-object pipeline + * (almost). First, provide an ordering token to submit_transaction + * and require that all operations within a single transaction take + * place on a subset of hobject_t space partitioned by that token + * (the hashid seem about right to me -- even works for temp objects + * if you recall that a temp object created for object head foo will + * only ever be referenced by other transactions on foo and aren't + * reused). Next, factor this part into a class and maintain one per + * ordering token. Next, fixup PrimaryLogPG's repop queue to be + * partitioned by ordering token. Finally, refactor the op pipeline + * so that the log entries passed into submit_transaction aren't + * versioned. We can't assign versions to them until we actually + * submit the operation. That's probably going to be the hard part. 
+ */ + class pipeline_state_t { + enum { + CACHE_VALID = 0, + CACHE_INVALID = 1 + } pipeline_state = CACHE_VALID; + public: + bool caching_enabled() const { + return pipeline_state == CACHE_VALID; + } + bool cache_invalid() const { + return !caching_enabled(); + } + void invalidate() { + pipeline_state = CACHE_INVALID; + } + void clear() { + pipeline_state = CACHE_VALID; + } + friend ostream &operator<<(ostream &lhs, const pipeline_state_t &rhs); + } pipeline_state; + + + op_list waiting_state; /// writes waiting on pipe_state + op_list waiting_reads; /// writes waiting on partial stripe reads + op_list waiting_commit; /// writes waiting on initial commit + eversion_t completed_to; + eversion_t committed_to; + void start_rmw(Op *op, PGTransactionUPtr &&t); + bool try_state_to_reads(); + bool try_reads_to_commit(); + bool try_finish_rmw(); + void check_ops(); + + ceph::ErasureCodeInterfaceRef ec_impl; + + + /** + * ECRecPred + * + * Determines the whether _have is sufficient to recover an object + */ + class ECRecPred : public IsPGRecoverablePredicate { + std::set want; + ceph::ErasureCodeInterfaceRef ec_impl; + public: + explicit ECRecPred(ceph::ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) { + for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) { + want.insert(i); + } + } + bool operator()(const std::set &_have) const override { + std::set have; + for (std::set::const_iterator i = _have.begin(); + i != _have.end(); + ++i) { + have.insert(i->shard); + } + std::map>> min; + return ec_impl->minimum_to_decode(want, have, &min) == 0; + } + }; + IsPGRecoverablePredicate *get_is_recoverable_predicate() const override { + return new ECRecPred(ec_impl); + } + + int get_ec_data_chunk_count() const override { + return ec_impl->get_data_chunk_count(); + } + int get_ec_stripe_chunk_size() const override { + return sinfo.get_chunk_size(); + } + + /** + * ECReadPred + * + * Determines the whether _have is sufficient to read an object + */ + class ECReadPred : public IsPGReadablePredicate { + pg_shard_t whoami; + ECRecPred rec_pred; + public: + ECReadPred( + pg_shard_t whoami, + ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {} + bool operator()(const std::set &_have) const override { + return _have.count(whoami) && rec_pred(_have); + } + }; + IsPGReadablePredicate *get_is_readable_predicate() const override { + return new ECReadPred(get_parent()->whoami_shard(), ec_impl); + } + + + const ECUtil::stripe_info_t sinfo; + /// If modified, ensure that the ref is held until the update is applied + SharedPtrRegistry unstable_hashinfo_registry; + ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool create = false, + const std::map *attr = NULL); + +public: + ECBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct, + ceph::ErasureCodeInterfaceRef ec_impl, + uint64_t stripe_width); + + /// Returns to_read replicas sufficient to reconstruct want + int get_min_avail_to_read_shards( + const hobject_t &hoid, ///< [in] object + const std::set &want, ///< [in] desired shards + bool for_recovery, ///< [in] true if we may use non-acting replicas + bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency + std::map>> *to_read ///< [out] shards, corresponding subchunks to read + ); ///< @return error code, 0 on success + + int get_remaining_shards( + const hobject_t &hoid, + const std::set &avail, + const std::set &want, + const read_result_t &result, + 
std::map>> *to_read, + bool for_recovery); + + int objects_get_attrs( + const hobject_t &hoid, + std::map *out) override; + + void rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t) override; + + bool auto_repair_supported() const override { return true; } + + int be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) override; + uint64_t be_get_ondisk_size(uint64_t logical_size) override { + return sinfo.logical_to_next_chunk_offset(logical_size); + } + void _failed_push(const hobject_t &hoid, + std::pair &in); +}; +ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs); + +#endif diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc new file mode 100644 index 000000000..a65676643 --- /dev/null +++ b/src/osd/ECMsgTypes.cc @@ -0,0 +1,393 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "ECMsgTypes.h" + +using std::list; +using std::make_pair; +using std::map; +using std::pair; +using std::set; +using ceph::bufferlist; +using ceph::Formatter; + +void ECSubWrite::encode(bufferlist &bl) const +{ + ENCODE_START(4, 1, bl); + encode(from, bl); + encode(tid, bl); + encode(reqid, bl); + encode(soid, bl); + encode(stats, bl); + encode(t, bl); + encode(at_version, bl); + encode(trim_to, bl); + encode(log_entries, bl); + encode(temp_added, bl); + encode(temp_removed, bl); + encode(updated_hit_set_history, bl); + encode(roll_forward_to, bl); + encode(backfill_or_async_recovery, bl); + ENCODE_FINISH(bl); +} + +void ECSubWrite::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(4, bl); + decode(from, bl); + decode(tid, bl); + decode(reqid, bl); + decode(soid, bl); + decode(stats, bl); + decode(t, bl); + decode(at_version, bl); + decode(trim_to, bl); + decode(log_entries, bl); + decode(temp_added, bl); + decode(temp_removed, bl); + if (struct_v >= 2) { + decode(updated_hit_set_history, bl); + } + if (struct_v >= 3) { + decode(roll_forward_to, bl); + } else { + roll_forward_to = trim_to; + } + if (struct_v >= 4) { + decode(backfill_or_async_recovery, bl); + } else { + // The old protocol used an empty transaction to indicate backfill or async_recovery + backfill_or_async_recovery = t.empty(); + } + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubWrite &rhs) +{ + lhs << "ECSubWrite(tid=" << rhs.tid + << ", reqid=" << rhs.reqid + << ", at_version=" << rhs.at_version + << ", trim_to=" << rhs.trim_to + << ", roll_forward_to=" << rhs.roll_forward_to; + if (rhs.updated_hit_set_history) + lhs << ", has_updated_hit_set_history"; + if (rhs.backfill_or_async_recovery) + lhs << ", backfill_or_async_recovery"; + return lhs << ")"; +} + +void ECSubWrite::dump(Formatter *f) const +{ + f->dump_unsigned("tid", tid); + f->dump_stream("reqid") << reqid; + f->dump_stream("at_version") << at_version; + f->dump_stream("trim_to") << trim_to; + f->dump_stream("roll_forward_to") << roll_forward_to; + f->dump_bool("has_updated_hit_set_history", + static_cast(updated_hit_set_history)); + f->dump_bool("backfill_or_async_recovery", backfill_or_async_recovery); +} + +void 
ECSubWrite::generate_test_instances(list &o) +{ + o.push_back(new ECSubWrite()); + o.back()->tid = 1; + o.back()->at_version = eversion_t(2, 100); + o.back()->trim_to = eversion_t(1, 40); + o.push_back(new ECSubWrite()); + o.back()->tid = 4; + o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678); + o.back()->at_version = eversion_t(10, 300); + o.back()->trim_to = eversion_t(5, 42); + o.push_back(new ECSubWrite()); + o.back()->tid = 9; + o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678); + o.back()->at_version = eversion_t(10, 300); + o.back()->trim_to = eversion_t(5, 42); + o.back()->roll_forward_to = eversion_t(8, 250); +} + +void ECSubWriteReply::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(from, bl); + encode(tid, bl); + encode(last_complete, bl); + encode(committed, bl); + encode(applied, bl); + ENCODE_FINISH(bl); +} + +void ECSubWriteReply::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(from, bl); + decode(tid, bl); + decode(last_complete, bl); + decode(committed, bl); + decode(applied, bl); + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubWriteReply &rhs) +{ + return lhs + << "ECSubWriteReply(tid=" << rhs.tid + << ", last_complete=" << rhs.last_complete + << ", committed=" << rhs.committed + << ", applied=" << rhs.applied << ")"; +} + +void ECSubWriteReply::dump(Formatter *f) const +{ + f->dump_unsigned("tid", tid); + f->dump_stream("last_complete") << last_complete; + f->dump_bool("committed", committed); + f->dump_bool("applied", applied); +} + +void ECSubWriteReply::generate_test_instances(list& o) +{ + o.push_back(new ECSubWriteReply()); + o.back()->tid = 20; + o.back()->last_complete = eversion_t(100, 2000); + o.back()->committed = true; + o.push_back(new ECSubWriteReply()); + o.back()->tid = 80; + o.back()->last_complete = eversion_t(50, 200); + o.back()->applied = true; +} + +void ECSubRead::encode(bufferlist &bl, uint64_t features) const +{ + if ((features & CEPH_FEATURE_OSD_FADVISE_FLAGS) == 0) { + ENCODE_START(2, 1, bl); + encode(from, bl); + encode(tid, bl); + map >> tmp; + for (auto m = to_read.cbegin(); m != to_read.cend(); ++m) { + list > tlist; + for (auto l = m->second.cbegin(); l != m->second.cend(); ++l) { + tlist.push_back(std::make_pair(l->get<0>(), l->get<1>())); + } + tmp[m->first] = tlist; + } + encode(tmp, bl); + encode(attrs_to_read, bl); + encode(subchunks, bl); + ENCODE_FINISH(bl); + return; + } + + ENCODE_START(3, 2, bl); + encode(from, bl); + encode(tid, bl); + encode(to_read, bl); + encode(attrs_to_read, bl); + encode(subchunks, bl); + ENCODE_FINISH(bl); +} + +void ECSubRead::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(3, bl); + decode(from, bl); + decode(tid, bl); + if (struct_v == 1) { + map >>tmp; + decode(tmp, bl); + for (auto m = tmp.cbegin(); m != tmp.cend(); ++m) { + list > tlist; + for (auto l = m->second.cbegin(); l != m->second.cend(); ++l) { + tlist.push_back(boost::make_tuple(l->first, l->second, 0)); + } + to_read[m->first] = tlist; + } + } else { + decode(to_read, bl); + } + decode(attrs_to_read, bl); + if (struct_v > 2 && struct_v > struct_compat) { + decode(subchunks, bl); + } else { + for (auto &i : to_read) { + subchunks[i.first].push_back(make_pair(0, 1)); + } + } + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubRead &rhs) +{ + return lhs + << "ECSubRead(tid=" << rhs.tid + << ", to_read=" << rhs.to_read + << ", subchunks=" << rhs.subchunks + << ", attrs_to_read=" << 
rhs.attrs_to_read << ")"; +} + +void ECSubRead::dump(Formatter *f) const +{ + f->dump_stream("from") << from; + f->dump_unsigned("tid", tid); + f->open_array_section("objects"); + for (auto i = to_read.cbegin(); i != to_read.cend(); ++i) { + f->open_object_section("object"); + f->dump_stream("oid") << i->first; + f->open_array_section("extents"); + for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) { + f->open_object_section("extent"); + f->dump_unsigned("off", j->get<0>()); + f->dump_unsigned("len", j->get<1>()); + f->dump_unsigned("flags", j->get<2>()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("object_attrs_requested"); + for (auto i = attrs_to_read.cbegin(); i != attrs_to_read.cend(); ++i) { + f->open_object_section("object"); + f->dump_stream("oid") << *i; + f->close_section(); + } + f->close_section(); +} + +void ECSubRead::generate_test_instances(list& o) +{ + hobject_t hoid1(sobject_t("asdf", 1)); + hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP)); + o.push_back(new ECSubRead()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 1; + o.back()->to_read[hoid1].push_back(boost::make_tuple(100, 200, 0)); + o.back()->to_read[hoid1].push_back(boost::make_tuple(400, 600, 0)); + o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0)); + o.back()->attrs_to_read.insert(hoid1); + o.push_back(new ECSubRead()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 300; + o.back()->to_read[hoid1].push_back(boost::make_tuple(300, 200, 0)); + o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0)); + o.back()->to_read[hoid2].push_back(boost::make_tuple(2000, 600, 0)); + o.back()->attrs_to_read.insert(hoid2); +} + +void ECSubReadReply::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(from, bl); + encode(tid, bl); + encode(buffers_read, bl); + encode(attrs_read, bl); + encode(errors, bl); + ENCODE_FINISH(bl); +} + +void ECSubReadReply::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(from, bl); + decode(tid, bl); + decode(buffers_read, bl); + decode(attrs_read, bl); + decode(errors, bl); + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubReadReply &rhs) +{ + return lhs + << "ECSubReadReply(tid=" << rhs.tid + << ", attrs_read=" << rhs.attrs_read.size() + << ")"; +} + +void ECSubReadReply::dump(Formatter *f) const +{ + f->dump_stream("from") << from; + f->dump_unsigned("tid", tid); + f->open_array_section("buffers_read"); + for (auto i = buffers_read.cbegin(); i != buffers_read.cend(); ++i) { + f->open_object_section("object"); + f->dump_stream("oid") << i->first; + f->open_array_section("data"); + for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) { + f->open_object_section("extent"); + f->dump_unsigned("off", j->first); + f->dump_unsigned("buf_len", j->second.length()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("attrs_returned"); + for (auto i = attrs_read.cbegin(); i != attrs_read.cend(); ++i) { + f->open_object_section("object_attrs"); + f->dump_stream("oid") << i->first; + f->open_array_section("attrs"); + for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) { + f->open_object_section("attr"); + f->dump_string("attr", j->first); + f->dump_unsigned("val_len", j->second.length()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + 
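+  // per-object read failures, reported as negative errno values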
f->open_array_section("errors"); + for (auto i = errors.cbegin(); i != errors.cend(); ++i) { + f->open_object_section("error_pair"); + f->dump_stream("oid") << i->first; + f->dump_int("error", i->second); + f->close_section(); + } + f->close_section(); +} + +void ECSubReadReply::generate_test_instances(list& o) +{ + hobject_t hoid1(sobject_t("asdf", 1)); + hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP)); + bufferlist bl; + bl.append_zero(100); + bufferlist bl2; + bl2.append_zero(200); + o.push_back(new ECSubReadReply()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 1; + o.back()->buffers_read[hoid1].push_back(make_pair(20, bl)); + o.back()->buffers_read[hoid1].push_back(make_pair(2000, bl2)); + o.back()->buffers_read[hoid2].push_back(make_pair(0, bl)); + o.back()->attrs_read[hoid1]["foo"] = bl; + o.back()->attrs_read[hoid1]["_"] = bl2; + o.push_back(new ECSubReadReply()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 300; + o.back()->buffers_read[hoid2].push_back(make_pair(0, bl2)); + o.back()->attrs_read[hoid2]["foo"] = bl; + o.back()->attrs_read[hoid2]["_"] = bl2; + o.back()->errors[hoid1] = -2; +} diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h new file mode 100644 index 000000000..77b4222b2 --- /dev/null +++ b/src/osd/ECMsgTypes.h @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef ECBMSGTYPES_H +#define ECBMSGTYPES_H + +#include "osd_types.h" +#include "include/buffer.h" +#include "os/ObjectStore.h" +#include "boost/tuple/tuple.hpp" + +struct ECSubWrite { + pg_shard_t from; + ceph_tid_t tid; + osd_reqid_t reqid; + hobject_t soid; + pg_stat_t stats; + ObjectStore::Transaction t; + eversion_t at_version; + eversion_t trim_to; + eversion_t roll_forward_to; + std::vector log_entries; + std::set temp_added; + std::set temp_removed; + std::optional updated_hit_set_history; + bool backfill_or_async_recovery = false; + ECSubWrite() : tid(0) {} + ECSubWrite( + pg_shard_t from, + ceph_tid_t tid, + osd_reqid_t reqid, + hobject_t soid, + const pg_stat_t &stats, + const ObjectStore::Transaction &t, + eversion_t at_version, + eversion_t trim_to, + eversion_t roll_forward_to, + std::vector log_entries, + std::optional updated_hit_set_history, + const std::set &temp_added, + const std::set &temp_removed, + bool backfill_or_async_recovery) + : from(from), tid(tid), reqid(reqid), + soid(soid), stats(stats), t(t), + at_version(at_version), + trim_to(trim_to), roll_forward_to(roll_forward_to), + log_entries(log_entries), + temp_added(temp_added), + temp_removed(temp_removed), + updated_hit_set_history(updated_hit_set_history), + backfill_or_async_recovery(backfill_or_async_recovery) + {} + void claim(ECSubWrite &other) { + from = other.from; + tid = other.tid; + reqid = other.reqid; + soid = other.soid; + stats = other.stats; + t.swap(other.t); + at_version = other.at_version; + trim_to = other.trim_to; + roll_forward_to = other.roll_forward_to; + log_entries.swap(other.log_entries); + temp_added.swap(other.temp_added); + temp_removed.swap(other.temp_removed); + updated_hit_set_history = other.updated_hit_set_history; + backfill_or_async_recovery = 
other.backfill_or_async_recovery; + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +private: + // no outside copying -- slow + ECSubWrite(ECSubWrite& other); + const ECSubWrite& operator=(const ECSubWrite& other); +}; +WRITE_CLASS_ENCODER(ECSubWrite) + +struct ECSubWriteReply { + pg_shard_t from; + ceph_tid_t tid; + eversion_t last_complete; + bool committed; + bool applied; + ECSubWriteReply() : tid(0), committed(false), applied(false) {} + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(ECSubWriteReply) + +struct ECSubRead { + pg_shard_t from; + ceph_tid_t tid; + std::map >> to_read; + std::set attrs_to_read; + std::map>> subchunks; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(ECSubRead) + +struct ECSubReadReply { + pg_shard_t from; + ceph_tid_t tid; + std::map >> buffers_read; + std::map> attrs_read; + std::map errors; + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(ECSubReadReply) + +std::ostream &operator<<( + std::ostream &lhs, const ECSubWrite &rhs); +std::ostream &operator<<( + std::ostream &lhs, const ECSubWriteReply &rhs); +std::ostream &operator<<( + std::ostream &lhs, const ECSubRead &rhs); +std::ostream &operator<<( + std::ostream &lhs, const ECSubReadReply &rhs); + +#endif diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc new file mode 100644 index 000000000..603f9af0e --- /dev/null +++ b/src/osd/ECTransaction.cc @@ -0,0 +1,670 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include + +#include "ECTransaction.h" +#include "ECUtil.h" +#include "os/ObjectStore.h" +#include "common/inline_variant.h" + +using std::make_pair; +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::ErasureCodeInterfaceRef; + +void encode_and_write( + pg_t pgid, + const hobject_t &oid, + const ECUtil::stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ecimpl, + const set &want, + uint64_t offset, + bufferlist bl, + uint32_t flags, + ECUtil::HashInfoRef hinfo, + extent_map &written, + map *transactions, + DoutPrefixProvider *dpp) { + const uint64_t before_size = hinfo->get_total_logical_size(sinfo); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(offset)); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(bl.length())); + ceph_assert(bl.length()); + + map buffers; + int r = ECUtil::encode( + sinfo, ecimpl, bl, want, &buffers); + ceph_assert(r == 0); + + written.insert(offset, bl.length(), bl); + + ldpp_dout(dpp, 20) << __func__ << ": " << oid + << " new_size " + << offset + bl.length() + << dendl; + + if (offset >= before_size) { + ceph_assert(offset == before_size); + hinfo->append( + sinfo.aligned_logical_offset_to_chunk_offset(offset), + buffers); + } + + for (auto &&i : *transactions) { + ceph_assert(buffers.count(i.first)); + bufferlist &enc_bl = buffers[i.first]; + if (offset >= before_size) { + i.second.set_alloc_hint( + coll_t(spg_t(pgid, i.first)), + ghobject_t(oid, ghobject_t::NO_GEN, i.first), + 0, 0, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY); + } + i.second.write( + coll_t(spg_t(pgid, i.first)), + ghobject_t(oid, ghobject_t::NO_GEN, i.first), + sinfo.logical_to_prev_chunk_offset( + offset), + enc_bl.length(), + enc_bl, + flags); + } +} + +bool ECTransaction::requires_overwrite( + uint64_t prev_size, + const PGTransaction::ObjectOperation &op) { + // special handling for truncates to 0 + if (op.truncate && op.truncate->first == 0) + return false; + return op.is_none() && + ((!op.buffer_updates.empty() && + (op.buffer_updates.begin().get_off() < prev_size)) || + (op.truncate && + (op.truncate->first < prev_size))); +} + +void ECTransaction::generate_transactions( + WritePlan &plan, + ErasureCodeInterfaceRef &ecimpl, + pg_t pgid, + const ECUtil::stripe_info_t &sinfo, + const map &partial_extents, + vector &entries, + map *written_map, + map *transactions, + set *temp_added, + set *temp_removed, + DoutPrefixProvider *dpp, + const ceph_release_t require_osd_release) +{ + ceph_assert(written_map); + ceph_assert(transactions); + ceph_assert(temp_added); + ceph_assert(temp_removed); + ceph_assert(plan.t); + auto &t = *(plan.t); + + auto &hash_infos = plan.hash_infos; + + map obj_to_log; + for (auto &&i: entries) { + obj_to_log.insert(make_pair(i.soid, &i)); + } + + t.safe_create_traverse( + [&](pair &opair) { + const hobject_t &oid = opair.first; + auto &op = opair.second; + auto &obc_map = t.obc_map; + auto &written = (*written_map)[oid]; + + auto iter = obj_to_log.find(oid); + pg_log_entry_t *entry = iter != obj_to_log.end() ? 
iter->second : nullptr; + + ObjectContextRef obc; + auto obiter = t.obc_map.find(oid); + if (obiter != t.obc_map.end()) { + obc = obiter->second; + } + if (entry) { + ceph_assert(obc); + } else { + ceph_assert(oid.is_temp()); + } + + ECUtil::HashInfoRef hinfo; + { + auto iter = hash_infos.find(oid); + ceph_assert(iter != hash_infos.end()); + hinfo = iter->second; + } + + if (oid.is_temp()) { + if (op.is_fresh_object()) { + temp_added->insert(oid); + } else if (op.is_delete()) { + temp_removed->insert(oid); + } + } + + if (entry && + entry->is_modify() && + op.updated_snaps) { + bufferlist bl(op.updated_snaps->second.size() * 8 + 8); + encode(op.updated_snaps->second, bl); + entry->snaps.swap(bl); + entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + ldpp_dout(dpp, 20) << "generate_transactions: " + << opair.first + << ", current size is " + << hinfo->get_total_logical_size(sinfo) + << " buffers are " + << op.buffer_updates + << dendl; + if (op.truncate) { + ldpp_dout(dpp, 20) << "generate_transactions: " + << " truncate is " + << *(op.truncate) + << dendl; + } + + if (entry && op.updated_snaps) { + entry->mod_desc.update_snaps(op.updated_snaps->first); + } + + map > xattr_rollback; + ceph_assert(hinfo); + bufferlist old_hinfo; + encode(*hinfo, old_hinfo); + xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo; + + if (op.is_none() && op.truncate && op.truncate->first == 0) { + ceph_assert(op.truncate->first == 0); + ceph_assert(op.truncate->first == + op.truncate->second); + ceph_assert(entry); + ceph_assert(obc); + + if (op.truncate->first != op.truncate->second) { + op.truncate->first = op.truncate->second; + } else { + op.truncate = std::nullopt; + } + + op.delete_first = true; + op.init_type = PGTransaction::ObjectOperation::Init::Create(); + + if (obc) { + /* We need to reapply all of the cached xattrs. + * std::map insert fortunately only writes keys + * which don't already exist, so this should do + * the right thing. 
*/ + op.attr_updates.insert( + obc->attr_cache.begin(), + obc->attr_cache.end()); + } + } + + if (op.delete_first) { + /* We also want to remove the std::nullopt entries since + * the keys already won't exist */ + for (auto j = op.attr_updates.begin(); + j != op.attr_updates.end(); + ) { + if (j->second) { + ++j; + } else { + op.attr_updates.erase(j++); + } + } + /* Fill in all current entries for xattr rollback */ + if (obc) { + xattr_rollback.insert( + obc->attr_cache.begin(), + obc->attr_cache.end()); + obc->attr_cache.clear(); + } + if (entry) { + entry->mod_desc.rmobject(entry->version.version); + for (auto &&st: *transactions) { + st.second.collection_move_rename( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, entry->version.version, st.first)); + } + } else { + for (auto &&st: *transactions) { + st.second.remove( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + } + hinfo->clear(); + } + + if (op.is_fresh_object() && entry) { + entry->mod_desc.create(); + } + + match( + op.init_type, + [&](const PGTransaction::ObjectOperation::Init::None &) {}, + [&](const PGTransaction::ObjectOperation::Init::Create &op) { + for (auto &&st: *transactions) { + if (require_osd_release >= ceph_release_t::octopus) { + st.second.create( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } else { + st.second.touch( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + } + }, + [&](const PGTransaction::ObjectOperation::Init::Clone &op) { + for (auto &&st: *transactions) { + st.second.clone( + coll_t(spg_t(pgid, st.first)), + ghobject_t(op.source, ghobject_t::NO_GEN, st.first), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + + auto siter = hash_infos.find(op.source); + ceph_assert(siter != hash_infos.end()); + hinfo->update_to(*(siter->second)); + + if (obc) { + auto cobciter = obc_map.find(op.source); + ceph_assert(cobciter != obc_map.end()); + obc->attr_cache = cobciter->second->attr_cache; + } + }, + [&](const PGTransaction::ObjectOperation::Init::Rename &op) { + ceph_assert(op.source.is_temp()); + for (auto &&st: *transactions) { + st.second.collection_move_rename( + coll_t(spg_t(pgid, st.first)), + ghobject_t(op.source, ghobject_t::NO_GEN, st.first), + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + auto siter = hash_infos.find(op.source); + ceph_assert(siter != hash_infos.end()); + hinfo->update_to(*(siter->second)); + if (obc) { + auto cobciter = obc_map.find(op.source); + ceph_assert(cobciter == obc_map.end()); + obc->attr_cache.clear(); + } + }); + + // omap not supported (except 0, handled above) + ceph_assert(!(op.clear_omap)); + ceph_assert(!(op.omap_header)); + ceph_assert(op.omap_updates.empty()); + + if (!op.attr_updates.empty()) { + map to_set; + for (auto &&j: op.attr_updates) { + if (j.second) { + to_set[j.first] = *(j.second); + } else { + for (auto &&st : *transactions) { + st.second.rmattr( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + j.first); + } + } + if (obc) { + auto citer = obc->attr_cache.find(j.first); + if (entry) { + if (citer != obc->attr_cache.end()) { + // won't overwrite anything we put in earlier + xattr_rollback.insert( + make_pair( + j.first, + std::optional(citer->second))); + } else { + // won't overwrite anything we put in earlier + xattr_rollback.insert( + make_pair( + j.first, + 
std::nullopt)); + } + } + if (j.second) { + obc->attr_cache[j.first] = *(j.second); + } else if (citer != obc->attr_cache.end()) { + obc->attr_cache.erase(citer); + } + } else { + ceph_assert(!entry); + } + } + for (auto &&st : *transactions) { + st.second.setattrs( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + to_set); + } + ceph_assert(!xattr_rollback.empty()); + } + if (entry && !xattr_rollback.empty()) { + entry->mod_desc.setattrs(xattr_rollback); + } + + if (op.alloc_hint) { + /* logical_to_next_chunk_offset() scales down both aligned and + * unaligned offsets + + * we don't bother to roll this back at this time for two reasons: + * 1) it's advisory + * 2) we don't track the old value */ + uint64_t object_size = sinfo.logical_to_next_chunk_offset( + op.alloc_hint->expected_object_size); + uint64_t write_size = sinfo.logical_to_next_chunk_offset( + op.alloc_hint->expected_write_size); + + for (auto &&st : *transactions) { + st.second.set_alloc_hint( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + object_size, + write_size, + op.alloc_hint->flags); + } + } + + extent_map to_write; + auto pextiter = partial_extents.find(oid); + if (pextiter != partial_extents.end()) { + to_write = pextiter->second; + } + + vector > rollback_extents; + const uint64_t orig_size = hinfo->get_total_logical_size(sinfo); + + uint64_t new_size = orig_size; + uint64_t append_after = new_size; + ldpp_dout(dpp, 20) << __func__ << ": new_size start " << new_size << dendl; + if (op.truncate && op.truncate->first < new_size) { + ceph_assert(!op.is_fresh_object()); + new_size = sinfo.logical_to_next_stripe_offset( + op.truncate->first); + ldpp_dout(dpp, 20) << __func__ << ": new_size truncate down " + << new_size << dendl; + if (new_size != op.truncate->first) { // 0 the unaligned part + bufferlist bl; + bl.append_zero(new_size - op.truncate->first); + to_write.insert( + op.truncate->first, + bl.length(), + bl); + append_after = sinfo.logical_to_prev_stripe_offset( + op.truncate->first); + } else { + append_after = new_size; + } + to_write.erase( + new_size, + std::numeric_limits::max() - new_size); + + if (entry && !op.is_fresh_object()) { + uint64_t restore_from = sinfo.logical_to_prev_chunk_offset( + op.truncate->first); + uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset( + orig_size - + sinfo.logical_to_prev_stripe_offset(op.truncate->first)); + ceph_assert(rollback_extents.empty()); + + ldpp_dout(dpp, 20) << __func__ << ": saving extent " + << make_pair(restore_from, restore_len) + << dendl; + ldpp_dout(dpp, 20) << __func__ << ": truncating to " + << new_size + << dendl; + rollback_extents.emplace_back( + make_pair(restore_from, restore_len)); + for (auto &&st : *transactions) { + st.second.touch( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, entry->version.version, st.first)); + st.second.clone_range( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + ghobject_t(oid, entry->version.version, st.first), + restore_from, + restore_len, + restore_from); + + } + } else { + ldpp_dout(dpp, 20) << __func__ << ": not saving extents, fresh object" + << dendl; + } + for (auto &&st : *transactions) { + st.second.truncate( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + sinfo.aligned_logical_offset_to_chunk_offset(new_size)); + } + } + + uint32_t fadvise_flags = 0; + for (auto &&extent: op.buffer_updates) { + using BufferUpdate = 
PGTransaction::ObjectOperation::BufferUpdate; + bufferlist bl; + match( + extent.get_val(), + [&](const BufferUpdate::Write &op) { + bl = op.buffer; + fadvise_flags |= op.fadvise_flags; + }, + [&](const BufferUpdate::Zero &) { + bl.append_zero(extent.get_len()); + }, + [&](const BufferUpdate::CloneRange &) { + ceph_assert( + 0 == + "CloneRange is not allowed, do_op should have returned ENOTSUPP"); + }); + + uint64_t off = extent.get_off(); + uint64_t len = extent.get_len(); + uint64_t end = off + len; + ldpp_dout(dpp, 20) << __func__ << ": adding buffer_update " + << make_pair(off, len) + << dendl; + ceph_assert(len > 0); + if (off > new_size) { + ceph_assert(off > append_after); + bl.prepend_zero(off - new_size); + len += off - new_size; + ldpp_dout(dpp, 20) << __func__ << ": prepending zeroes to align " + << off << "->" << new_size + << dendl; + off = new_size; + } + if (!sinfo.logical_offset_is_stripe_aligned(end) && (end > append_after)) { + uint64_t aligned_end = sinfo.logical_to_next_stripe_offset( + end); + uint64_t tail = aligned_end - end; + bl.append_zero(tail); + ldpp_dout(dpp, 20) << __func__ << ": appending zeroes to align end " + << end << "->" << end+tail + << ", len: " << len << "->" << len+tail + << dendl; + end += tail; + len += tail; + } + + to_write.insert(off, len, bl); + if (end > new_size) + new_size = end; + } + + if (op.truncate && + op.truncate->second > new_size) { + ceph_assert(op.truncate->second > append_after); + uint64_t truncate_to = + sinfo.logical_to_next_stripe_offset( + op.truncate->second); + uint64_t zeroes = truncate_to - new_size; + bufferlist bl; + bl.append_zero(zeroes); + to_write.insert( + new_size, + zeroes, + bl); + new_size = truncate_to; + ldpp_dout(dpp, 20) << __func__ << ": truncating out to " + << truncate_to + << dendl; + } + + set want; + for (unsigned i = 0; i < ecimpl->get_chunk_count(); ++i) { + want.insert(i); + } + auto to_overwrite = to_write.intersect(0, append_after); + ldpp_dout(dpp, 20) << __func__ << ": to_overwrite: " + << to_overwrite + << dendl; + for (auto &&extent: to_overwrite) { + ceph_assert(extent.get_off() + extent.get_len() <= append_after); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off())); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len())); + if (entry) { + uint64_t restore_from = sinfo.aligned_logical_offset_to_chunk_offset( + extent.get_off()); + uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset( + extent.get_len()); + ldpp_dout(dpp, 20) << __func__ << ": overwriting " + << restore_from << "~" << restore_len + << dendl; + if (rollback_extents.empty()) { + for (auto &&st : *transactions) { + st.second.touch( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, entry->version.version, st.first)); + } + } + rollback_extents.emplace_back(make_pair(restore_from, restore_len)); + for (auto &&st : *transactions) { + st.second.clone_range( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + ghobject_t(oid, entry->version.version, st.first), + restore_from, + restore_len, + restore_from); + } + } + encode_and_write( + pgid, + oid, + sinfo, + ecimpl, + want, + extent.get_off(), + extent.get_val(), + fadvise_flags, + hinfo, + written, + transactions, + dpp); + } + + auto to_append = to_write.intersect( + append_after, + std::numeric_limits::max() - append_after); + ldpp_dout(dpp, 20) << __func__ << ": to_append: " + << to_append + << dendl; + for (auto &&extent: to_append) { + 
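+      // extents past append_after are pure appends: there is no existing
+      // data to preserve, so unlike the overwrite path above we skip the
+      // rollback clone_range; appends are instead rolled back via
+      // mod_desc.append(append_after) further down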
ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off())); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len())); + ldpp_dout(dpp, 20) << __func__ << ": appending " + << extent.get_off() << "~" << extent.get_len() + << dendl; + encode_and_write( + pgid, + oid, + sinfo, + ecimpl, + want, + extent.get_off(), + extent.get_val(), + fadvise_flags, + hinfo, + written, + transactions, + dpp); + } + + ldpp_dout(dpp, 20) << __func__ << ": " << oid + << " resetting hinfo to logical size " + << new_size + << dendl; + if (!rollback_extents.empty() && entry) { + if (entry) { + ldpp_dout(dpp, 20) << __func__ << ": " << oid + << " marking rollback extents " + << rollback_extents + << dendl; + entry->mod_desc.rollback_extents( + entry->version.version, rollback_extents); + } + hinfo->set_total_chunk_size_clear_hash( + sinfo.aligned_logical_offset_to_chunk_offset(new_size)); + } else { + ceph_assert(hinfo->get_total_logical_size(sinfo) == new_size); + } + + if (entry && !to_append.empty()) { + ldpp_dout(dpp, 20) << __func__ << ": marking append " + << append_after + << dendl; + entry->mod_desc.append(append_after); + } + + if (!op.is_delete()) { + bufferlist hbuf; + encode(*hinfo, hbuf); + for (auto &&i : *transactions) { + i.second.setattr( + coll_t(spg_t(pgid, i.first)), + ghobject_t(oid, ghobject_t::NO_GEN, i.first), + ECUtil::get_hinfo_key(), + hbuf); + } + } + }); +} diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h new file mode 100644 index 000000000..5cb16261a --- /dev/null +++ b/src/osd/ECTransaction.h @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef ECTRANSACTION_H +#define ECTRANSACTION_H + +#include "OSD.h" +#include "PGBackend.h" +#include "ECUtil.h" +#include "erasure-code/ErasureCodeInterface.h" +#include "PGTransaction.h" +#include "ExtentCache.h" + +namespace ECTransaction { + struct WritePlan { + PGTransactionUPtr t; + bool invalidates_cache = false; // Yes, both are possible + std::map to_read; + std::map will_write; // superset of to_read + + std::map hash_infos; + }; + + bool requires_overwrite( + uint64_t prev_size, + const PGTransaction::ObjectOperation &op); + + template + WritePlan get_write_plan( + const ECUtil::stripe_info_t &sinfo, + PGTransactionUPtr &&t, + F &&get_hinfo, + DoutPrefixProvider *dpp) { + WritePlan plan; + t->safe_create_traverse( + [&](std::pair &i) { + ECUtil::HashInfoRef hinfo = get_hinfo(i.first); + plan.hash_infos[i.first] = hinfo; + + uint64_t projected_size = + hinfo->get_projected_total_logical_size(sinfo); + + if (i.second.deletes_first()) { + ldpp_dout(dpp, 20) << __func__ << ": delete, setting projected size" + << " to 0" << dendl; + projected_size = 0; + } + + hobject_t source; + if (i.second.has_source(&source)) { + plan.invalidates_cache = true; + + ECUtil::HashInfoRef shinfo = get_hinfo(source); + projected_size = shinfo->get_projected_total_logical_size(sinfo); + plan.hash_infos[source] = shinfo; + } + + auto &will_write = plan.will_write[i.first]; + if (i.second.truncate && + i.second.truncate->first < projected_size) { + if (!(sinfo.logical_offset_is_stripe_aligned( + i.second.truncate->first))) { + plan.to_read[i.first].union_insert( + sinfo.logical_to_prev_stripe_offset(i.second.truncate->first), + sinfo.get_stripe_width()); + + ldpp_dout(dpp, 20) << __func__ << ": unaligned truncate" << dendl; + + will_write.union_insert( + sinfo.logical_to_prev_stripe_offset(i.second.truncate->first), + sinfo.get_stripe_width()); + } + projected_size = sinfo.logical_to_next_stripe_offset( + i.second.truncate->first); + } + + extent_set raw_write_set; + for (auto &&extent: i.second.buffer_updates) { + using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; + if (boost::get(&(extent.get_val()))) { + ceph_assert( + 0 == + "CloneRange is not allowed, do_op should have returned ENOTSUPP"); + } + raw_write_set.insert(extent.get_off(), extent.get_len()); + } + + auto orig_size = projected_size; + for (auto extent = raw_write_set.begin(); + extent != raw_write_set.end(); + ++extent) { + uint64_t head_start = + sinfo.logical_to_prev_stripe_offset(extent.get_start()); + uint64_t head_finish = + sinfo.logical_to_next_stripe_offset(extent.get_start()); + if (head_start > projected_size) { + head_start = projected_size; + } + if (head_start != head_finish && + head_start < orig_size) { + ceph_assert(head_finish <= orig_size); + ceph_assert(head_finish - head_start == sinfo.get_stripe_width()); + ldpp_dout(dpp, 20) << __func__ << ": reading partial head stripe " + << head_start << "~" << sinfo.get_stripe_width() + << dendl; + plan.to_read[i.first].union_insert( + head_start, sinfo.get_stripe_width()); + } + + uint64_t tail_start = + sinfo.logical_to_prev_stripe_offset( + extent.get_start() + extent.get_len()); + uint64_t tail_finish = + sinfo.logical_to_next_stripe_offset( + extent.get_start() + extent.get_len()); + if (tail_start != tail_finish && + (head_start == head_finish || tail_start != head_start) && + tail_start < orig_size) { + ceph_assert(tail_finish <= orig_size); + ceph_assert(tail_finish - tail_start == sinfo.get_stripe_width()); + ldpp_dout(dpp, 20) << __func__ << 
": reading partial tail stripe " + << tail_start << "~" << sinfo.get_stripe_width() + << dendl; + plan.to_read[i.first].union_insert( + tail_start, sinfo.get_stripe_width()); + } + + if (head_start != tail_finish) { + ceph_assert( + sinfo.logical_offset_is_stripe_aligned( + tail_finish - head_start) + ); + will_write.union_insert( + head_start, tail_finish - head_start); + if (tail_finish > projected_size) + projected_size = tail_finish; + } else { + ceph_assert(tail_finish <= projected_size); + } + } + + if (i.second.truncate && + i.second.truncate->second > projected_size) { + uint64_t truncating_to = + sinfo.logical_to_next_stripe_offset(i.second.truncate->second); + ldpp_dout(dpp, 20) << __func__ << ": truncating out to " + << truncating_to + << dendl; + will_write.union_insert(projected_size, + truncating_to - projected_size); + projected_size = truncating_to; + } + + ldpp_dout(dpp, 20) << __func__ << ": " << i.first + << " projected size " + << projected_size + << dendl; + hinfo->set_projected_total_logical_size( + sinfo, + projected_size); + + /* validate post conditions: + * to_read should have an entry for i.first iff it isn't empty + * and if we are reading from i.first, we can't be renaming or + * cloning it */ + ceph_assert(plan.to_read.count(i.first) == 0 || + (!plan.to_read.at(i.first).empty() && + !i.second.has_source())); + }); + plan.t = std::move(t); + return plan; + } + + void generate_transactions( + WritePlan &plan, + ceph::ErasureCodeInterfaceRef &ecimpl, + pg_t pgid, + const ECUtil::stripe_info_t &sinfo, + const std::map &partial_extents, + std::vector &entries, + std::map *written, + std::map *transactions, + std::set *temp_added, + std::set *temp_removed, + DoutPrefixProvider *dpp, + const ceph_release_t require_osd_release = ceph_release_t::unknown); +}; + +#endif diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc new file mode 100644 index 000000000..94b328458 --- /dev/null +++ b/src/osd/ECUtil.cc @@ -0,0 +1,248 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include +#include "include/encoding.h" +#include "ECUtil.h" + +using namespace std; +using ceph::bufferlist; +using ceph::ErasureCodeInterfaceRef; +using ceph::Formatter; + +int ECUtil::decode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + map &to_decode, + bufferlist *out) { + ceph_assert(to_decode.size()); + + uint64_t total_data_size = to_decode.begin()->second.length(); + ceph_assert(total_data_size % sinfo.get_chunk_size() == 0); + + ceph_assert(out); + ceph_assert(out->length() == 0); + + for (map::iterator i = to_decode.begin(); + i != to_decode.end(); + ++i) { + ceph_assert(i->second.length() == total_data_size); + } + + if (total_data_size == 0) + return 0; + + for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) { + map chunks; + for (map::iterator j = to_decode.begin(); + j != to_decode.end(); + ++j) { + chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size()); + } + bufferlist bl; + int r = ec_impl->decode_concat(chunks, &bl); + ceph_assert(r == 0); + ceph_assert(bl.length() == sinfo.get_stripe_width()); + out->claim_append(bl); + } + return 0; +} + +int ECUtil::decode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + map &to_decode, + map &out) { + + ceph_assert(to_decode.size()); + + for (auto &&i : to_decode) { + if(i.second.length() == 0) + return 0; + } + + set need; + for (map::iterator i = out.begin(); + i != out.end(); + ++i) { + ceph_assert(i->second); + ceph_assert(i->second->length() == 
0); + need.insert(i->first); + } + + set avail; + for (auto &&i : to_decode) { + ceph_assert(i.second.length() != 0); + avail.insert(i.first); + } + + map>> min; + int r = ec_impl->minimum_to_decode(need, avail, &min); + ceph_assert(r == 0); + + int chunks_count = 0; + int repair_data_per_chunk = 0; + int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count(); + + for (auto &&i : to_decode) { + auto found = min.find(i.first); + if (found != min.end()) { + int repair_subchunk_count = 0; + for (auto& subchunks : min[i.first]) { + repair_subchunk_count += subchunks.second; + } + repair_data_per_chunk = repair_subchunk_count * subchunk_size; + chunks_count = (int)i.second.length() / repair_data_per_chunk; + break; + } + } + + for (int i = 0; i < chunks_count; i++) { + map chunks; + for (auto j = to_decode.begin(); + j != to_decode.end(); + ++j) { + chunks[j->first].substr_of(j->second, + i*repair_data_per_chunk, + repair_data_per_chunk); + } + map out_bls; + r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size()); + ceph_assert(r == 0); + for (auto j = out.begin(); j != out.end(); ++j) { + ceph_assert(out_bls.count(j->first)); + ceph_assert(out_bls[j->first].length() == sinfo.get_chunk_size()); + j->second->claim_append(out_bls[j->first]); + } + } + for (auto &&i : out) { + ceph_assert(i.second->length() == chunks_count * sinfo.get_chunk_size()); + } + return 0; +} + +int ECUtil::encode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + bufferlist &in, + const set &want, + map *out) { + + uint64_t logical_size = in.length(); + + ceph_assert(logical_size % sinfo.get_stripe_width() == 0); + ceph_assert(out); + ceph_assert(out->empty()); + + if (logical_size == 0) + return 0; + + for (uint64_t i = 0; i < logical_size; i += sinfo.get_stripe_width()) { + map encoded; + bufferlist buf; + buf.substr_of(in, i, sinfo.get_stripe_width()); + int r = ec_impl->encode(want, buf, &encoded); + ceph_assert(r == 0); + for (map::iterator i = encoded.begin(); + i != encoded.end(); + ++i) { + ceph_assert(i->second.length() == sinfo.get_chunk_size()); + (*out)[i->first].claim_append(i->second); + } + } + + for (map::iterator i = out->begin(); + i != out->end(); + ++i) { + ceph_assert(i->second.length() % sinfo.get_chunk_size() == 0); + ceph_assert( + sinfo.aligned_chunk_offset_to_logical_offset(i->second.length()) == + logical_size); + } + return 0; +} + +void ECUtil::HashInfo::append(uint64_t old_size, + map &to_append) { + ceph_assert(old_size == total_chunk_size); + uint64_t size_to_append = to_append.begin()->second.length(); + if (has_chunk_hash()) { + ceph_assert(to_append.size() == cumulative_shard_hashes.size()); + for (map::iterator i = to_append.begin(); + i != to_append.end(); + ++i) { + ceph_assert(size_to_append == i->second.length()); + ceph_assert((unsigned)i->first < cumulative_shard_hashes.size()); + uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]); + cumulative_shard_hashes[i->first] = new_hash; + } + } + total_chunk_size += size_to_append; +} + +void ECUtil::HashInfo::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(total_chunk_size, bl); + encode(cumulative_shard_hashes, bl); + ENCODE_FINISH(bl); +} + +void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(total_chunk_size, bl); + decode(cumulative_shard_hashes, bl); + projected_total_chunk_size = total_chunk_size; + DECODE_FINISH(bl); +} + +void ECUtil::HashInfo::dump(Formatter *f) const +{ + 
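+  // emit the committed chunk size and the running crc32c of each shard's chunks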
f->dump_unsigned("total_chunk_size", total_chunk_size); + f->open_array_section("cumulative_shard_hashes"); + for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) { + f->open_object_section("hash"); + f->dump_unsigned("shard", i); + f->dump_unsigned("hash", cumulative_shard_hashes[i]); + f->close_section(); + } + f->close_section(); +} + +namespace ECUtil { +std::ostream& operator<<(std::ostream& out, const HashInfo& hi) +{ + ostringstream hashes; + for (auto hash: hi.cumulative_shard_hashes) + hashes << " " << hex << hash; + return out << "tcs=" << hi.total_chunk_size << hashes.str(); +} +} + +void ECUtil::HashInfo::generate_test_instances(list& o) +{ + o.push_back(new HashInfo(3)); + { + bufferlist bl; + bl.append_zero(20); + map buffers; + buffers[0] = bl; + buffers[1] = bl; + buffers[2] = bl; + o.back()->append(0, buffers); + o.back()->append(20, buffers); + } + o.push_back(new HashInfo(4)); +} + +const string HINFO_KEY = "hinfo_key"; + +bool ECUtil::is_hinfo_key_string(const string &key) +{ + return key == HINFO_KEY; +} + +const string &ECUtil::get_hinfo_key() +{ + return HINFO_KEY; +} diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h new file mode 100644 index 000000000..dce78b8a8 --- /dev/null +++ b/src/osd/ECUtil.h @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef ECUTIL_H +#define ECUTIL_H + +#include +#include "erasure-code/ErasureCodeInterface.h" +#include "include/buffer_fwd.h" +#include "include/ceph_assert.h" +#include "include/encoding.h" +#include "common/Formatter.h" + +namespace ECUtil { + +class stripe_info_t { + const uint64_t stripe_width; + const uint64_t chunk_size; +public: + stripe_info_t(uint64_t stripe_size, uint64_t stripe_width) + : stripe_width(stripe_width), + chunk_size(stripe_width / stripe_size) { + ceph_assert(stripe_width % stripe_size == 0); + } + bool logical_offset_is_stripe_aligned(uint64_t logical) const { + return (logical % stripe_width) == 0; + } + uint64_t get_stripe_width() const { + return stripe_width; + } + uint64_t get_chunk_size() const { + return chunk_size; + } + uint64_t logical_to_prev_chunk_offset(uint64_t offset) const { + return (offset / stripe_width) * chunk_size; + } + uint64_t logical_to_next_chunk_offset(uint64_t offset) const { + return ((offset + stripe_width - 1)/ stripe_width) * chunk_size; + } + uint64_t logical_to_prev_stripe_offset(uint64_t offset) const { + return offset - (offset % stripe_width); + } + uint64_t logical_to_next_stripe_offset(uint64_t offset) const { + return ((offset % stripe_width) ? 
+ (offset - (offset % stripe_width) + stripe_width) : + offset); + } + uint64_t aligned_logical_offset_to_chunk_offset(uint64_t offset) const { + ceph_assert(offset % stripe_width == 0); + return (offset / stripe_width) * chunk_size; + } + uint64_t aligned_chunk_offset_to_logical_offset(uint64_t offset) const { + ceph_assert(offset % chunk_size == 0); + return (offset / chunk_size) * stripe_width; + } + std::pair aligned_offset_len_to_chunk( + std::pair in) const { + return std::make_pair( + aligned_logical_offset_to_chunk_offset(in.first), + aligned_logical_offset_to_chunk_offset(in.second)); + } + std::pair offset_len_to_stripe_bounds( + std::pair in) const { + uint64_t off = logical_to_prev_stripe_offset(in.first); + uint64_t len = logical_to_next_stripe_offset( + (in.first - off) + in.second); + return std::make_pair(off, len); + } +}; + +int decode( + const stripe_info_t &sinfo, + ceph::ErasureCodeInterfaceRef &ec_impl, + std::map &to_decode, + ceph::buffer::list *out); + +int decode( + const stripe_info_t &sinfo, + ceph::ErasureCodeInterfaceRef &ec_impl, + std::map &to_decode, + std::map &out); + +int encode( + const stripe_info_t &sinfo, + ceph::ErasureCodeInterfaceRef &ec_impl, + ceph::buffer::list &in, + const std::set &want, + std::map *out); + +class HashInfo { + uint64_t total_chunk_size = 0; + std::vector cumulative_shard_hashes; + + // purely ephemeral, represents the size once all in-flight ops commit + uint64_t projected_total_chunk_size = 0; +public: + HashInfo() {} + explicit HashInfo(unsigned num_chunks) : + cumulative_shard_hashes(num_chunks, -1) {} + void append(uint64_t old_size, std::map &to_append); + void clear() { + total_chunk_size = 0; + cumulative_shard_hashes = std::vector( + cumulative_shard_hashes.size(), + -1); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + uint32_t get_chunk_hash(int shard) const { + ceph_assert((unsigned)shard < cumulative_shard_hashes.size()); + return cumulative_shard_hashes[shard]; + } + uint64_t get_total_chunk_size() const { + return total_chunk_size; + } + uint64_t get_projected_total_chunk_size() const { + return projected_total_chunk_size; + } + uint64_t get_total_logical_size(const stripe_info_t &sinfo) const { + return get_total_chunk_size() * + (sinfo.get_stripe_width()/sinfo.get_chunk_size()); + } + uint64_t get_projected_total_logical_size(const stripe_info_t &sinfo) const { + return get_projected_total_chunk_size() * + (sinfo.get_stripe_width()/sinfo.get_chunk_size()); + } + void set_projected_total_logical_size( + const stripe_info_t &sinfo, + uint64_t logical_size) { + ceph_assert(sinfo.logical_offset_is_stripe_aligned(logical_size)); + projected_total_chunk_size = sinfo.aligned_logical_offset_to_chunk_offset( + logical_size); + } + void set_total_chunk_size_clear_hash(uint64_t new_chunk_size) { + cumulative_shard_hashes.clear(); + total_chunk_size = new_chunk_size; + } + bool has_chunk_hash() const { + return !cumulative_shard_hashes.empty(); + } + void update_to(const HashInfo &rhs) { + auto ptcs = projected_total_chunk_size; + *this = rhs; + projected_total_chunk_size = ptcs; + } + friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi); +}; + +typedef std::shared_ptr HashInfoRef; + +bool is_hinfo_key_string(const std::string &key); +const std::string &get_hinfo_key(); + +WRITE_CLASS_ENCODER(ECUtil::HashInfo) +} +#endif diff --git a/src/osd/ExtentCache.cc 
b/src/osd/ExtentCache.cc new file mode 100644 index 000000000..3a8bbf11b --- /dev/null +++ b/src/osd/ExtentCache.cc @@ -0,0 +1,245 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "ExtentCache.h" + +using std::ostream; + +using ceph::bufferlist; + +void ExtentCache::extent::_link_pin_state(pin_state &pin_state) +{ + ceph_assert(parent_extent_set); + ceph_assert(!parent_pin_state); + parent_pin_state = &pin_state; + pin_state.pin_list.push_back(*this); +} + +void ExtentCache::extent::_unlink_pin_state() +{ + ceph_assert(parent_extent_set); + ceph_assert(parent_pin_state); + auto liter = pin_state::list::s_iterator_to(*this); + parent_pin_state->pin_list.erase(liter); + parent_pin_state = nullptr; +} + +void ExtentCache::extent::unlink() +{ + ceph_assert(parent_extent_set); + ceph_assert(parent_pin_state); + + _unlink_pin_state(); + + // remove from extent set + { + auto siter = object_extent_set::set::s_iterator_to(*this); + auto &set = object_extent_set::set::container_from_iterator(siter); + ceph_assert(&set == &(parent_extent_set->extent_set)); + set.erase(siter); + } + + parent_extent_set = nullptr; + ceph_assert(!parent_pin_state); +} + +void ExtentCache::extent::link( + object_extent_set &extent_set, + pin_state &pin_state) +{ + ceph_assert(!parent_extent_set); + parent_extent_set = &extent_set; + extent_set.extent_set.insert(*this); + + _link_pin_state(pin_state); +} + +void ExtentCache::extent::move( + pin_state &to) +{ + _unlink_pin_state(); + _link_pin_state(to); +} + +void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset) +{ + if (eset.extent_set.empty()) { + auto siter = cache_set::s_iterator_to(eset); + auto &set = cache_set::container_from_iterator(siter); + ceph_assert(&set == &per_object_caches); + + // per_object_caches owns eset + per_object_caches.erase(eset); + delete &eset; + } +} + +ExtentCache::object_extent_set &ExtentCache::get_or_create( + const hobject_t &oid) +{ + cache_set::insert_commit_data data; + auto p = per_object_caches.insert_check(oid, Cmp(), data); + if (p.second) { + auto *eset = new object_extent_set(oid); + per_object_caches.insert_commit(*eset, data); + return *eset; + } else { + return *(p.first); + } +} + +ExtentCache::object_extent_set *ExtentCache::get_if_exists( + const hobject_t &oid) +{ + cache_set::insert_commit_data data; + auto p = per_object_caches.insert_check(oid, Cmp(), data); + if (p.second) { + return nullptr; + } else { + return &*(p.first); + } +} + +std::pair< + ExtentCache::object_extent_set::set::iterator, + ExtentCache::object_extent_set::set::iterator + > ExtentCache::object_extent_set::get_containing_range( + uint64_t off, uint64_t len) +{ + // fst is first iterator with end after off (may be end) + auto fst = extent_set.upper_bound(off, uint_cmp()); + if (fst != extent_set.begin()) + --fst; + if (fst != extent_set.end() && off >= (fst->offset + fst->get_length())) + ++fst; + + // lst is first iterator with start >= off + len (may be end) + auto lst = extent_set.lower_bound(off + len, uint_cmp()); + return std::make_pair(fst, lst); +} + +extent_set ExtentCache::reserve_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const 
extent_set &to_write, + const extent_set &to_read) +{ + if (to_write.empty() && to_read.empty()) { + return extent_set(); + } + extent_set must_read; + auto &eset = get_or_create(oid); + extent_set missing; + for (auto &&res: to_write) { + eset.traverse_update( + pin, + res.first, + res.second, + [&](uint64_t off, uint64_t len, + extent *ext, object_extent_set::update_action *action) { + action->action = object_extent_set::update_action::UPDATE_PIN; + if (!ext) { + missing.insert(off, len); + } + }); + } + must_read.intersection_of( + to_read, + missing); + return must_read; +} + +extent_map ExtentCache::get_remaining_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const extent_set &to_get) +{ + if (to_get.empty()) { + return extent_map(); + } + extent_map ret; + auto &eset = get_or_create(oid); + for (auto &&res: to_get) { + bufferlist bl; + uint64_t cur = res.first; + eset.traverse_update( + pin, + res.first, + res.second, + [&](uint64_t off, uint64_t len, + extent *ext, object_extent_set::update_action *action) { + ceph_assert(off == cur); + cur = off + len; + action->action = object_extent_set::update_action::NONE; + ceph_assert(ext && ext->bl && ext->pinned_by_write()); + bl.substr_of( + *(ext->bl), + off - ext->offset, + len); + ret.insert(off, len, bl); + }); + } + return ret; +} + +void ExtentCache::present_rmw_update( + const hobject_t &oid, + write_pin &pin, + const extent_map &extents) +{ + if (extents.empty()) { + return; + } + auto &eset = get_or_create(oid); + for (auto &&res: extents) { + eset.traverse_update( + pin, + res.get_off(), + res.get_len(), + [&](uint64_t off, uint64_t len, + extent *ext, object_extent_set::update_action *action) { + action->action = object_extent_set::update_action::NONE; + ceph_assert(ext && ext->pinned_by_write()); + action->bl = bufferlist(); + action->bl->substr_of( + res.get_val(), + off - res.get_off(), + len); + }); + } +} + +ostream &ExtentCache::print(ostream &out) const +{ + out << "ExtentCache(" << std::endl; + for (auto esiter = per_object_caches.begin(); + esiter != per_object_caches.end(); + ++esiter) { + out << " Extents(" << esiter->oid << ")[" << std::endl; + for (auto exiter = esiter->extent_set.begin(); + exiter != esiter->extent_set.end(); + ++exiter) { + out << " Extent(" << exiter->offset + << "~" << exiter->get_length() + << ":" << exiter->pin_tid() + << ")" << std::endl; + } + } + return out << ")" << std::endl; +} + +ostream &operator<<(ostream &lhs, const ExtentCache &cache) +{ + return cache.print(lhs); +} diff --git a/src/osd/ExtentCache.h b/src/osd/ExtentCache.h new file mode 100644 index 000000000..972228cd0 --- /dev/null +++ b/src/osd/ExtentCache.h @@ -0,0 +1,489 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef EXTENT_CACHE_H +#define EXTENT_CACHE_H + +#include +#include +#include +#include +#include +#include +#include +#include "include/interval_set.h" +#include "common/interval_map.h" +#include "include/buffer.h" +#include "common/hobject.h" + +/** + ExtentCache + + The main purpose of this cache is to ensure that we can pipeline + overlapping partial overwrites. 
+ + To that end we need to ensure that an extent pinned for an operation is + live until that operation completes. However, a particular extent + might be pinned by multiple operations (several pipelined writes + on the same object). + + 1) When we complete an operation, we only look at extents owned + by that operation. + 2) Per-extent overhead is fixed size. + 3) Per-operation metadata is fixed size. + + This is simple enough to realize with two main structures: + - extent: contains a pointer to the pin owning it and intrusive list + pointers to other extents owned by the same pin + - pin_state: contains the list head for extents owned by it + + This works as long as we only need to remember one "owner" for + each extent. To make this work, we'll need to leverage some + invariants guaranteed by higher layers: + + 1) Writes on a particular object must be ordered + 2) A particular object will have outstanding reads or writes, but not + both (note that you can have a read while a write is committed, but + not applied). + + Our strategy therefore will be to have whichever in-progress op will + finish "last" be the owner of a particular extent. For now, we won't + cache reads, so 2) simply means that we can assume that reads and + recovery operations imply no unstable extents on the object in + question. + + Write: WaitRead -> WaitCommit -> Complete + + Invariant 1) above actually indicates that we can't have writes + bypassing the WaitRead state while there are writes waiting on + Reads. Thus, the set of operations pinning a particular extent + must always complete in order of arrival. + + This suggests that a particular extent may be in only the following + states: + + 0) Empty (not in the map at all) + 1) Write Pending N + - Some write with reqid <= N is currently fetching the data for + this extent + - The extent must persist until Write reqid N completes + - All ops pinning this extent are writes in the WaitRead state of + the Write pipeline (there must be an in-progress write, so no + reads can be in progress). + 2) Write Pinned N: + - This extent has data corresponding to some reqid M <= N + - The extent must persist until Write reqid N commits + - All ops pinning this extent are writes in some Write + state (all are possible). Reads are not possible + in this state (or the others) due to 2). + + All of the above suggests that there are 3 things users can + ask of the cache corresponding to the 3 Write pipeline + states.
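To make those three operations concrete, here is an illustrative read-modify-write flow against the interface declared later in this header. It is a sketch only: the object id `oid`, the extent ranges, and the map of freshly written buffers `new_stripes` are assumptions for the example, and the read/encode plumbing in between is elided.

  ExtentCache cache;
  ExtentCache::write_pin pin;
  cache.open_write_pin(pin);                  // assigns the pin a fresh tid

  extent_set to_write, to_read;
  to_write.insert(0, 8192);                   // stripes this op will overwrite
  to_read = to_write;                         // and must read first (RMW)

  // 1) Pin to_write; learn which parts of to_read are not already cached/pending.
  extent_set must_read =
    cache.reserve_extents_for_rmw(oid, pin, to_write, to_read);

  // ... issue reads for must_read, wait for them and for earlier pinned writes ...

  // 2) Pull the remainder of to_read out of the cache.
  extent_set cached = to_read;
  cached.subtract(must_read);
  extent_map have = cache.get_remaining_extents_for_rmw(oid, pin, cached);

  // 3) After computing the new stripe contents, publish them so that later
  //    overlapping writes can reuse the buffers.
  cache.present_rmw_update(oid, pin, new_stripes);

  // Once the write commits, drop the pin; extents owned only by it are freed.
  cache.release_write_pin(pin);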
+ */ + +/// If someone wants these types, but not ExtentCache, move to another file +struct bl_split_merge { + ceph::buffer::list split( + uint64_t offset, + uint64_t length, + ceph::buffer::list &bl) const { + ceph::buffer::list out; + out.substr_of(bl, offset, length); + return out; + } + bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const { + return true; + } + ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const { + ceph::buffer::list bl{std::move(left)}; + bl.claim_append(right); + return bl; + } + uint64_t length(const ceph::buffer::list &b) const { return b.length(); } +}; +using extent_set = interval_set; +using extent_map = interval_map; + +class ExtentCache { + struct object_extent_set; + struct pin_state; +private: + + struct extent { + object_extent_set *parent_extent_set = nullptr; + pin_state *parent_pin_state = nullptr; + boost::intrusive::set_member_hook<> extent_set_member; + boost::intrusive::list_member_hook<> pin_list_member; + + uint64_t offset; + uint64_t length; + std::optional bl; + + uint64_t get_length() const { + return length; + } + + bool is_pending() const { + return bl == std::nullopt; + } + + bool pinned_by_write() const { + ceph_assert(parent_pin_state); + return parent_pin_state->is_write(); + } + + uint64_t pin_tid() const { + ceph_assert(parent_pin_state); + return parent_pin_state->tid; + } + + extent(uint64_t offset, ceph::buffer::list _bl) + : offset(offset), length(_bl.length()), bl(_bl) {} + + extent(uint64_t offset, uint64_t length) + : offset(offset), length(length) {} + + bool operator<(const extent &rhs) const { + return offset < rhs.offset; + } + private: + // can briefly violate the two link invariant, used in unlink() and move() + void _link_pin_state(pin_state &pin_state); + void _unlink_pin_state(); + public: + void unlink(); + void link(object_extent_set &parent_extent_set, pin_state &pin_state); + void move(pin_state &to); + }; + + struct object_extent_set : boost::intrusive::set_base_hook<> { + hobject_t oid; + explicit object_extent_set(const hobject_t &oid) : oid(oid) {} + + using set_member_options = boost::intrusive::member_hook< + extent, + boost::intrusive::set_member_hook<>, + &extent::extent_set_member>; + using set = boost::intrusive::set; + set extent_set; + + bool operator<(const object_extent_set &rhs) const { + return oid < rhs.oid; + } + + struct uint_cmp { + bool operator()(uint64_t lhs, const extent &rhs) const { + return lhs < rhs.offset; + } + bool operator()(const extent &lhs, uint64_t rhs) const { + return lhs.offset < rhs; + } + }; + std::pair get_containing_range( + uint64_t offset, uint64_t length); + + void erase(uint64_t offset, uint64_t length); + + struct update_action { + enum type { + NONE, + UPDATE_PIN + }; + type action = NONE; + std::optional bl; + }; + template + void traverse_update( + pin_state &pin, + uint64_t offset, + uint64_t length, + F &&f) { + auto range = get_containing_range(offset, length); + + if (range.first == range.second || range.first->offset > offset) { + uint64_t extlen = range.first == range.second ? + length : range.first->offset - offset; + + update_action action; + f(offset, extlen, nullptr, &action); + ceph_assert(!action.bl || action.bl->length() == extlen); + if (action.action == update_action::UPDATE_PIN) { + extent *ext = action.bl ? 
+ new extent(offset, *action.bl) : + new extent(offset, extlen); + ext->link(*this, pin); + } else { + ceph_assert(!action.bl); + } + } + + for (auto p = range.first; p != range.second;) { + extent *ext = &*p; + ++p; + + uint64_t extoff = std::max(ext->offset, offset); + uint64_t extlen = std::min( + ext->length - (extoff - ext->offset), + offset + length - extoff); + + update_action action; + f(extoff, extlen, ext, &action); + ceph_assert(!action.bl || action.bl->length() == extlen); + extent *final_extent = nullptr; + if (action.action == update_action::NONE) { + final_extent = ext; + } else { + pin_state *ps = ext->parent_pin_state; + ext->unlink(); + if ((ext->offset < offset) && + (ext->offset + ext->get_length() > offset)) { + extent *head = nullptr; + if (ext->bl) { + ceph::buffer::list bl; + bl.substr_of( + *(ext->bl), + 0, + offset - ext->offset); + head = new extent(ext->offset, bl); + } else { + head = new extent( + ext->offset, offset - ext->offset); + } + head->link(*this, *ps); + } + if ((ext->offset + ext->length > offset + length) && + (offset + length > ext->offset)) { + uint64_t nlen = + (ext->offset + ext->get_length()) - (offset + length); + extent *tail = nullptr; + if (ext->bl) { + ceph::buffer::list bl; + bl.substr_of( + *(ext->bl), + ext->get_length() - nlen, + nlen); + tail = new extent(offset + length, bl); + } else { + tail = new extent(offset + length, nlen); + } + tail->link(*this, *ps); + } + if (action.action == update_action::UPDATE_PIN) { + if (ext->bl) { + ceph::buffer::list bl; + bl.substr_of( + *(ext->bl), + extoff - ext->offset, + extlen); + final_extent = new ExtentCache::extent( + extoff, + bl); + } else { + final_extent = new ExtentCache::extent( + extoff, extlen); + } + final_extent->link(*this, pin); + } + delete ext; + } + + if (action.bl) { + ceph_assert(final_extent); + ceph_assert(final_extent->length == action.bl->length()); + final_extent->bl = *(action.bl); + } + + uint64_t next_off = p == range.second ? + offset + length : p->offset; + if (extoff + extlen < next_off) { + uint64_t tailoff = extoff + extlen; + uint64_t taillen = next_off - tailoff; + + update_action action; + f(tailoff, taillen, nullptr, &action); + ceph_assert(!action.bl || action.bl->length() == taillen); + if (action.action == update_action::UPDATE_PIN) { + extent *ext = action.bl ? 
+ new extent(tailoff, *action.bl) : + new extent(tailoff, taillen); + ext->link(*this, pin); + } else { + ceph_assert(!action.bl); + } + } + } + } + }; + struct Cmp { + bool operator()(const hobject_t &oid, const object_extent_set &rhs) const { + return oid < rhs.oid; + } + bool operator()(const object_extent_set &lhs, const hobject_t &oid) const { + return lhs.oid < oid; + } + }; + + object_extent_set &get_or_create(const hobject_t &oid); + object_extent_set *get_if_exists(const hobject_t &oid); + + void remove_and_destroy_if_empty(object_extent_set &set); + using cache_set = boost::intrusive::set; + cache_set per_object_caches; + + uint64_t next_write_tid = 1; + uint64_t next_read_tid = 1; + struct pin_state { + uint64_t tid = 0; + enum pin_type_t { + NONE, + WRITE, + }; + pin_type_t pin_type = NONE; + bool is_write() const { return pin_type == WRITE; } + + pin_state(const pin_state &other) = delete; + pin_state &operator=(const pin_state &other) = delete; + pin_state(pin_state &&other) = delete; + pin_state() = default; + + using list_member_options = boost::intrusive::member_hook< + extent, + boost::intrusive::list_member_hook<>, + &extent::pin_list_member>; + using list = boost::intrusive::list; + list pin_list; + ~pin_state() { + ceph_assert(pin_list.empty()); + ceph_assert(tid == 0); + ceph_assert(pin_type == NONE); + } + void _open(uint64_t in_tid, pin_type_t in_type) { + ceph_assert(pin_type == NONE); + ceph_assert(in_tid > 0); + tid = in_tid; + pin_type = in_type; + } + }; + + void release_pin(pin_state &p) { + for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) { + std::unique_ptr extent(&*iter); // we now own this + iter++; // unlink will invalidate + ceph_assert(extent->parent_extent_set); + auto &eset = *(extent->parent_extent_set); + extent->unlink(); + remove_and_destroy_if_empty(eset); + } + p.tid = 0; + p.pin_type = pin_state::NONE; + } + +public: + class write_pin : private pin_state { + friend class ExtentCache; + private: + void open(uint64_t in_tid) { + _open(in_tid, pin_state::WRITE); + } + public: + write_pin() : pin_state() {} + }; + + void open_write_pin(write_pin &pin) { + pin.open(next_write_tid++); + } + + /** + * Reserves extents required for rmw, and learn + * which need to be read + * + * Pins all extents in to_write. Returns subset of to_read not + * currently present in the cache. Caller must obtain those + * extents before calling get_remaining_extents_for_rmw. + * + * Transition table: + * - Empty -> Write Pending pin.reqid + * - Write Pending N -> Write Pending pin.reqid + * - Write Pinned N -> Write Pinned pin.reqid + * + * @param oid [in] object undergoing rmw + * @param pin [in,out] pin to use (obtained from create_write_pin) + * @param to_write [in] extents which will be written + * @param to_read [in] extents to read prior to write (must be subset + * of to_write) + * @return subset of to_read which isn't already present or pending + */ + extent_set reserve_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const extent_set &to_write, + const extent_set &to_read); + + /** + * Gets extents required for rmw not returned from + * reserve_extents_for_rmw + * + * Requested extents (to_get) must be the set to_read \ the set + * returned from reserve_extents_for_rmw. No transition table, + * all extents at this point must be present and already pinned + * for this pin by reserve_extents_for_rmw. 
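The trickiest part of traverse_update() above is how it splits an existing cached extent around the update range: the non-overlapping head and tail keep their old pin, and only the overlapping middle is re-pinned (or rebuilt) for the current write. A toy, self-contained model of that splitting rule (not Ceph code; types are invented for the example):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <optional>

  struct piece { uint64_t off, len; };

  struct split_result {
    std::optional<piece> head, mid, tail;
  };

  split_result split(uint64_t e_off, uint64_t e_len,    // existing extent
                     uint64_t u_off, uint64_t u_len) {  // update range
    split_result r;
    uint64_t e_end = e_off + e_len, u_end = u_off + u_len;
    if (e_off < u_off && e_end > u_off)
      r.head = piece{e_off, u_off - e_off};             // stays with the old pin
    uint64_t m_off = std::max(e_off, u_off);
    uint64_t m_end = std::min(e_end, u_end);
    if (m_off < m_end)
      r.mid = piece{m_off, m_end - m_off};              // re-pinned to the new write
    if (e_end > u_end && u_end > e_off)
      r.tail = piece{u_end, e_end - u_end};             // stays with the old pin
    return r;
  }

  int main() {
    auto r = split(0, 100, 40, 30);    // update [40,70) over cached [0,100)
    assert(r.head->off == 0  && r.head->len == 40);
    assert(r.mid->off  == 40 && r.mid->len  == 30);
    assert(r.tail->off == 70 && r.tail->len == 30);
  }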
+ * + * @param oid [in] object + * @param pin [in,out] pin associated with this IO + * @param to_get [in] extents to get (see above for restrictions) + * @return map of buffers from to_get + */ + extent_map get_remaining_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const extent_set &to_get); + + /** + * Updates the cache to reflect the rmw write + * + * All presented extents must already have been specified in + * reserve_extents_for_rmw under to_write. + * + * Transition table: + * - Empty -> invalid, must call reserve_extents_for_rmw first + * - Write Pending N -> Write Pinned N, update buffer + * (assert N >= pin.reqid) + * - Write Pinned N -> Update buffer (assert N >= pin.reqid) + * + * @param oid [in] object + * @param pin [in,out] pin associated with this IO + * @param extents [in] map of buffers to update + * @return void + */ + void present_rmw_update( + const hobject_t &oid, + write_pin &pin, + const extent_map &extents); + + /** + * Release all buffers pinned by pin + */ + void release_write_pin( + write_pin &pin) { + release_pin(pin); + } + + std::ostream &print(std::ostream &out) const; +}; + +std::ostream &operator <<(std::ostream &lhs, const ExtentCache &cache); + +#endif diff --git a/src/osd/HitSet.cc b/src/osd/HitSet.cc new file mode 100644 index 000000000..03475d36f --- /dev/null +++ b/src/osd/HitSet.cc @@ -0,0 +1,256 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "HitSet.h" +#include "common/Formatter.h" + +using std::ostream; +using std::list; +using ceph::Formatter; + +// -- HitSet -- + +HitSet::HitSet(const HitSet::Params& params) + : sealed(false) +{ + switch (params.get_type()) { + case TYPE_BLOOM: + { + BloomHitSet::Params *p = + static_cast(params.impl.get()); + impl.reset(new BloomHitSet(p)); + } + break; + + case TYPE_EXPLICIT_HASH: + impl.reset(new ExplicitHashHitSet(static_cast(params.impl.get()))); + break; + + case TYPE_EXPLICIT_OBJECT: + impl.reset(new ExplicitObjectHitSet(static_cast(params.impl.get()))); + break; + + default: + assert (0 == "unknown HitSet type"); + } +} + +void HitSet::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + encode(sealed, bl); + if (impl) { + encode((__u8)impl->get_type(), bl); + impl->encode(bl); + } else { + encode((__u8)TYPE_NONE, bl); + } + ENCODE_FINISH(bl); +} + +void HitSet::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(sealed, bl); + __u8 type; + decode(type, bl); + switch ((impl_type_t)type) { + case TYPE_EXPLICIT_HASH: + impl.reset(new ExplicitHashHitSet); + break; + case TYPE_EXPLICIT_OBJECT: + impl.reset(new ExplicitObjectHitSet); + break; + case TYPE_BLOOM: + impl.reset(new BloomHitSet); + break; + case TYPE_NONE: + impl.reset(NULL); + break; + default: + throw ceph::buffer::malformed_input("unrecognized HitMap type"); + } + if (impl) + impl->decode(bl); + DECODE_FINISH(bl); +} + +void HitSet::dump(Formatter *f) const +{ + f->dump_string("type", get_type_name()); + f->dump_string("sealed", sealed ? 
"yes" : "no"); + if (impl) + impl->dump(f); +} + +void HitSet::generate_test_instances(list& o) +{ + o.push_back(new HitSet); + o.push_back(new HitSet(new BloomHitSet(10, .1, 1))); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + o.push_back(new HitSet(new ExplicitHashHitSet)); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + o.push_back(new HitSet(new ExplicitObjectHitSet)); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); +} + +HitSet::Params::Params(const Params& o) noexcept +{ + if (o.get_type() != TYPE_NONE) { + create_impl(o.get_type()); + // it's annoying to write virtual operator= methods; use encode/decode + // instead. + ceph::buffer::list bl; + o.impl->encode(bl); + auto p = bl.cbegin(); + impl->decode(p); + } // else we don't need to do anything +} + +const HitSet::Params& HitSet::Params::operator=(const Params& o) +{ + create_impl(o.get_type()); + if (o.impl) { + // it's annoying to write virtual operator= methods; use encode/decode + // instead. + ceph::buffer::list bl; + o.impl->encode(bl); + auto p = bl.cbegin(); + impl->decode(p); + } + return *this; +} + +void HitSet::Params::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + if (impl) { + encode((__u8)impl->get_type(), bl); + impl->encode(bl); + } else { + encode((__u8)TYPE_NONE, bl); + } + ENCODE_FINISH(bl); +} + +bool HitSet::Params::create_impl(impl_type_t type) +{ + switch ((impl_type_t)type) { + case TYPE_EXPLICIT_HASH: + impl.reset(new ExplicitHashHitSet::Params); + break; + case TYPE_EXPLICIT_OBJECT: + impl.reset(new ExplicitObjectHitSet::Params); + break; + case TYPE_BLOOM: + impl.reset(new BloomHitSet::Params); + break; + case TYPE_NONE: + impl.reset(NULL); + break; + default: + return false; + } + return true; +} + +void HitSet::Params::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + __u8 type; + decode(type, bl); + if (!create_impl((impl_type_t)type)) + throw ceph::buffer::malformed_input("unrecognized HitMap type"); + if (impl) + impl->decode(bl); + DECODE_FINISH(bl); +} + +void HitSet::Params::dump(Formatter *f) const +{ + f->dump_string("type", HitSet::get_type_name(get_type())); + if (impl) + impl->dump(f); +} + +void HitSet::Params::generate_test_instances(list& o) +{ +#define loop_hitset_params(kind) \ +{ \ + list params; \ + kind::Params::generate_test_instances(params); \ + for (list::iterator i = params.begin(); \ + i != params.end(); ++i) \ + o.push_back(new Params(*i)); \ +} + o.push_back(new Params); + o.push_back(new Params(new BloomHitSet::Params)); + loop_hitset_params(BloomHitSet); + o.push_back(new Params(new ExplicitHashHitSet::Params)); + loop_hitset_params(ExplicitHashHitSet); + o.push_back(new Params(new ExplicitObjectHitSet::Params)); + loop_hitset_params(ExplicitObjectHitSet); +} + +ostream& operator<<(ostream& out, const HitSet::Params& p) { + out << HitSet::get_type_name(p.get_type()); + if (p.impl) { + out << "{"; + p.impl->dump_stream(out); + } + out << "}"; + return out; +} + + +void ExplicitHashHitSet::dump(Formatter *f) const { + f->dump_unsigned("insert_count", count); + f->open_array_section("hash_set"); + for (ceph::unordered_set::const_iterator p = hits.begin(); + p != 
hits.end(); + ++p) + f->dump_unsigned("hash", *p); + f->close_section(); +} + +void ExplicitObjectHitSet::dump(Formatter *f) const { + f->dump_unsigned("insert_count", count); + f->open_array_section("set"); + for (ceph::unordered_set::const_iterator p = hits.begin(); + p != hits.end(); + ++p) { + f->open_object_section("object"); + p->dump(f); + f->close_section(); + } + f->close_section(); +} + +void BloomHitSet::Params::dump(Formatter *f) const { + f->dump_float("false_positive_probability", get_fpp()); + f->dump_int("target_size", target_size); + f->dump_int("seed", seed); +} + +void BloomHitSet::dump(Formatter *f) const { + f->open_object_section("bloom_filter"); + bloom.dump(f); + f->close_section(); +} diff --git a/src/osd/HitSet.h b/src/osd/HitSet.h new file mode 100644 index 000000000..dedc45ed4 --- /dev/null +++ b/src/osd/HitSet.h @@ -0,0 +1,455 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_HITSET_H +#define CEPH_OSD_HITSET_H + +#include + +#include + +#include "include/encoding.h" +#include "include/unordered_set.h" +#include "common/bloom_filter.hpp" +#include "common/hobject.h" + +/** + * generic container for a HitSet + * + * Encapsulate a HitSetImpl of any type. Expose a generic interface + * to users and wrap the encoded object with a type so that it can be + * safely decoded later. + */ + +class HitSet { +public: + typedef enum { + TYPE_NONE = 0, + TYPE_EXPLICIT_HASH = 1, + TYPE_EXPLICIT_OBJECT = 2, + TYPE_BLOOM = 3 + } impl_type_t; + + static std::string_view get_type_name(impl_type_t t) { + switch (t) { + case TYPE_NONE: return "none"; + case TYPE_EXPLICIT_HASH: return "explicit_hash"; + case TYPE_EXPLICIT_OBJECT: return "explicit_object"; + case TYPE_BLOOM: return "bloom"; + default: return "???"; + } + } + std::string_view get_type_name() const { + if (impl) + return get_type_name(impl->get_type()); + return get_type_name(TYPE_NONE); + } + + /// abstract interface for a HitSet implementation + class Impl { + public: + virtual impl_type_t get_type() const = 0; + virtual bool is_full() const = 0; + virtual void insert(const hobject_t& o) = 0; + virtual bool contains(const hobject_t& o) const = 0; + virtual unsigned insert_count() const = 0; + virtual unsigned approx_unique_insert_count() const = 0; + virtual void encode(ceph::buffer::list &bl) const = 0; + virtual void decode(ceph::buffer::list::const_iterator& p) = 0; + virtual void dump(ceph::Formatter *f) const = 0; + virtual Impl* clone() const = 0; + virtual void seal() {} + virtual ~Impl() {} + }; + + boost::scoped_ptr impl; + bool sealed; + + class Params { + /// create an Impl* of the given type + bool create_impl(impl_type_t t); + + public: + class Impl { + public: + virtual impl_type_t get_type() const = 0; + virtual HitSet::Impl *get_new_impl() const = 0; + virtual void encode(ceph::buffer::list &bl) const {} + virtual void decode(ceph::buffer::list::const_iterator& p) {} + virtual void dump(ceph::Formatter *f) const {} + virtual void dump_stream(std::ostream& o) const {} + virtual ~Impl() {} + }; + + Params() {} + explicit Params(Impl *i) : impl(i) {} + virtual ~Params() {} + + boost::scoped_ptr impl; + + impl_type_t get_type() 
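As the comment above says, HitSet wraps the encoded implementation with a type tag so it can be safely decoded later: encode() writes a one-byte impl_type_t before the payload, and decode() switches on that byte to construct the right implementation first. A stripped-down sketch of that tag-then-payload idea (toy types, not the Ceph buffer/encoding machinery):

  #include <cassert>
  #include <cstdint>
  #include <memory>
  #include <stdexcept>
  #include <vector>

  enum impl_type_t : uint8_t { TYPE_NONE = 0, TYPE_BLOOM = 3 };

  struct Payload { virtual uint8_t type() const = 0; virtual ~Payload() = default; };
  struct Bloom : Payload { uint8_t type() const override { return TYPE_BLOOM; } };

  // encode: one type byte, then the (empty, in this toy) body
  std::vector<uint8_t> encode(const Payload* p) {
    return { p ? p->type() : uint8_t(TYPE_NONE) };
  }

  // decode: switch on the tag to construct the right implementation
  std::unique_ptr<Payload> decode(const std::vector<uint8_t>& bl) {
    switch (bl.at(0)) {
    case TYPE_BLOOM: return std::make_unique<Bloom>();
    case TYPE_NONE:  return nullptr;
    default:         throw std::runtime_error("unrecognized type");
    }
  }

  int main() {
    Bloom b;
    auto out = decode(encode(&b));
    assert(out && out->type() == TYPE_BLOOM);
  }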
const { + if (impl) + return impl->get_type(); + return TYPE_NONE; + } + + Params(const Params& o) noexcept; + const Params& operator=(const Params& o); + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + friend std::ostream& operator<<(std::ostream& out, const HitSet::Params& p); + }; + + HitSet() : impl(NULL), sealed(false) {} + explicit HitSet(Impl *i) : impl(i), sealed(false) {} + explicit HitSet(const HitSet::Params& params); + + HitSet(const HitSet& o) { + sealed = o.sealed; + if (o.impl) + impl.reset(o.impl->clone()); + else + impl.reset(NULL); + } + const HitSet& operator=(const HitSet& o) { + sealed = o.sealed; + if (o.impl) + impl.reset(o.impl->clone()); + else + impl.reset(NULL); + return *this; + } + + + bool is_full() const { + return impl->is_full(); + } + /// insert a hash into the set + void insert(const hobject_t& o) { + impl->insert(o); + } + /// query whether a hash is in the set + bool contains(const hobject_t& o) const { + return impl->contains(o); + } + + unsigned insert_count() const { + return impl->insert_count(); + } + unsigned approx_unique_insert_count() const { + return impl->approx_unique_insert_count(); + } + void seal() { + ceph_assert(!sealed); + sealed = true; + impl->seal(); + } + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + +private: + void reset_to_type(impl_type_t type); +}; +WRITE_CLASS_ENCODER(HitSet) +WRITE_CLASS_ENCODER(HitSet::Params) + +typedef boost::shared_ptr HitSetRef; + +std::ostream& operator<<(std::ostream& out, const HitSet::Params& p); + +/** + * explicitly enumerate hash hits in the set + */ +class ExplicitHashHitSet : public HitSet::Impl { + uint64_t count; + ceph::unordered_set hits; +public: + class Params : public HitSet::Params::Impl { + public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_HASH; + } + HitSet::Impl *get_new_impl() const override { + return new ExplicitHashHitSet; + } + static void generate_test_instances(std::list& o) { + o.push_back(new Params); + } + }; + + ExplicitHashHitSet() : count(0) {} + explicit ExplicitHashHitSet(const ExplicitHashHitSet::Params *p) : count(0) {} + ExplicitHashHitSet(const ExplicitHashHitSet &o) : count(o.count), + hits(o.hits) {} + + HitSet::Impl *clone() const override { + return new ExplicitHashHitSet(*this); + } + + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_HASH; + } + bool is_full() const override { + return false; + } + void insert(const hobject_t& o) override { + hits.insert(o.get_hash()); + ++count; + } + bool contains(const hobject_t& o) const override { + return hits.count(o.get_hash()); + } + unsigned insert_count() const override { + return count; + } + unsigned approx_unique_insert_count() const override { + return hits.size(); + } + void encode(ceph::buffer::list &bl) const override { + ENCODE_START(1, 1, bl); + encode(count, bl); + encode(hits, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) override { + DECODE_START(1, bl); + decode(count, bl); + decode(hits, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const override; + static void generate_test_instances(std::list& o) { + o.push_back(new ExplicitHashHitSet); + o.push_back(new ExplicitHashHitSet); + 
o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + } +}; +WRITE_CLASS_ENCODER(ExplicitHashHitSet) + +/** + * explicitly enumerate objects in the set + */ +class ExplicitObjectHitSet : public HitSet::Impl { + uint64_t count; + ceph::unordered_set hits; +public: + class Params : public HitSet::Params::Impl { + public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_OBJECT; + } + HitSet::Impl *get_new_impl() const override { + return new ExplicitObjectHitSet; + } + static void generate_test_instances(std::list& o) { + o.push_back(new Params); + } + }; + + ExplicitObjectHitSet() : count(0) {} + explicit ExplicitObjectHitSet(const ExplicitObjectHitSet::Params *p) : count(0) {} + ExplicitObjectHitSet(const ExplicitObjectHitSet &o) : count(o.count), + hits(o.hits) {} + + HitSet::Impl *clone() const override { + return new ExplicitObjectHitSet(*this); + } + + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_OBJECT; + } + bool is_full() const override { + return false; + } + void insert(const hobject_t& o) override { + hits.insert(o); + ++count; + } + bool contains(const hobject_t& o) const override { + return hits.count(o); + } + unsigned insert_count() const override { + return count; + } + unsigned approx_unique_insert_count() const override { + return hits.size(); + } + void encode(ceph::buffer::list &bl) const override { + ENCODE_START(1, 1, bl); + encode(count, bl); + encode(hits, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) override { + DECODE_START(1, bl); + decode(count, bl); + decode(hits, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const override; + static void generate_test_instances(std::list& o) { + o.push_back(new ExplicitObjectHitSet); + o.push_back(new ExplicitObjectHitSet); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + } +}; +WRITE_CLASS_ENCODER(ExplicitObjectHitSet) + +/** + * use a bloom_filter to track hits to the set + */ +class BloomHitSet : public HitSet::Impl { + compressible_bloom_filter bloom; + +public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_BLOOM; + } + + class Params : public HitSet::Params::Impl { + public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_BLOOM; + } + HitSet::Impl *get_new_impl() const override { + return new BloomHitSet; + } + + uint32_t fpp_micro; ///< false positive probability / 1M + uint64_t target_size; ///< number of unique insertions we expect to this HitSet + uint64_t seed; ///< seed to use when initializing the bloom filter + + Params() + : fpp_micro(0), target_size(0), seed(0) {} + Params(double fpp, uint64_t t, uint64_t s) + : fpp_micro(fpp * 1000000.0), target_size(t), seed(s) {} + Params(const Params &o) + : fpp_micro(o.fpp_micro), + target_size(o.target_size), + seed(o.seed) {} + ~Params() override {} + + double get_fpp() const { + return (double)fpp_micro / 1000000.0; + } + void set_fpp(double f) { + fpp_micro = (unsigned)(llrintl(f * 1000000.0)); + } + + void encode(ceph::buffer::list& bl) const override { + ENCODE_START(1, 1, bl); + encode(fpp_micro, bl); + encode(target_size, bl); + encode(seed, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) override { + DECODE_START(1, bl); + 
decode(fpp_micro, bl); + decode(target_size, bl); + decode(seed, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const override; + void dump_stream(std::ostream& o) const override { + o << "false_positive_probability: " + << get_fpp() << ", target_size: " << target_size + << ", seed: " << seed; + } + static void generate_test_instances(std::list& o) { + o.push_back(new Params); + o.push_back(new Params); + (*o.rbegin())->fpp_micro = 123456; + (*o.rbegin())->target_size = 300; + (*o.rbegin())->seed = 99; + } + }; + + BloomHitSet() {} + BloomHitSet(unsigned inserts, double fpp, int seed) + : bloom(inserts, fpp, seed) + {} + explicit BloomHitSet(const BloomHitSet::Params *p) : bloom(p->target_size, + p->get_fpp(), + p->seed) + {} + + BloomHitSet(const BloomHitSet &o) { + // oh god + ceph::buffer::list bl; + o.encode(bl); + auto bli = std::cbegin(bl); + this->decode(bli); + } + + HitSet::Impl *clone() const override { + return new BloomHitSet(*this); + } + + bool is_full() const override { + return bloom.is_full(); + } + + void insert(const hobject_t& o) override { + bloom.insert(o.get_hash()); + } + bool contains(const hobject_t& o) const override { + return bloom.contains(o.get_hash()); + } + unsigned insert_count() const override { + return bloom.element_count(); + } + unsigned approx_unique_insert_count() const override { + return bloom.approx_unique_element_count(); + } + void seal() override { + // aim for a density of .5 (50% of bit set) + double pc = bloom.density() * 2.0; + if (pc < 1.0) + bloom.compress(pc); + } + + void encode(ceph::buffer::list &bl) const override { + ENCODE_START(1, 1, bl); + encode(bloom, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) override { + DECODE_START(1, bl); + decode(bloom, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const override; + static void generate_test_instances(std::list& o) { + o.push_back(new BloomHitSet); + o.push_back(new BloomHitSet(10, .1, 1)); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + } +}; +WRITE_CLASS_ENCODER(BloomHitSet) + +#endif diff --git a/src/osd/MissingLoc.cc b/src/osd/MissingLoc.cc new file mode 100644 index 000000000..d45220a82 --- /dev/null +++ b/src/osd/MissingLoc.cc @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MissingLoc.h" + +#define dout_context cct +#undef dout_prefix +#define dout_prefix (gen_prefix(*_dout)) +#define dout_subsys ceph_subsys_osd + +using std::set; + +bool MissingLoc::readable_with_acting( + const hobject_t &hoid, + const set &acting, + eversion_t* v) const { + if (!needs_recovery(hoid, v)) + return true; + if (is_deleted(hoid)) + return false; + auto missing_loc_entry = missing_loc.find(hoid); + if (missing_loc_entry == missing_loc.end()) + return false; + const set &locs = missing_loc_entry->second; + ldout(cct, 10) << __func__ << ": locs:" << locs << dendl; + set have_acting; + for (auto i = locs.begin(); i != locs.end(); ++i) { + if (acting.count(*i)) + have_acting.insert(*i); + } + return (*is_readable)(have_acting); +} + +void MissingLoc::add_batch_sources_info( + const set &sources, + HBHandle *handle) +{ + ldout(cct, 10) << __func__ << ": adding sources in batch " + << sources.size() << dendl; + unsigned loop = 0; + bool sources_updated = false; + for (auto i = needs_recovery_map.begin(); + i != 
needs_recovery_map.end(); + ++i) { + if (handle && ++loop >= cct->_conf->osd_loop_before_reset_tphandle) { + handle->reset_tp_timeout(); + loop = 0; + } + if (i->second.is_delete()) + continue; + + auto p = missing_loc.find(i->first); + if (p == missing_loc.end()) { + p = missing_loc.emplace(i->first, set()).first; + } else { + _dec_count(p->second); + } + missing_loc[i->first].insert(sources.begin(), sources.end()); + _inc_count(p->second); + + if (!sources_updated) { + missing_loc_sources.insert(sources.begin(), sources.end()); + sources_updated = true; + } + } +} + +bool MissingLoc::add_source_info( + pg_shard_t fromosd, + const pg_info_t &oinfo, + const pg_missing_t &omissing, + HBHandle *handle) +{ + bool found_missing = false; + unsigned loop = 0; + bool sources_updated = false; + // found items? + for (auto p = needs_recovery_map.begin(); + p != needs_recovery_map.end(); + ++p) { + const hobject_t &soid(p->first); + eversion_t need = p->second.need; + if (handle && ++loop >= cct->_conf->osd_loop_before_reset_tphandle) { + handle->reset_tp_timeout(); + loop = 0; + } + if (p->second.is_delete()) { + ldout(cct, 10) << __func__ << " " << soid + << " delete, ignoring source" << dendl; + continue; + } + if (oinfo.last_update < need) { + ldout(cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." << fromosd + << " (last_update " << oinfo.last_update + << " < needed " << need << ")" << dendl; + continue; + } + if (p->first >= oinfo.last_backfill) { + // FIXME: this is _probably_ true, although it could conceivably + // be in the undefined region! Hmm! + ldout(cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." << fromosd + << " (past last_backfill " << oinfo.last_backfill + << ")" << dendl; + continue; + } + if (omissing.is_missing(soid)) { + ldout(cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." << fromosd << dendl; + continue; + } + + ldout(cct, 10) << "search_for_missing " << soid << " " << need + << " is on osd." << fromosd << dendl; + + { + auto p = missing_loc.find(soid); + if (p == missing_loc.end()) { + p = missing_loc.emplace(soid, set()).first; + } else { + _dec_count(p->second); + } + p->second.insert(fromosd); + _inc_count(p->second); + } + + if (!sources_updated) { + missing_loc_sources.insert(fromosd); + sources_updated = true; + } + found_missing = true; + } + + ldout(cct, 20) << "needs_recovery_map missing " << needs_recovery_map + << dendl; + return found_missing; +} + +void MissingLoc::check_recovery_sources(const OSDMapRef& osdmap) +{ + set now_down; + for (auto p = missing_loc_sources.begin(); + p != missing_loc_sources.end(); + ) { + if (osdmap->is_up(p->osd)) { + ++p; + continue; + } + ldout(cct, 10) << __func__ << " source osd." 
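add_source_info() above records the peer as a location for a missing object only when that peer could actually supply it: its log must reach the needed version, the object must fall within what it has backfilled, and the peer must not be missing the object itself. A condensed, self-contained predicate capturing those checks (toy stand-ins for eversion_t/pg_info_t/pg_missing_t, for illustration only):

  #include <cassert>
  #include <cstdint>
  #include <set>

  struct PeerInfo {
    uint64_t last_update;         // highest log version the peer has
    uint64_t last_backfill;       // backfill watermark; objects below it are present
  };

  bool peer_can_supply(uint64_t need, uint64_t soid,
                       const PeerInfo& peer,
                       const std::set<uint64_t>& peer_missing) {
    if (peer.last_update < need)   return false;  // peer's log is too old
    if (soid >= peer.last_backfill) return false; // not yet backfilled that far
    if (peer_missing.count(soid))  return false;  // peer is missing it too
    return true;
  }

  int main() {
    PeerInfo p{50, 1000};
    std::set<uint64_t> missing{7};
    assert( peer_can_supply(40, 5, p, missing));
    assert(!peer_can_supply(60, 5, p, missing));  // needs a newer version than the peer has
    assert(!peer_can_supply(40, 7, p, missing));  // peer is missing the object as well
  }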
<< *p << " now down" << dendl; + now_down.insert(*p); + missing_loc_sources.erase(p++); + } + + if (now_down.empty()) { + ldout(cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl; + } else { + ldout(cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are " + << missing_loc_sources << dendl; + + // filter missing_loc + auto p = missing_loc.begin(); + while (p != missing_loc.end()) { + auto q = p->second.begin(); + bool changed = false; + while (q != p->second.end()) { + if (now_down.count(*q)) { + if (!changed) { + changed = true; + _dec_count(p->second); + } + p->second.erase(q++); + } else { + ++q; + } + } + if (p->second.empty()) { + missing_loc.erase(p++); + } else { + if (changed) { + _inc_count(p->second); + } + ++p; + } + } + } +} + +void MissingLoc::remove_stray_recovery_sources(pg_shard_t stray) +{ + ldout(cct, 10) << __func__ << " remove osd " << stray << " from missing_loc" << dendl; + // filter missing_loc + auto p = missing_loc.begin(); + while (p != missing_loc.end()) { + auto q = p->second.begin(); + bool changed = false; + while (q != p->second.end()) { + if (*q == stray) { + if (!changed) { + changed = true; + _dec_count(p->second); + } + p->second.erase(q++); + } else { + ++q; + } + } + if (p->second.empty()) { + missing_loc.erase(p++); + } else { + if (changed) { + _inc_count(p->second); + } + ++p; + } + } + // filter missing_loc_sources + for (auto p = missing_loc_sources.begin(); p != missing_loc_sources.end();) { + if (*p != stray) { + ++p; + continue; + } + ldout(cct, 10) << __func__ << " remove osd" << stray << " from missing_loc_sources" << dendl; + missing_loc_sources.erase(p++); + } +} diff --git a/src/osd/MissingLoc.h b/src/osd/MissingLoc.h new file mode 100644 index 000000000..9bce3ceda --- /dev/null +++ b/src/osd/MissingLoc.h @@ -0,0 +1,353 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +#include "OSDMap.h" +#include "common/HBHandle.h" +#include "common/ceph_context.h" +#include "common/dout.h" +#include "osd_types.h" + +class MissingLoc { + public: + + class MappingInfo { + public: + virtual const std::set &get_upset() const = 0; + virtual bool is_ec_pg() const = 0; + virtual int get_pg_size() const = 0; + virtual ~MappingInfo() {} + }; + + // a loc_count indicates how many locations we know in each of + // these distinct sets + struct loc_count_t { + int up = 0; //< up + int other = 0; //< other + + friend bool operator<(const loc_count_t& l, + const loc_count_t& r) { + return (l.up < r.up || + (l.up == r.up && + (l.other < r.other))); + } + friend std::ostream& operator<<(std::ostream& out, const loc_count_t& l) { + ceph_assert(l.up >= 0); + ceph_assert(l.other >= 0); + return out << "(" << l.up << "+" << l.other << ")"; + } + }; + + + using missing_by_count_t = std::map>; + private: + loc_count_t _get_count(const std::set &shards) { + loc_count_t r; + for (auto s : shards) { + if (mapping_info->get_upset().count(s)) { + r.up++; + } else { + r.other++; + } + } + return r; + } + + std::map needs_recovery_map; + std::map > missing_loc; + std::set missing_loc_sources; + + // for every entry in missing_loc, we count how many of each type of shard we have, + // and maintain totals here. The sum of the values for this std::map will always equal + // missing_loc.size(). 
+ missing_by_count_t missing_by_count; + + void pgs_by_shard_id( + const std::set& s, + std::map >& pgsbs) { + if (mapping_info->is_ec_pg()) { + int num_shards = mapping_info->get_pg_size(); + // For completely missing shards initialize with empty std::set + for (int i = 0 ; i < num_shards ; ++i) { + shard_id_t shard(i); + pgsbs[shard]; + } + for (auto pgs: s) + pgsbs[pgs.shard].insert(pgs); + } else { + pgsbs[shard_id_t::NO_SHARD] = s; + } + } + + void _inc_count(const std::set& s) { + std::map< shard_id_t, std::set > pgsbs; + pgs_by_shard_id(s, pgsbs); + for (auto shard: pgsbs) + ++missing_by_count[shard.first][_get_count(shard.second)]; + } + void _dec_count(const std::set& s) { + std::map< shard_id_t, std::set > pgsbs; + pgs_by_shard_id(s, pgsbs); + for (auto shard: pgsbs) { + auto p = missing_by_count[shard.first].find(_get_count(shard.second)); + ceph_assert(p != missing_by_count[shard.first].end()); + if (--p->second == 0) { + missing_by_count[shard.first].erase(p); + } + } + } + + spg_t pgid; + MappingInfo *mapping_info; + DoutPrefixProvider *dpp; + CephContext *cct; + std::set empty_set; + public: + boost::scoped_ptr is_readable; + boost::scoped_ptr is_recoverable; + explicit MissingLoc( + spg_t pgid, + MappingInfo *mapping_info, + DoutPrefixProvider *dpp, + CephContext *cct) + : pgid(pgid), mapping_info(mapping_info), dpp(dpp), cct(cct) { } + void set_backend_predicates( + IsPGReadablePredicate *_is_readable, + IsPGRecoverablePredicate *_is_recoverable) { + is_readable.reset(_is_readable); + is_recoverable.reset(_is_recoverable); + } + const IsPGRecoverablePredicate &get_recoverable_predicate() const { + return *is_recoverable; + } + std::ostream& gen_prefix(std::ostream& out) const { + return dpp->gen_prefix(out); + } + bool needs_recovery( + const hobject_t &hoid, + eversion_t *v = 0) const { + std::map::const_iterator i = + needs_recovery_map.find(hoid); + if (i == needs_recovery_map.end()) + return false; + if (v) + *v = i->second.need; + return true; + } + bool is_deleted(const hobject_t &hoid) const { + auto i = needs_recovery_map.find(hoid); + if (i == needs_recovery_map.end()) + return false; + return i->second.is_delete(); + } + bool is_unfound(const hobject_t &hoid) const { + auto it = needs_recovery_map.find(hoid); + if (it == needs_recovery_map.end()) { + return false; + } + if (it->second.is_delete()) { + return false; + } + auto mit = missing_loc.find(hoid); + return mit == missing_loc.end() || !(*is_recoverable)(mit->second); + } + bool readable_with_acting( + const hobject_t &hoid, + const std::set &acting, + eversion_t* v = 0) const; + uint64_t num_unfound() const { + uint64_t ret = 0; + for (std::map::const_iterator i = + needs_recovery_map.begin(); + i != needs_recovery_map.end(); + ++i) { + if (i->second.is_delete()) + continue; + auto mi = missing_loc.find(i->first); + if (mi == missing_loc.end() || !(*is_recoverable)(mi->second)) + ++ret; + } + return ret; + } + + bool have_unfound() const { + for (std::map::const_iterator i = + needs_recovery_map.begin(); + i != needs_recovery_map.end(); + ++i) { + if (i->second.is_delete()) + continue; + auto mi = missing_loc.find(i->first); + if (mi == missing_loc.end() || !(*is_recoverable)(mi->second)) + return true; + } + return false; + } + void clear() { + needs_recovery_map.clear(); + missing_loc.clear(); + missing_loc_sources.clear(); + missing_by_count.clear(); + } + + void add_location(const hobject_t &hoid, pg_shard_t location) { + auto p = missing_loc.find(hoid); + if (p == missing_loc.end()) { + p = 
missing_loc.emplace(hoid, std::set()).first; + } else { + _dec_count(p->second); + } + p->second.insert(location); + _inc_count(p->second); + } + void remove_location(const hobject_t &hoid, pg_shard_t location) { + auto p = missing_loc.find(hoid); + if (p != missing_loc.end()) { + _dec_count(p->second); + p->second.erase(location); + if (p->second.empty()) { + missing_loc.erase(p); + } else { + _inc_count(p->second); + } + } + } + + void clear_location(const hobject_t &hoid) { + auto p = missing_loc.find(hoid); + if (p != missing_loc.end()) { + _dec_count(p->second); + missing_loc.erase(p); + } + } + + void add_active_missing(const pg_missing_t &missing) { + for (std::map::const_iterator i = + missing.get_items().begin(); + i != missing.get_items().end(); + ++i) { + std::map::const_iterator j = + needs_recovery_map.find(i->first); + if (j == needs_recovery_map.end()) { + needs_recovery_map.insert(*i); + } else { + if (i->second.need != j->second.need) { + lgeneric_dout(cct, 0) << this << " " << pgid << " unexpected need for " + << i->first << " have " << j->second + << " tried to add " << i->second << dendl; + ceph_assert(0 == "unexpected need for missing item"); + } + } + } + } + + void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) { + needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete); + } + void revise_need(const hobject_t &hoid, eversion_t need) { + auto it = needs_recovery_map.find(hoid); + ceph_assert(it != needs_recovery_map.end()); + it->second.need = need; + } + + /// Adds info about a possible recovery source + bool add_source_info( + pg_shard_t source, ///< [in] source + const pg_info_t &oinfo, ///< [in] info + const pg_missing_t &omissing, ///< [in] (optional) missing + HBHandle *handle ///< [in] ThreadPool handle + ); ///< @return whether a new object location was discovered + + /// Adds recovery sources in batch + void add_batch_sources_info( + const std::set &sources, ///< [in] a std::set of resources which can be used for all objects + HBHandle *handle ///< [in] ThreadPool handle + ); + + /// Uses osdmap to update structures for now down sources + void check_recovery_sources(const OSDMapRef& osdmap); + + /// Remove stray from recovery sources + void remove_stray_recovery_sources(pg_shard_t stray); + + /// Call when hoid is no longer missing in acting std::set + void recovered(const hobject_t &hoid) { + needs_recovery_map.erase(hoid); + auto p = missing_loc.find(hoid); + if (p != missing_loc.end()) { + _dec_count(p->second); + missing_loc.erase(p); + } + } + + /// Call to update structures for hoid after a change + void rebuild( + const hobject_t &hoid, + pg_shard_t self, + const std::set &to_recover, + const pg_info_t &info, + const pg_missing_t &missing, + const std::map &pmissing, + const std::map &pinfo) { + recovered(hoid); + std::optional item; + auto miter = missing.get_items().find(hoid); + if (miter != missing.get_items().end()) { + item = miter->second; + } else { + for (auto &&i: to_recover) { + if (i == self) + continue; + auto pmiter = pmissing.find(i); + ceph_assert(pmiter != pmissing.end()); + miter = pmiter->second.get_items().find(hoid); + if (miter != pmiter->second.get_items().end()) { + item = miter->second; + break; + } + } + } + if (!item) + return; // recovered! 
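Every mutation of a missing_loc entry above follows the same discipline: call _dec_count() on the old location set, mutate it, then _inc_count() on the new one, so that the missing_by_count histogram always sums to missing_loc.size(). A self-contained sketch of that bookkeeping pattern (toy types; the real histogram is keyed by loc_count_t per shard rather than a plain size):

  #include <cassert>
  #include <map>
  #include <set>
  #include <string>

  using Locs = std::set<int>;                      // osd ids holding a copy
  std::map<std::string, Locs> missing_loc;         // object -> known locations
  std::map<size_t, int> missing_by_count;          // |locations| -> number of objects

  void inc_count(const Locs& l) { ++missing_by_count[l.size()]; }
  void dec_count(const Locs& l) {
    auto p = missing_by_count.find(l.size());
    assert(p != missing_by_count.end());
    if (--p->second == 0) missing_by_count.erase(p);
  }

  // add_location(): dec old bucket, mutate the set, inc new bucket
  void add_location(const std::string& oid, int osd) {
    auto p = missing_loc.find(oid);
    if (p == missing_loc.end())
      p = missing_loc.emplace(oid, Locs{}).first;
    else
      dec_count(p->second);
    p->second.insert(osd);
    inc_count(p->second);
  }

  int main() {
    add_location("obj", 1);
    add_location("obj", 2);
    assert(missing_by_count.at(2) == 1);           // one object known on two osds
  }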
+ + needs_recovery_map[hoid] = *item; + if (item->is_delete()) + return; + auto mliter = + missing_loc.emplace(hoid, std::set()).first; + ceph_assert(info.last_backfill.is_max()); + ceph_assert(info.last_update >= item->need); + if (!missing.is_missing(hoid)) + mliter->second.insert(self); + for (auto &&i: pmissing) { + if (i.first == self) + continue; + auto pinfoiter = pinfo.find(i.first); + ceph_assert(pinfoiter != pinfo.end()); + if (item->need <= pinfoiter->second.last_update && + hoid <= pinfoiter->second.last_backfill && + !i.second.is_missing(hoid)) + mliter->second.insert(i.first); + } + _inc_count(mliter->second); + } + + const std::set &get_locations(const hobject_t &hoid) const { + auto it = missing_loc.find(hoid); + return it == missing_loc.end() ? empty_set : it->second; + } + const std::map> &get_missing_locs() const { + return missing_loc; + } + const std::map &get_needs_recovery() const { + return needs_recovery_map; + } + + const missing_by_count_t &get_missing_by_count() const { + return missing_by_count; + } +}; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc new file mode 100644 index 000000000..4066a679f --- /dev/null +++ b/src/osd/OSD.cc @@ -0,0 +1,11378 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2017 OVH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "acconfig.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_SYS_PARAM_H +#include +#endif + +#ifdef HAVE_SYS_MOUNT_H +#include +#endif + +#include "osd/PG.h" +#include "osd/scrub_machine.h" +#include "osd/pg_scrubber.h" + +#include "include/types.h" +#include "include/compat.h" +#include "include/random.h" + +#include "OSD.h" +#include "OSDMap.h" +#include "Watch.h" +#include "osdc/Objecter.h" + +#include "common/errno.h" +#include "common/ceph_argparse.h" +#include "common/ceph_releases.h" +#include "common/ceph_time.h" +#include "common/version.h" +#include "common/async/blocked_completion.h" +#include "common/pick_address.h" +#include "common/blkdev.h" +#include "common/numa.h" + +#include "os/ObjectStore.h" +#ifdef HAVE_LIBFUSE +#include "os/FuseStore.h" +#endif + +#include "PrimaryLogPG.h" + +#include "msg/Messenger.h" +#include "msg/Message.h" + +#include "mon/MonClient.h" + +#include "messages/MLog.h" + +#include "messages/MGenericMessage.h" +#include "messages/MOSDPing.h" +#include "messages/MOSDFailure.h" +#include "messages/MOSDMarkMeDown.h" +#include "messages/MOSDMarkMeDead.h" +#include "messages/MOSDFull.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDBeacon.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDPGTemp.h" +#include "messages/MOSDPGReadyToMerge.h" + +#include "messages/MOSDMap.h" +#include "messages/MMonGetOSDMap.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGNotify2.h" +#include "messages/MOSDPGQuery.h" +#include "messages/MOSDPGQuery2.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGRemove.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGInfo2.h" +#include 
"messages/MOSDPGCreate.h" +#include "messages/MOSDPGCreate2.h" +#include "messages/MBackfillReserve.h" +#include "messages/MRecoveryReserve.h" +#include "messages/MOSDForceRecovery.h" +#include "messages/MOSDECSubOpWrite.h" +#include "messages/MOSDECSubOpWriteReply.h" +#include "messages/MOSDECSubOpRead.h" +#include "messages/MOSDECSubOpReadReply.h" +#include "messages/MOSDPGCreated.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" + +#include "messages/MOSDPeeringOp.h" + +#include "messages/MOSDAlive.h" + +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrub2.h" +#include "messages/MOSDRepScrub.h" + +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" + +#include "messages/MPGStats.h" + +#include "messages/MWatchNotify.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGPull.h" + +#include "messages/MMonGetPurgedSnaps.h" +#include "messages/MMonGetPurgedSnapsReply.h" + +#include "common/perf_counters.h" +#include "common/Timer.h" +#include "common/LogClient.h" +#include "common/AsyncReserver.h" +#include "common/HeartbeatMap.h" +#include "common/admin_socket.h" +#include "common/ceph_context.h" + +#include "global/signal_handler.h" +#include "global/pidfile.h" + +#include "include/color.h" +#include "perfglue/cpu_profiler.h" +#include "perfglue/heap_profiler.h" + +#include "osd/ClassHandler.h" +#include "osd/OpRequest.h" + +#include "auth/AuthAuthorizeHandler.h" +#include "auth/RotatingKeyRing.h" + +#include "objclass/objclass.h" + +#include "common/cmdparse.h" +#include "include/str_list.h" +#include "include/util.h" + +#include "include/ceph_assert.h" +#include "common/config.h" +#include "common/EventTrace.h" + +#include "json_spirit/json_spirit_reader.h" +#include "json_spirit/json_spirit_writer.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/osd.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif +#ifdef HAVE_JAEGER +#include "common/tracer.h" +#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch()) + +using std::deque; +using std::list; +using std::lock_guard; +using std::make_pair; +using std::make_tuple; +using std::make_unique; +using std::map; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::stringstream; +using std::to_string; +using std::unique_ptr; +using std::vector; + +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::decode; +using ceph::encode; +using ceph::fixed_u_to_string; +using ceph::Formatter; +using ceph::heartbeat_handle_d; +using ceph::make_mutex; + +using namespace ceph::osd::scheduler; +using TOPNSPC::common::cmd_getval; + +static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) { + return *_dout << "osd." << whoami << " " << epoch << " "; +} + +//Initial features in new superblock. 
+//Features here are also automatically upgraded +CompatSet OSD::get_osd_initial_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2); + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} + +//Features are added here that this OSD supports. +CompatSet OSD::get_osd_compat_set() { + CompatSet compat = get_osd_initial_compat_set(); + //Any features here can be set in code, but not in initial superblock + compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + return compat; +} + +OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) : + osd(osd), + cct(osd->cct), + whoami(osd->whoami), store(osd->store), + log_client(osd->log_client), clog(osd->clog), + pg_recovery_stats(osd->pg_recovery_stats), + cluster_messenger(osd->cluster_messenger), + client_messenger(osd->client_messenger), + logger(osd->logger), + recoverystate_perf(osd->recoverystate_perf), + monc(osd->monc), + osd_max_object_size(cct->_conf, "osd_max_object_size"), + osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"), + publish_lock{ceph::make_mutex("OSDService::publish_lock")}, + pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")}, + max_oldest_map(0), + scrubs_local(0), + scrubs_remote(0), + agent_valid_iterator(false), + agent_ops(0), + flush_mode_high_count(0), + agent_active(true), + agent_thread(this), + agent_stop_flag(false), + agent_timer(osd->client_messenger->cct, agent_timer_lock), + last_recalibrate(ceph_clock_now()), + promote_max_objects(0), + promote_max_bytes(0), + poolctx(poolctx), + objecter(make_unique(osd->client_messenger->cct, + osd->objecter_messenger, + osd->monc, poolctx)), + m_objecter_finishers(cct->_conf->osd_objecter_finishers), + watch_timer(osd->client_messenger->cct, watch_lock), + next_notif_id(0), + recovery_request_timer(cct, recovery_request_lock, false), + sleep_timer(cct, sleep_lock, false), + reserver_finisher(cct), + local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills, + cct->_conf->osd_min_recovery_priority), + remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills, + cct->_conf->osd_min_recovery_priority), + snap_reserver(cct, &reserver_finisher, + cct->_conf->osd_max_trimming_pgs), + recovery_ops_active(0), + 
recovery_ops_reserved(0), + recovery_paused(false), + map_cache(cct, cct->_conf->osd_map_cache_size), + map_bl_cache(cct->_conf->osd_map_cache_size), + map_bl_inc_cache(cct->_conf->osd_map_cache_size), + cur_state(NONE), + cur_ratio(0), physical_ratio(0), + boot_epoch(0), up_epoch(0), bind_epoch(0) +{ + objecter->init(); + + for (int i = 0; i < m_objecter_finishers; i++) { + ostringstream str; + str << "objecter-finisher-" << i; + auto fin = make_unique(osd->client_messenger->cct, str.str(), "finisher"); + objecter_finishers.push_back(std::move(fin)); + } +} + +#ifdef PG_DEBUG_REFS +void OSDService::add_pgid(spg_t pgid, PG *pg) { + std::lock_guard l(pgid_lock); + if (!pgid_tracker.count(pgid)) { + live_pgs[pgid] = pg; + } + pgid_tracker[pgid]++; +} +void OSDService::remove_pgid(spg_t pgid, PG *pg) +{ + std::lock_guard l(pgid_lock); + ceph_assert(pgid_tracker.count(pgid)); + ceph_assert(pgid_tracker[pgid] > 0); + pgid_tracker[pgid]--; + if (pgid_tracker[pgid] == 0) { + pgid_tracker.erase(pgid); + live_pgs.erase(pgid); + } +} +void OSDService::dump_live_pgids() +{ + std::lock_guard l(pgid_lock); + derr << "live pgids:" << dendl; + for (map::const_iterator i = pgid_tracker.cbegin(); + i != pgid_tracker.cend(); + ++i) { + derr << "\t" << *i << dendl; + live_pgs[i->first]->dump_live_ids(); + } +} +#endif + + +ceph::signedspan OSDService::get_mnow() +{ + return ceph::mono_clock::now() - osd->startup_time; +} + +void OSDService::identify_splits_and_merges( + OSDMapRef old_map, + OSDMapRef new_map, + spg_t pgid, + set> *split_children, + set> *merge_pgs) +{ + if (!old_map->have_pg_pool(pgid.pool())) { + return; + } + int old_pgnum = old_map->get_pg_num(pgid.pool()); + auto p = osd->pg_num_history.pg_nums.find(pgid.pool()); + if (p == osd->pg_num_history.pg_nums.end()) { + return; + } + dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch() + << " to e" << new_map->get_epoch() + << " pg_nums " << p->second << dendl; + deque queue; + queue.push_back(pgid); + set did; + while (!queue.empty()) { + auto cur = queue.front(); + queue.pop_front(); + did.insert(cur); + unsigned pgnum = old_pgnum; + for (auto q = p->second.lower_bound(old_map->get_epoch()); + q != p->second.end() && + q->first <= new_map->get_epoch(); + ++q) { + if (pgnum < q->second) { + // split? + if (cur.ps() < pgnum) { + set children; + if (cur.is_split(pgnum, q->second, &children)) { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " children " << children << dendl; + for (auto i : children) { + split_children->insert(make_pair(i, q->first)); + if (!did.count(i)) + queue.push_back(i); + } + } + } else if (cur.ps() < q->second) { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is a child" << dendl; + // normally we'd capture this from the parent, but it's + // possible the parent doesn't exist yet (it will be + // fabricated to allow an intervening merge). note this PG + // as a split child here to be sure we catch it. + split_children->insert(make_pair(cur, q->first)); + } else { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is post-split, skipping" << dendl; + } + } else if (merge_pgs) { + // merge? 
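+ // The branches below distinguish, for a pg_num decrease recorded at
+ // epoch q->first (pgnum -> q->second):
+ //  - cur.ps() >= q->second but < pgnum: cur is a merge source; its
+ //    target (parent) is found via is_merge_source().
+ //  - cur.ps() >= pgnum: cur did not exist before this change, skip.
+ //  - otherwise: cur is a merge target; its sources are recovered by
+ //    running the merge backwards as a split (is_split()).
+ // Illustrative numbers only: for pg_num 8 -> 4, pg x.5 is a merge
+ // source whose target is x.1, and x.1 is a merge target whose source
+ // set is {x.5}.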
+ if (cur.ps() >= q->second) { + if (cur.ps() < pgnum) { + spg_t parent; + if (cur.is_merge_source(pgnum, q->second, &parent)) { + set children; + parent.is_split(q->second, pgnum, &children); + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is merge source, target " << parent + << ", source(s) " << children << dendl; + merge_pgs->insert(make_pair(parent, q->first)); + if (!did.count(parent)) { + // queue (and re-scan) parent in case it might not exist yet + // and there are some future splits pending on it + queue.push_back(parent); + } + for (auto c : children) { + merge_pgs->insert(make_pair(c, q->first)); + if (!did.count(c)) + queue.push_back(c); + } + } + } else { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is beyond old pgnum, skipping" << dendl; + } + } else { + set children; + if (cur.is_split(q->second, pgnum, &children)) { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is merge target, source " << children << dendl; + for (auto c : children) { + merge_pgs->insert(make_pair(c, q->first)); + if (!did.count(c)) + queue.push_back(c); + } + merge_pgs->insert(make_pair(cur, q->first)); + } + } + } + pgnum = q->second; + } + } +} + +void OSDService::need_heartbeat_peer_update() +{ + osd->need_heartbeat_peer_update(); +} + +HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer) +{ + std::lock_guard l(hb_stamp_lock); + if (peer >= hb_stamps.size()) { + hb_stamps.resize(peer + 1); + } + if (!hb_stamps[peer]) { + hb_stamps[peer] = ceph::make_ref(peer); + } + return hb_stamps[peer]; +} + +void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid) +{ + osd->enqueue_peering_evt( + spgid, + PGPeeringEventRef( + std::make_shared( + epoch, epoch, + RenewLease()))); +} + +void OSDService::start_shutdown() +{ + { + std::lock_guard l(agent_timer_lock); + agent_timer.shutdown(); + } + + { + std::lock_guard l(sleep_lock); + sleep_timer.shutdown(); + } + + { + std::lock_guard l(recovery_request_lock); + recovery_request_timer.shutdown(); + } +} + +void OSDService::shutdown_reserver() +{ + reserver_finisher.wait_for_empty(); + reserver_finisher.stop(); +} + +void OSDService::shutdown() +{ + mono_timer.suspend(); + + { + std::lock_guard l(watch_lock); + watch_timer.shutdown(); + } + + objecter->shutdown(); + for (auto& f : objecter_finishers) { + f->wait_for_empty(); + f->stop(); + } + + publish_map(OSDMapRef()); + next_osdmap = OSDMapRef(); +} + +void OSDService::init() +{ + reserver_finisher.start(); + for (auto& f : objecter_finishers) { + f->start(); + } + objecter->set_client_incarnation(0); + + // deprioritize objecter in daemonperf output + objecter->get_logger()->set_prio_adjust(-3); + + watch_timer.init(); + agent_timer.init(); + mono_timer.resume(); + + agent_thread.create("osd_srv_agent"); + + if (cct->_conf->osd_recovery_delay_start) + defer_recovery(cct->_conf->osd_recovery_delay_start); +} + +void OSDService::final_init() +{ + objecter->start(osdmap.get()); +} + +void OSDService::activate_map() +{ + // wake/unwake the tiering agent + std::lock_guard l{agent_lock}; + agent_active = + !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) && + osd->is_active(); + agent_cond.notify_all(); +} + +void OSDService::request_osdmap_update(epoch_t e) +{ + osd->osdmap_subscribe(e, false); +} + + +class AgentTimeoutCB : public Context { + PGRef pg; +public: + explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {} + 
void finish(int) override { + pg->agent_choose_mode_restart(); + } +}; + +void OSDService::agent_entry() +{ + dout(10) << __func__ << " start" << dendl; + std::unique_lock agent_locker{agent_lock}; + + while (!agent_stop_flag) { + if (agent_queue.empty()) { + dout(20) << __func__ << " empty queue" << dendl; + agent_cond.wait(agent_locker); + continue; + } + uint64_t level = agent_queue.rbegin()->first; + set& top = agent_queue.rbegin()->second; + dout(10) << __func__ + << " tiers " << agent_queue.size() + << ", top is " << level + << " with pgs " << top.size() + << ", ops " << agent_ops << "/" + << cct->_conf->osd_agent_max_ops + << (agent_active ? " active" : " NOT ACTIVE") + << dendl; + dout(20) << __func__ << " oids " << agent_oids << dendl; + int max = cct->_conf->osd_agent_max_ops - agent_ops; + int agent_flush_quota = max; + if (!flush_mode_high_count) + agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops; + if (agent_flush_quota <= 0 || top.empty() || !agent_active) { + agent_cond.wait(agent_locker); + continue; + } + + if (!agent_valid_iterator || agent_queue_pos == top.end()) { + agent_queue_pos = top.begin(); + agent_valid_iterator = true; + } + PGRef pg = *agent_queue_pos; + dout(10) << "high_count " << flush_mode_high_count + << " agent_ops " << agent_ops + << " flush_quota " << agent_flush_quota << dendl; + agent_locker.unlock(); + if (!pg->agent_work(max, agent_flush_quota)) { + dout(10) << __func__ << " " << pg->pg_id + << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time + << " seconds" << dendl; + + logger->inc(l_osd_tier_delay); + // Queue a timer to call agent_choose_mode for this pg in 5 seconds + std::lock_guard timer_locker{agent_timer_lock}; + Context *cb = new AgentTimeoutCB(pg); + agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb); + } + agent_locker.lock(); + } + dout(10) << __func__ << " finish" << dendl; +} + +void OSDService::agent_stop() +{ + { + std::lock_guard l(agent_lock); + + // By this time all ops should be cancelled + ceph_assert(agent_ops == 0); + // By this time all PGs are shutdown and dequeued + if (!agent_queue.empty()) { + set& top = agent_queue.rbegin()->second; + derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl; + ceph_abort_msg("agent queue not empty"); + } + + agent_stop_flag = true; + agent_cond.notify_all(); + } + agent_thread.join(); +} + +// ------------------------------------- + +void OSDService::promote_throttle_recalibrate() +{ + utime_t now = ceph_clock_now(); + double dur = now - last_recalibrate; + last_recalibrate = now; + unsigned prob = promote_probability_millis; + + uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec; + uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec; + + unsigned min_prob = 1; + + uint64_t attempts, obj, bytes; + promote_counter.sample_and_attenuate(&attempts, &obj, &bytes); + dout(10) << __func__ << " " << attempts << " attempts, promoted " + << obj << " objects and " << byte_u_t(bytes) << "; target " + << target_obj_sec << " obj/sec or " + << byte_u_t(target_bytes_sec) << "/sec" + << dendl; + + // calculate what the probability *should* be, given the targets + unsigned new_prob; + if (attempts && dur > 0) { + uint64_t avg_size = 1; + if (obj) + avg_size = std::max(bytes / obj, 1); + unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts; + unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0 + / (double)attempts; + dout(20) << __func__ << " po " << po 
<< " pb " << pb << " avg_size " + << avg_size << dendl; + if (target_obj_sec && target_bytes_sec) + new_prob = std::min(po, pb); + else if (target_obj_sec) + new_prob = po; + else if (target_bytes_sec) + new_prob = pb; + else + new_prob = 1000; + } else { + new_prob = 1000; + } + dout(20) << __func__ << " new_prob " << new_prob << dendl; + + // correct for persistent skew between target rate and actual rate, adjust + double ratio = 1.0; + unsigned actual = 0; + if (attempts && obj) { + actual = obj * 1000 / attempts; + ratio = (double)actual / (double)prob; + new_prob = (double)new_prob / ratio; + } + new_prob = std::max(new_prob, min_prob); + new_prob = std::min(new_prob, 1000u); + + // adjust + prob = (prob + new_prob) / 2; + prob = std::max(prob, min_prob); + prob = std::min(prob, 1000u); + dout(10) << __func__ << " actual " << actual + << ", actual/prob ratio " << ratio + << ", adjusted new_prob " << new_prob + << ", prob " << promote_probability_millis << " -> " << prob + << dendl; + promote_probability_millis = prob; + + // set hard limits for this interval to mitigate stampedes + promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2; + promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2; +} + +// ------------------------------------- + +float OSDService::get_failsafe_full_ratio() +{ + float full_ratio = cct->_conf->osd_failsafe_full_ratio; + if (full_ratio > 1.0) full_ratio /= 100.0; + return full_ratio; +} + +OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject) +{ + // The OSDMap ratios take precendence. So if the failsafe is .95 and + // the admin sets the cluster full to .96, the failsafe moves up to .96 + // too. (Not that having failsafe == full is ideal, but it's better than + // dropping writes before the clusters appears full.) + OSDMapRef osdmap = get_osdmap(); + if (!osdmap || osdmap->get_epoch() == 0) { + return NONE; + } + float nearfull_ratio = osdmap->get_nearfull_ratio(); + float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio); + float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio); + float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio); + + if (osdmap->require_osd_release < ceph_release_t::luminous) { + // use the failsafe for nearfull and full; the mon isn't using the + // flags anyway because we're mid-upgrade. + full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; + nearfull_ratio = failsafe_ratio; + } else if (full_ratio <= 0 || + backfillfull_ratio <= 0 || + nearfull_ratio <= 0) { + derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl; + // use failsafe flag. ick. the monitor did something wrong or the user + // did something stupid. 
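+  // Collapsing everything to the failsafe below keeps the ordering
+  // invariant nearfull <= backfillfull <= full <= failsafe intact even
+  // when the map carries unset ratios; with commonly used defaults
+  // (illustrative values only) the healthy chain is
+  // 0.85 <= 0.90 <= 0.95 <= 0.97.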
+ full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; + nearfull_ratio = failsafe_ratio; + } + + if (injectfull_state > NONE && injectfull) { + inject = "(Injected)"; + return injectfull_state; + } else if (pratio > failsafe_ratio) { + return FAILSAFE; + } else if (ratio > full_ratio) { + return FULL; + } else if (ratio > backfillfull_ratio) { + return BACKFILLFULL; + } else if (pratio > nearfull_ratio) { + return NEARFULL; + } + return NONE; +} + +void OSDService::check_full_status(float ratio, float pratio) +{ + std::lock_guard l(full_status_lock); + + cur_ratio = ratio; + physical_ratio = pratio; + + string inject; + s_names new_state; + new_state = recalc_full_state(ratio, pratio, inject); + + dout(20) << __func__ << " cur ratio " << ratio + << ", physical ratio " << pratio + << ", new state " << get_full_state_name(new_state) + << " " << inject + << dendl; + + // warn + if (cur_state != new_state) { + dout(10) << __func__ << " " << get_full_state_name(cur_state) + << " -> " << get_full_state_name(new_state) << dendl; + if (new_state == FAILSAFE) { + clog->error() << "full status failsafe engaged, dropping updates, now " + << (int)roundf(ratio * 100) << "% full"; + } else if (cur_state == FAILSAFE) { + clog->error() << "full status failsafe disengaged, no longer dropping " + << "updates, now " << (int)roundf(ratio * 100) << "% full"; + } + cur_state = new_state; + } +} + +bool OSDService::need_fullness_update() +{ + OSDMapRef osdmap = get_osdmap(); + s_names cur = NONE; + if (osdmap->exists(whoami)) { + if (osdmap->get_state(whoami) & CEPH_OSD_FULL) { + cur = FULL; + } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) { + cur = BACKFILLFULL; + } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) { + cur = NEARFULL; + } + } + s_names want = NONE; + if (is_full()) + want = FULL; + else if (is_backfillfull()) + want = BACKFILLFULL; + else if (is_nearfull()) + want = NEARFULL; + return want != cur; +} + +bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const +{ + if (injectfull && injectfull_state >= type) { + // injectfull is either a count of the number of times to return failsafe full + // or if -1 then always return full + if (injectfull > 0) + --injectfull; + ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD (" + << (injectfull < 0 ? 
"set" : std::to_string(injectfull)) << ")" + << dendl; + return true; + } + return false; +} + +bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const +{ + std::lock_guard l(full_status_lock); + + if (_check_inject_full(dpp, type)) + return true; + + if (cur_state >= type) + ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio + << " physical " << physical_ratio << dendl; + + return cur_state >= type; +} + +bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat) +{ + ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl; + { + std::lock_guard l(full_status_lock); + if (_check_inject_full(dpp, type)) { + return true; + } + } + + float pratio; + float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used); + + string notused; + s_names tentative_state = recalc_full_state(ratio, pratio, notused); + + if (tentative_state >= type) + ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl; + + return tentative_state >= type; +} + +bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, FAILSAFE); +} + +bool OSDService::check_full(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, FULL); +} + +bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats) +{ + return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats); +} + +bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, BACKFILLFULL); +} + +bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, NEARFULL); +} + +bool OSDService::is_failsafe_full() const +{ + std::lock_guard l(full_status_lock); + return cur_state == FAILSAFE; +} + +bool OSDService::is_full() const +{ + std::lock_guard l(full_status_lock); + return cur_state >= FULL; +} + +bool OSDService::is_backfillfull() const +{ + std::lock_guard l(full_status_lock); + return cur_state >= BACKFILLFULL; +} + +bool OSDService::is_nearfull() const +{ + std::lock_guard l(full_status_lock); + return cur_state >= NEARFULL; +} + +void OSDService::set_injectfull(s_names type, int64_t count) +{ + std::lock_guard l(full_status_lock); + injectfull_state = type; + injectfull = count; +} + +void OSDService::set_statfs(const struct store_statfs_t &stbuf, + osd_alert_list_t& alerts) +{ + uint64_t bytes = stbuf.total; + uint64_t avail = stbuf.available; + uint64_t used = stbuf.get_used_raw(); + + // For testing fake statfs values so it doesn't matter if all + // OSDs are using the same partition. 
+ if (cct->_conf->fake_statfs_for_testing) { + uint64_t total_num_bytes = 0; + vector pgs; + osd->_get_pgs(&pgs); + for (auto p : pgs) { + total_num_bytes += p->get_stats_num_bytes(); + } + bytes = cct->_conf->fake_statfs_for_testing; + if (total_num_bytes < bytes) + avail = bytes - total_num_bytes; + else + avail = 0; + dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing + << " adjust available " << avail + << dendl; + used = bytes - avail; + } + + logger->set(l_osd_stat_bytes, bytes); + logger->set(l_osd_stat_bytes_used, used); + logger->set(l_osd_stat_bytes_avail, avail); + + std::lock_guard l(stat_lock); + osd_stat.statfs = stbuf; + osd_stat.os_alerts.clear(); + osd_stat.os_alerts[whoami].swap(alerts); + if (cct->_conf->fake_statfs_for_testing) { + osd_stat.statfs.total = bytes; + osd_stat.statfs.available = avail; + // For testing don't want used to go negative, so clear reserved + osd_stat.statfs.internally_reserved = 0; + } +} + +osd_stat_t OSDService::set_osd_stat(vector& hb_peers, + int num_pgs) +{ + utime_t now = ceph_clock_now(); + auto stale_time = g_conf().get_val("osd_mon_heartbeat_stat_stale"); + std::lock_guard l(stat_lock); + osd_stat.hb_peers.swap(hb_peers); + osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist); + osd_stat.num_pgs = num_pgs; + // Clean entries that aren't updated + // This is called often enough that we can just remove 1 at a time + for (auto i: osd_stat.hb_pingtime) { + if (i.second.last_update == 0) + continue; + if (stale_time && now.sec() - i.second.last_update > stale_time) { + dout(20) << __func__ << " time out heartbeat for osd " << i.first + << " last_update " << i.second.last_update << dendl; + osd_stat.hb_pingtime.erase(i.first); + break; + } + } + return osd_stat; +} + +void OSDService::inc_osd_stat_repaired() +{ + std::lock_guard l(stat_lock); + osd_stat.num_shards_repaired++; + return; +} + +float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, + uint64_t adjust_used) +{ + *pratio = + ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total); + + if (adjust_used) { + dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl; + if (new_stat.statfs.available > adjust_used) + new_stat.statfs.available -= adjust_used; + else + new_stat.statfs.available = 0; + dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl; + } + + // Check all pgs and adjust kb_used to include all pending backfill data + int backfill_adjusted = 0; + vector pgs; + osd->_get_pgs(&pgs); + for (auto p : pgs) { + backfill_adjusted += p->pg_stat_adjust(&new_stat); + } + if (backfill_adjusted) { + dout(20) << __func__ << " backfill adjusted " << new_stat << dendl; + } + return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total); +} + +void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch) +{ + OSDMapRef next_map = get_nextmap_reserved(); + // service map is always newer/newest + ceph_assert(from_epoch <= next_map->get_epoch()); + + if (next_map->is_down(peer) || + next_map->get_info(peer).up_from > from_epoch) { + m->put(); + release_map(next_map); + return; + } + ConnectionRef peer_con; + if (peer == whoami) { + peer_con = osd->cluster_messenger->get_loopback_connection(); + } else { + peer_con = osd->cluster_messenger->connect_to_osd( + next_map->get_cluster_addrs(peer), false, true); + } + maybe_share_map(peer_con.get(), next_map); + peer_con->send_message(m); + release_map(next_map); +} + +void 
OSDService::send_message_osd_cluster(std::vector>& messages, epoch_t from_epoch) +{ + OSDMapRef next_map = get_nextmap_reserved(); + // service map is always newer/newest + ceph_assert(from_epoch <= next_map->get_epoch()); + + for (auto& iter : messages) { + if (next_map->is_down(iter.first) || + next_map->get_info(iter.first).up_from > from_epoch) { + iter.second->put(); + continue; + } + ConnectionRef peer_con; + if (iter.first == whoami) { + peer_con = osd->cluster_messenger->get_loopback_connection(); + } else { + peer_con = osd->cluster_messenger->connect_to_osd( + next_map->get_cluster_addrs(iter.first), false, true); + } + maybe_share_map(peer_con.get(), next_map); + peer_con->send_message(iter.second); + } + release_map(next_map); +} +ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch) +{ + OSDMapRef next_map = get_nextmap_reserved(); + // service map is always newer/newest + ceph_assert(from_epoch <= next_map->get_epoch()); + + if (next_map->is_down(peer) || + next_map->get_info(peer).up_from > from_epoch) { + release_map(next_map); + return NULL; + } + ConnectionRef con; + if (peer == whoami) { + con = osd->cluster_messenger->get_loopback_connection(); + } else { + con = osd->cluster_messenger->connect_to_osd( + next_map->get_cluster_addrs(peer), false, true); + } + release_map(next_map); + return con; +} + +pair OSDService::get_con_osd_hb(int peer, epoch_t from_epoch) +{ + OSDMapRef next_map = get_nextmap_reserved(); + // service map is always newer/newest + ceph_assert(from_epoch <= next_map->get_epoch()); + + pair ret; + if (next_map->is_down(peer) || + next_map->get_info(peer).up_from > from_epoch) { + release_map(next_map); + return ret; + } + ret.first = osd->hb_back_client_messenger->connect_to_osd( + next_map->get_hb_back_addrs(peer)); + ret.second = osd->hb_front_client_messenger->connect_to_osd( + next_map->get_hb_front_addrs(peer)); + release_map(next_map); + return ret; +} + +entity_name_t OSDService::get_cluster_msgr_name() const +{ + return cluster_messenger->get_myname(); +} + +void OSDService::queue_want_pg_temp(pg_t pgid, + const vector& want, + bool forced) +{ + std::lock_guard l(pg_temp_lock); + auto p = pg_temp_pending.find(pgid); + if (p == pg_temp_pending.end() || + p->second.acting != want || + forced) { + pg_temp_wanted[pgid] = {want, forced}; + } +} + +void OSDService::remove_want_pg_temp(pg_t pgid) +{ + std::lock_guard l(pg_temp_lock); + pg_temp_wanted.erase(pgid); + pg_temp_pending.erase(pgid); +} + +void OSDService::_sent_pg_temp() +{ +#ifdef HAVE_STDLIB_MAP_SPLICING + pg_temp_pending.merge(pg_temp_wanted); +#else + pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)), + make_move_iterator(end(pg_temp_wanted))); +#endif + pg_temp_wanted.clear(); +} + +void OSDService::requeue_pg_temp() +{ + std::lock_guard l(pg_temp_lock); + // wanted overrides pending. note that remove_want_pg_temp + // clears the item out of both. 
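+ // Fold anything still pending (sent to the mon but not yet acted on)
+ // back into the wanted set: _sent_pg_temp() merges wanted into pending,
+ // and the swap below turns that union back into wanted, so the next
+ // send_pg_temp() retransmits the full set.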
+ unsigned old_wanted = pg_temp_wanted.size(); + unsigned old_pending = pg_temp_pending.size(); + _sent_pg_temp(); + pg_temp_wanted.swap(pg_temp_pending); + dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> " + << pg_temp_wanted.size() << dendl; +} + +std::ostream& operator<<(std::ostream& out, + const OSDService::pg_temp_t& pg_temp) +{ + out << pg_temp.acting; + if (pg_temp.forced) { + out << " (forced)"; + } + return out; +} + +void OSDService::send_pg_temp() +{ + std::lock_guard l(pg_temp_lock); + if (pg_temp_wanted.empty()) + return; + dout(10) << "send_pg_temp " << pg_temp_wanted << dendl; + MOSDPGTemp *ms[2] = {nullptr, nullptr}; + for (auto& [pgid, pg_temp] : pg_temp_wanted) { + auto& m = ms[pg_temp.forced]; + if (!m) { + m = new MOSDPGTemp(osdmap->get_epoch()); + m->forced = pg_temp.forced; + } + m->pg_temp.emplace(pgid, pg_temp.acting); + } + for (auto m : ms) { + if (m) { + monc->send_mon_message(m); + } + } + _sent_pg_temp(); +} + +void OSDService::send_pg_created(pg_t pgid) +{ + std::lock_guard l(pg_created_lock); + dout(20) << __func__ << dendl; + auto o = get_osdmap(); + if (o->require_osd_release >= ceph_release_t::luminous) { + pg_created.insert(pgid); + monc->send_mon_message(new MOSDPGCreated(pgid)); + } +} + +void OSDService::send_pg_created() +{ + std::lock_guard l(pg_created_lock); + dout(20) << __func__ << dendl; + auto o = get_osdmap(); + if (o->require_osd_release >= ceph_release_t::luminous) { + for (auto pgid : pg_created) { + monc->send_mon_message(new MOSDPGCreated(pgid)); + } + } +} + +void OSDService::prune_pg_created() +{ + std::lock_guard l(pg_created_lock); + dout(20) << __func__ << dendl; + auto o = get_osdmap(); + auto i = pg_created.begin(); + while (i != pg_created.end()) { + auto p = o->get_pg_pool(i->pool()); + if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) { + dout(20) << __func__ << " pruning " << *i << dendl; + i = pg_created.erase(i); + } else { + dout(20) << __func__ << " keeping " << *i << dendl; + ++i; + } + } +} + + +// -------------------------------------- +// dispatch + +bool OSDService::can_inc_scrubs() +{ + bool can_inc = false; + std::lock_guard l(sched_scrub_lock); + + if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { + dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote + << " remote < max " << cct->_conf->osd_max_scrubs << dendl; + can_inc = true; + } else { + dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote + << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; + } + + return can_inc; +} + +bool OSDService::inc_scrubs_local() +{ + bool result = false; + std::lock_guard l{sched_scrub_lock}; + if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { + dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1) + << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl; + result = true; + ++scrubs_local; + } else { + dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; + } + return result; +} + +void OSDService::dec_scrubs_local() +{ + std::lock_guard l{sched_scrub_lock}; + dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1) + << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl; + --scrubs_local; + ceph_assert(scrubs_local >= 0); +} + +bool OSDService::inc_scrubs_remote() +{ + bool result = false; + std::lock_guard 
l{sched_scrub_lock}; + if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { + dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1) + << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl; + result = true; + ++scrubs_remote; + } else { + dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; + } + return result; +} + +void OSDService::dec_scrubs_remote() +{ + std::lock_guard l{sched_scrub_lock}; + dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1) + << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl; + --scrubs_remote; + ceph_assert(scrubs_remote >= 0); +} + +void OSDService::dump_scrub_reservations(Formatter *f) +{ + std::lock_guard l{sched_scrub_lock}; + f->dump_int("scrubs_local", scrubs_local); + f->dump_int("scrubs_remote", scrubs_remote); + f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs); +} + +void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch, + epoch_t *_bind_epoch) const +{ + std::lock_guard l(epoch_lock); + if (_boot_epoch) + *_boot_epoch = boot_epoch; + if (_up_epoch) + *_up_epoch = up_epoch; + if (_bind_epoch) + *_bind_epoch = bind_epoch; +} + +void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch, + const epoch_t *_bind_epoch) +{ + std::lock_guard l(epoch_lock); + if (_boot_epoch) { + ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch); + boot_epoch = *_boot_epoch; + } + if (_up_epoch) { + ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch); + up_epoch = *_up_epoch; + } + if (_bind_epoch) { + ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch); + bind_epoch = *_bind_epoch; + } +} + +bool OSDService::prepare_to_stop() +{ + std::unique_lock l(is_stopping_lock); + if (get_state() != NOT_STOPPING) + return false; + + OSDMapRef osdmap = get_osdmap(); + if (osdmap && osdmap->is_up(whoami)) { + dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl; + set_state(PREPARING_TO_STOP); + monc->send_mon_message( + new MOSDMarkMeDown( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true, // request ack + true // mark as down and dead + )); + const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout); + is_stopping_cond.wait_for(l, timeout, + [this] { return get_state() == STOPPING; }); + } + + dout(0) << __func__ << " starting shutdown" << dendl; + set_state(STOPPING); + return true; +} + +void OSDService::got_stop_ack() +{ + std::scoped_lock l(is_stopping_lock); + if (get_state() == PREPARING_TO_STOP) { + dout(0) << __func__ << " starting shutdown" << dendl; + set_state(STOPPING); + is_stopping_cond.notify_all(); + } else { + dout(10) << __func__ << " ignoring msg" << dendl; + } +} + +MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to, + OSDSuperblock& sblock) +{ + MOSDMap *m = new MOSDMap(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = max_oldest_map; + m->newest_map = sblock.newest_map; + + int max = cct->_conf->osd_map_message_max; + ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes; + + if (since < m->oldest_map) { + // we don't have the next map the target wants, so start with a + // full map. 
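+    // Illustrative epochs: a peer asking for everything since e90 when
+    // the oldest stored map is e100 gets the full map for e100 followed
+    // by incrementals e101..to, truncated once osd_map_message_max maps
+    // or osd_map_message_max_bytes bytes have been queued; the remainder
+    // can be requested later.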
+ bufferlist bl; + dout(10) << __func__ << " oldest map " << max_oldest_map << " > since " + << since << ", starting with full map" << dendl; + since = m->oldest_map; + if (!get_map_bl(since, bl)) { + derr << __func__ << " missing full map " << since << dendl; + goto panic; + } + max--; + max_bytes -= bl.length(); + m->maps[since] = std::move(bl); + } + for (epoch_t e = since + 1; e <= to; ++e) { + bufferlist bl; + if (get_inc_map_bl(e, bl)) { + m->incremental_maps[e] = std::move(bl); + } else { + dout(10) << __func__ << " missing incremental map " << e << dendl; + if (!get_map_bl(e, bl)) { + derr << __func__ << " also missing full map " << e << dendl; + goto panic; + } + m->maps[e] = std::move(bl); + } + max--; + max_bytes -= bl.length(); + if (max <= 0 || max_bytes <= 0) { + break; + } + } + return m; + + panic: + if (!m->maps.empty() || + !m->incremental_maps.empty()) { + // send what we have so far + return m; + } + // send something + bufferlist bl; + if (get_inc_map_bl(m->newest_map, bl)) { + m->incremental_maps[m->newest_map] = std::move(bl); + } else { + derr << __func__ << " unable to load latest map " << m->newest_map << dendl; + if (!get_map_bl(m->newest_map, bl)) { + derr << __func__ << " unable to load latest full map " << m->newest_map + << dendl; + ceph_abort(); + } + m->maps[m->newest_map] = std::move(bl); + } + return m; +} + +void OSDService::send_map(MOSDMap *m, Connection *con) +{ + con->send_message(m); +} + +void OSDService::send_incremental_map(epoch_t since, Connection *con, + const OSDMapRef& osdmap) +{ + epoch_t to = osdmap->get_epoch(); + dout(10) << "send_incremental_map " << since << " -> " << to + << " to " << con << " " << con->get_peer_addr() << dendl; + + MOSDMap *m = NULL; + while (!m) { + OSDSuperblock sblock(get_superblock()); + if (since < sblock.oldest_map) { + // just send latest full map + MOSDMap *m = new MOSDMap(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = max_oldest_map; + m->newest_map = sblock.newest_map; + get_map_bl(to, m->maps[to]); + send_map(m, con); + return; + } + + if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) { + dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs + << ", only sending most recent" << dendl; + since = to - cct->_conf->osd_map_share_max_epochs; + } + + m = build_incremental_map_msg(since, to, sblock); + } + send_map(m, con); +} + +bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl) +{ + bool found = map_bl_cache.lookup(e, &bl); + if (found) { + logger->inc(l_osd_map_bl_cache_hit); + return true; + } + logger->inc(l_osd_map_bl_cache_miss); + found = store->read(meta_ch, + OSD::get_osdmap_pobject_name(e), 0, 0, bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0; + if (found) { + _add_map_bl(e, bl); + } + return found; +} + +bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl) +{ + std::lock_guard l(map_cache_lock); + bool found = map_bl_inc_cache.lookup(e, &bl); + if (found) { + logger->inc(l_osd_map_bl_cache_hit); + return true; + } + logger->inc(l_osd_map_bl_cache_miss); + found = store->read(meta_ch, + OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0; + if (found) { + _add_map_inc_bl(e, bl); + } + return found; +} + +void OSDService::_add_map_bl(epoch_t e, bufferlist& bl) +{ + dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl; + // cache a contiguous buffer + if (bl.get_num_buffers() > 1) { + bl.rebuild(); + } + 
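+  // The (now contiguous) buffer is what gets charged to the osd_mapbl
+  // mempool and held in the bounded map_bl_cache below.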
bl.try_assign_to_mempool(mempool::mempool_osd_mapbl); + map_bl_cache.add(e, bl); +} + +void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl) +{ + dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl; + // cache a contiguous buffer + if (bl.get_num_buffers() > 1) { + bl.rebuild(); + } + bl.try_assign_to_mempool(mempool::mempool_osd_mapbl); + map_bl_inc_cache.add(e, bl); +} + +OSDMapRef OSDService::_add_map(OSDMap *o) +{ + epoch_t e = o->get_epoch(); + + if (cct->_conf->osd_map_dedup) { + // Dedup against an existing map at a nearby epoch + OSDMapRef for_dedup = map_cache.lower_bound(e); + if (for_dedup) { + OSDMap::dedup(for_dedup.get(), o); + } + } + bool existed; + OSDMapRef l = map_cache.add(e, o, &existed); + if (existed) { + delete o; + } + return l; +} + +OSDMapRef OSDService::try_get_map(epoch_t epoch) +{ + std::lock_guard l(map_cache_lock); + OSDMapRef retval = map_cache.lookup(epoch); + if (retval) { + dout(30) << "get_map " << epoch << " -cached" << dendl; + logger->inc(l_osd_map_cache_hit); + return retval; + } + { + logger->inc(l_osd_map_cache_miss); + epoch_t lb = map_cache.cached_key_lower_bound(); + if (epoch < lb) { + dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl; + logger->inc(l_osd_map_cache_miss_low); + logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch); + } + } + + OSDMap *map = new OSDMap; + if (epoch > 0) { + dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl; + bufferlist bl; + if (!_get_map_bl(epoch, bl) || bl.length() == 0) { + derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl; + delete map; + return OSDMapRef(); + } + map->decode(bl); + } else { + dout(20) << "get_map " << epoch << " - return initial " << map << dendl; + } + return _add_map(map); +} + +// ops + + +void OSDService::reply_op_error(OpRequestRef op, int err) +{ + reply_op_error(op, err, eversion_t(), 0, {}); +} + +void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v, + version_t uv, + vector op_returns) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + int flags; + flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); + + MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, + !m->has_flag(CEPH_OSD_FLAG_RETURNVEC)); + reply->set_reply_versions(v, uv); + reply->set_op_returns(op_returns); + m->get_connection()->send_message(reply); +} + +void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) +{ + if (!cct->_conf->osd_debug_misdirected_ops) { + return; + } + + auto m = op->get_req(); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + + ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since); + + if (pg->is_ec_pg()) { + /** + * OSD recomputes op target based on current OSDMap. With an EC pg, we + * can get this result: + * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping + * [CRUSH_ITEM_NONE, 2, 3]/3 + * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping + * [3, 2, 3]/3 + * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary + * -- misdirected op + * 4) client resends and this time PG 3.9s0 having caught up to 513 gets + * it and fulfils it + * + * We can't compute the op target based on the sending map epoch due to + * splitting. The simplest thing is to detect such cases here and drop + * them without an error (the client will resend anyway). 
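+   * (Hence the code below recomputes the target from the client's own map
+   * epoch: if that map is no longer available, or the primary shard really
+   * did change since then, the op is dropped quietly.)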
+ */ + ceph_assert(m->get_map_epoch() <= superblock.newest_map); + OSDMapRef opmap = try_get_map(m->get_map_epoch()); + if (!opmap) { + dout(7) << __func__ << ": " << *pg << " no longer have map for " + << m->get_map_epoch() << ", dropping" << dendl; + return; + } + pg_t _pgid = m->get_raw_pg(); + spg_t pgid; + if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) + _pgid = opmap->raw_pg_to_pg(_pgid); + if (opmap->get_primary_shard(_pgid, &pgid) && + pgid.shard != pg->pg_id.shard) { + dout(7) << __func__ << ": " << *pg << " primary changed since " + << m->get_map_epoch() << ", dropping" << dendl; + return; + } + } + + dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl; + clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid() + << " pg " << m->get_raw_pg() + << " to osd." << whoami + << " not " << pg->get_acting() + << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch(); +} + +void OSDService::enqueue_back(OpSchedulerItem&& qi) +{ + osd->op_shardedwq.queue(std::move(qi)); +} + +void OSDService::enqueue_front(OpSchedulerItem&& qi) +{ + osd->op_shardedwq.queue_front(std::move(qi)); +} + +void OSDService::queue_recovery_context( + PG *pg, + GenContext *c) +{ + epoch_t e = get_osdmap_epoch(); + enqueue_back( + OpSchedulerItem( + unique_ptr( + new PGRecoveryContext(pg->get_pgid(), c, e)), + cct->_conf->osd_recovery_cost, + cct->_conf->osd_recovery_priority, + ceph_clock_now(), + 0, + e)); +} + +void OSDService::queue_for_snap_trim(PG *pg) +{ + dout(10) << "queueing " << *pg << " for snaptrim" << dendl; + enqueue_back( + OpSchedulerItem( + unique_ptr( + new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())), + cct->_conf->osd_snap_trim_cost, + cct->_conf->osd_snap_trim_priority, + ceph_clock_now(), + 0, + pg->get_osdmap_epoch())); +} + +template +void OSDService::queue_scrub_event_msg(PG* pg, + Scrub::scrub_prio_t with_priority, + unsigned int qu_priority, + Scrub::act_token_t act_token) +{ + const auto epoch = pg->get_osdmap_epoch(); + auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token); + dout(15) << "queue a scrub event (" << *msg << ") for " << *pg + << ". Epoch: " << epoch << " token: " << act_token << dendl; + + enqueue_back(OpSchedulerItem( + unique_ptr(msg), cct->_conf->osd_scrub_cost, + pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch)); +} + +template +void OSDService::queue_scrub_event_msg(PG* pg, + Scrub::scrub_prio_t with_priority) +{ + const auto epoch = pg->get_osdmap_epoch(); + auto msg = new MSG_TYPE(pg->get_pgid(), epoch); + dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". 
Epoch: " << epoch << dendl; + + enqueue_back(OpSchedulerItem( + unique_ptr(msg), cct->_conf->osd_scrub_cost, + pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch)); +} + +void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority) +{ + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority) +{ + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_for_rep_scrub(PG* pg, + Scrub::scrub_prio_t with_priority, + unsigned int qu_priority, + Scrub::act_token_t act_token) +{ + queue_scrub_event_msg(pg, with_priority, qu_priority, act_token); +} + +void OSDService::queue_for_rep_scrub_resched(PG* pg, + Scrub::scrub_prio_t with_priority, + unsigned int qu_priority, + Scrub::act_token_t act_token) +{ + // Resulting scrub event: 'SchedReplica' + queue_scrub_event_msg(pg, with_priority, qu_priority, + act_token); +} + +void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'RemotesReserved' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'ReservationFailure' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'InternalSchedScrub' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'ActivePushesUpd' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'SelectedChunkFree' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'ChunkIsBusy' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority) +{ + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'Unblocked' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'DigestUpdate' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'IntLocalMapDone' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'GotReplicas' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'MapsCompared' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'ReplicaPushesUpd' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_is_finished(PG *pg) +{ + // Resulting scrub event: 'ScrubFinished' + queue_scrub_event_msg(pg, Scrub::scrub_prio_t::high_priority); +} + +void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting 
scrub event: 'NextChunk' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e) +{ + dout(10) << __func__ << " on " << pgid << " e " << e << dendl; + enqueue_back( + OpSchedulerItem( + unique_ptr( + new PGDelete(pgid, e)), + cct->_conf->osd_pg_delete_cost, + cct->_conf->osd_pg_delete_priority, + ceph_clock_now(), + 0, + e)); +} + +bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num) +{ + return osd->try_finish_pg_delete(pg, old_pg_num); +} + +// --- + +void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << pg->pg_id << dendl; + ready_to_merge_source[pg->pg_id.pgid] = version; + assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0); + _send_ready_to_merge(); +} + +void OSDService::set_ready_to_merge_target(PG *pg, + eversion_t version, + epoch_t last_epoch_started, + epoch_t last_epoch_clean) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << pg->pg_id << dendl; + ready_to_merge_target.insert(make_pair(pg->pg_id.pgid, + make_tuple(version, + last_epoch_started, + last_epoch_clean))); + assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0); + _send_ready_to_merge(); +} + +void OSDService::set_not_ready_to_merge_source(pg_t source) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << source << dendl; + not_ready_to_merge_source.insert(source); + assert(ready_to_merge_source.count(source) == 0); + _send_ready_to_merge(); +} + +void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << target << " source " << source << dendl; + not_ready_to_merge_target[target] = source; + assert(ready_to_merge_target.count(target) == 0); + _send_ready_to_merge(); +} + +void OSDService::send_ready_to_merge() +{ + std::lock_guard l(merge_lock); + _send_ready_to_merge(); +} + +void OSDService::_send_ready_to_merge() +{ + dout(20) << __func__ + << " ready_to_merge_source " << ready_to_merge_source + << " not_ready_to_merge_source " << not_ready_to_merge_source + << " ready_to_merge_target " << ready_to_merge_target + << " not_ready_to_merge_target " << not_ready_to_merge_target + << " sent_ready_to_merge_source " << sent_ready_to_merge_source + << dendl; + for (auto src : not_ready_to_merge_source) { + if (sent_ready_to_merge_source.count(src) == 0) { + monc->send_mon_message(new MOSDPGReadyToMerge( + src, + {}, {}, 0, 0, + false, + osdmap->get_epoch())); + sent_ready_to_merge_source.insert(src); + } + } + for (auto p : not_ready_to_merge_target) { + if (sent_ready_to_merge_source.count(p.second) == 0) { + monc->send_mon_message(new MOSDPGReadyToMerge( + p.second, + {}, {}, 0, 0, + false, + osdmap->get_epoch())); + sent_ready_to_merge_source.insert(p.second); + } + } + for (auto src : ready_to_merge_source) { + if (not_ready_to_merge_source.count(src.first) || + not_ready_to_merge_target.count(src.first.get_parent())) { + continue; + } + auto p = ready_to_merge_target.find(src.first.get_parent()); + if (p != ready_to_merge_target.end() && + sent_ready_to_merge_source.count(src.first) == 0) { + monc->send_mon_message(new MOSDPGReadyToMerge( + src.first, // source pgid + src.second, // src version + std::get<0>(p->second), // target version + std::get<1>(p->second), // PG's last_epoch_started + std::get<2>(p->second), // PG's last_epoch_clean + true, + osdmap->get_epoch())); + sent_ready_to_merge_source.insert(src.first); + } + } +} + 
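+// The merge handshake above works as follows: source and target PGs call
+// set_ready_to_merge_{source,target}() (or the not_ready variants) as they
+// reach, or fail to reach, a mergeable state.  A not-ready report goes to
+// the monitor right away with ready=false, while a positive
+// MOSDPGReadyToMerge is only sent once a source and its parent target are
+// both ready and neither is flagged not-ready; sent_ready_to_merge_source
+// suppresses duplicate reports until it is cleared or pruned below.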
+void OSDService::clear_ready_to_merge(PG *pg) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << pg->pg_id << dendl; + ready_to_merge_source.erase(pg->pg_id.pgid); + ready_to_merge_target.erase(pg->pg_id.pgid); + not_ready_to_merge_source.erase(pg->pg_id.pgid); + not_ready_to_merge_target.erase(pg->pg_id.pgid); + sent_ready_to_merge_source.erase(pg->pg_id.pgid); +} + +void OSDService::clear_sent_ready_to_merge() +{ + std::lock_guard l(merge_lock); + sent_ready_to_merge_source.clear(); +} + +void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap) +{ + std::lock_guard l(merge_lock); + auto i = sent_ready_to_merge_source.begin(); + while (i != sent_ready_to_merge_source.end()) { + if (!osdmap->pg_exists(*i)) { + dout(10) << __func__ << " " << *i << dendl; + i = sent_ready_to_merge_source.erase(i); + } else { + ++i; + } + } +} + +// --- + +void OSDService::_queue_for_recovery( + std::pair p, + uint64_t reserved_pushes) +{ + ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock)); + enqueue_back( + OpSchedulerItem( + unique_ptr( + new PGRecovery( + p.second->get_pgid(), p.first, reserved_pushes)), + cct->_conf->osd_recovery_cost, + cct->_conf->osd_recovery_priority, + ceph_clock_now(), + 0, + p.first)); +} + +// ==================================================================== +// OSD + +#undef dout_prefix +#define dout_prefix *_dout + +// Commands shared between OSD's console and admin console: +namespace ceph::osd_cmds { + +int heap(CephContext& cct, + const cmdmap_t& cmdmap, + std::ostream& outos, + std::ostream& erros); + +} // namespace ceph::osd_cmds + +int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity) +{ + int ret; + + OSDSuperblock sb; + bufferlist sbbl; + ObjectStore::CollectionHandle ch; + + // if we are fed a uuid for this osd, use it. + store->set_fsid(cct->_conf->osd_uuid); + + ret = store->mkfs(); + if (ret) { + derr << "OSD::mkfs: ObjectStore::mkfs failed with error " + << cpp_strerror(ret) << dendl; + goto free_store; + } + + store->set_cache_shards(1); // doesn't matter for mkfs! 
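+  // From here mkfs mounts the freshly created store, then either validates
+  // an existing superblock (whoami and cluster fsid must match the values
+  // passed in) or writes a new one, and finally persists the metadata keys
+  // via write_meta(); any failure unwinds through the umount_store /
+  // free_store labels.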
+ + ret = store->mount(); + if (ret) { + derr << "OSD::mkfs: couldn't mount ObjectStore: error " + << cpp_strerror(ret) << dendl; + goto free_store; + } + + ch = store->open_collection(coll_t::meta()); + if (ch) { + ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl); + if (ret < 0) { + derr << "OSD::mkfs: have meta collection but no superblock" << dendl; + goto free_store; + } + /* if we already have superblock, check content of superblock */ + dout(0) << " have superblock" << dendl; + auto p = sbbl.cbegin(); + decode(sb, p); + if (whoami != sb.whoami) { + derr << "provided osd id " << whoami << " != superblock's " << sb.whoami + << dendl; + ret = -EINVAL; + goto umount_store; + } + if (fsid != sb.cluster_fsid) { + derr << "provided cluster fsid " << fsid + << " != superblock's " << sb.cluster_fsid << dendl; + ret = -EINVAL; + goto umount_store; + } + } else { + // create superblock + sb.cluster_fsid = fsid; + sb.osd_fsid = store->get_fsid(); + sb.whoami = whoami; + sb.compat_features = get_osd_initial_compat_set(); + + bufferlist bl; + encode(sb, bl); + + ObjectStore::CollectionHandle ch = store->create_new_collection( + coll_t::meta()); + ObjectStore::Transaction t; + t.create_collection(coll_t::meta(), 0); + t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); + ret = store->queue_transaction(ch, std::move(t)); + if (ret) { + derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: " + << "queue_transaction returned " << cpp_strerror(ret) << dendl; + goto umount_store; + } + ch->flush(); + } + + ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity); + if (ret) { + derr << "OSD::mkfs: failed to write fsid file: error " + << cpp_strerror(ret) << dendl; + goto umount_store; + } + +umount_store: + if (ch) { + ch.reset(); + } + store->umount(); +free_store: + delete store; + return ret; +} + +int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity) +{ + char val[80]; + int r; + + snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC); + r = store->write_meta("magic", val); + if (r < 0) + return r; + + snprintf(val, sizeof(val), "%d", whoami); + r = store->write_meta("whoami", val); + if (r < 0) + return r; + + cluster_fsid.print(val); + r = store->write_meta("ceph_fsid", val); + if (r < 0) + return r; + + string key = cct->_conf.get_val("key"); + if (key.size()) { + r = store->write_meta("osd_key", key); + if (r < 0) + return r; + } else { + string keyfile = cct->_conf.get_val("keyfile"); + if (!keyfile.empty()) { + bufferlist keybl; + string err; + r = keybl.read_file(keyfile.c_str(), &err); + if (r < 0) { + derr << __func__ << " failed to read keyfile " << keyfile << ": " + << err << ": " << cpp_strerror(r) << dendl; + return r; + } + r = store->write_meta("osd_key", keybl.to_str()); + if (r < 0) + return r; + } + } + if (!osdspec_affinity.empty()) { + r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str()); + if (r < 0) + return r; + } + + r = store->write_meta("ceph_version_when_created", pretty_version_to_str()); + if (r < 0) + return r; + + ostringstream created_at; + utime_t now = ceph_clock_now(); + now.gmtime(created_at); + r = store->write_meta("created_at", created_at.str()); + if (r < 0) + return r; + + r = store->write_meta("ready", "ready"); + if (r < 0) + return r; + + return 0; +} + +int OSD::peek_meta(ObjectStore *store, + std::string *magic, + uuid_d *cluster_fsid, + uuid_d *osd_fsid, + int *whoami, + ceph_release_t 
*require_osd_release) +{ + string val; + + int r = store->read_meta("magic", &val); + if (r < 0) + return r; + *magic = val; + + r = store->read_meta("whoami", &val); + if (r < 0) + return r; + *whoami = atoi(val.c_str()); + + r = store->read_meta("ceph_fsid", &val); + if (r < 0) + return r; + r = cluster_fsid->parse(val.c_str()); + if (!r) + return -EINVAL; + + r = store->read_meta("fsid", &val); + if (r < 0) { + *osd_fsid = uuid_d(); + } else { + r = osd_fsid->parse(val.c_str()); + if (!r) + return -EINVAL; + } + + r = store->read_meta("require_osd_release", &val); + if (r >= 0) { + *require_osd_release = ceph_release_from_name(val); + } + + return 0; +} + + +#undef dout_prefix +#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch()) + +// cons/des + +OSD::OSD(CephContext *cct_, ObjectStore *store_, + int id, + Messenger *internal_messenger, + Messenger *external_messenger, + Messenger *hb_client_front, + Messenger *hb_client_back, + Messenger *hb_front_serverm, + Messenger *hb_back_serverm, + Messenger *osdc_messenger, + MonClient *mc, + const std::string &dev, const std::string &jdev, + ceph::async::io_context_pool& poolctx) : + Dispatcher(cct_), + tick_timer(cct, osd_lock), + tick_timer_without_osd_lock(cct, tick_timer_lock), + gss_ktfile_client(cct->_conf.get_val("gss_ktab_client_file")), + cluster_messenger(internal_messenger), + client_messenger(external_messenger), + objecter_messenger(osdc_messenger), + monc(mc), + mgrc(cct_, client_messenger, &mc->monmap), + logger(create_logger()), + recoverystate_perf(create_recoverystate_perf()), + store(store_), + log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS), + clog(log_client.create_channel()), + whoami(id), + dev_path(dev), journal_path(jdev), + store_is_rotational(store->is_rotational()), + trace_endpoint("0.0.0.0", 0, "osd"), + asok_hook(NULL), + m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val( + "osd_pg_epoch_max_lag_factor")), + osd_compat(get_osd_compat_set()), + osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp", + get_num_op_threads()), + heartbeat_stop(false), + heartbeat_need_update(true), + hb_front_client_messenger(hb_client_front), + hb_back_client_messenger(hb_client_back), + hb_front_server_messenger(hb_front_serverm), + hb_back_server_messenger(hb_back_serverm), + daily_loadavg(0.0), + heartbeat_thread(this), + heartbeat_dispatcher(this), + op_tracker(cct, cct->_conf->osd_enable_op_tracker, + cct->_conf->osd_num_op_tracker_shard), + test_ops_hook(NULL), + op_shardedwq( + this, + ceph::make_timespan(cct->_conf->osd_op_thread_timeout), + ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout), + &osd_op_tp), + last_pg_create_epoch(0), + boot_finisher(cct), + up_thru_wanted(0), + requested_full_first(0), + requested_full_last(0), + service(this, poolctx) +{ + + if (!gss_ktfile_client.empty()) { + // Assert we can export environment variable + /* + The default client keytab is used, if it is present and readable, + to automatically obtain initial credentials for GSSAPI client + applications. The principal name of the first entry in the client + keytab is used by default when obtaining initial credentials. + 1. The KRB5_CLIENT_KTNAME environment variable. + 2. The default_client_keytab_name profile variable in [libdefaults]. + 3. The hardcoded default, DEFCKTNAME. 
+ */ + const int32_t set_result(setenv("KRB5_CLIENT_KTNAME", + gss_ktfile_client.c_str(), 1)); + ceph_assert(set_result == 0); + } + + monc->set_messenger(client_messenger); + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); + op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size, + cct->_conf->osd_op_history_slow_op_threshold); + ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals); +#ifdef WITH_BLKIN + std::stringstream ss; + ss << "osd." << whoami; + trace_endpoint.copy_name(ss.str()); +#endif + + // initialize shards + num_shards = get_num_op_shards(); + for (uint32_t i = 0; i < num_shards; i++) { + OSDShard *one_shard = new OSDShard( + i, + cct, + this); + shards.push_back(one_shard); + } +} + +OSD::~OSD() +{ + while (!shards.empty()) { + delete shards.back(); + shards.pop_back(); + } + cct->get_perfcounters_collection()->remove(recoverystate_perf); + cct->get_perfcounters_collection()->remove(logger); + delete recoverystate_perf; + delete logger; + delete store; +} + +double OSD::get_tick_interval() const +{ + // vary +/- 5% to avoid scrub scheduling livelocks + constexpr auto delta = 0.05; + return (OSD_TICK_INTERVAL * + ceph::util::generate_random_number(1.0 - delta, 1.0 + delta)); +} + +void OSD::handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; + shutdown(); +} + +int OSD::pre_init() +{ + std::lock_guard lock(osd_lock); + if (is_stopping()) + return 0; + + if (store->test_mount_in_use()) { + derr << "OSD::pre_init: object store '" << dev_path << "' is " + << "currently in use. 
(Is ceph-osd already running?)" << dendl; + return -EBUSY; + } + + cct->_conf.add_observer(this); + return 0; +} + +int OSD::set_numa_affinity() +{ + // storage numa node + int store_node = -1; + store->get_numa_node(&store_node, nullptr, nullptr); + if (store_node >= 0) { + dout(1) << __func__ << " storage numa node " << store_node << dendl; + } + + // check network numa node(s) + int front_node = -1, back_node = -1; + string front_iface = pick_iface( + cct, + client_messenger->get_myaddrs().front().get_sockaddr_storage()); + string back_iface = pick_iface( + cct, + cluster_messenger->get_myaddrs().front().get_sockaddr_storage()); + int r = get_iface_numa_node(front_iface, &front_node); + if (r >= 0 && front_node >= 0) { + dout(1) << __func__ << " public network " << front_iface << " numa node " + << front_node << dendl; + r = get_iface_numa_node(back_iface, &back_node); + if (r >= 0 && back_node >= 0) { + dout(1) << __func__ << " cluster network " << back_iface << " numa node " + << back_node << dendl; + if (front_node == back_node && + front_node == store_node) { + dout(1) << " objectstore and network numa nodes all match" << dendl; + if (g_conf().get_val("osd_numa_auto_affinity")) { + numa_node = front_node; + } + } else if (front_node != back_node) { + dout(1) << __func__ << " public and cluster network numa nodes do not match" + << dendl; + } else { + dout(1) << __func__ << " objectstore and network numa nodes do not match" + << dendl; + } + } else if (back_node == -2) { + dout(1) << __func__ << " cluster network " << back_iface + << " ports numa nodes do not match" << dendl; + } else { + derr << __func__ << " unable to identify cluster interface '" << back_iface + << "' numa node: " << cpp_strerror(r) << dendl; + } + } else if (front_node == -2) { + dout(1) << __func__ << " public network " << front_iface + << " ports numa nodes do not match" << dendl; + } else { + derr << __func__ << " unable to identify public interface '" << front_iface + << "' numa node: " << cpp_strerror(r) << dendl; + } + if (int node = g_conf().get_val("osd_numa_node"); node >= 0) { + // this takes precedence over the automagic logic above + numa_node = node; + } + if (numa_node >= 0) { + int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set); + if (r < 0) { + dout(1) << __func__ << " unable to determine numa node " << numa_node + << " CPUs" << dendl; + numa_node = -1; + } else { + dout(1) << __func__ << " setting numa affinity to node " << numa_node + << " cpus " + << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set) + << dendl; + r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r) + << dendl; + numa_node = -1; + } + } + } else { + dout(1) << __func__ << " not setting numa affinity" << dendl; + } + return 0; +} + +// asok + +class OSDSocketHook : public AdminSocketHook { + OSD *osd; +public: + explicit OSDSocketHook(OSD *o) : osd(o) {} + int call(std::string_view prefix, const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& ss, + bufferlist& out) override { + ceph_abort("should use async hook"); + } + void call_async( + std::string_view prefix, + const cmdmap_t& cmdmap, + Formatter *f, + const bufferlist& inbl, + std::function on_finish) override { + try { + osd->asok_command(prefix, cmdmap, f, inbl, on_finish); + } catch (const TOPNSPC::common::bad_cmd_get& e) { + bufferlist empty; + on_finish(-EINVAL, e.what(), empty); + } + } +}; + +std::set 
OSD::get_mapped_pools() +{ + std::set pools; + std::vector pgids; + _get_pgids(&pgids); + for (const auto &pgid : pgids) { + pools.insert(pgid.pool()); + } + return pools; +} + +void OSD::asok_command( + std::string_view prefix, const cmdmap_t& cmdmap, + Formatter *f, + const bufferlist& inbl, + std::function on_finish) +{ + int ret = 0; + stringstream ss; // stderr error message stream + bufferlist outbl; // if empty at end, we'll dump formatter as output + + // --- PG commands are routed here to PG::do_command --- + if (prefix == "pg" || + prefix == "query" || + prefix == "mark_unfound_lost" || + prefix == "list_unfound" || + prefix == "scrub" || + prefix == "deep_scrub" + ) { + string pgidstr; + pg_t pgid; + if (!cmd_getval(cmdmap, "pgid", pgidstr)) { + ss << "no pgid specified"; + ret = -EINVAL; + goto out; + } + if (!pgid.parse(pgidstr.c_str())) { + ss << "couldn't parse pgid '" << pgidstr << "'"; + ret = -EINVAL; + goto out; + } + spg_t pcand; + PGRef pg; + if (get_osdmap()->get_primary_shard(pgid, &pcand) && + (pg = _lookup_lock_pg(pcand))) { + if (pg->is_primary()) { + cmdmap_t new_cmdmap = cmdmap; + try { + pg->do_command(prefix, new_cmdmap, inbl, on_finish); + pg->unlock(); + return; // the pg handler calls on_finish directly + } catch (const TOPNSPC::common::bad_cmd_get& e) { + pg->unlock(); + ss << e.what(); + ret = -EINVAL; + goto out; + } + } else { + ss << "not primary for pgid " << pgid; + // do not reply; they will get newer maps and realize they + // need to resend. + pg->unlock(); + ret = -EAGAIN; + goto out; + } + } else { + ss << "i don't have pgid " << pgid; + ret = -ENOENT; + } + } + + // --- OSD commands follow --- + + else if (prefix == "status") { + lock_guard l(osd_lock); + f->open_object_section("status"); + f->dump_stream("cluster_fsid") << superblock.cluster_fsid; + f->dump_stream("osd_fsid") << superblock.osd_fsid; + f->dump_unsigned("whoami", superblock.whoami); + f->dump_string("state", get_state_name(get_state())); + f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_unsigned("newest_map", superblock.newest_map); + f->dump_unsigned("num_pgs", num_pgs); + f->close_section(); + } else if (prefix == "flush_journal") { + store->flush_journal(); + } else if (prefix == "dump_ops_in_flight" || + prefix == "ops" || + prefix == "dump_blocked_ops" || + prefix == "dump_historic_ops" || + prefix == "dump_historic_ops_by_duration" || + prefix == "dump_historic_slow_ops") { + + const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \ +even those get stuck. 
Please enable \"osd_enable_op_tracker\", and the tracker \ +will start to track new ops received afterwards."; + + set filters; + vector filter_str; + if (cmd_getval(cmdmap, "filterstr", filter_str)) { + copy(filter_str.begin(), filter_str.end(), + inserter(filters, filters.end())); + } + + if (prefix == "dump_ops_in_flight" || + prefix == "ops") { + if (!op_tracker.dump_ops_in_flight(f, false, filters)) { + ss << error_str; + ret = -EINVAL; + goto out; + } + } + if (prefix == "dump_blocked_ops") { + if (!op_tracker.dump_ops_in_flight(f, true, filters)) { + ss << error_str; + ret = -EINVAL; + goto out; + } + } + if (prefix == "dump_historic_ops") { + if (!op_tracker.dump_historic_ops(f, false, filters)) { + ss << error_str; + ret = -EINVAL; + goto out; + } + } + if (prefix == "dump_historic_ops_by_duration") { + if (!op_tracker.dump_historic_ops(f, true, filters)) { + ss << error_str; + ret = -EINVAL; + goto out; + } + } + if (prefix == "dump_historic_slow_ops") { + if (!op_tracker.dump_historic_slow_ops(f, filters)) { + ss << error_str; + ret = -EINVAL; + goto out; + } + } + } else if (prefix == "dump_op_pq_state") { + f->open_object_section("pq"); + op_shardedwq.dump(f); + f->close_section(); + } else if (prefix == "dump_blocklist") { + list > bl; + list > rbl; + OSDMapRef curmap = service.get_osdmap(); + curmap->get_blocklist(&bl, &rbl); + + f->open_array_section("blocklist"); + for (list >::iterator it = bl.begin(); + it != bl.end(); ++it) { + f->open_object_section("entry"); + f->open_object_section("entity_addr_t"); + it->first.dump(f); + f->close_section(); //entity_addr_t + it->second.localtime(f->dump_stream("expire_time")); + f->close_section(); //entry + } + f->close_section(); //blocklist + f->open_array_section("range_blocklist"); + for (list >::iterator it = rbl.begin(); + it != rbl.end(); ++it) { + f->open_object_section("entry"); + f->open_object_section("entity_addr_t"); + it->first.dump(f); + f->close_section(); //entity_addr_t + it->second.localtime(f->dump_stream("expire_time")); + f->close_section(); //entry + } + f->close_section(); //blocklist + } else if (prefix == "dump_watchers") { + list watchers; + // scan pg's + vector pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + list pg_watchers; + pg->get_watchers(&pg_watchers); + watchers.splice(watchers.end(), pg_watchers); + } + + f->open_array_section("watchers"); + for (list::iterator it = watchers.begin(); + it != watchers.end(); ++it) { + + f->open_object_section("watch"); + + f->dump_string("namespace", it->obj.nspace); + f->dump_string("object", it->obj.oid.name); + + f->open_object_section("entity_name"); + it->wi.name.dump(f); + f->close_section(); //entity_name_t + + f->dump_unsigned("cookie", it->wi.cookie); + f->dump_unsigned("timeout", it->wi.timeout_seconds); + + f->open_object_section("entity_addr_t"); + it->wi.addr.dump(f); + f->close_section(); //entity_addr_t + + f->close_section(); //watch + } + + f->close_section(); //watchers + } else if (prefix == "dump_recovery_reservations") { + f->open_object_section("reservations"); + f->open_object_section("local_reservations"); + service.local_reserver.dump(f); + f->close_section(); + f->open_object_section("remote_reservations"); + service.remote_reserver.dump(f); + f->close_section(); + f->close_section(); + } else if (prefix == "dump_scrub_reservations") { + f->open_object_section("scrub_reservations"); + service.dump_scrub_reservations(f); + f->close_section(); + } else if (prefix == "get_latest_osdmap") { + get_latest_osdmap(); + } else if (prefix == 
"set_heap_property") { + string property; + int64_t value = 0; + string error; + bool success = false; + if (!cmd_getval(cmdmap, "property", property)) { + error = "unable to get property"; + success = false; + } else if (!cmd_getval(cmdmap, "value", value)) { + error = "unable to get value"; + success = false; + } else if (value < 0) { + error = "negative value not allowed"; + success = false; + } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) { + error = "invalid property"; + success = false; + } else { + success = true; + } + f->open_object_section("result"); + f->dump_string("error", error); + f->dump_bool("success", success); + f->close_section(); + } else if (prefix == "get_heap_property") { + string property; + size_t value = 0; + string error; + bool success = false; + if (!cmd_getval(cmdmap, "property", property)) { + error = "unable to get property"; + success = false; + } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) { + error = "invalid property"; + success = false; + } else { + success = true; + } + f->open_object_section("result"); + f->dump_string("error", error); + f->dump_bool("success", success); + f->dump_int("value", value); + f->close_section(); + } else if (prefix == "dump_objectstore_kv_stats") { + store->get_db_statistics(f); + } else if (prefix == "dump_scrubs") { + service.dumps_scrub(f); + } else if (prefix == "calc_objectstore_db_histogram") { + store->generate_db_histogram(f); + } else if (prefix == "flush_store_cache") { + store->flush_cache(&ss); + } else if (prefix == "dump_pgstate_history") { + f->open_object_section("pgstate_history"); + f->open_array_section("pgs"); + vector pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + f->open_object_section("pg"); + f->dump_stream("pg") << pg->pg_id; + f->dump_string("currently", pg->get_current_state()); + pg->dump_pgstate_history(f); + f->close_section(); + } + f->close_section(); + f->close_section(); + } else if (prefix == "compact") { + dout(1) << "triggering manual compaction" << dendl; + auto start = ceph::coarse_mono_clock::now(); + store->compact(); + auto end = ceph::coarse_mono_clock::now(); + double duration = std::chrono::duration(end-start).count(); + dout(1) << "finished manual compaction in " + << duration + << " seconds" << dendl; + f->open_object_section("compact_result"); + f->dump_float("elapsed_time", duration); + f->close_section(); + } else if (prefix == "get_mapped_pools") { + f->open_array_section("mapped_pools"); + set poollist = get_mapped_pools(); + for (auto pool : poollist) { + f->dump_int("pool_id", pool); + } + f->close_section(); + } else if (prefix == "smart") { + string devid; + cmd_getval(cmdmap, "devid", devid); + ostringstream out; + probe_smart(devid, out); + outbl.append(out.str()); + } else if (prefix == "list_devices") { + set devnames; + store->get_devices(&devnames); + f->open_array_section("list_devices"); + for (auto dev : devnames) { + if (dev.find("dm-") == 0) { + continue; + } + string err; + f->open_object_section("device"); + f->dump_string("device", "/dev/" + dev); + f->dump_string("device_id", get_device_id(dev, &err)); + f->close_section(); + } + f->close_section(); + } else if (prefix == "send_beacon") { + lock_guard l(osd_lock); + if (is_active()) { + send_beacon(ceph::coarse_mono_clock::now()); + } + } + + else if (prefix == "cluster_log") { + vector msg; + cmd_getval(cmdmap, "message", msg); + if (msg.empty()) { + ret = -EINVAL; + ss << "ignoring empty log message"; + goto out; + } + string message = 
msg.front(); + for (vector::iterator a = ++msg.begin(); a != msg.end(); ++a) + message += " " + *a; + string lvl; + cmd_getval(cmdmap, "level", lvl); + clog_type level = string_to_clog_type(lvl); + if (level < 0) { + ret = -EINVAL; + ss << "unknown level '" << lvl << "'"; + goto out; + } + clog->do_log(level, message); + } + + else if (prefix == "bench") { + int64_t count; + int64_t bsize; + int64_t osize, onum; + // default count 1G, size 4MB + cmd_getval(cmdmap, "count", count, (int64_t)1 << 30); + cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20); + cmd_getval(cmdmap, "object_size", osize, (int64_t)0); + cmd_getval(cmdmap, "object_num", onum, (int64_t)0); + double elapsed = 0.0; + + ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss); + if (ret != 0) { + goto out; + } + + double rate = count / elapsed; + double iops = rate / bsize; + f->open_object_section("osd_bench_results"); + f->dump_int("bytes_written", count); + f->dump_int("blocksize", bsize); + f->dump_float("elapsed_sec", elapsed); + f->dump_float("bytes_per_sec", rate); + f->dump_float("iops", iops); + f->close_section(); + } + + else if (prefix == "flush_pg_stats") { + mgrc.send_pgstats(); + f->dump_unsigned("stat_seq", service.get_osd_stat_seq()); + } + + else if (prefix == "heap") { + std::stringstream outss; + ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss); + outbl.append(outss); + } + + else if (prefix == "debug dump_missing") { + f->open_array_section("pgs"); + vector pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + string s = stringify(pg->pg_id); + f->open_array_section(s.c_str()); + pg->lock(); + pg->dump_missing(f); + pg->unlock(); + f->close_section(); + } + f->close_section(); + } + + else if (prefix == "debug kick_recovery_wq") { + int64_t delay; + cmd_getval(cmdmap, "delay", delay); + ostringstream oss; + oss << delay; + ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str()); + if (ret != 0) { + ss << "kick_recovery_wq: error setting " + << "osd_recovery_delay_start to '" << delay << "': error " + << ret; + goto out; + } + cct->_conf.apply_changes(nullptr); + ss << "kicking recovery queue. 
set osd_recovery_delay_start " + << "to " << cct->_conf->osd_recovery_delay_start; + } + + else if (prefix == "cpu_profiler") { + ostringstream ds; + string arg; + cmd_getval(cmdmap, "arg", arg); + vector argvec; + get_str_vec(arg, argvec); + cpu_profiler_handle_command(argvec, ds); + outbl.append(ds.str()); + } + + else if (prefix == "dump_pg_recovery_stats") { + lock_guard l(osd_lock); + pg_recovery_stats.dump_formatted(f); + } + + else if (prefix == "reset_pg_recovery_stats") { + lock_guard l(osd_lock); + pg_recovery_stats.reset(); + } + + else if (prefix == "perf histogram dump") { + std::string logger; + std::string counter; + cmd_getval(cmdmap, "logger", logger); + cmd_getval(cmdmap, "counter", counter); + cct->get_perfcounters_collection()->dump_formatted_histograms( + f, false, logger, counter); + } + + else if (prefix == "cache drop") { + lock_guard l(osd_lock); + dout(20) << "clearing all caches" << dendl; + // Clear the objectstore's cache - onode and buffer for Bluestore, + // system's pagecache for Filestore + ret = store->flush_cache(&ss); + if (ret < 0) { + ss << "Error flushing objectstore cache: " << cpp_strerror(ret); + goto out; + } + // Clear the objectcontext cache (per PG) + vector pgs; + _get_pgs(&pgs); + for (auto& pg: pgs) { + pg->clear_cache(); + } + } + + else if (prefix == "cache status") { + lock_guard l(osd_lock); + int obj_ctx_count = 0; + vector pgs; + _get_pgs(&pgs); + for (auto& pg: pgs) { + obj_ctx_count += pg->get_cache_obj_count(); + } + f->open_object_section("cache_status"); + f->dump_int("object_ctx", obj_ctx_count); + store->dump_cache_stats(f); + f->close_section(); + } + + else if (prefix == "scrub_purged_snaps") { + lock_guard l(osd_lock); + scrub_purged_snaps(); + } + + else if (prefix == "dump_osd_network") { + lock_guard l(osd_lock); + int64_t value = 0; + if (!(cmd_getval(cmdmap, "value", value))) { + // Convert milliseconds to microseconds + value = static_cast(g_conf().get_val( + "mon_warn_on_slow_ping_time")) * 1000; + if (value == 0) { + double ratio = g_conf().get_val("mon_warn_on_slow_ping_ratio"); + value = g_conf().get_val("osd_heartbeat_grace"); + value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio + } + } else { + // Convert user input to microseconds + value *= 1000; + } + if (value < 0) value = 0; + + struct osd_ping_time_t { + uint32_t pingtime; + int to; + bool back; + std::array times; + std::array min; + std::array max; + uint32_t last; + uint32_t last_update; + + bool operator<(const osd_ping_time_t& rhs) const { + if (pingtime < rhs.pingtime) + return true; + if (pingtime > rhs.pingtime) + return false; + if (to < rhs.to) + return true; + if (to > rhs.to) + return false; + return back; + } + }; + + set sorted; + // Get pingtimes under lock and not on the stack + map *pingtimes = new map; + service.get_hb_pingtime(pingtimes); + for (auto j : *pingtimes) { + if (j.second.last_update == 0) + continue; + osd_ping_time_t item; + item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]); + if (item.pingtime >= value) { + item.to = j.first; + item.times[0] = j.second.back_pingtime[0]; + item.times[1] = j.second.back_pingtime[1]; + item.times[2] = j.second.back_pingtime[2]; + item.min[0] = j.second.back_min[0]; + item.min[1] = j.second.back_min[1]; + item.min[2] = j.second.back_min[2]; + item.max[0] = j.second.back_max[0]; + item.max[1] = j.second.back_max[1]; + item.max[2] = j.second.back_max[2]; + item.last = 
j.second.back_last; + item.back = true; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + if (j.second.front_last == 0) + continue; + item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]); + if (item.pingtime >= value) { + item.to = j.first; + item.times[0] = j.second.front_pingtime[0]; + item.times[1] = j.second.front_pingtime[1]; + item.times[2] = j.second.front_pingtime[2]; + item.min[0] = j.second.front_min[0]; + item.min[1] = j.second.front_min[1]; + item.min[2] = j.second.front_min[2]; + item.max[0] = j.second.front_max[0]; + item.max[1] = j.second.front_max[1]; + item.max[2] = j.second.front_max[2]; + item.last = j.second.front_last; + item.last_update = j.second.last_update; + item.back = false; + sorted.emplace(item); + } + } + delete pingtimes; + // + // Network ping times (1min 5min 15min) + f->open_object_section("network_ping_times"); + f->dump_int("threshold", value / 1000); + f->open_array_section("entries"); + for (auto &sitem : boost::adaptors::reverse(sorted)) { + ceph_assert(sitem.pingtime >= value); + f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = cct->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); + f->dump_int("from osd", whoami); + f->dump_int("to osd", sitem.to); + f->dump_string("interface", (sitem.back ? "back" : "front")); + f->open_object_section("average"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str()); + f->close_section(); // average + f->open_object_section("min"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); + f->close_section(); // min + f->open_object_section("max"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); + f->close_section(); // max + f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str()); + f->close_section(); // entry + } + f->close_section(); // entries + f->close_section(); // network_ping_times + } else { + ceph_abort_msg("broken asok registration"); + } + + out: + on_finish(ret, ss.str(), outbl); +} + +int OSD::run_osd_bench_test( + int64_t count, + int64_t bsize, + int64_t osize, + int64_t onum, + double *elapsed, + ostream &ss) +{ + int ret = 0; + uint32_t duration = cct->_conf->osd_bench_duration; + + if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) { + // let us limit the block size because the next checks rely on it + // having a sane value. If we allow any block size to be set things + // can still go sideways. + ss << "block 'size' values are capped at " + << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". 
If you wish to use" + << " a higher value, please adjust 'osd_bench_max_block_size'"; + ret = -EINVAL; + return ret; + } else if (bsize < (int64_t) (1 << 20)) { + // entering the realm of small block sizes. + // limit the count to a sane value, assuming a configurable amount of + // IOPS and duration, so that the OSD doesn't get hung up on this, + // preventing timeouts from going off + int64_t max_count = + bsize * duration * cct->_conf->osd_bench_small_size_max_iops; + if (count > max_count) { + ss << "'count' values greater than " << max_count + << " for a block size of " << byte_u_t(bsize) << ", assuming " + << cct->_conf->osd_bench_small_size_max_iops << " IOPS," + << " for " << duration << " seconds," + << " can cause ill effects on osd. " + << " Please adjust 'osd_bench_small_size_max_iops' with a higher" + << " value if you wish to use a higher 'count'."; + ret = -EINVAL; + return ret; + } + } else { + // 1MB block sizes are big enough so that we get more stuff done. + // However, to avoid the osd from getting hung on this and having + // timers being triggered, we are going to limit the count assuming + // a configurable throughput and duration. + // NOTE: max_count is the total amount of bytes that we believe we + // will be able to write during 'duration' for the given + // throughput. The block size hardly impacts this unless it's + // way too big. Given we already check how big the block size + // is, it's safe to assume everything will check out. + int64_t max_count = + cct->_conf->osd_bench_large_size_max_throughput * duration; + if (count > max_count) { + ss << "'count' values greater than " << max_count + << " for a block size of " << byte_u_t(bsize) << ", assuming " + << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s," + << " for " << duration << " seconds," + << " can cause ill effects on osd. 
" + << " Please adjust 'osd_bench_large_size_max_throughput'" + << " with a higher value if you wish to use a higher 'count'."; + ret = -EINVAL; + return ret; + } + } + + if (osize && bsize > osize) { + bsize = osize; + } + + dout(1) << " bench count " << count + << " bsize " << byte_u_t(bsize) << dendl; + + ObjectStore::Transaction cleanupt; + + if (osize && onum) { + bufferlist bl; + bufferptr bp(osize); + bp.zero(); + bl.push_back(std::move(bp)); + bl.rebuild_page_aligned(); + for (int i=0; iqueue_transaction(service.meta_ch, std::move(t), nullptr); + cleanupt.remove(coll_t(), ghobject_t(soid)); + } + } + + bufferlist bl; + bufferptr bp(bsize); + bp.zero(); + bl.push_back(std::move(bp)); + bl.rebuild_page_aligned(); + + { + C_SaferCond waiter; + if (!service.meta_ch->flush_commit(&waiter)) { + waiter.wait(); + } + } + + utime_t start = ceph_clock_now(); + for (int64_t pos = 0; pos < count; pos += bsize) { + char nm[30]; + unsigned offset = 0; + if (onum && osize) { + snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum)); + offset = rand() % (osize / bsize) * bsize; + } else { + snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos); + } + object_t oid(nm); + hobject_t soid(sobject_t(oid, 0)); + ObjectStore::Transaction t; + t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl); + store->queue_transaction(service.meta_ch, std::move(t), nullptr); + if (!onum || !osize) { + cleanupt.remove(coll_t::meta(), ghobject_t(soid)); + } + } + + { + C_SaferCond waiter; + if (!service.meta_ch->flush_commit(&waiter)) { + waiter.wait(); + } + } + utime_t end = ceph_clock_now(); + *elapsed = end - start; + + // clean up + store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr); + { + C_SaferCond waiter; + if (!service.meta_ch->flush_commit(&waiter)) { + waiter.wait(); + } + } + + return ret; +} + +class TestOpsSocketHook : public AdminSocketHook { + OSDService *service; + ObjectStore *store; +public: + TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {} + int call(std::string_view command, const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& errss, + bufferlist& out) override { + int r = 0; + stringstream outss; + try { + test_ops(service, store, command, cmdmap, outss); + out.append(outss); + } catch (const TOPNSPC::common::bad_cmd_get& e) { + errss << e.what(); + r = -EINVAL; + } + return r; + } + void test_ops(OSDService *service, ObjectStore *store, + std::string_view command, const cmdmap_t& cmdmap, ostream &ss); + +}; + +class OSD::C_Tick : public Context { + OSD *osd; + public: + explicit C_Tick(OSD *o) : osd(o) {} + void finish(int r) override { + osd->tick(); + } +}; + +class OSD::C_Tick_WithoutOSDLock : public Context { + OSD *osd; + public: + explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {} + void finish(int r) override { + osd->tick_without_osd_lock(); + } +}; + +int OSD::enable_disable_fuse(bool stop) +{ +#ifdef HAVE_LIBFUSE + int r; + string mntpath = cct->_conf->osd_data + "/fuse"; + if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) { + dout(1) << __func__ << " disabling" << dendl; + fuse_store->stop(); + delete fuse_store; + fuse_store = NULL; + r = ::rmdir(mntpath.c_str()); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to rmdir " << mntpath << ": " + << cpp_strerror(r) << dendl; + return r; + } + return 0; + } + if (!fuse_store && cct->_conf->osd_objectstore_fuse) { + dout(1) << __func__ << " enabling" << dendl; + r = ::mkdir(mntpath.c_str(), 0700); + if (r < 0) + r = -errno; + if (r < 0 
&& r != -EEXIST) { + derr << __func__ << " unable to create " << mntpath << ": " + << cpp_strerror(r) << dendl; + return r; + } + fuse_store = new FuseStore(store, mntpath); + r = fuse_store->start(); + if (r < 0) { + derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl; + delete fuse_store; + fuse_store = NULL; + return r; + } + } +#endif // HAVE_LIBFUSE + return 0; +} + +size_t OSD::get_num_cache_shards() +{ + return cct->_conf.get_val("osd_num_cache_shards"); +} + +int OSD::get_num_op_shards() +{ + if (cct->_conf->osd_op_num_shards) + return cct->_conf->osd_op_num_shards; + if (store_is_rotational) + return cct->_conf->osd_op_num_shards_hdd; + else + return cct->_conf->osd_op_num_shards_ssd; +} + +int OSD::get_num_op_threads() +{ + if (cct->_conf->osd_op_num_threads_per_shard) + return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard; + if (store_is_rotational) + return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd; + else + return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd; +} + +float OSD::get_osd_recovery_sleep() +{ + if (cct->_conf->osd_recovery_sleep) + return cct->_conf->osd_recovery_sleep; + if (!store_is_rotational && !journal_is_rotational) + return cct->_conf->osd_recovery_sleep_ssd; + else if (store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val("osd_recovery_sleep_hybrid"); + else + return cct->_conf->osd_recovery_sleep_hdd; +} + +float OSD::get_osd_delete_sleep() +{ + float osd_delete_sleep = cct->_conf.get_val("osd_delete_sleep"); + if (osd_delete_sleep > 0) + return osd_delete_sleep; + if (!store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val("osd_delete_sleep_ssd"); + if (store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val("osd_delete_sleep_hybrid"); + return cct->_conf.get_val("osd_delete_sleep_hdd"); +} + +int OSD::get_recovery_max_active() +{ + if (cct->_conf->osd_recovery_max_active) + return cct->_conf->osd_recovery_max_active; + if (store_is_rotational) + return cct->_conf->osd_recovery_max_active_hdd; + else + return cct->_conf->osd_recovery_max_active_ssd; +} + +float OSD::get_osd_snap_trim_sleep() +{ + float osd_snap_trim_sleep = cct->_conf.get_val("osd_snap_trim_sleep"); + if (osd_snap_trim_sleep > 0) + return osd_snap_trim_sleep; + if (!store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val("osd_snap_trim_sleep_ssd"); + if (store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val("osd_snap_trim_sleep_hybrid"); + return cct->_conf.get_val("osd_snap_trim_sleep_hdd"); +} + +int OSD::init() +{ + OSDMapRef osdmap; + CompatSet initial, diff; + std::lock_guard lock(osd_lock); + if (is_stopping()) + return 0; + + tick_timer.init(); + tick_timer_without_osd_lock.init(); + service.recovery_request_timer.init(); + service.sleep_timer.init(); + + boot_finisher.start(); + + { + string val; + store->read_meta("require_osd_release", &val); + last_require_osd_release = ceph_release_from_name(val); + } + + // mount. + dout(2) << "init " << dev_path + << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")" + << dendl; + dout(2) << "journal " << journal_path << dendl; + ceph_assert(store); // call pre_init() first! 
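// [editor's note -- annotation added during review, not part of the upstream patch]
// From this point OSD::init() performs a fixed startup sequence: size and
// mount the ObjectStore, read and sanity-check the superblock, load the PGs
// that already exist on disk, open the monitor/mgr sessions, and finally kick
// off start_boot().  A minimal sketch of that order, using only calls that
// appear in the function body below:
//
//   store->set_cache_shards(get_num_cache_shards());  // size caches before mounting
//   int r = store->mount();                           // init() fails if the mount fails
//   r = read_superblock();                            // verify whoami/fsid written at mkfs time
//   load_pgs();                                       // re-instantiate PGs found on disk
//   r = monc->init();                                 // open the authenticated monitor session
//   start_boot();                                     // request maps and mark the OSD up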
+ + store->set_cache_shards(get_num_cache_shards()); + + int r = store->mount(); + if (r < 0) { + derr << "OSD:init: unable to mount object store" << dendl; + return r; + } + journal_is_rotational = store->is_journal_rotational(); + dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd") + << dendl; + + enable_disable_fuse(false); + + dout(2) << "boot" << dendl; + + service.meta_ch = store->open_collection(coll_t::meta()); + + // initialize the daily loadavg with current 15min loadavg + double loadavgs[3]; + if (getloadavg(loadavgs, 3) == 3) { + daily_loadavg = loadavgs[2]; + } else { + derr << "OSD::init() : couldn't read loadavgs\n" << dendl; + daily_loadavg = 1.0; + } + + int rotating_auth_attempts = 0; + auto rotating_auth_timeout = + g_conf().get_val("rotating_keys_bootstrap_timeout"); + + // sanity check long object name handling + { + hobject_t l; + l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n'); + l.set_key(string(cct->_conf->osd_max_object_name_len, 'k')); + l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's'); + r = store->validate_hobject_key(l); + if (r < 0) { + derr << "backend (" << store->get_type() << ") is unable to support max " + << "object name[space] len" << dendl; + derr << " osd max object name len = " + << cct->_conf->osd_max_object_name_len << dendl; + derr << " osd max object namespace len = " + << cct->_conf->osd_max_object_namespace_len << dendl; + derr << cpp_strerror(r) << dendl; + if (cct->_conf->osd_check_max_object_name_len_on_startup) { + goto out; + } + derr << "osd_check_max_object_name_len_on_startup = false, starting anyway" + << dendl; + } else { + dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl; + } + } + + // read superblock + r = read_superblock(); + if (r < 0) { + derr << "OSD::init() : unable to read osd superblock" << dendl; + r = -EINVAL; + goto out; + } + + if (osd_compat.compare(superblock.compat_features) < 0) { + derr << "The disk uses features unsupported by the executable." << dendl; + derr << " ondisk features " << superblock.compat_features << dendl; + derr << " daemon features " << osd_compat << dendl; + + if (osd_compat.writeable(superblock.compat_features)) { + CompatSet diff = osd_compat.unsupported(superblock.compat_features); + derr << "it is still writeable, though. Missing features: " << diff << dendl; + r = -EOPNOTSUPP; + goto out; + } + else { + CompatSet diff = osd_compat.unsupported(superblock.compat_features); + derr << "Cannot write to disk! Missing features: " << diff << dendl; + r = -EOPNOTSUPP; + goto out; + } + } + + assert_warn(whoami == superblock.whoami); + if (whoami != superblock.whoami) { + derr << "OSD::init: superblock says osd" + << superblock.whoami << " but I am osd." 
<< whoami << dendl; + r = -EINVAL; + goto out; + } + + startup_time = ceph::mono_clock::now(); + + // load up "current" osdmap + assert_warn(!get_osdmap()); + if (get_osdmap()) { + derr << "OSD::init: unable to read current osdmap" << dendl; + r = -EINVAL; + goto out; + } + osdmap = get_map(superblock.current_epoch); + set_osdmap(osdmap); + + // make sure we don't have legacy pgs deleting + { + vector ls; + int r = store->list_collections(ls); + ceph_assert(r >= 0); + for (auto c : ls) { + spg_t pgid; + if (c.is_pg(&pgid) && + !osdmap->have_pg_pool(pgid.pool())) { + ghobject_t oid = make_final_pool_info_oid(pgid.pool()); + if (!store->exists(service.meta_ch, oid)) { + derr << __func__ << " missing pg_pool_t for deleted pool " + << pgid.pool() << " for pg " << pgid + << "; please downgrade to luminous and allow " + << "pg deletion to complete before upgrading" << dendl; + ceph_abort(); + } + } + } + } + + initial = get_osd_initial_compat_set(); + diff = superblock.compat_features.unsupported(initial); + if (superblock.compat_features.merge(initial)) { + // Are we adding SNAPMAPPER2? + if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) { + dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)" + << dendl; + auto ch = service.meta_ch; + auto hoid = make_snapmapper_oid(); + unsigned max = cct->_conf->osd_target_transaction_size; + r = SnapMapper::convert_legacy(cct, store, ch, hoid, max); + if (r < 0) + goto out; + } + // We need to persist the new compat_set before we + // do anything else + dout(5) << "Upgrading superblock adding: " << diff << dendl; + ObjectStore::Transaction t; + write_superblock(t); + r = store->queue_transaction(service.meta_ch, std::move(t)); + if (r < 0) + goto out; + } + + // make sure snap mapper object exists + if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) { + dout(10) << "init creating/touching snapmapper object" << dendl; + ObjectStore::Transaction t; + t.touch(coll_t::meta(), OSD::make_snapmapper_oid()); + r = store->queue_transaction(service.meta_ch, std::move(t)); + if (r < 0) + goto out; + } + if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) { + dout(10) << "init creating/touching purged_snaps object" << dendl; + ObjectStore::Transaction t; + t.touch(coll_t::meta(), OSD::make_purged_snaps_oid()); + r = store->queue_transaction(service.meta_ch, std::move(t)); + if (r < 0) + goto out; + } + + if (cct->_conf->osd_open_classes_on_start) { + int r = ClassHandler::get_instance().open_all_classes(); + if (r) + dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl; + } + + check_osdmap_features(); + + { + epoch_t bind_epoch = osdmap->get_epoch(); + service.set_epochs(NULL, NULL, &bind_epoch); + } + + clear_temp_objects(); + + // initialize osdmap references in sharded wq + for (auto& shard : shards) { + std::lock_guard l(shard->osdmap_lock); + shard->shard_osdmap = osdmap; + } + + // load up pgs (as they previously existed) + load_pgs(); + + dout(2) << "superblock: I am osd." << superblock.whoami << dendl; + + if (cct->_conf.get_val("osd_compact_on_start")) { + dout(2) << "compacting object store's omap" << dendl; + store->compact(); + } + + // prime osd stats + { + struct store_statfs_t stbuf; + osd_alert_list_t alerts; + int r = store->statfs(&stbuf, &alerts); + ceph_assert(r == 0); + service.set_statfs(stbuf, alerts); + } + + // client_messenger's auth_client will be set up by monc->init() later. 
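// [editor's note -- annotation added during review, not part of the upstream patch]
// The loops below wire authentication into the messengers before any session
// is opened: each messenger that dials out (cluster, objecter, heartbeat
// clients) gets the MonClient as its auth_client, and each messenger that
// accepts connections (client, cluster, heartbeat servers) gets it as its
// auth_server; client_messenger's auth_client is the exception noted above
// and is set up later inside monc->init().  The pattern, as used below:
//
//   for (auto m : { cluster_messenger, objecter_messenger, /* hb messengers */ })
//     m->set_auth_client(monc);        // outgoing connections authenticate via monc
//   for (auto m : { client_messenger, cluster_messenger, /* hb servers */ })
//     m->set_auth_server(monc);        // incoming connections are verified via monc
//   monc->set_handle_authentication_dispatcher(this);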
+ for (auto m : { cluster_messenger, + objecter_messenger, + hb_front_client_messenger, + hb_back_client_messenger, + hb_front_server_messenger, + hb_back_server_messenger } ) { + m->set_auth_client(monc); + } + for (auto m : { client_messenger, + cluster_messenger, + hb_front_server_messenger, + hb_back_server_messenger }) { + m->set_auth_server(monc); + } + monc->set_handle_authentication_dispatcher(this); + + monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD + | CEPH_ENTITY_TYPE_MGR); + r = monc->init(); + if (r < 0) + goto out; + + mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); }); + mgrc.set_perf_metric_query_cb( + [this](const ConfigPayload &config_payload) { + set_perf_queries(config_payload); + }, + [this] { + return get_perf_reports(); + }); + mgrc.init(); + + // tell monc about log_client so it will know about mon session resets + monc->set_log_client(&log_client); + update_log_config(); + + // i'm ready! + client_messenger->add_dispatcher_tail(&mgrc); + client_messenger->add_dispatcher_tail(this); + cluster_messenger->add_dispatcher_head(this); + + hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); + + objecter_messenger->add_dispatcher_head(service.objecter.get()); + + service.init(); + service.publish_map(osdmap); + service.publish_superblock(superblock); + service.max_oldest_map = superblock.oldest_map; + + for (auto& shard : shards) { + // put PGs in a temporary set because we may modify pg_slots + // unordered_map below. + set pgs; + for (auto& i : shard->pg_slots) { + PGRef pg = i.second->pg; + if (!pg) { + continue; + } + pgs.insert(pg); + } + for (auto pg : pgs) { + std::scoped_lock l{*pg}; + set> new_children; + set> merge_pgs; + service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id, + &new_children, &merge_pgs); + if (!new_children.empty()) { + for (auto shard : shards) { + shard->prime_splits(osdmap, &new_children); + } + assert(new_children.empty()); + } + if (!merge_pgs.empty()) { + for (auto shard : shards) { + shard->prime_merges(osdmap, &merge_pgs); + } + assert(merge_pgs.empty()); + } + } + } + + osd_op_tp.start(); + + // start the heartbeat + heartbeat_thread.create("osd_srv_heartbt"); + + // tick + tick_timer.add_event_after(get_tick_interval(), + new C_Tick(this)); + { + std::lock_guard l(tick_timer_lock); + tick_timer_without_osd_lock.add_event_after(get_tick_interval(), + new C_Tick_WithoutOSDLock(this)); + } + + osd_lock.unlock(); + + r = monc->authenticate(); + if (r < 0) { + derr << __func__ << " authentication failed: " << cpp_strerror(r) + << dendl; + exit(1); + } + + while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) { + derr << "unable to obtain rotating service keys; retrying" << dendl; + ++rotating_auth_attempts; + if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) { + derr << __func__ << " wait_auth_rotating timed out" << dendl; + exit(1); + } + } + + r = update_crush_device_class(); + if (r < 0) { + derr << __func__ << " unable to update_crush_device_class: " + << cpp_strerror(r) << dendl; + exit(1); + } + + r = update_crush_location(); + if (r < 0) { + derr << __func__ << " unable to update_crush_location: " + << cpp_strerror(r) << dendl; + exit(1); + } + + osd_lock.lock(); + if (is_stopping()) + return 0; + + // start objecter *after* we have 
authenticated, so that we don't ignore + // the OSDMaps it requests. + service.final_init(); + + check_config(); + + dout(10) << "ensuring pgs have consumed prior maps" << dendl; + consume_map(); + + dout(0) << "done with init, starting boot process" << dendl; + + // subscribe to any pg creations + monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); + + // MgrClient needs this (it doesn't have MonClient reference itself) + monc->sub_want("mgrmap", 0, 0); + + // we don't need to ask for an osdmap here; objecter will + //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME); + + monc->renew_subs(); + + start_boot(); + + // Override a few options if mclock scheduler is enabled. + maybe_override_max_osd_capacity_for_qos(); + maybe_override_options_for_qos(); + + return 0; + +out: + enable_disable_fuse(true); + store->umount(); + delete store; + store = NULL; + return r; +} + +void OSD::final_init() +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + asok_hook = new OSDSocketHook(this); + int r = admin_socket->register_command("status", asok_hook, + "high-level status of OSD"); + ceph_assert(r == 0); + r = admin_socket->register_command("flush_journal", + asok_hook, + "flush the journal to permanent store"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_ops_in_flight " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show the ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show the ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_blocked_ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show the blocked ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show recent ops"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_slow_ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show slowest recent ops"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_ops_by_duration " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show slowest recent ops, sorted by duration"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_op_pq_state", + asok_hook, + "dump op priority queue state"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_blocklist", + asok_hook, + "dump blocklisted clients and times"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_watchers", + asok_hook, + "show clients which have active watches," + " and on which objects"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_recovery_reservations", + asok_hook, + "show recovery reservations"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_scrub_reservations", + asok_hook, + "show scrub reservations"); + ceph_assert(r == 0); + r = admin_socket->register_command("get_latest_osdmap", + asok_hook, + "force osd to update the latest map from " + "the mon"); + ceph_assert(r == 0); + + r = admin_socket->register_command("set_heap_property " \ + "name=property,type=CephString " \ + "name=value,type=CephInt", + asok_hook, + "update malloc extension heap property"); + ceph_assert(r == 0); + + r = admin_socket->register_command("get_heap_property " \ + "name=property,type=CephString", + asok_hook, + "get 
malloc extension heap property"); + ceph_assert(r == 0); + + r = admin_socket->register_command("dump_objectstore_kv_stats", + asok_hook, + "print statistics of kvdb which used by bluestore"); + ceph_assert(r == 0); + + r = admin_socket->register_command("dump_scrubs", + asok_hook, + "print scheduled scrubs"); + ceph_assert(r == 0); + + r = admin_socket->register_command("calc_objectstore_db_histogram", + asok_hook, + "Generate key value histogram of kvdb(rocksdb) which used by bluestore"); + ceph_assert(r == 0); + + r = admin_socket->register_command("flush_store_cache", + asok_hook, + "Flush bluestore internal cache"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_pgstate_history", + asok_hook, + "show recent state history"); + ceph_assert(r == 0); + + r = admin_socket->register_command("compact", + asok_hook, + "Commpact object store's omap." + " WARNING: Compaction probably slows your requests"); + ceph_assert(r == 0); + + r = admin_socket->register_command("get_mapped_pools", + asok_hook, + "dump pools whose PG(s) are mapped to this OSD."); + + ceph_assert(r == 0); + + r = admin_socket->register_command("smart name=devid,type=CephString,req=false", + asok_hook, + "probe OSD devices for SMART data."); + + ceph_assert(r == 0); + + r = admin_socket->register_command("list_devices", + asok_hook, + "list OSD devices."); + r = admin_socket->register_command("send_beacon", + asok_hook, + "send OSD beacon to mon immediately"); + + r = admin_socket->register_command( + "dump_osd_network name=value,type=CephInt,req=false", asok_hook, + "Dump osd heartbeat network ping times"); + ceph_assert(r == 0); + + test_ops_hook = new TestOpsSocketHook(&(this->service), this->store); + // Note: pools are CephString instead of CephPoolname because + // these commands traditionally support both pool names and numbers + r = admin_socket->register_command( + "setomapval " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=key,type=CephString "\ + "name=val,type=CephString", + test_ops_hook, + "set omap key"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "rmomapkey " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=key,type=CephString", + test_ops_hook, + "remove omap key"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "setomapheader " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=header,type=CephString", + test_ops_hook, + "set omap header"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "getomap " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname", + test_ops_hook, + "output entire object map"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "truncobj " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=len,type=CephInt", + test_ops_hook, + "truncate object to length"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "injectdataerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=false,range=0|255", + test_ops_hook, + "inject data error to an object"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "injectmdataerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=false,range=0|255", + test_ops_hook, + "inject metadata error to an object"); + ceph_assert(r == 0); + r = admin_socket->register_command( + 
"set_recovery_delay " \ + "name=utime,type=CephInt,req=false", + test_ops_hook, + "Delay osd recovery by specified seconds"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectfull " \ + "name=type,type=CephString,req=false " \ + "name=count,type=CephInt,req=false ", + test_ops_hook, + "Inject a full disk (optional count times)"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "bench " \ + "name=count,type=CephInt,req=false " \ + "name=size,type=CephInt,req=false " \ + "name=object_size,type=CephInt,req=false " \ + "name=object_num,type=CephInt,req=false ", + asok_hook, + "OSD benchmark: write -byte objects(with ), " \ + "(default count=1G default size=4MB). Results in log."); + ceph_assert(r == 0); + r = admin_socket->register_command( + "cluster_log " \ + "name=level,type=CephChoices,strings=error,warning,info,debug " \ + "name=message,type=CephString,n=N", + asok_hook, + "log a message to the cluster log"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "flush_pg_stats", + asok_hook, + "flush pg stats"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "heap " \ + "name=heapcmd,type=CephChoices,strings=" \ + "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \ + "name=value,type=CephString,req=false", + asok_hook, + "show heap usage info (available only if compiled with tcmalloc)"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "debug dump_missing " \ + "name=filename,type=CephFilepath", + asok_hook, + "dump missing objects to a named file"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "debug kick_recovery_wq " \ + "name=delay,type=CephInt,range=0", + asok_hook, + "set osd_recovery_delay_start to "); + ceph_assert(r == 0); + r = admin_socket->register_command( + "cpu_profiler " \ + "name=arg,type=CephChoices,strings=status|flush", + asok_hook, + "run cpu profiling on daemon"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "dump_pg_recovery_stats", + asok_hook, + "dump pg recovery statistics"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "reset_pg_recovery_stats", + asok_hook, + "reset pg recovery statistics"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "cache drop", + asok_hook, + "Drop all OSD caches"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "cache status", + asok_hook, + "Get OSD caches statistics"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "scrub_purged_snaps", + asok_hook, + "Scrub purged_snaps vs snapmapper index"); + ceph_assert(r == 0); + + // -- pg commands -- + // old form: ceph pg command ... 
+ r = admin_socket->register_command( + "pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=query", + asok_hook, + ""); + ceph_assert(r == 0); + r = admin_socket->register_command( + "pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=mark_unfound_lost " \ + "name=mulcmd,type=CephChoices,strings=revert|delete", + asok_hook, + ""); + ceph_assert(r == 0); + r = admin_socket->register_command( + "pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=list_unfound " \ + "name=offset,type=CephString,req=false", + asok_hook, + ""); + ceph_assert(r == 0); + r = admin_socket->register_command( + "pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=scrub " \ + "name=time,type=CephInt,req=false", + asok_hook, + ""); + ceph_assert(r == 0); + r = admin_socket->register_command( + "pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=deep_scrub " \ + "name=time,type=CephInt,req=false", + asok_hook, + ""); + ceph_assert(r == 0); + // new form: tell for both cli and rest + r = admin_socket->register_command( + "query", + asok_hook, + "show details of a specific pg"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "mark_unfound_lost " \ + "name=pgid,type=CephPgid,req=false " \ + "name=mulcmd,type=CephChoices,strings=revert|delete", + asok_hook, + "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "list_unfound " \ + "name=pgid,type=CephPgid,req=false " \ + "name=offset,type=CephString,req=false", + asok_hook, + "list unfound objects on this pg, perhaps starting at an offset given in JSON"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "scrub " \ + "name=pgid,type=CephPgid,req=false " \ + "name=time,type=CephInt,req=false", + asok_hook, + "Trigger a scheduled scrub "); + ceph_assert(r == 0); + r = admin_socket->register_command( + "deep_scrub " \ + "name=pgid,type=CephPgid,req=false " \ + "name=time,type=CephInt,req=false", + asok_hook, + "Trigger a scheduled deep scrub "); + ceph_assert(r == 0); +} + +PerfCounters* OSD::create_logger() +{ + PerfCounters* logger = build_osd_logger(cct); + cct->get_perfcounters_collection()->add(logger); + return logger; +} + +PerfCounters* OSD::create_recoverystate_perf() +{ + PerfCounters* recoverystate_perf = build_recoverystate_perf(cct); + cct->get_perfcounters_collection()->add(recoverystate_perf); + return recoverystate_perf; +} + +int OSD::shutdown() +{ + if (cct->_conf->osd_fast_shutdown) { + derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl; + if (cct->_conf->osd_fast_shutdown_notify_mon) + service.prepare_to_stop(); + cct->_log->flush(); + _exit(0); + } + + if (!service.prepare_to_stop()) + return 0; // already shutting down + osd_lock.lock(); + if (is_stopping()) { + osd_lock.unlock(); + return 0; + } + dout(0) << "shutdown" << dendl; + + set_state(STATE_STOPPING); + + // Debugging + if (cct->_conf.get_val("osd_debug_shutdown")) { + cct->_conf.set_val("debug_osd", "100"); + cct->_conf.set_val("debug_journal", "100"); + cct->_conf.set_val("debug_filestore", "100"); + cct->_conf.set_val("debug_bluestore", "100"); + cct->_conf.set_val("debug_ms", "100"); + cct->_conf.apply_changes(nullptr); + } + + // stop MgrClient earlier as it's more like an internal consumer of OSD + mgrc.shutdown(); + + service.start_shutdown(); + + // stop sending work to pgs. 
this just prevents any new work in _process + // from racing with on_shutdown and potentially entering the pg after. + op_shardedwq.drain(); + + // Shutdown PGs + { + vector pgs; + _get_pgs(&pgs); + for (auto pg : pgs) { + pg->shutdown(); + } + } + + // drain op queue again (in case PGs requeued something) + op_shardedwq.drain(); + { + finished.clear(); // zap waiters (bleh, this is messy) + waiting_for_osdmap.clear(); + } + + // unregister commands + cct->get_admin_socket()->unregister_commands(asok_hook); + delete asok_hook; + asok_hook = NULL; + + cct->get_admin_socket()->unregister_commands(test_ops_hook); + delete test_ops_hook; + test_ops_hook = NULL; + + osd_lock.unlock(); + + { + std::lock_guard l{heartbeat_lock}; + heartbeat_stop = true; + heartbeat_cond.notify_all(); + heartbeat_peers.clear(); + } + heartbeat_thread.join(); + + hb_back_server_messenger->mark_down_all(); + hb_front_server_messenger->mark_down_all(); + hb_front_client_messenger->mark_down_all(); + hb_back_client_messenger->mark_down_all(); + + osd_op_tp.drain(); + osd_op_tp.stop(); + dout(10) << "op sharded tp stopped" << dendl; + + dout(10) << "stopping agent" << dendl; + service.agent_stop(); + + boot_finisher.wait_for_empty(); + + osd_lock.lock(); + + boot_finisher.stop(); + reset_heartbeat_peers(true); + + tick_timer.shutdown(); + + { + std::lock_guard l(tick_timer_lock); + tick_timer_without_osd_lock.shutdown(); + } + + // note unmount epoch + dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl; + superblock.mounted = service.get_boot_epoch(); + superblock.clean_thru = get_osdmap_epoch(); + ObjectStore::Transaction t; + write_superblock(t); + int r = store->queue_transaction(service.meta_ch, std::move(t)); + if (r) { + derr << "OSD::shutdown: error writing superblock: " + << cpp_strerror(r) << dendl; + } + + + service.shutdown_reserver(); + + // Remove PGs +#ifdef PG_DEBUG_REFS + service.dump_live_pgids(); +#endif + while (true) { + vector pgs; + _get_pgs(&pgs, true); + if (pgs.empty()) { + break; + } + for (auto& pg : pgs) { + if (pg->is_deleted()) { + continue; + } + dout(20) << " kicking pg " << pg << dendl; + pg->lock(); + if (pg->get_num_ref() != 1) { + derr << "pgid " << pg->get_pgid() << " has ref count of " + << pg->get_num_ref() << dendl; +#ifdef PG_DEBUG_REFS + pg->dump_live_ids(); +#endif + if (cct->_conf->osd_shutdown_pgref_assert) { + ceph_abort(); + } + } + pg->ch.reset(); + pg->unlock(); + } + } +#ifdef PG_DEBUG_REFS + service.dump_live_pgids(); +#endif + + osd_lock.unlock(); + cct->_conf.remove_observer(this); + osd_lock.lock(); + + service.meta_ch.reset(); + + dout(10) << "syncing store" << dendl; + enable_disable_fuse(true); + + if (cct->_conf->osd_journal_flush_on_shutdown) { + dout(10) << "flushing journal" << dendl; + store->flush_journal(); + } + + monc->shutdown(); + osd_lock.unlock(); + { + std::unique_lock l{map_lock}; + set_osdmap(OSDMapRef()); + } + for (auto s : shards) { + std::lock_guard l(s->osdmap_lock); + s->shard_osdmap = OSDMapRef(); + } + service.shutdown(); + + std::lock_guard lock(osd_lock); + store->umount(); + delete store; + store = nullptr; + dout(10) << "Store synced" << dendl; + + op_tracker.on_shutdown(); + + ClassHandler::get_instance().shutdown(); + client_messenger->shutdown(); + cluster_messenger->shutdown(); + hb_front_client_messenger->shutdown(); + hb_back_client_messenger->shutdown(); + objecter_messenger->shutdown(); + hb_front_server_messenger->shutdown(); + hb_back_server_messenger->shutdown(); + + return r; +} + +int 
OSD::mon_cmd_maybe_osd_create(string &cmd) +{ + bool created = false; + while (true) { + dout(10) << __func__ << " cmd: " << cmd << dendl; + vector vcmd{cmd}; + bufferlist inbl; + C_SaferCond w; + string outs; + monc->start_mon_command(vcmd, inbl, NULL, &outs, &w); + int r = w.wait(); + if (r < 0) { + if (r == -ENOENT && !created) { + string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami) + + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}"; + vector vnewcmd{newcmd}; + bufferlist inbl; + C_SaferCond w; + string outs; + monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w); + int r = w.wait(); + if (r < 0) { + derr << __func__ << " fail: osd does not exist and created failed: " + << cpp_strerror(r) << dendl; + return r; + } + created = true; + continue; + } + derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl; + return r; + } + break; + } + + return 0; +} + +int OSD::update_crush_location() +{ + if (!cct->_conf->osd_crush_update_on_start) { + dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl; + return 0; + } + + char weight[32]; + if (cct->_conf->osd_crush_initial_weight >= 0) { + snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight); + } else { + struct store_statfs_t st; + osd_alert_list_t alerts; + int r = store->statfs(&st, &alerts); + if (r < 0) { + derr << "statfs: " << cpp_strerror(r) << dendl; + return r; + } + snprintf(weight, sizeof(weight), "%.4lf", + std::max(.00001, + double(st.total) / + double(1ull << 40 /* TB */))); + } + + dout(10) << __func__ << " crush location is " << cct->crush_location << dendl; + + string cmd = + string("{\"prefix\": \"osd crush create-or-move\", ") + + string("\"id\": ") + stringify(whoami) + ", " + + string("\"weight\":") + weight + ", " + + string("\"args\": [") + stringify(cct->crush_location) + "]}"; + return mon_cmd_maybe_osd_create(cmd); +} + +int OSD::update_crush_device_class() +{ + if (!cct->_conf->osd_class_update_on_start) { + dout(10) << __func__ << " osd_class_update_on_start = false" << dendl; + return 0; + } + + string device_class; + int r = store->read_meta("crush_device_class", &device_class); + if (r < 0 || device_class.empty()) { + device_class = store->get_default_device_class(); + } + + if (device_class.empty()) { + dout(20) << __func__ << " no device class stored locally" << dendl; + return 0; + } + + string cmd = + string("{\"prefix\": \"osd crush set-device-class\", ") + + string("\"class\": \"") + device_class + string("\", ") + + string("\"ids\": [\"") + stringify(whoami) + string("\"]}"); + + r = mon_cmd_maybe_osd_create(cmd); + if (r == -EBUSY) { + // good, already bound to a device-class + return 0; + } else { + return r; + } +} + +void OSD::write_superblock(ObjectStore::Transaction& t) +{ + dout(10) << "write_superblock " << superblock << dendl; + + //hack: at minimum it's using the baseline feature set + if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE)) + superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + + bufferlist bl; + encode(superblock, bl); + t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); +} + +int OSD::read_superblock() +{ + bufferlist bl; + int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl); + if (r < 0) + return r; + + auto p = bl.cbegin(); + decode(superblock, p); + + dout(10) << "read_superblock " << superblock << dendl; + + return 0; +} + +void OSD::clear_temp_objects() +{ + dout(10) << __func__ << dendl; 
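+  // descriptive note (editorial): walk every PG collection and queue removal of any
+  // leftover temp objects, including legacy Hammer-era temps that were written with pool -1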
+ vector ls; + store->list_collections(ls); + for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { + spg_t pgid; + if (!p->is_pg(&pgid)) + continue; + + // list temp objects + dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl; + + vector temps; + ghobject_t next; + while (1) { + vector objects; + auto ch = store->open_collection(*p); + ceph_assert(ch); + store->collection_list(ch, next, ghobject_t::get_max(), + store->get_ideal_list_max(), + &objects, &next); + if (objects.empty()) + break; + vector::iterator q; + for (q = objects.begin(); q != objects.end(); ++q) { + // Hammer set pool for temps to -1, so check for clean-up + if (q->hobj.is_temp() || (q->hobj.pool == -1)) { + temps.push_back(*q); + } else { + break; + } + } + // If we saw a non-temp object and hit the break above we can + // break out of the while loop too. + if (q != objects.end()) + break; + } + if (!temps.empty()) { + ObjectStore::Transaction t; + int removed = 0; + for (vector::iterator q = temps.begin(); q != temps.end(); ++q) { + dout(20) << " removing " << *p << " object " << *q << dendl; + t.remove(*p, *q); + if (++removed > cct->_conf->osd_target_transaction_size) { + store->queue_transaction(service.meta_ch, std::move(t)); + t = ObjectStore::Transaction(); + removed = 0; + } + } + if (removed) { + store->queue_transaction(service.meta_ch, std::move(t)); + } + } + } +} + +void OSD::recursive_remove_collection(CephContext* cct, + ObjectStore *store, spg_t pgid, + coll_t tmp) +{ + OSDriver driver( + store, + coll_t(), + make_snapmapper_oid()); + + ObjectStore::CollectionHandle ch = store->open_collection(tmp); + ObjectStore::Transaction t; + SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard); + + ghobject_t next; + int max = cct->_conf->osd_target_transaction_size; + vector objects; + objects.reserve(max); + while (true) { + objects.clear(); + store->collection_list(ch, next, ghobject_t::get_max(), + max, &objects, &next); + generic_dout(10) << __func__ << " " << objects << dendl; + if (objects.empty()) + break; + for (auto& p: objects) { + OSDriver::OSTransaction _t(driver.get_transaction(&t)); + int r = mapper.remove_oid(p.hobj, &_t); + if (r != 0 && r != -ENOENT) + ceph_abort(); + t.remove(tmp, p); + } + int r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + t = ObjectStore::Transaction(); + } + t.remove_collection(tmp); + int r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + + C_SaferCond waiter; + if (!ch->flush_commit(&waiter)) { + waiter.wait(); + } +} + + +// ====================================================== +// PG's + +PG* OSD::_make_pg( + OSDMapRef createmap, + spg_t pgid) +{ + dout(10) << __func__ << " " << pgid << dendl; + pg_pool_t pi; + map ec_profile; + string name; + if (createmap->have_pg_pool(pgid.pool())) { + pi = *createmap->get_pg_pool(pgid.pool()); + name = createmap->get_pool_name(pgid.pool()); + if (pi.is_erasure()) { + ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile); + } + } else { + // pool was deleted; grab final pg_pool_t off disk. 
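+    // (this tombstone meta object carries the pool's last pg_pool_t, its name and,
+    //  on releases newer than v13.0.2, its ec profile)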
+ ghobject_t oid = make_final_pool_info_oid(pgid.pool()); + bufferlist bl; + int r = store->read(service.meta_ch, oid, 0, 0, bl); + if (r < 0) { + derr << __func__ << " missing pool " << pgid.pool() << " tombstone" + << dendl; + return nullptr; + } + ceph_assert(r >= 0); + auto p = bl.cbegin(); + decode(pi, p); + decode(name, p); + if (p.end()) { // dev release v13.0.2 did not include ec_profile + derr << __func__ << " missing ec_profile from pool " << pgid.pool() + << " tombstone" << dendl; + return nullptr; + } + decode(ec_profile, p); + } + PGPool pool(createmap, pgid.pool(), pi, name); + PG *pg; + if (pi.type == pg_pool_t::TYPE_REPLICATED || + pi.type == pg_pool_t::TYPE_ERASURE) + pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid); + else + ceph_abort(); + return pg; +} + +void OSD::_get_pgs(vector *v, bool clear_too) +{ + v->clear(); + v->reserve(get_num_pgs()); + for (auto& s : shards) { + std::lock_guard l(s->shard_lock); + for (auto& j : s->pg_slots) { + if (j.second->pg && + !j.second->pg->is_deleted()) { + v->push_back(j.second->pg); + if (clear_too) { + s->_detach_pg(j.second.get()); + } + } + } + } +} + +void OSD::_get_pgids(vector *v) +{ + v->clear(); + v->reserve(get_num_pgs()); + for (auto& s : shards) { + std::lock_guard l(s->shard_lock); + for (auto& j : s->pg_slots) { + if (j.second->pg && + !j.second->pg->is_deleted()) { + v->push_back(j.first); + } + } + } +} + +void OSD::register_pg(PGRef pg) +{ + spg_t pgid = pg->get_pgid(); + uint32_t shard_index = pgid.hash_to_shard(num_shards); + auto sdata = shards[shard_index]; + std::lock_guard l(sdata->shard_lock); + auto r = sdata->pg_slots.emplace(pgid, make_unique()); + ceph_assert(r.second); + auto *slot = r.first->second.get(); + dout(20) << __func__ << " " << pgid << " " << pg << dendl; + sdata->_attach_pg(slot, pg.get()); +} + +bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num) +{ + auto sdata = pg->osd_shard; + ceph_assert(sdata); + { + std::lock_guard l(sdata->shard_lock); + auto p = sdata->pg_slots.find(pg->pg_id); + if (p == sdata->pg_slots.end() || + !p->second->pg) { + dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl; + return false; + } + if (p->second->waiting_for_merge_epoch) { + dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl; + return false; + } + dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl; + sdata->_detach_pg(p->second.get()); + } + + for (auto shard : shards) { + shard->unprime_split_children(pg->pg_id, old_pg_num); + } + + // update pg count now since we might not get an osdmap any time soon. 
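+  // (decrement whichever of the primary/replica/stray gauges this pg was last counted under)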
+ if (pg->is_primary()) + service.logger->dec(l_osd_pg_primary); + else if (pg->is_nonprimary()) + service.logger->dec(l_osd_pg_replica); // misnomer + else + service.logger->dec(l_osd_pg_stray); + + return true; +} + +PGRef OSD::_lookup_pg(spg_t pgid) +{ + uint32_t shard_index = pgid.hash_to_shard(num_shards); + auto sdata = shards[shard_index]; + std::lock_guard l(sdata->shard_lock); + auto p = sdata->pg_slots.find(pgid); + if (p == sdata->pg_slots.end()) { + return nullptr; + } + return p->second->pg; +} + +PGRef OSD::_lookup_lock_pg(spg_t pgid) +{ + PGRef pg = _lookup_pg(pgid); + if (!pg) { + return nullptr; + } + pg->lock(); + if (!pg->is_deleted()) { + return pg; + } + pg->unlock(); + return nullptr; +} + +PGRef OSD::lookup_lock_pg(spg_t pgid) +{ + return _lookup_lock_pg(pgid); +} + +void OSD::load_pgs() +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + dout(0) << "load_pgs" << dendl; + + { + auto pghist = make_pg_num_history_oid(); + bufferlist bl; + int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0); + if (r >= 0 && bl.length() > 0) { + auto p = bl.cbegin(); + decode(pg_num_history, p); + } + dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl; + } + + vector<coll_t> ls; + int r = store->list_collections(ls); + if (r < 0) { + derr << "failed to list pgs: " << cpp_strerror(-r) << dendl; + } + + int num = 0; + for (vector<coll_t>::iterator it = ls.begin(); + it != ls.end(); + ++it) { + spg_t pgid; + if (it->is_temp(&pgid) || + (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) { + dout(10) << "load_pgs " << *it + << " removing, legacy or flagged for removal pg" << dendl; + recursive_remove_collection(cct, store, pgid, *it); + continue; + } + + if (!it->is_pg(&pgid)) { + dout(10) << "load_pgs ignoring unrecognized " << *it << dendl; + continue; + } + + dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl; + epoch_t map_epoch = 0; + int r = PG::peek_map_epoch(store, pgid, &map_epoch); + if (r < 0) { + derr << __func__ << " unable to peek at " << pgid << " metadata, skipping" + << dendl; + continue; + } + + PGRef pg; + if (map_epoch > 0) { + OSDMapRef pgosdmap = service.try_get_map(map_epoch); + if (!pgosdmap) { + if (!get_osdmap()->have_pg_pool(pgid.pool())) { + derr << __func__ << ": could not find map for epoch " << map_epoch + << " on pg " << pgid << ", but the pool is not present in the " + << "current map, so this is probably a result of bug 10617. " + << "Skipping the pg for now, you can use ceph-objectstore-tool " + << "to clean it up later." << dendl; + continue; + } else { + derr << __func__ << ": have pgid " << pgid << " at epoch " + << map_epoch << ", but missing map. Crashing."
+ << dendl; + ceph_abort_msg("Missing map in load_pgs"); + } + } + pg = _make_pg(pgosdmap, pgid); + } else { + pg = _make_pg(get_osdmap(), pgid); + } + if (!pg) { + recursive_remove_collection(cct, store, pgid, *it); + continue; + } + + // there can be no waiters here, so we don't call _wake_pg_slot + + pg->lock(); + pg->ch = store->open_collection(pg->coll); + + // read pg state, log + pg->read_state(store); + + if (pg->dne()) { + dout(10) << "load_pgs " << *it << " deleting dne" << dendl; + pg->ch = nullptr; + pg->unlock(); + recursive_remove_collection(cct, store, pgid, *it); + continue; + } + { + uint32_t shard_index = pgid.hash_to_shard(shards.size()); + assert(NULL != shards[shard_index]); + store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue)); + } + + pg->reg_next_scrub(); + + dout(10) << __func__ << " loaded " << *pg << dendl; + pg->unlock(); + + register_pg(pg); + ++num; + } + dout(0) << __func__ << " opened " << num << " pgs" << dendl; +} + + +PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap, + const PGCreateInfo *info) +{ + spg_t pgid = info->pgid; + + if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) { + dout(10) << __func__ << " hit max pg, dropping" << dendl; + return nullptr; + } + + PeeringCtx rctx = create_context(); + + OSDMapRef startmap = get_map(info->epoch); + + if (info->by_mon) { + int64_t pool_id = pgid.pgid.pool(); + const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); + if (!pool) { + dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl; + return nullptr; + } + if (osdmap->require_osd_release >= ceph_release_t::nautilus && + !pool->has_flag(pg_pool_t::FLAG_CREATING)) { + // this ensures we do not process old creating messages after the + // pool's initial pgs have been created (and pg are subsequently + // allowed to split or merge). 
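+      // (the monitor clears FLAG_CREATING once the pool's initial PGs all exist,
+      //  so a missing flag marks this create message as stale)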
+ dout(20) << __func__ << " dropping " << pgid + << "create, pool does not have CREATING flag set" << dendl; + return nullptr; + } + } + + int up_primary, acting_primary; + vector up, acting; + startmap->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool()); + if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) && + store->get_type() != "bluestore") { + clog->warn() << "pg " << pgid + << " is at risk of silent data corruption: " + << "the pool allows ec overwrites but is not stored in " + << "bluestore, so deep scrubbing will not detect bitrot"; + } + create_pg_collection( + rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num())); + init_pg_ondisk(rctx.transaction, pgid, pp); + + int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting); + + PGRef pg = _make_pg(startmap, pgid); + pg->ch = store->create_new_collection(pg->coll); + + { + uint32_t shard_index = pgid.hash_to_shard(shards.size()); + assert(NULL != shards[shard_index]); + store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue)); + } + + pg->lock(true); + + // we are holding the shard lock + ceph_assert(!pg->is_deleted()); + + pg->init( + role, + up, + up_primary, + acting, + acting_primary, + info->history, + info->past_intervals, + false, + rctx.transaction); + + pg->init_collection_pool_opts(); + + if (pg->is_primary()) { + std::lock_guard locker{m_perf_queries_lock}; + pg->set_dynamic_perf_stats_queries(m_perf_queries); + } + + pg->handle_initialize(rctx); + pg->handle_activate_map(rctx); + + dispatch_context(rctx, pg.get(), osdmap, nullptr); + + dout(10) << __func__ << " new pg " << *pg << dendl; + return pg; +} + +bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap, + spg_t pgid, + bool is_mon_create) +{ + const auto max_pgs_per_osd = + (cct->_conf.get_val("mon_max_pg_per_osd") * + cct->_conf.get_val("osd_max_pg_per_osd_hard_ratio")); + + if (num_pgs < max_pgs_per_osd) { + return false; + } + + std::lock_guard l(pending_creates_lock); + if (is_mon_create) { + pending_creates_from_mon++; + } else { + bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0; + pending_creates_from_osd.emplace(pgid, is_primary); + } + dout(1) << __func__ << " withhold creation of pg " << pgid + << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl; + return true; +} + +// to re-trigger a peering, we have to twiddle the pg mapping a little bit, +// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn +// to up set if pg_temp is empty. so an empty pg_temp won't work. 
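+// twiddle() below perturbs the acting set just enough (shrink it, or pad it with -1)
+// that the requested pg_temp differs from the raw mapping.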
+static vector<int> twiddle(const vector<int>& acting) { + if (acting.size() > 1) { + return {acting[0]}; + } else { + vector<int> twiddled(acting.begin(), acting.end()); + twiddled.push_back(-1); + return twiddled; + } +} + +void OSD::resume_creating_pg() +{ + bool do_sub_pg_creates = false; + bool have_pending_creates = false; + { + const auto max_pgs_per_osd = + (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") * + cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio")); + if (max_pgs_per_osd <= num_pgs) { + // this could happen if admin decreases this setting before a PG is removed + return; + } + unsigned spare_pgs = max_pgs_per_osd - num_pgs; + std::lock_guard l(pending_creates_lock); + if (pending_creates_from_mon > 0) { + dout(20) << __func__ << " pending_creates_from_mon " + << pending_creates_from_mon << dendl; + do_sub_pg_creates = true; + if (pending_creates_from_mon >= spare_pgs) { + spare_pgs = pending_creates_from_mon = 0; + } else { + spare_pgs -= pending_creates_from_mon; + pending_creates_from_mon = 0; + } + } + auto pg = pending_creates_from_osd.cbegin(); + while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) { + dout(20) << __func__ << " pg " << pg->first << dendl; + vector<int> acting; + get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr); + service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true); + pg = pending_creates_from_osd.erase(pg); + do_sub_pg_creates = true; + spare_pgs--; + } + have_pending_creates = (pending_creates_from_mon > 0 || + !pending_creates_from_osd.empty()); + } + + bool do_renew_subs = false; + if (do_sub_pg_creates) { + if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) { + dout(4) << __func__ << ": resolicit pg creates from mon since " + << last_pg_create_epoch << dendl; + do_renew_subs = true; + } + } + version_t start = get_osdmap_epoch() + 1; + if (have_pending_creates) { + // don't miss any new osdmap deleting PGs + if (monc->sub_want("osdmap", start, 0)) { + dout(4) << __func__ << ": resolicit osdmap from mon since " + << start << dendl; + do_renew_subs = true; + } + } else if (do_sub_pg_creates) { + // no need to subscribe the osdmap continuously anymore + // once the pgtemp and/or mon_subscribe(pg_creates) is sent + if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) { + dout(4) << __func__ << ": re-subscribe osdmap(onetime) since " + << start << dendl; + do_renew_subs = true; + } + } + + if (do_renew_subs) { + monc->renew_subs(); + } + + service.send_pg_temp(); +} + +void OSD::build_initial_pg_history( + spg_t pgid, + epoch_t created, + utime_t created_stamp, + pg_history_t *h, + PastIntervals *pi) +{ + dout(10) << __func__ << " " << pgid << " created " << created << dendl; + *h = pg_history_t(created, created_stamp); + + OSDMapRef lastmap = service.get_map(created); + int up_primary, acting_primary; + vector<int> up, acting; + lastmap->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + ostringstream debug; + for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) { + OSDMapRef osdmap = service.get_map(e); + int new_up_primary, new_acting_primary; + vector<int> new_up, new_acting; + osdmap->pg_to_up_acting_osds( + pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary); + + // this is a bit imprecise, but sufficient?
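+    // (a pg is treated as recoverable when the shards we have reach the pool's min_size;
+    //  see min_size_predicate_t below)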
+ struct min_size_predicate_t : public IsPGRecoverablePredicate { + const pg_pool_t *pi; + bool operator()(const set &have) const { + return have.size() >= pi->min_size; + } + explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {} + } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool())); + + bool new_interval = PastIntervals::check_new_interval( + acting_primary, + new_acting_primary, + acting, new_acting, + up_primary, + new_up_primary, + up, new_up, + h->same_interval_since, + h->last_epoch_clean, + osdmap.get(), + lastmap.get(), + pgid.pgid, + min_size_predicate, + pi, + &debug); + if (new_interval) { + h->same_interval_since = e; + if (up != new_up) { + h->same_up_since = e; + } + if (acting_primary != new_acting_primary) { + h->same_primary_since = e; + } + if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()), + osdmap->get_pg_num(pgid.pgid.pool()), + nullptr)) { + h->last_epoch_split = e; + } + up = new_up; + acting = new_acting; + up_primary = new_up_primary; + acting_primary = new_acting_primary; + } + lastmap = osdmap; + } + dout(20) << __func__ << " " << debug.str() << dendl; + dout(10) << __func__ << " " << *h << " " << *pi + << " [" << (pi->empty() ? pair(0,0) : + pi->get_bounds()) << ")" + << dendl; +} + +void OSD::_add_heartbeat_peer(int p) +{ + if (p == whoami) + return; + HeartbeatInfo *hi; + + map::iterator i = heartbeat_peers.find(p); + if (i == heartbeat_peers.end()) { + pair cons = service.get_con_osd_hb(p, get_osdmap_epoch()); + if (!cons.first) + return; + assert(cons.second); + + hi = &heartbeat_peers[p]; + hi->peer = p; + + auto stamps = service.get_hb_stamps(p); + + auto sb = ceph::make_ref(cct, cons.first.get()); + sb->peer = p; + sb->stamps = stamps; + hi->hb_interval_start = ceph_clock_now(); + hi->con_back = cons.first.get(); + hi->con_back->set_priv(sb); + + auto sf = ceph::make_ref(cct, cons.second.get()); + sf->peer = p; + sf->stamps = stamps; + hi->con_front = cons.second.get(); + hi->con_front->set_priv(sf); + + dout(10) << "_add_heartbeat_peer: new peer osd." << p + << " " << hi->con_back->get_peer_addr() + << " " << hi->con_front->get_peer_addr() + << dendl; + } else { + hi = &i->second; + } + hi->epoch = get_osdmap_epoch(); +} + +void OSD::_remove_heartbeat_peer(int n) +{ + map::iterator q = heartbeat_peers.find(n); + ceph_assert(q != heartbeat_peers.end()); + dout(20) << " removing heartbeat peer osd." << n + << " " << q->second.con_back->get_peer_addr() + << " " << (q->second.con_front ? 
q->second.con_front->get_peer_addr() : entity_addr_t()) + << dendl; + q->second.clear_mark_down(); + heartbeat_peers.erase(q); +} + +void OSD::need_heartbeat_peer_update() +{ + if (is_stopping()) + return; + dout(20) << "need_heartbeat_peer_update" << dendl; + heartbeat_set_peers_need_update(); +} + +void OSD::maybe_update_heartbeat_peers() +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + + if (is_waiting_for_healthy() || is_active()) { + utime_t now = ceph_clock_now(); + if (last_heartbeat_resample == utime_t()) { + last_heartbeat_resample = now; + heartbeat_set_peers_need_update(); + } else if (!heartbeat_peers_need_update()) { + utime_t dur = now - last_heartbeat_resample; + if (dur > cct->_conf->osd_heartbeat_grace) { + dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl; + heartbeat_set_peers_need_update(); + last_heartbeat_resample = now; + // automatically clean up any stale heartbeat peers + // if we are unhealthy, then clean all + reset_heartbeat_peers(is_waiting_for_healthy()); + } + } + } + + if (!heartbeat_peers_need_update()) + return; + heartbeat_clear_peers_need_update(); + + std::lock_guard l(heartbeat_lock); + + dout(10) << "maybe_update_heartbeat_peers updating" << dendl; + + + // build heartbeat from set + if (is_active()) { + vector pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + pg->with_heartbeat_peers([&](int peer) { + if (get_osdmap()->is_up(peer)) { + _add_heartbeat_peer(peer); + } + }); + } + } + + // include next and previous up osds to ensure we have a fully-connected set + set want, extras; + const int next = get_osdmap()->get_next_up_osd_after(whoami); + if (next >= 0) + want.insert(next); + int prev = get_osdmap()->get_previous_up_osd_before(whoami); + if (prev >= 0 && prev != next) + want.insert(prev); + + // make sure we have at least **min_down** osds coming from different + // subtree level (e.g., hosts) for fast failure detection. + auto min_down = cct->_conf.get_val("mon_osd_min_down_reporters"); + auto subtree = cct->_conf.get_val("mon_osd_reporter_subtree_level"); + auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers); + get_osdmap()->get_random_up_osds_by_subtree( + whoami, subtree, limit, want, &want); + + for (set::iterator p = want.begin(); p != want.end(); ++p) { + dout(10) << " adding neighbor peer osd." << *p << dendl; + extras.insert(*p); + _add_heartbeat_peer(*p); + } + + // remove down peers; enumerate extras + map::iterator p = heartbeat_peers.begin(); + while (p != heartbeat_peers.end()) { + if (!get_osdmap()->is_up(p->first)) { + int o = p->first; + ++p; + _remove_heartbeat_peer(o); + continue; + } + if (p->second.epoch < get_osdmap_epoch()) { + extras.insert(p->first); + } + ++p; + } + + // too few? + for (int n = next; n >= 0; ) { + if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers) + break; + if (!extras.count(n) && !want.count(n) && n != whoami) { + dout(10) << " adding random peer osd." << n << dendl; + extras.insert(n); + _add_heartbeat_peer(n); + } + n = get_osdmap()->get_next_up_osd_after(n); + if (n == next) + break; // came full circle; stop + } + + // too many? 
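+  // trim surplus extras (never peers we explicitly want) back down to osd_heartbeat_min_peers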
+ for (set::iterator p = extras.begin(); + (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end(); + ++p) { + if (want.count(*p)) + continue; + _remove_heartbeat_peer(*p); + } + + dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl; + + // clean up stale failure pending + for (auto it = failure_pending.begin(); it != failure_pending.end();) { + if (heartbeat_peers.count(it->first) == 0) { + send_still_alive(get_osdmap_epoch(), it->first, it->second.second); + failure_pending.erase(it++); + } else { + it++; + } + } +} + +void OSD::reset_heartbeat_peers(bool all) +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + dout(10) << "reset_heartbeat_peers" << dendl; + utime_t stale = ceph_clock_now(); + stale -= cct->_conf.get_val("osd_heartbeat_stale"); + std::lock_guard l(heartbeat_lock); + for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) { + auto& [peer, hi] = *it; + if (all || hi.is_stale(stale)) { + hi.clear_mark_down(); + // stop sending failure_report to mon too + failure_queue.erase(peer); + failure_pending.erase(peer); + it = heartbeat_peers.erase(it); + } else { + ++it; + } + } +} + +void OSD::handle_osd_ping(MOSDPing *m) +{ + if (superblock.cluster_fsid != m->fsid) { + dout(20) << "handle_osd_ping from " << m->get_source_inst() + << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid + << dendl; + m->put(); + return; + } + + int from = m->get_source().num(); + + heartbeat_lock.lock(); + if (is_stopping()) { + heartbeat_lock.unlock(); + m->put(); + return; + } + + utime_t now = ceph_clock_now(); + auto mnow = service.get_mnow(); + ConnectionRef con(m->get_connection()); + OSDMapRef curmap = service.get_osdmap(); + if (!curmap) { + heartbeat_lock.unlock(); + m->put(); + return; + } + + auto sref = con->get_priv(); + Session *s = static_cast(sref.get()); + if (!s) { + heartbeat_lock.unlock(); + m->put(); + return; + } + if (!s->stamps) { + s->peer = from; + s->stamps = service.get_hb_stamps(from); + } + + switch (m->op) { + + case MOSDPing::PING: + { + if (cct->_conf->osd_debug_drop_ping_probability > 0) { + auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from); + if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) { + if (heartbeat_drop->second == 0) { + debug_heartbeat_drops_remaining.erase(heartbeat_drop); + } else { + --heartbeat_drop->second; + dout(5) << "Dropping heartbeat from " << from + << ", " << heartbeat_drop->second + << " remaining to drop" << dendl; + break; + } + } else if (cct->_conf->osd_debug_drop_ping_probability > + ((((double)(rand()%100))/100.0))) { + heartbeat_drop = + debug_heartbeat_drops_remaining.insert(std::make_pair(from, + cct->_conf->osd_debug_drop_ping_duration)).first; + dout(5) << "Dropping heartbeat from " << from + << ", " << heartbeat_drop->second + << " remaining to drop" << dendl; + break; + } + } + + ceph::signedspan sender_delta_ub{}; + s->stamps->got_ping( + m->up_from, + mnow, + m->mono_send_stamp, + m->delta_ub, + &sender_delta_ub); + dout(20) << __func__ << " new stamps " << *s->stamps << dendl; + + if (!cct->get_heartbeat_map()->is_healthy()) { + dout(10) << "internal heartbeat not healthy, dropping ping request" + << dendl; + break; + } + + Message *r = new MOSDPing(monc->get_fsid(), + curmap->get_epoch(), + MOSDPing::PING_REPLY, + m->ping_stamp, + m->mono_ping_stamp, + mnow, + service.get_up_epoch(), + cct->_conf->osd_heartbeat_min_size, + sender_delta_ub); + con->send_message(r); + + if 
(curmap->is_up(from)) { + if (is_active()) { + ConnectionRef cluster_con = service.get_con_osd_cluster( + from, curmap->get_epoch()); + if (cluster_con) { + service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch); + } + } + } else if (!curmap->exists(from) || + curmap->get_down_at(from) > m->map_epoch) { + // tell them they have died + Message *r = new MOSDPing(monc->get_fsid(), + curmap->get_epoch(), + MOSDPing::YOU_DIED, + m->ping_stamp, + m->mono_ping_stamp, + mnow, + service.get_up_epoch(), + cct->_conf->osd_heartbeat_min_size); + con->send_message(r); + } + } + break; + + case MOSDPing::PING_REPLY: + { + map::iterator i = heartbeat_peers.find(from); + if (i != heartbeat_peers.end()) { + auto acked = i->second.ping_history.find(m->ping_stamp); + if (acked != i->second.ping_history.end()) { + int &unacknowledged = acked->second.second; + if (con == i->second.con_back) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_tx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back + << " -> " << now + << " last_rx_front " << i->second.last_rx_front + << dendl; + i->second.last_rx_back = now; + ceph_assert(unacknowledged > 0); + --unacknowledged; + // if there is no front con, set both stamps. + if (i->second.con_front == NULL) { + i->second.last_rx_front = now; + ceph_assert(unacknowledged > 0); + --unacknowledged; + } + } else if (con == i->second.con_front) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_tx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back + << " last_rx_front " << i->second.last_rx_front + << " -> " << now + << dendl; + i->second.last_rx_front = now; + ceph_assert(unacknowledged > 0); + --unacknowledged; + } + + if (unacknowledged == 0) { + // succeeded in getting all replies + dout(25) << "handle_osd_ping got all replies from osd." 
<< from + << " , erase pending ping(sent at " << m->ping_stamp << ")" + << " and older pending ping(s)" + << dendl; + +#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5) + ++i->second.hb_average_count; + uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp); + i->second.hb_total_back += back_pingtime; + if (back_pingtime < i->second.hb_min_back) + i->second.hb_min_back = back_pingtime; + if (back_pingtime > i->second.hb_max_back) + i->second.hb_max_back = back_pingtime; + uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp); + i->second.hb_total_front += front_pingtime; + if (front_pingtime < i->second.hb_min_front) + i->second.hb_min_front = front_pingtime; + if (front_pingtime > i->second.hb_max_front) + i->second.hb_max_front = front_pingtime; + + ceph_assert(i->second.hb_interval_start != utime_t()); + if (i->second.hb_interval_start == utime_t()) + i->second.hb_interval_start = now; + int64_t hb_avg_time_period = 60; + if (cct->_conf.get_val("debug_heartbeat_testing_span")) { + hb_avg_time_period = cct->_conf.get_val("debug_heartbeat_testing_span"); + } + if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) { + uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count; + uint32_t back_min = i->second.hb_min_back; + uint32_t back_max = i->second.hb_max_back; + uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count; + uint32_t front_min = i->second.hb_min_front; + uint32_t front_max = i->second.hb_max_front; + + // Reset for new interval + i->second.hb_average_count = 0; + i->second.hb_interval_start = now; + i->second.hb_total_back = i->second.hb_max_back = 0; + i->second.hb_min_back = UINT_MAX; + i->second.hb_total_front = i->second.hb_max_front = 0; + i->second.hb_min_front = UINT_MAX; + + // Record per osd interace ping times + // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval + if (i->second.hb_back_pingtime.size() == 0) { + ceph_assert(i->second.hb_front_pingtime.size() == 0); + for (unsigned k = 0 ; k < hb_vector_size; ++k) { + i->second.hb_back_pingtime.push_back(back_avg); + i->second.hb_back_min.push_back(back_min); + i->second.hb_back_max.push_back(back_max); + i->second.hb_front_pingtime.push_back(front_avg); + i->second.hb_front_min.push_back(front_min); + i->second.hb_front_max.push_back(front_max); + ++i->second.hb_index; + } + } else { + int index = i->second.hb_index & (hb_vector_size - 1); + i->second.hb_back_pingtime[index] = back_avg; + i->second.hb_back_min[index] = back_min; + i->second.hb_back_max[index] = back_max; + i->second.hb_front_pingtime[index] = front_avg; + i->second.hb_front_min[index] = front_min; + i->second.hb_front_max[index] = front_max; + ++i->second.hb_index; + } + + { + std::lock_guard l(service.stat_lock); + service.osd_stat.hb_pingtime[from].last_update = now.sec(); + service.osd_stat.hb_pingtime[from].back_last = back_pingtime; + + uint32_t total = 0; + uint32_t min = UINT_MAX; + uint32_t max = 0; + uint32_t count = 0; + uint32_t which = 0; + uint32_t size = (uint32_t)i->second.hb_back_pingtime.size(); + for (int32_t k = size - 1 ; k >= 0; --k) { + ++count; + int index = (i->second.hb_index + k) % size; + total += i->second.hb_back_pingtime[index]; + if (i->second.hb_back_min[index] < min) + min = i->second.hb_back_min[index]; + if (i->second.hb_back_max[index] > max) + max = i->second.hb_back_max[index]; + if (count == 1 || count == 5 || count == 15) { + 
service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count; + service.osd_stat.hb_pingtime[from].back_min[which] = min; + service.osd_stat.hb_pingtime[from].back_max[which] = max; + which++; + if (count == 15) + break; + } + } + + if (i->second.con_front != NULL) { + service.osd_stat.hb_pingtime[from].front_last = front_pingtime; + + total = 0; + min = UINT_MAX; + max = 0; + count = 0; + which = 0; + for (int32_t k = size - 1 ; k >= 0; --k) { + ++count; + int index = (i->second.hb_index + k) % size; + total += i->second.hb_front_pingtime[index]; + if (i->second.hb_front_min[index] < min) + min = i->second.hb_front_min[index]; + if (i->second.hb_front_max[index] > max) + max = i->second.hb_front_max[index]; + if (count == 1 || count == 5 || count == 15) { + service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count; + service.osd_stat.hb_pingtime[from].front_min[which] = min; + service.osd_stat.hb_pingtime[from].front_max[which] = max; + which++; + if (count == 15) + break; + } + } + } + } + } else { + std::lock_guard l(service.stat_lock); + service.osd_stat.hb_pingtime[from].back_last = back_pingtime; + if (i->second.con_front != NULL) + service.osd_stat.hb_pingtime[from].front_last = front_pingtime; + } + i->second.ping_history.erase(i->second.ping_history.begin(), ++acked); + } + + if (i->second.is_healthy(now)) { + // Cancel false reports + auto failure_queue_entry = failure_queue.find(from); + if (failure_queue_entry != failure_queue.end()) { + dout(10) << "handle_osd_ping canceling queued " + << "failure report for osd." << from << dendl; + failure_queue.erase(failure_queue_entry); + } + + auto failure_pending_entry = failure_pending.find(from); + if (failure_pending_entry != failure_pending.end()) { + dout(10) << "handle_osd_ping canceling in-flight " + << "failure report for osd." << from << dendl; + send_still_alive(curmap->get_epoch(), + from, + failure_pending_entry->second.second); + failure_pending.erase(failure_pending_entry); + } + } + } else { + // old replies, deprecated by newly sent pings. 
+ dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp + << ") is found, treat as covered by newly sent pings " + << "and ignore" + << dendl; + } + } + + if (m->map_epoch && + curmap->is_up(from)) { + if (is_active()) { + ConnectionRef cluster_con = service.get_con_osd_cluster( + from, curmap->get_epoch()); + if (cluster_con) { + service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch); + } + } + } + + s->stamps->got_ping_reply( + mnow, + m->mono_send_stamp, + m->delta_ub); + dout(20) << __func__ << " new stamps " << *s->stamps << dendl; + } + break; + + case MOSDPing::YOU_DIED: + dout(10) << "handle_osd_ping " << m->get_source_inst() + << " says i am down in " << m->map_epoch << dendl; + osdmap_subscribe(curmap->get_epoch()+1, false); + break; + } + + heartbeat_lock.unlock(); + m->put(); +} + +void OSD::heartbeat_entry() +{ + std::unique_lock l(heartbeat_lock); + if (is_stopping()) + return; + while (!heartbeat_stop) { + heartbeat(); + + double wait; + if (cct->_conf.get_val("debug_disable_randomized_ping")) { + wait = (float)cct->_conf->osd_heartbeat_interval; + } else { + wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval; + } + auto w = ceph::make_timespan(wait); + dout(30) << "heartbeat_entry sleeping for " << wait << dendl; + heartbeat_cond.wait_for(l, w); + if (is_stopping()) + return; + dout(30) << "heartbeat_entry woke up" << dendl; + } +} + +void OSD::heartbeat_check() +{ + ceph_assert(ceph_mutex_is_locked(heartbeat_lock)); + utime_t now = ceph_clock_now(); + + // check for incoming heartbeats (move me elsewhere?) + for (map::iterator p = heartbeat_peers.begin(); + p != heartbeat_peers.end(); + ++p) { + + if (p->second.first_tx == utime_t()) { + dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first + << " yet, skipping" << dendl; + continue; + } + + dout(25) << "heartbeat_check osd." << p->first + << " first_tx " << p->second.first_tx + << " last_tx " << p->second.last_tx + << " last_rx_back " << p->second.last_rx_back + << " last_rx_front " << p->second.last_rx_front + << dendl; + if (p->second.is_unhealthy(now)) { + utime_t oldest_deadline = p->second.ping_history.begin()->second.first; + if (p->second.last_rx_back == utime_t() || + p->second.last_rx_front == utime_t()) { + derr << "heartbeat_check: no reply from " + << p->second.con_front->get_peer_addr().get_sockaddr() + << " osd." << p->first + << " ever on either front or back, first ping sent " + << p->second.first_tx + << " (oldest deadline " << oldest_deadline << ")" + << dendl; + // fail + failure_queue[p->first] = p->second.first_tx; + } else { + derr << "heartbeat_check: no reply from " + << p->second.con_front->get_peer_addr().get_sockaddr() + << " osd." 
<< p->first << " since back " << p->second.last_rx_back + << " front " << p->second.last_rx_front + << " (oldest deadline " << oldest_deadline << ")" + << dendl; + // fail + failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front); + } + } + } +} + +void OSD::heartbeat() +{ + ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock)); + dout(30) << "heartbeat" << dendl; + + // get CPU load avg + double loadavgs[1]; + int hb_interval = cct->_conf->osd_heartbeat_interval; + int n_samples = 86400; + if (hb_interval > 1) { + n_samples /= hb_interval; + if (n_samples < 1) + n_samples = 1; + } + + if (getloadavg(loadavgs, 1) == 1) { + logger->set(l_osd_loadavg, 100 * loadavgs[0]); + daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples; + dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl; + } + + dout(30) << "heartbeat checking stats" << dendl; + + // refresh peer list and osd stats + vector hb_peers; + for (map::iterator p = heartbeat_peers.begin(); + p != heartbeat_peers.end(); + ++p) + hb_peers.push_back(p->first); + + auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs()); + dout(5) << __func__ << " " << new_stat << dendl; + ceph_assert(new_stat.statfs.total); + + float pratio; + float ratio = service.compute_adjusted_ratio(new_stat, &pratio); + + service.check_full_status(ratio, pratio); + + utime_t now = ceph_clock_now(); + auto mnow = service.get_mnow(); + utime_t deadline = now; + deadline += cct->_conf->osd_heartbeat_grace; + + // send heartbeats + for (map::iterator i = heartbeat_peers.begin(); + i != heartbeat_peers.end(); + ++i) { + int peer = i->first; + Session *s = static_cast(i->second.con_back->get_priv().get()); + if (!s) { + dout(30) << "heartbeat osd." << peer << " has no open con" << dendl; + continue; + } + dout(30) << "heartbeat sending ping to osd." << peer << dendl; + + i->second.last_tx = now; + if (i->second.first_tx == utime_t()) + i->second.first_tx = now; + i->second.ping_history[now] = make_pair(deadline, + HeartbeatInfo::HEARTBEAT_MAX_CONN); + if (i->second.hb_interval_start == utime_t()) + i->second.hb_interval_start = now; + + std::optional delta_ub; + s->stamps->sent_ping(&delta_ub); + + i->second.con_back->send_message( + new MOSDPing(monc->get_fsid(), + service.get_osdmap_epoch(), + MOSDPing::PING, + now, + mnow, + mnow, + service.get_up_epoch(), + cct->_conf->osd_heartbeat_min_size, + delta_ub)); + + if (i->second.con_front) + i->second.con_front->send_message( + new MOSDPing(monc->get_fsid(), + service.get_osdmap_epoch(), + MOSDPing::PING, + now, + mnow, + mnow, + service.get_up_epoch(), + cct->_conf->osd_heartbeat_min_size, + delta_ub)); + } + + logger->set(l_osd_hb_to, heartbeat_peers.size()); + + // hmm.. am i all alone? + dout(30) << "heartbeat lonely?" 
<< dendl; + if (heartbeat_peers.empty()) { + if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) { + last_mon_heartbeat = now; + dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl; + osdmap_subscribe(get_osdmap_epoch() + 1, false); + } + } + + dout(30) << "heartbeat done" << dendl; +} + +bool OSD::heartbeat_reset(Connection *con) +{ + std::lock_guard l(heartbeat_lock); + auto s = con->get_priv(); + dout(20) << __func__ << " con " << con << " s " << s.get() << dendl; + con->set_priv(nullptr); + if (s) { + if (is_stopping()) { + return true; + } + auto session = static_cast(s.get()); + auto p = heartbeat_peers.find(session->peer); + if (p != heartbeat_peers.end() && + (p->second.con_back == con || + p->second.con_front == con)) { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer + << ", reopening" << dendl; + p->second.clear_mark_down(con); + pair newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); + if (newcon.first) { + p->second.con_back = newcon.first.get(); + p->second.con_back->set_priv(s); + if (newcon.second) { + p->second.con_front = newcon.second.get(); + p->second.con_front->set_priv(s); + } + p->second.ping_history.clear(); + } else { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer + << ", raced with osdmap update, closing out peer" << dendl; + heartbeat_peers.erase(p); + } + } else { + dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl; + } + } + return true; +} + + + +// ========================================= + +void OSD::tick() +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + dout(10) << "tick" << dendl; + + utime_t now = ceph_clock_now(); + // throw out any obsolete markdown log + utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0); + while (!osd_markdown_log.empty() && + osd_markdown_log.front() + grace < now) + osd_markdown_log.pop_front(); + + if (is_active() || is_waiting_for_healthy()) { + maybe_update_heartbeat_peers(); + } + + if (is_waiting_for_healthy()) { + start_boot(); + } + + if (is_waiting_for_healthy() || is_booting()) { + std::lock_guard l(heartbeat_lock); + if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) { + last_mon_heartbeat = now; + dout(1) << __func__ << " checking mon for new map" << dendl; + osdmap_subscribe(get_osdmap_epoch() + 1, false); + } + } + + do_waiters(); + + // scrub purged_snaps every deep scrub interval + { + const utime_t last = superblock.last_purged_snaps_scrub; + utime_t next = last; + next += cct->_conf->osd_scrub_min_interval; + std::mt19937 rng; + // use a seed that is stable for each scrub interval, but varies + // by OSD to avoid any herds. + rng.seed(whoami + superblock.last_purged_snaps_scrub.sec()); + double r = (rng() % 1024) / 1024.0; + next += + cct->_conf->osd_scrub_min_interval * + cct->_conf->osd_scrub_interval_randomize_ratio * r; + if (next < ceph_clock_now()) { + dout(20) << __func__ << " last_purged_snaps_scrub " << last + << " next " << next << " ... 
now" << dendl; + scrub_purged_snaps(); + } else { + dout(20) << __func__ << " last_purged_snaps_scrub " << last + << " next " << next << dendl; + } + } + + tick_timer.add_event_after(get_tick_interval(), new C_Tick(this)); +} + +void OSD::tick_without_osd_lock() +{ + ceph_assert(ceph_mutex_is_locked(tick_timer_lock)); + dout(10) << "tick_without_osd_lock" << dendl; + + logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc()); + logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted()); + logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc()); + + // refresh osd stats + struct store_statfs_t stbuf; + osd_alert_list_t alerts; + int r = store->statfs(&stbuf, &alerts); + ceph_assert(r == 0); + service.set_statfs(stbuf, alerts); + + // osd_lock is not being held, which means the OSD state + // might change when doing the monitor report + if (is_active() || is_waiting_for_healthy()) { + { + std::lock_guard l{heartbeat_lock}; + heartbeat_check(); + } + map_lock.lock_shared(); + std::lock_guard l(mon_report_lock); + + // mon report? + utime_t now = ceph_clock_now(); + if (service.need_fullness_update() || + now - last_mon_report > cct->_conf->osd_mon_report_interval) { + last_mon_report = now; + send_full_update(); + send_failures(); + } + map_lock.unlock_shared(); + + epoch_t max_waiting_epoch = 0; + for (auto s : shards) { + max_waiting_epoch = std::max(max_waiting_epoch, + s->get_max_waiting_epoch()); + } + if (max_waiting_epoch > get_osdmap()->get_epoch()) { + dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch + << ", requesting new map" << dendl; + osdmap_subscribe(superblock.newest_map + 1, false); + } + } + + if (is_active()) { + if (!scrub_random_backoff()) { + sched_scrub(); + } + service.promote_throttle_recalibrate(); + resume_creating_pg(); + bool need_send_beacon = false; + const auto now = ceph::coarse_mono_clock::now(); + { + // borrow lec lock to pretect last_sent_beacon from changing + std::lock_guard l{min_last_epoch_clean_lock}; + const auto elapsed = now - last_sent_beacon; + if (std::chrono::duration_cast(elapsed).count() > + cct->_conf->osd_beacon_report_interval) { + need_send_beacon = true; + } + } + if (need_send_beacon) { + send_beacon(now); + } + } + + mgrc.update_daemon_health(get_health_metrics()); + service.kick_recovery_queue(); + tick_timer_without_osd_lock.add_event_after(get_tick_interval(), + new C_Tick_WithoutOSDLock(this)); +} + +// Usage: +// setomapval [namespace/] +// rmomapkey [namespace/] +// setomapheader [namespace/]
+// getomap [namespace/] +// truncobj [namespace/] +// injectmdataerr [namespace/] [shardid] +// injectdataerr [namespace/] [shardid] +// +// set_recovery_delay [utime] +void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, + std::string_view command, + const cmdmap_t& cmdmap, ostream &ss) +{ + //Test support + //Support changing the omap on a single osd by using the Admin Socket to + //directly request the osd make a change. + if (command == "setomapval" || command == "rmomapkey" || + command == "setomapheader" || command == "getomap" || + command == "truncobj" || command == "injectmdataerr" || + command == "injectdataerr" + ) { + pg_t rawpg; + int64_t pool; + OSDMapRef curmap = service->get_osdmap(); + int r = -1; + + string poolstr; + + cmd_getval(cmdmap, "pool", poolstr); + pool = curmap->lookup_pg_pool_name(poolstr); + //If we can't find it by name then maybe id specified + if (pool < 0 && isdigit(poolstr[0])) + pool = atoll(poolstr.c_str()); + if (pool < 0) { + ss << "Invalid pool '" << poolstr << "''"; + return; + } + + string objname, nspace; + cmd_getval(cmdmap, "objname", objname); + std::size_t found = objname.find_first_of('/'); + if (found != string::npos) { + nspace = objname.substr(0, found); + objname = objname.substr(found+1); + } + object_locator_t oloc(pool, nspace); + r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg); + + if (r < 0) { + ss << "Invalid namespace/objname"; + return; + } + + int64_t shardid; + cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD)); + hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace); + ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid))); + spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid)); + if (curmap->pg_is_ec(rawpg)) { + if ((command != "injectdataerr") && (command != "injectmdataerr")) { + ss << "Must not call on ec pool, except injectdataerr or injectmdataerr"; + return; + } + } + + ObjectStore::Transaction t; + + if (command == "setomapval") { + map newattrs; + bufferlist val; + string key, valstr; + cmd_getval(cmdmap, "key", key); + cmd_getval(cmdmap, "val", valstr); + + val.append(valstr); + newattrs[key] = val; + t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs); + r = store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "rmomapkey") { + string key; + cmd_getval(cmdmap, "key", key); + + t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key); + r = store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "setomapheader") { + bufferlist newheader; + string headerstr; + + cmd_getval(cmdmap, "header", headerstr); + newheader.append(headerstr); + t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader); + r = store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "getomap") { + //Debug: Output entire omap + bufferlist hdrbl; + map keyvals; + auto ch = store->open_collection(coll_t(pgid)); + if (!ch) { + ss << "unable to open collection for " << pgid; + r = -ENOENT; + } else { + r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals); + if (r >= 0) { + ss << "header=" << string(hdrbl.c_str(), hdrbl.length()); + for (map::iterator it = keyvals.begin(); + it != keyvals.end(); ++it) + ss << " key=" << (*it).first << " val=" + << string((*it).second.c_str(), 
(*it).second.length()); + } else { + ss << "error=" << r; + } + } + } else if (command == "truncobj") { + int64_t trunclen; + cmd_getval(cmdmap, "len", trunclen); + t.truncate(coll_t(pgid), ghobject_t(obj), trunclen); + r = store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "injectdataerr") { + store->inject_data_error(gobj); + ss << "ok"; + } else if (command == "injectmdataerr") { + store->inject_mdata_error(gobj); + ss << "ok"; + } + return; + } + if (command == "set_recovery_delay") { + int64_t delay; + cmd_getval(cmdmap, "utime", delay, (int64_t)0); + ostringstream oss; + oss << delay; + int r = service->cct->_conf.set_val("osd_recovery_delay_start", + oss.str().c_str()); + if (r != 0) { + ss << "set_recovery_delay: error setting " + << "osd_recovery_delay_start to '" << delay << "': error " + << r; + return; + } + service->cct->_conf.apply_changes(nullptr); + ss << "set_recovery_delay: set osd_recovery_delay_start " + << "to " << service->cct->_conf->osd_recovery_delay_start; + return; + } + if (command == "injectfull") { + int64_t count; + string type; + OSDService::s_names state; + cmd_getval(cmdmap, "type", type, string("full")); + cmd_getval(cmdmap, "count", count, (int64_t)-1); + if (type == "none" || count == 0) { + type = "none"; + count = 0; + } + state = service->get_full_state(type); + if (state == OSDService::s_names::INVALID) { + ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)"; + return; + } + service->set_injectfull(state, count); + return; + } + ss << "Internal error - command=" << command; +} + +// ========================================= + +void OSD::ms_handle_connect(Connection *con) +{ + dout(10) << __func__ << " con " << con << dendl; + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + std::lock_guard l(osd_lock); + if (is_stopping()) + return; + dout(10) << __func__ << " on mon" << dendl; + + if (is_preboot()) { + start_boot(); + } else if (is_booting()) { + _send_boot(); // resend boot message + } else { + map_lock.lock_shared(); + std::lock_guard l2(mon_report_lock); + + utime_t now = ceph_clock_now(); + last_mon_report = now; + + // resend everything, it's a new session + send_full_update(); + send_alive(); + service.requeue_pg_temp(); + service.clear_sent_ready_to_merge(); + service.send_pg_temp(); + service.send_ready_to_merge(); + service.send_pg_created(); + requeue_failures(); + send_failures(); + + map_lock.unlock_shared(); + if (is_active()) { + send_beacon(ceph::coarse_mono_clock::now()); + } + } + + // full map requests may happen while active or pre-boot + if (requested_full_first) { + rerequest_full_maps(); + } + } +} + +void OSD::ms_handle_fast_connect(Connection *con) +{ + if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON && + con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) { + if (auto s = ceph::ref_cast(con->get_priv()); !s) { + s = ceph::make_ref(cct, con); + con->set_priv(s); + dout(10) << " new session (outgoing) " << s << " con=" << s->con + << " addr=" << s->con->get_peer_addr() << dendl; + // we don't connect to clients + ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD); + s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD); + } + } +} + +void OSD::ms_handle_fast_accept(Connection *con) +{ + if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON && + con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) { + if (auto s = ceph::ref_cast(con->get_priv()); !s) { + s = ceph::make_ref(cct, con); + con->set_priv(s); + dout(10) << "new session 
(incoming)" << s << " con=" << con + << " addr=" << con->get_peer_addr() + << " must have raced with connect" << dendl; + ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD); + s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD); + } + } +} + +bool OSD::ms_handle_reset(Connection *con) +{ + auto session = ceph::ref_cast(con->get_priv()); + dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl; + if (!session) + return false; + session->wstate.reset(con); + session->con->set_priv(nullptr); + session->con.reset(); // break con <-> session ref cycle + // note that we break session->con *before* the session_handle_reset + // cleanup below. this avoids a race between us and + // PG::add_backoff, Session::check_backoff, etc. + session_handle_reset(session); + return true; +} + +bool OSD::ms_handle_refused(Connection *con) +{ + if (!cct->_conf->osd_fast_fail_on_connection_refused) + return false; + + auto session = ceph::ref_cast(con->get_priv()); + dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl; + if (!session) + return false; + int type = con->get_peer_type(); + // handle only OSD failures here + if (monc && (type == CEPH_ENTITY_TYPE_OSD)) { + OSDMapRef osdmap = get_osdmap(); + if (osdmap) { + int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr()); + if (id >= 0 && osdmap->is_up(id)) { + // I'm cheating mon heartbeat grace logic, because we know it's not going + // to respawn alone. +1 so we won't hit any boundary case. + monc->send_mon_message( + new MOSDFailure( + monc->get_fsid(), + id, + osdmap->get_addrs(id), + cct->_conf->osd_heartbeat_grace + 1, + osdmap->get_epoch(), + MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED + )); + } + } + } + return true; +} + +struct CB_OSD_GetVersion { + OSD *osd; + explicit CB_OSD_GetVersion(OSD *o) : osd(o) {} + void operator ()(boost::system::error_code ec, version_t newest, + version_t oldest) { + if (!ec) + osd->_got_mon_epochs(oldest, newest); + } +}; + +void OSD::start_boot() +{ + if (!_is_healthy()) { + // if we are not healthy, do not mark ourselves up (yet) + dout(1) << "not healthy; waiting to boot" << dendl; + if (!is_waiting_for_healthy()) + start_waiting_for_healthy(); + // send pings sooner rather than later + heartbeat_kick(); + return; + } + dout(1) << __func__ << dendl; + set_state(STATE_PREBOOT); + dout(10) << "start_boot - have maps " << superblock.oldest_map + << ".." << superblock.newest_map << dendl; + monc->get_version("osdmap", CB_OSD_GetVersion(this)); +} + +void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest) +{ + std::lock_guard l(osd_lock); + if (is_preboot()) { + _preboot(oldest, newest); + } +} + +void OSD::_preboot(epoch_t oldest, epoch_t newest) +{ + ceph_assert(is_preboot()); + dout(10) << __func__ << " _preboot mon has osdmaps " + << oldest << ".." << newest << dendl; + + // ensure our local fullness awareness is accurate + { + std::lock_guard l(heartbeat_lock); + heartbeat(); + } + + const auto& monmap = monc->monmap; + const auto osdmap = get_osdmap(); + // if our map within recent history, try to add ourselves to the osdmap. + if (osdmap->get_epoch() == 0) { + derr << "waiting for initial osdmap" << dendl; + } else if (osdmap->is_destroyed(whoami)) { + derr << "osdmap says I am destroyed" << dendl; + // provide a small margin so we don't livelock seeing if we + // un-destroyed ourselves. 
+ if (osdmap->get_epoch() > newest - 1) { + exit(0); + } + } else if (osdmap->is_noup(whoami)) { + derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl; + } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { + derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it" + << dendl; + } else if (service.need_fullness_update()) { + derr << "osdmap fullness state needs update" << dendl; + send_full_update(); + } else if (monmap.min_mon_release >= ceph_release_t::octopus && + superblock.purged_snaps_last < superblock.current_epoch) { + dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last + << " < newest_map " << superblock.current_epoch << dendl; + _get_purged_snaps(); + } else if (osdmap->get_epoch() >= oldest - 1 && + osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) { + + // wait for pgs to fully catch up in a different thread, since + // this thread might be required for splitting and merging PGs to + // make progress. + boot_finisher.queue( + new LambdaContext( + [this](int r) { + std::unique_lock l(osd_lock); + if (is_preboot()) { + dout(10) << __func__ << " waiting for peering work to drain" + << dendl; + l.unlock(); + for (auto shard : shards) { + shard->wait_min_pg_epoch(get_osdmap_epoch()); + } + l.lock(); + } + if (is_preboot()) { + _send_boot(); + } + })); + return; + } + + // get all the latest maps + if (osdmap->get_epoch() + 1 >= oldest) + osdmap_subscribe(osdmap->get_epoch() + 1, false); + else + osdmap_subscribe(oldest - 1, true); +} + +void OSD::_get_purged_snaps() +{ + // NOTE: this is a naive, stateless implementation. it may send multiple + // overlapping requests to the mon, which will be somewhat inefficient, but + // it should be reliable. + dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last + << ", newest_map " << superblock.current_epoch << dendl; + MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps( + superblock.purged_snaps_last + 1, + superblock.current_epoch + 1); + monc->send_mon_message(m); +} + +void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m) +{ + dout(10) << __func__ << " " << *m << dendl; + ObjectStore::Transaction t; + if (!is_preboot() || + m->last < superblock.purged_snaps_last) { + goto out; + } + SnapMapper::record_purged_snaps(cct, store, service.meta_ch, + make_purged_snaps_oid(), &t, + m->purged_snaps); + superblock.purged_snaps_last = m->last; + write_superblock(t); + store->queue_transaction( + service.meta_ch, + std::move(t)); + service.publish_superblock(superblock); + if (m->last < superblock.current_epoch) { + _get_purged_snaps(); + } else { + start_boot(); + } +out: + m->put(); +} + +void OSD::send_full_update() +{ + if (!service.need_fullness_update()) + return; + unsigned state = 0; + if (service.is_full()) { + state = CEPH_OSD_FULL; + } else if (service.is_backfillfull()) { + state = CEPH_OSD_BACKFILLFULL; + } else if (service.is_nearfull()) { + state = CEPH_OSD_NEARFULL; + } + set<string> s; + OSDMap::calc_state_set(state, s); + dout(10) << __func__ << " want state " << s << dendl; + monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state)); +} + +void OSD::start_waiting_for_healthy() +{ + dout(1) << "start_waiting_for_healthy" << dendl; + set_state(STATE_WAITING_FOR_HEALTHY); + last_heartbeat_resample = utime_t(); + + // subscribe to osdmap updates, in case our peers really are known to be dead + osdmap_subscribe(get_osdmap_epoch() + 1, false); +} + +bool OSD::_is_healthy() +{ + if (!cct->get_heartbeat_map()->is_healthy()) { + dout(1)
<< "is_healthy false -- internal heartbeat failed" << dendl; + return false; + } + + if (is_waiting_for_healthy()) { + utime_t now = ceph_clock_now(); + if (osd_markdown_log.empty()) { + dout(5) << __func__ << " force returning true since last markdown" + << " was " << cct->_conf->osd_max_markdown_period + << "s ago" << dendl; + return true; + } + std::lock_guard l(heartbeat_lock); + int num = 0, up = 0; + for (map::iterator p = heartbeat_peers.begin(); + p != heartbeat_peers.end(); + ++p) { + if (p->second.is_healthy(now)) + ++up; + ++num; + } + if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) { + dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than " + << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl; + return false; + } + } + + return true; +} + +void OSD::_send_boot() +{ + dout(10) << "_send_boot" << dendl; + Connection *local_connection = + cluster_messenger->get_loopback_connection().get(); + entity_addrvec_t client_addrs = client_messenger->get_myaddrs(); + entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs(); + entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs(); + entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs(); + + dout(20) << " initial client_addrs " << client_addrs + << ", cluster_addrs " << cluster_addrs + << ", hb_back_addrs " << hb_back_addrs + << ", hb_front_addrs " << hb_front_addrs + << dendl; + if (cluster_messenger->set_addr_unknowns(client_addrs)) { + dout(10) << " assuming cluster_addrs match client_addrs " + << client_addrs << dendl; + cluster_addrs = cluster_messenger->get_myaddrs(); + } + if (auto session = local_connection->get_priv(); !session) { + cluster_messenger->ms_deliver_handle_fast_connect(local_connection); + } + + local_connection = hb_back_server_messenger->get_loopback_connection().get(); + if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) { + dout(10) << " assuming hb_back_addrs match cluster_addrs " + << cluster_addrs << dendl; + hb_back_addrs = hb_back_server_messenger->get_myaddrs(); + } + if (auto session = local_connection->get_priv(); !session) { + hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection); + } + + local_connection = hb_front_server_messenger->get_loopback_connection().get(); + if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) { + dout(10) << " assuming hb_front_addrs match client_addrs " + << client_addrs << dendl; + hb_front_addrs = hb_front_server_messenger->get_myaddrs(); + } + if (auto session = local_connection->get_priv(); !session) { + hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection); + } + + // we now know what our front and back addrs will be, and we are + // about to tell the mon what our metadata (including numa bindings) + // are, so now is a good time! 
+ set_numa_affinity(); + + MOSDBoot *mboot = new MOSDBoot( + superblock, get_osdmap_epoch(), service.get_boot_epoch(), + hb_back_addrs, hb_front_addrs, cluster_addrs, + CEPH_FEATURES_ALL); + dout(10) << " final client_addrs " << client_addrs + << ", cluster_addrs " << cluster_addrs + << ", hb_back_addrs " << hb_back_addrs + << ", hb_front_addrs " << hb_front_addrs + << dendl; + _collect_metadata(&mboot->metadata); + monc->send_mon_message(mboot); + set_state(STATE_BOOTING); +} + +void OSD::_collect_metadata(map *pm) +{ + // config info + (*pm)["osd_data"] = dev_path; + if (store->get_type() == "filestore") { + // not applicable for bluestore + (*pm)["osd_journal"] = journal_path; + } + (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs()); + (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs()); + (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs()); + (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs()); + + // backend + (*pm)["osd_objectstore"] = store->get_type(); + (*pm)["rotational"] = store_is_rotational ? "1" : "0"; + (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0"; + (*pm)["default_device_class"] = store->get_default_device_class(); + string osdspec_affinity; + int r = store->read_meta("osdspec_affinity", &osdspec_affinity); + if (r < 0 || osdspec_affinity.empty()) { + osdspec_affinity = ""; + } + (*pm)["osdspec_affinity"] = osdspec_affinity; + string ceph_version_when_created; + r = store->read_meta("ceph_version_when_created", &ceph_version_when_created); + if (r <0 || ceph_version_when_created.empty()) { + ceph_version_when_created = ""; + } + (*pm)["ceph_version_when_created"] = ceph_version_when_created; + string created_at; + r = store->read_meta("created_at", &created_at); + if (r < 0 || created_at.empty()) { + created_at = ""; + } + (*pm)["created_at"] = created_at; + store->collect_metadata(pm); + + collect_sys_info(pm, cct); + + (*pm)["front_iface"] = pick_iface( + cct, + client_messenger->get_myaddrs().front().get_sockaddr_storage()); + (*pm)["back_iface"] = pick_iface( + cct, + cluster_messenger->get_myaddrs().front().get_sockaddr_storage()); + + // network numa + { + int node = -1; + set nodes; + set unknown; + for (auto nm : { "front_iface", "back_iface" }) { + if (!(*pm)[nm].size()) { + unknown.insert(nm); + continue; + } + int n = -1; + int r = get_iface_numa_node((*pm)[nm], &n); + if (r < 0) { + unknown.insert((*pm)[nm]); + continue; + } + nodes.insert(n); + if (node < 0) { + node = n; + } + } + if (unknown.size()) { + (*pm)["network_numa_unknown_ifaces"] = stringify(unknown); + } + if (!nodes.empty()) { + (*pm)["network_numa_nodes"] = stringify(nodes); + } + if (node >= 0 && nodes.size() == 1 && unknown.empty()) { + (*pm)["network_numa_node"] = stringify(node); + } + } + + if (numa_node >= 0) { + (*pm)["numa_node"] = stringify(numa_node); + (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size, + &numa_cpu_set); + } + + set devnames; + store->get_devices(&devnames); + map errs; + get_device_metadata(devnames, pm, &errs); + for (auto& i : errs) { + dout(1) << __func__ << " " << i.first << ": " << i.second << dendl; + } + dout(10) << __func__ << " " << *pm << dendl; +} + +void OSD::queue_want_up_thru(epoch_t want) +{ + std::shared_lock map_locker{map_lock}; + epoch_t cur = get_osdmap()->get_up_thru(whoami); + std::lock_guard report_locker(mon_report_lock); + if (want > up_thru_wanted) { + dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")" + 
<< ", currently " << cur + << dendl; + up_thru_wanted = want; + send_alive(); + } else { + dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted + << ", currently " << cur + << dendl; + } +} + +void OSD::send_alive() +{ + ceph_assert(ceph_mutex_is_locked(mon_report_lock)); + const auto osdmap = get_osdmap(); + if (!osdmap->exists(whoami)) + return; + epoch_t up_thru = osdmap->get_up_thru(whoami); + dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl; + if (up_thru_wanted > up_thru) { + dout(10) << "send_alive want " << up_thru_wanted << dendl; + monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted)); + } +} + +void OSD::request_full_map(epoch_t first, epoch_t last) +{ + dout(10) << __func__ << " " << first << ".." << last + << ", previously requested " + << requested_full_first << ".." << requested_full_last << dendl; + ceph_assert(ceph_mutex_is_locked(osd_lock)); + ceph_assert(first > 0 && last > 0); + ceph_assert(first <= last); + ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps + if (requested_full_first == 0) { + // first request + requested_full_first = first; + requested_full_last = last; + } else if (last <= requested_full_last) { + // dup + return; + } else { + // additional request + first = requested_full_last + 1; + requested_full_last = last; + } + MMonGetOSDMap *req = new MMonGetOSDMap; + req->request_full(first, last); + monc->send_mon_message(req); +} + +void OSD::got_full_map(epoch_t e) +{ + ceph_assert(requested_full_first <= requested_full_last); + ceph_assert(ceph_mutex_is_locked(osd_lock)); + if (requested_full_first == 0) { + dout(20) << __func__ << " " << e << ", nothing requested" << dendl; + return; + } + if (e < requested_full_first) { + dout(10) << __func__ << " " << e << ", requested " << requested_full_first + << ".." << requested_full_last + << ", ignoring" << dendl; + return; + } + if (e >= requested_full_last) { + dout(10) << __func__ << " " << e << ", requested " << requested_full_first + << ".." << requested_full_last << ", resetting" << dendl; + requested_full_first = requested_full_last = 0; + return; + } + + requested_full_first = e + 1; + + dout(10) << __func__ << " " << e << ", requested " << requested_full_first + << ".." 
<< requested_full_last + << ", still need more" << dendl; +} + +void OSD::requeue_failures() +{ + std::lock_guard l(heartbeat_lock); + unsigned old_queue = failure_queue.size(); + unsigned old_pending = failure_pending.size(); + for (auto p = failure_pending.begin(); p != failure_pending.end(); ) { + failure_queue[p->first] = p->second.first; + failure_pending.erase(p++); + } + dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> " + << failure_queue.size() << dendl; +} + +void OSD::send_failures() +{ + ceph_assert(ceph_mutex_is_locked(map_lock)); + ceph_assert(ceph_mutex_is_locked(mon_report_lock)); + std::lock_guard l(heartbeat_lock); + utime_t now = ceph_clock_now(); + const auto osdmap = get_osdmap(); + while (!failure_queue.empty()) { + int osd = failure_queue.begin()->first; + if (!failure_pending.count(osd)) { + int failed_for = (int)(double)(now - failure_queue.begin()->second); + monc->send_mon_message( + new MOSDFailure( + monc->get_fsid(), + osd, + osdmap->get_addrs(osd), + failed_for, + osdmap->get_epoch())); + failure_pending[osd] = make_pair(failure_queue.begin()->second, + osdmap->get_addrs(osd)); + } + failure_queue.erase(osd); + } +} + +void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs) +{ + MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch, + MOSDFailure::FLAG_ALIVE); + monc->send_mon_message(m); +} + +void OSD::cancel_pending_failures() +{ + std::lock_guard l(heartbeat_lock); + auto it = failure_pending.begin(); + while (it != failure_pending.end()) { + dout(10) << __func__ << " canceling in-flight failure report for osd." + << it->first << dendl; + send_still_alive(get_osdmap_epoch(), it->first, it->second.second); + failure_pending.erase(it++); + } +} + +void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now) +{ + const auto& monmap = monc->monmap; + // send beacon to mon even if we are just connected, and the monmap is not + // initialized yet by then. 
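+ // (an uninitialized monmap has epoch 0, so the epoch check below also covers that case and we simply skip sending.)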
+ if (monmap.epoch > 0 && + monmap.get_required_features().contains_all( + ceph::features::mon::FEATURE_LUMINOUS)) { + dout(20) << __func__ << " sending" << dendl; + MOSDBeacon* beacon = nullptr; + { + std::lock_guard l{min_last_epoch_clean_lock}; + beacon = new MOSDBeacon(get_osdmap_epoch(), + min_last_epoch_clean, + superblock.last_purged_snaps_scrub, + cct->_conf->osd_beacon_report_interval); + beacon->pgs = min_last_epoch_clean_pgs; + last_sent_beacon = now; + } + monc->send_mon_message(beacon); + } else { + dout(20) << __func__ << " not sending" << dendl; + } +} + +void OSD::handle_command(MCommand *m) +{ + ConnectionRef con = m->get_connection(); + auto session = ceph::ref_cast(con->get_priv()); + if (!session) { + con->send_message(new MCommandReply(m, -EACCES)); + m->put(); + return; + } + if (!session->caps.allow_all()) { + con->send_message(new MCommandReply(m, -EACCES)); + m->put(); + return; + } + cct->get_admin_socket()->queue_tell_command(m); + m->put(); +} + +namespace { + class unlock_guard { + ceph::mutex& m; + public: + explicit unlock_guard(ceph::mutex& mutex) + : m(mutex) + { + m.unlock(); + } + unlock_guard(unlock_guard&) = delete; + ~unlock_guard() { + m.lock(); + } + }; +} + +void OSD::scrub_purged_snaps() +{ + dout(10) << __func__ << dendl; + ceph_assert(ceph_mutex_is_locked(osd_lock)); + SnapMapper::Scrubber s(cct, store, service.meta_ch, + make_snapmapper_oid(), + make_purged_snaps_oid()); + clog->debug() << "purged_snaps scrub starts"; + osd_lock.unlock(); + s.run(); + if (s.stray.size()) { + clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays"; + } else { + clog->debug() << "purged_snaps scrub ok"; + } + set> queued; + for (auto& [pool, snap, hash, shard] : s.stray) { + const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool); + if (!pi) { + dout(20) << __func__ << " pool " << pool << " dne" << dendl; + continue; + } + pg_t pgid(pi->raw_hash_to_pg(hash), pool); + spg_t spgid(pgid, shard); + pair p(spgid, snap); + if (queued.count(p)) { + dout(20) << __func__ << " pg " << spgid << " snap " << snap + << " already queued" << dendl; + continue; + } + PGRef pg = lookup_lock_pg(spgid); + if (!pg) { + dout(20) << __func__ << " pg " << spgid << " not found" << dendl; + continue; + } + queued.insert(p); + dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap " + << snap << dendl; + pg->queue_snap_retrim(snap); + pg->unlock(); + } + osd_lock.lock(); + if (is_stopping()) { + return; + } + dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl; + ObjectStore::Transaction t; + superblock.last_purged_snaps_scrub = ceph_clock_now(); + write_superblock(t); + int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); + ceph_assert(tr == 0); + if (is_active()) { + send_beacon(ceph::coarse_mono_clock::now()); + } + dout(10) << __func__ << " done" << dendl; +} + +void OSD::probe_smart(const string& only_devid, ostream& ss) +{ + set devnames; + store->get_devices(&devnames); + uint64_t smart_timeout = cct->_conf.get_val( + "osd_smart_report_timeout"); + + // == typedef std::map mObject; + json_spirit::mObject json_map; + + for (auto dev : devnames) { + // smartctl works only on physical devices; filter out any logical device + if (dev.find("dm-") == 0) { + continue; + } + + string err; + string devid = get_device_id(dev, &err); + if (devid.size() == 0) { + dout(10) << __func__ << " no unique id for dev " << dev << " (" + << err << "), skipping" << dendl; + continue; + } + if (only_devid.size() && devid != 
only_devid) { + continue; + } + + json_spirit::mValue smart_json; + if (block_device_get_metrics(dev, smart_timeout, + &smart_json)) { + dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl; + continue; + } + json_map[devid] = smart_json; + } + json_spirit::write(json_map, ss, json_spirit::pretty_print); +} + +bool OSD::heartbeat_dispatch(Message *m) +{ + dout(30) << "heartbeat_dispatch " << m << dendl; + switch (m->get_type()) { + + case CEPH_MSG_PING: + dout(10) << "ping from " << m->get_source_inst() << dendl; + m->put(); + break; + + case MSG_OSD_PING: + handle_osd_ping(static_cast(m)); + break; + + default: + dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl; + m->put(); + } + + return true; +} + +bool OSD::ms_dispatch(Message *m) +{ + dout(20) << "OSD::ms_dispatch: " << *m << dendl; + if (m->get_type() == MSG_OSD_MARK_ME_DOWN) { + service.got_stop_ack(); + m->put(); + return true; + } + + // lock! + + osd_lock.lock(); + if (is_stopping()) { + osd_lock.unlock(); + m->put(); + return true; + } + + do_waiters(); + _dispatch(m); + + osd_lock.unlock(); + + return true; +} + +void OSDService::maybe_share_map( + Connection *con, + const OSDMapRef& osdmap, + epoch_t peer_epoch_lb) +{ + // NOTE: we assume caller hold something that keeps the Connection itself + // pinned (e.g., an OpRequest's MessageRef). + auto session = ceph::ref_cast(con->get_priv()); + if (!session) { + return; + } + + // assume the peer has the newer of the op's sent_epoch and what + // we think we sent them. + session->sent_epoch_lock.lock(); + if (peer_epoch_lb > session->last_sent_epoch) { + dout(10) << __func__ << " con " << con + << " " << con->get_peer_addr() + << " map epoch " << session->last_sent_epoch + << " -> " << peer_epoch_lb << " (as per caller)" << dendl; + session->last_sent_epoch = peer_epoch_lb; + } + epoch_t last_sent_epoch = session->last_sent_epoch; + session->sent_epoch_lock.unlock(); + + if (osdmap->get_epoch() <= last_sent_epoch) { + return; + } + + send_incremental_map(last_sent_epoch, con, osdmap); + last_sent_epoch = osdmap->get_epoch(); + + session->sent_epoch_lock.lock(); + if (session->last_sent_epoch < last_sent_epoch) { + dout(10) << __func__ << " con " << con + << " " << con->get_peer_addr() + << " map epoch " << session->last_sent_epoch + << " -> " << last_sent_epoch << " (shared)" << dendl; + session->last_sent_epoch = last_sent_epoch; + } + session->sent_epoch_lock.unlock(); +} + +void OSD::dispatch_session_waiting(const ceph::ref_t& session, OSDMapRef osdmap) +{ + ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock)); + + auto i = session->waiting_on_map.begin(); + while (i != session->waiting_on_map.end()) { + OpRequestRef op = &(*i); + ceph_assert(ms_can_fast_dispatch(op->get_req())); + auto m = op->get_req(); + if (m->get_min_epoch() > osdmap->get_epoch()) { + break; + } + session->waiting_on_map.erase(i++); + op->put(); + + spg_t pgid; + if (m->get_type() == CEPH_MSG_OSD_OP) { + pg_t actual_pgid = osdmap->raw_pg_to_pg( + static_cast(m)->get_pg()); + if (!osdmap->get_primary_shard(actual_pgid, &pgid)) { + continue; + } + } else { + pgid = m->get_spg(); + } + enqueue_op(pgid, std::move(op), m->get_map_epoch()); + } + + if (session->waiting_on_map.empty()) { + clear_session_waiting_on_map(session); + } else { + register_session_waiting_on_map(session); + } +} + +void OSD::ms_fast_dispatch(Message *m) +{ + +#ifdef HAVE_JAEGER + jaeger_tracing::init_tracer("osd-services-reinit"); + dout(10) << "jaeger tracer 
after " << opentracing::Tracer::Global() << dendl; + auto dispatch_span = jaeger_tracing::new_span(__func__); +#endif + FUNCTRACE(cct); + if (service.is_stopping()) { + m->put(); + return; + } + + // peering event? + switch (m->get_type()) { + case CEPH_MSG_PING: + dout(10) << "ping from " << m->get_source() << dendl; + m->put(); + return; + case MSG_OSD_FORCE_RECOVERY: + handle_fast_force_recovery(static_cast(m)); + return; + case MSG_OSD_SCRUB2: + handle_fast_scrub(static_cast(m)); + return; + + case MSG_OSD_PG_CREATE2: + return handle_fast_pg_create(static_cast(m)); + case MSG_OSD_PG_QUERY: + return handle_fast_pg_query(static_cast(m)); + case MSG_OSD_PG_NOTIFY: + return handle_fast_pg_notify(static_cast(m)); + case MSG_OSD_PG_INFO: + return handle_fast_pg_info(static_cast(m)); + case MSG_OSD_PG_REMOVE: + return handle_fast_pg_remove(static_cast(m)); + + // these are single-pg messages that handle themselves + case MSG_OSD_PG_LOG: + case MSG_OSD_PG_TRIM: + case MSG_OSD_PG_NOTIFY2: + case MSG_OSD_PG_QUERY2: + case MSG_OSD_PG_INFO2: + case MSG_OSD_BACKFILL_RESERVE: + case MSG_OSD_RECOVERY_RESERVE: + case MSG_OSD_PG_LEASE: + case MSG_OSD_PG_LEASE_ACK: + { + MOSDPeeringOp *pm = static_cast(m); + if (require_osd_peer(pm)) { + enqueue_peering_evt( + pm->get_spg(), + PGPeeringEventRef(pm->get_event())); + } + pm->put(); + return; + } + } + + OpRequestRef op = op_tracker.create_request(m); + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = op->get_reqid(); +#endif + tracepoint(osd, ms_fast_dispatch, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } +#ifdef HAVE_JAEGER + op->set_osd_parent_span(dispatch_span); + if (op->osd_parent_span) { + auto op_req_span = jaeger_tracing::child_span("op-request-created", op->osd_parent_span); + op->set_osd_parent_span(op_req_span); + } +#endif + if (m->trace) + op->osd_trace.init("osd op", &trace_endpoint, &m->trace); + + // note sender epoch, min req's epoch + op->sent_epoch = static_cast(m)->get_map_epoch(); + op->min_epoch = static_cast(m)->get_min_epoch(); + ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check! + + service.maybe_inject_dispatch_delay(); + + if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) || + m->get_type() != CEPH_MSG_OSD_OP) { + // queue it directly + enqueue_op( + static_cast(m)->get_spg(), + std::move(op), + static_cast(m)->get_map_epoch()); + } else { + // legacy client, and this is an MOSDOp (the *only* fast dispatch + // message that didn't have an explicit spg_t); we need to map + // them to an spg_t while preserving delivery order. 
+ auto priv = m->get_connection()->get_priv(); + if (auto session = static_cast(priv.get()); session) { + std::lock_guard l{session->session_dispatch_lock}; + op->get(); + session->waiting_on_map.push_back(*op); + OSDMapRef nextmap = service.get_nextmap_reserved(); + dispatch_session_waiting(session, nextmap); + service.release_map(nextmap); + } + } + OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false); +} + +int OSD::ms_handle_authentication(Connection *con) +{ + int ret = 0; + auto s = ceph::ref_cast(con->get_priv()); + if (!s) { + s = ceph::make_ref(cct, con); + con->set_priv(s); + s->entity_name = con->get_peer_entity_name(); + dout(10) << __func__ << " new session " << s << " con " << s->con + << " entity " << s->entity_name + << " addr " << con->get_peer_addrs() << dendl; + } else { + dout(10) << __func__ << " existing session " << s << " con " << s->con + << " entity " << s->entity_name + << " addr " << con->get_peer_addrs() << dendl; + } + + AuthCapsInfo &caps_info = con->get_peer_caps_info(); + if (caps_info.allow_all) { + s->caps.set_allow_all(); + } else if (caps_info.caps.length() > 0) { + bufferlist::const_iterator p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } + catch (ceph::buffer::error& e) { + dout(10) << __func__ << " session " << s << " " << s->entity_name + << " failed to decode caps string" << dendl; + ret = -EACCES; + } + if (!ret) { + bool success = s->caps.parse(str); + if (success) { + dout(10) << __func__ << " session " << s + << " " << s->entity_name + << " has caps " << s->caps << " '" << str << "'" << dendl; + ret = 1; + } else { + dout(10) << __func__ << " session " << s << " " << s->entity_name + << " failed to parse caps '" << str << "'" << dendl; + ret = -EACCES; + } + } + } + return ret; +} + +void OSD::do_waiters() +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + + dout(10) << "do_waiters -- start" << dendl; + while (!finished.empty()) { + OpRequestRef next = finished.front(); + finished.pop_front(); + dispatch_op(next); + } + dout(10) << "do_waiters -- finish" << dendl; +} + +void OSD::dispatch_op(OpRequestRef op) +{ + switch (op->get_req()->get_type()) { + + case MSG_OSD_PG_CREATE: + handle_pg_create(op); + break; + } +} + +void OSD::_dispatch(Message *m) +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + dout(20) << "_dispatch " << m << " " << *m << dendl; + + switch (m->get_type()) { + // -- don't need OSDMap -- + + // map and replication + case CEPH_MSG_OSD_MAP: + handle_osd_map(static_cast(m)); + break; + case MSG_MON_GET_PURGED_SNAPS_REPLY: + handle_get_purged_snaps_reply(static_cast(m)); + break; + + // osd + case MSG_OSD_SCRUB: + handle_scrub(static_cast(m)); + break; + + case MSG_COMMAND: + handle_command(static_cast(m)); + return; + + // -- need OSDMap -- + + case MSG_OSD_PG_CREATE: + { + OpRequestRef op = op_tracker.create_request(m); + if (m->trace) + op->osd_trace.init("osd op", &trace_endpoint, &m->trace); + // no map? starting up? 
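+ // (with no map yet the request is parked on waiting_for_osdmap and marked delayed until an osdmap arrives.)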
+ if (!get_osdmap()) { + dout(7) << "no OSDMap, not booted" << dendl; + logger->inc(l_osd_waiting_for_map); + waiting_for_osdmap.push_back(op); + op->mark_delayed("no osdmap"); + break; + } + + // need OSDMap + dispatch_op(op); + } + } +} + +// remove me post-nautilus +void OSD::handle_scrub(MOSDScrub *m) +{ + dout(10) << "handle_scrub " << *m << dendl; + if (!require_mon_or_mgr_peer(m)) { + m->put(); + return; + } + if (m->fsid != monc->get_fsid()) { + dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() + << dendl; + m->put(); + return; + } + + vector spgs; + _get_pgids(&spgs); + + if (!m->scrub_pgs.empty()) { + vector v; + for (auto pgid : m->scrub_pgs) { + spg_t pcand; + if (get_osdmap()->get_primary_shard(pgid, &pcand) && + std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) { + v.push_back(pcand); + } + } + spgs.swap(v); + } + + for (auto pgid : spgs) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestScrub(m->deep, m->repair)))); + } + + m->put(); +} + +void OSD::handle_fast_scrub(MOSDScrub2 *m) +{ + dout(10) << __func__ << " " << *m << dendl; + if (!require_mon_or_mgr_peer(m)) { + m->put(); + return; + } + if (m->fsid != monc->get_fsid()) { + dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid() + << dendl; + m->put(); + return; + } + for (auto pgid : m->scrub_pgs) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + m->epoch, + m->epoch, + PeeringState::RequestScrub(m->deep, m->repair)))); + } + m->put(); +} + +bool OSD::scrub_random_backoff() +{ + bool coin_flip = (rand() / (double)RAND_MAX >= + cct->_conf->osd_scrub_backoff_ratio); + if (!coin_flip) { + dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl; + return true; + } + return false; +} + +OSDService::ScrubJob::ScrubJob(CephContext* cct, + const spg_t& pg, const utime_t& timestamp, + double pool_scrub_min_interval, + double pool_scrub_max_interval, bool must) + : cct(cct), + pgid(pg), + sched_time(timestamp), + deadline(timestamp) +{ + // if not explicitly requested, postpone the scrub with a random delay + if (!must) { + double scrub_min_interval = pool_scrub_min_interval > 0 ? + pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval; + double scrub_max_interval = pool_scrub_max_interval > 0 ? 
+ pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval; + + sched_time += scrub_min_interval; + double r = rand() / (double)RAND_MAX; + sched_time += + scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r; + if (scrub_max_interval == 0) { + deadline = utime_t(); + } else { + deadline += scrub_max_interval; + } + + } +} + +bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const { + if (sched_time < rhs.sched_time) + return true; + if (sched_time > rhs.sched_time) + return false; + return pgid < rhs.pgid; +} + +void OSDService::dumps_scrub(ceph::Formatter *f) +{ + ceph_assert(f != nullptr); + std::lock_guard l(sched_scrub_lock); + + f->open_array_section("scrubs"); + for (const auto &i: sched_scrub_pg) { + f->open_object_section("scrub"); + f->dump_stream("pgid") << i.pgid; + f->dump_stream("sched_time") << i.sched_time; + f->dump_stream("deadline") << i.deadline; + f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp()); + f->close_section(); + } + f->close_section(); +} + +double OSD::scrub_sleep_time(bool must_scrub) +{ + if (must_scrub) { + return cct->_conf->osd_scrub_sleep; + } + utime_t now = ceph_clock_now(); + if (scrub_time_permit(now)) { + return cct->_conf->osd_scrub_sleep; + } + double normal_sleep = cct->_conf->osd_scrub_sleep; + double extended_sleep = cct->_conf->osd_scrub_extended_sleep; + return std::max(extended_sleep, normal_sleep); +} + +bool OSD::scrub_time_permit(utime_t now) +{ + struct tm bdt; + time_t tt = now.sec(); + localtime_r(&tt, &bdt); + + bool day_permit = false; + if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) { + if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) { + day_permit = true; + } + } else { + if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) { + day_permit = true; + } + } + + if (!day_permit) { + dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day + << " - " << cct->_conf->osd_scrub_end_week_day + << " now " << bdt.tm_wday << " = no" << dendl; + return false; + } + + bool time_permit = false; + if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) { + if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) { + time_permit = true; + } + } else { + if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) { + time_permit = true; + } + } + if (time_permit) { + dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour + << " - " << cct->_conf->osd_scrub_end_hour + << " now " << bdt.tm_hour << " = yes" << dendl; + } else { + dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour + << " - " << cct->_conf->osd_scrub_end_hour + << " now " << bdt.tm_hour << " = no" << dendl; + } + return time_permit; +} + +bool OSD::scrub_load_below_threshold() +{ + double loadavgs[3]; + if (getloadavg(loadavgs, 3) != 3) { + dout(10) << __func__ << " couldn't read loadavgs\n" << dendl; + return false; + } + + // allow scrub if below configured threshold + long cpus = sysconf(_SC_NPROCESSORS_ONLN); + double loadavg_per_cpu = cpus > 0 ? 
loadavgs[0] / cpus : loadavgs[0]; + if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) { + dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu + << " < max " << cct->_conf->osd_scrub_load_threshold + << " = yes" << dendl; + return true; + } + + // allow scrub if below daily avg and currently decreasing + if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) { + dout(20) << __func__ << " loadavg " << loadavgs[0] + << " < daily_loadavg " << daily_loadavg + << " and < 15m avg " << loadavgs[2] + << " = yes" << dendl; + return true; + } + + dout(20) << __func__ << " loadavg " << loadavgs[0] + << " >= max " << cct->_conf->osd_scrub_load_threshold + << " and ( >= daily_loadavg " << daily_loadavg + << " or >= 15m avg " << loadavgs[2] + << ") = no" << dendl; + return false; +} + +void OSD::sched_scrub() +{ + dout(20) << __func__ << " sched_scrub starts" << dendl; + + // if not permitted, fail fast + if (!service.can_inc_scrubs()) { + dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl; + return; + } + bool allow_requested_repair_only = false; + if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) { + if (!cct->_conf->osd_repair_during_recovery) { + dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl; + return; + } + dout(10) << __func__ + << " will only schedule explicitly requested repair due to active recovery" + << dendl; + allow_requested_repair_only = true; + } + + utime_t now = ceph_clock_now(); + bool time_permit = scrub_time_permit(now); + bool load_is_low = scrub_load_below_threshold(); + dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl; + + OSDService::ScrubJob scrub_job; + if (service.first_scrub_stamp(&scrub_job)) { + do { + dout(30) << "sched_scrub examine " << scrub_job.pgid << " at " << scrub_job.sched_time << dendl; + + if (scrub_job.sched_time > now) { + // save ourselves some effort + dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time + << " > " << now << dendl; + break; + } + + if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) { + dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to " + << (!time_permit ? "time not permit" : "high load") << dendl; + continue; + } + + PGRef pg = _lookup_lock_pg(scrub_job.pgid); + if (!pg) { + dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl; + continue; + } + + // This has already started, so go on to the next scrub job + if (pg->is_scrub_queued_or_active()) { + pg->unlock(); + dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl; + continue; + } + // Skip other kinds of scrubbing if only explicitly requested repairing is allowed + if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) { + pg->unlock(); + dout(10) << __func__ << " skip " << scrub_job.pgid + << " because repairing is not explicitly requested on it" + << dendl; + continue; + } + + // If it is reserving, let it resolve before going to the next scrub job + if (pg->m_scrubber->is_reserving()) { + pg->unlock(); + dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl; + break; + } + dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time + << (pg->get_must_scrub() ? ", explicitly requested" : + (load_is_low ? 
", load_is_low" : " deadline < now")) + << dendl; + if (pg->sched_scrub()) { + pg->unlock(); + dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl; + break; + } + pg->unlock(); + } while (service.next_scrub_stamp(scrub_job, &scrub_job)); + } + dout(20) << "sched_scrub done" << dendl; +} + +void OSD::resched_all_scrubs() +{ + dout(10) << __func__ << ": start" << dendl; + const vector pgs = [this] { + vector pgs; + OSDService::ScrubJob job; + if (service.first_scrub_stamp(&job)) { + do { + pgs.push_back(job.pgid); + } while (service.next_scrub_stamp(job, &job)); + } + return pgs; + }(); + for (auto& pgid : pgs) { + dout(20) << __func__ << ": examine " << pgid << dendl; + PGRef pg = _lookup_lock_pg(pgid); + if (!pg) + continue; + if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) { + dout(15) << __func__ << ": reschedule " << pgid << dendl; + pg->on_info_history_change(); + } + pg->unlock(); + } + dout(10) << __func__ << ": done" << dendl; +} + +MPGStats* OSD::collect_pg_stats() +{ + // This implementation unconditionally sends every is_primary PG's + // stats every time we're called. This has equivalent cost to the + // previous implementation's worst case where all PGs are busy and + // their stats are always enqueued for sending. + std::shared_lock l{map_lock}; + + osd_stat_t cur_stat = service.get_osd_stat(); + cur_stat.os_perf_stat = store->get_cur_stats(); + + auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch()); + m->osd_stat = cur_stat; + + std::lock_guard lec{min_last_epoch_clean_lock}; + min_last_epoch_clean = get_osdmap_epoch(); + min_last_epoch_clean_pgs.clear(); + + std::set pool_set; + vector pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + auto pool = pg->pg_id.pgid.pool(); + pool_set.emplace((int64_t)pool); + if (!pg->is_primary()) { + continue; + } + pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) { + m->pg_stat[pg->pg_id.pgid] = s; + min_last_epoch_clean = std::min(min_last_epoch_clean, lec); + min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid); + }); + } + store_statfs_t st; + bool per_pool_stats = true; + bool per_pool_omap_stats = false; + for (auto p : pool_set) { + int r = store->pool_statfs(p, &st, &per_pool_omap_stats); + if (r == -ENOTSUP) { + per_pool_stats = false; + break; + } else { + assert(r >= 0); + m->pool_stat[p] = st; + } + } + + // indicate whether we are reporting per-pool stats + m->osd_stat.num_osds = 1; + m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0; + m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0; + + return m; +} + +vector OSD::get_health_metrics() +{ + vector metrics; + { + utime_t oldest_secs; + const utime_t now = ceph_clock_now(); + auto too_old = now; + too_old -= cct->_conf.get_val("osd_op_complaint_time"); + int slow = 0; + TrackedOpRef oldest_op; + OSDMapRef osdmap = get_osdmap(); + // map of slow op counts by slow op event type for an aggregated logging to + // the cluster log. + map slow_op_types; + // map of slow op counts by pool for reporting a pool name with highest + // slow ops. 
+ map slow_op_pools; + bool log_aggregated_slow_op = + cct->_conf.get_val("osd_aggregated_slow_ops_logging"); + auto count_slow_ops = [&](TrackedOp& op) { + if (op.get_initiated() < too_old) { + stringstream ss; + ss << "slow request " << op.get_desc() + << " initiated " + << op.get_initiated() + << " currently " + << op.state_string(); + lgeneric_subdout(cct,osd,20) << ss.str() << dendl; + if (log_aggregated_slow_op) { + if (const OpRequest *req = dynamic_cast(&op)) { + uint8_t op_type = req->state_flag(); + auto m = req->get_req(); + uint64_t poolid = m->get_spg().pgid.m_pool; + slow_op_types[op_type]++; + if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) { + slow_op_pools[poolid]++; + } + } + } else { + clog->warn() << ss.str(); + } + slow++; + if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) { + oldest_op = &op; + } + return true; + } else { + return false; + } + }; + if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) { + if (slow) { + derr << __func__ << " reporting " << slow << " slow ops, oldest is " + << oldest_op->get_desc() << dendl; + if (log_aggregated_slow_op && + slow_op_types.size() > 0) { + stringstream ss; + ss << slow << " slow requests (by type [ "; + for (const auto& [op_type, count] : slow_op_types) { + ss << "'" << OpRequest::get_state_string(op_type) + << "' : " << count + << " "; + } + auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(), + [](std::pair p1, std::pair p2) { + return p1.second < p2.second; + }); + if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) { + string pool_name = osdmap->get_pool_name(slow_pool_it->first); + ss << "] most affected pool [ '" + << pool_name + << "' : " + << slow_pool_it->second + << " ])"; + } else { + ss << "])"; + } + lgeneric_subdout(cct,osd,20) << ss.str() << dendl; + clog->warn() << ss.str(); + } + } + metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs); + } else { + // no news is not good news. + metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0); + } + } + { + std::lock_guard l(pending_creates_lock); + auto n_primaries = pending_creates_from_mon; + for (const auto& create : pending_creates_from_osd) { + if (create.second) { + n_primaries++; + } + } + metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries); + } + return metrics; +} + +// ===================================================== +// MAP + +void OSD::wait_for_new_map(OpRequestRef op) +{ + // ask? + if (waiting_for_osdmap.empty()) { + osdmap_subscribe(get_osdmap_epoch() + 1, false); + } + + logger->inc(l_osd_waiting_for_map); + waiting_for_osdmap.push_back(op); + op->mark_delayed("wait for new map"); +} + + +/** update_map + * assimilate new OSDMap(s). scan pgs, etc. 
+ */ + +void OSD::note_down_osd(int peer) +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer)); + + std::lock_guard l{heartbeat_lock}; + failure_queue.erase(peer); + failure_pending.erase(peer); + map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer); + if (p != heartbeat_peers.end()) { + p->second.clear_mark_down(); + heartbeat_peers.erase(p); + } +} + +void OSD::note_up_osd(int peer) +{ + heartbeat_set_peers_need_update(); +} + +struct C_OnMapCommit : public Context { + OSD *osd; + epoch_t first, last; + MOSDMap *msg; + C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m) + : osd(o), first(f), last(l), msg(m) {} + void finish(int r) override { + osd->_committed_osd_maps(first, last, msg); + msg->put(); + } +}; + +void OSD::osdmap_subscribe(version_t epoch, bool force_request) +{ + std::lock_guard l(osdmap_subscribe_lock); + if (latest_subscribed_epoch >= epoch && !force_request) + return; + + latest_subscribed_epoch = std::max(epoch, latest_subscribed_epoch); + + if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) || + force_request) { + monc->renew_subs(); + } +} + +void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps) +{ + epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); + if (min <= superblock.oldest_map) + return; + + int num = 0; + ObjectStore::Transaction t; + for (epoch_t e = superblock.oldest_map; e < min; ++e) { + dout(20) << " removing old osdmap epoch " << e << dendl; + t.remove(coll_t::meta(), get_osdmap_pobject_name(e)); + t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e)); + superblock.oldest_map = e + 1; + num++; + if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) { + service.publish_superblock(superblock); + write_superblock(t); + int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); + ceph_assert(tr == 0); + num = 0; + if (!skip_maps) { + // skip_maps leaves us with a range of old maps if we fail to remove all + // of them before moving superblock.oldest_map forward to the first map + // in the incoming MOSDMap msg. so we should continue removing them in + // this case, even if it means a huge series of delete transactions all at + // once. + break; + } + } + } + if (num > 0) { + service.publish_superblock(superblock); + write_superblock(t); + int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); + ceph_assert(tr == 0); + } + // we should not remove the cached maps + ceph_assert(min <= service.map_cache.cached_key_lower_bound()); +} + +void OSD::handle_osd_map(MOSDMap *m) +{ + // wait for pgs to catch up + { + // we extend the map cache pins to accommodate pgs slow to consume maps + // for some period, until we hit the max_lag_factor bound, at which point + // we block here to stop ingesting more maps than they are able to keep + // up with.
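+ // for example (illustrative numbers; both knobs are configurable): with osd_map_cache_size = 50 and a max_lag_factor of 2.0, max_lag is 100, so we block below once the slowest shard's minimum pg epoch falls more than 100 epochs behind our current published osdmap epoch.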
+ epoch_t max_lag = cct->_conf->osd_map_cache_size * + m_osd_pg_epoch_max_lag_factor; + ceph_assert(max_lag > 0); + epoch_t osd_min = 0; + for (auto shard : shards) { + epoch_t min = shard->get_min_pg_epoch(); + if (osd_min == 0 || min < osd_min) { + osd_min = min; + } + } + epoch_t osdmap_epoch = get_osdmap_epoch(); + if (osd_min > 0 && + osdmap_epoch > max_lag && + osdmap_epoch - max_lag > osd_min) { + epoch_t need = osdmap_epoch - max_lag; + dout(10) << __func__ << " waiting for pgs to catch up (need " << need + << " max_lag " << max_lag << ")" << dendl; + for (auto shard : shards) { + epoch_t min = shard->get_min_pg_epoch(); + if (need > min) { + dout(10) << __func__ << " waiting for pgs to consume " << need + << " (shard " << shard->shard_id << " min " << min + << ", map cache is " << cct->_conf->osd_map_cache_size + << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor + << ")" << dendl; + unlock_guard unlock{osd_lock}; + shard->wait_min_pg_epoch(need); + } + } + } + } + + ceph_assert(ceph_mutex_is_locked(osd_lock)); + map added_maps; + map added_maps_bl; + if (m->fsid != monc->get_fsid()) { + dout(0) << "handle_osd_map fsid " << m->fsid << " != " + << monc->get_fsid() << dendl; + m->put(); + return; + } + if (is_initializing()) { + dout(0) << "ignoring osdmap until we have initialized" << dendl; + m->put(); + return; + } + + auto session = ceph::ref_cast(m->get_connection()->get_priv()); + if (session && !(session->entity_name.is_mon() || + session->entity_name.is_osd())) { + //not enough perms! + dout(10) << "got osd map from Session " << session + << " which we can't take maps from (not a mon or osd)" << dendl; + m->put(); + return; + } + + // share with the objecter + if (!is_preboot()) + service.objecter->handle_osd_map(m); + + epoch_t first = m->get_first(); + epoch_t last = m->get_last(); + dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have " + << superblock.newest_map + << ", src has [" << m->oldest_map << "," << m->newest_map << "]" + << dendl; + + logger->inc(l_osd_map); + logger->inc(l_osd_mape, last - first + 1); + if (first <= superblock.newest_map) + logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1); + if (service.max_oldest_map < m->oldest_map) { + service.max_oldest_map = m->oldest_map; + ceph_assert(service.max_oldest_map >= superblock.oldest_map); + } + + // make sure there is something new, here, before we bother flushing + // the queues and such + if (last <= superblock.newest_map) { + dout(10) << " no new maps here, dropping" << dendl; + m->put(); + return; + } + + // missing some? + bool skip_maps = false; + if (first > superblock.newest_map + 1) { + dout(10) << "handle_osd_map message skips epochs " + << superblock.newest_map + 1 << ".." << (first-1) << dendl; + if (m->oldest_map <= superblock.newest_map + 1) { + osdmap_subscribe(superblock.newest_map + 1, false); + m->put(); + return; + } + // always try to get the full range of maps--as many as we can. this + // 1- is good to have + // 2- is at present the only way to ensure that we get a *full* map as + // the first map! 
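+ // (hence the force_request=true subscription just below, which starts from the sender's oldest_map - 1.)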
+ if (m->oldest_map < first) { + osdmap_subscribe(m->oldest_map - 1, true); + m->put(); + return; + } + skip_maps = true; + } + + ObjectStore::Transaction t; + uint64_t txn_size = 0; + + map> purged_snaps; + + // store new maps: queue for disk and put in the osdmap cache + epoch_t start = std::max(superblock.newest_map + 1, first); + for (epoch_t e = start; e <= last; e++) { + if (txn_size >= t.get_num_bytes()) { + derr << __func__ << " transaction size overflowed" << dendl; + ceph_assert(txn_size < t.get_num_bytes()); + } + txn_size = t.get_num_bytes(); + map::iterator p; + p = m->maps.find(e); + if (p != m->maps.end()) { + dout(10) << "handle_osd_map got full map for epoch " << e << dendl; + OSDMap *o = new OSDMap; + bufferlist& bl = p->second; + + o->decode(bl); + + purged_snaps[e] = o->get_new_purged_snaps(); + + ghobject_t fulloid = get_osdmap_pobject_name(e); + t.write(coll_t::meta(), fulloid, 0, bl.length(), bl); + added_maps[e] = add_map(o); + added_maps_bl[e] = bl; + got_full_map(e); + continue; + } + + p = m->incremental_maps.find(e); + if (p != m->incremental_maps.end()) { + dout(10) << "handle_osd_map got inc map for epoch " << e << dendl; + bufferlist& bl = p->second; + ghobject_t oid = get_inc_osdmap_pobject_name(e); + t.write(coll_t::meta(), oid, 0, bl.length(), bl); + + OSDMap *o = new OSDMap; + if (e > 1) { + bufferlist obl; + bool got = get_map_bl(e - 1, obl); + if (!got) { + auto p = added_maps_bl.find(e - 1); + ceph_assert(p != added_maps_bl.end()); + obl = p->second; + } + o->decode(obl); + } + + OSDMap::Incremental inc; + auto p = bl.cbegin(); + inc.decode(p); + + if (o->apply_incremental(inc) < 0) { + derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl; + ceph_abort_msg("bad fsid"); + } + + bufferlist fbl; + o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); + + bool injected_failure = false; + if (cct->_conf->osd_inject_bad_map_crc_probability > 0 && + (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) { + derr << __func__ << " injecting map crc failure" << dendl; + injected_failure = true; + } + + if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) { + dout(2) << "got incremental " << e + << " but failed to encode full with correct crc; requesting" + << dendl; + clog->warn() << "failed to encode map e" << e << " with expected crc"; + dout(20) << "my encoded map was:\n"; + fbl.hexdump(*_dout); + *_dout << dendl; + delete o; + request_full_map(e, last); + last = e - 1; + + // don't continue committing if we failed to enc the first inc map + if (last < start) { + dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl; + m->put(); + return; + } + break; + } + got_full_map(e); + purged_snaps[e] = o->get_new_purged_snaps(); + + ghobject_t fulloid = get_osdmap_pobject_name(e); + t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl); + added_maps[e] = add_map(o); + added_maps_bl[e] = fbl; + continue; + } + + ceph_abort_msg("MOSDMap lied about what maps it had?"); + } + + // even if this map isn't from a mon, we may have satisfied our subscription + monc->sub_got("osdmap", last); + + if (!m->maps.empty() && requested_full_first) { + dout(10) << __func__ << " still missing full maps " << requested_full_first + << ".." 
<< requested_full_last << dendl; + rerequest_full_maps(); + } + + if (superblock.oldest_map) { + // make sure we at least keep pace with incoming maps + trim_maps(m->oldest_map, last - first + 1, skip_maps); + pg_num_history.prune(superblock.oldest_map); + } + + if (!superblock.oldest_map || skip_maps) + superblock.oldest_map = first; + superblock.newest_map = last; + superblock.current_epoch = last; + + // note in the superblock that we were clean thru the prior epoch + epoch_t boot_epoch = service.get_boot_epoch(); + if (boot_epoch && boot_epoch >= superblock.mounted) { + superblock.mounted = boot_epoch; + superblock.clean_thru = last; + } + + // check for pg_num changes and deleted pools + OSDMapRef lastmap; + for (auto& i : added_maps) { + if (!lastmap) { + if (!(lastmap = service.try_get_map(i.first - 1))) { + dout(10) << __func__ << " can't get previous map " << i.first - 1 + << " probably first start of this osd" << dendl; + continue; + } + } + ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch()); + for (auto& j : lastmap->get_pools()) { + if (!i.second->have_pg_pool(j.first)) { + pg_num_history.log_pool_delete(i.first, j.first); + dout(10) << __func__ << " recording final pg_pool_t for pool " + << j.first << dendl; + // this information is needed by _make_pg() if have to restart before + // the pool is deleted and need to instantiate a new (zombie) PG[Pool]. + ghobject_t obj = make_final_pool_info_oid(j.first); + bufferlist bl; + encode(j.second, bl, CEPH_FEATURES_ALL); + string name = lastmap->get_pool_name(j.first); + encode(name, bl); + map profile; + if (lastmap->get_pg_pool(j.first)->is_erasure()) { + profile = lastmap->get_erasure_code_profile( + lastmap->get_pg_pool(j.first)->erasure_code_profile); + } + encode(profile, bl); + t.write(coll_t::meta(), obj, 0, bl.length(), bl); + } else if (unsigned new_pg_num = i.second->get_pg_num(j.first); + new_pg_num != j.second.get_pg_num()) { + dout(10) << __func__ << " recording pool " << j.first << " pg_num " + << j.second.get_pg_num() << " -> " << new_pg_num << dendl; + pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num); + } + } + for (auto& j : i.second->get_pools()) { + if (!lastmap->have_pg_pool(j.first)) { + dout(10) << __func__ << " recording new pool " << j.first << " pg_num " + << j.second.get_pg_num() << dendl; + pg_num_history.log_pg_num_change(i.first, j.first, + j.second.get_pg_num()); + } + } + lastmap = i.second; + } + pg_num_history.epoch = last; + { + bufferlist bl; + ::encode(pg_num_history, bl); + t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl); + dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl; + } + + // record new purged_snaps + if (superblock.purged_snaps_last == start - 1) { + SnapMapper::record_purged_snaps(cct, store, service.meta_ch, + make_purged_snaps_oid(), &t, + purged_snaps); + superblock.purged_snaps_last = last; + } else { + dout(10) << __func__ << " superblock purged_snaps_last is " + << superblock.purged_snaps_last + << ", not recording new purged_snaps" << dendl; + } + + // superblock and commit + write_superblock(t); + t.register_on_commit(new C_OnMapCommit(this, start, last, m)); + store->queue_transaction( + service.meta_ch, + std::move(t)); + service.publish_superblock(superblock); +} + +void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) +{ + dout(10) << __func__ << " " << first << ".." 
<< last << dendl; + if (is_stopping()) { + dout(10) << __func__ << " bailing, we are shutting down" << dendl; + return; + } + std::lock_guard l(osd_lock); + if (is_stopping()) { + dout(10) << __func__ << " bailing, we are shutting down" << dendl; + return; + } + map_lock.lock(); + + ceph_assert(first <= last); + + bool do_shutdown = false; + bool do_restart = false; + bool network_error = false; + OSDMapRef osdmap = get_osdmap(); + + // advance through the new maps + for (epoch_t cur = first; cur <= last; cur++) { + dout(10) << " advance to epoch " << cur + << " (<= last " << last + << " <= newest_map " << superblock.newest_map + << ")" << dendl; + + OSDMapRef newmap = get_map(cur); + ceph_assert(newmap); // we just cached it above! + + // start blocklisting messages sent to peers that go down. + service.pre_publish_map(newmap); + + // kill connections to newly down osds + bool waited_for_reservations = false; + set old; + osdmap = get_osdmap(); + osdmap->get_all_osds(old); + for (set::iterator p = old.begin(); p != old.end(); ++p) { + if (*p != whoami && + osdmap->is_up(*p) && // in old map + newmap->is_down(*p)) { // but not the new one + if (!waited_for_reservations) { + service.await_reserved_maps(); + waited_for_reservations = true; + } + note_down_osd(*p); + } else if (*p != whoami && + osdmap->is_down(*p) && + newmap->is_up(*p)) { + note_up_osd(*p); + } + } + + if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) { + dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch() + << dendl; + if (is_booting()) { + // this captures the case where we sent the boot message while + // NOUP was being set on the mon and our boot request was + // dropped, and then later it is cleared. it imperfectly + // handles the case where our original boot message was not + // dropped and we restart even though we might have booted, but + // that is harmless (boot will just take slightly longer). + do_restart = true; + } + } + + osdmap = std::move(newmap); + set_osdmap(osdmap); + epoch_t up_epoch; + epoch_t boot_epoch; + service.retrieve_epochs(&boot_epoch, &up_epoch, NULL); + if (!up_epoch && + osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) { + up_epoch = osdmap->get_epoch(); + dout(10) << "up_epoch is " << up_epoch << dendl; + if (!boot_epoch) { + boot_epoch = osdmap->get_epoch(); + dout(10) << "boot_epoch is " << boot_epoch << dendl; + } + service.set_epochs(&boot_epoch, &up_epoch, NULL); + } + } + + epoch_t _bind_epoch = service.get_bind_epoch(); + if (osdmap->is_up(whoami) && + osdmap->get_addrs(whoami).legacy_equals( + client_messenger->get_myaddrs()) && + _bind_epoch < osdmap->get_up_from(whoami)) { + + if (is_booting()) { + dout(1) << "state: booting -> active" << dendl; + set_state(STATE_ACTIVE); + do_restart = false; + + // set incarnation so that osd_reqid_t's we generate for our + // objecter requests are unique across restarts. + service.objecter->set_client_incarnation(osdmap->get_epoch()); + cancel_pending_failures(); + } + } + + if (osdmap->get_epoch() > 0 && + is_active()) { + if (!osdmap->exists(whoami)) { + derr << "map says i do not exist. shutting down." << dendl; + do_shutdown = true; // don't call shutdown() while we have + // everything paused + } else if (osdmap->is_stop(whoami)) { + derr << "map says i am stopped by admin. shutting down." 
<< dendl; + do_shutdown = true; + } else if (!osdmap->is_up(whoami) || + !osdmap->get_addrs(whoami).legacy_equals( + client_messenger->get_myaddrs()) || + !osdmap->get_cluster_addrs(whoami).legacy_equals( + cluster_messenger->get_myaddrs()) || + !osdmap->get_hb_back_addrs(whoami).legacy_equals( + hb_back_server_messenger->get_myaddrs()) || + !osdmap->get_hb_front_addrs(whoami).legacy_equals( + hb_front_server_messenger->get_myaddrs())) { + if (!osdmap->is_up(whoami)) { + if (service.is_preparing_to_stop() || service.is_stopping()) { + service.got_stop_ack(); + } else { + clog->warn() << "Monitor daemon marked osd." << whoami << " down, " + "but it is still running"; + clog->debug() << "map e" << osdmap->get_epoch() + << " wrongly marked me down at e" + << osdmap->get_down_at(whoami); + } + if (monc->monmap.min_mon_release >= ceph_release_t::octopus) { + // note that this is best-effort... + monc->send_mon_message( + new MOSDMarkMeDead( + monc->get_fsid(), + whoami, + osdmap->get_epoch())); + } + } else if (!osdmap->get_addrs(whoami).legacy_equals( + client_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong client addr (" << osdmap->get_addrs(whoami) + << " != my " << client_messenger->get_myaddrs() << ")"; + } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals( + cluster_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong cluster addr (" + << osdmap->get_cluster_addrs(whoami) + << " != my " << cluster_messenger->get_myaddrs() << ")"; + } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals( + hb_back_server_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong heartbeat back addr (" + << osdmap->get_hb_back_addrs(whoami) + << " != my " << hb_back_server_messenger->get_myaddrs() + << ")"; + } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals( + hb_front_server_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong heartbeat front addr (" + << osdmap->get_hb_front_addrs(whoami) + << " != my " << hb_front_server_messenger->get_myaddrs() + << ")"; + } + + if (!service.is_stopping()) { + epoch_t up_epoch = 0; + epoch_t bind_epoch = osdmap->get_epoch(); + service.set_epochs(NULL,&up_epoch, &bind_epoch); + do_restart = true; + + //add markdown log + utime_t now = ceph_clock_now(); + utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0); + osd_markdown_log.push_back(now); + if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) { + derr << __func__ << " marked down " + << osd_markdown_log.size() + << " > osd_max_markdown_count " + << cct->_conf->osd_max_markdown_count + << " in last " << grace << " seconds, shutting down" + << dendl; + do_restart = false; + do_shutdown = true; + } + + start_waiting_for_healthy(); + + set avoid_ports; +#if defined(__FreeBSD__) + // prevent FreeBSD from grabbing the client_messenger port during + // rebinding. In which case a cluster_meesneger will connect also + // to the same port + client_messenger->get_myaddrs().get_ports(&avoid_ports); +#endif + cluster_messenger->get_myaddrs().get_ports(&avoid_ports); + + int r = cluster_messenger->rebind(avoid_ports); + if (r != 0) { + do_shutdown = true; // FIXME: do_restart? 
+ network_error = true; + derr << __func__ << " marked down:" + << " rebind cluster_messenger failed" << dendl; + } + + hb_back_server_messenger->mark_down_all(); + hb_front_server_messenger->mark_down_all(); + hb_front_client_messenger->mark_down_all(); + hb_back_client_messenger->mark_down_all(); + + reset_heartbeat_peers(true); + } + } + } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) { + derr << "map says i am stopped by admin. shutting down." << dendl; + do_shutdown = true; + } + + map_lock.unlock(); + + check_osdmap_features(); + + // yay! + consume_map(); + + if (is_active() || is_waiting_for_healthy()) + maybe_update_heartbeat_peers(); + + if (is_active()) { + activate_map(); + } + + if (do_shutdown) { + if (network_error) { + cancel_pending_failures(); + } + // trigger shutdown in a different thread + dout(0) << __func__ << " shutdown OSD via async signal" << dendl; + queue_async_signal(SIGINT); + } + else if (m->newest_map && m->newest_map > last) { + dout(10) << " msg say newest map is " << m->newest_map + << ", requesting more" << dendl; + osdmap_subscribe(osdmap->get_epoch()+1, false); + } + else if (is_preboot()) { + if (m->get_source().is_mon()) + _preboot(m->oldest_map, m->newest_map); + else + start_boot(); + } + else if (do_restart) + start_boot(); + +} + +void OSD::check_osdmap_features() +{ + // adjust required feature bits? + + // we have to be a bit careful here, because we are accessing the + // Policy structures without taking any lock. in particular, only + // modify integer values that can safely be read by a racing CPU. + // since we are only accessing existing Policy structures a their + // current memory location, and setting or clearing bits in integer + // fields, and we are the only writer, this is not a problem. 
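+ // For illustration (not part of the upstream logic): get_features() also
+ // returns a mask of the bits it governs, and the adjustment below only
+ // rewrites those masked bits, e.g. with mask 0x0c and features 0x04:
+ //   required' = (required & ~0x0c) | 0x04
+ // so feature requirements outside the mask are left untouched.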
+ + const auto osdmap = get_osdmap(); + { + Messenger::Policy p = client_messenger->get_default_policy(); + uint64_t mask; + uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask); + if ((p.features_required & mask) != features) { + dout(0) << "crush map has features " << features + << ", adjusting msgr requires for clients" << dendl; + p.features_required = (p.features_required & ~mask) | features; + client_messenger->set_default_policy(p); + } + } + { + Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON); + uint64_t mask; + uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask); + if ((p.features_required & mask) != features) { + dout(0) << "crush map has features " << features + << " was " << p.features_required + << ", adjusting msgr requires for mons" << dendl; + p.features_required = (p.features_required & ~mask) | features; + client_messenger->set_policy(entity_name_t::TYPE_MON, p); + } + } + { + Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD); + uint64_t mask; + uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask); + + if ((p.features_required & mask) != features) { + dout(0) << "crush map has features " << features + << ", adjusting msgr requires for osds" << dendl; + p.features_required = (p.features_required & ~mask) | features; + cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p); + } + + if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) { + dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl; + superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + ObjectStore::Transaction t; + write_superblock(t); + int err = store->queue_transaction(service.meta_ch, std::move(t), NULL); + ceph_assert(err == 0); + } + } + + if (osdmap->require_osd_release < ceph_release_t::nautilus) { + hb_front_server_messenger->set_require_authorizer(false); + hb_back_server_messenger->set_require_authorizer(false); + } else { + hb_front_server_messenger->set_require_authorizer(true); + hb_back_server_messenger->set_require_authorizer(true); + } + + if (osdmap->require_osd_release != last_require_osd_release) { + dout(1) << __func__ << " require_osd_release " << last_require_osd_release + << " -> " << to_string(osdmap->require_osd_release) << dendl; + store->write_meta("require_osd_release", + stringify((int)osdmap->require_osd_release)); + last_require_osd_release = osdmap->require_osd_release; + } +} + +struct C_FinishSplits : public Context { + OSD *osd; + set pgs; + C_FinishSplits(OSD *osd, const set &in) + : osd(osd), pgs(in) {} + void finish(int r) override { + osd->_finish_splits(pgs); + } +}; + +void OSD::_finish_splits(set& pgs) +{ + dout(10) << __func__ << " " << pgs << dendl; + if (is_stopping()) + return; + for (set::iterator i = pgs.begin(); + i != pgs.end(); + ++i) { + PG *pg = i->get(); + + PeeringCtx rctx = create_context(); + pg->lock(); + dout(10) << __func__ << " " << *pg << dendl; + epoch_t e = pg->get_osdmap_epoch(); + pg->handle_initialize(rctx); + pg->queue_null(e, e); + dispatch_context(rctx, pg, service.get_osdmap()); + pg->unlock(); + + unsigned shard_index = pg->pg_id.hash_to_shard(num_shards); + shards[shard_index]->register_and_wake_split_child(pg); + } +}; + +bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src, + unsigned need) +{ + std::lock_guard l(merge_lock); + auto& p = merge_waiters[nextmap->get_epoch()][target]; + p[src->pg_id] = src; + dout(10) << 
__func__ << " added merge_waiter " << src->pg_id + << " for " << target << ", have " << p.size() << "/" << need + << dendl; + return p.size() == need; +} + +bool OSD::advance_pg( + epoch_t osd_epoch, + PG *pg, + ThreadPool::TPHandle &handle, + PeeringCtx &rctx) +{ + if (osd_epoch <= pg->get_osdmap_epoch()) { + return true; + } + ceph_assert(pg->is_locked()); + OSDMapRef lastmap = pg->get_osdmap(); + set new_pgs; // any split children + bool ret = true; + + unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ? + lastmap->get_pg_num(pg->pg_id.pool()) : 0; + for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1; + next_epoch <= osd_epoch; + ++next_epoch) { + OSDMapRef nextmap = service.try_get_map(next_epoch); + if (!nextmap) { + dout(20) << __func__ << " missing map " << next_epoch << dendl; + continue; + } + + unsigned new_pg_num = + (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ? + nextmap->get_pg_num(pg->pg_id.pool()) : 0; + if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) { + // check for merge + if (nextmap->have_pg_pool(pg->pg_id.pool())) { + spg_t parent; + if (pg->pg_id.is_merge_source( + old_pg_num, + new_pg_num, + &parent)) { + // we are merge source + PGRef spg = pg; // carry a ref + dout(1) << __func__ << " " << pg->pg_id + << " is merge source, target is " << parent + << dendl; + pg->write_if_dirty(rctx); + if (!new_pgs.empty()) { + rctx.transaction.register_on_applied(new C_FinishSplits(this, + new_pgs)); + new_pgs.clear(); + } + dispatch_context(rctx, pg, pg->get_osdmap(), &handle); + pg->ch->flush(); + // release backoffs explicitly, since the on_shutdown path + // aggressively tears down backoff state. + if (pg->is_primary()) { + pg->release_pg_backoffs(); + } + pg->on_shutdown(); + OSDShard *sdata = pg->osd_shard; + { + std::lock_guard l(sdata->shard_lock); + if (pg->pg_slot) { + sdata->_detach_pg(pg->pg_slot); + // update pg count now since we might not get an osdmap + // any time soon. 
+ if (pg->is_primary()) + logger->dec(l_osd_pg_primary); + else if (pg->is_nonprimary()) + logger->dec(l_osd_pg_replica); // misnomer + else + logger->dec(l_osd_pg_stray); + } + } + pg->unlock(); + + set children; + parent.is_split(new_pg_num, old_pg_num, &children); + if (add_merge_waiter(nextmap, parent, pg, children.size())) { + enqueue_peering_evt( + parent, + PGPeeringEventRef( + std::make_shared( + nextmap->get_epoch(), + nextmap->get_epoch(), + NullEvt()))); + } + ret = false; + goto out; + } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) { + // we are merge target + set children; + pg->pg_id.is_split(new_pg_num, old_pg_num, &children); + dout(20) << __func__ << " " << pg->pg_id + << " is merge target, sources are " << children + << dendl; + map sources; + { + std::lock_guard l(merge_lock); + auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id]; + unsigned need = children.size(); + dout(20) << __func__ << " have " << s.size() << "/" + << need << dendl; + if (s.size() == need) { + sources.swap(s); + merge_waiters[nextmap->get_epoch()].erase(pg->pg_id); + if (merge_waiters[nextmap->get_epoch()].empty()) { + merge_waiters.erase(nextmap->get_epoch()); + } + } + } + if (!sources.empty()) { + unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool()); + unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num); + dout(1) << __func__ << " merging " << pg->pg_id << dendl; + pg->merge_from( + sources, rctx, split_bits, + nextmap->get_pg_pool( + pg->pg_id.pool())->last_pg_merge_meta); + pg->pg_slot->waiting_for_merge_epoch = 0; + } else { + dout(20) << __func__ << " not ready to merge yet" << dendl; + pg->write_if_dirty(rctx); + if (!new_pgs.empty()) { + rctx.transaction.register_on_applied(new C_FinishSplits(this, + new_pgs)); + new_pgs.clear(); + } + dispatch_context(rctx, pg, pg->get_osdmap(), &handle); + pg->unlock(); + // kick source(s) to get them ready + for (auto& i : children) { + dout(20) << __func__ << " kicking source " << i << dendl; + enqueue_peering_evt( + i, + PGPeeringEventRef( + std::make_shared( + nextmap->get_epoch(), + nextmap->get_epoch(), + NullEvt()))); + } + ret = false; + goto out; + } + } + } + } + + vector newup, newacting; + int up_primary, acting_primary; + nextmap->pg_to_up_acting_osds( + pg->pg_id.pgid, + &newup, &up_primary, + &newacting, &acting_primary); + pg->handle_advance_map( + nextmap, lastmap, newup, up_primary, + newacting, acting_primary, rctx); + + auto oldpool = lastmap->get_pools().find(pg->pg_id.pool()); + auto newpool = nextmap->get_pools().find(pg->pg_id.pool()); + if (oldpool != lastmap->get_pools().end() + && newpool != nextmap->get_pools().end()) { + dout(20) << __func__ + << " new pool opts " << newpool->second.opts + << " old pool opts " << oldpool->second.opts + << dendl; + + double old_min_interval = 0, new_min_interval = 0; + oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval); + newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval); + + double old_max_interval = 0, new_max_interval = 0; + oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval); + newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval); + + // Assume if an interval is change from set to unset or vice versa the actual config + // is different. Keep it simple even if it is possible to call resched_all_scrub() + // unnecessarily. 
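+ // For example: if SCRUB_MIN_INTERVAL was previously unset (read back
+ // here as 0) and the new pool opts set it to 86400, the values differ
+ // and on_info_history_change() below reschedules scrubs for this PG.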
+ if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) { + pg->on_info_history_change(); + } + } + + if (new_pg_num && old_pg_num != new_pg_num) { + // check for split + set children; + if (pg->pg_id.is_split( + old_pg_num, + new_pg_num, + &children)) { + split_pgs( + pg, children, &new_pgs, lastmap, nextmap, + rctx); + } + } + + lastmap = nextmap; + old_pg_num = new_pg_num; + handle.reset_tp_timeout(); + } + pg->handle_activate_map(rctx); + + ret = true; + out: + if (!new_pgs.empty()) { + rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs)); + } + return ret; +} + +void OSD::consume_map() +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + auto osdmap = get_osdmap(); + dout(7) << "consume_map version " << osdmap->get_epoch() << dendl; + + /** make sure the cluster is speaking in SORTBITWISE, because we don't + * speak the older sorting version any more. Be careful not to force + * a shutdown if we are merely processing old maps, though. + */ + if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) { + derr << __func__ << " SORTBITWISE flag is not set" << dendl; + ceph_abort(); + } + + service.pre_publish_map(osdmap); + service.await_reserved_maps(); + service.publish_map(osdmap); + + // prime splits and merges + set> newly_split; // splits, and when + set> merge_pgs; // merge participants, and when + for (auto& shard : shards) { + shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs); + } + if (!newly_split.empty()) { + for (auto& shard : shards) { + shard->prime_splits(osdmap, &newly_split); + } + ceph_assert(newly_split.empty()); + } + + // prune sent_ready_to_merge + service.prune_sent_ready_to_merge(osdmap); + + // FIXME, maybe: We could race against an incoming peering message + // that instantiates a merge PG after identify_merges() below and + // never set up its peer to complete the merge. An OSD restart + // would clear it up. This is a hard race to resolve, + // extraordinarily rare (we only merge PGs that are stable and + // clean, so it'd have to be an imported PG to an OSD with a + // slightly stale OSDMap...), so I'm ignoring it for now. We plan to + // replace all of this with a seastar-based code soon anyway. + if (!merge_pgs.empty()) { + // mark the pgs we already have, or create new and empty merge + // participants for those we are missing. do this all under the + // shard lock so we don't have to worry about racing pg creates + // via _process. + for (auto& shard : shards) { + shard->prime_merges(osdmap, &merge_pgs); + } + ceph_assert(merge_pgs.empty()); + } + + service.prune_pg_created(); + + unsigned pushes_to_free = 0; + for (auto& shard : shards) { + shard->consume_map(osdmap, &pushes_to_free); + } + + vector pgids; + _get_pgids(&pgids); + + // count (FIXME, probably during seastar rewrite) + int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0; + vector pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + // FIXME (probably during seastar rewrite): this is lockless and + // racy, but we don't want to take pg lock here. 
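+ // These tallies only feed the l_osd_pg_primary/replica/stray perf
+ // counters set at the end of consume_map(), so a transiently stale
+ // role read here is harmless.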
+ if (pg->is_primary()) + num_pg_primary++; + else if (pg->is_nonprimary()) + num_pg_replica++; // misnomer + else + num_pg_stray++; + } + + { + // FIXME (as part of seastar rewrite): move to OSDShard + std::lock_guard l(pending_creates_lock); + for (auto pg = pending_creates_from_osd.begin(); + pg != pending_creates_from_osd.end();) { + if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) { + dout(10) << __func__ << " pg " << pg->first << " doesn't map here, " + << "discarding pending_create_from_osd" << dendl; + pg = pending_creates_from_osd.erase(pg); + } else { + ++pg; + } + } + } + + service.maybe_inject_dispatch_delay(); + + dispatch_sessions_waiting_on_map(); + + service.maybe_inject_dispatch_delay(); + + service.release_reserved_pushes(pushes_to_free); + + // queue null events to push maps down to individual PGs + for (auto pgid : pgids) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + osdmap->get_epoch(), + osdmap->get_epoch(), + NullEvt()))); + } + logger->set(l_osd_pg, pgids.size()); + logger->set(l_osd_pg_primary, num_pg_primary); + logger->set(l_osd_pg_replica, num_pg_replica); + logger->set(l_osd_pg_stray, num_pg_stray); +} + +void OSD::activate_map() +{ + ceph_assert(ceph_mutex_is_locked(osd_lock)); + auto osdmap = get_osdmap(); + + dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; + + // norecover? + if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) { + if (!service.recovery_is_paused()) { + dout(1) << "pausing recovery (NORECOVER flag set)" << dendl; + service.pause_recovery(); + } + } else { + if (service.recovery_is_paused()) { + dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl; + service.unpause_recovery(); + } + } + + service.activate_map(); + + // process waiters + take_waiters(waiting_for_osdmap); +} + +bool OSD::require_mon_peer(const Message *m) +{ + if (!m->get_connection()->peer_is_mon()) { + dout(0) << "require_mon_peer received from non-mon " + << m->get_connection()->get_peer_addr() + << " " << *m << dendl; + return false; + } + return true; +} + +bool OSD::require_mon_or_mgr_peer(const Message *m) +{ + if (!m->get_connection()->peer_is_mon() && + !m->get_connection()->peer_is_mgr()) { + dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr " + << m->get_connection()->get_peer_addr() + << " " << *m << dendl; + return false; + } + return true; +} + +bool OSD::require_osd_peer(const Message *m) +{ + if (!m->get_connection()->peer_is_osd()) { + dout(0) << "require_osd_peer received from non-osd " + << m->get_connection()->get_peer_addr() + << " " << *m << dendl; + return false; + } + return true; +} + +bool OSD::require_self_aliveness(const Message *m, epoch_t epoch) +{ + epoch_t up_epoch = service.get_up_epoch(); + if (epoch < up_epoch) { + dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl; + return false; + } + + if (!is_active()) { + dout(7) << "still in boot state, dropping message " << *m << dendl; + return false; + } + + return true; +} + +bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map, + bool is_fast_dispatch) +{ + int from = m->get_source().num(); + + if (map->is_down(from) || + (map->get_cluster_addrs(from) != m->get_source_addrs())) { + dout(5) << "from dead osd." << from << ", marking down, " + << " msg was " << m->get_source_inst().addr + << " expected " + << (map->is_up(from) ? 
+ map->get_cluster_addrs(from) : entity_addrvec_t()) + << dendl; + ConnectionRef con = m->get_connection(); + con->mark_down(); + if (auto s = ceph::ref_cast(con->get_priv()); s) { + if (!is_fast_dispatch) + s->session_dispatch_lock.lock(); + clear_session_waiting_on_map(s); + con->set_priv(nullptr); // break ref <-> session cycle, if any + s->con.reset(); + if (!is_fast_dispatch) + s->session_dispatch_lock.unlock(); + } + return false; + } + return true; +} + + +/* + * require that we have same (or newer) map, and that + * the source is the pg primary. + */ +bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch, + bool is_fast_dispatch) +{ + const Message *m = op->get_req(); + const auto osdmap = get_osdmap(); + dout(15) << "require_same_or_newer_map " << epoch + << " (i am " << osdmap->get_epoch() << ") " << m << dendl; + + ceph_assert(ceph_mutex_is_locked(osd_lock)); + + // do they have a newer map? + if (epoch > osdmap->get_epoch()) { + dout(7) << "waiting for newer map epoch " << epoch + << " > my " << osdmap->get_epoch() << " with " << m << dendl; + wait_for_new_map(op); + return false; + } + + if (!require_self_aliveness(op->get_req(), epoch)) { + return false; + } + + // ok, our map is same or newer.. do they still exist? + if (m->get_connection()->get_messenger() == cluster_messenger && + !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) { + return false; + } + + return true; +} + + + + + +// ---------------------------------------- +// pg creation + +void OSD::split_pgs( + PG *parent, + const set &childpgids, set *out_pgs, + OSDMapRef curmap, + OSDMapRef nextmap, + PeeringCtx &rctx) +{ + unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool()); + parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num)); + + vector updated_stats; + parent->start_split_stats(childpgids, &updated_stats); + + vector::iterator stat_iter = updated_stats.begin(); + for (set::const_iterator i = childpgids.begin(); + i != childpgids.end(); + ++i, ++stat_iter) { + ceph_assert(stat_iter != updated_stats.end()); + dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl; + PG* child = _make_pg(nextmap, *i); + child->lock(true); + out_pgs->insert(child); + child->ch = store->create_new_collection(child->coll); + + { + uint32_t shard_index = i->hash_to_shard(shards.size()); + assert(NULL != shards[shard_index]); + store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue)); + } + + unsigned split_bits = i->get_split_bits(pg_num); + dout(10) << " pg_num is " << pg_num + << ", m_seed " << i->ps() + << ", split_bits is " << split_bits << dendl; + parent->split_colls( + *i, + split_bits, + i->ps(), + &child->get_pool().info, + rctx.transaction); + parent->split_into( + i->pgid, + child, + split_bits); + + child->init_collection_pool_opts(); + + child->finish_split_stats(*stat_iter, rctx.transaction); + child->unlock(); + } + ceph_assert(stat_iter != updated_stats.end()); + parent->finish_split_stats(*stat_iter, rctx.transaction); +} + +/* + * holding osd_lock + */ +void OSD::handle_pg_create(OpRequestRef op) +{ + // NOTE: this can be removed in P release (mimic is the last version to + // send MOSDPGCreate messages). 
+ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_CREATE); + + dout(10) << "handle_pg_create " << *m << dendl; + + if (!require_mon_peer(op->get_req())) { + return; + } + + if (!require_same_or_newer_map(op, m->epoch, false)) + return; + + op->mark_started(); + + const auto osdmap = get_osdmap(); + map::const_iterator ci = m->ctimes.begin(); + for (map::const_iterator p = m->mkpg.begin(); + p != m->mkpg.end(); + ++p, ++ci) { + ceph_assert(ci != m->ctimes.end() && ci->first == p->first); + epoch_t created = p->second.created; + if (p->second.split_bits) // Skip split pgs + continue; + pg_t on = p->first; + + if (!osdmap->have_pg_pool(on.pool())) { + dout(20) << "ignoring pg on deleted pool " << on << dendl; + continue; + } + + dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl; + + spg_t pgid; + bool mapped = osdmap->get_primary_shard(on, &pgid); + ceph_assert(mapped); + + // is it still ours? + vector up, acting; + int up_primary = -1; + int acting_primary = -1; + osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary); + int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting); + + if (acting_primary != whoami) { + dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary + << "), my role=" << role << ", skipping" << dendl; + continue; + } + + + PastIntervals pi; + pg_history_t history; + build_initial_pg_history(pgid, created, ci->second, &history, &pi); + + // The mon won't resend unless the primary changed, so we ignore + // same_interval_since. We'll pass this history with the current + // epoch as the event. + if (history.same_primary_since > m->epoch) { + dout(10) << __func__ << ": got obsolete pg create on pgid " + << pgid << " from epoch " << m->epoch + << ", primary changed in " << history.same_primary_since + << dendl; + continue; + } + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + osdmap->get_epoch(), + osdmap->get_epoch(), + NullEvt(), + true, + new PGCreateInfo( + pgid, + osdmap->get_epoch(), + history, + pi, + true) + ))); + } + + { + std::lock_guard l(pending_creates_lock); + if (pending_creates_from_mon == 0) { + last_pg_create_epoch = m->epoch; + } + } + + maybe_update_heartbeat_peers(); +} + + +// ---------------------------------------- +// peering and recovery + +PeeringCtx OSD::create_context() +{ + return PeeringCtx(get_osdmap()->require_osd_release); +} + +void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap, + ThreadPool::TPHandle *handle) +{ + if (!service.get_osdmap()->is_up(whoami)) { + dout(20) << __func__ << " not up in osdmap" << dendl; + } else if (!is_active()) { + dout(20) << __func__ << " not active" << dendl; + } else { + for (auto& [osd, ls] : ctx.message_map) { + if (!curmap->is_up(osd)) { + dout(20) << __func__ << " skipping down osd." << osd << dendl; + continue; + } + ConnectionRef con = service.get_con_osd_cluster( + osd, curmap->get_epoch()); + if (!con) { + dout(20) << __func__ << " skipping osd." 
<< osd << " (NULL con)" + << dendl; + continue; + } + service.maybe_share_map(con.get(), curmap); + for (auto m : ls) { + con->send_message2(m); + } + ls.clear(); + } + } + if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) { + int tr = store->queue_transaction( + pg->ch, + std::move(ctx.transaction), TrackedOpRef(), + handle); + ceph_assert(tr == 0); + } +} + +void OSD::handle_fast_pg_create(MOSDPGCreate2 *m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_mon_peer(m)) { + m->put(); + return; + } + for (auto& p : m->pgs) { + spg_t pgid = p.first; + epoch_t created = p.second.first; + utime_t created_stamp = p.second.second; + auto q = m->pg_extra.find(pgid); + if (q == m->pg_extra.end()) { + dout(20) << __func__ << " " << pgid << " e" << created + << "@" << created_stamp + << " (no history or past_intervals)" << dendl; + // pre-octopus ... no pg history. this can be removed in Q release. + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + m->epoch, + m->epoch, + NullEvt(), + true, + new PGCreateInfo( + pgid, + created, + pg_history_t(created, created_stamp), + PastIntervals(), + true) + ))); + } else { + dout(20) << __func__ << " " << pgid << " e" << created + << "@" << created_stamp + << " history " << q->second.first + << " pi " << q->second.second << dendl; + if (!q->second.second.empty() && + m->epoch < q->second.second.get_bounds().second) { + clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch + << " and unmatched past_intervals " << q->second.second + << " (history " << q->second.first << ")"; + } else { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + m->epoch, + m->epoch, + NullEvt(), + true, + new PGCreateInfo( + pgid, + m->epoch, + q->second.first, + q->second.second, + true) + ))); + } + } + } + + { + std::lock_guard l(pending_creates_lock); + if (pending_creates_from_mon == 0) { + last_pg_create_epoch = m->epoch; + } + } + + m->put(); +} + +void OSD::handle_fast_pg_query(MOSDPGQuery *m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + int from = m->get_source().num(); + for (auto& p : m->pg_list) { + enqueue_peering_evt( + p.first, + PGPeeringEventRef( + std::make_shared( + p.second.epoch_sent, p.second.epoch_sent, + MQuery( + p.first, + pg_shard_t(from, p.second.from), + p.second, + p.second.epoch_sent), + false)) + ); + } + m->put(); +} + +void OSD::handle_fast_pg_notify(MOSDPGNotify* m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + int from = m->get_source().num(); + for (auto& p : m->get_pg_list()) { + spg_t pgid(p.info.pgid.pgid, p.to); + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + p.epoch_sent, + p.query_epoch, + MNotifyRec( + pgid, pg_shard_t(from, p.from), + p, + m->get_connection()->get_features()), + true, + new PGCreateInfo( + pgid, + p.query_epoch, + p.info.history, + p.past_intervals, + false) + ))); + } + m->put(); +} + +void OSD::handle_fast_pg_info(MOSDPGInfo* m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + int from = m->get_source().num(); + for (auto& p : m->pg_list) { + enqueue_peering_evt( + spg_t(p.info.pgid.pgid, p.to), + PGPeeringEventRef( + std::make_shared( + p.epoch_sent, p.query_epoch, + MInfoRec( + pg_shard_t(from, p.from), + 
p.info, + p.epoch_sent))) + ); + } + m->put(); +} + +void OSD::handle_fast_pg_remove(MOSDPGRemove *m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + for (auto& pgid : m->pg_list) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + m->get_epoch(), m->get_epoch(), + PeeringState::DeleteStart()))); + } + m->put(); +} + +void OSD::handle_fast_force_recovery(MOSDForceRecovery *m) +{ + dout(10) << __func__ << " " << *m << dendl; + if (!require_mon_or_mgr_peer(m)) { + m->put(); + return; + } + epoch_t epoch = get_osdmap_epoch(); + for (auto pgid : m->forced_pgs) { + if (m->options & OFR_BACKFILL) { + if (m->options & OFR_CANCEL) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + epoch, epoch, + PeeringState::UnsetForceBackfill()))); + } else { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + epoch, epoch, + PeeringState::SetForceBackfill()))); + } + } else if (m->options & OFR_RECOVERY) { + if (m->options & OFR_CANCEL) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + epoch, epoch, + PeeringState::UnsetForceRecovery()))); + } else { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared( + epoch, epoch, + PeeringState::SetForceRecovery()))); + } + } + } + m->put(); +} + +void OSD::handle_pg_query_nopg(const MQuery& q) +{ + spg_t pgid = q.pgid; + dout(10) << __func__ << " " << pgid << dendl; + + OSDMapRef osdmap = get_osdmap(); + if (!osdmap->have_pg_pool(pgid.pool())) + return; + + dout(10) << " pg " << pgid << " dne" << dendl; + pg_info_t empty(spg_t(pgid.pgid, q.query.to)); + ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch()); + if (con) { + Message *m; + if (q.query.type == pg_query_t::LOG || + q.query.type == pg_query_t::FULLLOG) { + m = new MOSDPGLog( + q.query.from, q.query.to, + osdmap->get_epoch(), empty, + q.query.epoch_sent); + } else { + vector ls; + ls.push_back( + pg_notify_t( + q.query.from, q.query.to, + q.query.epoch_sent, + osdmap->get_epoch(), + empty, + PastIntervals())); + m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls)); + } + service.maybe_share_map(con.get(), osdmap); + con->send_message(m); + } +} + +void OSDService::queue_check_readable(spg_t spgid, + epoch_t lpr, + ceph::signedspan delay) +{ + if (delay == ceph::signedspan::zero()) { + osd->enqueue_peering_evt( + spgid, + PGPeeringEventRef( + std::make_shared( + lpr, lpr, + PeeringState::CheckReadable()))); + } else { + mono_timer.add_event( + delay, + [this, spgid, lpr]() { + queue_check_readable(spgid, lpr); + }); + } +} + + +// ========================================================= +// RECOVERY + +void OSDService::_maybe_queue_recovery() { + ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock)); + uint64_t available_pushes; + while (!awaiting_throttle.empty() && + _recover_now(&available_pushes)) { + uint64_t to_start = std::min( + available_pushes, + cct->_conf->osd_recovery_max_single_start); + _queue_for_recovery(awaiting_throttle.front(), to_start); + awaiting_throttle.pop_front(); + dout(10) << __func__ << " starting " << to_start + << ", recovery_ops_reserved " << recovery_ops_reserved + << " -> " << (recovery_ops_reserved + to_start) << dendl; + recovery_ops_reserved += to_start; + } +} + +bool OSDService::_recover_now(uint64_t *available_pushes) +{ + if (available_pushes) + *available_pushes = 0; + + if (ceph_clock_now() < defer_recovery_until) { + dout(15) << 
__func__ << " defer until " << defer_recovery_until << dendl; + return false; + } + + if (recovery_paused) { + dout(15) << __func__ << " paused" << dendl; + return false; + } + + uint64_t max = osd->get_recovery_max_active(); + if (max <= recovery_ops_active + recovery_ops_reserved) { + dout(15) << __func__ << " active " << recovery_ops_active + << " + reserved " << recovery_ops_reserved + << " >= max " << max << dendl; + return false; + } + + if (available_pushes) + *available_pushes = max - recovery_ops_active - recovery_ops_reserved; + + return true; +} + +unsigned OSDService::get_target_pg_log_entries() const +{ + auto num_pgs = osd->get_num_pgs(); + auto target = cct->_conf->osd_target_pg_log_entries_per_osd; + if (num_pgs > 0 && target > 0) { + // target an even spread of our budgeted log entries across all + // PGs. note that while we only get to control the entry count + // for primary PGs, we'll normally be responsible for a mix of + // primary and replica PGs (for the same pool(s) even), so this + // will work out. + return std::max( + std::min(target / num_pgs, + cct->_conf->osd_max_pg_log_entries), + cct->_conf->osd_min_pg_log_entries); + } else { + // fall back to a per-pg value. + return cct->_conf->osd_min_pg_log_entries; + } +} + +void OSD::do_recovery( + PG *pg, epoch_t queued, uint64_t reserved_pushes, + ThreadPool::TPHandle &handle) +{ + uint64_t started = 0; + + /* + * When the value of osd_recovery_sleep is set greater than zero, recovery + * ops are scheduled after osd_recovery_sleep amount of time from the previous + * recovery event's schedule time. This is done by adding a + * recovery_requeue_callback event, which re-queues the recovery op using + * queue_recovery_after_sleep. + */ + float recovery_sleep = get_osd_recovery_sleep(); + { + std::lock_guard l(service.sleep_lock); + if (recovery_sleep > 0 && service.recovery_needs_sleep) { + PGRef pgref(pg); + auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) { + dout(20) << "do_recovery wake up at " + << ceph_clock_now() + << ", re-queuing recovery" << dendl; + std::lock_guard l(service.sleep_lock); + service.recovery_needs_sleep = false; + service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes); + }); + + // This is true for the first recovery op and when the previous recovery op + // has been scheduled in the past. The next recovery op is scheduled after + // completing the sleep from now. 
+ + if (auto now = ceph::real_clock::now(); + service.recovery_schedule_time < now) { + service.recovery_schedule_time = now; + } + service.recovery_schedule_time += ceph::make_timespan(recovery_sleep); + service.sleep_timer.add_event_at(service.recovery_schedule_time, + recovery_requeue_callback); + dout(20) << "Recovery event scheduled at " + << service.recovery_schedule_time << dendl; + return; + } + } + + { + { + std::lock_guard l(service.sleep_lock); + service.recovery_needs_sleep = true; + } + + if (pg->pg_has_reset_since(queued)) { + goto out; + } + + dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl; +#ifdef DEBUG_RECOVERY_OIDS + dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl; +#endif + + bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started); + dout(10) << "do_recovery started " << started << "/" << reserved_pushes + << " on " << *pg << dendl; + + if (do_unfound) { + PeeringCtx rctx = create_context(); + rctx.handle = &handle; + pg->find_unfound(queued, rctx); + dispatch_context(rctx, pg, pg->get_osdmap()); + } + } + + out: + ceph_assert(started <= reserved_pushes); + service.release_reserved_pushes(reserved_pushes); +} + +void OSDService::start_recovery_op(PG *pg, const hobject_t& soid) +{ + std::lock_guard l(recovery_lock); + dout(10) << "start_recovery_op " << *pg << " " << soid + << " (" << recovery_ops_active << "/" + << osd->get_recovery_max_active() << " rops)" + << dendl; + recovery_ops_active++; + +#ifdef DEBUG_RECOVERY_OIDS + dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl; + ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0); + recovery_oids[pg->pg_id].insert(soid); +#endif +} + +void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue) +{ + std::lock_guard l(recovery_lock); + dout(10) << "finish_recovery_op " << *pg << " " << soid + << " dequeue=" << dequeue + << " (" << recovery_ops_active << "/" + << osd->get_recovery_max_active() << " rops)" + << dendl; + + // adjust count + ceph_assert(recovery_ops_active > 0); + recovery_ops_active--; + +#ifdef DEBUG_RECOVERY_OIDS + dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl; + ceph_assert(recovery_oids[pg->pg_id].count(soid)); + recovery_oids[pg->pg_id].erase(soid); +#endif + + _maybe_queue_recovery(); +} + +bool OSDService::is_recovery_active() +{ + if (cct->_conf->osd_debug_pretend_recovery_active) { + return true; + } + return local_reserver.has_reservation() || remote_reserver.has_reservation(); +} + +void OSDService::release_reserved_pushes(uint64_t pushes) +{ + std::lock_guard l(recovery_lock); + dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved " + << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes) + << dendl; + ceph_assert(recovery_ops_reserved >= pushes); + recovery_ops_reserved -= pushes; + _maybe_queue_recovery(); +} + +// ========================================================= +// OPS + +bool OSD::op_is_discardable(const MOSDOp *op) +{ + // drop client request if they are not connected and can't get the + // reply anyway. 
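+ // (Clients are expected to resend outstanding ops when they reconnect,
+ // so discarding here only avoids work whose reply could never be
+ // delivered; it does not lose the operation.)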
+ if (!op->get_connection()->is_connected()) { + return true; + } + return false; +} + +void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch) +{ + const utime_t stamp = op->get_req()->get_recv_stamp(); + const utime_t latency = ceph_clock_now() - stamp; + const unsigned priority = op->get_req()->get_priority(); + const int cost = op->get_req()->get_cost(); + const uint64_t owner = op->get_req()->get_source().num(); + const int type = op->get_req()->get_type(); + + dout(15) << "enqueue_op " << op << " prio " << priority + << " type " << type + << " cost " << cost + << " latency " << latency + << " epoch " << epoch + << " " << *(op->get_req()) << dendl; + op->osd_trace.event("enqueue op"); + op->osd_trace.keyval("priority", priority); + op->osd_trace.keyval("cost", cost); +#ifdef HAVE_JAEGER + if (op->osd_parent_span) { + auto enqueue_span = jaeger_tracing::child_span(__func__, op->osd_parent_span); + enqueue_span->Log({ + {"priority", priority}, + {"cost", cost}, + {"epoch", epoch}, + {"owner", owner}, + {"type", type} + }); + } +#endif + op->mark_queued_for_pg(); + logger->tinc(l_osd_op_before_queue_op_lat, latency); + if (type == MSG_OSD_PG_PUSH || + type == MSG_OSD_PG_PUSH_REPLY) { + op_shardedwq.queue( + OpSchedulerItem( + unique_ptr(new PGRecoveryMsg(pg, std::move(op))), + cost, priority, stamp, owner, epoch)); + } else { + op_shardedwq.queue( + OpSchedulerItem( + unique_ptr(new PGOpItem(pg, std::move(op))), + cost, priority, stamp, owner, epoch)); + } +} + +void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt) +{ + dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl; + op_shardedwq.queue( + OpSchedulerItem( + unique_ptr(new PGPeeringItem(pgid, evt)), + 10, + cct->_conf->osd_peering_op_priority, + utime_t(), + 0, + evt->get_epoch_sent())); +} + +/* + * NOTE: dequeue called in worker thread, with pg lock + */ +void OSD::dequeue_op( + PGRef pg, OpRequestRef op, + ThreadPool::TPHandle &handle) +{ + const Message *m = op->get_req(); + + FUNCTRACE(cct); + OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false); + + utime_t now = ceph_clock_now(); + op->set_dequeued_time(now); + + utime_t latency = now - m->get_recv_stamp(); + dout(10) << "dequeue_op " << op << " prio " << m->get_priority() + << " cost " << m->get_cost() + << " latency " << latency + << " " << *m + << " pg " << *pg << dendl; + + logger->tinc(l_osd_op_before_dequeue_op_lat, latency); + + service.maybe_share_map(m->get_connection().get(), + pg->get_osdmap(), + op->sent_epoch); + + if (pg->is_deleting()) + return; + + op->mark_reached_pg(); + op->osd_trace.event("dequeue_op"); + + pg->do_request(op, handle); + + // finish + dout(10) << "dequeue_op " << op << " finish" << dendl; + OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false); +} + + +void OSD::dequeue_peering_evt( + OSDShard *sdata, + PG *pg, + PGPeeringEventRef evt, + ThreadPool::TPHandle& handle) +{ + PeeringCtx rctx = create_context(); + auto curmap = sdata->get_osdmap(); + bool need_up_thru = false; + epoch_t same_interval_since = 0; + if (!pg) { + if (const MQuery *q = dynamic_cast(evt->evt.get())) { + handle_pg_query_nopg(*q); + } else { + derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl; + ceph_abort(); + } + } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) { + pg->do_peering_event(evt, rctx); + if (pg->is_deleted()) { + pg->unlock(); + return; + } + dispatch_context(rctx, pg, curmap, &handle); + need_up_thru = pg->get_need_up_thru(); + same_interval_since = 
pg->get_same_interval_since(); + pg->unlock(); + } + + if (need_up_thru) { + queue_want_up_thru(same_interval_since); + } + + service.send_pg_temp(); +} + +void OSD::dequeue_delete( + OSDShard *sdata, + PG *pg, + epoch_t e, + ThreadPool::TPHandle& handle) +{ + dequeue_peering_evt( + sdata, + pg, + PGPeeringEventRef( + std::make_shared( + e, e, + PeeringState::DeleteSome())), + handle); +} + + + +// -------------------------------- + +const char** OSD::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_max_backfills", + "osd_min_recovery_priority", + "osd_max_trimming_pgs", + "osd_op_complaint_time", + "osd_op_log_threshold", + "osd_op_history_size", + "osd_op_history_duration", + "osd_op_history_slow_op_size", + "osd_op_history_slow_op_threshold", + "osd_enable_op_tracker", + "osd_map_cache_size", + "osd_pg_epoch_max_lag_factor", + "osd_pg_epoch_persisted_max_stale", + "osd_recovery_sleep", + "osd_recovery_sleep_hdd", + "osd_recovery_sleep_ssd", + "osd_recovery_sleep_hybrid", + "osd_delete_sleep", + "osd_delete_sleep_hdd", + "osd_delete_sleep_ssd", + "osd_delete_sleep_hybrid", + "osd_snap_trim_sleep", + "osd_snap_trim_sleep_hdd", + "osd_snap_trim_sleep_ssd", + "osd_snap_trim_sleep_hybrid" + "osd_scrub_sleep", + "osd_recovery_max_active", + "osd_recovery_max_active_hdd", + "osd_recovery_max_active_ssd", + // clog & admin clog + "clog_to_monitors", + "clog_to_syslog", + "clog_to_syslog_facility", + "clog_to_syslog_level", + "osd_objectstore_fuse", + "clog_to_graylog", + "clog_to_graylog_host", + "clog_to_graylog_port", + "host", + "fsid", + "osd_recovery_delay_start", + "osd_client_message_size_cap", + "osd_client_message_cap", + "osd_heartbeat_min_size", + "osd_heartbeat_interval", + "osd_object_clean_region_max_num_intervals", + "osd_scrub_min_interval", + "osd_scrub_max_interval", + NULL + }; + return KEYS; +} + +void OSD::handle_conf_change(const ConfigProxy& conf, + const std::set &changed) +{ + std::lock_guard l{osd_lock}; + + if (changed.count("osd_max_backfills") || + changed.count("osd_delete_sleep") || + changed.count("osd_delete_sleep_hdd") || + changed.count("osd_delete_sleep_ssd") || + changed.count("osd_delete_sleep_hybrid") || + changed.count("osd_snap_trim_sleep") || + changed.count("osd_snap_trim_sleep_hdd") || + changed.count("osd_snap_trim_sleep_ssd") || + changed.count("osd_snap_trim_sleep_hybrid") || + changed.count("osd_scrub_sleep") || + changed.count("osd_recovery_sleep") || + changed.count("osd_recovery_sleep_hdd") || + changed.count("osd_recovery_sleep_ssd") || + changed.count("osd_recovery_sleep_hybrid") || + changed.count("osd_recovery_max_active") || + changed.count("osd_recovery_max_active_hdd") || + changed.count("osd_recovery_max_active_ssd")) { + if (!maybe_override_options_for_qos() && + changed.count("osd_max_backfills")) { + // Scheduler is not "mclock". 
Fallback to earlier behavior + service.local_reserver.set_max(cct->_conf->osd_max_backfills); + service.remote_reserver.set_max(cct->_conf->osd_max_backfills); + } + } + if (changed.count("osd_min_recovery_priority")) { + service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority); + service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority); + } + if (changed.count("osd_max_trimming_pgs")) { + service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs); + } + if (changed.count("osd_op_complaint_time") || + changed.count("osd_op_log_threshold")) { + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + } + if (changed.count("osd_op_history_size") || + changed.count("osd_op_history_duration")) { + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); + } + if (changed.count("osd_op_history_slow_op_size") || + changed.count("osd_op_history_slow_op_threshold")) { + op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size, + cct->_conf->osd_op_history_slow_op_threshold); + } + if (changed.count("osd_enable_op_tracker")) { + op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker); + } + if (changed.count("osd_map_cache_size")) { + service.map_cache.set_size(cct->_conf->osd_map_cache_size); + service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size); + service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size); + } + if (changed.count("clog_to_monitors") || + changed.count("clog_to_syslog") || + changed.count("clog_to_syslog_level") || + changed.count("clog_to_syslog_facility") || + changed.count("clog_to_graylog") || + changed.count("clog_to_graylog_host") || + changed.count("clog_to_graylog_port") || + changed.count("host") || + changed.count("fsid")) { + update_log_config(); + } + if (changed.count("osd_pg_epoch_max_lag_factor")) { + m_osd_pg_epoch_max_lag_factor = conf.get_val( + "osd_pg_epoch_max_lag_factor"); + } + +#ifdef HAVE_LIBFUSE + if (changed.count("osd_objectstore_fuse")) { + if (store) { + enable_disable_fuse(false); + } + } +#endif + + if (changed.count("osd_recovery_delay_start")) { + service.defer_recovery(cct->_conf->osd_recovery_delay_start); + service.kick_recovery_queue(); + } + + if (changed.count("osd_client_message_cap")) { + uint64_t newval = cct->_conf->osd_client_message_cap; + Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT); + if (pol.throttler_messages && newval > 0) { + pol.throttler_messages->reset_max(newval); + } + } + if (changed.count("osd_client_message_size_cap")) { + uint64_t newval = cct->_conf->osd_client_message_size_cap; + Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT); + if (pol.throttler_bytes && newval > 0) { + pol.throttler_bytes->reset_max(newval); + } + } + if (changed.count("osd_object_clean_region_max_num_intervals")) { + ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals); + } + + if (changed.count("osd_scrub_min_interval") || + changed.count("osd_scrub_max_interval")) { + resched_all_scrubs(); + dout(0) << __func__ << ": scrub interval change" << dendl; + } + check_config(); + if (changed.count("osd_asio_thread_count")) { + service.poolctx.stop(); + service.poolctx.start(conf.get_val("osd_asio_thread_count")); + } +} + +void OSD::maybe_override_max_osd_capacity_for_qos() +{ + // If the scheduler enabled is mclock, override the 
default + // osd capacity with the value obtained from running the + // osd bench test. This is later used to setup mclock. + if ((cct->_conf.get_val("osd_op_queue") == "mclock_scheduler") && + (cct->_conf.get_val("osd_mclock_skip_benchmark") == false)) { + std::string max_capacity_iops_config; + bool force_run_benchmark = + cct->_conf.get_val("osd_mclock_force_run_benchmark_on_init"); + + if (store_is_rotational) { + max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd"; + } else { + max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd"; + } + + if (!force_run_benchmark) { + double default_iops = 0.0; + + // Get the current osd iops capacity + double cur_iops = cct->_conf.get_val(max_capacity_iops_config); + + // Get the default max iops capacity + auto val = cct->_conf.get_val_default(max_capacity_iops_config); + if (!val.has_value()) { + derr << __func__ << " Unable to determine default value of " + << max_capacity_iops_config << dendl; + // Cannot determine default iops. Force a run of the OSD benchmark. + force_run_benchmark = true; + } else { + // Default iops + default_iops = std::stod(val.value()); + } + + // Determine if we really need to run the osd benchmark + if (!force_run_benchmark && (default_iops != cur_iops)) { + dout(1) << __func__ << std::fixed << std::setprecision(2) + << " default_iops: " << default_iops + << " cur_iops: " << cur_iops + << ". Skip OSD benchmark test." << dendl; + return; + } + } + + // Run osd bench: write 100 4MiB objects with blocksize 4KiB + int64_t count = 12288000; // Count of bytes to write + int64_t bsize = 4096; // Block size + int64_t osize = 4194304; // Object size + int64_t onum = 100; // Count of objects to write + double elapsed = 0.0; // Time taken to complete the test + double iops = 0.0; + stringstream ss; + int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss); + if (ret != 0) { + derr << __func__ + << " osd bench err: " << ret + << " osd bench errstr: " << ss.str() + << dendl; + return; + } + + double rate = count / elapsed; + iops = rate / bsize; + dout(1) << __func__ + << " osd bench result -" + << std::fixed << std::setprecision(3) + << " bandwidth (MiB/sec): " << rate / (1024 * 1024) + << " iops: " << iops + << " elapsed_sec: " << elapsed + << dendl; + + // Persist iops to the MON store + ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops)); + if (ret < 0) { + // Fallback to setting the config within the in-memory "values" map. + cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops)); + } + + // Override the max osd capacity for all shards + for (auto& shard : shards) { + shard->update_scheduler_config(); + } + } +} + +bool OSD::maybe_override_options_for_qos() +{ + // If the scheduler enabled is mclock, override the recovery, backfill + // and sleep options so that mclock can meet the QoS goals. 
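+ // In effect: with mclock the scheduler itself apportions capacity
+ // between client and background work, so the legacy per-type limits
+ // are raised to effectively-unlimited values (1000) and the various
+ // *_sleep throttles are zeroed, leaving QoS entirely to mclock.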
+ if (cct->_conf.get_val("osd_op_queue") == "mclock_scheduler") { + dout(1) << __func__ + << ": Changing recovery/backfill/sleep settings for QoS" << dendl; + + // Set high value for recovery max active + uint32_t rec_max_active = 1000; + cct->_conf.set_val( + "osd_recovery_max_active", std::to_string(rec_max_active)); + cct->_conf.set_val( + "osd_recovery_max_active_hdd", std::to_string(rec_max_active)); + cct->_conf.set_val( + "osd_recovery_max_active_ssd", std::to_string(rec_max_active)); + + // Set high value for osd_max_backfill + uint32_t max_backfills = 1000; + cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills)); + service.local_reserver.set_max(max_backfills); + service.remote_reserver.set_max(max_backfills); + + // Disable recovery sleep + cct->_conf.set_val("osd_recovery_sleep", std::to_string(0)); + cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0)); + cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0)); + cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0)); + + // Disable delete sleep + cct->_conf.set_val("osd_delete_sleep", std::to_string(0)); + cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0)); + cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0)); + cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0)); + + // Disable snap trim sleep + cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0)); + cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0)); + cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0)); + cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0)); + + // Disable scrub sleep + cct->_conf.set_val("osd_scrub_sleep", std::to_string(0)); + return true; + } + return false; +} + +int OSD::mon_cmd_set_config(const std::string &key, const std::string &val) +{ + std::string cmd = + "{" + "\"prefix\": \"config set\", " + "\"who\": \"osd." 
+ std::to_string(whoami) + "\", " + "\"name\": \"" + key + "\", " + "\"value\": \"" + val + "\"" + "}"; + + vector vcmd{cmd}; + bufferlist inbl; + std::string outs; + C_SaferCond cond; + monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond); + int r = cond.wait(); + if (r < 0) { + derr << __func__ << " Failed to set config key " << key + << " err: " << cpp_strerror(r) + << " errstr: " << outs << dendl; + return r; + } + + return 0; +} + +void OSD::update_log_config() +{ + map log_to_monitors; + map log_to_syslog; + map log_channel; + map log_prio; + map log_to_graylog; + map log_to_graylog_host; + map log_to_graylog_port; + uuid_d fsid; + string host; + + if (parse_log_client_options(cct, log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host) == 0) + clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + derr << "log_to_monitors " << log_to_monitors << dendl; +} + +void OSD::check_config() +{ + // some sanity checks + if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) { + clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")" + << " is not > osd_pg_epoch_persisted_max_stale (" + << cct->_conf->osd_pg_epoch_persisted_max_stale << ")"; + } + if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) { + clog->warn() << "osd_object_clean_region_max_num_intervals (" + << cct->_conf->osd_object_clean_region_max_num_intervals + << ") is < 0"; + } +} + +// -------------------------------- + +void OSD::get_latest_osdmap() +{ + dout(10) << __func__ << " -- start" << dendl; + + boost::system::error_code ec; + service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]); + + dout(10) << __func__ << " -- finish" << dendl; +} + +// -------------------------------- + +void OSD::set_perf_queries(const ConfigPayload &config_payload) { + const OSDConfigPayload &osd_config_payload = boost::get(config_payload); + const std::map &queries = osd_config_payload.config; + dout(10) << "setting " << queries.size() << " queries" << dendl; + + std::list supported_queries; + for (auto &it : queries) { + auto &query = it.first; + if (!query.key_descriptor.empty()) { + supported_queries.push_back(query); + } + } + if (supported_queries.size() < queries.size()) { + dout(1) << queries.size() - supported_queries.size() + << " unsupported queries" << dendl; + } + { + std::lock_guard locker{m_perf_queries_lock}; + m_perf_queries = supported_queries; + m_perf_limits = queries; + } + std::vector pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + std::scoped_lock l{*pg}; + pg->set_dynamic_perf_stats_queries(supported_queries); + } +} + +MetricPayload OSD::get_perf_reports() { + OSDMetricPayload payload; + std::map &reports = payload.report; + + std::vector pgs; + _get_pgs(&pgs); + DynamicPerfStats dps; + for (auto& pg : pgs) { + // m_perf_queries can be modified only in set_perf_queries by mgr client + // request, and it is protected by by mgr client's lock, which is held + // when set_perf_queries/get_perf_reports are called, so we may not hold + // m_perf_queries_lock here. 
+ DynamicPerfStats pg_dps(m_perf_queries); + pg->lock(); + pg->get_dynamic_perf_stats(&pg_dps); + pg->unlock(); + dps.merge(pg_dps); + } + dps.add_to_reports(m_perf_limits, &reports); + dout(20) << "reports for " << reports.size() << " queries" << dendl; + + return payload; +} + +// ============================================================= + +#undef dout_context +#define dout_context cct +#undef dout_prefix +#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " " + +void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg) +{ + dout(10) << pg->pg_id << " " << pg << dendl; + slot->pg = pg; + pg->osd_shard = this; + pg->pg_slot = slot; + osd->inc_num_pgs(); + + slot->epoch = pg->get_osdmap_epoch(); + pg_slots_by_epoch.insert(*slot); +} + +void OSDShard::_detach_pg(OSDShardPGSlot *slot) +{ + dout(10) << slot->pg->pg_id << " " << slot->pg << dendl; + slot->pg->osd_shard = nullptr; + slot->pg->pg_slot = nullptr; + slot->pg = nullptr; + osd->dec_num_pgs(); + + pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot)); + slot->epoch = 0; + if (waiting_for_min_pg_epoch) { + min_pg_epoch_cond.notify_all(); + } +} + +void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e) +{ + std::lock_guard l(shard_lock); + dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch + << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl; + pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot)); + dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl; + slot->epoch = e; + pg_slots_by_epoch.insert(*slot); + dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch + << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl; + if (waiting_for_min_pg_epoch) { + min_pg_epoch_cond.notify_all(); + } +} + +epoch_t OSDShard::get_min_pg_epoch() +{ + std::lock_guard l(shard_lock); + auto p = pg_slots_by_epoch.begin(); + if (p == pg_slots_by_epoch.end()) { + return 0; + } + return p->epoch; +} + +void OSDShard::wait_min_pg_epoch(epoch_t need) +{ + std::unique_lock l{shard_lock}; + ++waiting_for_min_pg_epoch; + min_pg_epoch_cond.wait(l, [need, this] { + if (pg_slots_by_epoch.empty()) { + return true; + } else if (pg_slots_by_epoch.begin()->epoch >= need) { + return true; + } else { + dout(10) << need << " waiting on " + << pg_slots_by_epoch.begin()->epoch << dendl; + return false; + } + }); + --waiting_for_min_pg_epoch; +} + +epoch_t OSDShard::get_max_waiting_epoch() +{ + std::lock_guard l(shard_lock); + epoch_t r = 0; + for (auto& i : pg_slots) { + if (!i.second->waiting_peering.empty()) { + r = std::max(r, i.second->waiting_peering.rbegin()->first); + } + } + return r; +} + +void OSDShard::consume_map( + const OSDMapRef& new_osdmap, + unsigned *pushes_to_free) +{ + std::lock_guard l(shard_lock); + OSDMapRef old_osdmap; + { + std::lock_guard l(osdmap_lock); + old_osdmap = std::move(shard_osdmap); + shard_osdmap = new_osdmap; + } + dout(10) << new_osdmap->get_epoch() + << " (was " << (old_osdmap ? 
old_osdmap->get_epoch() : 0) << ")" + << dendl; + bool queued = false; + + // check slots + auto p = pg_slots.begin(); + while (p != pg_slots.end()) { + OSDShardPGSlot *slot = p->second.get(); + const spg_t& pgid = p->first; + dout(20) << __func__ << " " << pgid << dendl; + if (!slot->waiting_for_split.empty()) { + dout(20) << __func__ << " " << pgid + << " waiting for split " << slot->waiting_for_split << dendl; + ++p; + continue; + } + if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) { + dout(20) << __func__ << " " << pgid + << " waiting for merge by epoch " << slot->waiting_for_merge_epoch + << dendl; + ++p; + continue; + } + if (!slot->waiting_peering.empty()) { + epoch_t first = slot->waiting_peering.begin()->first; + if (first <= new_osdmap->get_epoch()) { + dout(20) << __func__ << " " << pgid + << " pending_peering first epoch " << first + << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl; + _wake_pg_slot(pgid, slot); + queued = true; + } + ++p; + continue; + } + if (!slot->waiting.empty()) { + if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) { + dout(20) << __func__ << " " << pgid << " maps to us, keeping" + << dendl; + ++p; + continue; + } + while (!slot->waiting.empty() && + slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) { + auto& qi = slot->waiting.front(); + dout(20) << __func__ << " " << pgid + << " waiting item " << qi + << " epoch " << qi.get_map_epoch() + << " <= " << new_osdmap->get_epoch() + << ", " + << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" : + "misdirected") + << ", dropping" << dendl; + *pushes_to_free += qi.get_reserved_pushes(); + slot->waiting.pop_front(); + } + } + if (slot->waiting.empty() && + slot->num_running == 0 && + slot->waiting_for_split.empty() && + !slot->pg) { + dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl; + p = pg_slots.erase(p); + continue; + } + + ++p; + } + if (queued) { + std::lock_guard l{sdata_wait_lock}; + sdata_cond.notify_one(); + } +} + +void OSDShard::_wake_pg_slot( + spg_t pgid, + OSDShardPGSlot *slot) +{ + dout(20) << __func__ << " " << pgid + << " to_process " << slot->to_process + << " waiting " << slot->waiting + << " waiting_peering " << slot->waiting_peering << dendl; + for (auto i = slot->to_process.rbegin(); + i != slot->to_process.rend(); + ++i) { + scheduler->enqueue_front(std::move(*i)); + } + slot->to_process.clear(); + for (auto i = slot->waiting.rbegin(); + i != slot->waiting.rend(); + ++i) { + scheduler->enqueue_front(std::move(*i)); + } + slot->waiting.clear(); + for (auto i = slot->waiting_peering.rbegin(); + i != slot->waiting_peering.rend(); + ++i) { + // this is overkill; we requeue everything, even if some of these + // items are waiting for maps we don't have yet. 
FIXME, maybe, + // someday, if we decide this inefficiency matters + for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) { + scheduler->enqueue_front(std::move(*j)); + } + } + slot->waiting_peering.clear(); + ++slot->requeue_seq; +} + +void OSDShard::identify_splits_and_merges( + const OSDMapRef& as_of_osdmap, + set> *split_pgs, + set> *merge_pgs) +{ + std::lock_guard l(shard_lock); + if (shard_osdmap) { + for (auto& i : pg_slots) { + const spg_t& pgid = i.first; + auto *slot = i.second.get(); + if (slot->pg) { + osd->service.identify_splits_and_merges( + shard_osdmap, as_of_osdmap, pgid, + split_pgs, merge_pgs); + } else if (!slot->waiting_for_split.empty()) { + osd->service.identify_splits_and_merges( + shard_osdmap, as_of_osdmap, pgid, + split_pgs, nullptr); + } else { + dout(20) << __func__ << " slot " << pgid + << " has no pg and waiting_for_split " << dendl; + } + } + } +} + +void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap, + set> *pgids) +{ + std::lock_guard l(shard_lock); + _prime_splits(pgids); + if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) { + set> newer_children; + for (auto i : *pgids) { + osd->service.identify_splits_and_merges( + as_of_osdmap, shard_osdmap, i.first, + &newer_children, nullptr); + } + newer_children.insert(pgids->begin(), pgids->end()); + dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard " + << shard_osdmap->get_epoch() << ", new children " << newer_children + << dendl; + _prime_splits(&newer_children); + // note: we don't care what is left over here for other shards. + // if this shard is ahead of us and one isn't, e.g., one thread is + // calling into prime_splits via _process (due to a newly created + // pg) and this shard has a newer map due to a racing consume_map, + // then any grandchildren left here will be identified (or were + // identified) when the slower shard's osdmap is advanced. + // _prime_splits() will tolerate the case where the pgid is + // already primed. 
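+ // Illustrative example (values not derived from the maps above): if a
+ // pool's pg_num grows from 4 to 8, pg 1.0 gains the child 1.4, and that
+ // child is what identify_splits_and_merges() would report here so its
+ // slot can be primed before the split actually commits.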
+ } +} + +void OSDShard::_prime_splits(set> *pgids) +{ + dout(10) << *pgids << dendl; + auto p = pgids->begin(); + while (p != pgids->end()) { + unsigned shard_index = p->first.hash_to_shard(osd->num_shards); + if (shard_index == shard_id) { + auto r = pg_slots.emplace(p->first, nullptr); + if (r.second) { + dout(10) << "priming slot " << p->first << " e" << p->second << dendl; + r.first->second = make_unique(); + r.first->second->waiting_for_split.insert(p->second); + } else { + auto q = r.first; + ceph_assert(q != pg_slots.end()); + dout(10) << "priming (existing) slot " << p->first << " e" << p->second + << dendl; + q->second->waiting_for_split.insert(p->second); + } + p = pgids->erase(p); + } else { + ++p; + } + } +} + +void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap, + set> *merge_pgs) +{ + std::lock_guard l(shard_lock); + dout(20) << __func__ << " checking shard " << shard_id + << " for remaining merge pgs " << merge_pgs << dendl; + auto p = merge_pgs->begin(); + while (p != merge_pgs->end()) { + spg_t pgid = p->first; + epoch_t epoch = p->second; + unsigned shard_index = pgid.hash_to_shard(osd->num_shards); + if (shard_index != shard_id) { + ++p; + continue; + } + OSDShardPGSlot *slot; + auto r = pg_slots.emplace(pgid, nullptr); + if (r.second) { + r.first->second = make_unique(); + } + slot = r.first->second.get(); + if (slot->pg) { + // already have pg + dout(20) << __func__ << " have merge participant pg " << pgid + << " " << slot->pg << dendl; + } else if (!slot->waiting_for_split.empty() && + *slot->waiting_for_split.begin() < epoch) { + dout(20) << __func__ << " pending split on merge participant pg " << pgid + << " " << slot->waiting_for_split << dendl; + } else { + dout(20) << __func__ << " creating empty merge participant " << pgid + << " for merge in " << epoch << dendl; + // leave history zeroed; PG::merge_from() will fill it in. 
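+ // Illustrative example: if pg_num shrinks from 8 back to 4, pg 1.4 (the
+ // merge source) folds back into pg 1.0 (the target); when this OSD has no
+ // PG instantiated for a participant, an empty placeholder is created here
+ // so PG::merge_from() has something to consume.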
+ pg_history_t history; + PGCreateInfo cinfo(pgid, epoch - 1, + history, PastIntervals(), false); + PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo); + _attach_pg(r.first->second.get(), pg.get()); + _wake_pg_slot(pgid, slot); + pg->unlock(); + } + // mark slot for merge + dout(20) << __func__ << " marking merge participant " << pgid << dendl; + slot->waiting_for_merge_epoch = epoch; + p = merge_pgs->erase(p); + } +} + +void OSDShard::register_and_wake_split_child(PG *pg) +{ + epoch_t epoch; + { + std::lock_guard l(shard_lock); + dout(10) << pg->pg_id << " " << pg << dendl; + auto p = pg_slots.find(pg->pg_id); + ceph_assert(p != pg_slots.end()); + auto *slot = p->second.get(); + dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split + << dendl; + ceph_assert(!slot->pg); + ceph_assert(!slot->waiting_for_split.empty()); + _attach_pg(slot, pg); + + epoch = pg->get_osdmap_epoch(); + ceph_assert(slot->waiting_for_split.count(epoch)); + slot->waiting_for_split.erase(epoch); + if (slot->waiting_for_split.empty()) { + _wake_pg_slot(pg->pg_id, slot); + } else { + dout(10) << __func__ << " still waiting for split on " + << slot->waiting_for_split << dendl; + } + } + + // kick child to ensure it pulls up to the latest osdmap + osd->enqueue_peering_evt( + pg->pg_id, + PGPeeringEventRef( + std::make_shared( + epoch, + epoch, + NullEvt()))); + + std::lock_guard l{sdata_wait_lock}; + sdata_cond.notify_one(); +} + +void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num) +{ + std::lock_guard l(shard_lock); + vector to_delete; + for (auto& i : pg_slots) { + if (i.first != parent && + i.first.get_ancestor(old_pg_num) == parent) { + dout(10) << __func__ << " parent " << parent << " clearing " << i.first + << dendl; + _wake_pg_slot(i.first, i.second.get()); + to_delete.push_back(i.first); + } + } + for (auto pgid : to_delete) { + pg_slots.erase(pgid); + } +} + +void OSDShard::update_scheduler_config() +{ + std::lock_guard l(shard_lock); + scheduler->update_configuration(); +} + +OSDShard::OSDShard( + int id, + CephContext *cct, + OSD *osd) + : shard_id(id), + cct(cct), + osd(osd), + shard_name(string("OSDShard.") + stringify(id)), + sdata_wait_lock_name(shard_name + "::sdata_wait_lock"), + sdata_wait_lock{make_mutex(sdata_wait_lock_name)}, + osdmap_lock{make_mutex(shard_name + "::osdmap_lock")}, + shard_lock_name(shard_name + "::shard_lock"), + shard_lock{make_mutex(shard_lock_name)}, + scheduler(ceph::osd::scheduler::make_scheduler( + cct, osd->num_shards, osd->store->is_rotational())), + context_queue(sdata_wait_lock, sdata_cond) +{ + dout(0) << "using op scheduler " << *scheduler << dendl; +} + + +// ============================================================= + +#undef dout_context +#define dout_context osd->cct +#undef dout_prefix +#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq " + +void OSD::ShardedOpWQ::_add_slot_waiter( + spg_t pgid, + OSDShardPGSlot *slot, + OpSchedulerItem&& qi) +{ + if (qi.is_peering()) { + dout(20) << __func__ << " " << pgid + << " peering, item epoch is " + << qi.get_map_epoch() + << ", will wait on " << qi << dendl; + slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi)); + } else { + dout(20) << __func__ << " " << pgid + << " item epoch is " + << qi.get_map_epoch() + << ", will wait on " << qi << dendl; + slot->waiting.push_back(std::move(qi)); + } +} + +#undef dout_prefix +#define dout_prefix *_dout << "osd." 
<< osd->whoami << " op_wq(" << shard_index << ") " + +void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb) +{ + uint32_t shard_index = thread_index % osd->num_shards; + auto& sdata = osd->shards[shard_index]; + ceph_assert(sdata); + + // If all threads of shards do oncommits, there is a out-of-order + // problem. So we choose the thread which has the smallest + // thread_index(thread_index < num_shards) of shard to do oncommit + // callback. + bool is_smallest_thread_index = thread_index < osd->num_shards; + + // peek at spg_t + sdata->shard_lock.lock(); + if (sdata->scheduler->empty() && + (!is_smallest_thread_index || sdata->context_queue.empty())) { + std::unique_lock wait_lock{sdata->sdata_wait_lock}; + if (is_smallest_thread_index && !sdata->context_queue.empty()) { + // we raced with a context_queue addition, don't wait + wait_lock.unlock(); + } else if (!sdata->stop_waiting) { + dout(20) << __func__ << " empty q, waiting" << dendl; + osd->cct->get_heartbeat_map()->clear_timeout(hb); + sdata->shard_lock.unlock(); + sdata->sdata_cond.wait(wait_lock); + wait_lock.unlock(); + sdata->shard_lock.lock(); + if (sdata->scheduler->empty() && + !(is_smallest_thread_index && !sdata->context_queue.empty())) { + sdata->shard_lock.unlock(); + return; + } + // found a work item; reapply default wq timeouts + osd->cct->get_heartbeat_map()->reset_timeout(hb, + timeout_interval, suicide_interval); + } else { + dout(20) << __func__ << " need return immediately" << dendl; + wait_lock.unlock(); + sdata->shard_lock.unlock(); + return; + } + } + + list oncommits; + if (is_smallest_thread_index) { + sdata->context_queue.move_to(oncommits); + } + + WorkItem work_item; + while (!std::get_if(&work_item)) { + if (sdata->scheduler->empty()) { + if (osd->is_stopping()) { + sdata->shard_lock.unlock(); + for (auto c : oncommits) { + dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl; + delete c; + } + return; // OSD shutdown, discard. + } + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + + work_item = sdata->scheduler->dequeue(); + if (osd->is_stopping()) { + sdata->shard_lock.unlock(); + for (auto c : oncommits) { + dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl; + delete c; + } + return; // OSD shutdown, discard. + } + + // If the work item is scheduled in the future, wait until + // the time returned in the dequeue response before retrying. + if (auto when_ready = std::get_if(&work_item)) { + if (is_smallest_thread_index) { + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + std::unique_lock wait_lock{sdata->sdata_wait_lock}; + auto future_time = ceph::real_clock::from_double(*when_ready); + dout(10) << __func__ << " dequeue future request at " << future_time << dendl; + // Disable heartbeat timeout until we find a non-future work item to process. 
+ osd->cct->get_heartbeat_map()->clear_timeout(hb); + sdata->shard_lock.unlock(); + ++sdata->waiting_threads; + sdata->sdata_cond.wait_until(wait_lock, future_time); + --sdata->waiting_threads; + wait_lock.unlock(); + sdata->shard_lock.lock(); + // Reapply default wq timeouts + osd->cct->get_heartbeat_map()->reset_timeout(hb, + timeout_interval, suicide_interval); + } + } // while + + // Access the stored item + auto item = std::move(std::get(work_item)); + if (osd->is_stopping()) { + sdata->shard_lock.unlock(); + for (auto c : oncommits) { + dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl; + delete c; + } + return; // OSD shutdown, discard. + } + + const auto token = item.get_ordering_token(); + auto r = sdata->pg_slots.emplace(token, nullptr); + if (r.second) { + r.first->second = make_unique(); + } + OSDShardPGSlot *slot = r.first->second.get(); + dout(20) << __func__ << " " << token + << (r.second ? " (new)" : "") + << " to_process " << slot->to_process + << " waiting " << slot->waiting + << " waiting_peering " << slot->waiting_peering + << dendl; + slot->to_process.push_back(std::move(item)); + dout(20) << __func__ << " " << slot->to_process.back() + << " queued" << dendl; + + retry_pg: + PGRef pg = slot->pg; + + // lock pg (if we have it) + if (pg) { + // note the requeue seq now... + uint64_t requeue_seq = slot->requeue_seq; + ++slot->num_running; + + sdata->shard_lock.unlock(); + osd->service.maybe_inject_dispatch_delay(); + pg->lock(); + osd->service.maybe_inject_dispatch_delay(); + sdata->shard_lock.lock(); + + auto q = sdata->pg_slots.find(token); + if (q == sdata->pg_slots.end()) { + // this can happen if we race with pg removal. + dout(20) << __func__ << " slot " << token << " no longer there" << dendl; + pg->unlock(); + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + slot = q->second.get(); + --slot->num_running; + + if (slot->to_process.empty()) { + // raced with _wake_pg_slot or consume_map + dout(20) << __func__ << " " << token + << " nothing queued" << dendl; + pg->unlock(); + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + if (requeue_seq != slot->requeue_seq) { + dout(20) << __func__ << " " << token + << " requeue_seq " << slot->requeue_seq << " > our " + << requeue_seq << ", we raced with _wake_pg_slot" + << dendl; + pg->unlock(); + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + if (slot->pg != pg) { + // this can happen if we race with pg removal. + dout(20) << __func__ << " slot " << token << " no longer attached to " + << pg << dendl; + pg->unlock(); + goto retry_pg; + } + } + + dout(20) << __func__ << " " << token + << " to_process " << slot->to_process + << " waiting " << slot->waiting + << " waiting_peering " << slot->waiting_peering << dendl; + + ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval, + suicide_interval); + + // take next item + auto qi = std::move(slot->to_process.front()); + slot->to_process.pop_front(); + dout(20) << __func__ << " " << qi << " pg " << pg << dendl; + set> new_children; + OSDMapRef osdmap; + + while (!pg) { + // should this pg shard exist on this osd in this (or a later) epoch? 
+ osdmap = sdata->shard_osdmap; + const PGCreateInfo *create_info = qi.creates_pg(); + if (!slot->waiting_for_split.empty()) { + dout(20) << __func__ << " " << token + << " splitting " << slot->waiting_for_split << dendl; + _add_slot_waiter(token, slot, std::move(qi)); + } else if (qi.get_map_epoch() > osdmap->get_epoch()) { + dout(20) << __func__ << " " << token + << " map " << qi.get_map_epoch() << " > " + << osdmap->get_epoch() << dendl; + _add_slot_waiter(token, slot, std::move(qi)); + } else if (qi.is_peering()) { + if (!qi.peering_requires_pg()) { + // for pg-less events, we run them under the ordering lock, since + // we don't have the pg lock to keep them ordered. + qi.run(osd, sdata, pg, tp_handle); + } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) { + if (create_info) { + if (create_info->by_mon && + osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) { + dout(20) << __func__ << " " << token + << " no pg, no longer primary, ignoring mon create on " + << qi << dendl; + } else { + dout(20) << __func__ << " " << token + << " no pg, should create on " << qi << dendl; + pg = osd->handle_pg_create_info(osdmap, create_info); + if (pg) { + // we created the pg! drop out and continue "normally"! + sdata->_attach_pg(slot, pg.get()); + sdata->_wake_pg_slot(token, slot); + + // identify split children between create epoch and shard epoch. + osd->service.identify_splits_and_merges( + pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr); + sdata->_prime_splits(&new_children); + // distribute remaining split children to other shards below! + break; + } + dout(20) << __func__ << " ignored create on " << qi << dendl; + } + } else { + dout(20) << __func__ << " " << token + << " no pg, peering, !create, discarding " << qi << dendl; + } + } else { + dout(20) << __func__ << " " << token + << " no pg, peering, doesn't map here e" << osdmap->get_epoch() + << ", discarding " << qi + << dendl; + } + } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) { + dout(20) << __func__ << " " << token + << " no pg, should exist e" << osdmap->get_epoch() + << ", will wait on " << qi << dendl; + _add_slot_waiter(token, slot, std::move(qi)); + } else { + dout(20) << __func__ << " " << token + << " no pg, shouldn't exist e" << osdmap->get_epoch() + << ", dropping " << qi << dendl; + // share map with client? + if (std::optional _op = qi.maybe_get_op()) { + osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(), + sdata->shard_osdmap, + (*_op)->sent_epoch); + } + unsigned pushes_to_free = qi.get_reserved_pushes(); + if (pushes_to_free > 0) { + sdata->shard_lock.unlock(); + osd->service.release_reserved_pushes(pushes_to_free); + handle_oncommits(oncommits); + return; + } + } + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + if (qi.is_peering()) { + OSDMapRef osdmap = sdata->shard_osdmap; + if (qi.get_map_epoch() > osdmap->get_epoch()) { + _add_slot_waiter(token, slot, std::move(qi)); + sdata->shard_lock.unlock(); + pg->unlock(); + handle_oncommits(oncommits); + return; + } + } + sdata->shard_lock.unlock(); + + if (!new_children.empty()) { + for (auto shard : osd->shards) { + shard->prime_splits(osdmap, &new_children); + } + ceph_assert(new_children.empty()); + } + + // osd_opwq_process marks the point at which an operation has been dequeued + // and will begin to be handled by a worker thread. 
+ { +#ifdef WITH_LTTNG + osd_reqid_t reqid; + if (std::optional _op = qi.maybe_get_op()) { + reqid = (*_op)->get_reqid(); + } +#endif + tracepoint(osd, opwq_process_start, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: "; + Formatter *f = Formatter::create("json"); + f->open_object_section("q"); + dump(f); + f->close_section(); + f->flush(*_dout); + delete f; + *_dout << dendl; + + qi.run(osd, sdata, pg, tp_handle); + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid; + if (std::optional _op = qi.maybe_get_op()) { + reqid = (*_op)->get_reqid(); + } +#endif + tracepoint(osd, opwq_process_finish, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + handle_oncommits(oncommits); +} + +void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) { + uint32_t shard_index = + item.get_ordering_token().hash_to_shard(osd->shards.size()); + + dout(20) << __func__ << " " << item << dendl; + + OSDShard* sdata = osd->shards[shard_index]; + assert (NULL != sdata); + + bool empty = true; + { + std::lock_guard l{sdata->shard_lock}; + empty = sdata->scheduler->empty(); + sdata->scheduler->enqueue(std::move(item)); + } + + { + std::lock_guard l{sdata->sdata_wait_lock}; + if (empty) { + sdata->sdata_cond.notify_all(); + } else if (sdata->waiting_threads) { + sdata->sdata_cond.notify_one(); + } + } +} + +void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item) +{ + auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); + auto& sdata = osd->shards[shard_index]; + ceph_assert(sdata); + sdata->shard_lock.lock(); + auto p = sdata->pg_slots.find(item.get_ordering_token()); + if (p != sdata->pg_slots.end() && + !p->second->to_process.empty()) { + // we may be racing with _process, which has dequeued a new item + // from scheduler, put it on to_process, and is now busy taking the + // pg lock. ensure this old requeued item is ordered before any + // such newer item in to_process. 
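+ // Illustrative walk-through of the shuffle below: with to_process == [N]
+ // (a newer item already claimed by _process) and an older requeued item O,
+ // the push_front/pop_back pair leaves O at the head of to_process and
+ // hands N back to the front of the scheduler, so O is still handled first.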
+ p->second->to_process.push_front(std::move(item)); + item = std::move(p->second->to_process.back()); + p->second->to_process.pop_back(); + dout(20) << __func__ + << " " << p->second->to_process.front() + << " shuffled w/ " << item << dendl; + } else { + dout(20) << __func__ << " " << item << dendl; + } + sdata->scheduler->enqueue_front(std::move(item)); + sdata->shard_lock.unlock(); + std::lock_guard l{sdata->sdata_wait_lock}; + sdata->sdata_cond.notify_one(); +} + +namespace ceph::osd_cmds { + +int heap(CephContext& cct, + const cmdmap_t& cmdmap, + std::ostream& outos, + std::ostream& erros) +{ + if (!ceph_using_tcmalloc()) { + erros << "could not issue heap profiler command -- not using tcmalloc!"; + return -EOPNOTSUPP; + } + + string cmd; + if (!cmd_getval(cmdmap, "heapcmd", cmd)) { + erros << "unable to get value for command \"" << cmd << "\""; + return -EINVAL; + } + + std::vector cmd_vec; + get_str_vec(cmd, cmd_vec); + + string val; + if (cmd_getval(cmdmap, "value", val)) { + cmd_vec.push_back(val); + } + + ceph_heap_profiler_handle_command(cmd_vec, outos); + + return 0; +} + +} // namespace ceph::osd_cmds diff --git a/src/osd/OSD.h b/src/osd/OSD.h new file mode 100644 index 000000000..efbcb40f7 --- /dev/null +++ b/src/osd/OSD.h @@ -0,0 +1,2152 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_H +#define CEPH_OSD_H + +#include "PG.h" + +#include "msg/Dispatcher.h" + +#include "common/async/context_pool.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "common/AsyncReserver.h" +#include "common/ceph_context.h" +#include "common/config_cacher.h" +#include "common/zipkin_trace.h" +#include "common/ceph_timer.h" + +#include "mgr/MgrClient.h" + +#include "os/ObjectStore.h" + +#include "include/CompatSet.h" +#include "include/common_fwd.h" + +#include "OpRequest.h" +#include "Session.h" + +#include "osd/scheduler/OpScheduler.h" + +#include +#include +#include +#include + +#include "include/unordered_map.h" + +#include "common/shared_cache.hpp" +#include "common/simple_cache.hpp" +#include "messages/MOSDOp.h" +#include "common/EventTrace.h" +#include "osd/osd_perf_counters.h" +#include "common/Finisher.h" + +#define CEPH_OSD_PROTOCOL 10 /* cluster internal */ + +/* + + lock ordering for pg map + + PG::lock + ShardData::lock + OSD::pg_map_lock + + */ + +class Messenger; +class Message; +class MonClient; +class ObjectStore; +class FuseStore; +class OSDMap; +class MLog; +class Objecter; +class KeyStore; + +class Watch; +class PrimaryLogPG; + +class TestOpsSocketHook; +struct C_FinishSplits; +struct C_OpenPGs; +class LogChannel; + +class MOSDPGCreate2; +class MOSDPGQuery; +class MOSDPGNotify; +class MOSDPGInfo; +class MOSDPGRemove; +class MOSDForceRecovery; +class MMonGetPurgedSnapsReply; + +class OSD; + +class OSDService { + using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem; +public: + OSD *osd; + CephContext *cct; + ObjectStore::CollectionHandle meta_ch; + const int whoami; + ObjectStore *&store; + LogClient &log_client; + LogChannelRef clog; + PGRecoveryStats &pg_recovery_stats; +private: + Messenger *&cluster_messenger; + Messenger *&client_messenger; +public: + 
PerfCounters *&logger; + PerfCounters *&recoverystate_perf; + MonClient *&monc; + + md_config_cacher_t osd_max_object_size; + md_config_cacher_t osd_skip_data_digest; + + void enqueue_back(OpSchedulerItem&& qi); + void enqueue_front(OpSchedulerItem&& qi); + + void maybe_inject_dispatch_delay() { + if (g_conf()->osd_debug_inject_dispatch_delay_probability > 0) { + if (rand() % 10000 < + g_conf()->osd_debug_inject_dispatch_delay_probability * 10000) { + utime_t t; + t.set_from_double(g_conf()->osd_debug_inject_dispatch_delay_duration); + t.sleep(); + } + } + } + + ceph::signedspan get_mnow(); + +private: + // -- superblock -- + ceph::mutex publish_lock, pre_publish_lock; // pre-publish orders before publish + OSDSuperblock superblock; + +public: + OSDSuperblock get_superblock() { + std::lock_guard l(publish_lock); + return superblock; + } + void publish_superblock(const OSDSuperblock &block) { + std::lock_guard l(publish_lock); + superblock = block; + } + + int get_nodeid() const { return whoami; } + + std::atomic max_oldest_map; +private: + OSDMapRef osdmap; + +public: + OSDMapRef get_osdmap() { + std::lock_guard l(publish_lock); + return osdmap; + } + epoch_t get_osdmap_epoch() { + std::lock_guard l(publish_lock); + return osdmap ? osdmap->get_epoch() : 0; + } + void publish_map(OSDMapRef map) { + std::lock_guard l(publish_lock); + osdmap = map; + } + + /* + * osdmap - current published std::map + * next_osdmap - pre_published std::map that is about to be published. + * + * We use the next_osdmap to send messages and initiate connections, + * but only if the target is the same instance as the one in the std::map + * epoch the current user is working from (i.e., the result is + * equivalent to what is in next_osdmap). + * + * This allows the helpers to start ignoring osds that are about to + * go down, and let OSD::handle_osd_map()/note_down_osd() mark them + * down, without worrying about reopening connections from threads + * working from old maps. 
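+ *
+ * A typical (illustrative) use of the reservation helpers declared below:
+ *
+ *   OSDMapRef nextmap = get_nextmap_reserved();
+ *   // ... send messages / open connections against nextmap ...
+ *   release_map(std::move(nextmap));
+ *
+ * await_reserved_maps() then blocks until every reservation taken against
+ * an epoch older than next_osdmap's has been released.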
+ */ +private: + OSDMapRef next_osdmap; + ceph::condition_variable pre_publish_cond; + int pre_publish_waiter = 0; + +public: + void pre_publish_map(OSDMapRef map) { + std::lock_guard l(pre_publish_lock); + next_osdmap = std::move(map); + } + + void activate_map(); + /// map epochs reserved below + std::map map_reservations; + + /// gets ref to next_osdmap and registers the epoch as reserved + OSDMapRef get_nextmap_reserved() { + std::lock_guard l(pre_publish_lock); + epoch_t e = next_osdmap->get_epoch(); + std::map::iterator i = + map_reservations.insert(std::make_pair(e, 0)).first; + i->second++; + return next_osdmap; + } + /// releases reservation on map + void release_map(OSDMapRef osdmap) { + std::lock_guard l(pre_publish_lock); + std::map::iterator i = + map_reservations.find(osdmap->get_epoch()); + ceph_assert(i != map_reservations.end()); + ceph_assert(i->second > 0); + if (--(i->second) == 0) { + map_reservations.erase(i); + } + if (pre_publish_waiter) { + pre_publish_cond.notify_all(); + } + } + /// blocks until there are no reserved maps prior to next_osdmap + void await_reserved_maps() { + std::unique_lock l{pre_publish_lock}; + ceph_assert(next_osdmap); + pre_publish_waiter++; + pre_publish_cond.wait(l, [this] { + auto i = map_reservations.cbegin(); + return (i == map_reservations.cend() || + i->first >= next_osdmap->get_epoch()); + }); + pre_publish_waiter--; + } + OSDMapRef get_next_osdmap() { + std::lock_guard l(pre_publish_lock); + return next_osdmap; + } + + void maybe_share_map(Connection *con, + const OSDMapRef& osdmap, + epoch_t peer_epoch_lb=0); + + void send_map(class MOSDMap *m, Connection *con); + void send_incremental_map(epoch_t since, Connection *con, + const OSDMapRef& osdmap); + MOSDMap *build_incremental_map_msg(epoch_t from, epoch_t to, + OSDSuperblock& superblock); + + ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch); + std::pair get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front) + void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch); + void send_message_osd_cluster(std::vector>& messages, epoch_t from_epoch); + void send_message_osd_cluster(MessageRef m, Connection *con) { + con->send_message2(std::move(m)); + } + void send_message_osd_cluster(Message *m, const ConnectionRef& con) { + con->send_message(m); + } + void send_message_osd_client(Message *m, const ConnectionRef& con) { + con->send_message(m); + } + entity_name_t get_cluster_msgr_name() const; + +private: + // -- scrub scheduling -- + ceph::mutex sched_scrub_lock = ceph::make_mutex("OSDService::sched_scrub_lock"); + int scrubs_local; + int scrubs_remote; + +public: + struct ScrubJob { + CephContext* cct; + /// pg to be scrubbed + spg_t pgid; + /// a time scheduled for scrub. 
but the scrub could be delayed if system + /// load is too high or it fails to fall in the scrub hours + utime_t sched_time; + /// the hard upper bound of scrub time + utime_t deadline; + ScrubJob() : cct(nullptr) {} + explicit ScrubJob(CephContext* cct, const spg_t& pg, + const utime_t& timestamp, + double pool_scrub_min_interval = 0, + double pool_scrub_max_interval = 0, bool must = true); + /// order the jobs by sched_time + bool operator<(const ScrubJob& rhs) const; + }; + std::set sched_scrub_pg; + + /// @returns the scrub_reg_stamp used for unregistering the scrub job + utime_t reg_pg_scrub(spg_t pgid, + utime_t t, + double pool_scrub_min_interval, + double pool_scrub_max_interval, + bool must) { + ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval, + must); + std::lock_guard l(OSDService::sched_scrub_lock); + sched_scrub_pg.insert(scrub_job); + return scrub_job.sched_time; + } + + void unreg_pg_scrub(spg_t pgid, utime_t t) { + std::lock_guard l(sched_scrub_lock); + size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t)); + ceph_assert(removed); + } + + bool first_scrub_stamp(ScrubJob *out) { + std::lock_guard l(sched_scrub_lock); + if (sched_scrub_pg.empty()) + return false; + std::set::iterator iter = sched_scrub_pg.begin(); + *out = *iter; + return true; + } + bool next_scrub_stamp(const ScrubJob& next, + ScrubJob *out) { + std::lock_guard l(sched_scrub_lock); + if (sched_scrub_pg.empty()) + return false; + std::set::const_iterator iter = sched_scrub_pg.upper_bound(next); + if (iter == sched_scrub_pg.cend()) + return false; + *out = *iter; + return true; + } + + void dumps_scrub(ceph::Formatter* f); + + bool can_inc_scrubs(); + bool inc_scrubs_local(); + void dec_scrubs_local(); + bool inc_scrubs_remote(); + void dec_scrubs_remote(); + void dump_scrub_reservations(ceph::Formatter *f); + + void reply_op_error(OpRequestRef op, int err); + void reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv, + std::vector op_returns); + void handle_misdirected_op(PG *pg, OpRequestRef op); + + +private: + // -- agent shared state -- + ceph::mutex agent_lock = ceph::make_mutex("OSDService::agent_lock"); + ceph::condition_variable agent_cond; + std::map > agent_queue; + std::set::iterator agent_queue_pos; + bool agent_valid_iterator; + int agent_ops; + int flush_mode_high_count; //once have one pg with FLUSH_MODE_HIGH then flush objects with high speed + std::set agent_oids; + bool agent_active; + struct AgentThread : public Thread { + OSDService *osd; + explicit AgentThread(OSDService *o) : osd(o) {} + void *entry() override { + osd->agent_entry(); + return NULL; + } + } agent_thread; + bool agent_stop_flag; + ceph::mutex agent_timer_lock = ceph::make_mutex("OSDService::agent_timer_lock"); + SafeTimer agent_timer; + +public: + void agent_entry(); + void agent_stop(); + + void _enqueue(PG *pg, uint64_t priority) { + if (!agent_queue.empty() && + agent_queue.rbegin()->first < priority) + agent_valid_iterator = false; // inserting higher-priority queue + std::set& nq = agent_queue[priority]; + if (nq.empty()) + agent_cond.notify_all(); + nq.insert(pg); + } + + void _dequeue(PG *pg, uint64_t old_priority) { + std::set& oq = agent_queue[old_priority]; + std::set::iterator p = oq.find(pg); + ceph_assert(p != oq.end()); + if (p == agent_queue_pos) + ++agent_queue_pos; + oq.erase(p); + if (oq.empty()) { + if (agent_queue.rbegin()->first == old_priority) + agent_valid_iterator = false; + agent_queue.erase(old_priority); + } + } + + /// enable agent for a 
pg + void agent_enable_pg(PG *pg, uint64_t priority) { + std::lock_guard l(agent_lock); + _enqueue(pg, priority); + } + + /// adjust priority for an enagled pg + void agent_adjust_pg(PG *pg, uint64_t old_priority, uint64_t new_priority) { + std::lock_guard l(agent_lock); + ceph_assert(new_priority != old_priority); + _enqueue(pg, new_priority); + _dequeue(pg, old_priority); + } + + /// disable agent for a pg + void agent_disable_pg(PG *pg, uint64_t old_priority) { + std::lock_guard l(agent_lock); + _dequeue(pg, old_priority); + } + + /// note start of an async (evict) op + void agent_start_evict_op() { + std::lock_guard l(agent_lock); + ++agent_ops; + } + + /// note finish or cancellation of an async (evict) op + void agent_finish_evict_op() { + std::lock_guard l(agent_lock); + ceph_assert(agent_ops > 0); + --agent_ops; + agent_cond.notify_all(); + } + + /// note start of an async (flush) op + void agent_start_op(const hobject_t& oid) { + std::lock_guard l(agent_lock); + ++agent_ops; + ceph_assert(agent_oids.count(oid) == 0); + agent_oids.insert(oid); + } + + /// note finish or cancellation of an async (flush) op + void agent_finish_op(const hobject_t& oid) { + std::lock_guard l(agent_lock); + ceph_assert(agent_ops > 0); + --agent_ops; + ceph_assert(agent_oids.count(oid) == 1); + agent_oids.erase(oid); + agent_cond.notify_all(); + } + + /// check if we are operating on an object + bool agent_is_active_oid(const hobject_t& oid) { + std::lock_guard l(agent_lock); + return agent_oids.count(oid); + } + + /// get count of active agent ops + int agent_get_num_ops() { + std::lock_guard l(agent_lock); + return agent_ops; + } + + void agent_inc_high_count() { + std::lock_guard l(agent_lock); + flush_mode_high_count ++; + } + + void agent_dec_high_count() { + std::lock_guard l(agent_lock); + flush_mode_high_count --; + } + +private: + /// throttle promotion attempts + std::atomic promote_probability_millis{1000}; ///< probability thousands. one word. + PromoteCounter promote_counter; + utime_t last_recalibrate; + unsigned long promote_max_objects, promote_max_bytes; + +public: + bool promote_throttle() { + // NOTE: lockless! we rely on the probability being a single word. 
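+ // Worked example of the probability check below (values illustrative):
+ // with promote_probability_millis == 500, rand() % 1000 exceeds it about
+ // half the time, so roughly half of the attempts are throttled; at the
+ // initial value of 1000 this particular check never throttles.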
+ promote_counter.attempt(); + if ((unsigned)rand() % 1000 > promote_probability_millis) + return true; // yes throttle (no promote) + if (promote_max_objects && + promote_counter.objects > promote_max_objects) + return true; // yes throttle + if (promote_max_bytes && + promote_counter.bytes > promote_max_bytes) + return true; // yes throttle + return false; // no throttle (promote) + } + void promote_finish(uint64_t bytes) { + promote_counter.finish(bytes); + } + void promote_throttle_recalibrate(); + unsigned get_num_shards() const { + return m_objecter_finishers; + } + Finisher* get_objecter_finisher(int shard) { + return objecter_finishers[shard].get(); + } + + // -- Objecter, for tiering reads/writes from/to other OSDs -- + ceph::async::io_context_pool& poolctx; + std::unique_ptr objecter; + int m_objecter_finishers; + std::vector> objecter_finishers; + + // -- Watch -- + ceph::mutex watch_lock = ceph::make_mutex("OSDService::watch_lock"); + SafeTimer watch_timer; + uint64_t next_notif_id; + uint64_t get_next_id(epoch_t cur_epoch) { + std::lock_guard l(watch_lock); + return (((uint64_t)cur_epoch) << 32) | ((uint64_t)(next_notif_id++)); + } + + // -- Recovery/Backfill Request Scheduling -- + ceph::mutex recovery_request_lock = ceph::make_mutex("OSDService::recovery_request_lock"); + SafeTimer recovery_request_timer; + + // For async recovery sleep + bool recovery_needs_sleep = true; + ceph::real_clock::time_point recovery_schedule_time; + + // For recovery & scrub & snap + ceph::mutex sleep_lock = ceph::make_mutex("OSDService::sleep_lock"); + SafeTimer sleep_timer; + + // -- tids -- + // for ops i issue + std::atomic last_tid{0}; + ceph_tid_t get_tid() { + return (ceph_tid_t)last_tid++; + } + + // -- backfill_reservation -- + Finisher reserver_finisher; + AsyncReserver local_reserver; + AsyncReserver remote_reserver; + + // -- pg merge -- + ceph::mutex merge_lock = ceph::make_mutex("OSD::merge_lock"); + std::map ready_to_merge_source; // pg -> version + std::map> ready_to_merge_target; // pg -> (version,les,lec) + std::set not_ready_to_merge_source; + std::map not_ready_to_merge_target; + std::set sent_ready_to_merge_source; + + void set_ready_to_merge_source(PG *pg, + eversion_t version); + void set_ready_to_merge_target(PG *pg, + eversion_t version, + epoch_t last_epoch_started, + epoch_t last_epoch_clean); + void set_not_ready_to_merge_source(pg_t source); + void set_not_ready_to_merge_target(pg_t target, pg_t source); + void clear_ready_to_merge(PG *pg); + void send_ready_to_merge(); + void _send_ready_to_merge(); + void clear_sent_ready_to_merge(); + void prune_sent_ready_to_merge(const OSDMapRef& osdmap); + + // -- pg_temp -- +private: + ceph::mutex pg_temp_lock = ceph::make_mutex("OSDService::pg_temp_lock"); + struct pg_temp_t { + std::vector acting; + bool forced = false; + }; + std::map pg_temp_wanted; + std::map pg_temp_pending; + void _sent_pg_temp(); + friend std::ostream& operator<<(std::ostream&, const pg_temp_t&); +public: + void queue_want_pg_temp(pg_t pgid, const std::vector& want, + bool forced = false); + void remove_want_pg_temp(pg_t pgid); + void requeue_pg_temp(); + void send_pg_temp(); + + ceph::mutex pg_created_lock = ceph::make_mutex("OSDService::pg_created_lock"); + std::set pg_created; + void send_pg_created(pg_t pgid); + void prune_pg_created(); + void send_pg_created(); + + AsyncReserver snap_reserver; + void queue_recovery_context(PG *pg, GenContext *c); + void queue_for_snap_trim(PG *pg); + void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority); 
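+ // The scrub queue_* helpers in this block are thin wrappers: each queues a
+ // single scrub-FSM event for the PG via the private queue_scrub_event_msg<>()
+ // template declared further down, roughly (event type name illustrative):
+ //
+ //   queue_scrub_event_msg<SomeScrubEvent>(pg, with_priority);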
+ + void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority); + + /// queue the message (-> event) that all replicas have reserved scrub resources for us + void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority); + + /// queue the message (-> event) that some replicas denied our scrub resources request + void queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability + /// of the primary map being created by the backend. + void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals a change in the number of in-flight recovery writes + void queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that all pending updates were applied + void queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that the selected chunk (objects range) is available for scrubbing + void queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority); + + /// The chunk selected is blocked by user operations, and cannot be scrubbed now + void queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority); + + /// The block-range that was locked and prevented the scrubbing - is freed + void queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that all write OPs are done + void queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that the the local (Primary's) scrub map is ready + void queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that we (the Primary) got all waited-for scrub-maps from our replicas + void queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that all chunks were handled + /// Note: always with high priority, as must be acted upon before the + /// next scrub request arrives from the Primary (and the primary is free + /// to send the request once the replica's map is received). + void queue_scrub_is_finished(PG* pg); + + /// Signals that there are more chunks to handle + void queue_scrub_next_chunk(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that we have finished comparing the maps for this chunk + /// Note: required, as in Crimson this operation is 'futurized'. + void queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority); + + void queue_for_rep_scrub(PG* pg, + Scrub::scrub_prio_t with_high_priority, + unsigned int qu_priority, + Scrub::act_token_t act_token); + + /// Signals a change in the number of in-flight recovery writes + void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority); + + /// (not in Crimson) Queue a SchedReplica event to be sent to the replica, to + /// trigger a re-check of the availability of the scrub map prepared by the + /// backend. 
+ void queue_for_rep_scrub_resched(PG* pg, + Scrub::scrub_prio_t with_high_priority, + unsigned int qu_priority, + Scrub::act_token_t act_token); + + void queue_for_pg_delete(spg_t pgid, epoch_t e); + bool try_finish_pg_delete(PG *pg, unsigned old_pg_num); + +private: + // -- pg recovery and associated throttling -- + ceph::mutex recovery_lock = ceph::make_mutex("OSDService::recovery_lock"); + std::list > awaiting_throttle; + + /// queue a scrub-related message for a PG + template + void queue_scrub_event_msg(PG* pg, + Scrub::scrub_prio_t with_priority, + unsigned int qu_priority, + Scrub::act_token_t act_token); + + /// An alternative version of queue_scrub_event_msg(), in which the queuing priority is + /// provided by the executing scrub (i.e. taken from PgScrubber::m_flags) + template + void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority); + + utime_t defer_recovery_until; + uint64_t recovery_ops_active; + uint64_t recovery_ops_reserved; + bool recovery_paused; +#ifdef DEBUG_RECOVERY_OIDS + std::map > recovery_oids; +#endif + bool _recover_now(uint64_t *available_pushes); + void _maybe_queue_recovery(); + void _queue_for_recovery( + std::pair p, uint64_t reserved_pushes); +public: + void start_recovery_op(PG *pg, const hobject_t& soid); + void finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue); + bool is_recovery_active(); + void release_reserved_pushes(uint64_t pushes); + void defer_recovery(float defer_for) { + defer_recovery_until = ceph_clock_now(); + defer_recovery_until += defer_for; + } + void pause_recovery() { + std::lock_guard l(recovery_lock); + recovery_paused = true; + } + bool recovery_is_paused() { + std::lock_guard l(recovery_lock); + return recovery_paused; + } + void unpause_recovery() { + std::lock_guard l(recovery_lock); + recovery_paused = false; + _maybe_queue_recovery(); + } + void kick_recovery_queue() { + std::lock_guard l(recovery_lock); + _maybe_queue_recovery(); + } + void clear_queued_recovery(PG *pg) { + std::lock_guard l(recovery_lock); + awaiting_throttle.remove_if( + [pg](decltype(awaiting_throttle)::const_reference awaiting ) { + return awaiting.second.get() == pg; + }); + } + + unsigned get_target_pg_log_entries() const; + + // delayed pg activation + void queue_for_recovery(PG *pg) { + std::lock_guard l(recovery_lock); + + if (pg->is_forced_recovery_or_backfill()) { + awaiting_throttle.push_front(std::make_pair(pg->get_osdmap()->get_epoch(), pg)); + } else { + awaiting_throttle.push_back(std::make_pair(pg->get_osdmap()->get_epoch(), pg)); + } + _maybe_queue_recovery(); + } + void queue_recovery_after_sleep(PG *pg, epoch_t queued, uint64_t reserved_pushes) { + std::lock_guard l(recovery_lock); + _queue_for_recovery(std::make_pair(queued, pg), reserved_pushes); + } + + void queue_check_readable(spg_t spgid, + epoch_t lpr, + ceph::signedspan delay = ceph::signedspan::zero()); + + // osd map cache (past osd maps) + ceph::mutex map_cache_lock = ceph::make_mutex("OSDService::map_cache_lock"); + SharedLRU map_cache; + SimpleLRU map_bl_cache; + SimpleLRU map_bl_inc_cache; + + OSDMapRef try_get_map(epoch_t e); + OSDMapRef get_map(epoch_t e) { + OSDMapRef ret(try_get_map(e)); + ceph_assert(ret); + return ret; + } + OSDMapRef add_map(OSDMap *o) { + std::lock_guard l(map_cache_lock); + return _add_map(o); + } + OSDMapRef _add_map(OSDMap *o); + + void _add_map_bl(epoch_t e, ceph::buffer::list& bl); + bool get_map_bl(epoch_t e, ceph::buffer::list& bl) { + std::lock_guard l(map_cache_lock); + return _get_map_bl(e, bl); + } + bool 
_get_map_bl(epoch_t e, ceph::buffer::list& bl); + + void _add_map_inc_bl(epoch_t e, ceph::buffer::list& bl); + bool get_inc_map_bl(epoch_t e, ceph::buffer::list& bl); + + /// identify split child pgids over a osdmap interval + void identify_splits_and_merges( + OSDMapRef old_map, + OSDMapRef new_map, + spg_t pgid, + std::set> *new_children, + std::set> *merge_pgs); + + void need_heartbeat_peer_update(); + + void init(); + void final_init(); + void start_shutdown(); + void shutdown_reserver(); + void shutdown(); + + // -- stats -- + ceph::mutex stat_lock = ceph::make_mutex("OSDService::stat_lock"); + osd_stat_t osd_stat; + uint32_t seq = 0; + + void set_statfs(const struct store_statfs_t &stbuf, + osd_alert_list_t& alerts); + osd_stat_t set_osd_stat(std::vector& hb_peers, int num_pgs); + void inc_osd_stat_repaired(void); + float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0); + osd_stat_t get_osd_stat() { + std::lock_guard l(stat_lock); + ++seq; + osd_stat.up_from = up_epoch; + osd_stat.seq = ((uint64_t)osd_stat.up_from << 32) + seq; + return osd_stat; + } + uint64_t get_osd_stat_seq() { + std::lock_guard l(stat_lock); + return osd_stat.seq; + } + void get_hb_pingtime(std::map *pp) + { + std::lock_guard l(stat_lock); + *pp = osd_stat.hb_pingtime; + return; + } + + // -- OSD Full Status -- +private: + friend TestOpsSocketHook; + mutable ceph::mutex full_status_lock = ceph::make_mutex("OSDService::full_status_lock"); + enum s_names { INVALID = -1, NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending + const char *get_full_state_name(s_names s) const { + switch (s) { + case NONE: return "none"; + case NEARFULL: return "nearfull"; + case BACKFILLFULL: return "backfillfull"; + case FULL: return "full"; + case FAILSAFE: return "failsafe"; + default: return "???"; + } + } + s_names get_full_state(std::string type) const { + if (type == "none") + return NONE; + else if (type == "failsafe") + return FAILSAFE; + else if (type == "full") + return FULL; + else if (type == "backfillfull") + return BACKFILLFULL; + else if (type == "nearfull") + return NEARFULL; + else + return INVALID; + } + double cur_ratio, physical_ratio; ///< current utilization + mutable int64_t injectfull = 0; + s_names injectfull_state = NONE; + float get_failsafe_full_ratio(); + bool _check_inject_full(DoutPrefixProvider *dpp, s_names type) const; + bool _check_full(DoutPrefixProvider *dpp, s_names type) const; +public: + void check_full_status(float ratio, float pratio); + s_names recalc_full_state(float ratio, float pratio, std::string &inject); + bool _tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t); + bool check_failsafe_full(DoutPrefixProvider *dpp) const; + bool check_full(DoutPrefixProvider *dpp) const; + bool tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t); + bool check_backfill_full(DoutPrefixProvider *dpp) const; + bool check_nearfull(DoutPrefixProvider *dpp) const; + bool is_failsafe_full() const; + bool is_full() const; + bool is_backfillfull() const; + bool is_nearfull() const; + bool need_fullness_update(); ///< osdmap state needs update + void set_injectfull(s_names type, int64_t count); + + + // -- epochs -- +private: + // protects access to boot_epoch, up_epoch, bind_epoch + mutable ceph::mutex epoch_lock = ceph::make_mutex("OSDService::epoch_lock"); + epoch_t boot_epoch; // _first_ epoch we were marked up (after this process started) + epoch_t up_epoch; // _most_recent_ epoch we were 
marked up + epoch_t bind_epoch; // epoch we last did a bind to new ip:ports +public: + /** + * Retrieve the boot_, up_, and bind_ epochs the OSD has std::set. The params + * can be NULL if you don't care about them. + */ + void retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch, + epoch_t *_bind_epoch) const; + /** + * Std::set the boot, up, and bind epochs. Any NULL params will not be std::set. + */ + void set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch, + const epoch_t *_bind_epoch); + epoch_t get_boot_epoch() const { + epoch_t ret; + retrieve_epochs(&ret, NULL, NULL); + return ret; + } + epoch_t get_up_epoch() const { + epoch_t ret; + retrieve_epochs(NULL, &ret, NULL); + return ret; + } + epoch_t get_bind_epoch() const { + epoch_t ret; + retrieve_epochs(NULL, NULL, &ret); + return ret; + } + + void request_osdmap_update(epoch_t e); + + // -- heartbeats -- + ceph::mutex hb_stamp_lock = ceph::make_mutex("OSDServce::hb_stamp_lock"); + + /// osd -> heartbeat stamps + std::vector hb_stamps; + + /// get or create a ref for a peer's HeartbeatStamps + HeartbeatStampsRef get_hb_stamps(unsigned osd); + + + // Timer for readable leases + ceph::timer mono_timer = ceph::timer{ceph::construct_suspended}; + + void queue_renew_lease(epoch_t epoch, spg_t spgid); + + // -- stopping -- + ceph::mutex is_stopping_lock = ceph::make_mutex("OSDService::is_stopping_lock"); + ceph::condition_variable is_stopping_cond; + enum { + NOT_STOPPING, + PREPARING_TO_STOP, + STOPPING }; + std::atomic state{NOT_STOPPING}; + int get_state() const { + return state; + } + void set_state(int s) { + state = s; + } + bool is_stopping() const { + return state == STOPPING; + } + bool is_preparing_to_stop() const { + return state == PREPARING_TO_STOP; + } + bool prepare_to_stop(); + void got_stop_ack(); + + +#ifdef PG_DEBUG_REFS + ceph::mutex pgid_lock = ceph::make_mutex("OSDService::pgid_lock"); + std::map pgid_tracker; + std::map live_pgs; + void add_pgid(spg_t pgid, PG *pg); + void remove_pgid(spg_t pgid, PG *pg); + void dump_live_pgids(); +#endif + + explicit OSDService(OSD *osd, ceph::async::io_context_pool& poolctx); + ~OSDService() = default; +}; + +/* + + Each PG slot includes queues for events that are processing and/or waiting + for a PG to be materialized in the slot. + + These are the constraints: + + - client ops must remained ordered by client, regardless of std::map epoch + - peering messages/events from peers must remain ordered by peer + - peering messages and client ops need not be ordered relative to each other + + - some peering events can create a pg (e.g., notify) + - the query peering event can proceed when a PG doesn't exist + + Implementation notes: + + - everybody waits for split. If the OSD has the parent PG it will instantiate + the PGSlot early and mark it waiting_for_split. Everything will wait until + the parent is able to commit the split operation and the child PG's are + materialized in the child slots. + + - every event has an epoch property and will wait for the OSDShard to catch + up to that epoch. For example, if we get a peering event from a future + epoch, the event will wait in the slot until the local OSD has caught up. + (We should be judicious in specifying the required epoch [by, e.g., setting + it to the same_interval_since epoch] so that we don't wait for epochs that + don't affect the given PG.) + + - we maintain two separate wait lists, *waiting* and *waiting_peering*. The + OpSchedulerItem has an is_peering() bool to determine which we use. 
Waiting + peering events are queued up by epoch required. + + - when we wake a PG slot (e.g., we finished split, or got a newer osdmap, or + materialized the PG), we wake *all* waiting items. (This could be optimized, + probably, but we don't bother.) We always requeue peering items ahead of + client ops. + + - some peering events are marked !peering_requires_pg (PGQuery). if we do + not have a PG these are processed immediately (under the shard lock). + + - we do not have a PG present, we check if the slot maps to the current host. + if so, we either queue the item and wait for the PG to materialize, or + (if the event is a pg creating event like PGNotify), we materialize the PG. + + - when we advance the osdmap on the OSDShard, we scan pg slots and + discard any slots with no pg (and not waiting_for_split) that no + longer std::map to the current host. + + */ + +struct OSDShardPGSlot { + using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem; + PGRef pg; ///< pg reference + std::deque to_process; ///< order items for this slot + int num_running = 0; ///< _process threads doing pg lookup/lock + + std::deque waiting; ///< waiting for pg (or map + pg) + + /// waiting for map (peering evt) + std::map> waiting_peering; + + /// incremented by wake_pg_waiters; indicates racing _process threads + /// should bail out (their op has been requeued) + uint64_t requeue_seq = 0; + + /// waiting for split child to materialize in these epoch(s) + std::set waiting_for_split; + + epoch_t epoch = 0; + boost::intrusive::set_member_hook<> pg_epoch_item; + + /// waiting for a merge (source or target) by this epoch + epoch_t waiting_for_merge_epoch = 0; +}; + +struct OSDShard { + const unsigned shard_id; + CephContext *cct; + OSD *osd; + + std::string shard_name; + + std::string sdata_wait_lock_name; + ceph::mutex sdata_wait_lock; + ceph::condition_variable sdata_cond; + int waiting_threads = 0; + + ceph::mutex osdmap_lock; ///< protect shard_osdmap updates vs users w/o shard_lock + OSDMapRef shard_osdmap; + + OSDMapRef get_osdmap() { + std::lock_guard l(osdmap_lock); + return shard_osdmap; + } + + std::string shard_lock_name; + ceph::mutex shard_lock; ///< protects remaining members below + + /// map of slots for each spg_t. maintains ordering of items dequeued + /// from scheduler while _process thread drops shard lock to acquire the + /// pg lock. stale slots are removed by consume_map. 
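+ /// (illustrative note: a pg belongs to this shard when
+ /// pgid.hash_to_shard(osd->num_shards) == shard_id, the same mapping
+ /// _prime_splits() and the op workqueue use to pick a shard)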
+ std::unordered_map> pg_slots; + + struct pg_slot_compare_by_epoch { + bool operator()(const OSDShardPGSlot& l, const OSDShardPGSlot& r) const { + return l.epoch < r.epoch; + } + }; + + /// maintain an ordering of pg slots by pg epoch + boost::intrusive::multiset< + OSDShardPGSlot, + boost::intrusive::member_hook< + OSDShardPGSlot, + boost::intrusive::set_member_hook<>, + &OSDShardPGSlot::pg_epoch_item>, + boost::intrusive::compare> pg_slots_by_epoch; + int waiting_for_min_pg_epoch = 0; + ceph::condition_variable min_pg_epoch_cond; + + /// priority queue + ceph::osd::scheduler::OpSchedulerRef scheduler; + + bool stop_waiting = false; + + ContextQueue context_queue; + + void _attach_pg(OSDShardPGSlot *slot, PG *pg); + void _detach_pg(OSDShardPGSlot *slot); + + void update_pg_epoch(OSDShardPGSlot *slot, epoch_t epoch); + epoch_t get_min_pg_epoch(); + void wait_min_pg_epoch(epoch_t need); + + /// return newest epoch we are waiting for + epoch_t get_max_waiting_epoch(); + + /// push osdmap into shard + void consume_map( + const OSDMapRef& osdmap, + unsigned *pushes_to_free); + + void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot); + + void identify_splits_and_merges( + const OSDMapRef& as_of_osdmap, + std::set> *split_children, + std::set> *merge_pgs); + void _prime_splits(std::set> *pgids); + void prime_splits(const OSDMapRef& as_of_osdmap, + std::set> *pgids); + void prime_merges(const OSDMapRef& as_of_osdmap, + std::set> *merge_pgs); + void register_and_wake_split_child(PG *pg); + void unprime_split_children(spg_t parent, unsigned old_pg_num); + void update_scheduler_config(); + + OSDShard( + int id, + CephContext *cct, + OSD *osd); +}; + +class OSD : public Dispatcher, + public md_config_obs_t { + using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem; + + /** OSD **/ + // global lock + ceph::mutex osd_lock = ceph::make_mutex("OSD::osd_lock"); + SafeTimer tick_timer; // safe timer (osd_lock) + + // Tick timer for those stuff that do not need osd_lock + ceph::mutex tick_timer_lock = ceph::make_mutex("OSD::tick_timer_lock"); + SafeTimer tick_timer_without_osd_lock; + std::string gss_ktfile_client{}; + +public: + // config observer bits + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set &changed) override; + void update_log_config(); + void check_config(); + +protected: + + const double OSD_TICK_INTERVAL = { 1.0 }; + double get_tick_interval() const; + + Messenger *cluster_messenger; + Messenger *client_messenger; + Messenger *objecter_messenger; + MonClient *monc; // check the "monc helpers" list before accessing directly + MgrClient mgrc; + PerfCounters *logger; + PerfCounters *recoverystate_perf; + ObjectStore *store; +#ifdef HAVE_LIBFUSE + FuseStore *fuse_store = nullptr; +#endif + LogClient log_client; + LogChannelRef clog; + + int whoami; + std::string dev_path, journal_path; + + ceph_release_t last_require_osd_release{ceph_release_t::unknown}; + + int numa_node = -1; + size_t numa_cpu_set_size = 0; + cpu_set_t numa_cpu_set; + + bool store_is_rotational = true; + bool journal_is_rotational = true; + + ZTracer::Endpoint trace_endpoint; + PerfCounters* create_logger(); + PerfCounters* create_recoverystate_perf(); + void tick(); + void tick_without_osd_lock(); + void _dispatch(Message *m); + void dispatch_op(OpRequestRef op); + + void check_osdmap_features(); + + // asok + friend class OSDSocketHook; + class OSDSocketHook *asok_hook; + void asok_command( + std::string_view prefix, + const cmdmap_t& cmdmap, + 
ceph::Formatter *f, + const ceph::buffer::list& inbl, + std::function on_finish); + +public: + int get_nodeid() { return whoami; } + + static ghobject_t get_osdmap_pobject_name(epoch_t epoch) { + char foo[20]; + snprintf(foo, sizeof(foo), "osdmap.%d", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(foo), 0))); + } + static ghobject_t get_inc_osdmap_pobject_name(epoch_t epoch) { + char foo[22]; + snprintf(foo, sizeof(foo), "inc_osdmap.%d", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(foo), 0))); + } + + static ghobject_t make_snapmapper_oid() { + return ghobject_t(hobject_t( + sobject_t( + object_t("snapmapper"), + 0))); + } + static ghobject_t make_purged_snaps_oid() { + return ghobject_t(hobject_t( + sobject_t( + object_t("purged_snaps"), + 0))); + } + + static ghobject_t make_pg_log_oid(spg_t pg) { + std::stringstream ss; + ss << "pglog_" << pg; + std::string s; + getline(ss, s); + return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0))); + } + + static ghobject_t make_pg_biginfo_oid(spg_t pg) { + std::stringstream ss; + ss << "pginfo_" << pg; + std::string s; + getline(ss, s); + return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0))); + } + static ghobject_t make_infos_oid() { + hobject_t oid(sobject_t("infos", CEPH_NOSNAP)); + return ghobject_t(oid); + } + + static ghobject_t make_final_pool_info_oid(int64_t pool) { + return ghobject_t( + hobject_t( + sobject_t( + object_t(std::string("final_pool_") + stringify(pool)), + CEPH_NOSNAP))); + } + + static ghobject_t make_pg_num_history_oid() { + return ghobject_t(hobject_t(sobject_t("pg_num_history", CEPH_NOSNAP))); + } + + static void recursive_remove_collection(CephContext* cct, + ObjectStore *store, + spg_t pgid, + coll_t tmp); + + /** + * get_osd_initial_compat_set() + * + * Get the initial feature std::set for this OSD. Features + * here are automatically upgraded. 
+ * + * Return value: Initial osd CompatSet + */ + static CompatSet get_osd_initial_compat_set(); + + /** + * get_osd_compat_set() + * + * Get all features supported by this OSD + * + * Return value: CompatSet of all supported features + */ + static CompatSet get_osd_compat_set(); + + +private: + class C_Tick; + class C_Tick_WithoutOSDLock; + + // -- config settings -- + float m_osd_pg_epoch_max_lag_factor; + + // -- superblock -- + OSDSuperblock superblock; + + void write_superblock(); + void write_superblock(ObjectStore::Transaction& t); + int read_superblock(); + + void clear_temp_objects(); + + CompatSet osd_compat; + + // -- state -- +public: + typedef enum { + STATE_INITIALIZING = 1, + STATE_PREBOOT, + STATE_BOOTING, + STATE_ACTIVE, + STATE_STOPPING, + STATE_WAITING_FOR_HEALTHY + } osd_state_t; + + static const char *get_state_name(int s) { + switch (s) { + case STATE_INITIALIZING: return "initializing"; + case STATE_PREBOOT: return "preboot"; + case STATE_BOOTING: return "booting"; + case STATE_ACTIVE: return "active"; + case STATE_STOPPING: return "stopping"; + case STATE_WAITING_FOR_HEALTHY: return "waiting_for_healthy"; + default: return "???"; + } + } + +private: + std::atomic state{STATE_INITIALIZING}; + +public: + int get_state() const { + return state; + } + void set_state(int s) { + state = s; + } + bool is_initializing() const { + return state == STATE_INITIALIZING; + } + bool is_preboot() const { + return state == STATE_PREBOOT; + } + bool is_booting() const { + return state == STATE_BOOTING; + } + bool is_active() const { + return state == STATE_ACTIVE; + } + bool is_stopping() const { + return state == STATE_STOPPING; + } + bool is_waiting_for_healthy() const { + return state == STATE_WAITING_FOR_HEALTHY; + } + +private: + + ShardedThreadPool osd_op_tp; + + void get_latest_osdmap(); + + // -- sessions -- +private: + void dispatch_session_waiting(const ceph::ref_t& session, OSDMapRef osdmap); + + ceph::mutex session_waiting_lock = ceph::make_mutex("OSD::session_waiting_lock"); + std::set> session_waiting_for_map; + + /// Caller assumes refs for included Sessions + void get_sessions_waiting_for_map(std::set> *out) { + std::lock_guard l(session_waiting_lock); + out->swap(session_waiting_for_map); + } + void register_session_waiting_on_map(const ceph::ref_t& session) { + std::lock_guard l(session_waiting_lock); + session_waiting_for_map.insert(session); + } + void clear_session_waiting_on_map(const ceph::ref_t& session) { + std::lock_guard l(session_waiting_lock); + session_waiting_for_map.erase(session); + } + void dispatch_sessions_waiting_on_map() { + std::set> sessions_to_check; + get_sessions_waiting_for_map(&sessions_to_check); + for (auto i = sessions_to_check.begin(); + i != sessions_to_check.end(); + sessions_to_check.erase(i++)) { + std::lock_guard l{(*i)->session_dispatch_lock}; + dispatch_session_waiting(*i, get_osdmap()); + } + } + void session_handle_reset(const ceph::ref_t& session) { + std::lock_guard l(session->session_dispatch_lock); + clear_session_waiting_on_map(session); + + session->clear_backoffs(); + + /* Messages have connection refs, we need to clear the + * connection->session->message->connection + * cycles which result. + * Bug #12338 + */ + session->waiting_on_map.clear_and_dispose(TrackedOp::Putter()); + } + +private: + /** + * @defgroup monc helpers + * @{ + * Right now we only have the one + */ + + /** + * Ask the Monitors for a sequence of OSDMaps. 
+ * + * @param epoch The epoch to start with when replying + * @param force_request True if this request forces a new subscription to + * the monitors; false if an outstanding request that encompasses it is + * sufficient. + */ + void osdmap_subscribe(version_t epoch, bool force_request); + /** @} monc helpers */ + + ceph::mutex osdmap_subscribe_lock = ceph::make_mutex("OSD::osdmap_subscribe_lock"); + epoch_t latest_subscribed_epoch{0}; + + // -- heartbeat -- + /// information about a heartbeat peer + struct HeartbeatInfo { + int peer; ///< peer + ConnectionRef con_front; ///< peer connection (front) + ConnectionRef con_back; ///< peer connection (back) + utime_t first_tx; ///< time we sent our first ping request + utime_t last_tx; ///< last time we sent a ping request + utime_t last_rx_front; ///< last time we got a ping reply on the front side + utime_t last_rx_back; ///< last time we got a ping reply on the back side + epoch_t epoch; ///< most recent epoch we wanted this peer + /// number of connections we send and receive heartbeat pings/replies + static constexpr int HEARTBEAT_MAX_CONN = 2; + /// history of inflight pings, arranging by timestamp we sent + /// send time -> deadline -> remaining replies + std::map> ping_history; + + utime_t hb_interval_start; + uint32_t hb_average_count = 0; + uint32_t hb_index = 0; + + uint32_t hb_total_back = 0; + uint32_t hb_min_back = UINT_MAX; + uint32_t hb_max_back = 0; + std::vector hb_back_pingtime; + std::vector hb_back_min; + std::vector hb_back_max; + + uint32_t hb_total_front = 0; + uint32_t hb_min_front = UINT_MAX; + uint32_t hb_max_front = 0; + std::vector hb_front_pingtime; + std::vector hb_front_min; + std::vector hb_front_max; + + bool is_stale(utime_t stale) const { + if (ping_history.empty()) { + return false; + } + utime_t oldest_deadline = ping_history.begin()->second.first; + return oldest_deadline <= stale; + } + + bool is_unhealthy(utime_t now) const { + if (ping_history.empty()) { + /// we haven't sent a ping yet or we have got all replies, + /// in either way we are safe and healthy for now + return false; + } + + utime_t oldest_deadline = ping_history.begin()->second.first; + return now > oldest_deadline; + } + + bool is_healthy(utime_t now) const { + if (last_rx_front == utime_t() || last_rx_back == utime_t()) { + // only declare to be healthy until we have received the first + // replies from both front/back connections + return false; + } + return !is_unhealthy(now); + } + + void clear_mark_down(Connection *except = nullptr) { + if (con_back && con_back != except) { + con_back->mark_down(); + con_back->clear_priv(); + con_back.reset(nullptr); + } + if (con_front && con_front != except) { + con_front->mark_down(); + con_front->clear_priv(); + con_front.reset(nullptr); + } + } + }; + + ceph::mutex heartbeat_lock = ceph::make_mutex("OSD::heartbeat_lock"); + std::map debug_heartbeat_drops_remaining; + ceph::condition_variable heartbeat_cond; + bool heartbeat_stop; + std::atomic heartbeat_need_update; + std::map heartbeat_peers; ///< map of osd id to HeartbeatInfo + utime_t last_mon_heartbeat; + Messenger *hb_front_client_messenger; + Messenger *hb_back_client_messenger; + Messenger *hb_front_server_messenger; + Messenger *hb_back_server_messenger; + utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state + double daily_loadavg; + ceph::mono_time startup_time; + + // Track ping repsonse times using vector as a circular buffer + // MUST BE A POWER OF 2 + const uint32_t hb_vector_size = 
16; + + void _add_heartbeat_peer(int p); + void _remove_heartbeat_peer(int p); + bool heartbeat_reset(Connection *con); + void maybe_update_heartbeat_peers(); + void reset_heartbeat_peers(bool all); + bool heartbeat_peers_need_update() { + return heartbeat_need_update.load(); + } + void heartbeat_set_peers_need_update() { + heartbeat_need_update.store(true); + } + void heartbeat_clear_peers_need_update() { + heartbeat_need_update.store(false); + } + void heartbeat(); + void heartbeat_check(); + void heartbeat_entry(); + void need_heartbeat_peer_update(); + + void heartbeat_kick() { + std::lock_guard l(heartbeat_lock); + heartbeat_cond.notify_all(); + } + + struct T_Heartbeat : public Thread { + OSD *osd; + explicit T_Heartbeat(OSD *o) : osd(o) {} + void *entry() override { + osd->heartbeat_entry(); + return 0; + } + } heartbeat_thread; + +public: + bool heartbeat_dispatch(Message *m); + + struct HeartbeatDispatcher : public Dispatcher { + OSD *osd; + explicit HeartbeatDispatcher(OSD *o) : Dispatcher(o->cct), osd(o) {} + + bool ms_can_fast_dispatch_any() const override { return true; } + bool ms_can_fast_dispatch(const Message *m) const override { + switch (m->get_type()) { + case CEPH_MSG_PING: + case MSG_OSD_PING: + return true; + default: + return false; + } + } + void ms_fast_dispatch(Message *m) override { + osd->heartbeat_dispatch(m); + } + bool ms_dispatch(Message *m) override { + return osd->heartbeat_dispatch(m); + } + bool ms_handle_reset(Connection *con) override { + return osd->heartbeat_reset(con); + } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override { + return osd->ms_handle_refused(con); + } + int ms_handle_authentication(Connection *con) override { + return true; + } + } heartbeat_dispatcher; + +private: + // -- waiters -- + std::list<Context*> finished; + + void take_waiters(std::list<Context*>& ls) { + ceph_assert(ceph_mutex_is_locked(osd_lock)); + finished.splice(finished.end(), ls); + } + void do_waiters(); + + // -- op tracking -- + OpTracker op_tracker; + void test_ops(std::string command, std::string args, std::ostream& ss); + friend class TestOpsSocketHook; + TestOpsSocketHook *test_ops_hook; + friend struct C_FinishSplits; + friend struct C_OpenPGs; + +protected: + + /* + * The ordered op delivery chain is: + * + * fast dispatch -> scheduler back + * scheduler front <-> to_process back + * to_process front -> RunVis(item) + * <- queue_front() + * + * The scheduler is per-shard, and to_process is per pg_slot. Items can be + * pushed back up into to_process and/or scheduler while order is preserved. + * + * Multiple worker threads can operate on each shard. + * + * Under normal circumstances, num_running == to_process.size(). There are + * two times when that is not true: (1) when waiting_for_pg == true and + * to_process is accumulating requests that are waiting for the pg to be + * instantiated; in that case they will all get requeued together by + * wake_pg_waiters, and (2) just after wake_pg_waiters has run, cleared + * waiting_for_pg, and already requeued the items.
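+ * + * As a rough illustration only (the names below approximate the per-shard scheduler and per-slot to_process queue declared above; this is not verbatim code from _process): + * + * scheduler->enqueue(std::move(item)); // fast dispatch -> scheduler back + * auto work = scheduler->dequeue(); // worker thread: scheduler front + * slot->to_process.push_back(std::move(work)); // -> to_process back + * process(slot->to_process.front()); // to_process front -> RunVis(item) + * + * with requeues going back in via _enqueue_front() / queue_front().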
+ */ + friend class ceph::osd::scheduler::PGOpItem; + friend class ceph::osd::scheduler::PGPeeringItem; + friend class ceph::osd::scheduler::PGRecovery; + friend class ceph::osd::scheduler::PGRecoveryMsg; + friend class ceph::osd::scheduler::PGDelete; + + class ShardedOpWQ + : public ShardedThreadPool::ShardedWQ + { + OSD *osd; + + public: + ShardedOpWQ(OSD *o, + ceph::timespan ti, + ceph::timespan si, + ShardedThreadPool* tp) + : ShardedThreadPool::ShardedWQ(ti, si, tp), + osd(o) { + } + + void _add_slot_waiter( + spg_t token, + OSDShardPGSlot *slot, + OpSchedulerItem&& qi); + + /// try to do some work + void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override; + + /// enqueue a new item + void _enqueue(OpSchedulerItem&& item) override; + + /// requeue an old item (at the front of the line) + void _enqueue_front(OpSchedulerItem&& item) override; + + void return_waiting_threads() override { + for(uint32_t i = 0; i < osd->num_shards; i++) { + OSDShard* sdata = osd->shards[i]; + assert (NULL != sdata); + std::scoped_lock l{sdata->sdata_wait_lock}; + sdata->stop_waiting = true; + sdata->sdata_cond.notify_all(); + } + } + + void stop_return_waiting_threads() override { + for(uint32_t i = 0; i < osd->num_shards; i++) { + OSDShard* sdata = osd->shards[i]; + assert (NULL != sdata); + std::scoped_lock l{sdata->sdata_wait_lock}; + sdata->stop_waiting = false; + } + } + + void dump(ceph::Formatter *f) { + for(uint32_t i = 0; i < osd->num_shards; i++) { + auto &&sdata = osd->shards[i]; + + char queue_name[32] = {0}; + snprintf(queue_name, sizeof(queue_name), "%s%" PRIu32, "OSD:ShardedOpWQ:", i); + ceph_assert(NULL != sdata); + + std::scoped_lock l{sdata->shard_lock}; + f->open_object_section(queue_name); + sdata->scheduler->dump(*f); + f->close_section(); + } + } + + bool is_shard_empty(uint32_t thread_index) override { + uint32_t shard_index = thread_index % osd->num_shards; + auto &&sdata = osd->shards[shard_index]; + ceph_assert(sdata); + std::lock_guard l(sdata->shard_lock); + if (thread_index < osd->num_shards) { + return sdata->scheduler->empty() && sdata->context_queue.empty(); + } else { + return sdata->scheduler->empty(); + } + } + + void handle_oncommits(std::list& oncommits) { + for (auto p : oncommits) { + p->complete(0); + } + } + } op_shardedwq; + + + void enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch); + void dequeue_op( + PGRef pg, OpRequestRef op, + ThreadPool::TPHandle &handle); + + void enqueue_peering_evt( + spg_t pgid, + PGPeeringEventRef ref); + void dequeue_peering_evt( + OSDShard *sdata, + PG *pg, + PGPeeringEventRef ref, + ThreadPool::TPHandle& handle); + + void dequeue_delete( + OSDShard *sdata, + PG *pg, + epoch_t epoch, + ThreadPool::TPHandle& handle); + + friend class PG; + friend struct OSDShard; + friend class PrimaryLogPG; + friend class PgScrubber; + + + protected: + + // -- osd map -- + // TODO: switch to std::atomic when C++20 will be available. + OSDMapRef _osdmap; + void set_osdmap(OSDMapRef osdmap) { + std::atomic_store(&_osdmap, osdmap); + } + OSDMapRef get_osdmap() const { + return std::atomic_load(&_osdmap); + } + epoch_t get_osdmap_epoch() const { + // XXX: performance? + auto osdmap = get_osdmap(); + return osdmap ? 
osdmap->get_epoch() : 0; + } + + pool_pg_num_history_t pg_num_history; + + ceph::shared_mutex map_lock = ceph::make_shared_mutex("OSD::map_lock"); + std::list waiting_for_osdmap; + std::deque osd_markdown_log; + + friend struct send_map_on_destruct; + + void wait_for_new_map(OpRequestRef op); + void handle_osd_map(class MOSDMap *m); + void _committed_osd_maps(epoch_t first, epoch_t last, class MOSDMap *m); + void trim_maps(epoch_t oldest, int nreceived, bool skip_maps); + void note_down_osd(int osd); + void note_up_osd(int osd); + friend struct C_OnMapCommit; + + bool advance_pg( + epoch_t advance_to, + PG *pg, + ThreadPool::TPHandle &handle, + PeeringCtx &rctx); + void consume_map(); + void activate_map(); + + // osd map cache (past osd maps) + OSDMapRef get_map(epoch_t e) { + return service.get_map(e); + } + OSDMapRef add_map(OSDMap *o) { + return service.add_map(o); + } + bool get_map_bl(epoch_t e, ceph::buffer::list& bl) { + return service.get_map_bl(e, bl); + } + +public: + // -- shards -- + std::vector shards; + uint32_t num_shards = 0; + + void inc_num_pgs() { + ++num_pgs; + } + void dec_num_pgs() { + --num_pgs; + } + int get_num_pgs() const { + return num_pgs; + } + +protected: + ceph::mutex merge_lock = ceph::make_mutex("OSD::merge_lock"); + /// merge epoch -> target pgid -> source pgid -> pg + std::map>> merge_waiters; + + bool add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef source, + unsigned need); + + // -- placement groups -- + std::atomic num_pgs = {0}; + + std::mutex pending_creates_lock; + using create_from_osd_t = std::pair; + std::set pending_creates_from_osd; + unsigned pending_creates_from_mon = 0; + + PGRecoveryStats pg_recovery_stats; + + PGRef _lookup_pg(spg_t pgid); + PGRef _lookup_lock_pg(spg_t pgid); + void register_pg(PGRef pg); + bool try_finish_pg_delete(PG *pg, unsigned old_pg_num); + + void _get_pgs(std::vector *v, bool clear_too=false); + void _get_pgids(std::vector *v); + +public: + PGRef lookup_lock_pg(spg_t pgid); + + std::set get_mapped_pools(); + +protected: + PG* _make_pg(OSDMapRef createmap, spg_t pgid); + + bool maybe_wait_for_max_pg(const OSDMapRef& osdmap, + spg_t pgid, bool is_mon_create); + void resume_creating_pg(); + + void load_pgs(); + + /// build initial pg history and intervals on create + void build_initial_pg_history( + spg_t pgid, + epoch_t created, + utime_t created_stamp, + pg_history_t *h, + PastIntervals *pi); + + epoch_t last_pg_create_epoch; + + void handle_pg_create(OpRequestRef op); + + void split_pgs( + PG *parent, + const std::set &childpgids, std::set *out_pgs, + OSDMapRef curmap, + OSDMapRef nextmap, + PeeringCtx &rctx); + void _finish_splits(std::set& pgs); + + // == monitor interaction == + ceph::mutex mon_report_lock = ceph::make_mutex("OSD::mon_report_lock"); + utime_t last_mon_report; + Finisher boot_finisher; + + // -- boot -- + void start_boot(); + void _got_mon_epochs(epoch_t oldest, epoch_t newest); + void _preboot(epoch_t oldest, epoch_t newest); + void _send_boot(); + void _collect_metadata(std::map *pmeta); + void _get_purged_snaps(); + void handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *r); + + void start_waiting_for_healthy(); + bool _is_healthy(); + + void send_full_update(); + + friend struct CB_OSD_GetVersion; + + // -- alive -- + epoch_t up_thru_wanted; + + void queue_want_up_thru(epoch_t want); + void send_alive(); + + // -- full map requests -- + epoch_t requested_full_first, requested_full_last; + + void request_full_map(epoch_t first, epoch_t last); + void rerequest_full_maps() { + 
epoch_t first = requested_full_first; + epoch_t last = requested_full_last; + requested_full_first = 0; + requested_full_last = 0; + request_full_map(first, last); + } + void got_full_map(epoch_t e); + + // -- failures -- + std::map failure_queue; + std::map > failure_pending; + + void requeue_failures(); + void send_failures(); + void send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs); + void cancel_pending_failures(); + + ceph::coarse_mono_clock::time_point last_sent_beacon; + ceph::mutex min_last_epoch_clean_lock = ceph::make_mutex("OSD::min_last_epoch_clean_lock"); + epoch_t min_last_epoch_clean = 0; + // which pgs were scanned for min_lec + std::vector min_last_epoch_clean_pgs; + void send_beacon(const ceph::coarse_mono_clock::time_point& now); + + ceph_tid_t get_tid() { + return service.get_tid(); + } + + double scrub_sleep_time(bool must_scrub); + + // -- generic pg peering -- + PeeringCtx create_context(); + void dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap, + ThreadPool::TPHandle *handle = NULL); + + bool require_mon_peer(const Message *m); + bool require_mon_or_mgr_peer(const Message *m); + bool require_osd_peer(const Message *m); + /*** + * Verifies that we were alive in the given epoch, and that + * still are. + */ + bool require_self_aliveness(const Message *m, epoch_t alive_since); + /** + * Verifies that the OSD who sent the given op has the same + * address as in the given std::map. + * @pre op was sent by an OSD using the cluster messenger + */ + bool require_same_peer_instance(const Message *m, const OSDMapRef& map, + bool is_fast_dispatch); + + bool require_same_or_newer_map(OpRequestRef& op, epoch_t e, + bool is_fast_dispatch); + + void handle_fast_pg_create(MOSDPGCreate2 *m); + void handle_fast_pg_query(MOSDPGQuery *m); + void handle_pg_query_nopg(const MQuery& q); + void handle_fast_pg_notify(MOSDPGNotify *m); + void handle_pg_notify_nopg(const MNotifyRec& q); + void handle_fast_pg_info(MOSDPGInfo *m); + void handle_fast_pg_remove(MOSDPGRemove *m); + +public: + // used by OSDShard + PGRef handle_pg_create_info(const OSDMapRef& osdmap, const PGCreateInfo *info); +protected: + + void handle_fast_force_recovery(MOSDForceRecovery *m); + + // -- commands -- + void handle_command(class MCommand *m); + + + // -- pg recovery -- + void do_recovery(PG *pg, epoch_t epoch_queued, uint64_t pushes_reserved, + ThreadPool::TPHandle &handle); + + + // -- scrubbing -- + void sched_scrub(); + void resched_all_scrubs(); + bool scrub_random_backoff(); + bool scrub_load_below_threshold(); + bool scrub_time_permit(utime_t now); + + // -- status reporting -- + MPGStats *collect_pg_stats(); + std::vector get_health_metrics(); + + +private: + bool ms_can_fast_dispatch_any() const override { return true; } + bool ms_can_fast_dispatch(const Message *m) const override { + switch (m->get_type()) { + case CEPH_MSG_PING: + case CEPH_MSG_OSD_OP: + case CEPH_MSG_OSD_BACKOFF: + case MSG_OSD_SCRUB2: + case MSG_OSD_FORCE_RECOVERY: + case MSG_MON_COMMAND: + case MSG_OSD_PG_CREATE2: + case MSG_OSD_PG_QUERY: + case MSG_OSD_PG_QUERY2: + case MSG_OSD_PG_INFO: + case MSG_OSD_PG_INFO2: + case MSG_OSD_PG_NOTIFY: + case MSG_OSD_PG_NOTIFY2: + case MSG_OSD_PG_LOG: + case MSG_OSD_PG_TRIM: + case MSG_OSD_PG_REMOVE: + case MSG_OSD_BACKFILL_RESERVE: + case MSG_OSD_RECOVERY_RESERVE: + case MSG_OSD_REPOP: + case MSG_OSD_REPOPREPLY: + case MSG_OSD_PG_PUSH: + case MSG_OSD_PG_PULL: + case MSG_OSD_PG_PUSH_REPLY: + case MSG_OSD_PG_SCAN: + case MSG_OSD_PG_BACKFILL: + case 
MSG_OSD_PG_BACKFILL_REMOVE: + case MSG_OSD_EC_WRITE: + case MSG_OSD_EC_WRITE_REPLY: + case MSG_OSD_EC_READ: + case MSG_OSD_EC_READ_REPLY: + case MSG_OSD_SCRUB_RESERVE: + case MSG_OSD_REP_SCRUB: + case MSG_OSD_REP_SCRUBMAP: + case MSG_OSD_PG_UPDATE_LOG_MISSING: + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + case MSG_OSD_PG_RECOVERY_DELETE: + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + case MSG_OSD_PG_LEASE: + case MSG_OSD_PG_LEASE_ACK: + return true; + default: + return false; + } + } + void ms_fast_dispatch(Message *m) override; + bool ms_dispatch(Message *m) override; + void ms_handle_connect(Connection *con) override; + void ms_handle_fast_connect(Connection *con) override; + void ms_handle_fast_accept(Connection *con) override; + int ms_handle_authentication(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + public: + /* internal and external can point to the same messenger, they will still + * be cleaned up properly*/ + OSD(CephContext *cct_, + ObjectStore *store_, + int id, + Messenger *internal, + Messenger *external, + Messenger *hb_front_client, + Messenger *hb_back_client, + Messenger *hb_front_server, + Messenger *hb_back_server, + Messenger *osdc_messenger, + MonClient *mc, const std::string &dev, const std::string &jdev, + ceph::async::io_context_pool& poolctx); + ~OSD() override; + + // static bits + static int mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, std::string osdspec_affinity); + + /* remove any non-user xattrs from a std::map of them */ + void filter_xattrs(std::map& attrs) { + for (std::map::iterator iter = attrs.begin(); + iter != attrs.end(); + ) { + if (('_' != iter->first.at(0)) || (iter->first.size() == 1)) + attrs.erase(iter++); + else ++iter; + } + } + +private: + int mon_cmd_maybe_osd_create(std::string &cmd); + int update_crush_device_class(); + int update_crush_location(); + + static int write_meta(CephContext *cct, + ObjectStore *store, + uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, std::string& osdspec_affinity); + + void handle_scrub(class MOSDScrub *m); + void handle_fast_scrub(class MOSDScrub2 *m); + void handle_osd_ping(class MOSDPing *m); + + size_t get_num_cache_shards(); + int get_num_op_shards(); + int get_num_op_threads(); + + float get_osd_recovery_sleep(); + float get_osd_delete_sleep(); + float get_osd_snap_trim_sleep(); + + int get_recovery_max_active(); + void maybe_override_max_osd_capacity_for_qos(); + bool maybe_override_options_for_qos(); + int run_osd_bench_test(int64_t count, + int64_t bsize, + int64_t osize, + int64_t onum, + double *elapsed, + std::ostream& ss); + int mon_cmd_set_config(const std::string &key, const std::string &val); + + void scrub_purged_snaps(); + void probe_smart(const std::string& devid, std::ostream& ss); + +public: + static int peek_meta(ObjectStore *store, + std::string *magic, + uuid_d *cluster_fsid, + uuid_d *osd_fsid, + int *whoami, + ceph_release_t *min_osd_release); + + + // startup/shutdown + int pre_init(); + int init(); + void final_init(); + + int enable_disable_fuse(bool stop); + int set_numa_affinity(); + + void suicide(int exitcode); + int shutdown(); + + void handle_signal(int signum); + + /// check if we can throw out op from a disconnected client + static bool op_is_discardable(const MOSDOp *m); + +public: + OSDService service; + friend class OSDService; + +private: + void set_perf_queries(const ConfigPayload &config_payload); + 
MetricPayload get_perf_reports(); + + ceph::mutex m_perf_queries_lock = ceph::make_mutex("OSD::m_perf_queries_lock"); + std::list m_perf_queries; + std::map m_perf_limits; +}; + + +//compatibility of the executable +extern const CompatSet::Feature ceph_osd_feature_compat[]; +extern const CompatSet::Feature ceph_osd_feature_ro_compat[]; +extern const CompatSet::Feature ceph_osd_feature_incompat[]; + +#endif // CEPH_OSD_H diff --git a/src/osd/OSDCap.cc b/src/osd/OSDCap.cc new file mode 100644 index 000000000..e7bf05827 --- /dev/null +++ b/src/osd/OSDCap.cc @@ -0,0 +1,532 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2009-2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include + +#include "OSDCap.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/ipaddr.h" + +using std::ostream; +using std::string; +using std::vector; + +ostream& operator<<(ostream& out, const osd_rwxa_t& p) +{ + if (p == OSD_CAP_ANY) + return out << "*"; + + if (p & OSD_CAP_R) + out << "r"; + if (p & OSD_CAP_W) + out << "w"; + if ((p & OSD_CAP_X) == OSD_CAP_X) { + out << "x"; + } else { + if (p & OSD_CAP_CLS_R) + out << " class-read"; + if (p & OSD_CAP_CLS_W) + out << " class-write"; + } + return out; +} + +ostream& operator<<(ostream& out, const OSDCapSpec& s) +{ + if (s.allow) + return out << s.allow; + if (s.class_name.length()) { + out << "class '" << s.class_name << "'"; + if (!s.method_name.empty()) { + out << " '" << s.method_name << "'"; + } + } + return out; +} + +ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns) +{ + if (!pns.pool_name.empty()) { + out << "pool " << pns.pool_name << " "; + } + if (pns.nspace) { + out << "namespace "; + if (pns.nspace->empty()) { + out << "\"\""; + } else { + out << *pns.nspace; + } + out << " "; + } + return out; +} + +ostream& operator<<(ostream &out, const OSDCapPoolTag &pt) +{ + out << "app " << pt.application << " key " << pt.key << " val " << pt.value + << " "; + return out; +} + +ostream& operator<<(ostream& out, const OSDCapMatch& m) +{ + if (!m.pool_namespace.pool_name.empty() || m.pool_namespace.nspace) { + out << m.pool_namespace; + } + + if (!m.pool_tag.application.empty()) { + out << m.pool_tag; + } + + if (m.object_prefix.length()) { + out << "object_prefix " << m.object_prefix << " "; + } + return out; +} + +ostream& operator<<(ostream& out, const OSDCapProfile& m) +{ + out << "profile " << m.name; + out << m.pool_namespace; + return out; +} + +bool OSDCapPoolNamespace::is_match(const std::string& pn, + const std::string& ns) const +{ + if (!pool_name.empty()) { + if (pool_name != pn) { + return false; + } + } + if (nspace) { + if (!nspace->empty() && nspace->back() == '*' && + boost::starts_with(ns, nspace->substr(0, nspace->length() - 1))) { + return true; + } + + if (*nspace != ns) { + return false; + } + } + return true; +} + +bool OSDCapPoolNamespace::is_match_all() const +{ + if (!pool_name.empty()) + return false; + if (nspace) + return false; + return true; +} + +bool OSDCapPoolTag::is_match(const app_map_t& app_map) const +{ + if (application.empty()) { + return true; + } + auto kv_map = app_map.find(application); + if (kv_map == app_map.end()) { + 
return false; + } + if (!key.compare("*") && !value.compare("*")) { + return true; + } + if (!key.compare("*")) { + for (auto it : kv_map->second) { + if (it.second == value) { + return true; + } + } + return false; + } + auto kv_val = kv_map->second.find(key); + if (kv_val == kv_map->second.end()) { + return false; + } + if (!value.compare("*")) { + return true; + } + return kv_val->second == value; +} + +bool OSDCapPoolTag::is_match_all() const { + return application.empty(); +} + +bool OSDCapMatch::is_match(const string& pn, const string& ns, + const OSDCapPoolTag::app_map_t& app_map, + const string& object) const +{ + if (!pool_namespace.is_match(pn, ns)) { + return false; + } else if (!pool_tag.is_match(app_map)) { + return false; + } + + if (object_prefix.length()) { + if (object.find(object_prefix) != 0) + return false; + } + return true; +} + +bool OSDCapMatch::is_match_all() const +{ +if (!pool_namespace.is_match_all()) { + return false; + } else if (!pool_tag.is_match_all()) { + return false; + } + + if (object_prefix.length()) { + return false; + } + return true; +} + +ostream& operator<<(ostream& out, const OSDCapGrant& g) +{ + out << "grant("; + if (g.profile.is_valid()) { + out << g.profile << " ["; + for (auto it = g.profile_grants.cbegin(); + it != g.profile_grants.cend(); ++it) { + if (it != g.profile_grants.cbegin()) { + out << ","; + } + out << *it; + } + out << "]"; + } else { + out << g.match << g.spec; + } + if (g.network.size()) { + out << " network " << g.network; + } + out << ")"; + return out; +} + +void OSDCapGrant::set_network(const string& n) +{ + network = n; + network_valid = ::parse_network(n.c_str(), &network_parsed, &network_prefix); +} + +bool OSDCapGrant::allow_all() const +{ + if (profile.is_valid()) { + return std::any_of(profile_grants.cbegin(), profile_grants.cend(), + [](const OSDCapGrant& grant) { + return grant.allow_all(); + }); + } + + return (match.is_match_all() && spec.allow_all()); +} + +bool OSDCapGrant::is_capable( + const string& pool_name, + const string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const string& object, + bool op_may_read, + bool op_may_write, + const std::vector& classes, + const entity_addr_t& addr, + std::vector* class_allowed) const +{ + osd_rwxa_t allow = 0; + + if (network.size() && + (!network_valid || + !network_contains(network_parsed, + network_prefix, + addr))) { + return false; + } + + if (profile.is_valid()) { + return std::any_of(profile_grants.cbegin(), profile_grants.cend(), + [&](const OSDCapGrant& grant) { + return grant.is_capable(pool_name, ns, + application_metadata, + object, op_may_read, + op_may_write, classes, addr, + class_allowed); + }); + } else { + if (match.is_match(pool_name, ns, application_metadata, object)) { + allow = allow | spec.allow; + if ((op_may_read && !(allow & OSD_CAP_R)) || + (op_may_write && !(allow & OSD_CAP_W))) { + return false; + } + if (!classes.empty()) { + // check 'allow *' + if (spec.allow_all()) { + return true; + } + + // compare this grant to each class in the operation + for (size_t i = 0; i < classes.size(); ++i) { + // check 'allow class foo [method_name]' + if (!spec.class_name.empty() && + classes[i].class_name == spec.class_name && + (spec.method_name.empty() || + classes[i].method_name == spec.method_name)) { + (*class_allowed)[i] = true; + continue; + } + // check 'allow x | class-{rw}': must be on allow list + if (!classes[i].allowed) { + continue; + } + if ((classes[i].read && !(allow & OSD_CAP_CLS_R)) || + (classes[i].write && !(allow 
& OSD_CAP_CLS_W))) { + continue; + } + (*class_allowed)[i] = true; + } + if (!std::all_of(class_allowed->cbegin(), class_allowed->cend(), + [](bool v) { return v; })) { + return false; + } + } + return true; + } + } + return false; +} + +void OSDCapGrant::expand_profile() +{ + if (profile.name == "read-only") { + // grants READ-ONLY caps to the OSD + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R))); + return; + } + if (profile.name == "read-write") { + // grants READ-WRITE caps to the OSD + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R | OSD_CAP_W))); + } + + if (profile.name == "rbd") { + // RBD read-write grant + profile_grants.emplace_back(OSDCapMatch(string(), "rbd_info"), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R))); + profile_grants.emplace_back(OSDCapMatch(string(), "rbd_children"), + OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R))); + profile_grants.emplace_back(OSDCapMatch(string(), "rbd_mirroring"), + OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R))); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace.pool_name), + OSDCapSpec("rbd", "metadata_list")); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R | + OSD_CAP_W | + OSD_CAP_X))); + } + if (profile.name == "rbd-read-only") { + // RBD read-only grant + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R | + OSD_CAP_CLS_R))); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace, + "rbd_header."), + OSDCapSpec("rbd", "child_attach")); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace, + "rbd_header."), + OSDCapSpec("rbd", "child_detach")); + } +} + +bool OSDCap::allow_all() const +{ + for (auto &grant : grants) { + if (grant.allow_all()) { + return true; + } + } + return false; +} + +void OSDCap::set_allow_all() +{ + grants.clear(); + grants.push_back(OSDCapGrant(OSDCapMatch(), OSDCapSpec(OSD_CAP_ANY))); +} + +bool OSDCap::is_capable(const string& pool_name, const string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const string& object, + bool op_may_read, bool op_may_write, + const std::vector& classes, + const entity_addr_t& addr) const +{ + std::vector class_allowed(classes.size(), false); + for (auto &grant : grants) { + if (grant.is_capable(pool_name, ns, application_metadata, + object, op_may_read, op_may_write, classes, addr, + &class_allowed)) { + return true; + } + } + return false; +} + + +// grammar +namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + +template +struct OSDCapParser : qi::grammar +{ + OSDCapParser() : OSDCapParser::base_type(osdcap) + { + using qi::char_; + using qi::int_; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + quoted_string %= + lexeme['"' >> +(char_ - '"') >> '"'] | + lexeme['\'' >> +(char_ - '\'') >> '\'']; + equoted_string %= + lexeme['"' >> *(char_ - '"') >> '"'] | + lexeme['\'' >> *(char_ - '\'') >> '\'']; + unquoted_word %= +char_("a-zA-Z0-9_./-"); + str %= quoted_string | unquoted_word; + estr %= equoted_string | unquoted_word; + network_str %= +char_("/.:a-fA-F0-9]["); + + spaces = +ascii::space; + + wildcard = (lit('*') | lit("all")) [_val = "*"]; + + pool_name %= -(spaces >> lit("pool") >> (lit('=') | spaces) >> str); + nspace %= (spaces >> lit("namespace") + >> (lit('=') | spaces) + >> estr >> 
-char_('*')); + + // match := [pool[=] [namespace[=]]] [object_prefix ] + object_prefix %= -(spaces >> lit("object_prefix") >> spaces >> str); + pooltag %= (spaces >> lit("tag") + >> spaces >> str // application + >> spaces >> (wildcard | str) // key + >> -spaces >> lit('=') >> -spaces >> (wildcard | str)); // value + + match = ( + pooltag [_val = phoenix::construct(_1)] | + (nspace >> pooltag) [_val = phoenix::construct(_1, _2)] | + (pool_name >> nspace >> object_prefix) [_val = phoenix::construct(_1, _2, _3)] | + (pool_name >> object_prefix) [_val = phoenix::construct(_1, _2)] + ); + + // rwxa := * | [r][w][x] [class-read] [class-write] + rwxa = + (spaces >> wildcard[_val = OSD_CAP_ANY]) | + ( eps[_val = 0] >> + ( + spaces >> + ( lit('r')[_val |= OSD_CAP_R] || + lit('w')[_val |= OSD_CAP_W] || + lit('x')[_val |= OSD_CAP_X] )) || + ( (spaces >> lit("class-read")[_val |= OSD_CAP_CLS_R]) || + (spaces >> lit("class-write")[_val |= OSD_CAP_CLS_W]) )); + + // capspec := * | rwx | class [] + class_name %= (spaces >> lit("class") >> spaces >> str); + method_name %= -(spaces >> str); + capspec = ( + (rwxa) [_val = phoenix::construct(_1)] | + (class_name >> method_name) [_val = phoenix::construct(_1, _2)]); + + // profile := profile [pool[=] [namespace[=]]] + profile_name %= (lit("profile") >> (lit('=') | spaces) >> str); + profile = ( + (profile_name >> pool_name >> nspace) [_val = phoenix::construct(_1, _2, _3)] | + (profile_name >> pool_name) [_val = phoenix::construct(_1, _2)]); + + // grant := allow match capspec + grant = (*ascii::blank >> + ((lit("allow") >> capspec >> match >> + -(spaces >> lit("network") >> spaces >> network_str)) + [_val = phoenix::construct(_2, _1, _3)] | + (lit("allow") >> match >> capspec >> + -(spaces >> lit("network") >> spaces >> network_str)) + [_val = phoenix::construct(_1, _2, _3)] | + (profile >> -(spaces >> lit("network") >> spaces >> network_str)) + [_val = phoenix::construct(_1, _2)] + ) >> *ascii::blank); + // osdcap := grant [grant ...] + grants %= (grant % (lit(';') | lit(','))); + osdcap = grants [_val = phoenix::construct(_1)]; + } + qi::rule spaces; + qi::rule rwxa; + qi::rule quoted_string, equoted_string; + qi::rule unquoted_word; + qi::rule str, estr, network_str; + qi::rule wildcard; + qi::rule class_name; + qi::rule method_name; + qi::rule capspec; + qi::rule pool_name; + qi::rule nspace; + qi::rule object_prefix; + qi::rule pooltag; + qi::rule match; + qi::rule profile_name; + qi::rule profile; + qi::rule grant; + qi::rule()> grants; + qi::rule osdcap; +}; + +bool OSDCap::parse(const string& str, ostream *err) +{ + OSDCapParser g; + string::const_iterator iter = str.begin(); + string::const_iterator end = str.end(); + + bool r = qi::phrase_parse(iter, end, g, ascii::space, *this); + if (r && iter == end) + return true; + + // Make sure no grants are kept after parsing failed! 
+ grants.clear(); + + if (err) + *err << "osd capability parse failed, stopped at '" << std::string(iter, end) + << "' of '" << str << "'"; + + return false; +} diff --git a/src/osd/OSDCap.h b/src/osd/OSDCap.h new file mode 100644 index 000000000..394b1a726 --- /dev/null +++ b/src/osd/OSDCap.h @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + * OSDCaps: Hold the capabilities associated with a single authenticated + * user key. These are specified by text strings of the form + * "allow r" (which allows reading anything on the OSD) + * "allow rwx pool foo" (which allows full access to listed pools) + * "allow *" (which allows full access to EVERYTHING) + * + * The full grammar is documented in the parser in OSDCap.cc. + * + * The OSD assumes that anyone with * caps is an admin and has full + * message permissions. This means that only the monitor and the OSDs + * should get * + */ + +#ifndef CEPH_OSDCAP_H +#define CEPH_OSDCAP_H + +#include +using std::ostream; + +#include "include/types.h" +#include "OpRequest.h" + +#include +#include +#include +#include + +static const __u8 OSD_CAP_R = (1 << 1); // read +static const __u8 OSD_CAP_W = (1 << 2); // write +static const __u8 OSD_CAP_CLS_R = (1 << 3); // class read +static const __u8 OSD_CAP_CLS_W = (1 << 4); // class write +static const __u8 OSD_CAP_X = (OSD_CAP_CLS_R | OSD_CAP_CLS_W); // execute +static const __u8 OSD_CAP_ANY = 0xff; // * + +struct osd_rwxa_t { + __u8 val; + + // cppcheck-suppress noExplicitConstructor + osd_rwxa_t(__u8 v = 0) : val(v) {} + osd_rwxa_t& operator=(__u8 v) { + val = v; + return *this; + } + operator __u8() const { + return val; + } +}; + +ostream& operator<<(ostream& out, const osd_rwxa_t& p); + +struct OSDCapSpec { + osd_rwxa_t allow; + std::string class_name; + std::string method_name; + + OSDCapSpec() : allow(0) {} + explicit OSDCapSpec(osd_rwxa_t v) : allow(v) {} + OSDCapSpec(std::string class_name, std::string method_name) + : allow(0), class_name(std::move(class_name)), + method_name(std::move(method_name)) {} + + bool allow_all() const { + return allow == OSD_CAP_ANY; + } +}; + +ostream& operator<<(ostream& out, const OSDCapSpec& s); + +struct OSDCapPoolNamespace { + std::string pool_name; + boost::optional nspace = boost::none; + + OSDCapPoolNamespace() { + } + OSDCapPoolNamespace(const std::string& pool_name, + const boost::optional& nspace = boost::none) + : pool_name(pool_name), nspace(nspace) { + } + + bool is_match(const std::string& pn, const std::string& ns) const; + bool is_match_all() const; +}; + +ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns); + +struct OSDCapPoolTag { + typedef std::map > app_map_t; + std::string application; + std::string key; + std::string value; + + OSDCapPoolTag () {} + OSDCapPoolTag(const std::string& application, const std::string& key, + const std::string& value) : + application(application), key(key), value(value) {} + + bool is_match(const app_map_t& app_map) const; + bool is_match_all() const; +}; +// adapt for parsing with boost::spirit::qi in OSDCapParser +BOOST_FUSION_ADAPT_STRUCT(OSDCapPoolTag, + (std::string, application) + (std::string, key) + (std::string, value)) + 
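For orientation, here is a minimal sketch of how the parser and the cap types declared in this header are typically exercised together. The cap string, the pool/namespace/object names, and the check_client_write() wrapper are illustrative assumptions, not code from this patch:

  // Illustrative only: validate a client write against a parsed capability.
  #include <iostream>
  #include <sstream>
  #include <string>
  #include "osd/OSDCap.h"

  bool check_client_write(const std::string& cap_str)
  {
    OSDCap caps;
    std::ostringstream err;
    if (!caps.parse(cap_str, &err)) {        // e.g. "allow rwx pool foo namespace bar"
      std::cerr << err.str() << std::endl;   // a failed parse leaves grants empty
      return false;
    }
    OSDCapPoolTag::app_map_t app_metadata;   // application tags on the target pool
    return caps.is_capable("foo", "bar", app_metadata, "object0",
                           false /*op_may_read*/, true /*op_may_write*/,
                           {} /*no class method calls*/, entity_addr_t());
  }

Because grants are additive, is_capable() returns true as soon as any single grant (or expanded profile grant) covers the requested pool, namespace, object prefix, and rwx bits.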
+ostream& operator<<(ostream& out, const OSDCapPoolTag& pt); + +struct OSDCapMatch { + typedef std::map > app_map_t; + OSDCapPoolNamespace pool_namespace; + OSDCapPoolTag pool_tag; + std::string object_prefix; + + OSDCapMatch() {} + explicit OSDCapMatch(const OSDCapPoolTag& pt) : pool_tag(pt) {} + explicit OSDCapMatch(const OSDCapPoolNamespace& pns) : pool_namespace(pns) {} + OSDCapMatch(const OSDCapPoolNamespace& pns, const std::string& pre) + : pool_namespace(pns), object_prefix(pre) {} + OSDCapMatch(const std::string& pl, const std::string& pre) + : pool_namespace(pl), object_prefix(pre) {} + OSDCapMatch(const std::string& pl, const std::string& ns, + const std::string& pre) + : pool_namespace(pl, ns), object_prefix(pre) {} + OSDCapMatch(const std::string& dummy, const std::string& app, + const std::string& key, const std::string& val) + : pool_tag(app, key, val) {} + OSDCapMatch(const std::string& ns, const OSDCapPoolTag& pt) + : pool_namespace("", ns), pool_tag(pt) {} + + /** + * check if given request parameters match our constraints + * + * @param pool_name pool name + * @param nspace_name namespace name + * @param object object name + * @return true if we match, false otherwise + */ + bool is_match(const std::string& pool_name, const std::string& nspace_name, + const app_map_t& app_map, + const std::string& object) const; + bool is_match_all() const; +}; + +ostream& operator<<(ostream& out, const OSDCapMatch& m); + + +struct OSDCapProfile { + std::string name; + OSDCapPoolNamespace pool_namespace; + + OSDCapProfile() { + } + OSDCapProfile(const std::string& name, + const std::string& pool_name, + const boost::optional& nspace = boost::none) + : name(name), pool_namespace(pool_name, nspace) { + } + + inline bool is_valid() const { + return !name.empty(); + } +}; + +ostream& operator<<(ostream& out, const OSDCapProfile& m); + +struct OSDCapGrant { + OSDCapMatch match; + OSDCapSpec spec; + OSDCapProfile profile; + std::string network; + entity_addr_t network_parsed; + unsigned network_prefix = 0; + bool network_valid = true; + + // explicit grants that a profile grant expands to; populated as + // needed by expand_profile() and cached here. + std::list profile_grants; + + OSDCapGrant() {} + OSDCapGrant(const OSDCapMatch& m, const OSDCapSpec& s, + boost::optional n = {}) + : match(m), spec(s) { + if (n) { + set_network(*n); + } + } + explicit OSDCapGrant(const OSDCapProfile& profile, + boost::optional n = {}) + : profile(profile) { + if (n) { + set_network(*n); + } + expand_profile(); + } + + void set_network(const std::string& n); + + bool allow_all() const; + bool is_capable(const std::string& pool_name, const std::string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const std::string& object, bool op_may_read, bool op_may_write, + const std::vector& classes, + const entity_addr_t& addr, + std::vector* class_allowed) const; + + void expand_profile(); +}; + +ostream& operator<<(ostream& out, const OSDCapGrant& g); + + +struct OSDCap { + std::vector grants; + + OSDCap() {} + explicit OSDCap(std::vector g) : grants(std::move(g)) {} + + bool allow_all() const; + void set_allow_all(); + bool parse(const std::string& str, ostream *err=NULL); + + /** + * check if we are capable of something + * + * This method actually checks a description of a particular operation against + * what the capability has specified. Currently that is just rwx with matches + * against pool, and object name prefix. 
+ * + * @param pool_name name of the pool we are accessing + * @param ns name of the namespace we are accessing + * @param object name of the object we are accessing + * @param op_may_read whether the operation may need to read + * @param op_may_write whether the operation may need to write + * @param classes (class-name, rd, wr, allowed-flag) tuples + * @return true if the operation is allowed, false otherwise + */ + bool is_capable(const std::string& pool_name, const std::string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const std::string& object, bool op_may_read, bool op_may_write, + const std::vector& classes, + const entity_addr_t& addr) const; +}; + +inline std::ostream& operator<<(std::ostream& out, const OSDCap& cap) +{ + return out << "osdcap" << cap.grants; +} + +#endif diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc new file mode 100644 index 000000000..6e5caf53a --- /dev/null +++ b/src/osd/OSDMap.cc @@ -0,0 +1,6412 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include + +#include + +#include "OSDMap.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "include/ceph_features.h" +#include "include/common_fwd.h" +#include "include/str_map.h" + +#include "common/code_environment.h" +#include "mon/health_check.h" + +#include "crush/CrushTreeDumper.h" +#include "common/Clock.h" +#include "mon/PGMap.h" + +using std::list; +using std::make_pair; +using std::map; +using std::multimap; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::stringstream; +using std::unordered_map; +using std::vector; + +using ceph::decode; +using ceph::encode; +using ceph::Formatter; + +#define dout_subsys ceph_subsys_osd + +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap); +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap); + + +// ---------------------------------- +// osd_info_t + +void osd_info_t::dump(Formatter *f) const +{ + f->dump_int("last_clean_begin", last_clean_begin); + f->dump_int("last_clean_end", last_clean_end); + f->dump_int("up_from", up_from); + f->dump_int("up_thru", up_thru); + f->dump_int("down_at", down_at); + f->dump_int("lost_at", lost_at); +} + +void osd_info_t::encode(ceph::buffer::list& bl) const +{ + using ceph::encode; + __u8 struct_v = 1; + encode(struct_v, bl); + encode(last_clean_begin, bl); + encode(last_clean_end, bl); + encode(up_from, bl); + encode(up_thru, bl); + encode(down_at, bl); + encode(lost_at, bl); +} + +void osd_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + __u8 struct_v; + decode(struct_v, bl); + decode(last_clean_begin, bl); + decode(last_clean_end, bl); + decode(up_from, bl); + decode(up_thru, bl); + decode(down_at, bl); + decode(lost_at, bl); +} + +void osd_info_t::generate_test_instances(list& o) +{ + o.push_back(new osd_info_t); + o.push_back(new osd_info_t); + o.back()->last_clean_begin = 1; + o.back()->last_clean_end = 2; + o.back()->up_from = 30; + o.back()->up_thru = 40; + 
o.back()->down_at = 5; + o.back()->lost_at = 6; +} + +ostream& operator<<(ostream& out, const osd_info_t& info) +{ + out << "up_from " << info.up_from + << " up_thru " << info.up_thru + << " down_at " << info.down_at + << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")"; + if (info.lost_at) + out << " lost_at " << info.lost_at; + return out; +} + +// ---------------------------------- +// osd_xinfo_t + +void osd_xinfo_t::dump(Formatter *f) const +{ + f->dump_stream("down_stamp") << down_stamp; + f->dump_float("laggy_probability", laggy_probability); + f->dump_int("laggy_interval", laggy_interval); + f->dump_int("features", features); + f->dump_unsigned("old_weight", old_weight); + f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub; + f->dump_int("dead_epoch", dead_epoch); +} + +void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const +{ + uint8_t v = 4; + if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) { + v = 3; + } + ENCODE_START(v, 1, bl); + encode(down_stamp, bl); + __u32 lp = laggy_probability * float(0xfffffffful); + encode(lp, bl); + encode(laggy_interval, bl); + encode(features, bl); + encode(old_weight, bl); + if (v >= 4) { + encode(last_purged_snaps_scrub, bl); + encode(dead_epoch, bl); + } + ENCODE_FINISH(bl); +} + +void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(4, bl); + decode(down_stamp, bl); + __u32 lp; + decode(lp, bl); + laggy_probability = (float)lp / (float)0xffffffff; + decode(laggy_interval, bl); + if (struct_v >= 2) + decode(features, bl); + else + features = 0; + if (struct_v >= 3) + decode(old_weight, bl); + else + old_weight = 0; + if (struct_v >= 4) { + decode(last_purged_snaps_scrub, bl); + decode(dead_epoch, bl); + } else { + dead_epoch = 0; + } + DECODE_FINISH(bl); +} + +void osd_xinfo_t::generate_test_instances(list& o) +{ + o.push_back(new osd_xinfo_t); + o.push_back(new osd_xinfo_t); + o.back()->down_stamp = utime_t(2, 3); + o.back()->laggy_probability = .123; + o.back()->laggy_interval = 123456; + o.back()->old_weight = 0x7fff; +} + +ostream& operator<<(ostream& out, const osd_xinfo_t& xi) +{ + return out << "down_stamp " << xi.down_stamp + << " laggy_probability " << xi.laggy_probability + << " laggy_interval " << xi.laggy_interval + << " old_weight " << xi.old_weight + << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub + << " dead_epoch " << xi.dead_epoch; +} + +// ---------------------------------- +// OSDMap::Incremental + +int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const +{ + int n = 0; + for (auto &weight : new_weight) { + if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first)) + n++; // marked out + else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first)) + n--; // marked in + } + return n; +} + +int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const +{ + int n = 0; + for (auto &state : new_state) { // + if (state.second & CEPH_OSD_UP) { + if (previous->is_up(state.first)) + n++; // marked down + else + n--; // marked up + } + } + return n; +} + +int OSDMap::Incremental::identify_osd(uuid_d u) const +{ + for (auto &uuid : new_uuid) + if (uuid.second == u) + return uuid.first; + return -1; +} + +int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct, + const OSDMap& osdmap) +{ + ceph_assert(epoch == osdmap.get_epoch() + 1); + + for (auto &new_pool : new_pools) { + if (!new_pool.second.tiers.empty()) { + pg_pool_t& base = 
new_pool.second; + + auto new_rem_it = new_removed_snaps.find(new_pool.first); + + for (const auto &tier_pool : base.tiers) { + const auto &r = new_pools.find(tier_pool); + pg_pool_t *tier = 0; + if (r == new_pools.end()) { + const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool); + if (!orig) { + lderr(cct) << __func__ << " no pool " << tier_pool << dendl; + return -EIO; + } + tier = get_new_pool(tier_pool, orig); + } else { + tier = &r->second; + } + if (tier->tier_of != new_pool.first) { + lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl; + return -EIO; + } + + ldout(cct, 10) << __func__ << " from " << new_pool.first << " to " + << tier_pool << dendl; + tier->snap_seq = base.snap_seq; + tier->snap_epoch = base.snap_epoch; + tier->snaps = base.snaps; + tier->removed_snaps = base.removed_snaps; + tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS| + pg_pool_t::FLAG_POOL_SNAPS); + + if (new_rem_it != new_removed_snaps.end()) { + new_removed_snaps[tier_pool] = new_rem_it->second; + } + + tier->application_metadata = base.application_metadata; + } + } + } + return 0; +} + +// ---------------------------------- +// OSDMap + +bool OSDMap::subtree_is_down(int id, set *down_cache) const +{ + if (id >= 0) + return is_down(id); + + if (down_cache && + down_cache->count(id)) { + return true; + } + + list children; + crush->get_children(id, &children); + for (const auto &child : children) { + if (!subtree_is_down(child, down_cache)) { + return false; + } + } + if (down_cache) { + down_cache->insert(id); + } + return true; +} + +bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set *down_cache) const +{ + // use a stack-local down_cache if we didn't get one from the + // caller. then at least this particular call will avoid duplicated + // work. + set local_down_cache; + if (!down_cache) { + down_cache = &local_down_cache; + } + + int current = id; + while (true) { + int type; + if (current >= 0) { + type = 0; + } else { + type = crush->get_bucket_type(current); + } + ceph_assert(type >= 0); + + if (!subtree_is_down(current, down_cache)) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl; + return false; + } + + // is this a big enough subtree to be marked as down? + if (type >= subtree_type) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... 
" << type << " >= " << subtree_type << dendl; + return true; + } + + int r = crush->get_immediate_parent_id(current, ¤t); + if (r < 0) { + return false; + } + } +} + +bool OSDMap::subtree_type_is_down( + CephContext *cct, + int id, + int subtree_type, + set *down_in_osds, + set *up_in_osds, + set *subtree_up, + unordered_map > *subtree_type_down) const +{ + if (id >= 0) { + bool is_down_ret = is_down(id); + if (!is_out(id)) { + if (is_down_ret) { + down_in_osds->insert(id); + } else { + up_in_osds->insert(id); + } + } + return is_down_ret; + } + + if (subtree_type_down && + (*subtree_type_down)[subtree_type].count(id)) { + return true; + } + + list children; + crush->get_children(id, &children); + for (const auto &child : children) { + if (!subtree_type_is_down( + cct, child, crush->get_bucket_type(child), + down_in_osds, up_in_osds, subtree_up, subtree_type_down)) { + subtree_up->insert(id); + return false; + } + } + if (subtree_type_down) { + (*subtree_type_down)[subtree_type].insert(id); + } + return true; +} + +void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const +{ + using ceph::encode; + __u16 v = 5; + encode(v, bl); + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + int32_t new_t = new_pool_max; + encode(new_t, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + // for encode(new_pools, bl); + __u32 n = new_pools.size(); + encode(n, bl); + for (const auto &new_pool : new_pools) { + n = new_pool.first; + encode(n, bl); + encode(new_pool.second, bl, 0); + } + // for encode(new_pool_names, bl); + n = new_pool_names.size(); + encode(n, bl); + + for (const auto &new_pool_name : new_pool_names) { + n = new_pool_name.first; + encode(n, bl); + encode(new_pool_name.second, bl); + } + // for encode(old_pools, bl); + n = old_pools.size(); + encode(n, bl); + for (auto &old_pool : old_pools) { + n = old_pool; + encode(n, bl); + } + encode(new_up_client, bl, 0); + { + // legacy is map + map os; + for (auto p : new_state) { + // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT) + // that an old client could not understand. + // skip those! + uint8_t s = p.second; + if (p.second != 0 && s == 0) + continue; + os[p.first] = s; + } + uint32_t n = os.size(); + encode(n, bl); + for (auto p : os) { + encode(p.first, bl); + encode(p.second, bl); + } + } + encode(new_weight, bl); + // for encode(new_pg_temp, bl); + n = new_pg_temp.size(); + encode(n, bl); + + for (const auto &pg_temp : new_pg_temp) { + old_pg_t opg = pg_temp.first.get_old_pg(); + encode(opg, bl); + encode(pg_temp.second, bl); + } +} + +void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + encode_client_old(bl); + return; + } + + // base + __u16 v = 6; + encode(v, bl); + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + encode(new_pool_max, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + encode(new_pools, bl, features); + encode(new_pool_names, bl); + encode(old_pools, bl); + encode(new_up_client, bl, features); + { + map os; + for (auto p : new_state) { + // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT) + // that an old client could not understand. + // skip those! 
+ uint8_t s = p.second; + if (p.second != 0 && s == 0) + continue; + os[p.first] = s; + } + uint32_t n = os.size(); + encode(n, bl); + for (auto p : os) { + encode(p.first, bl); + encode(p.second, bl); + } + } + encode(new_weight, bl); + encode(new_pg_temp, bl); + + // extended + __u16 ev = 10; + encode(ev, bl); + encode(new_hb_back_up, bl, features); + encode(new_up_thru, bl); + encode(new_last_clean_interval, bl); + encode(new_lost, bl); + encode(new_blocklist, bl, features); + encode(old_blocklist, bl, features); + encode(new_up_cluster, bl, features); + encode(cluster_snapshot, bl); + encode(new_uuid, bl); + encode(new_xinfo, bl, features); + encode(new_hb_front_up, bl, features); +} + +template +static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f) +{ + uint32_t n = m.size(); + encode(n, bl); + for (auto& i : m) { + encode(i.first, bl); + encode(i.second.legacy_addr(), bl, f); + } +} + +template +static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f) +{ + uint32_t n = m.size(); + encode(n, bl); + for (auto& i : m) { + if (i) { + encode(i->legacy_addr(), bl, f); + } else { + encode(entity_addr_t(), bl, f); + } + } +} + +/* for a description of osdmap incremental versions, and when they were + * introduced, please refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) { + encode_classic(bl, features); + return; + } + + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. + ceph_assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + size_t crc_offset; + std::optional crc_filler; + + // meta-encoding: how we include client-used and osd-specific data + ENCODE_START(8, 7, bl); + + { + uint8_t v = 8; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 3; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 5; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 6; + } + ENCODE_START(v, 1, bl); // client-usable data + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + encode(new_pool_max, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + encode(new_pools, bl, features); + encode(new_pool_names, bl); + encode(old_pools, bl); + if (v >= 7) { + encode(new_up_client, bl, features); + } else { + encode_addrvec_map_as_addr(new_up_client, bl, features); + } + if (v >= 5) { + encode(new_state, bl); + } else { + map os; + for (auto p : new_state) { + // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT) + // that an old client could not understand. + // skip those! 
+ uint8_t s = p.second; + if (p.second != 0 && s == 0) + continue; + os[p.first] = s; + } + uint32_t n = os.size(); + encode(n, bl); + for (auto p : os) { + encode(p.first, bl); + encode(p.second, bl); + } + } + encode(new_weight, bl); + encode(new_pg_temp, bl); + encode(new_primary_temp, bl); + encode(new_primary_affinity, bl); + encode(new_erasure_code_profiles, bl); + encode(old_erasure_code_profiles, bl); + if (v >= 4) { + encode(new_pg_upmap, bl); + encode(old_pg_upmap, bl); + encode(new_pg_upmap_items, bl); + encode(old_pg_upmap_items, bl); + } + if (v >= 6) { + encode(new_removed_snaps, bl); + encode(new_purged_snaps, bl); + } + if (v >= 8) { + encode(new_last_up_change, bl); + encode(new_last_in_change, bl); + } + ENCODE_FINISH(bl); // client-usable data + } + + { + uint8_t target_v = 9; // if bumping this, be aware of range_blocklist 11 + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + target_v = 2; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + target_v = 6; + } + if (change_stretch_mode) { + target_v = std::max((uint8_t)10, target_v); + } + if (!new_range_blocklist.empty() || + !old_range_blocklist.empty()) { + target_v = std::max((uint8_t)11, target_v); + } + ENCODE_START(target_v, 1, bl); // extended, osd-only data + if (target_v < 7) { + encode_addrvec_map_as_addr(new_hb_back_up, bl, features); + } else { + encode(new_hb_back_up, bl, features); + } + encode(new_up_thru, bl); + encode(new_last_clean_interval, bl); + encode(new_lost, bl); + encode(new_blocklist, bl, features); + encode(old_blocklist, bl, features); + if (target_v < 7) { + encode_addrvec_map_as_addr(new_up_cluster, bl, features); + } else { + encode(new_up_cluster, bl, features); + } + encode(cluster_snapshot, bl); + encode(new_uuid, bl); + encode(new_xinfo, bl, features); + if (target_v < 7) { + encode_addrvec_map_as_addr(new_hb_front_up, bl, features); + } else { + encode(new_hb_front_up, bl, features); + } + encode(features, bl); // NOTE: features arg, not the member + if (target_v >= 3) { + encode(new_nearfull_ratio, bl); + encode(new_full_ratio, bl); + encode(new_backfillfull_ratio, bl); + } + // 5 was string-based new_require_min_compat_client + if (target_v >= 6) { + encode(new_require_min_compat_client, bl); + encode(new_require_osd_release, bl); + } + if (target_v >= 8) { + encode(new_crush_node_flags, bl); + } + if (target_v >= 9) { + encode(new_device_class_flags, bl); + } + if (target_v >= 10) { + encode(change_stretch_mode, bl); + encode(new_stretch_bucket_count, bl); + encode(new_degraded_stretch_mode, bl); + encode(new_recovering_stretch_mode, bl); + encode(new_stretch_mode_bucket, bl); + encode(stretch_mode_enabled, bl); + } + if (target_v >= 11) { + encode(new_range_blocklist, bl, features); + encode(old_range_blocklist, bl, features); + } + ENCODE_FINISH(bl); // osd-only data + } + + crc_offset = bl.length(); + crc_filler = bl.append_hole(sizeof(uint32_t)); + tail_offset = bl.length(); + + encode(full_crc, bl); + + ENCODE_FINISH(bl); // meta-encoding wrapper + + // fill in crc + ceph::buffer::list front; + front.substr_of(bl, start_offset, crc_offset - start_offset); + inc_crc = front.crc32c(-1); + ceph::buffer::list tail; + tail.substr_of(bl, tail_offset, bl.length() - tail_offset); + inc_crc = tail.crc32c(inc_crc); + ceph_le32 crc_le; + crc_le = inc_crc; + crc_filler->copy_in(4u, (char*)&crc_le); + have_crc = true; +} + +void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p) +{ + using ceph::decode; + __u32 n, t; + // base + __u16 v; + decode(v, p); + 
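Looking back at Incremental::encode() above: the inc_crc cannot be computed
until the buffer is complete, so a 4-byte hole is reserved with append_hole(),
the crc32c is chained over everything before and after that hole, and the
little-endian result is finally patched into place with copy_in().  A toy,
self-contained version of the same reserve-then-patch pattern (std containers
and a stand-in checksum; names invented for illustration):

    #include <cstdint>
    #include <cstring>
    #include <numeric>
    #include <vector>

    // Stand-in for bufferlist::crc32c(): any seedable checksum shows the idea.
    static uint32_t toy_sum(const uint8_t* p, size_t n, uint32_t seed)
    {
      return std::accumulate(p, p + n, seed);
    }

    static void toy_encode_with_crc(std::vector<uint8_t>& buf,
                                    const std::vector<uint8_t>& body,
                                    const std::vector<uint8_t>& trailer)
    {
      size_t start = buf.size();
      buf.insert(buf.end(), body.begin(), body.end());
      size_t hole = buf.size();
      buf.resize(buf.size() + sizeof(uint32_t));          // reserve the crc hole
      size_t tail = buf.size();
      buf.insert(buf.end(), trailer.begin(), trailer.end());
      uint32_t crc = toy_sum(buf.data() + start, hole - start, ~0u);
      crc = toy_sum(buf.data() + tail, buf.size() - tail, crc);  // skip the hole
      std::memcpy(buf.data() + hole, &crc, sizeof(crc));  // patch it in place
    }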
decode(fsid, p); + decode(epoch, p); + decode(modified, p); + if (v == 4 || v == 5) { + decode(n, p); + new_pool_max = n; + } else if (v >= 6) + decode(new_pool_max, p); + decode(new_flags, p); + decode(fullmap, p); + decode(crush, p); + + decode(new_max_osd, p); + if (v < 6) { + new_pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(new_pools[t], p); + } + } else { + decode(new_pools, p); + } + if (v == 5) { + new_pool_names.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(new_pool_names[t], p); + } + } else if (v >= 6) { + decode(new_pool_names, p); + } + if (v < 6) { + old_pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + old_pools.insert(t); + } + } else { + decode(old_pools, p); + } + decode(new_up_client, p); + { + map ns; + decode(ns, p); + for (auto q : ns) { + new_state[q.first] = q.second; + } + } + decode(new_weight, p); + + if (v < 6) { + new_pg_temp.clear(); + decode(n, p); + while (n--) { + old_pg_t opg; + ceph::decode_raw(opg, p); + decode(new_pg_temp[pg_t(opg)], p); + } + } else { + decode(new_pg_temp, p); + } + + // decode short map, too. + if (v == 5 && p.end()) + return; + + // extended + __u16 ev = 0; + if (v >= 5) + decode(ev, p); + decode(new_hb_back_up, p); + if (v < 5) + decode(new_pool_names, p); + decode(new_up_thru, p); + decode(new_last_clean_interval, p); + decode(new_lost, p); + decode(new_blocklist, p); + decode(old_blocklist, p); + if (ev >= 6) + decode(new_up_cluster, p); + if (ev >= 7) + decode(cluster_snapshot, p); + if (ev >= 8) + decode(new_uuid, p); + if (ev >= 9) + decode(new_xinfo, p); + if (ev >= 10) + decode(new_hb_front_up, p); +} + +/* for a description of osdmap incremental versions, and when they were + * introduced, please refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + /** + * Older encodings of the Incremental had a single struct_v which + * covered the whole encoding, and was prior to our modern + * stuff which includes a compatv and a size. So if we see + * a struct_v < 7, we must rewind to the beginning and use our + * classic decoder. 
+ */ + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + ceph::buffer::list crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper + if (struct_v < 7) { + bl.seek(start_offset); + decode_classic(bl); + encode_features = 0; + if (struct_v >= 6) + encode_features = CEPH_FEATURE_PGID64; + else + encode_features = 0; + return; + } + { + DECODE_START(8, bl); // client-usable data + decode(fsid, bl); + decode(epoch, bl); + decode(modified, bl); + decode(new_pool_max, bl); + decode(new_flags, bl); + decode(fullmap, bl); + decode(crush, bl); + + decode(new_max_osd, bl); + decode(new_pools, bl); + decode(new_pool_names, bl); + decode(old_pools, bl); + decode(new_up_client, bl); + if (struct_v >= 5) { + decode(new_state, bl); + } else { + map ns; + decode(ns, bl); + for (auto q : ns) { + new_state[q.first] = q.second; + } + } + decode(new_weight, bl); + decode(new_pg_temp, bl); + decode(new_primary_temp, bl); + if (struct_v >= 2) + decode(new_primary_affinity, bl); + else + new_primary_affinity.clear(); + if (struct_v >= 3) { + decode(new_erasure_code_profiles, bl); + decode(old_erasure_code_profiles, bl); + } else { + new_erasure_code_profiles.clear(); + old_erasure_code_profiles.clear(); + } + if (struct_v >= 4) { + decode(new_pg_upmap, bl); + decode(old_pg_upmap, bl); + decode(new_pg_upmap_items, bl); + decode(old_pg_upmap_items, bl); + } + if (struct_v >= 6) { + decode(new_removed_snaps, bl); + decode(new_purged_snaps, bl); + } + if (struct_v >= 8) { + decode(new_last_up_change, bl); + decode(new_last_in_change, bl); + } + DECODE_FINISH(bl); // client-usable data + } + + { + DECODE_START(10, bl); // extended, osd-only data + decode(new_hb_back_up, bl); + decode(new_up_thru, bl); + decode(new_last_clean_interval, bl); + decode(new_lost, bl); + decode(new_blocklist, bl); + decode(old_blocklist, bl); + decode(new_up_cluster, bl); + decode(cluster_snapshot, bl); + decode(new_uuid, bl); + decode(new_xinfo, bl); + decode(new_hb_front_up, bl); + if (struct_v >= 2) + decode(encode_features, bl); + else + encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC; + if (struct_v >= 3) { + decode(new_nearfull_ratio, bl); + decode(new_full_ratio, bl); + } else { + new_nearfull_ratio = -1; + new_full_ratio = -1; + } + if (struct_v >= 4) { + decode(new_backfillfull_ratio, bl); + } else { + new_backfillfull_ratio = -1; + } + if (struct_v == 5) { + string r; + decode(r, bl); + if (r.length()) { + new_require_min_compat_client = ceph_release_from_name(r); + } + } + if (struct_v >= 6) { + decode(new_require_min_compat_client, bl); + decode(new_require_osd_release, bl); + } else { + if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) { + // only for compat with post-kraken pre-luminous test clusters + new_require_osd_release = ceph_release_t::luminous; + new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) { + new_require_osd_release = ceph_release_t::kraken; + } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) { + new_require_osd_release = ceph_release_t::jewel; + } else { + new_require_osd_release = ceph_release_t::unknown; + } + } + if (struct_v >= 8) { + decode(new_crush_node_flags, bl); + } + if (struct_v >= 9) { + decode(new_device_class_flags, bl); + } + if (struct_v >= 10) { + decode(change_stretch_mode, bl); + decode(new_stretch_bucket_count, bl); + decode(new_degraded_stretch_mode, bl); + decode(new_recovering_stretch_mode, bl); + 
decode(new_stretch_mode_bucket, bl); + decode(stretch_mode_enabled, bl); + } + if (struct_v >= 11) { + decode(new_range_blocklist, bl); + decode(old_range_blocklist, bl); + } + DECODE_FINISH(bl); // osd-only data + } + + if (struct_v >= 8) { + have_crc = true; + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + decode(inc_crc, bl); + tail_offset = bl.get_off(); + decode(full_crc, bl); + } else { + have_crc = false; + full_crc = 0; + inc_crc = 0; + } + + DECODE_FINISH(bl); // wrapper + + if (have_crc) { + // verify crc + uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + ceph::buffer::list tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (inc_crc != actual) { + ostringstream ss; + ss << "bad crc, actual " << actual << " != expected " << inc_crc; + string s = ss.str(); + throw ceph::buffer::malformed_input(s.c_str()); + } + } +} + +void OSDMap::Incremental::dump(Formatter *f) const +{ + f->dump_int("epoch", epoch); + f->dump_stream("fsid") << fsid; + f->dump_stream("modified") << modified; + f->dump_stream("new_last_up_change") << new_last_up_change; + f->dump_stream("new_last_in_change") << new_last_in_change; + f->dump_int("new_pool_max", new_pool_max); + f->dump_int("new_flags", new_flags); + f->dump_float("new_full_ratio", new_full_ratio); + f->dump_float("new_nearfull_ratio", new_nearfull_ratio); + f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio); + f->dump_int("new_require_min_compat_client", to_integer(new_require_min_compat_client)); + f->dump_int("new_require_osd_release", to_integer(new_require_osd_release)); + + if (fullmap.length()) { + f->open_object_section("full_map"); + OSDMap full; + ceph::buffer::list fbl = fullmap; // kludge around constness. + auto p = fbl.cbegin(); + full.decode(p); + full.dump(f); + f->close_section(); + } + if (crush.length()) { + f->open_object_section("crush"); + CrushWrapper c; + ceph::buffer::list tbl = crush; // kludge around constness. 
+ auto p = tbl.cbegin(); + c.decode(p); + c.dump(f); + f->close_section(); + } + + f->dump_int("new_max_osd", new_max_osd); + + f->open_array_section("new_pools"); + + for (const auto &new_pool : new_pools) { + f->open_object_section("pool"); + f->dump_int("pool", new_pool.first); + new_pool.second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_pool_names"); + + for (const auto &new_pool_name : new_pool_names) { + f->open_object_section("pool_name"); + f->dump_int("pool", new_pool_name.first); + f->dump_string("name", new_pool_name.second); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pools"); + + for (const auto &old_pool : old_pools) + f->dump_int("pool", old_pool); + f->close_section(); + + f->open_array_section("new_up_osds"); + + for (const auto &upclient : new_up_client) { + f->open_object_section("osd"); + f->dump_int("osd", upclient.first); + f->dump_stream("public_addr") << upclient.second.legacy_addr(); + f->dump_object("public_addrs", upclient.second); + if (auto p = new_up_cluster.find(upclient.first); + p != new_up_cluster.end()) { + f->dump_stream("cluster_addr") << p->second.legacy_addr(); + f->dump_object("cluster_addrs", p->second); + } + if (auto p = new_hb_back_up.find(upclient.first); + p != new_hb_back_up.end()) { + f->dump_object("heartbeat_back_addrs", p->second); + } + if (auto p = new_hb_front_up.find(upclient.first); + p != new_hb_front_up.end()) { + f->dump_object("heartbeat_front_addrs", p->second); + } + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_weight"); + + for (const auto &weight : new_weight) { + f->open_object_section("osd"); + f->dump_int("osd", weight.first); + f->dump_int("weight", weight.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_state_xor"); + for (const auto &ns : new_state) { + f->open_object_section("osd"); + f->dump_int("osd", ns.first); + set st; + calc_state_set(new_state.find(ns.first)->second, st); + f->open_array_section("state_xor"); + for (auto &state : st) + f->dump_string("state", state); + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_pg_temp"); + + for (const auto &pg_temp : new_pg_temp) { + f->open_object_section("pg"); + f->dump_stream("pgid") << pg_temp.first; + f->open_array_section("osds"); + + for (const auto &osd : pg_temp.second) + f->dump_int("osd", osd); + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("primary_temp"); + + for (const auto &primary_temp : new_primary_temp) { + f->dump_stream("pgid") << primary_temp.first; + f->dump_int("osd", primary_temp.second); + } + f->close_section(); // primary_temp + + f->open_array_section("new_pg_upmap"); + for (auto& i : new_pg_upmap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("osds"); + for (auto osd : i.second) { + f->dump_int("osd", osd); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_upmap"); + for (auto& i : old_pg_upmap) { + f->dump_stream("pgid") << i; + } + f->close_section(); + + f->open_array_section("new_pg_upmap_items"); + for (auto& i : new_pg_upmap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("mappings"); + for (auto& p : i.second) { + f->open_object_section("mapping"); + f->dump_int("from", p.first); + f->dump_int("to", p.second); + 
f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_upmap_items"); + for (auto& i : old_pg_upmap_items) { + f->dump_stream("pgid") << i; + } + f->close_section(); + + f->open_array_section("new_up_thru"); + + for (const auto &up_thru : new_up_thru) { + f->open_object_section("osd"); + f->dump_int("osd", up_thru.first); + f->dump_int("up_thru", up_thru.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_lost"); + + for (const auto &lost : new_lost) { + f->open_object_section("osd"); + f->dump_int("osd", lost.first); + f->dump_int("epoch_lost", lost.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_last_clean_interval"); + + for (const auto &last_clean_interval : new_last_clean_interval) { + f->open_object_section("osd"); + f->dump_int("osd", last_clean_interval.first); + f->dump_int("first", last_clean_interval.second.first); + f->dump_int("last", last_clean_interval.second.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_blocklist"); + for (const auto &blist : new_blocklist) { + stringstream ss; + ss << blist.first; + f->dump_stream(ss.str().c_str()) << blist.second; + } + f->close_section(); + f->open_array_section("old_blocklist"); + for (const auto &blist : old_blocklist) + f->dump_stream("addr") << blist; + f->close_section(); + f->open_array_section("new_range_blocklist"); + for (const auto &blist : new_range_blocklist) { + stringstream ss; + ss << blist.first; + f->dump_stream(ss.str().c_str()) << blist.second; + } + f->close_section(); + f->open_array_section("old_range_blocklist"); + for (const auto &blist : old_range_blocklist) + f->dump_stream("addr") << blist; + f->close_section(); + + f->open_array_section("new_xinfo"); + for (const auto &xinfo : new_xinfo) { + f->open_object_section("xinfo"); + f->dump_int("osd", xinfo.first); + xinfo.second.dump(f); + f->close_section(); + } + f->close_section(); + + if (cluster_snapshot.size()) + f->dump_string("cluster_snapshot", cluster_snapshot); + + f->open_array_section("new_uuid"); + for (const auto &uuid : new_uuid) { + f->open_object_section("osd"); + f->dump_int("osd", uuid.first); + f->dump_stream("uuid") << uuid.second; + f->close_section(); + } + f->close_section(); + + OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f); + f->open_array_section("old_erasure_code_profiles"); + for (const auto &erasure_code_profile : old_erasure_code_profiles) { + f->dump_string("old", erasure_code_profile); + } + f->close_section(); + + f->open_array_section("new_removed_snaps"); + for (auto& p : new_removed_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_purged_snaps"); + for (auto& p : new_purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + 
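Every dump() block in this file follows the same Formatter discipline: each
open_object_section()/open_array_section() is balanced by a close_section(),
and scalars go through dump_int()/dump_string()/dump_stream().  A tiny
self-contained usage sketch (illustrative only; it builds a formatter with
Formatter::create(), a factory used elsewhere in the tree, plus the calls
already used above):

    #include <iostream>
    #include <memory>
    #include "common/Formatter.h"

    void formatter_nesting_example(std::ostream& out)
    {
      std::unique_ptr<ceph::Formatter> f(ceph::Formatter::create("json-pretty"));
      f->open_object_section("example");     // {
      f->open_array_section("osds");         //   "osds": [
      for (int osd = 0; osd < 2; ++osd) {
        f->open_object_section("osd");       //     {
        f->dump_int("osd", osd);             //       "osd": 0,
        f->dump_string("state", "up");       //       "state": "up"
        f->close_section();                  //     },
      }
      f->close_section();                    //   ]
      f->close_section();                    // }
      f->flush(out);
    }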
f->open_array_section("new_crush_node_flags"); + for (auto& i : new_crush_node_flags) { + f->open_object_section("node"); + f->dump_int("id", i.first); + set st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_array_section("new_device_class_flags"); + for (auto& i : new_device_class_flags) { + f->open_object_section("device_class"); + f->dump_int("id", i.first); + set st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_object_section("stretch_mode"); + { + f->dump_bool("change_stretch_mode", change_stretch_mode); + f->dump_bool("stretch_mode_enabled", stretch_mode_enabled); + f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count); + f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode); + f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode); + f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket); + } + f->close_section(); + f->close_section(); +} + +void OSDMap::Incremental::generate_test_instances(list& o) +{ + o.push_back(new Incremental); +} + +// ---------------------------------- +// OSDMap + +void OSDMap::set_epoch(epoch_t e) +{ + epoch = e; + for (auto &pool : pools) + pool.second.last_change = e; +} + +OSDMap::range_bits::range_bits() : ipv6(false) { + memset(&bits, 0, sizeof(bits)); +} + +OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) { + memset(&bits, 0, sizeof(bits)); + parse(addr); +} + +void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr, + uint64_t *upper, uint64_t *lower) +{ + *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 | + ((uint64_t)(ntohl(*(uint32_t*)(&addr[4])))); + *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 | + ((uint64_t)(ntohl(*(uint32_t*)(&addr[12])))); +} + +void OSDMap::range_bits::parse(const entity_addr_t& addr) { + // parse it into meaningful data + if (addr.is_ipv6()) { + get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, + &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits); + int32_t lower_shift = std::min(128- + static_cast(addr.get_nonce()), 64); + int32_t upper_shift = std::max(64- //(128-b.first.get_nonce())-64 + static_cast(addr.get_nonce()), 0); + + auto get_mask = [](int32_t shift) -> uint64_t { + if (shift >= 0 && shift < 64) { + return UINT64_MAX << shift; + } + return 0; + }; + + bits.ipv6.lower_mask = get_mask(lower_shift); + bits.ipv6.upper_mask = get_mask(upper_shift); + ipv6 = true; + } else if (addr.is_ipv4()) { + bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr); + if (addr.get_nonce() > 0) { + bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce()); + } else { + bits.ipv4.mask = 0; + } + } else { + // uh... 
+ } +} + +bool OSDMap::range_bits::matches(const entity_addr_t& addr) const { + if (addr.is_ipv4() && !ipv6) { + return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) == + (bits.ipv4.ip_32_bits & bits.ipv4.mask)); + } else if (addr.is_ipv6() && ipv6) { + uint64_t upper_64, lower_64; + get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64); + return (((upper_64 & bits.ipv6.upper_mask) == + (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) && + ((lower_64 & bits.ipv6.lower_mask) == + (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask))); + } + return false; +} + +bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const +{ + if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl; + if (blocklist.empty() && range_blocklist.empty()) { + if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl; + return false; + } + + // all blocklist entries are type ANY for nautilus+ + // FIXME: avoid this copy! + entity_addr_t a = orig; + if (require_osd_release < ceph_release_t::nautilus) { + a.set_type(entity_addr_t::TYPE_LEGACY); + } else { + a.set_type(entity_addr_t::TYPE_ANY); + } + + // this specific instance? + if (blocklist.count(a)) { + if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl; + return true; + } + + // is entire ip blocklisted? + if (a.is_ip()) { + a.set_port(0); + a.set_nonce(0); + if (blocklist.count(a)) { + if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl; + return true; + } + } + + // is it in a blocklisted range? + for (const auto& i : calculated_ranges) { + bool blocked = i.second.matches(a); + if (blocked) { + if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl; + return true; + } + } + + if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl; + return false; +} + +bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const +{ + if (blocklist.empty() && range_blocklist.empty()) + return false; + + for (auto& a : av.v) { + if (is_blocklisted(a, cct)) { + return true; + } + } + + return false; +} + +void OSDMap::get_blocklist(list > *bl, + std::list > *rl) const +{ + std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl)); + std::copy(range_blocklist.begin(), range_blocklist.end(), + std::back_inserter(*rl)); +} + +void OSDMap::get_blocklist(std::set *bl, + std::set *rl) const +{ + for (const auto &i : blocklist) { + bl->insert(i.first); + } + for (const auto &i : range_blocklist) { + rl->insert(i.first); + } +} + +void OSDMap::set_max_osd(int m) +{ + max_osd = m; + osd_state.resize(max_osd, 0); + osd_weight.resize(max_osd, CEPH_OSD_OUT); + osd_info.resize(max_osd); + osd_xinfo.resize(max_osd); + osd_addrs->client_addrs.resize(max_osd); + osd_addrs->cluster_addrs.resize(max_osd); + osd_addrs->hb_back_addrs.resize(max_osd); + osd_addrs->hb_front_addrs.resize(max_osd); + osd_uuid->resize(max_osd); + if (osd_primary_affinity) + osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + + calc_num_osds(); +} + +int OSDMap::calc_num_osds() +{ + num_osd = 0; + num_up_osd = 0; + num_in_osd = 0; + for (int i=0; i *full, + set *backfillfull, + set *nearfull) const +{ + ceph_assert(full); + ceph_assert(backfillfull); + ceph_assert(nearfull); + full->clear(); + backfillfull->clear(); + nearfull->clear(); + + vector full_osds; + vector backfillfull_osds; + vector nearfull_osds; + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + if (osd_state[i] & CEPH_OSD_FULL) + full_osds.push_back(i); + else if 
(osd_state[i] & CEPH_OSD_BACKFILLFULL) + backfillfull_osds.push_back(i); + else if (osd_state[i] & CEPH_OSD_NEARFULL) + nearfull_osds.push_back(i); + } + } + + for (auto i: full_osds) { + get_pool_ids_by_osd(cct, i, full); + } + for (auto i: backfillfull_osds) { + get_pool_ids_by_osd(cct, i, backfillfull); + } + for (auto i: nearfull_osds) { + get_pool_ids_by_osd(cct, i, nearfull); + } +} + +void OSDMap::get_full_osd_counts(set *full, set *backfill, + set *nearfull) const +{ + full->clear(); + backfill->clear(); + nearfull->clear(); + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + if (osd_state[i] & CEPH_OSD_FULL) + full->emplace(i); + else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) + backfill->emplace(i); + else if (osd_state[i] & CEPH_OSD_NEARFULL) + nearfull->emplace(i); + } + } +} + +void OSDMap::get_all_osds(set& ls) const +{ + for (int i=0; i& ls) const +{ + for (int i = 0; i < max_osd; i++) { + if (is_up(i)) + ls.insert(i); + } +} + +void OSDMap::get_out_existing_osds(set& ls) const +{ + for (int i = 0; i < max_osd; i++) { + if (exists(i) && get_weight(i) == CEPH_OSD_OUT) + ls.insert(i); + } +} + +void OSDMap::get_flag_set(set *flagset) const +{ + for (unsigned i = 0; i < sizeof(flags) * 8; ++i) { + if (flags & (1<insert(get_flag_string(flags & (1<& st) +{ + unsigned t = state; + for (unsigned s = 1; t; s <<= 1) { + if (t & s) { + t &= ~s; + st.insert(ceph_osd_state_name(s)); + } + } +} + +void OSDMap::adjust_osd_weights(const map& weights, Incremental& inc) const +{ + float max = 0; + for (const auto &weight : weights) { + if (weight.second > max) + max = weight.second; + } + + for (const auto &weight : weights) { + inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN); + } +} + +int OSDMap::identify_osd(const entity_addr_t& addr) const +{ + for (int i=0; ihas_nondefault_tunables()) + features |= CEPH_FEATURE_CRUSH_TUNABLES; + if (crush->has_nondefault_tunables2()) + features |= CEPH_FEATURE_CRUSH_TUNABLES2; + if (crush->has_nondefault_tunables3()) + features |= CEPH_FEATURE_CRUSH_TUNABLES3; + if (crush->has_v4_buckets()) + features |= CEPH_FEATURE_CRUSH_V4; + if (crush->has_nondefault_tunables5()) + features |= CEPH_FEATURE_CRUSH_TUNABLES5; + if (crush->has_incompat_choose_args()) { + features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS; + } + mask |= CEPH_FEATURES_CRUSH; + + if (!pg_upmap.empty() || !pg_upmap_items.empty()) + features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP; + mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP; + + for (auto &pool: pools) { + if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) { + features |= CEPH_FEATURE_OSDHASHPSPOOL; + } + if (!pool.second.tiers.empty() || + pool.second.is_tier()) { + features |= CEPH_FEATURE_OSD_CACHEPOOL; + } + int ruleid = crush->find_rule(pool.second.get_crush_rule(), + pool.second.get_type(), + pool.second.get_size()); + if (ruleid >= 0) { + if (crush->is_v2_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_V2; + if (crush->is_v3_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_TUNABLES3; + if (crush->is_v5_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_TUNABLES5; + } + } + mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL; + + if (osd_primary_affinity) { + for (int i = 0; i < max_osd; ++i) { + if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY; + break; + } + } + } + mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY; + + if (entity_type == CEPH_ENTITY_TYPE_OSD) { + const uint64_t jewel_features = 
CEPH_FEATURE_SERVER_JEWEL; + if (require_osd_release >= ceph_release_t::jewel) { + features |= jewel_features; + } + mask |= jewel_features; + + const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN + | CEPH_FEATURE_MSG_ADDR2; + if (require_osd_release >= ceph_release_t::kraken) { + features |= kraken_features; + } + mask |= kraken_features; + + if (stretch_mode_enabled) { + features |= CEPH_FEATUREMASK_STRETCH_MODE; + mask |= CEPH_FEATUREMASK_STRETCH_MODE; + } + } + + if (require_min_compat_client >= ceph_release_t::nautilus) { + // if min_compat_client is >= nautilus, require v2 cephx signatures + // from everyone + features |= CEPH_FEATUREMASK_CEPHX_V2; + } else if (require_osd_release >= ceph_release_t::nautilus && + entity_type == CEPH_ENTITY_TYPE_OSD) { + // if osds are >= nautilus, at least require the signatures from them + features |= CEPH_FEATUREMASK_CEPHX_V2; + } + mask |= CEPH_FEATUREMASK_CEPHX_V2; + + if (pmask) + *pmask = mask; + return features; +} + +ceph_release_t OSDMap::get_min_compat_client() const +{ + uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr); + + if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43 + HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28 + return ceph_release_t::luminous; // v12.2.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737 + return ceph_release_t::jewel; // v10.2.0 + } + if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56 + return ceph_release_t::hammer; // v0.94.0 + } + if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624 + HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d + HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5 + return ceph_release_t::firefly; // v0.80.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff + HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f + return ceph_release_t::dumpling; // v0.67.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af + return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af + } + return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af +} + +ceph_release_t OSDMap::get_require_min_compat_client() const +{ + return require_min_compat_client; +} + +void OSDMap::_calc_up_osd_features() +{ + bool first = true; + cached_up_osd_features = 0; + for (int osd = 0; osd < max_osd; ++osd) { + if (!is_up(osd)) + continue; + const osd_xinfo_t &xi = get_xinfo(osd); + if (xi.features == 0) + continue; // bogus xinfo, maybe #20751 or similar, skipping + if (first) { + cached_up_osd_features = xi.features; + first = false; + } else { + cached_up_osd_features &= xi.features; + } + } +} + +uint64_t OSDMap::get_up_osd_features() const +{ + return cached_up_osd_features; +} + +void OSDMap::dedup(const OSDMap *o, OSDMap *n) +{ + using ceph::encode; + if (o->epoch == n->epoch) + return; + + int diff = 0; + + // do addrs match? 
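dedup() keeps memory bounded across consecutive epochs by sharing whatever did
not change: each per-OSD address entry, and ultimately the whole osd_addrs
container, the crush map, pg_temp, primary_temp and the uuid vector, is
repointed at the previous map's object whenever the contents compare equal.
The underlying pattern, as a minimal self-contained sketch (template and names
invented for illustration):

    #include <algorithm>
    #include <memory>
    #include <vector>

    // Make the new snapshot share any element that is identical in the old one.
    template <typename T>
    void share_equal_entries(const std::vector<std::shared_ptr<T>>& oldv,
                             std::vector<std::shared_ptr<T>>& newv)
    {
      const size_t n = std::min(oldv.size(), newv.size());
      for (size_t i = 0; i < n; ++i) {
        if (oldv[i] && newv[i] && *oldv[i] == *newv[i])
          newv[i] = oldv[i];       // equal content -> keep a single copy
      }
    }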
+ if (o->max_osd != n->max_osd) + diff++; + for (int i = 0; i < o->max_osd && i < n->max_osd; i++) { + if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] && + *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i]) + n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i]; + else + diff++; + if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] && + *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i]) + n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i]; + else + diff++; + if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] && + *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i]) + n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i]; + else + diff++; + if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] && + *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i]) + n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i]; + else + diff++; + } + if (diff == 0) { + // zoinks, no differences at all! + n->osd_addrs = o->osd_addrs; + } + + // does crush match? + ceph::buffer::list oc, nc; + encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT); + encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT); + if (oc.contents_equal(nc)) { + n->crush = o->crush; + } + + // does pg_temp match? + if (*o->pg_temp == *n->pg_temp) + n->pg_temp = o->pg_temp; + + // does primary_temp match? + if (o->primary_temp->size() == n->primary_temp->size()) { + if (*o->primary_temp == *n->primary_temp) + n->primary_temp = o->primary_temp; + } + + // do uuids match? + if (o->osd_uuid->size() == n->osd_uuid->size() && + *o->osd_uuid == *n->osd_uuid) + n->osd_uuid = o->osd_uuid; +} + +void OSDMap::clean_temps(CephContext *cct, + const OSDMap& oldmap, + const OSDMap& nextmap, + Incremental *pending_inc) +{ + ldout(cct, 10) << __func__ << dendl; + + for (auto pg : *nextmap.pg_temp) { + // if pool does not exist, remove any existing pg_temps associated with + // it. we don't care about pg_temps on the pending_inc either; if there + // are new_pg_temp entries on the pending, clear them out just as well. + if (!nextmap.have_pg_pool(pg.first.pool())) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " for nonexistent pool " << pg.first.pool() << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + if (!nextmap.pg_exists(pg.first)) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " for nonexistent pg " << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + // all osds down? + unsigned num_up = 0; + for (auto o : pg.second) { + if (!nextmap.is_down(o)) { + ++num_up; + break; + } + } + if (num_up == 0) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " with all down osds" << pg.second << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + // redundant pg_temp? + vector raw_up; + int primary; + nextmap.pg_to_raw_up(pg.first, &raw_up, &primary); + bool remove = false; + if (raw_up == pg.second) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " " + << pg.second << " that matches raw_up mapping" << dendl; + remove = true; + } + // oversized pg_temp? 
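Taken together with the size check just below, the conditions this loop applies
to each pg_temp entry boil down to one predicate; a sketch using only OSDMap
calls already used in this function (the helper name and the plain std::vector
are for illustration):

    #include <vector>
    #include "osd/OSDMap.h"

    // Should this pg_temp mapping be dropped from the next map?
    static bool pg_temp_is_stale(const OSDMap& next, pg_t pgid,
                                 const std::vector<int>& temp_osds)
    {
      if (!next.have_pg_pool(pgid.pool()) || !next.pg_exists(pgid))
        return true;                                 // pool or pg no longer exists
      bool any_up = false;
      for (int o : temp_osds)
        any_up = any_up || !next.is_down(o);
      if (!any_up)
        return true;                                 // every listed osd is down
      std::vector<int> raw_up;
      int primary;
      next.pg_to_raw_up(pgid, &raw_up, &primary);
      if (raw_up == temp_osds)
        return true;                                 // redundant: equals the raw mapping
      return temp_osds.size() > next.get_pg_pool(pgid.pool())->get_size();
    }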
+ if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " " + << pg.second << " exceeds pool size" << dendl; + remove = true; + } + if (remove) { + if (oldmap.pg_temp->count(pg.first)) + pending_inc->new_pg_temp[pg.first].clear(); + else + pending_inc->new_pg_temp.erase(pg.first); + } + } + + for (auto &pg : *nextmap.primary_temp) { + // primary down? + if (nextmap.is_down(pg.second)) { + ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first + << " to down " << pg.second << dendl; + pending_inc->new_primary_temp[pg.first] = -1; + continue; + } + // redundant primary_temp? + vector real_up, templess_up; + int real_primary, templess_primary; + pg_t pgid = pg.first; + nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary); + nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary); + if (real_primary == templess_primary){ + ldout(cct, 10) << __func__ << " removing primary_temp " + << pgid << " -> " << real_primary + << " (unnecessary/redundant)" << dendl; + if (oldmap.primary_temp->count(pgid)) + pending_inc->new_primary_temp[pgid] = -1; + else + pending_inc->new_primary_temp.erase(pgid); + } + } +} + +void OSDMap::get_upmap_pgs(vector *upmap_pgs) const +{ + upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size()); + for (auto& p : pg_upmap) + upmap_pgs->push_back(p.first); + for (auto& p : pg_upmap_items) + upmap_pgs->push_back(p.first); +} + +bool OSDMap::check_pg_upmaps( + CephContext *cct, + const vector& to_check, + vector *to_cancel, + map>> *to_remap) const +{ + bool any_change = false; + map> rule_weight_map; + for (auto& pg : to_check) { + const pg_pool_t *pi = get_pg_pool(pg.pool()); + if (!pi || pg.ps() >= pi->get_pg_num_pending()) { + ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source" + << dendl; + to_cancel->push_back(pg); + continue; + } + if (pi->is_pending_merge(pg, nullptr)) { + ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge" + << dendl; + to_cancel->push_back(pg); + continue; + } + vector raw, up; + pg_to_raw_upmap(pg, &raw, &up); + auto crush_rule = get_pg_pool_crush_rule(pg); + auto r = crush->verify_upmap(cct, + crush_rule, + get_pg_pool_size(pg), + up); + if (r < 0) { + ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg + << " returning " << r + << dendl; + to_cancel->push_back(pg); + continue; + } + // below we check against crush-topology changing.. 
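Concretely, the weight-map check that follows requires every OSD in the
mapped-up set to (a) still appear in the weight map of the pool's crush rule
and (b) still have a non-zero effective weight, i.e. crush weight times
in/out weight.  A self-contained sketch of that test (names and the plain
std::map types are for illustration):

    #include <map>
    #include <vector>

    // True if every osd is still usable under this rule's weight map.
    static bool upmap_targets_still_valid(
        const std::vector<int>& up,
        const std::map<int, float>& rule_weight_map,  // osd -> crush weight in the rule's subtree
        const std::map<int, float>& in_out_weight)    // osd -> 0.0 (out) .. 1.0 (fully in)
    {
      for (int osd : up) {
        auto w = rule_weight_map.find(osd);
        if (w == rule_weight_map.end())
          return false;               // osd deleted or moved out of the crush subtree
        auto io = in_out_weight.find(osd);
        float effective = (io == in_out_weight.end() ? 0.0f : io->second) * w->second;
        if (effective == 0)
          return false;               // osd is out (or carries zero crush weight)
      }
      return true;
    }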
+ map weight_map; + auto it = rule_weight_map.find(crush_rule); + if (it == rule_weight_map.end()) { + auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map); + if (r < 0) { + lderr(cct) << __func__ << " unable to get crush weight_map for " + << "crush_rule " << crush_rule + << dendl; + continue; + } + rule_weight_map[crush_rule] = weight_map; + } else { + weight_map = it->second; + } + ldout(cct, 10) << __func__ << " pg " << pg + << " weight_map " << weight_map + << dendl; + for (auto osd : up) { + auto it = weight_map.find(osd); + if (it == weight_map.end()) { + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has " + << "been moved out of the specific crush-tree" + << dendl; + to_cancel->push_back(pg); + break; + } + auto adjusted_weight = get_weightf(it->first) * it->second; + if (adjusted_weight == 0) { + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd + << " is out/crush-out" + << dendl; + to_cancel->push_back(pg); + break; + } + } + if (!to_cancel->empty() && to_cancel->back() == pg) + continue; + // okay, upmap is valid + // continue to check if it is still necessary + auto i = pg_upmap.find(pg); + if (i != pg_upmap.end()) { + if (i->second == raw) { + ldout(cct, 10) << "removing redundant pg_upmap " << i->first << " " + << i->second << dendl; + to_cancel->push_back(pg); + continue; + } + if ((int)i->second.size() != get_pg_pool_size(pg)) { + ldout(cct, 10) << "removing pg_upmap " << i->first << " " + << i->second << " != pool size " << get_pg_pool_size(pg) + << dendl; + to_cancel->push_back(pg); + continue; + } + } + auto j = pg_upmap_items.find(pg); + if (j != pg_upmap_items.end()) { + mempool::osdmap::vector> newmap; + for (auto& p : j->second) { + if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) { + // cancel mapping if source osd does not exist anymore + continue; + } + if (p.second != CRUSH_ITEM_NONE && p.second < max_osd && + p.second >= 0 && osd_weight[p.second] == 0) { + // cancel mapping if target osd is out + continue; + } + newmap.push_back(p); + } + if (newmap.empty()) { + ldout(cct, 10) << " removing no-op pg_upmap_items " + << j->first << " " << j->second + << dendl; + to_cancel->push_back(pg); + } else if (newmap != j->second) { + ldout(cct, 10) << " simplifying partially no-op pg_upmap_items " + << j->first << " " << j->second + << " -> " << newmap + << dendl; + to_remap->insert({pg, newmap}); + any_change = true; + } + } + } + any_change = any_change || !to_cancel->empty(); + return any_change; +} + +void OSDMap::clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc, + const vector& to_cancel, + const map>>& to_remap) const +{ + for (auto &pg: to_cancel) { + auto i = pending_inc->new_pg_upmap.find(pg); + if (i != pending_inc->new_pg_upmap.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap entry " + << i->first << "->" << i->second + << dendl; + pending_inc->new_pg_upmap.erase(i); + } + auto j = pg_upmap.find(pg); + if (j != pg_upmap.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry " + << j->first << "->" << j->second + << dendl; + pending_inc->old_pg_upmap.insert(pg); + } + auto p = pending_inc->new_pg_upmap_items.find(pg); + if (p != pending_inc->new_pg_upmap_items.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap_items entry " + << p->first << "->" << p->second + << dendl; + pending_inc->new_pg_upmap_items.erase(p); + } + auto q = pg_upmap_items.find(pg); + if (q != pg_upmap_items.end()) { + 
ldout(cct, 10) << __func__ << " cancel invalid " + << "pg_upmap_items entry " + << q->first << "->" << q->second + << dendl; + pending_inc->old_pg_upmap_items.insert(pg); + } + } + for (auto& i : to_remap) + pending_inc->new_pg_upmap_items[i.first] = i.second; +} + +bool OSDMap::clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc) const +{ + ldout(cct, 10) << __func__ << dendl; + vector to_check; + vector to_cancel; + map>> to_remap; + + get_upmap_pgs(&to_check); + auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap); + clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap); + return any_change; +} + +int OSDMap::apply_incremental(const Incremental &inc) +{ + new_blocklist_entries = false; + if (inc.epoch == 1) + fsid = inc.fsid; + else if (inc.fsid != fsid) + return -EINVAL; + + ceph_assert(inc.epoch == epoch+1); + + epoch++; + modified = inc.modified; + + // full map? + if (inc.fullmap.length()) { + ceph::buffer::list bl(inc.fullmap); + decode(bl); + return 0; + } + + // nope, incremental. + if (inc.new_flags >= 0) { + flags = inc.new_flags; + // the below is just to cover a newly-upgraded luminous mon + // cluster that has to set require_jewel_osds or + // require_kraken_osds before the osds can be upgraded to + // luminous. + if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) { + if (require_osd_release < ceph_release_t::kraken) { + require_osd_release = ceph_release_t::kraken; + } + } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) { + if (require_osd_release < ceph_release_t::jewel) { + require_osd_release = ceph_release_t::jewel; + } + } + } + + if (inc.new_max_osd >= 0) + set_max_osd(inc.new_max_osd); + + if (inc.new_pool_max != -1) + pool_max = inc.new_pool_max; + + for (const auto &pool : inc.new_pools) { + pools[pool.first] = pool.second; + pools[pool.first].last_change = epoch; + } + + new_removed_snaps = inc.new_removed_snaps; + new_purged_snaps = inc.new_purged_snaps; + for (auto p = new_removed_snaps.begin(); + p != new_removed_snaps.end(); + ++p) { + removed_snaps_queue[p->first].union_of(p->second); + } + for (auto p = new_purged_snaps.begin(); + p != new_purged_snaps.end(); + ++p) { + auto q = removed_snaps_queue.find(p->first); + ceph_assert(q != removed_snaps_queue.end()); + q->second.subtract(p->second); + if (q->second.empty()) { + removed_snaps_queue.erase(q); + } + } + + if (inc.new_last_up_change != utime_t()) { + last_up_change = inc.new_last_up_change; + } + if (inc.new_last_in_change != utime_t()) { + last_in_change = inc.new_last_in_change; + } + + for (const auto &pname : inc.new_pool_names) { + auto pool_name_entry = pool_name.find(pname.first); + if (pool_name_entry != pool_name.end()) { + name_pool.erase(pool_name_entry->second); + pool_name_entry->second = pname.second; + } else { + pool_name[pname.first] = pname.second; + } + name_pool[pname.second] = pname.first; + } + + for (const auto &pool : inc.old_pools) { + pools.erase(pool); + name_pool.erase(pool_name[pool]); + pool_name.erase(pool); + } + + for (const auto &weight : inc.new_weight) { + set_weight(weight.first, weight.second); + + // if we are marking in, clear the AUTOOUT and NEW bits, and clear + // xinfo old_weight. 
+ if (weight.second) { + osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW); + osd_xinfo[weight.first].old_weight = 0; + } + } + + for (const auto &primary_affinity : inc.new_primary_affinity) { + set_primary_affinity(primary_affinity.first, primary_affinity.second); + } + + // erasure_code_profiles + for (const auto &profile : inc.old_erasure_code_profiles) + erasure_code_profiles.erase(profile); + + for (const auto &profile : inc.new_erasure_code_profiles) { + set_erasure_code_profile(profile.first, profile.second); + } + + // up/down + for (const auto &state : inc.new_state) { + const auto osd = state.first; + int s = state.second ? state.second : CEPH_OSD_UP; + if ((osd_state[osd] & CEPH_OSD_UP) && + (s & CEPH_OSD_UP)) { + osd_info[osd].down_at = epoch; + osd_xinfo[osd].down_stamp = modified; + } + if ((osd_state[osd] & CEPH_OSD_EXISTS) && + (s & CEPH_OSD_EXISTS)) { + // osd is destroyed; clear out anything interesting. + (*osd_uuid)[osd] = uuid_d(); + osd_info[osd] = osd_info_t(); + osd_xinfo[osd] = osd_xinfo_t(); + set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + osd_addrs->client_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t()); + osd_state[osd] = 0; + } else { + osd_state[osd] ^= s; + } + } + + for (const auto &client : inc.new_up_client) { + osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; + osd_state[client.first] &= ~CEPH_OSD_STOP; // if any + osd_addrs->client_addrs[client.first].reset( + new entity_addrvec_t(client.second)); + osd_addrs->hb_back_addrs[client.first].reset( + new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second)); + osd_addrs->hb_front_addrs[client.first].reset( + new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second)); + + osd_info[client.first].up_from = epoch; + } + + for (const auto &cluster : inc.new_up_cluster) + osd_addrs->cluster_addrs[cluster.first].reset( + new entity_addrvec_t(cluster.second)); + + // info + for (const auto &thru : inc.new_up_thru) + osd_info[thru.first].up_thru = thru.second; + + for (const auto &interval : inc.new_last_clean_interval) { + osd_info[interval.first].last_clean_begin = interval.second.first; + osd_info[interval.first].last_clean_end = interval.second.second; + } + + for (const auto &lost : inc.new_lost) + osd_info[lost.first].lost_at = lost.second; + + // xinfo + for (const auto &xinfo : inc.new_xinfo) + osd_xinfo[xinfo.first] = xinfo.second; + + // uuid + for (const auto &uuid : inc.new_uuid) + (*osd_uuid)[uuid.first] = uuid.second; + + // pg rebuild + for (const auto &pg : inc.new_pg_temp) { + if (pg.second.empty()) + pg_temp->erase(pg.first); + else + pg_temp->set(pg.first, pg.second); + } + if (!inc.new_pg_temp.empty()) { + // make sure pg_temp is efficiently stored + pg_temp->rebuild(); + } + + for (const auto &pg : inc.new_primary_temp) { + if (pg.second == -1) + primary_temp->erase(pg.first); + else + (*primary_temp)[pg.first] = pg.second; + } + + for (auto& p : inc.new_pg_upmap) { + pg_upmap[p.first] = p.second; + } + for (auto& pg : inc.old_pg_upmap) { + pg_upmap.erase(pg); + } + for (auto& p : inc.new_pg_upmap_items) { + pg_upmap_items[p.first] = p.second; + } + for (auto& pg : inc.old_pg_upmap_items) { + pg_upmap_items.erase(pg); + } + + // blocklist + if (!inc.new_blocklist.empty()) { + blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end()); + 
new_blocklist_entries = true; + } + for (const auto &addr : inc.old_blocklist) + blocklist.erase(addr); + + for (const auto& addr_p : inc.new_range_blocklist) { + range_blocklist.insert(addr_p); + calculated_ranges.emplace(addr_p.first, addr_p.first); + new_blocklist_entries = true; + } + for (const auto &addr : inc.old_range_blocklist) { + calculated_ranges.erase(addr); + range_blocklist.erase(addr); + } + + for (auto& i : inc.new_crush_node_flags) { + if (i.second) { + crush_node_flags[i.first] = i.second; + } else { + crush_node_flags.erase(i.first); + } + } + + for (auto& i : inc.new_device_class_flags) { + if (i.second) { + device_class_flags[i.first] = i.second; + } else { + device_class_flags.erase(i.first); + } + } + + // cluster snapshot? + if (inc.cluster_snapshot.length()) { + cluster_snapshot = inc.cluster_snapshot; + cluster_snapshot_epoch = inc.epoch; + } else { + cluster_snapshot.clear(); + cluster_snapshot_epoch = 0; + } + + if (inc.new_nearfull_ratio >= 0) { + nearfull_ratio = inc.new_nearfull_ratio; + } + if (inc.new_backfillfull_ratio >= 0) { + backfillfull_ratio = inc.new_backfillfull_ratio; + } + if (inc.new_full_ratio >= 0) { + full_ratio = inc.new_full_ratio; + } + if (inc.new_require_min_compat_client > ceph_release_t::unknown) { + require_min_compat_client = inc.new_require_min_compat_client; + } + if (inc.new_require_osd_release >= ceph_release_t::unknown) { + require_osd_release = inc.new_require_osd_release; + if (require_osd_release >= ceph_release_t::luminous) { + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } + } + + if (inc.new_require_osd_release >= ceph_release_t::unknown) { + require_osd_release = inc.new_require_osd_release; + if (require_osd_release >= ceph_release_t::nautilus) { + flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT; + } + } + // do new crush map last (after up/down stuff) + if (inc.crush.length()) { + ceph::buffer::list bl(inc.crush); + auto blp = bl.cbegin(); + crush.reset(new CrushWrapper); + crush->decode(blp); + if (require_osd_release >= ceph_release_t::luminous) { + // only increment if this is a luminous-encoded osdmap, lest + // the mon's crush_version diverge from what the osds or others + // are decoding and applying on their end. if we won't encode + // it in the canonical version, don't change it. 
+ ++crush_version; + } + for (auto it = device_class_flags.begin(); + it != device_class_flags.end();) { + const char* class_name = crush->get_class_name(it->first); + if (!class_name) // device class is gone + it = device_class_flags.erase(it); + else + it++; + } + } + + if (inc.change_stretch_mode) { + stretch_mode_enabled = inc.stretch_mode_enabled; + stretch_bucket_count = inc.new_stretch_bucket_count; + degraded_stretch_mode = inc.new_degraded_stretch_mode; + recovering_stretch_mode = inc.new_recovering_stretch_mode; + stretch_mode_bucket = inc.new_stretch_mode_bucket; + } + + calc_num_osds(); + _calc_up_osd_features(); + return 0; +} + +// mapping +int OSDMap::map_to_pg( + int64_t poolid, + const string& name, + const string& key, + const string& nspace, + pg_t *pg) const +{ + // calculate ps (placement seed) + const pg_pool_t *pool = get_pg_pool(poolid); + if (!pool) + return -ENOENT; + ps_t ps; + if (!key.empty()) + ps = pool->hash_key(key, nspace); + else + ps = pool->hash_key(name, nspace); + *pg = pg_t(ps, poolid); + return 0; +} + +int OSDMap::object_locator_to_pg( + const object_t& oid, const object_locator_t& loc, pg_t &pg) const +{ + if (loc.hash >= 0) { + if (!get_pg_pool(loc.get_pool())) { + return -ENOENT; + } + pg = pg_t(loc.hash, loc.get_pool()); + return 0; + } + return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg); +} + +ceph_object_layout OSDMap::make_object_layout( + object_t oid, int pg_pool, string nspace) const +{ + object_locator_t loc(pg_pool, nspace); + + ceph_object_layout ol; + pg_t pgid = object_locator_to_pg(oid, loc); + ol.ol_pgid = pgid.get_old_pg().v; + ol.ol_stripe_unit = 0; + return ol; +} + +void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool, + vector& osds) const +{ + if (pool.can_shift_osds()) { + unsigned removed = 0; + for (unsigned i = 0; i < osds.size(); i++) { + if (!exists(osds[i])) { + removed++; + continue; + } + if (removed) { + osds[i - removed] = osds[i]; + } + } + if (removed) + osds.resize(osds.size() - removed); + } else { + for (auto& osd : osds) { + if (!exists(osd)) + osd = CRUSH_ITEM_NONE; + } + } +} + +void OSDMap::_pg_to_raw_osds( + const pg_pool_t& pool, pg_t pg, + vector *osds, + ps_t *ppps) const +{ + // map to osds[] + ps_t pps = pool.raw_pg_to_pps(pg); // placement ps + unsigned size = pool.get_size(); + + // what crush rule? + int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size); + if (ruleno >= 0) + crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool()); + + _remove_nonexistent_osds(pool, *osds); + + if (ppps) + *ppps = pps; +} + +int OSDMap::_pick_primary(const vector& osds) const +{ + for (auto osd : osds) { + if (osd != CRUSH_ITEM_NONE) { + return osd; + } + } + return -1; +} + +void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector *raw) const +{ + pg_t pg = pi.raw_pg_to_pg(raw_pg); + auto p = pg_upmap.find(pg); + if (p != pg_upmap.end()) { + // make sure targets aren't marked out + for (auto osd : p->second) { + if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 && + osd_weight[osd] == 0) { + // reject/ignore the explicit mapping + return; + } + } + *raw = vector(p->second.begin(), p->second.end()); + // continue to check and apply pg_upmap_items if any + } + + auto q = pg_upmap_items.find(pg); + if (q != pg_upmap_items.end()) { + // NOTE: this approach does not allow a bidirectional swap, + // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. 
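+    // worked example (added for clarity): with items [[1,2],[2,1]] and a raw
+    // mapping of [0,1,2], the pair (1,2) is skipped because osd 2 already
+    // appears and (2,1) is skipped because osd 1 already appears, so the
+    // result stays [0,1,2]; the swapped [0,2,1] is exactly what this loop
+    // cannot produce.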
+ for (auto& r : q->second) { + // make sure the replacement value doesn't already appear + bool exists = false; + ssize_t pos = -1; + for (unsigned i = 0; i < raw->size(); ++i) { + int osd = (*raw)[i]; + if (osd == r.second) { + exists = true; + break; + } + // ignore mapping if target is marked out (or invalid osd id) + if (osd == r.first && + pos < 0 && + !(r.second != CRUSH_ITEM_NONE && r.second < max_osd && + r.second >= 0 && osd_weight[r.second] == 0)) { + pos = i; + } + } + if (!exists && pos >= 0) { + (*raw)[pos] = r.second; + } + } + } +} + +// pg -> (up osd list) +void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector& raw, + vector *up) const +{ + if (pool.can_shift_osds()) { + // shift left + up->clear(); + up->reserve(raw.size()); + for (unsigned i=0; ipush_back(raw[i]); + } + } else { + // set down/dne devices to NONE + up->resize(raw.size()); + for (int i = raw.size() - 1; i >= 0; --i) { + if (!exists(raw[i]) || is_down(raw[i])) { + (*up)[i] = CRUSH_ITEM_NONE; + } else { + (*up)[i] = raw[i]; + } + } + } +} + +void OSDMap::_apply_primary_affinity(ps_t seed, + const pg_pool_t& pool, + vector *osds, + int *primary) const +{ + // do we have any non-default primary_affinity values for these osds? + if (!osd_primary_affinity) + return; + + bool any = false; + for (const auto osd : *osds) { + if (osd != CRUSH_ITEM_NONE && + (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + any = true; + break; + } + } + if (!any) + return; + + // pick the primary. feed both the seed (for the pg) and the osd + // into the hash/rng so that a proportional fraction of an osd's pgs + // get rejected as primary. + int pos = -1; + for (unsigned i = 0; i < osds->size(); ++i) { + int o = (*osds)[i]; + if (o == CRUSH_ITEM_NONE) + continue; + unsigned a = (*osd_primary_affinity)[o]; + if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + seed, o) >> 16) >= a) { + // we chose not to use this primary. note it anyway as a + // fallback in case we don't pick anyone else, but keep looking. + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = (*osds)[pos]; + + if (pool.can_shift_osds() && pos > 0) { + // move the new primary to the front. 
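+    // for pools that can shift (replicated), the primary conventionally
+    // occupies position 0 of the up set, so move it there; erasure-coded
+    // pools keep positional shards, so only *primary was updated above.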
+ for (int i = pos; i > 0; --i) { + (*osds)[i] = (*osds)[i-1]; + } + (*osds)[0] = *primary; + } +} + +void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg, + vector *temp_pg, int *temp_primary) const +{ + pg = pool.raw_pg_to_pg(pg); + const auto p = pg_temp->find(pg); + temp_pg->clear(); + if (p != pg_temp->end()) { + for (unsigned i=0; isecond.size(); i++) { + if (!exists(p->second[i]) || is_down(p->second[i])) { + if (pool.can_shift_osds()) { + continue; + } else { + temp_pg->push_back(CRUSH_ITEM_NONE); + } + } else { + temp_pg->push_back(p->second[i]); + } + } + } + const auto &pp = primary_temp->find(pg); + *temp_primary = -1; + if (pp != primary_temp->end()) { + *temp_primary = pp->second; + } else if (!temp_pg->empty()) { // apply pg_temp's primary + for (unsigned i = 0; i < temp_pg->size(); ++i) { + if ((*temp_pg)[i] != CRUSH_ITEM_NONE) { + *temp_primary = (*temp_pg)[i]; + break; + } + } + } +} + +void OSDMap::pg_to_raw_osds(pg_t pg, vector *raw, int *primary) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) { + *primary = -1; + raw->clear(); + return; + } + _pg_to_raw_osds(*pool, pg, raw, NULL); + *primary = _pick_primary(*raw); +} + +void OSDMap::pg_to_raw_upmap(pg_t pg, vector*raw, + vector *raw_upmap) const +{ + auto pool = get_pg_pool(pg.pool()); + if (!pool) { + raw_upmap->clear(); + return; + } + _pg_to_raw_osds(*pool, pg, raw, NULL); + *raw_upmap = *raw; + _apply_upmap(*pool, pg, raw_upmap); +} + +void OSDMap::pg_to_raw_up(pg_t pg, vector *up, int *primary) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) { + *primary = -1; + up->clear(); + return; + } + vector raw; + ps_t pps; + _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_upmap(*pool, pg, &raw); + _raw_to_up_osds(*pool, raw, up); + *primary = _pick_primary(raw); + _apply_primary_affinity(pps, *pool, up, primary); +} + +void OSDMap::_pg_to_up_acting_osds( + const pg_t& pg, vector *up, int *up_primary, + vector *acting, int *acting_primary, + bool raw_pg_to_pg) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool || + (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) { + if (up) + up->clear(); + if (up_primary) + *up_primary = -1; + if (acting) + acting->clear(); + if (acting_primary) + *acting_primary = -1; + return; + } + vector raw; + vector _up; + vector _acting; + int _up_primary; + int _acting_primary; + ps_t pps; + _get_temp_osds(*pool, pg, &_acting, &_acting_primary); + if (_acting.empty() || up || up_primary) { + _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_upmap(*pool, pg, &raw); + _raw_to_up_osds(*pool, raw, &_up); + _up_primary = _pick_primary(_up); + _apply_primary_affinity(pps, *pool, &_up, &_up_primary); + if (_acting.empty()) { + _acting = _up; + if (_acting_primary == -1) { + _acting_primary = _up_primary; + } + } + + if (up) + up->swap(_up); + if (up_primary) + *up_primary = _up_primary; + } + + if (acting) + acting->swap(_acting); + if (acting_primary) + *acting_primary = _acting_primary; +} + +int OSDMap::calc_pg_role_broken(int osd, const vector& acting, int nrep) +{ + // This implementation is broken for EC PGs since the osd may appear + // multiple times in the acting set. 
See + // https://tracker.ceph.com/issues/43213 + if (!nrep) + nrep = acting.size(); + for (int i=0; i& acting) +{ + int nrep = acting.size(); + if (who.shard == shard_id_t::NO_SHARD) { + for (int i=0; i &oldacting, + int newprimary, + const vector &newacting) +{ + if (oldacting.empty() && newacting.empty()) + return false; // both still empty + if (oldacting.empty() ^ newacting.empty()) + return true; // was empty, now not, or vice versa + if (oldprimary != newprimary) + return true; // primary changed + if (calc_pg_role_broken(oldprimary, oldacting) != + calc_pg_role_broken(newprimary, newacting)) + return true; + return false; // same primary (tho replicas may have changed) +} + +uint64_t OSDMap::get_encoding_features() const +{ + uint64_t f = SIGNIFICANT_FEATURES; + if (require_osd_release < ceph_release_t::octopus) { + f &= ~CEPH_FEATURE_SERVER_OCTOPUS; + } + if (require_osd_release < ceph_release_t::nautilus) { + f &= ~CEPH_FEATURE_SERVER_NAUTILUS; + } + if (require_osd_release < ceph_release_t::mimic) { + f &= ~CEPH_FEATURE_SERVER_MIMIC; + } + if (require_osd_release < ceph_release_t::luminous) { + f &= ~(CEPH_FEATURE_SERVER_LUMINOUS | + CEPH_FEATURE_CRUSH_CHOOSE_ARGS); + } + if (require_osd_release < ceph_release_t::kraken) { + f &= ~(CEPH_FEATURE_SERVER_KRAKEN | + CEPH_FEATURE_MSG_ADDR2); + } + if (require_osd_release < ceph_release_t::jewel) { + f &= ~(CEPH_FEATURE_SERVER_JEWEL | + CEPH_FEATURE_NEW_OSDOP_ENCODING | + CEPH_FEATURE_CRUSH_TUNABLES5); + } + return f; +} + +// serialize, unserialize +void OSDMap::encode_client_old(ceph::buffer::list& bl) const +{ + using ceph::encode; + __u16 v = 5; + encode(v, bl); + + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + // for encode(pools, bl); + __u32 n = pools.size(); + encode(n, bl); + + for (const auto &pool : pools) { + n = pool.first; + encode(n, bl); + encode(pool.second, bl, 0); + } + // for encode(pool_name, bl); + n = pool_name.size(); + encode(n, bl); + for (const auto &pname : pool_name) { + n = pname.first; + encode(n, bl); + encode(pname.second, bl); + } + // for encode(pool_max, bl); + n = pool_max; + encode(n, bl); + + encode(flags, bl); + + encode(max_osd, bl); + { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + encode(osd_addrs->client_addrs, bl, 0); + + // for encode(pg_temp, bl); + n = pg_temp->size(); + encode(n, bl); + for (const auto& pg : *pg_temp) { + old_pg_t opg = pg.first.get_old_pg(); + encode(opg, bl); + encode(pg.second, bl); + } + + // crush + ceph::buffer::list cbl; + crush->encode(cbl, 0 /* legacy (no) features */); + encode(cbl, bl); +} + +void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + encode_client_old(bl); + return; + } + + __u16 v = 6; + encode(v, bl); + + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + encode(pools, bl, features); + encode(pool_name, bl); + encode(pool_max, bl); + + encode(flags, bl); + + encode(max_osd, bl); + { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + encode(osd_addrs->client_addrs, bl, features); + + encode(*pg_temp, bl); + + // crush + ceph::buffer::list cbl; + crush->encode(cbl, 0 /* legacy (no) features */); + encode(cbl, bl); + + // extended + __u16 ev = 10; + encode(ev, bl); + 
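+  // extended (ev-gated) section: osd-only data appended after the
+  // client-usable fields above -- heartbeat and cluster addrs, per-osd
+  // info/xinfo, the blocklist, cluster snapshot state and osd uuids.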
encode(osd_addrs->hb_back_addrs, bl, features); + encode(osd_info, bl); + encode(blocklist, bl, features); + encode(osd_addrs->cluster_addrs, bl, features); + encode(cluster_snapshot_epoch, bl); + encode(cluster_snapshot, bl); + encode(*osd_uuid, bl); + encode(osd_xinfo, bl, features); + encode(osd_addrs->hb_front_addrs, bl, features); +} + +/* for a description of osdmap versions, and when they were introduced, please + * refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) { + encode_classic(bl, features); + return; + } + + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. + ceph_assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + size_t crc_offset; + std::optional crc_filler; + + // meta-encoding: how we include client-used and osd-specific data + ENCODE_START(8, 7, bl); + + { + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + uint8_t v = 9; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 3; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 6; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 7; + } + ENCODE_START(v, 1, bl); // client-usable data + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + encode(pools, bl, features); + encode(pool_name, bl); + encode(pool_max, bl); + + if (v < 4) { + decltype(flags) f = flags; + if (require_osd_release >= ceph_release_t::luminous) + f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES; + else if (require_osd_release == ceph_release_t::kraken) + f |= CEPH_OSDMAP_REQUIRE_KRAKEN; + else if (require_osd_release == ceph_release_t::jewel) + f |= CEPH_OSDMAP_REQUIRE_JEWEL; + encode(f, bl); + } else { + encode(flags, bl); + } + + encode(max_osd, bl); + if (v >= 5) { + encode(osd_state, bl); + } else { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + if (v >= 8) { + encode(osd_addrs->client_addrs, bl, features); + } else { + encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features); + } + + encode(*pg_temp, bl); + encode(*primary_temp, bl); + if (osd_primary_affinity) { + encode(*osd_primary_affinity, bl); + } else { + vector<__u32> v; + encode(v, bl); + } + + // crush + ceph::buffer::list cbl; + crush->encode(cbl, features); + encode(cbl, bl); + encode(erasure_code_profiles, bl); + + if (v >= 4) { + encode(pg_upmap, bl); + encode(pg_upmap_items, bl); + } else { + ceph_assert(pg_upmap.empty()); + ceph_assert(pg_upmap_items.empty()); + } + if (v >= 6) { + encode(crush_version, bl); + } + if (v >= 7) { + encode(new_removed_snaps, bl); + encode(new_purged_snaps, bl); + } + if (v >= 9) { + encode(last_up_change, bl); + encode(last_in_change, bl); + } + ENCODE_FINISH(bl); // client-usable data + } + + { + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + uint8_t target_v = 9; // when bumping this, be aware of range blocklist + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + target_v = 1; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + target_v = 5; + } else if 
(!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + target_v = 6; + } + if (stretch_mode_enabled) { + target_v = std::max((uint8_t)10, target_v); + } + if (!range_blocklist.empty()) { + target_v = std::max((uint8_t)11, target_v); + } + ENCODE_START(target_v, 1, bl); // extended, osd-only data + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features); + } else { + encode(osd_addrs->hb_back_addrs, bl, features); + } + encode(osd_info, bl); + { + // put this in a sorted, ordered map<> so that we encode in a + // deterministic order. + map blocklist_map; + for (const auto &addr : blocklist) + blocklist_map.insert(make_pair(addr.first, addr.second)); + encode(blocklist_map, bl, features); + } + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features); + } else { + encode(osd_addrs->cluster_addrs, bl, features); + } + encode(cluster_snapshot_epoch, bl); + encode(cluster_snapshot, bl); + encode(*osd_uuid, bl); + encode(osd_xinfo, bl, features); + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features); + } else { + encode(osd_addrs->hb_front_addrs, bl, features); + } + if (target_v >= 2) { + encode(nearfull_ratio, bl); + encode(full_ratio, bl); + encode(backfillfull_ratio, bl); + } + // 4 was string-based new_require_min_compat_client + if (target_v >= 5) { + encode(require_min_compat_client, bl); + encode(require_osd_release, bl); + } + if (target_v >= 6) { + encode(removed_snaps_queue, bl); + } + if (target_v >= 8) { + encode(crush_node_flags, bl); + } + if (target_v >= 9) { + encode(device_class_flags, bl); + } + if (target_v >= 10) { + encode(stretch_mode_enabled, bl); + encode(stretch_bucket_count, bl); + encode(degraded_stretch_mode, bl); + encode(recovering_stretch_mode, bl); + encode(stretch_mode_bucket, bl); + } + if (target_v >= 11) { + ::encode(range_blocklist, bl, features); + } + ENCODE_FINISH(bl); // osd-only data + } + + crc_offset = bl.length(); + crc_filler = bl.append_hole(sizeof(uint32_t)); + tail_offset = bl.length(); + + ENCODE_FINISH(bl); // meta-encoding wrapper + + // fill in crc + ceph::buffer::list front; + front.substr_of(bl, start_offset, crc_offset - start_offset); + crc = front.crc32c(-1); + if (tail_offset < bl.length()) { + ceph::buffer::list tail; + tail.substr_of(bl, tail_offset, bl.length() - tail_offset); + crc = tail.crc32c(crc); + } + ceph_le32 crc_le; + crc_le = crc; + crc_filler->copy_in(4, (char*)&crc_le); + crc_defined = true; +} + +/* for a description of osdmap versions, and when they were introduced, please + * refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::decode(ceph::buffer::list& bl) +{ + auto p = bl.cbegin(); + decode(p); +} + +void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p) +{ + using ceph::decode; + __u32 n, t; + __u16 v; + decode(v, p); + + // base + decode(fsid, p); + decode(epoch, p); + decode(created, p); + decode(modified, p); + + if (v < 6) { + if (v < 4) { + int32_t max_pools = 0; + decode(max_pools, p); + pool_max = max_pools; + } + pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(pools[t], p); + } + if (v == 4) { + decode(n, p); + pool_max = n; + } else if (v == 5) { + pool_name.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(pool_name[t], p); + } + decode(n, p); + pool_max = n; + } + } else { + decode(pools, p); + decode(pool_name, p); + decode(pool_max, p); + } + // kludge around some old bug that zeroed out pool_max (#2307) + if (pools.size() && 
pool_max < pools.rbegin()->first) { + pool_max = pools.rbegin()->first; + } + + decode(flags, p); + + decode(max_osd, p); + { + vector os; + decode(os, p); + osd_state.resize(os.size()); + for (unsigned i = 0; i < os.size(); ++i) { + osd_state[i] = os[i]; + } + } + decode(osd_weight, p); + decode(osd_addrs->client_addrs, p); + if (v <= 5) { + pg_temp->clear(); + decode(n, p); + while (n--) { + old_pg_t opg; + ceph::decode_raw(opg, p); + mempool::osdmap::vector v; + decode(v, p); + pg_temp->set(pg_t(opg), v); + } + } else { + decode(*pg_temp, p); + } + + // crush + ceph::buffer::list cbl; + decode(cbl, p); + auto cblp = cbl.cbegin(); + crush->decode(cblp); + + // extended + __u16 ev = 0; + if (v >= 5) + decode(ev, p); + decode(osd_addrs->hb_back_addrs, p); + decode(osd_info, p); + if (v < 5) + decode(pool_name, p); + + decode(blocklist, p); + if (ev >= 6) + decode(osd_addrs->cluster_addrs, p); + else + osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size()); + + if (ev >= 7) { + decode(cluster_snapshot_epoch, p); + decode(cluster_snapshot, p); + } + + if (ev >= 8) { + decode(*osd_uuid, p); + } else { + osd_uuid->resize(max_osd); + } + if (ev >= 9) + decode(osd_xinfo, p); + else + osd_xinfo.resize(max_osd); + + if (ev >= 10) + decode(osd_addrs->hb_front_addrs, p); + else + osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size()); + + osd_primary_affinity.reset(); + + post_decode(); +} + +void OSDMap::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + /** + * Older encodings of the OSDMap had a single struct_v which + * covered the whole encoding, and was prior to our modern + * stuff which includes a compatv and a size. So if we see + * a struct_v < 7, we must rewind to the beginning and use our + * classic decoder. + */ + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + ceph::buffer::list crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper + if (struct_v < 7) { + bl.seek(start_offset); + decode_classic(bl); + return; + } + /** + * Since we made it past that hurdle, we can use our normal paths. + */ + { + DECODE_START(9, bl); // client-usable data + // base + decode(fsid, bl); + decode(epoch, bl); + decode(created, bl); + decode(modified, bl); + + decode(pools, bl); + decode(pool_name, bl); + decode(pool_max, bl); + + decode(flags, bl); + + decode(max_osd, bl); + if (struct_v >= 5) { + decode(osd_state, bl); + } else { + vector os; + decode(os, bl); + osd_state.resize(os.size()); + for (unsigned i = 0; i < os.size(); ++i) { + osd_state[i] = os[i]; + } + } + decode(osd_weight, bl); + decode(osd_addrs->client_addrs, bl); + + decode(*pg_temp, bl); + decode(*primary_temp, bl); + // dates back to firefly. version increased from 2 to 3 still in firefly. + // do we really still need to keep this around? even for old clients? + if (struct_v >= 2) { + osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>); + decode(*osd_primary_affinity, bl); + if (osd_primary_affinity->empty()) + osd_primary_affinity.reset(); + } else { + osd_primary_affinity.reset(); + } + + // crush + ceph::buffer::list cbl; + decode(cbl, bl); + auto cblp = cbl.cbegin(); + crush->decode(cblp); + // added in firefly; version increased in luminous, so it affects + // giant, hammer, infernallis, jewel, and kraken. probably should be left + // alone until we require clients to be all luminous? 
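+    // the remaining client-section fields are gated on struct_v, mirroring
+    // encode(): v3 adds erasure_code_profiles, v4 pg_upmap/pg_upmap_items,
+    // v6 crush_version, v7 new_removed/new_purged_snaps, and v9 the
+    // last_up_change/last_in_change stamps.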
+ if (struct_v >= 3) { + decode(erasure_code_profiles, bl); + } else { + erasure_code_profiles.clear(); + } + // version increased from 3 to 4 still in luminous, so same as above + // applies. + if (struct_v >= 4) { + decode(pg_upmap, bl); + decode(pg_upmap_items, bl); + } else { + pg_upmap.clear(); + pg_upmap_items.clear(); + } + // again, version increased from 5 to 6 still in luminous, so above + // applies. + if (struct_v >= 6) { + decode(crush_version, bl); + } + // version increase from 6 to 7 in mimic + if (struct_v >= 7) { + decode(new_removed_snaps, bl); + decode(new_purged_snaps, bl); + } + // version increase from 7 to 8, 8 to 9, in nautilus. + if (struct_v >= 9) { + decode(last_up_change, bl); + decode(last_in_change, bl); + } + DECODE_FINISH(bl); // client-usable data + } + + { + DECODE_START(10, bl); // extended, osd-only data + decode(osd_addrs->hb_back_addrs, bl); + decode(osd_info, bl); + decode(blocklist, bl); + decode(osd_addrs->cluster_addrs, bl); + decode(cluster_snapshot_epoch, bl); + decode(cluster_snapshot, bl); + decode(*osd_uuid, bl); + decode(osd_xinfo, bl); + decode(osd_addrs->hb_front_addrs, bl); + // + if (struct_v >= 2) { + decode(nearfull_ratio, bl); + decode(full_ratio, bl); + } else { + nearfull_ratio = 0; + full_ratio = 0; + } + if (struct_v >= 3) { + decode(backfillfull_ratio, bl); + } else { + backfillfull_ratio = 0; + } + if (struct_v == 4) { + string r; + decode(r, bl); + if (r.length()) + require_min_compat_client = ceph_release_from_name(r.c_str()); + } + if (struct_v >= 5) { + decode(require_min_compat_client, bl); + decode(require_osd_release, bl); + if (require_osd_release >= ceph_release_t::nautilus) { + flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT; + } + if (require_osd_release >= ceph_release_t::luminous) { + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } + } else { + if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) { + // only for compat with post-kraken pre-luminous test clusters + require_osd_release = ceph_release_t::luminous; + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) { + require_osd_release = ceph_release_t::kraken; + } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) { + require_osd_release = ceph_release_t::jewel; + } else { + require_osd_release = ceph_release_t::unknown; + } + } + if (struct_v >= 6) { + decode(removed_snaps_queue, bl); + } + if (struct_v >= 8) { + decode(crush_node_flags, bl); + } else { + crush_node_flags.clear(); + } + if (struct_v >= 9) { + decode(device_class_flags, bl); + } else { + device_class_flags.clear(); + } + if (struct_v >= 10) { + decode(stretch_mode_enabled, bl); + decode(stretch_bucket_count, bl); + decode(degraded_stretch_mode, bl); + decode(recovering_stretch_mode, bl); + decode(stretch_mode_bucket, bl); + } else { + stretch_mode_enabled = false; + stretch_bucket_count = 0; + degraded_stretch_mode = 0; + recovering_stretch_mode = 0; + stretch_mode_bucket = 0; + } + if (struct_v >= 11) { + decode(range_blocklist, bl); + calculated_ranges.clear(); + for (const auto& i : range_blocklist) { + calculated_ranges.emplace(i.first, i.first); + } + } + DECODE_FINISH(bl); // osd-only data + } + + if (struct_v >= 8) { + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + decode(crc, bl); + tail_offset = bl.get_off(); + crc_defined = true; + } else { + crc_defined = false; + crc = 0; + } + + DECODE_FINISH(bl); // wrapper + + if (tail_offset) { + // verify crc + 
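+    // the crc32c written by encode() covers every encoded byte except the
+    // 4-byte crc field itself: seeded with -1 over the bytes preceding the
+    // crc (crc_front here) and then continued over the bytes following it.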
uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + ceph::buffer::list tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (crc != actual) { + ostringstream ss; + ss << "bad crc, actual " << actual << " != expected " << crc; + string s = ss.str(); + throw ceph::buffer::malformed_input(s.c_str()); + } + } + + post_decode(); +} + +void OSDMap::post_decode() +{ + // index pool names + name_pool.clear(); + for (const auto &pname : pool_name) { + name_pool[pname.second] = pname.first; + } + + calc_num_osds(); + _calc_up_osd_features(); +} + +void OSDMap::dump_erasure_code_profiles( + const mempool::osdmap::map>& profiles, + Formatter *f) +{ + f->open_object_section("erasure_code_profiles"); + for (const auto &profile : profiles) { + f->open_object_section(profile.first.c_str()); + for (const auto &profm : profile.second) { + f->dump_string(profm.first.c_str(), profm.second); + } + f->close_section(); + } + f->close_section(); +} + +void OSDMap::dump_osds(Formatter *f) const +{ + f->open_array_section("osds"); + for (int i=0; iclose_section(); +} + +void OSDMap::dump_osd(int id, Formatter *f) const +{ + ceph_assert(f != nullptr); + if (!exists(id)) { + return; + } + + f->open_object_section("osd_info"); + f->dump_int("osd", id); + f->dump_stream("uuid") << get_uuid(id); + f->dump_int("up", is_up(id)); + f->dump_int("in", is_in(id)); + f->dump_float("weight", get_weightf(id)); + f->dump_float("primary_affinity", get_primary_affinityf(id)); + get_info(id).dump(f); + f->dump_object("public_addrs", get_addrs(id)); + f->dump_object("cluster_addrs", get_cluster_addrs(id)); + f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id)); + f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id)); + // compat + f->dump_stream("public_addr") << get_addrs(id).get_legacy_str(); + f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str(); + f->dump_stream("heartbeat_back_addr") + << get_hb_back_addrs(id).get_legacy_str(); + f->dump_stream("heartbeat_front_addr") + << get_hb_front_addrs(id).get_legacy_str(); + + set st; + get_state(id, st); + f->open_array_section("state"); + for (const auto &state : st) + f->dump_string("state", state); + f->close_section(); + + f->close_section(); +} + +void OSDMap::dump(Formatter *f) const +{ + f->dump_int("epoch", get_epoch()); + f->dump_stream("fsid") << get_fsid(); + f->dump_stream("created") << get_created(); + f->dump_stream("modified") << get_modified(); + f->dump_stream("last_up_change") << last_up_change; + f->dump_stream("last_in_change") << last_in_change; + f->dump_string("flags", get_flag_string()); + f->dump_unsigned("flags_num", flags); + f->open_array_section("flags_set"); + set flagset; + get_flag_set(&flagset); + for (auto p : flagset) { + f->dump_string("flag", p); + } + f->close_section(); + f->dump_unsigned("crush_version", get_crush_version()); + f->dump_float("full_ratio", full_ratio); + f->dump_float("backfillfull_ratio", backfillfull_ratio); + f->dump_float("nearfull_ratio", nearfull_ratio); + f->dump_string("cluster_snapshot", get_cluster_snapshot()); + f->dump_int("pool_max", get_pool_max()); + f->dump_int("max_osd", get_max_osd()); + f->dump_string("require_min_compat_client", + to_string(require_min_compat_client)); + f->dump_string("min_compat_client", + to_string(get_min_compat_client())); + f->dump_string("require_osd_release", + to_string(require_osd_release)); + + f->open_array_section("pools"); + for (const auto &pool : 
pools) { + std::string name(""); + const auto &pni = pool_name.find(pool.first); + if (pni != pool_name.end()) + name = pni->second; + f->open_object_section("pool"); + f->dump_int("pool", pool.first); + f->dump_string("pool_name", name); + pool.second.dump(f); + f->close_section(); + } + f->close_section(); + + dump_osds(f); + + f->open_array_section("osd_xinfo"); + for (int i=0; iopen_object_section("xinfo"); + f->dump_int("osd", i); + osd_xinfo[i].dump(f); + f->close_section(); + } + } + f->close_section(); + + f->open_array_section("pg_upmap"); + for (auto& p : pg_upmap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << p.first; + f->open_array_section("osds"); + for (auto q : p.second) { + f->dump_int("osd", q); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("pg_upmap_items"); + for (auto& p : pg_upmap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << p.first; + f->open_array_section("mappings"); + for (auto& q : p.second) { + f->open_object_section("mapping"); + f->dump_int("from", q.first); + f->dump_int("to", q.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("pg_temp"); + pg_temp->dump(f); + f->close_section(); + + f->open_array_section("primary_temp"); + for (const auto &pg : *primary_temp) { + f->dump_stream("pgid") << pg.first; + f->dump_int("osd", pg.second); + } + f->close_section(); // primary_temp + + f->open_object_section("blocklist"); + for (const auto &addr : blocklist) { + stringstream ss; + ss << addr.first; + f->dump_stream(ss.str().c_str()) << addr.second; + } + f->close_section(); + f->open_object_section("range_blocklist"); + for (const auto &addr : range_blocklist) { + stringstream ss; + ss << addr.first; + f->dump_stream(ss.str().c_str()) << addr.second; + } + f->close_section(); + + dump_erasure_code_profiles(erasure_code_profiles, f); + + f->open_array_section("removed_snaps_queue"); + for (auto& p : removed_snaps_queue) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_removed_snaps"); + for (auto& p : new_removed_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_purged_snaps"); + for (auto& p : new_purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_object_section("crush_node_flags"); + for (auto& i : crush_node_flags) { + string s = crush->item_exists(i.first) ? 
crush->get_item_name(i.first) + : stringify(i.first); + f->open_array_section(s.c_str()); + set st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_object_section("device_class_flags"); + for (auto& i : device_class_flags) { + const char* class_name = crush->get_class_name(i.first); + string s = class_name ? class_name : stringify(i.first); + f->open_array_section(s.c_str()); + set st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_object_section("stretch_mode"); + { + f->dump_bool("stretch_mode_enabled", stretch_mode_enabled); + f->dump_unsigned("stretch_bucket_count", stretch_bucket_count); + f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode); + f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode); + f->dump_int("stretch_mode_bucket", stretch_mode_bucket); + } + f->close_section(); +} + +void OSDMap::generate_test_instances(list& o) +{ + o.push_back(new OSDMap); + + CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY); + o.push_back(new OSDMap); + uuid_d fsid; + o.back()->build_simple(cct, 1, fsid, 16); + o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp + o.back()->blocklist[entity_addr_t()] = utime_t(5, 6); + cct->put(); +} + +string OSDMap::get_flag_string(unsigned f) +{ + string s; + if (f & CEPH_OSDMAP_PAUSERD) + s += ",pauserd"; + if (f & CEPH_OSDMAP_PAUSEWR) + s += ",pausewr"; + if (f & CEPH_OSDMAP_PAUSEREC) + s += ",pauserec"; + if (f & CEPH_OSDMAP_NOUP) + s += ",noup"; + if (f & CEPH_OSDMAP_NODOWN) + s += ",nodown"; + if (f & CEPH_OSDMAP_NOOUT) + s += ",noout"; + if (f & CEPH_OSDMAP_NOIN) + s += ",noin"; + if (f & CEPH_OSDMAP_NOBACKFILL) + s += ",nobackfill"; + if (f & CEPH_OSDMAP_NOREBALANCE) + s += ",norebalance"; + if (f & CEPH_OSDMAP_NORECOVER) + s += ",norecover"; + if (f & CEPH_OSDMAP_NOSCRUB) + s += ",noscrub"; + if (f & CEPH_OSDMAP_NODEEP_SCRUB) + s += ",nodeep-scrub"; + if (f & CEPH_OSDMAP_NOTIERAGENT) + s += ",notieragent"; + if (f & CEPH_OSDMAP_NOSNAPTRIM) + s += ",nosnaptrim"; + if (f & CEPH_OSDMAP_SORTBITWISE) + s += ",sortbitwise"; + if (f & CEPH_OSDMAP_REQUIRE_JEWEL) + s += ",require_jewel_osds"; + if (f & CEPH_OSDMAP_REQUIRE_KRAKEN) + s += ",require_kraken_osds"; + if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS) + s += ",require_luminous_osds"; + if (f & CEPH_OSDMAP_RECOVERY_DELETES) + s += ",recovery_deletes"; + if (f & CEPH_OSDMAP_PURGED_SNAPDIRS) + s += ",purged_snapdirs"; + if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT) + s += ",pglog_hardlimit"; + if (s.length()) + s.erase(0, 1); + return s; +} + +string OSDMap::get_flag_string() const +{ + return get_flag_string(flags); +} + +void OSDMap::print_pools(ostream& out) const +{ + for (const auto &pool : pools) { + std::string name(""); + const auto &pni = pool_name.find(pool.first); + if (pni != pool_name.end()) + name = pni->second; + out << "pool " << pool.first + << " '" << name + << "' " << pool.second << "\n"; + + for (const auto &snap : pool.second.snaps) + out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n"; + + if (!pool.second.removed_snaps.empty()) + out << "\tremoved_snaps " << pool.second.removed_snaps << "\n"; + auto p = removed_snaps_queue.find(pool.first); + if (p != removed_snaps_queue.end()) { + out << "\tremoved_snaps_queue " << p->second << "\n"; + } + } + out << std::endl; +} + +void 
OSDMap::print_osds(ostream& out) const +{ + for (int i=0; i st; + get_state(id, st); + out << " " << st; + if (!get_uuid(id).is_zero()) { + out << " " << get_uuid(id); + } + out << "\n"; +} + +void OSDMap::print(ostream& out) const +{ + out << "epoch " << get_epoch() << "\n" + << "fsid " << get_fsid() << "\n" + << "created " << get_created() << "\n" + << "modified " << get_modified() << "\n"; + + out << "flags " << get_flag_string() << "\n"; + out << "crush_version " << get_crush_version() << "\n"; + out << "full_ratio " << full_ratio << "\n"; + out << "backfillfull_ratio " << backfillfull_ratio << "\n"; + out << "nearfull_ratio " << nearfull_ratio << "\n"; + if (require_min_compat_client != ceph_release_t::unknown) { + out << "require_min_compat_client " + << require_min_compat_client << "\n"; + } + out << "min_compat_client " << get_min_compat_client() + << "\n"; + if (require_osd_release > ceph_release_t::unknown) { + out << "require_osd_release " << require_osd_release + << "\n"; + } + out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n"; + if (stretch_mode_enabled) { + out << "stretch_bucket_count " << stretch_bucket_count << "\n"; + out << "degraded_stretch_mode " << degraded_stretch_mode << "\n"; + out << "recovering_stretch_mode " << recovering_stretch_mode << "\n"; + out << "stretch_mode_bucket " << stretch_mode_bucket << "\n"; + } + if (get_cluster_snapshot().length()) + out << "cluster_snapshot " << get_cluster_snapshot() << "\n"; + out << "\n"; + + print_pools(out); + + out << "max_osd " << get_max_osd() << "\n"; + print_osds(out); + out << std::endl; + + for (auto& p : pg_upmap) { + out << "pg_upmap " << p.first << " " << p.second << "\n"; + } + for (auto& p : pg_upmap_items) { + out << "pg_upmap_items " << p.first << " " << p.second << "\n"; + } + + for (const auto& pg : *pg_temp) + out << "pg_temp " << pg.first << " " << pg.second << "\n"; + + for (const auto& pg : *primary_temp) + out << "primary_temp " << pg.first << " " << pg.second << "\n"; + + for (const auto &addr : blocklist) + out << "blocklist " << addr.first << " expires " << addr.second << "\n"; + for (const auto &addr : range_blocklist) + out << "range blocklist " << addr.first << " expires " << addr.second << "\n"; +} + +class OSDTreePlainDumper : public CrushTreeDumper::Dumper { +public: + typedef CrushTreeDumper::Dumper Parent; + + OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + unsigned f) + : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { } + + bool should_dump_leaf(int i) const override { + if (!filter) { + return true; // normal case + } + if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || + ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) || + ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) || + ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) || + ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) { + return true; + } + return false; + } + + bool should_dump_empty_bucket() const override { + return !filter; + } + + void init_table(TextTable *tbl) { + tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT); + tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("PRI-AFF", TextTable::LEFT, 
TextTable::RIGHT); + } + void dump(TextTable *tbl, string& bucket) { + init_table(tbl); + + if (!bucket.empty()) { + set_root(bucket); + Parent::dump(tbl); + } else { + Parent::dump(tbl); + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) { + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl); + } + } + } + } + +protected: + void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override { + const char *c = crush->get_item_class(qi.id); + if (!c) + c = ""; + *tbl << qi.id + << c + << weightf_t(qi.weight); + + ostringstream name; + for (int k = 0; k < qi.depth; k++) + name << " "; + if (qi.is_bucket()) { + name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " " + << crush->get_item_name(qi.id); + } else { + name << "osd." << qi.id; + } + *tbl << name.str(); + + if (!qi.is_bucket()) { + if (!osdmap->exists(qi.id)) { + *tbl << "DNE" + << 0; + } else { + string s; + if (osdmap->is_up(qi.id)) { + s = "up"; + } else if (osdmap->is_destroyed(qi.id)) { + s = "destroyed"; + } else { + s = "down"; + } + *tbl << s + << weightf_t(osdmap->get_weightf(qi.id)) + << weightf_t(osdmap->get_primary_affinityf(qi.id)); + } + } + *tbl << TextTable::endrow; + } + +private: + const OSDMap *osdmap; + const unsigned filter; +}; + +class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper { +public: + typedef CrushTreeDumper::FormattingDumper Parent; + + OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + unsigned f) + : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { } + + bool should_dump_leaf(int i) const override { + if (!filter) { + return true; // normal case + } + if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || + ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) || + ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) || + ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) || + ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) { + return true; + } + return false; + } + + bool should_dump_empty_bucket() const override { + return !filter; + } + + void dump(Formatter *f, string& bucket) { + if (!bucket.empty()) { + set_root(bucket); + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + } else { + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + f->open_array_section("stray"); + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f); + } + f->close_section(); + } + } + +protected: + void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override { + Parent::dump_item_fields(qi, f); + if (!qi.is_bucket()) + { + string s; + if (osdmap->is_up(qi.id)) { + s = "up"; + } else if (osdmap->is_destroyed(qi.id)) { + s = "destroyed"; + } else { + s = "down"; + } + f->dump_unsigned("exists", (int)osdmap->exists(qi.id)); + f->dump_string("status", s); + f->dump_float("reweight", osdmap->get_weightf(qi.id)); + f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id)); + } + } + +private: + const OSDMap *osdmap; + const unsigned filter; +}; + +void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const +{ + if (f) { + OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket); + } else { + ceph_assert(out); + TextTable tbl; + OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket); + *out << tbl; + } +} + +void 
OSDMap::print_summary(Formatter *f, ostream& out, + const string& prefix, bool extra) const +{ + if (f) { + f->dump_int("epoch", get_epoch()); + f->dump_int("num_osds", get_num_osds()); + f->dump_int("num_up_osds", get_num_up_osds()); + f->dump_int("osd_up_since", last_up_change.to_msec() / 1000); + f->dump_int("num_in_osds", get_num_in_osds()); + f->dump_int("osd_in_since", last_in_change.to_msec() / 1000); + f->dump_unsigned("num_remapped_pgs", get_num_pg_temp()); + } else { + utime_t now = ceph_clock_now(); + out << get_num_osds() << " osds: " + << get_num_up_osds() << " up"; + if (last_up_change != utime_t()) { + out << " (since " << utimespan_str(now - last_up_change) << ")"; + } + out << ", " << get_num_in_osds() << " in"; + if (last_in_change != utime_t()) { + out << " (since " << utimespan_str(now - last_in_change) << ")"; + } + if (extra) + out << "; epoch: e" << get_epoch(); + if (get_num_pg_temp()) + out << "; " << get_num_pg_temp() << " remapped pgs"; + out << "\n"; + uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS; + if (important_flags) + out << prefix << "flags " << get_flag_string(important_flags) << "\n"; + } +} + +void OSDMap::print_oneline_summary(ostream& out) const +{ + out << "e" << get_epoch() << ": " + << get_num_osds() << " total, " + << get_num_up_osds() << " up, " + << get_num_in_osds() << " in"; +} + +bool OSDMap::crush_rule_in_use(int rule_id) const +{ + for (const auto &pool : pools) { + if (pool.second.crush_rule == rule_id) + return true; + } + return false; +} + +int OSDMap::validate_crush_rules(CrushWrapper *newcrush, + ostream *ss) const +{ + for (auto& i : pools) { + auto& pool = i.second; + int ruleno = pool.get_crush_rule(); + if (!newcrush->rule_exists(ruleno)) { + *ss << "pool " << i.first << " references crush_rule " << ruleno + << " but it is not present"; + return -EINVAL; + } + if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) { + *ss << "rule " << ruleno << " mask ruleset does not match rule id"; + return -EINVAL; + } + if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) { + *ss << "pool " << i.first << " type does not match rule " << ruleno; + return -EINVAL; + } + int poolsize = pool.get_size(); + if (poolsize < newcrush->get_rule_mask_min_size(ruleno) || + poolsize > newcrush->get_rule_mask_max_size(ruleno)) { + *ss << "pool " << i.first << " size " << poolsize << " does not" + << " fall within rule " << ruleno + << " min_size " << newcrush->get_rule_mask_min_size(ruleno) + << " and max_size " << newcrush->get_rule_mask_max_size(ruleno); + return -EINVAL; + } + } + return 0; +} + +int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, + int nosd, int pg_bits, int pgp_bits, + bool default_pool) +{ + ldout(cct, 10) << "build_simple on " << nosd + << " osds" << dendl; + epoch = e; + set_fsid(fsid); + created = modified = ceph_clock_now(); + + if (nosd >= 0) { + set_max_osd(nosd); + } else { + // count osds + int maxosd = 0; + const auto& conf = cct->_conf; + vector sections; + conf.get_all_sections(sections); + + for (auto §ion : sections) { + if (section.find("osd.") != 0) + continue; + + const char *begin = section.c_str() + 4; + char *end = (char*)begin; + int o = strtol(begin, &end, 10); + if (*end != '\0') + continue; + + if (o > cct->_conf->mon_max_osd) { + lderr(cct) << "[osd." 
<< o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl; + return -ERANGE; + } + + if (o > maxosd) + maxosd = o; + } + + set_max_osd(maxosd + 1); + } + + + stringstream ss; + int r; + if (nosd >= 0) + r = build_simple_crush_map(cct, *crush, nosd, &ss); + else + r = build_simple_crush_map_from_conf(cct, *crush, &ss); + ceph_assert(r == 0); + + int poolbase = get_max_osd() ? get_max_osd() : 1; + + const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct); + ceph_assert(default_replicated_rule >= 0); + + if (default_pool) { + // pgp_num <= pg_num + if (pgp_bits > pg_bits) + pgp_bits = pg_bits; + + vector pool_names; + pool_names.push_back("rbd"); + for (auto &plname : pool_names) { + int64_t pool = ++pool_max; + pools[pool].type = pg_pool_t::TYPE_REPLICATED; + pools[pool].flags = cct->_conf->osd_pool_default_flags; + if (cct->_conf->osd_pool_default_flag_hashpspool) + pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL); + if (cct->_conf->osd_pool_default_flag_nodelete) + pools[pool].set_flag(pg_pool_t::FLAG_NODELETE); + if (cct->_conf->osd_pool_default_flag_nopgchange) + pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE); + if (cct->_conf->osd_pool_default_flag_nosizechange) + pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE); + if (cct->_conf->osd_pool_default_flag_bulk) + pools[pool].set_flag(pg_pool_t::FLAG_BULK); + pools[pool].size = cct->_conf.get_val("osd_pool_default_size"); + pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size( + pools[pool].size); + pools[pool].crush_rule = default_replicated_rule; + pools[pool].object_hash = CEPH_STR_HASH_RJENKINS; + pools[pool].set_pg_num(poolbase << pg_bits); + pools[pool].set_pgp_num(poolbase << pgp_bits); + pools[pool].set_pg_num_target(poolbase << pg_bits); + pools[pool].set_pgp_num_target(poolbase << pgp_bits); + pools[pool].last_change = epoch; + pools[pool].application_metadata.insert( + {pg_pool_t::APPLICATION_NAME_RBD, {}}); + if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name( + cct->_conf.get_val("osd_pool_default_pg_autoscale_mode")); + m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) { + pools[pool].pg_autoscale_mode = m; + } else { + pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF; + } + pool_name[pool] = plname; + name_pool[plname] = pool; + } + } + + map profile_map; + r = get_erasure_code_profile_default(cct, profile_map, &ss); + if (r < 0) { + lderr(cct) << ss.str() << dendl; + return r; + } + set_erasure_code_profile("default", profile_map); + return 0; +} + +int OSDMap::get_erasure_code_profile_default(CephContext *cct, + map &profile_map, + ostream *ss) +{ + int r = get_json_str_map(cct->_conf.get_val("osd_pool_default_erasure_code_profile"), + *ss, + &profile_map); + return r; +} + +int OSDMap::_build_crush_types(CrushWrapper& crush) +{ + crush.set_type_name(0, "osd"); + crush.set_type_name(1, "host"); + crush.set_type_name(2, "chassis"); + crush.set_type_name(3, "rack"); + crush.set_type_name(4, "row"); + crush.set_type_name(5, "pdu"); + crush.set_type_name(6, "pod"); + crush.set_type_name(7, "room"); + crush.set_type_name(8, "datacenter"); + crush.set_type_name(9, "zone"); + crush.set_type_name(10, "region"); + crush.set_type_name(11, "root"); + return 11; +} + +int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush, + int nosd, ostream *ss) +{ + crush.create(); + + // root + int root_type = _build_crush_types(crush); + int rootid; + int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT, + root_type, 0, NULL, NULL, &rootid); 
+ ceph_assert(r == 0); + crush.set_item_name(rootid, "default"); + + map loc{ + {"host", "localhost"}, + {"rack", "localrack"}, + {"root", "default"} + }; + for (int o=0; o_conf; + + crush.create(); + + // root + int root_type = _build_crush_types(crush); + int rootid; + int r = crush.add_bucket(0, 0, + CRUSH_HASH_DEFAULT, + root_type, 0, NULL, NULL, &rootid); + ceph_assert(r == 0); + crush.set_item_name(rootid, "default"); + + // add osds + vector sections; + conf.get_all_sections(sections); + + for (auto §ion : sections) { + if (section.find("osd.") != 0) + continue; + + const char *begin = section.c_str() + 4; + char *end = (char*)begin; + int o = strtol(begin, &end, 10); + if (*end != '\0') + continue; + + string host, rack, row, room, dc, pool; + vector sectiontmp; + sectiontmp.push_back("osd"); + sectiontmp.push_back(section); + conf.get_val_from_conf_file(sectiontmp, "host", host, false); + conf.get_val_from_conf_file(sectiontmp, "rack", rack, false); + conf.get_val_from_conf_file(sectiontmp, "row", row, false); + conf.get_val_from_conf_file(sectiontmp, "room", room, false); + conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false); + conf.get_val_from_conf_file(sectiontmp, "root", pool, false); + + if (host.length() == 0) + host = "unknownhost"; + if (rack.length() == 0) + rack = "unknownrack"; + + map loc; + loc["host"] = host; + loc["rack"] = rack; + if (row.size()) + loc["row"] = row; + if (room.size()) + loc["room"] = room; + if (dc.size()) + loc["datacenter"] = dc; + loc["root"] = "default"; + + ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl; + crush.insert_item(cct, o, 1.0, section, loc); + } + + build_simple_crush_rules(cct, crush, "default", ss); + + crush.finalize(); + + return 0; +} + + +int OSDMap::build_simple_crush_rules( + CephContext *cct, + CrushWrapper& crush, + const string& root, + ostream *ss) +{ + int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct); + string failure_domain = + crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type); + + int r; + r = crush.add_simple_rule_at( + "replicated_rule", root, failure_domain, "", + "firstn", pg_pool_t::TYPE_REPLICATED, + crush_rule, ss); + if (r < 0) + return r; + // do not add an erasure rule by default or else we will implicitly + // require the crush_v2 feature of clients + return 0; +} + +int OSDMap::summarize_mapping_stats( + OSDMap *newmap, + const set *pools, + std::string *out, + Formatter *f) const +{ + set ls; + if (pools) { + ls = *pools; + } else { + for (auto &p : get_pools()) + ls.insert(p.first); + } + + unsigned total_pg = 0; + unsigned moved_pg = 0; + vector base_by_osd(get_max_osd(), 0); + vector new_by_osd(get_max_osd(), 0); + for (int64_t pool_id : ls) { + const pg_pool_t *pi = get_pg_pool(pool_id); + vector up, up2; + int up_primary; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, pool_id); + total_pg += pi->get_size(); + pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr); + for (int osd : up) { + if (osd >= 0 && osd < get_max_osd()) + ++base_by_osd[osd]; + } + if (newmap) { + newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr); + for (int osd : up2) { + if (osd >= 0 && osd < get_max_osd()) + ++new_by_osd[osd]; + } + if (pi->type == pg_pool_t::TYPE_ERASURE) { + for (unsigned i=0; itype == pg_pool_t::TYPE_REPLICATED) { + for (int osd : up) { + if (std::find(up2.begin(), up2.end(), osd) == up2.end()) { + ++moved_pg; + } + } + } else { + ceph_abort_msg("unhandled pool type"); + } + } + } + } + + 
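+  // what follows computes distribution statistics over up+in OSDs: the mean
+  // PG count avg_pg, the observed standard deviation before (base) and after
+  // (new) the candidate map, and the baseline stddev expected from uniformly
+  // random placement, sqrt(avg_pg * (1 - 1/num_up_in)).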
unsigned num_up_in = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) + ++num_up_in; + } + if (!num_up_in) { + return -EINVAL; + } + + float avg_pg = (float)total_pg / (float)num_up_in; + float base_stddev = 0, new_stddev = 0; + int min = -1, max = -1; + unsigned min_base_pg = 0, max_base_pg = 0; + unsigned min_new_pg = 0, max_new_pg = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) { + float base_diff = (float)base_by_osd[osd] - avg_pg; + base_stddev += base_diff * base_diff; + float new_diff = (float)new_by_osd[osd] - avg_pg; + new_stddev += new_diff * new_diff; + if (min < 0 || base_by_osd[osd] < min_base_pg) { + min = osd; + min_base_pg = base_by_osd[osd]; + min_new_pg = new_by_osd[osd]; + } + if (max < 0 || base_by_osd[osd] > max_base_pg) { + max = osd; + max_base_pg = base_by_osd[osd]; + max_new_pg = new_by_osd[osd]; + } + } + } + base_stddev = sqrt(base_stddev / num_up_in); + new_stddev = sqrt(new_stddev / num_up_in); + + float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in))); + + ostringstream ss; + if (f) + f->open_object_section("utilization"); + if (newmap) { + if (f) { + f->dump_unsigned("moved_pgs", moved_pg); + f->dump_unsigned("total_pgs", total_pg); + } else { + float percent = 0; + if (total_pg) + percent = (float)moved_pg * 100.0 / (float)total_pg; + ss << "moved " << moved_pg << " / " << total_pg + << " (" << percent << "%)\n"; + } + } + if (f) { + f->dump_float("avg_pgs", avg_pg); + f->dump_float("std_dev", base_stddev); + f->dump_float("expected_baseline_std_dev", edev); + if (newmap) + f->dump_float("new_std_dev", new_stddev); + } else { + ss << "avg " << avg_pg << "\n"; + ss << "stddev " << base_stddev; + if (newmap) + ss << " -> " << new_stddev; + ss << " (expected baseline " << edev << ")\n"; + } + if (min >= 0) { + if (f) { + f->dump_unsigned("min_osd", min); + f->dump_unsigned("min_osd_pgs", min_base_pg); + if (newmap) + f->dump_unsigned("new_min_osd_pgs", min_new_pg); + } else { + ss << "min osd." << min << " with " << min_base_pg; + if (newmap) + ss << " -> " << min_new_pg; + ss << " pgs (" << (float)min_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)min_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (max >= 0) { + if (f) { + f->dump_unsigned("max_osd", max); + f->dump_unsigned("max_osd_pgs", max_base_pg); + if (newmap) + f->dump_unsigned("new_max_osd_pgs", max_new_pg); + } else { + ss << "max osd." 
<< max << " with " << max_base_pg; + if (newmap) + ss << " -> " << max_new_pg; + ss << " pgs (" << (float)max_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)max_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (f) + f->close_section(); + if (out) + *out = ss.str(); + return 0; +} + +bool OSDMap::try_pg_upmap( + CephContext *cct, + pg_t pg, ///< pg to potentially remap + const set& overfull, ///< osds we'd want to evacuate + const vector& underfull, ///< osds to move to, in order of preference + const vector& more_underfull, ///< more osds only slightly underfull + vector *orig, + vector *out) ///< resulting alternative mapping +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) + return false; + int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(), + pool->get_size()); + if (rule < 0) + return false; + + // make sure there is something there to remap + bool any = false; + for (auto osd : *orig) { + if (overfull.count(osd)) { + any = true; + break; + } + } + if (!any) { + return false; + } + + int r = crush->try_remap_rule( + cct, + rule, + pool->get_size(), + overfull, underfull, + more_underfull, + *orig, + out); + if (r < 0) + return false; + if (*out == *orig) + return false; + return true; +} + +int OSDMap::calc_pg_upmaps( + CephContext *cct, + uint32_t max_deviation, + int max, + const set& only_pools, + OSDMap::Incremental *pending_inc) +{ + ldout(cct, 10) << __func__ << " pools " << only_pools << dendl; + OSDMap tmp; + // Can't be less than 1 pg + if (max_deviation < 1) + max_deviation = 1; + tmp.deepish_copy_from(*this); + int num_changed = 0; + map> pgs_by_osd; + int total_pgs = 0; + float osd_weight_total = 0; + map osd_weight; + for (auto& i : pools) { + if (!only_pools.empty() && !only_pools.count(i.first)) + continue; + for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) { + pg_t pg(ps, i.first); + vector up; + tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); + ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl; + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + } + total_pgs += i.second.get_size() * i.second.get_pg_num(); + + map pmap; + int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(), + i.second.get_type(), + i.second.get_size()); + tmp.crush->get_rule_weight_osd_map(ruleno, &pmap); + ldout(cct,20) << __func__ << " pool " << i.first + << " ruleno " << ruleno + << " weight-map " << pmap + << dendl; + for (auto p : pmap) { + auto adjusted_weight = tmp.get_weightf(p.first) * p.second; + if (adjusted_weight == 0) { + continue; + } + osd_weight[p.first] += adjusted_weight; + osd_weight_total += adjusted_weight; + } + } + for (auto& i : osd_weight) { + int pgs = 0; + auto p = pgs_by_osd.find(i.first); + if (p != pgs_by_osd.end()) + pgs = p->second.size(); + else + pgs_by_osd.emplace(i.first, set()); + ldout(cct, 20) << " osd." 
<< i.first << " weight " << i.second + << " pgs " << pgs << dendl; + } + if (osd_weight_total == 0) { + lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl; + return 0; + } + float pgs_per_weight = total_pgs / osd_weight_total; + ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl; + ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl; + + if (max <= 0) { + lderr(cct) << __func__ << " abort due to max <= 0" << dendl; + return 0; + } + float stddev = 0; + map osd_deviation; // osd, deviation(pgs) + multimap deviation_osd; // deviation(pgs), osd + float cur_max_deviation = 0; + for (auto& i : pgs_by_osd) { + // make sure osd is still there (belongs to this crush-tree) + ceph_assert(osd_weight.count(i.first)); + float target = osd_weight[i.first] * pgs_per_weight; + float deviation = (float)i.second.size() - target; + ldout(cct, 20) << " osd." << i.first + << "\tpgs " << i.second.size() + << "\ttarget " << target + << "\tdeviation " << deviation + << dendl; + osd_deviation[i.first] = deviation; + deviation_osd.insert(make_pair(deviation, i.first)); + stddev += deviation * deviation; + if (fabsf(deviation) > cur_max_deviation) + cur_max_deviation = fabsf(deviation); + } + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { + ldout(cct, 10) << __func__ << " distribution is almost perfect" + << dendl; + return 0; + } + bool skip_overfull = false; + auto aggressive = + cct->_conf.get_val("osd_calc_pg_upmaps_aggressively"); + auto local_fallback_retries = + cct->_conf.get_val("osd_calc_pg_upmaps_local_fallback_retries"); + while (max--) { + ldout(cct, 30) << "Top of loop #" << max+1 << dendl; + // build overfull and underfull + set overfull; + set more_overfull; + bool using_more_overfull = false; + vector underfull; + vector more_underfull; + for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) { + ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl; + if (i->first <= 0) + break; + if (i->first > max_deviation) { + ldout(cct, 30) << " add overfull osd." << i->second << dendl; + overfull.insert(i->second); + } else { + more_overfull.insert(i->second); + } + } + + for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) { + ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl; + if (i->first >= 0) + break; + if (i->first < -(int)max_deviation) { + ldout(cct, 30) << " add underfull osd." 
<< i->second << dendl; + underfull.push_back(i->second); + } else { + more_underfull.push_back(i->second); + } + } + if (underfull.empty() && overfull.empty()) { + ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl; + break; + } + if (overfull.empty() && !underfull.empty()) { + ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl; + overfull = more_overfull; + using_more_overfull = true; + } + + ldout(cct, 10) << " overfull " << overfull + << " underfull " << underfull + << dendl; + set to_skip; + uint64_t local_fallback_retried = 0; + + retry: + + set to_unmap; + map>> to_upmap; + auto temp_pgs_by_osd = pgs_by_osd; + // always start with fullest, break if we find any changes to make + for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) { + if (skip_overfull && !underfull.empty()) { + ldout(cct, 10) << " skipping overfull " << dendl; + break; // fall through to check underfull + } + int osd = p->second; + float deviation = p->first; + if (deviation < 0) { + ldout(cct, 10) << " hitting underfull osds now" + << " when trying to remap overfull osds" + << dendl; + break; + } + float target = osd_weight[osd] * pgs_per_weight; + ldout(cct, 10) << " Overfull search osd." << osd + << " target " << target + << " deviation " << deviation + << dendl; + ceph_assert(target > 0); + if (!using_more_overfull && deviation <= max_deviation) { + ldout(cct, 10) << " osd." << osd + << " target " << target + << " deviation " << deviation + << " < max deviation " << max_deviation + << dendl; + break; + } + + vector pgs; + pgs.reserve(pgs_by_osd[osd].size()); + for (auto& pg : pgs_by_osd[osd]) { + if (to_skip.count(pg)) + continue; + pgs.push_back(pg); + } + if (aggressive) { + // shuffle PG list so they all get equal (in)attention + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(pgs.begin(), pgs.end(), rng); + } + // look for remaps we can un-remap + for (auto pg : pgs) { + auto p = tmp.pg_upmap_items.find(pg); + if (p == tmp.pg_upmap_items.end()) + continue; + mempool::osdmap::vector> new_upmap_items; + for (auto q : p->second) { + if (q.second == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << q.first << " -> " << q.second + << " which remapped " << pg + << " into overfull osd." << osd + << dendl; + temp_pgs_by_osd[q.second].erase(pg); + temp_pgs_by_osd[q.first].insert(pg); + } else { + new_upmap_items.push_back(q); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << p->second + << " remapped " << pg << " into overfull osd." << osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + goto test_change; + } else if (new_upmap_items.size() != p->second.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < p->second.size()); + ldout(cct, 10) << " existing pg_upmap_items " << p->second + << " remapped " << pg << " into overfull osd." 
<< osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } + + // try upmap + for (auto pg : pgs) { + auto temp_it = tmp.pg_upmap.find(pg); + if (temp_it != tmp.pg_upmap.end()) { + // leave pg_upmap alone + // it must be specified by admin since balancer does not + // support pg_upmap yet + ldout(cct, 10) << " " << pg << " already has pg_upmap " + << temp_it->second << ", skipping" + << dendl; + continue; + } + auto pg_pool_size = tmp.get_pg_pool_size(pg); + mempool::osdmap::vector> new_upmap_items; + set existing; + auto it = tmp.pg_upmap_items.find(pg); + if (it != tmp.pg_upmap_items.end() && + it->second.size() >= (size_t)pg_pool_size) { + ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items " + << it->second << ", skipping" + << dendl; + continue; + } else if (it != tmp.pg_upmap_items.end()) { + ldout(cct, 10) << " " << pg << " already has pg_upmap_items " + << it->second + << dendl; + new_upmap_items = it->second; + // build existing too (for dedup) + for (auto i : it->second) { + existing.insert(i.first); + existing.insert(i.second); + } + // fall through + // to see if we can append more remapping pairs + } + ldout(cct, 10) << " trying " << pg << dendl; + vector raw, orig, out; + tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too + if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) { + continue; + } + ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl; + if (orig.size() != out.size()) { + continue; + } + ceph_assert(orig != out); + int pos = -1; + float max_dev = 0; + for (unsigned i = 0; i < out.size(); ++i) { + if (orig[i] == out[i]) + continue; // skip invalid remappings + if (existing.count(orig[i]) || existing.count(out[i])) + continue; // we want new remappings only! + if (osd_deviation[orig[i]] > max_dev) { + max_dev = osd_deviation[orig[i]]; + pos = i; + ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl; + } + } + if (pos != -1) { + int i = pos; + ldout(cct, 10) << " will try adding new remapping pair " + << orig[i] << " -> " << out[i] << " for " << pg + << (orig[i] != osd ? " NOT selected osd" : "") + << dendl; + existing.insert(orig[i]); + existing.insert(out[i]); + temp_pgs_by_osd[orig[i]].erase(pg); + temp_pgs_by_osd[out[i]].insert(pg); + ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size); + new_upmap_items.push_back(make_pair(orig[i], out[i])); + // append new remapping pairs slowly + // This way we can make sure that each tiny change will + // definitely make distribution of PGs converging to + // the perfect status. + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } + } + + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for overfull osds" + << dendl; + for (auto& p : deviation_osd) { + if (std::find(underfull.begin(), underfull.end(), p.second) == + underfull.end()) + break; + int osd = p.second; + float deviation = p.first; + float target = osd_weight[osd] * pgs_per_weight; + ceph_assert(target > 0); + if (fabsf(deviation) < max_deviation) { + // respect max_deviation too + ldout(cct, 10) << " osd." 
<< osd + << " target " << target + << " deviation " << deviation + << " -> absolute " << fabsf(deviation) + << " < max " << max_deviation + << dendl; + break; + } + // look for remaps we can un-remap + vector>>> candidates; + candidates.reserve(tmp.pg_upmap_items.size()); + for (auto& i : tmp.pg_upmap_items) { + if (to_skip.count(i.first)) + continue; + if (!only_pools.empty() && !only_pools.count(i.first.pool())) + continue; + candidates.push_back(make_pair(i.first, i.second)); + } + if (aggressive) { + // shuffle candidates so they all get equal (in)attention + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(candidates.begin(), candidates.end(), rng); + } + for (auto& i : candidates) { + auto pg = i.first; + mempool::osdmap::vector> new_upmap_items; + for (auto& j : i.second) { + if (j.first == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << j.first << " -> " << j.second + << " which remapped " << pg + << " out from underfull osd." << osd + << dendl; + temp_pgs_by_osd[j.second].erase(pg); + temp_pgs_by_osd[j.first].insert(pg); + } else { + new_upmap_items.push_back(j); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << i.second + << " remapped " << pg + << " out from underfull osd." << osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + goto test_change; + } else if (new_upmap_items.size() != i.second.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < i.second.size()); + ldout(cct, 10) << " existing pg_upmap_items " << i.second + << " remapped " << pg + << " out from underfull osd." << osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } + } + + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for underfull osds" + << dendl; + if (!aggressive) { + ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl; + break; + } else if (!skip_overfull) { + // safe to quit because below here we know + // we've done checking both overfull and underfull osds.. + ldout(cct, 10) << " break due to not being able to find any" + << " further optimizations" + << dendl; + break; + } + // restart with fullest and do exhaustive searching + skip_overfull = false; + continue; + + test_change: + + // test change, apply if change is good + ceph_assert(to_unmap.size() || to_upmap.size()); + float new_stddev = 0; + map temp_osd_deviation; + multimap temp_deviation_osd; + float cur_max_deviation = 0; + for (auto& i : temp_pgs_by_osd) { + // make sure osd is still there (belongs to this crush-tree) + ceph_assert(osd_weight.count(i.first)); + float target = osd_weight[i.first] * pgs_per_weight; + float deviation = (float)i.second.size() - target; + ldout(cct, 20) << " osd." 
<< i.first + << "\tpgs " << i.second.size() + << "\ttarget " << target + << "\tdeviation " << deviation + << dendl; + temp_osd_deviation[i.first] = deviation; + temp_deviation_osd.insert(make_pair(deviation, i.first)); + new_stddev += deviation * deviation; + if (fabsf(deviation) > cur_max_deviation) + cur_max_deviation = fabsf(deviation); + } + ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl; + if (new_stddev >= stddev) { + if (!aggressive) { + ldout(cct, 10) << " break because stddev is not decreasing" + << " and aggressive mode is not enabled" + << dendl; + break; + } + local_fallback_retried++; + if (local_fallback_retried >= local_fallback_retries) { + // does not make progress + // flip *skip_overfull* so both overfull and underfull + // get equal (in)attention + skip_overfull = !skip_overfull; + ldout(cct, 10) << " hit local_fallback_retries " + << local_fallback_retries + << dendl; + continue; + } + for (auto& i : to_unmap) + to_skip.insert(i); + for (auto& i : to_upmap) + to_skip.insert(i.first); + ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried + << " to_skip " << to_skip + << dendl; + goto retry; + } + + // ready to go + ceph_assert(new_stddev < stddev); + stddev = new_stddev; + pgs_by_osd = temp_pgs_by_osd; + osd_deviation = temp_osd_deviation; + deviation_osd = temp_deviation_osd; + for (auto& i : to_unmap) { + ldout(cct, 10) << " unmap pg " << i << dendl; + ceph_assert(tmp.pg_upmap_items.count(i)); + tmp.pg_upmap_items.erase(i); + pending_inc->old_pg_upmap_items.insert(i); + ++num_changed; + } + for (auto& i : to_upmap) { + ldout(cct, 10) << " upmap pg " << i.first + << " new pg_upmap_items " << i.second + << dendl; + tmp.pg_upmap_items[i.first] = i.second; + pending_inc->new_pg_upmap_items[i.first] = i.second; + ++num_changed; + } + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { + ldout(cct, 10) << __func__ << " Optimization plan is almost perfect" + << dendl; + break; + } + } + ldout(cct, 10) << " num_changed = " << num_changed << dendl; + return num_changed; +} + +int OSDMap::get_osds_by_bucket_name(const string &name, set *osds) const +{ + return crush->get_leaves(name, osds); +} + +// get pools whose crush rules might reference the given osd +void OSDMap::get_pool_ids_by_osd(CephContext *cct, + int osd, + set *pool_ids) const +{ + ceph_assert(pool_ids); + set raw_rules; + int r = crush->get_rules_by_osd(osd, &raw_rules); + if (r < 0) { + lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r) + << dendl; + ceph_assert(r >= 0); + } + set rules; + for (auto &i: raw_rules) { + // exclude any dead rule + if (crush_rule_in_use(i)) { + rules.insert(i); + } + } + for (auto &r: rules) { + get_pool_ids_by_rule(r, pool_ids); + } +} + +template +class OSDUtilizationDumper : public CrushTreeDumper::Dumper { +public: + typedef CrushTreeDumper::Dumper Parent; + + OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + const PGMap& pgmap_, bool tree_, + const string& filter) : + Parent(crush, osdmap_->get_pool_names()), + osdmap(osdmap_), + pgmap(pgmap_), + tree(tree_), + min_var(-1), + max_var(-1), + stddev(0), + sum(0) { + if (osdmap->crush->name_exists(filter)) { + // filter by crush node + auto item_id = osdmap->crush->get_item_id(filter); + allowed.insert(item_id); + osdmap->crush->get_all_children(item_id, &allowed); + } else if (osdmap->crush->class_exists(filter)) { + // filter by device class + class_id = 
osdmap->crush->get_class_id(filter); + } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter); + pool_id >= 0) { + // filter by pool + auto crush_rule = osdmap->get_pool_crush_rule(pool_id); + set roots; + osdmap->crush->find_takes_by_rule(crush_rule, &roots); + allowed = roots; + for (auto r : roots) + osdmap->crush->get_all_children(r, &allowed); + } + average_util = average_utilization(); + } + +protected: + + bool should_dump(int id) const { + if (!allowed.empty() && !allowed.count(id)) // filter by name + return false; + if (id >= 0 && class_id >= 0) { + auto item_class_id = osdmap->crush->get_item_class_id(id); + if (item_class_id < 0 || // not bound to a class yet + item_class_id != class_id) // or already bound to a different class + return false; + } + return true; + } + + set get_dumped_osds() { + if (allowed.empty() && class_id < 0) { + // old way, all + return {}; + } + return dumped_osds; + } + + void dump_stray(F *f) { + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !this->is_touched(i)) + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f); + } + } + + void dump_item(const CrushTreeDumper::Item &qi, F *f) override { + if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id))) + return; + if (!should_dump(qi.id)) + return; + + if (!qi.is_bucket()) + dumped_osds.insert(qi.id); + float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id); + int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0, + kb_used_meta = 0, kb_avail = 0; + double util = 0; + if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data, + &kb_used_omap, &kb_used_meta, &kb_avail)) + if (kb_used && kb) + util = 100.0 * (double)kb_used / (double)kb; + + double var = 1.0; + if (average_util) + var = util / average_util; + + size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id); + + dump_item(qi, reweight, kb, kb_used, + kb_used_data, kb_used_omap, kb_used_meta, + kb_avail, util, var, num_pgs, f); + + if (!qi.is_bucket() && reweight > 0) { + if (min_var < 0 || var < min_var) + min_var = var; + if (max_var < 0 || var > max_var) + max_var = var; + + double dev = util - average_util; + dev *= dev; + stddev += reweight * dev; + sum += reweight; + } + } + + virtual void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + F *f) = 0; + + double dev() { + return sum > 0 ? sqrt(stddev / sum) : 0; + } + + double average_utilization() { + int64_t kb = 0, kb_used = 0; + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (!osdmap->exists(i) || + osdmap->get_weight(i) == 0 || + !should_dump(i)) + continue; + int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i, + kb_avail_i; + if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i, + &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) { + kb += kb_i; + kb_used += kb_used_i; + } + } + return kb > 0 ? 
100.0 * (double)kb_used / (double)kb : 0; + } + + bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used, + int64_t* kb_used_data, + int64_t* kb_used_omap, + int64_t* kb_used_meta, + int64_t* kb_avail) const { + const osd_stat_t *p = pgmap.get_osd_stat(id); + if (!p) return false; + *kb = p->statfs.kb(); + *kb_used = p->statfs.kb_used_raw(); + *kb_used_data = p->statfs.kb_used_data(); + *kb_used_omap = p->statfs.kb_used_omap(); + *kb_used_meta = p->statfs.kb_used_internal_metadata(); + *kb_avail = p->statfs.kb_avail(); + + return true; + } + + bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used, + int64_t* kb_used_data, + int64_t* kb_used_omap, + int64_t* kb_used_meta, + int64_t* kb_avail) const { + if (id >= 0) { + if (osdmap->is_out(id) || !should_dump(id)) { + *kb = 0; + *kb_used = 0; + *kb_used_data = 0; + *kb_used_omap = 0; + *kb_used_meta = 0; + *kb_avail = 0; + return true; + } + return get_osd_utilization(id, kb, kb_used, kb_used_data, + kb_used_omap, kb_used_meta, kb_avail); + } + + *kb = 0; + *kb_used = 0; + *kb_used_data = 0; + *kb_used_omap = 0; + *kb_used_meta = 0; + *kb_avail = 0; + + for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) { + int item = osdmap->crush->get_bucket_item(id, k); + int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0, + kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0; + if (!get_bucket_utilization(item, &kb_i, &kb_used_i, + &kb_used_data_i, &kb_used_omap_i, + &kb_used_meta_i, &kb_avail_i)) + return false; + *kb += kb_i; + *kb_used += kb_used_i; + *kb_used_data += kb_used_data_i; + *kb_used_omap += kb_used_omap_i; + *kb_used_meta += kb_used_meta_i; + *kb_avail += kb_avail_i; + } + return true; + } + +protected: + const OSDMap *osdmap; + const PGMap& pgmap; + bool tree; + double average_util; + double min_var; + double max_var; + double stddev; + double sum; + int class_id = -1; + set allowed; + set dumped_osds; +}; + + +class OSDUtilizationPlainDumper : public OSDUtilizationDumper { +public: + typedef OSDUtilizationDumper Parent; + + OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap, + const PGMap& pgmap, bool tree, + const string& filter) : + Parent(crush, osdmap, pgmap, tree, filter) {} + + void dump(TextTable *tbl) { + tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT); + if (tree) + tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT); + + Parent::dump(tbl); + + dump_stray(tbl); + + auto sum = pgmap.get_osd_sum(get_dumped_osds()); + *tbl << "" + << "" + << "" << "TOTAL" + << byte_u_t(sum.statfs.total) + << byte_u_t(sum.statfs.get_used_raw()) + << byte_u_t(sum.statfs.allocated) + << byte_u_t(sum.statfs.omap_allocated) + << 
byte_u_t(sum.statfs.internal_metadata) + << byte_u_t(sum.statfs.available) + << lowprecision_t(average_util) + << "" + << TextTable::endrow; + } + +protected: + struct lowprecision_t { + float v; + explicit lowprecision_t(float _v) : v(_v) {} + }; + friend std::ostream &operator<<(ostream& out, const lowprecision_t& v); + + using OSDUtilizationDumper::dump_item; + void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + TextTable *tbl) override { + const char *c = crush->get_item_class(qi.id); + if (!c) + c = ""; + *tbl << qi.id + << c + << weightf_t(qi.weight) + << weightf_t(reweight) + << byte_u_t(kb << 10) + << byte_u_t(kb_used << 10) + << byte_u_t(kb_used_data << 10) + << byte_u_t(kb_used_omap << 10) + << byte_u_t(kb_used_meta << 10) + << byte_u_t(kb_avail << 10) + << lowprecision_t(util) + << lowprecision_t(var); + + if (qi.is_bucket()) { + *tbl << "-"; + *tbl << ""; + } else { + *tbl << num_pgs; + if (osdmap->is_up(qi.id)) { + *tbl << "up"; + } else if (osdmap->is_destroyed(qi.id)) { + *tbl << "destroyed"; + } else { + *tbl << "down"; + } + } + + if (tree) { + ostringstream name; + for (int k = 0; k < qi.depth; k++) + name << " "; + if (qi.is_bucket()) { + int type = crush->get_bucket_type(qi.id); + name << crush->get_type_name(type) << " " + << crush->get_item_name(qi.id); + } else { + name << "osd." << qi.id; + } + *tbl << name.str(); + } + + *tbl << TextTable::endrow; + } + +public: + string summary() { + ostringstream out; + out << "MIN/MAX VAR: " << lowprecision_t(min_var) + << "/" << lowprecision_t(max_var) << " " + << "STDDEV: " << lowprecision_t(dev()); + return out.str(); + } +}; + +ostream& operator<<(ostream& out, + const OSDUtilizationPlainDumper::lowprecision_t& v) +{ + if (v.v < -0.01) { + return out << "-"; + } else if (v.v < 0.001) { + return out << "0"; + } else { + std::streamsize p = out.precision(); + return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p); + } +} + +class OSDUtilizationFormatDumper : public OSDUtilizationDumper { +public: + typedef OSDUtilizationDumper Parent; + + OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap, + const PGMap& pgmap, bool tree, + const string& filter) : + Parent(crush, osdmap, pgmap, tree, filter) {} + + void dump(Formatter *f) { + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + + f->open_array_section("stray"); + dump_stray(f); + f->close_section(); + } + +protected: + using OSDUtilizationDumper::dump_item; + void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + Formatter *f) override { + f->open_object_section("item"); + CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f); + f->dump_float("reweight", reweight); + f->dump_int("kb", kb); + f->dump_int("kb_used", kb_used); + f->dump_int("kb_used_data", kb_used_data); + f->dump_int("kb_used_omap", kb_used_omap); + f->dump_int("kb_used_meta", kb_used_meta); + f->dump_int("kb_avail", kb_avail); + f->dump_float("utilization", util); + f->dump_float("var", var); + f->dump_unsigned("pgs", num_pgs); + if (!qi.is_bucket()) { + if (osdmap->is_up(qi.id)) { + f->dump_string("status", "up"); + } else if 
(osdmap->is_destroyed(qi.id)) { + f->dump_string("status", "destroyed"); + } else { + f->dump_string("status", "down"); + } + } + CrushTreeDumper::dump_bucket_children(crush, qi, f); + f->close_section(); + } + +public: + void summary(Formatter *f) { + f->open_object_section("summary"); + auto sum = pgmap.get_osd_sum(get_dumped_osds()); + auto& s = sum.statfs; + + f->dump_int("total_kb", s.kb()); + f->dump_int("total_kb_used", s.kb_used_raw()); + f->dump_int("total_kb_used_data", s.kb_used_data()); + f->dump_int("total_kb_used_omap", s.kb_used_omap()); + f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata()); + f->dump_int("total_kb_avail", s.kb_avail()); + f->dump_float("average_utilization", average_util); + f->dump_float("min_var", min_var); + f->dump_float("max_var", max_var); + f->dump_float("dev", dev()); + f->close_section(); + } +}; + +void print_osd_utilization(const OSDMap& osdmap, + const PGMap& pgmap, + ostream& out, + Formatter *f, + bool tree, + const string& filter) +{ + const CrushWrapper *crush = osdmap.crush.get(); + if (f) { + f->open_object_section("df"); + OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter); + d.dump(f); + d.summary(f); + f->close_section(); + f->flush(out); + } else { + OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter); + TextTable tbl; + d.dump(&tbl); + out << tbl << d.summary() << "\n"; + } +} + +void OSDMap::check_health(CephContext *cct, + health_check_map_t *checks) const +{ + int num_osds = get_num_osds(); + + // OSD_DOWN + // OSD_$subtree_DOWN + // OSD_ORPHAN + if (num_osds >= 0) { + int num_in_osds = 0; + int num_down_in_osds = 0; + set osds; + set down_in_osds; + set up_in_osds; + set subtree_up; + unordered_map > subtree_type_down; + unordered_map num_osds_subtree; + int max_type = crush->get_max_type_id(); + + for (int i = 0; i < get_max_osd(); i++) { + if (!exists(i)) { + if (crush->item_exists(i)) { + osds.insert(i); + } + continue; + } + if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW)) + continue; + ++num_in_osds; + if (down_in_osds.count(i) || up_in_osds.count(i)) + continue; + if (!is_up(i)) { + down_in_osds.insert(i); + int parent_id = 0; + int current = i; + for (int type = 0; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + int r = crush->get_immediate_parent_id(current, &parent_id); + if (r == -ENOENT) + break; + // break early if this parent is already marked as up + if (subtree_up.count(parent_id)) + break; + type = crush->get_bucket_type(parent_id); + if (!subtree_type_is_down( + cct, parent_id, type, + &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down)) + break; + current = parent_id; + } + } + } + + // calculate the number of down osds in each down subtree and + // store it in num_osds_subtree + for (int type = 1; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + for (auto j = subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + list children; + int num = 0; + int num_children = crush->get_children(*j, &children); + if (num_children == 0) + continue; + for (auto l = children.begin(); l != children.end(); ++l) { + if (*l >= 0) { + ++num; + } else if (num_osds_subtree[*l] > 0) { + num = num + num_osds_subtree[*l]; + } + } + num_osds_subtree[*j] = num; + } + } + num_down_in_osds = down_in_osds.size(); + ceph_assert(num_down_in_osds <= num_in_osds); + if (num_down_in_osds > 0) { + // summary of down subtree types and osds + for (int type = max_type; type > 0; type--) { + if 
(!crush->get_type_name(type)) + continue; + if (subtree_type_down[type].size() > 0) { + ostringstream ss; + ss << subtree_type_down[type].size() << " " + << crush->get_type_name(type); + if (subtree_type_down[type].size() > 1) { + ss << "s"; + } + int sum_down_osds = 0; + for (auto j = subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + sum_down_osds = sum_down_osds + num_osds_subtree[*j]; + } + ss << " (" << sum_down_osds << " osds) down"; + string err = string("OSD_") + + string(crush->get_type_name(type)) + "_DOWN"; + boost::to_upper(err); + auto& d = checks->add(err, HEALTH_WARN, ss.str(), + subtree_type_down[type].size()); + for (auto j = subtree_type_down[type].rbegin(); + j != subtree_type_down[type].rend(); + ++j) { + ostringstream ss; + ss << crush->get_type_name(type); + ss << " "; + ss << crush->get_item_name(*j); + // at the top level, do not print location + if (type != max_type) { + ss << " ("; + ss << crush->get_full_location_ordered_string(*j); + ss << ")"; + } + int num = num_osds_subtree[*j]; + ss << " (" << num << " osds)"; + ss << " is down"; + d.detail.push_back(ss.str()); + } + } + } + ostringstream ss; + ss << down_in_osds.size() << " osds down"; + auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(), + down_in_osds.size()); + for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) { + ostringstream ss; + ss << "osd." << *it << " ("; + ss << crush->get_full_location_ordered_string(*it); + ss << ") is down"; + d.detail.push_back(ss.str()); + } + } + + if (!osds.empty()) { + ostringstream ss; + ss << osds.size() << " osds exist in the crush map but not in the osdmap"; + auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(), + osds.size()); + for (auto osd : osds) { + ostringstream ss; + ss << "osd." << osd << " exists in crush map but not in osdmap"; + d.detail.push_back(ss.str()); + } + } + } + + std::list scrub_messages; + bool noscrub = false, nodeepscrub = false; + for (const auto &p : pools) { + if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) { + ostringstream ss; + ss << "Pool " << get_pool_name(p.first) << " has noscrub flag"; + scrub_messages.push_back(ss.str()); + noscrub = true; + } + if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) { + ostringstream ss; + ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag"; + scrub_messages.push_back(ss.str()); + nodeepscrub = true; + } + } + if (noscrub || nodeepscrub) { + string out = ""; + out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : ""; + out += nodeepscrub ? "nodeep-scrub" : ""; + auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK, + "Some pool(s) have the " + out + " flag(s) set", 0); + d.detail.splice(d.detail.end(), scrub_messages); + } + + // OSD_OUT_OF_ORDER_FULL + { + // An osd could configure failsafe ratio, to something different + // but for now assume it is the same here. + float fsr = cct->_conf->osd_failsafe_full_ratio; + if (fsr > 1.0) fsr /= 100; + float fr = get_full_ratio(); + float br = get_backfillfull_ratio(); + float nr = get_nearfull_ratio(); + + list detail; + // These checks correspond to how OSDService::check_full_status() in an OSD + // handles the improper setting of these values. 
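+    // The intended ordering is nearfull <= backfillfull <= full <=
+    // osd_failsafe_full_ratio (with the stock defaults, roughly
+    // 0.85 / 0.90 / 0.95 / 0.97).  For example, if an admin raises the
+    // nearfull ratio to 0.92 but leaves backfillfull at 0.90, the first
+    // check below fires, the health detail notes that backfillfull was
+    // "increased", and 0.92 is used for the remaining comparisons; the
+    // failsafe ratio is only reported here, never adjusted.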
+ if (br < nr) { + ostringstream ss; + ss << "backfillfull_ratio (" << br + << ") < nearfull_ratio (" << nr << "), increased"; + detail.push_back(ss.str()); + br = nr; + } + if (fr < br) { + ostringstream ss; + ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br + << "), increased"; + detail.push_back(ss.str()); + fr = br; + } + if (fsr < fr) { + ostringstream ss; + ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr + << "), increased"; + detail.push_back(ss.str()); + } + if (!detail.empty()) { + auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR, + "full ratio(s) out of order", 0); + d.detail.swap(detail); + } + } + + // OSD_FULL + // OSD_NEARFULL + // OSD_BACKFILLFULL + // OSD_FAILSAFE_FULL + { + set full, backfillfull, nearfull; + get_full_osd_counts(&full, &backfillfull, &nearfull); + if (full.size()) { + ostringstream ss; + ss << full.size() << " full osd(s)"; + auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size()); + for (auto& i: full) { + ostringstream ss; + ss << "osd." << i << " is full"; + d.detail.push_back(ss.str()); + } + } + if (backfillfull.size()) { + ostringstream ss; + ss << backfillfull.size() << " backfillfull osd(s)"; + auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(), + backfillfull.size()); + for (auto& i: backfillfull) { + ostringstream ss; + ss << "osd." << i << " is backfill full"; + d.detail.push_back(ss.str()); + } + } + if (nearfull.size()) { + ostringstream ss; + ss << nearfull.size() << " nearfull osd(s)"; + auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size()); + for (auto& i: nearfull) { + ostringstream ss; + ss << "osd." << i << " is near full"; + d.detail.push_back(ss.str()); + } + } + } + + // OSDMAP_FLAGS + { + // warn about flags + uint64_t warn_flags = + CEPH_OSDMAP_PAUSERD | + CEPH_OSDMAP_PAUSEWR | + CEPH_OSDMAP_PAUSEREC | + CEPH_OSDMAP_NOUP | + CEPH_OSDMAP_NODOWN | + CEPH_OSDMAP_NOIN | + CEPH_OSDMAP_NOOUT | + CEPH_OSDMAP_NOBACKFILL | + CEPH_OSDMAP_NORECOVER | + CEPH_OSDMAP_NOSCRUB | + CEPH_OSDMAP_NODEEP_SCRUB | + CEPH_OSDMAP_NOTIERAGENT | + CEPH_OSDMAP_NOSNAPTRIM | + CEPH_OSDMAP_NOREBALANCE; + if (test_flag(warn_flags)) { + ostringstream ss; + string s = get_flag_string(get_flags() & warn_flags); + ss << s << " flag(s) set"; + checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(), + s.size() /* kludgey but sufficient */); + } + } + + // OSD_FLAGS + { + list detail; + const unsigned flags = + CEPH_OSD_NOUP | + CEPH_OSD_NOIN | + CEPH_OSD_NODOWN | + CEPH_OSD_NOOUT; + for (int i = 0; i < max_osd; ++i) { + if (osd_state[i] & flags) { + ostringstream ss; + set states; + OSDMap::calc_state_set(osd_state[i] & flags, states); + ss << "osd." << i << " has flags " << states; + detail.push_back(ss.str()); + } + } + for (auto& i : crush_node_flags) { + if (i.second && crush->item_exists(i.first)) { + ostringstream ss; + set states; + OSDMap::calc_state_set(i.second, states); + int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first); + const char *tn = crush->get_type_name(t); + ss << (tn ? 
tn : "node") << " " + << crush->get_item_name(i.first) << " has flags " << states; + detail.push_back(ss.str()); + } + } + for (auto& i : device_class_flags) { + const char* class_name = crush->get_class_name(i.first); + if (i.second && class_name) { + ostringstream ss; + set states; + OSDMap::calc_state_set(i.second, states); + ss << "device class '" << class_name << "' has flags " << states; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set"; + auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size()); + d.detail.swap(detail); + } + } + + // OLD_CRUSH_TUNABLES + if (cct->_conf->mon_warn_on_legacy_crush_tunables) { + string min = crush->get_min_required_version(); + if (min < cct->_conf->mon_crush_min_required_version) { + ostringstream ss; + ss << "crush map has legacy tunables (require " << min + << ", min is " << cct->_conf->mon_crush_min_required_version << ")"; + auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0); + d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables"); + } + } + + // OLD_CRUSH_STRAW_CALC_VERSION + if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) { + if (crush->get_straw_calc_version() == 0) { + ostringstream ss; + ss << "crush map has straw_calc_version=0"; + auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0); + d.detail.push_back( + "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables"); + } + } + + // CACHE_POOL_NO_HIT_SET + if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) { + list detail; + for (auto p = pools.cbegin(); p != pools.cend(); ++p) { + const pg_pool_t& info = p->second; + if (info.cache_mode_requires_hit_set() && + info.hit_set_params.get_type() == HitSet::TYPE_NONE) { + ostringstream ss; + ss << "pool '" << get_pool_name(p->first) + << "' with cache_mode " << info.get_cache_mode_name() + << " needs hit_set_type to be set but it is not"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " cache pools are missing hit_sets"; + auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(), + detail.size()); + d.detail.swap(detail); + } + } + + // OSD_NO_SORTBITWISE + if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) { + ostringstream ss; + ss << "'sortbitwise' flag is not set"; + checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0); + } + + // OSD_UPGRADE_FINISHED + if (auto require_release = pending_require_osd_release()) { + ostringstream ss; + ss << "all OSDs are running " << *require_release << " or later but" + << " require_osd_release < " << *require_release; + auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0); + d.detail.push_back(ss.str()); + } + + // POOL_NEARFULL/BACKFILLFULL/FULL + { + list full_detail, backfillfull_detail, nearfull_detail; + for (auto it : get_pools()) { + const pg_pool_t &pool = it.second; + const string& pool_name = get_pool_name(it.first); + if (pool.has_flag(pg_pool_t::FLAG_FULL)) { + stringstream ss; + if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) { + // may run out of space too, + // but we want EQUOTA taking precedence + ss << "pool '" << pool_name << "' is full (running out of quota)"; + } else { + ss << "pool '" << pool_name << "' is full (no space)"; + } + full_detail.push_back(ss.str()); + } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) { + 
stringstream ss; + ss << "pool '" << pool_name << "' is backfillfull"; + backfillfull_detail.push_back(ss.str()); + } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) { + stringstream ss; + ss << "pool '" << pool_name << "' is nearfull"; + nearfull_detail.push_back(ss.str()); + } + } + if (!full_detail.empty()) { + ostringstream ss; + ss << full_detail.size() << " pool(s) full"; + auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size()); + d.detail.swap(full_detail); + } + if (!backfillfull_detail.empty()) { + ostringstream ss; + ss << backfillfull_detail.size() << " pool(s) backfillfull"; + auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(), + backfillfull_detail.size()); + d.detail.swap(backfillfull_detail); + } + if (!nearfull_detail.empty()) { + ostringstream ss; + ss << nearfull_detail.size() << " pool(s) nearfull"; + auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(), + nearfull_detail.size()); + d.detail.swap(nearfull_detail); + } + } + + // POOL_PG_NUM_NOT_POWER_OF_TWO + if (cct->_conf.get_val("mon_warn_on_pool_pg_num_not_power_of_two")) { + list detail; + for (auto it : get_pools()) { + if (!isp2(it.second.get_pg_num_target())) { + ostringstream ss; + ss << "pool '" << get_pool_name(it.first) + << "' pg_num " << it.second.get_pg_num_target() + << " is not a power of two"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) have non-power-of-two pg_num"; + auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN, + ss.str(), detail.size()); + d.detail.swap(detail); + } + } + + // POOL_NO_REDUNDANCY + if (cct->_conf.get_val("mon_warn_on_pool_no_redundancy")) + { + list detail; + for (auto it : get_pools()) { + if (it.second.get_size() == 1) { + ostringstream ss; + ss << "pool '" << get_pool_name(it.first) + << "' has no replicas configured"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) have no replicas configured"; + auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN, + ss.str(), detail.size()); + d.detail.swap(detail); + } + } + + // DEGRADED STRETCH MODE + if (cct->_conf.get_val("mon_warn_on_degraded_stretch_mode")) { + if (recovering_stretch_mode) { + stringstream ss; + ss << "We are recovering stretch mode buckets, only requiring " + << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ; + checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN, + ss.str(), 0); + } else if (degraded_stretch_mode) { + stringstream ss; + ss << "We are missing stretch mode buckets, only requiring " + << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ; + checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN, + ss.str(), 0); + } + } +} + +int OSDMap::parse_osd_id_list(const vector& ls, set *out, + ostream *ss) const +{ + out->clear(); + for (auto i = ls.begin(); i != ls.end(); ++i) { + if (i == ls.begin() && + (*i == "any" || *i == "all" || *i == "*")) { + get_all_osds(*out); + break; + } + long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss); + if (osd < 0) { + *ss << "invalid osd id '" << *i << "'"; + return -EINVAL; + } + out->insert(osd); + } + return 0; +} + +void OSDMap::get_random_up_osds_by_subtree(int n, // whoami + string &subtree, + int limit, // how many + set skip, + set *want) const { + if (limit <= 0) + return; + int subtree_type = crush->get_type_id(subtree); + if (subtree_type < 1) + return; + vector subtrees; + 
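+  // Selection sketch: visit the buckets of the requested type (e.g. one
+  // entry per "rack") in random order and take at most one up, non-skipped
+  // OSD from each, never from the subtree that already contains osd.n,
+  // until 'limit' picks have been made.  For example, subtree = "rack" and
+  // limit = 2 yields one up OSD from each of two other racks, assuming
+  // enough racks with up OSDs exist.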
crush->get_subtree_of_type(subtree_type, &subtrees); + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(subtrees.begin(), subtrees.end(), rng); + for (auto s : subtrees) { + if (limit <= 0) + break; + if (crush->subtree_contains(s, n)) + continue; + vector osds; + crush->get_children_of_type(s, 0, &osds); + if (osds.empty()) + continue; + vector up_osds; + for (auto o : osds) { + if (is_up(o) && !skip.count(o)) + up_osds.push_back(o); + } + if (up_osds.empty()) + continue; + auto it = up_osds.begin(); + std::advance(it, (n % up_osds.size())); + want->insert(*it); + --limit; + } +} + +float OSDMap::pool_raw_used_rate(int64_t poolid) const +{ + const pg_pool_t *pool = get_pg_pool(poolid); + assert(pool != nullptr); + + switch (pool->get_type()) { + case pg_pool_t::TYPE_REPLICATED: + return pool->get_size(); + case pg_pool_t::TYPE_ERASURE: + { + auto& ecp = + get_erasure_code_profile(pool->erasure_code_profile); + auto pm = ecp.find("m"); + auto pk = ecp.find("k"); + if (pm != ecp.end() && pk != ecp.end()) { + int k = atoi(pk->second.c_str()); + int m = atoi(pm->second.c_str()); + int mk = m + k; + ceph_assert(mk != 0); + ceph_assert(k != 0); + return (float)mk / k; + } else { + return 0.0; + } + } + break; + default: + ceph_abort_msg("unrecognized pool type"); + } +} + +unsigned OSDMap::get_osd_crush_node_flags(int osd) const +{ + unsigned flags = 0; + if (!crush_node_flags.empty()) { + // the map will contain type -> name + std::map ploc = crush->get_full_location(osd); + for (auto& i : ploc) { + int id = crush->get_item_id(i.second); + auto p = crush_node_flags.find(id); + if (p != crush_node_flags.end()) { + flags |= p->second; + } + } + } + return flags; +} + +unsigned OSDMap::get_crush_node_flags(int id) const +{ + unsigned flags = 0; + auto it = crush_node_flags.find(id); + if (it != crush_node_flags.end()) + flags = it->second; + return flags; +} + +unsigned OSDMap::get_device_class_flags(int id) const +{ + unsigned flags = 0; + auto it = device_class_flags.find(id); + if (it != device_class_flags.end()) + flags = it->second; + return flags; +} + +std::optional OSDMap::pending_require_osd_release() const +{ + if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) && + require_osd_release < ceph_release_t::pacific) { + return "pacific"; + } + if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) && + require_osd_release < ceph_release_t::octopus) { + return "octopus"; + } + if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) && + require_osd_release < ceph_release_t::nautilus) { + return "nautilus"; + } + + return std::nullopt; +} diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h new file mode 100644 index 000000000..83ab75e0d --- /dev/null +++ b/src/osd/OSDMap.h @@ -0,0 +1,1600 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_OSDMAP_H +#define CEPH_OSDMAP_H + +/* + * describe properties of the OSD cluster. 
+ * disks, disk groups, total # osds, + * + */ +#include +#include +#include +#include +#include + +#include +#include "include/btree_map.h" +#include "include/common_fwd.h" +#include "include/types.h" +#include "common/ceph_releases.h" +#include "osd_types.h" + +//#include "include/ceph_features.h" +#include "crush/CrushWrapper.h" + +// forward declaration +class CrushWrapper; +class health_check_map_t; + +/* + * we track up to two intervals during which the osd was alive and + * healthy. the most recent is [up_from,up_thru), where up_thru is + * the last epoch the osd is known to have _started_. i.e., a lower + * bound on the actual osd death. down_at (if it is > up_from) is an + * upper bound on the actual osd death. + * + * the second is the last_clean interval [begin,end). in that case, + * the last interval is the last epoch known to have been either + * _finished_, or during which the osd cleanly shut down. when + * possible, we push this forward to the epoch the osd was eventually + * marked down. + * + * the lost_at is used to allow build_prior to proceed without waiting + * for an osd to recover. In certain cases, progress may be blocked + * because an osd is down that may contain updates (i.e., a pg may have + * gone rw during an interval). If the osd can't be brought online, we + * can force things to proceed knowing that we _might_ be losing some + * acked writes. If the osd comes back to life later, that's fine to, + * but those writes will still be lost (the divergent objects will be + * thrown out). + */ +struct osd_info_t { + epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown + epoch_t last_clean_end; + epoch_t up_from; // epoch osd marked up + epoch_t up_thru; // lower bound on actual osd death (if > up_from) + epoch_t down_at; // upper bound on actual osd death (if > up_from) + epoch_t lost_at; // last epoch we decided data was "lost" + + osd_info_t() : last_clean_begin(0), last_clean_end(0), + up_from(0), up_thru(0), down_at(0), lost_at(0) {} + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(osd_info_t) + +std::ostream& operator<<(std::ostream& out, const osd_info_t& info); + +struct osd_xinfo_t { + utime_t down_stamp; ///< timestamp when we were last marked down + float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy + __u32 laggy_interval; ///< average interval between being marked laggy and recovering + uint64_t features; ///< features supported by this osd we should know about + __u32 old_weight; ///< weight prior to being auto marked out + utime_t last_purged_snaps_scrub; ///< last scrub of purged_snaps + epoch_t dead_epoch = 0; ///< last epoch we were confirmed dead (not just down) + + osd_xinfo_t() : laggy_probability(0), laggy_interval(0), + features(0), old_weight(0) {} + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(osd_xinfo_t) + +std::ostream& operator<<(std::ostream& out, const osd_xinfo_t& xi); + + +struct PGTempMap { +#if 1 + ceph::buffer::list data; + typedef btree::btree_map map_t; + map_t map; + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + uint32_t n = map.size(); + encode(n, bl); + for 
(auto &p : map) { + encode(p.first, bl); + bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32)); + } + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + data.clear(); + map.clear(); + uint32_t n; + decode(n, p); + if (!n) + return; + auto pstart = p; + size_t start_off = pstart.get_off(); + std::vector> offsets; + offsets.resize(n); + for (unsigned i=0; i 1) { + data.rebuild(); + } + //map.reserve(n); + char *start = data.c_str(); + for (auto i : offsets) { + map.insert(map.end(), std::make_pair(i.first, (ceph_le32*)(start + i.second))); + } + } + void rebuild() { + ceph::buffer::list bl; + encode(bl); + auto p = std::cbegin(bl); + decode(p); + } + friend bool operator==(const PGTempMap& l, const PGTempMap& r) { + return + l.map.size() == r.map.size() && + l.data.contents_equal(r.data); + } + + class iterator { + map_t::const_iterator it; + map_t::const_iterator end; + std::pair> current; + void init_current() { + if (it != end) { + current.first = it->first; + ceph_assert(it->second); + current.second.resize(*it->second); + ceph_le32 *p = it->second + 1; + for (uint32_t n = 0; n < *it->second; ++n, ++p) { + current.second[n] = *p; + } + } + } + public: + iterator(map_t::const_iterator p, + map_t::const_iterator e) + : it(p), end(e) { + init_current(); + } + + const std::pair>& operator*() const { + return current; + } + const std::pair>* operator->() const { + return ¤t; + } + friend bool operator==(const iterator& l, const iterator& r) { + return l.it == r.it; + } + friend bool operator!=(const iterator& l, const iterator& r) { + return l.it != r.it; + } + iterator& operator++() { + ++it; + if (it != end) + init_current(); + return *this; + } + iterator operator++(int) { + iterator r = *this; + ++it; + if (it != end) + init_current(); + return r; + } + }; + iterator begin() const { + return iterator(map.begin(), map.end()); + } + iterator end() const { + return iterator(map.end(), map.end()); + } + iterator find(pg_t pgid) const { + return iterator(map.find(pgid), map.end()); + } + size_t size() const { + return map.size(); + } + size_t count(pg_t pgid) const { + return map.count(pgid); + } + void erase(pg_t pgid) { + map.erase(pgid); + } + void clear() { + map.clear(); + data.clear(); + } + void set(pg_t pgid, const mempool::osdmap::vector& v) { + using ceph::encode; + size_t need = sizeof(ceph_le32) * (1 + v.size()); + if (need < data.get_append_buffer_unused_tail_length()) { + ceph::buffer::ptr z(data.get_append_buffer_unused_tail_length()); + z.zero(); + data.append(z.c_str(), z.length()); + } + encode(v, data); + map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size()); + } + mempool::osdmap::vector get(pg_t pgid) { + mempool::osdmap::vector v; + ceph_le32 *p = map[pgid]; + size_t n = *p++; + v.resize(n); + for (size_t i = 0; i < n; ++i, ++p) { + v[i] = *p; + } + return v; + } +#else + // trivial implementation + mempool::osdmap::map > pg_temp; + + void encode(ceph::buffer::list& bl) const { + encode(pg_temp, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + decode(pg_temp, p); + } + friend bool operator==(const PGTempMap& l, const PGTempMap& r) { + return + l.pg_temp.size() == r.pg_temp.size() && + l.pg_temp == r.pg_temp; + } + + class iterator { + mempool::osdmap::map >::const_iterator it; + public: + iterator(mempool::osdmap::map >::const_iterator p) + : it(p) {} + + std::pair&> operator*() const { + return *it; + } + const std::pair>* operator->() const { + return &*it; + } + friend bool operator==(const 
iterator& l, const iterator& r) { + return l.it == r.it; + } + friend bool operator!=(const iterator& l, const iterator& r) { + return l.it != r.it; + } + iterator& operator++() { + ++it; + return *this; + } + iterator operator++(int) { + iterator r = *this; + ++it; + return r; + } + }; + iterator begin() const { + return iterator(pg_temp.cbegin()); + } + iterator end() const { + return iterator(pg_temp.cend()); + } + iterator find(pg_t pgid) const { + return iterator(pg_temp.find(pgid)); + } + size_t size() const { + return pg_temp.size(); + } + size_t count(pg_t pgid) const { + return pg_temp.count(pgid); + } + void erase(pg_t pgid) { + pg_temp.erase(pgid); + } + void clear() { + pg_temp.clear(); + } + void set(pg_t pgid, const mempool::osdmap::vector& v) { + pg_temp[pgid] = v; + } + const mempool::osdmap::vector& get(pg_t pgid) { + return pg_temp.at(pgid); + } +#endif + void dump(ceph::Formatter *f) const { + for (const auto &pg : *this) { + f->open_object_section("osds"); + f->dump_stream("pgid") << pg.first; + f->open_array_section("osds"); + for (const auto osd : pg.second) + f->dump_int("osd", osd); + f->close_section(); + f->close_section(); + } + } +}; +WRITE_CLASS_ENCODER(PGTempMap) + +/** OSDMap + */ +class OSDMap { +public: + MEMPOOL_CLASS_HELPERS(); + + class Incremental { + public: + MEMPOOL_CLASS_HELPERS(); + + /// feature bits we were encoded with. the subsequent OSDMap + /// encoding should match. + uint64_t encode_features; + uuid_d fsid; + epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch + utime_t modified; + int64_t new_pool_max; //incremented by the OSDMonitor on each pool create + int32_t new_flags; + ceph_release_t new_require_osd_release{0xff}; + uint32_t new_stretch_bucket_count{0}; + uint32_t new_degraded_stretch_mode{0}; + uint32_t new_recovering_stretch_mode{0}; + int32_t new_stretch_mode_bucket{0}; + bool stretch_mode_enabled{false}; + bool change_stretch_mode{false}; + + // full (rare) + ceph::buffer::list fullmap; // in lieu of below. + ceph::buffer::list crush; + + // incremental + int32_t new_max_osd; + mempool::osdmap::map new_pools; + mempool::osdmap::map new_pool_names; + mempool::osdmap::set old_pools; + mempool::osdmap::map > new_erasure_code_profiles; + mempool::osdmap::vector old_erasure_code_profiles; + mempool::osdmap::map new_up_client; + mempool::osdmap::map new_up_cluster; + mempool::osdmap::map new_state; // XORed onto previous state. 
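+    // new_state records toggles, not absolute values: when the incremental
+    // is applied, each recorded mask is XORed onto the osd's current state
+    // bits.  For example (sketch): if osd.3 currently has CEPH_OSD_UP set,
+    // an entry new_state[3] = CEPH_OSD_UP flips that bit and the osd ends
+    // up marked down, while the same entry applied to a down osd would
+    // bring the up bit back.  The pending_osd_state_set() /
+    // pending_osd_state_clear() helpers further down only add or remove
+    // bits in this pending mask.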
+ mempool::osdmap::map new_weight; + mempool::osdmap::map > new_pg_temp; // [] to remove + mempool::osdmap::map new_primary_temp; // [-1] to remove + mempool::osdmap::map new_primary_affinity; + mempool::osdmap::map new_up_thru; + mempool::osdmap::map > new_last_clean_interval; + mempool::osdmap::map new_lost; + mempool::osdmap::map new_uuid; + mempool::osdmap::map new_xinfo; + + mempool::osdmap::map new_blocklist; + mempool::osdmap::vector old_blocklist; + mempool::osdmap::map new_range_blocklist; + mempool::osdmap::vector old_range_blocklist; + mempool::osdmap::map new_hb_back_up; + mempool::osdmap::map new_hb_front_up; + + mempool::osdmap::map> new_pg_upmap; + mempool::osdmap::map>> new_pg_upmap_items; + mempool::osdmap::set old_pg_upmap, old_pg_upmap_items; + mempool::osdmap::map new_removed_snaps; + mempool::osdmap::map new_purged_snaps; + + mempool::osdmap::map new_crush_node_flags; + mempool::osdmap::map new_device_class_flags; + + std::string cluster_snapshot; + + float new_nearfull_ratio = -1; + float new_backfillfull_ratio = -1; + float new_full_ratio = -1; + + ceph_release_t new_require_min_compat_client{0xff}; + + utime_t new_last_up_change, new_last_in_change; + + mutable bool have_crc; ///< crc values are defined + uint32_t full_crc; ///< crc of the resulting OSDMap + mutable uint32_t inc_crc; ///< crc of this incremental + + int get_net_marked_out(const OSDMap *previous) const; + int get_net_marked_down(const OSDMap *previous) const; + int identify_osd(uuid_d u) const; + + void encode_client_old(ceph::buffer::list& bl) const; + void encode_classic(ceph::buffer::list& bl, uint64_t features) const; + void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const; + void decode_classic(ceph::buffer::list::const_iterator &p); + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + explicit Incremental(epoch_t e=0) : + encode_features(0), + epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1), + have_crc(false), full_crc(0), inc_crc(0) { + } + explicit Incremental(ceph::buffer::list &bl) { + auto p = std::cbegin(bl); + decode(p); + } + explicit Incremental(ceph::buffer::list::const_iterator &p) { + decode(p); + } + + pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) { + if (new_pools.count(pool) == 0) + new_pools[pool] = *orig; + return &new_pools[pool]; + } + bool has_erasure_code_profile(const std::string &name) const { + auto i = new_erasure_code_profiles.find(name); + return i != new_erasure_code_profiles.end(); + } + void set_erasure_code_profile(const std::string &name, + const std::map& profile) { + new_erasure_code_profiles[name] = profile; + } + mempool::osdmap::map> get_erasure_code_profiles() const { + return new_erasure_code_profiles; + } + + /// propagate update pools' (snap and other) metadata to any of their tiers + int propagate_base_properties_to_tiers(CephContext *cct, const OSDMap &base); + + /// filter out osds with any pending state changing + size_t get_pending_state_osds(std::vector *osds) { + ceph_assert(osds); + osds->clear(); + + for (auto &p : new_state) { + osds->push_back(p.first); + } + + return osds->size(); + } + + bool pending_osd_has_state(int osd, unsigned state) { + return new_state.count(osd) && (new_state[osd] & state) != 0; + } + + bool pending_osd_state_set(int osd, unsigned state) { + if (pending_osd_has_state(osd, state)) + return false; + new_state[osd] |= state; + return true; + } + + // cancel the specified 
pending osd state if there is any + // return true on success, false otherwise. + bool pending_osd_state_clear(int osd, unsigned state) { + if (!pending_osd_has_state(osd, state)) { + // never has been set or already has been cancelled. + return false; + } + + new_state[osd] &= ~state; + if (!new_state[osd]) { + // all flags cleared + new_state.erase(osd); + } + return true; + } + + bool in_new_removed_snaps(int64_t pool, snapid_t snap) const { + auto p = new_removed_snaps.find(pool); + if (p == new_removed_snaps.end()) { + return false; + } + return p->second.contains(snap); + } + }; + +private: + uuid_d fsid; + epoch_t epoch; // what epoch of the osd cluster descriptor is this + utime_t created, modified; // epoch start time + int32_t pool_max; // the largest pool num, ever + + uint32_t flags; + + int num_osd; // not saved; see calc_num_osds + int num_up_osd; // not saved; see calc_num_osds + int num_in_osd; // not saved; see calc_num_osds + + int32_t max_osd; + std::vector osd_state; + + mempool::osdmap::map crush_node_flags; // crush node -> CEPH_OSD_* flags + mempool::osdmap::map device_class_flags; // device class -> CEPH_OSD_* flags + + utime_t last_up_change, last_in_change; + + // These features affect OSDMap[::Incremental] encoding, or the + // encoding of some type embedded therein (CrushWrapper, something + // from osd_types, etc.). + static constexpr uint64_t SIGNIFICANT_FEATURES = + CEPH_FEATUREMASK_PGID64 | + CEPH_FEATUREMASK_PGPOOL3 | + CEPH_FEATUREMASK_OSDENC | + CEPH_FEATUREMASK_OSDMAP_ENC | + CEPH_FEATUREMASK_OSD_POOLRESEND | + CEPH_FEATUREMASK_NEW_OSDOP_ENCODING | + CEPH_FEATUREMASK_MSG_ADDR2 | + CEPH_FEATUREMASK_CRUSH_TUNABLES5 | + CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS | + CEPH_FEATUREMASK_SERVER_LUMINOUS | + CEPH_FEATUREMASK_SERVER_MIMIC | + CEPH_FEATUREMASK_SERVER_NAUTILUS | + CEPH_FEATUREMASK_SERVER_OCTOPUS; + + struct addrs_s { + mempool::osdmap::vector > client_addrs; + mempool::osdmap::vector > cluster_addrs; + mempool::osdmap::vector > hb_back_addrs; + mempool::osdmap::vector > hb_front_addrs; + }; + std::shared_ptr osd_addrs; + + entity_addrvec_t _blank_addrvec; + + mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" + mempool::osdmap::vector osd_info; + std::shared_ptr pg_temp; // temp pg mapping (e.g. while we rebuild) + std::shared_ptr< mempool::osdmap::map > primary_temp; // temp primary mapping (e.g.
while we rebuild) + std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline + + // remap (post-CRUSH, pre-up) + mempool::osdmap::map> pg_upmap; ///< remap pg + mempool::osdmap::map>> pg_upmap_items; ///< remap osds in up set + + mempool::osdmap::map pools; + mempool::osdmap::map pool_name; + mempool::osdmap::map> erasure_code_profiles; + mempool::osdmap::map> name_pool; + + std::shared_ptr< mempool::osdmap::vector > osd_uuid; + mempool::osdmap::vector osd_xinfo; + + class range_bits { + struct ip6 { + uint64_t upper_64_bits, lower_64_bits; + uint64_t upper_mask, lower_mask; + }; + struct ip4 { + uint32_t ip_32_bits; + uint32_t mask; + }; + union { + ip6 ipv6; + ip4 ipv4; + } bits; + bool ipv6; + static void get_ipv6_bytes(unsigned const char *addr, + uint64_t *upper, uint64_t *lower); + public: + range_bits(); + range_bits(const entity_addr_t& addr); + void parse(const entity_addr_t& addr); + bool matches(const entity_addr_t& addr) const; + }; + mempool::osdmap::unordered_map blocklist; + mempool::osdmap::map range_blocklist; + mempool::osdmap::map calculated_ranges; + + /// queue of snaps to remove + mempool::osdmap::map removed_snaps_queue; + + /// removed_snaps additions this epoch + mempool::osdmap::map new_removed_snaps; + + /// removed_snaps removals this epoch + mempool::osdmap::map new_purged_snaps; + + epoch_t cluster_snapshot_epoch; + std::string cluster_snapshot; + bool new_blocklist_entries; + + float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0; + + /// min compat client we want to support + ceph_release_t require_min_compat_client{ceph_release_t::unknown}; + +public: + /// require osds to run at least this release + ceph_release_t require_osd_release{ceph_release_t::unknown}; + +private: + mutable uint64_t cached_up_osd_features; + + mutable bool crc_defined; + mutable uint32_t crc; + + void _calc_up_osd_features(); + + public: + bool have_crc() const { return crc_defined; } + uint32_t get_crc() const { return crc; } + + std::shared_ptr crush; // hierarchical map + bool stretch_mode_enabled; // we are in stretch mode, requiring multiple sites + uint32_t stretch_bucket_count; // number of sites we expect to be in + uint32_t degraded_stretch_mode; // 0 if not degraded; else count of up sites + uint32_t recovering_stretch_mode; // 0 if not recovering; else 1 + int32_t stretch_mode_bucket; // the bucket type we're stretched across +private: + uint32_t crush_version = 1; + + friend class OSDMonitor; + + public: + OSDMap() : epoch(0), + pool_max(0), + flags(0), + num_osd(0), num_up_osd(0), num_in_osd(0), + max_osd(0), + osd_addrs(std::make_shared()), + pg_temp(std::make_shared()), + primary_temp(std::make_shared>()), + osd_uuid(std::make_shared>()), + cluster_snapshot_epoch(0), + new_blocklist_entries(false), + cached_up_osd_features(0), + crc_defined(false), crc(0), + crush(std::make_shared()), + stretch_mode_enabled(false), stretch_bucket_count(0), + degraded_stretch_mode(0), recovering_stretch_mode(0), stretch_mode_bucket(0) { + } + +private: + OSDMap(const OSDMap& other) = default; + OSDMap& operator=(const OSDMap& other) = default; +public: + + /// return feature mask subset that is relevant to OSDMap encoding + static uint64_t get_significant_features(uint64_t features) { + return SIGNIFICANT_FEATURES & features; + } + + uint64_t get_encoding_features() const; + + void deepish_copy_from(const OSDMap& o) { + *this = o; + primary_temp.reset(new mempool::osdmap::map(*o.primary_temp)); + pg_temp.reset(new 
PGTempMap(*o.pg_temp)); + osd_uuid.reset(new mempool::osdmap::vector(*o.osd_uuid)); + + if (o.osd_primary_affinity) + osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity)); + + // NOTE: this still references shared entity_addrvec_t's. + osd_addrs.reset(new addrs_s(*o.osd_addrs)); + + // NOTE: we do not copy crush. note that apply_incremental will + // allocate a new CrushWrapper, though. + } + + // map info + const uuid_d& get_fsid() const { return fsid; } + void set_fsid(uuid_d& f) { fsid = f; } + + epoch_t get_epoch() const { return epoch; } + void inc_epoch() { epoch++; } + + void set_epoch(epoch_t e); + + uint32_t get_crush_version() const { + return crush_version; + } + + /* stamps etc */ + const utime_t& get_created() const { return created; } + const utime_t& get_modified() const { return modified; } + + bool is_blocklisted(const entity_addr_t& a, CephContext *cct=nullptr) const; + bool is_blocklisted(const entity_addrvec_t& a, CephContext *cct=nullptr) const; + void get_blocklist(std::list > *bl, + std::list > *rl) const; + void get_blocklist(std::set *bl, + std::set *rl) const; + + std::string get_cluster_snapshot() const { + if (cluster_snapshot_epoch == epoch) + return cluster_snapshot; + return std::string(); + } + + float get_full_ratio() const { + return full_ratio; + } + float get_backfillfull_ratio() const { + return backfillfull_ratio; + } + float get_nearfull_ratio() const { + return nearfull_ratio; + } + void get_full_pools(CephContext *cct, + std::set *full, + std::set *backfillfull, + std::set *nearfull) const; + void get_full_osd_counts(std::set *full, std::set *backfill, + std::set *nearfull) const; + + + /***** cluster state *****/ + /* osds */ + int get_max_osd() const { return max_osd; } + void set_max_osd(int m); + + unsigned get_num_osds() const { + return num_osd; + } + unsigned get_num_up_osds() const { + return num_up_osd; + } + unsigned get_num_in_osds() const { + return num_in_osd; + } + /// recalculate cached values for get_num{,_up,_in}_osds + int calc_num_osds(); + + void get_all_osds(std::set& ls) const; + void get_up_osds(std::set& ls) const; + void get_out_existing_osds(std::set& ls) const; + unsigned get_num_pg_temp() const { + return pg_temp->size(); + } + + int get_flags() const { return flags; } + bool test_flag(int f) const { return flags & f; } + void set_flag(int f) { flags |= f; } + void clear_flag(int f) { flags &= ~f; } + + void get_flag_set(std::set *flagset) const; + + static void calc_state_set(int state, std::set& st); + + int get_state(int o) const { + ceph_assert(o < max_osd); + return osd_state[o]; + } + int get_state(int o, std::set& st) const { + ceph_assert(o < max_osd); + unsigned t = osd_state[o]; + calc_state_set(t, st); + return osd_state[o]; + } + void set_state(int o, unsigned s) { + ceph_assert(o < max_osd); + osd_state[o] = s; + } + void set_weight(int o, unsigned w) { + ceph_assert(o < max_osd); + osd_weight[o] = w; + if (w) + osd_state[o] |= CEPH_OSD_EXISTS; + } + unsigned get_weight(int o) const { + ceph_assert(o < max_osd); + return osd_weight[o]; + } + float get_weightf(int o) const { + return (float)get_weight(o) / (float)CEPH_OSD_IN; + } + void adjust_osd_weights(const std::map& weights, Incremental& inc) const; + + void set_primary_affinity(int o, int w) { + ceph_assert(o < max_osd); + if (!osd_primary_affinity) + osd_primary_affinity.reset( + new mempool::osdmap::vector<__u32>( + max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)); + (*osd_primary_affinity)[o] = w; + } + unsigned 
get_primary_affinity(int o) const { + ceph_assert(o < max_osd); + if (!osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + return (*osd_primary_affinity)[o]; + } + float get_primary_affinityf(int o) const { + return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY; + } + + bool has_erasure_code_profile(const std::string &name) const { + auto i = erasure_code_profiles.find(name); + return i != erasure_code_profiles.end(); + } + int get_erasure_code_profile_default(CephContext *cct, + std::map &profile_map, + std::ostream *ss); + void set_erasure_code_profile(const std::string &name, + const std::map& profile) { + erasure_code_profiles[name] = profile; + } + const std::map &get_erasure_code_profile( + const std::string &name) const { + static std::map empty; + auto i = erasure_code_profiles.find(name); + if (i == erasure_code_profiles.end()) + return empty; + else + return i->second; + } + const mempool::osdmap::map> &get_erasure_code_profiles() const { + return erasure_code_profiles; + } + + bool exists(int osd) const { + //assert(osd >= 0); + return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS); + } + + bool is_destroyed(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED); + } + + bool is_up(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_UP); + } + + bool has_been_up_since(int osd, epoch_t epoch) const { + return is_up(osd) && get_up_from(osd) <= epoch; + } + + bool is_down(int osd) const { + return !is_up(osd); + } + + bool is_stop(int osd) const { + return exists(osd) && is_down(osd) && + (osd_state[osd] & CEPH_OSD_STOP); + } + + bool is_out(int osd) const { + return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT; + } + + bool is_in(int osd) const { + return !is_out(osd); + } + + bool is_dead(int osd) const { + if (!exists(osd)) { + return false; // unclear if they know they are removed from map + } + return get_xinfo(osd).dead_epoch > get_info(osd).up_from; + } + + unsigned get_osd_crush_node_flags(int osd) const; + unsigned get_crush_node_flags(int id) const; + unsigned get_device_class_flags(int id) const; + + bool is_noup_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP); + } + + bool is_nodown_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN); + } + + bool is_noin_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN); + } + + bool is_noout_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT); + } + + bool is_noup(int osd) const { + if (test_flag(CEPH_OSDMAP_NOUP)) // global? + return true; + if (is_noup_by_osd(osd)) // by osd? + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node? + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class? 
+ return true; + return false; + } + + bool is_nodown(int osd) const { + if (test_flag(CEPH_OSDMAP_NODOWN)) + return true; + if (is_nodown_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NODOWN) + return true; + return false; + } + + bool is_noin(int osd) const { + if (test_flag(CEPH_OSDMAP_NOIN)) + return true; + if (is_noin_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOIN) + return true; + return false; + } + + bool is_noout(int osd) const { + if (test_flag(CEPH_OSDMAP_NOOUT)) + return true; + if (is_noout_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOOUT) + return true; + return false; + } + + /** + * check if an entire crush subtree is down + */ + bool subtree_is_down(int id, std::set *down_cache) const; + bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, std::set *down_cache) const; + + bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, std::set *down_in_osds, std::set *up_in_osds, + std::set *subtree_up, std::unordered_map > *subtree_type_down) const; + + int identify_osd(const entity_addr_t& addr) const; + int identify_osd(const uuid_d& u) const; + int identify_osd_on_all_channels(const entity_addr_t& addr) const; + + bool have_addr(const entity_addr_t& addr) const { + return identify_osd(addr) >= 0; + } + int find_osd_on_ip(const entity_addr_t& ip) const; + + const entity_addrvec_t& get_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->client_addrs[osd] ? + *osd_addrs->client_addrs[osd] : _blank_addrvec; + } + const entity_addrvec_t& get_most_recent_addrs(int osd) const { + return get_addrs(osd); + } + const entity_addrvec_t &get_cluster_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->cluster_addrs[osd] ? + *osd_addrs->cluster_addrs[osd] : _blank_addrvec; + } + const entity_addrvec_t &get_hb_back_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->hb_back_addrs[osd] ? + *osd_addrs->hb_back_addrs[osd] : _blank_addrvec; + } + const entity_addrvec_t &get_hb_front_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->hb_front_addrs[osd] ? 
+ *osd_addrs->hb_front_addrs[osd] : _blank_addrvec; + } + + const uuid_d& get_uuid(int osd) const { + ceph_assert(exists(osd)); + return (*osd_uuid)[osd]; + } + + const epoch_t& get_up_from(int osd) const { + ceph_assert(exists(osd)); + return osd_info[osd].up_from; + } + const epoch_t& get_up_thru(int osd) const { + ceph_assert(exists(osd)); + return osd_info[osd].up_thru; + } + const epoch_t& get_down_at(int osd) const { + ceph_assert(exists(osd)); + return osd_info[osd].down_at; + } + const osd_info_t& get_info(int osd) const { + ceph_assert(osd < max_osd); + return osd_info[osd]; + } + + const osd_xinfo_t& get_xinfo(int osd) const { + ceph_assert(osd < max_osd); + return osd_xinfo[osd]; + } + + int get_next_up_osd_after(int n) const { + if (get_max_osd() == 0) + return -1; + for (int i = n + 1; i != n; ++i) { + if (i >= get_max_osd()) + i = 0; + if (i == n) + break; + if (is_up(i)) + return i; + } + return -1; + } + + int get_previous_up_osd_before(int n) const { + if (get_max_osd() == 0) + return -1; + for (int i = n - 1; i != n; --i) { + if (i < 0) + i = get_max_osd() - 1; + if (i == n) + break; + if (is_up(i)) + return i; + } + return -1; + } + + + void get_random_up_osds_by_subtree(int n, // whoami + std::string &subtree, + int limit, // how many + std::set skip, + std::set *want) const; + + /** + * get feature bits required by the current structure + * + * @param entity_type [in] what entity type we are asking about + * @param mask [out] std::set of all possible map-related features we could std::set + * @return feature bits used by this map + */ + uint64_t get_features(int entity_type, uint64_t *mask) const; + + /** + * get oldest *client* version (firefly, hammer, etc.) that can connect given + * the feature bits required (according to get_features()). + */ + ceph_release_t get_min_compat_client() const; + + /** + * gets the required minimum *client* version that can connect to the cluster. 
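A minimal sketch of how the two values relate, assuming an existing OSDMap instance; this is illustrative only and not part of the patched header:

    // get_min_compat_client() derives the oldest client release that can still
    // decode the map's current features; a configured requirement should not
    // be allowed to drop below that floor.
    bool can_set_require_min_compat(const OSDMap& osdmap, ceph_release_t target) {
      return target >= osdmap.get_min_compat_client();
    }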
+ */ + ceph_release_t get_require_min_compat_client() const; + + /** + * get intersection of features supported by up osds + */ + uint64_t get_up_osd_features() const; + + void get_upmap_pgs(std::vector *upmap_pgs) const; + bool check_pg_upmaps( + CephContext *cct, + const std::vector& to_check, + std::vector *to_cancel, + std::map>> *to_remap) const; + void clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc, + const std::vector& to_cancel, + const std::map>>& to_remap) const; + bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const; + + int apply_incremental(const Incremental &inc); + + /// try to re-use/reference addrs in oldmap from newmap + static void dedup(const OSDMap *oldmap, OSDMap *newmap); + + static void clean_temps(CephContext *cct, + const OSDMap& oldmap, + const OSDMap& nextmap, + Incremental *pending_inc); + + // serialize, unserialize +private: + void encode_client_old(ceph::buffer::list& bl) const; + void encode_classic(ceph::buffer::list& bl, uint64_t features) const; + void decode_classic(ceph::buffer::list::const_iterator& p); + void post_decode(); +public: + void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const; + void decode(ceph::buffer::list& bl); + void decode(ceph::buffer::list::const_iterator& bl); + + + /**** mapping facilities ****/ + int map_to_pg( + int64_t pool, + const std::string& name, + const std::string& key, + const std::string& nspace, + pg_t *pg) const; + int object_locator_to_pg(const object_t& oid, const object_locator_t& loc, + pg_t &pg) const; + pg_t object_locator_to_pg(const object_t& oid, + const object_locator_t& loc) const { + pg_t pg; + int ret = object_locator_to_pg(oid, loc, pg); + ceph_assert(ret == 0); + return pg; + } + + + static object_locator_t file_to_object_locator(const file_layout_t& layout) { + return object_locator_t(layout.pool_id, layout.pool_ns); + } + + ceph_object_layout file_to_object_layout(object_t oid, + file_layout_t& layout) const { + return make_object_layout(oid, layout.pool_id, layout.pool_ns); + } + + ceph_object_layout make_object_layout(object_t oid, int pg_pool, + std::string nspace) const; + + int get_pg_num(int pg_pool) const + { + const pg_pool_t *pool = get_pg_pool(pg_pool); + ceph_assert(NULL != pool); + return pool->get_pg_num(); + } + + bool pg_exists(pg_t pgid) const { + const pg_pool_t *p = get_pg_pool(pgid.pool()); + return p && pgid.ps() < p->get_pg_num(); + } + + int get_pg_pool_min_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_min_size(); + } + + int get_pg_pool_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_size(); + } + + int get_pg_pool_crush_rule(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_crush_rule(); + } + +private: + /// pg -> (raw osd std::list) + void _pg_to_raw_osds( + const pg_pool_t& pool, pg_t pg, + std::vector *osds, + ps_t *ppps) const; + int _pick_primary(const std::vector& osds) const; + void _remove_nonexistent_osds(const pg_pool_t& pool, std::vector& osds) const; + + void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool, + std::vector *osds, int *primary) const; + + /// apply pg_upmap[_items] mappings + void _apply_upmap(const pg_pool_t& pi, pg_t pg, std::vector *raw) const; + + /// pg -> (up osd 
std::list) + void _raw_to_up_osds(const pg_pool_t& pool, const std::vector& raw, + std::vector *up) const; + + + /** + * Get the pg and primary temp, if they are specified. + * @param temp_pg [out] Will be empty or contain the temp PG mapping on return + * @param temp_primary [out] Will be the value in primary_temp, or a value derived + * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary. + */ + void _get_temp_osds(const pg_pool_t& pool, pg_t pg, + std::vector *temp_pg, int *temp_primary) const; + + /** + * map to up and acting. Fills in whatever fields are non-NULL. + */ + void _pg_to_up_acting_osds(const pg_t& pg, std::vector *up, int *up_primary, + std::vector *acting, int *acting_primary, + bool raw_pg_to_pg = true) const; + +public: + /*** + * This is suitable only for looking at raw CRUSH outputs. It skips + * applying the temp and up checks and should not be used + * by anybody for data mapping purposes. + * raw and primary must be non-NULL + */ + void pg_to_raw_osds(pg_t pg, std::vector *raw, int *primary) const; + void pg_to_raw_upmap(pg_t pg, std::vector *raw, + std::vector *raw_upmap) const; + /// map a pg to its acting set. @return acting set size + void pg_to_acting_osds(const pg_t& pg, std::vector *acting, + int *acting_primary) const { + _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary); + } + void pg_to_acting_osds(pg_t pg, std::vector& acting) const { + return pg_to_acting_osds(pg, &acting, NULL); + } + /** + * This does not apply temp overrides and should not be used + * by anybody for data mapping purposes. Specify both pointers. + */ + void pg_to_raw_up(pg_t pg, std::vector *up, int *primary) const; + /** + * map a pg to its acting set as well as its up set. You must use + * the acting set for data mapping purposes, but some users will + * also find the up set useful for things like deciding what to + * set as pg_temp. + * Each of these pointers must be non-NULL. 
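A minimal usage sketch for this mapping call, assuming an existing OSDMap `osdmap` and a pg_t `pgid`; illustrative only, not part of the patched file:

    std::vector<int> up, acting;
    int up_primary = -1, acting_primary = -1;
    osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &acting_primary);
    // Data mapping must follow `acting`/`acting_primary`; `up` is advisory,
    // e.g. for deciding whether a pg_temp override is still needed.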
+ */ + void pg_to_up_acting_osds(pg_t pg, std::vector *up, int *up_primary, + std::vector *acting, int *acting_primary) const { + _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary); + } + void pg_to_up_acting_osds(pg_t pg, std::vector& up, std::vector& acting) const { + int up_primary, acting_primary; + pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary); + } + bool pg_is_ec(pg_t pg) const { + auto i = pools.find(pg.pool()); + ceph_assert(i != pools.end()); + return i->second.is_erasure(); + } + bool get_primary_shard(const pg_t& pgid, spg_t *out) const { + auto i = get_pools().find(pgid.pool()); + if (i == get_pools().end()) { + return false; + } + if (!i->second.is_erasure()) { + *out = spg_t(pgid); + return true; + } + int primary; + std::vector acting; + pg_to_acting_osds(pgid, &acting, &primary); + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == primary) { + *out = spg_t(pgid, shard_id_t(i)); + return true; + } + } + return false; + } + bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const { + auto i = get_pools().find(pgid.pool()); + if (i == get_pools().end()) { + return false; + } + std::vector acting; + pg_to_acting_osds(pgid, &acting, primary); + if (i->second.is_erasure()) { + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == *primary) { + *out = spg_t(pgid, shard_id_t(i)); + return true; + } + } + } else { + *out = spg_t(pgid); + return true; + } + return false; + } + + bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const { + auto p = removed_snaps_queue.find(pool); + if (p == removed_snaps_queue.end()) { + return false; + } + return p->second.contains(snap); + } + + const mempool::osdmap::map& + get_removed_snaps_queue() const { + return removed_snaps_queue; + } + const mempool::osdmap::map& + get_new_removed_snaps() const { + return new_removed_snaps; + } + const mempool::osdmap::map& + get_new_purged_snaps() const { + return new_purged_snaps; + } + + int64_t lookup_pg_pool_name(std::string_view name) const { + auto p = name_pool.find(name); + if (p == name_pool.end()) + return -ENOENT; + return p->second; + } + + int64_t get_pool_max() const { + return pool_max; + } + const mempool::osdmap::map& get_pools() const { + return pools; + } + mempool::osdmap::map& get_pools() { + return pools; + } + void get_pool_ids_by_rule(int rule_id, std::set *pool_ids) const { + ceph_assert(pool_ids); + for (auto &p: pools) { + if (p.second.get_crush_rule() == rule_id) { + pool_ids->insert(p.first); + } + } + } + void get_pool_ids_by_osd(CephContext *cct, + int osd, + std::set *pool_ids) const; + const std::string& get_pool_name(int64_t p) const { + auto i = pool_name.find(p); + ceph_assert(i != pool_name.end()); + return i->second; + } + const mempool::osdmap::map& get_pool_names() const { + return pool_name; + } + bool have_pg_pool(int64_t p) const { + return pools.count(p); + } + const pg_pool_t* get_pg_pool(int64_t p) const { + auto i = pools.find(p); + if (i != pools.end()) + return &i->second; + return NULL; + } + unsigned get_pg_size(pg_t pg) const { + auto p = pools.find(pg.pool()); + ceph_assert(p != pools.end()); + return p->second.get_size(); + } + int get_pg_type(pg_t pg) const { + auto p = pools.find(pg.pool()); + ceph_assert(p != pools.end()); + return p->second.get_type(); + } + int get_pool_crush_rule(int64_t pool_id) const { + auto pool = get_pg_pool(pool_id); + if (!pool) + return -ENOENT; + return pool->get_crush_rule(); + } + + + pg_t raw_pg_to_pg(pg_t pg) const { + auto p = 
pools.find(pg.pool()); + ceph_assert(p != pools.end()); + return p->second.raw_pg_to_pg(pg); + } + + // pg -> acting primary osd + int get_pg_acting_primary(pg_t pg) const { + int primary = -1; + _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary); + return primary; + } + + /* + * check whether an spg_t maps to a particular osd + */ + bool is_up_acting_osd_shard(spg_t pg, int osd) const { + std::vector up, acting; + _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false); + if (calc_pg_role(pg_shard_t(osd, pg.shard), acting) >= 0 || + calc_pg_role(pg_shard_t(osd, pg.shard), up) >= 0) { + return true; + } + return false; + } + + + static int calc_pg_role_broken(int osd, const std::vector& acting, int nrep=0); + static int calc_pg_role(pg_shard_t who, const std::vector& acting); + static bool primary_changed_broken( + int oldprimary, + const std::vector &oldacting, + int newprimary, + const std::vector &newacting); + + /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ + int get_pg_acting_role(spg_t pg, int osd) const { + std::vector group; + pg_to_acting_osds(pg.pgid, group); + return calc_pg_role(pg_shard_t(osd, pg.shard), group); + } + + bool try_pg_upmap( + CephContext *cct, + pg_t pg, ///< pg to potentially remap + const std::set& overfull, ///< osds we'd want to evacuate + const std::vector& underfull, ///< osds to move to, in order of preference + const std::vector& more_underfull, ///< less full osds to move to, in order of preference + std::vector *orig, + std::vector *out); ///< resulting alternative mapping + + int calc_pg_upmaps( + CephContext *cct, + uint32_t max_deviation, ///< max deviation from target (value >= 1) + int max_iterations, ///< max iterations to run + const std::set& pools, ///< [optional] restrict to pool + Incremental *pending_inc + ); + + int get_osds_by_bucket_name(const std::string &name, std::set *osds) const; + + bool have_pg_upmaps(pg_t pg) const { + return pg_upmap.count(pg) || + pg_upmap_items.count(pg); + } + + bool check_full(const std::set &missing_on) const { + for (auto shard : missing_on) { + if (get_state(shard.osd) & CEPH_OSD_FULL) + return true; + } + return false; + } + + /* + * handy helpers to build simple maps... + */ + /** + * Build an OSD map suitable for basic usage. If **num_osd** is >= 0 + * it will be initialized with the specified number of OSDs in a + * single host. If **num_osd** is < 0 the layout of the OSD map will + * be built by reading the content of the configuration file. + * + * @param cct [in] in core ceph context + * @param e [in] initial epoch + * @param fsid [in] id of the cluster + * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0 + * @return **0** on success, negative errno on error. 
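A minimal sketch of driving this builder from a test or tool, assuming a valid CephContext* named `cct`; illustrative only:

    OSDMap osdmap;
    uuid_d fsid;
    fsid.generate_random();
    // Three OSDs on a single host; pass a negative count to read the layout
    // from the configuration file instead, as described above.
    int r = osdmap.build_simple(cct, 1 /* initial epoch */, fsid, 3);
    // r == 0 on success, negative errno on error.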
+ */ +private: + int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits, + bool default_pool); +public: + int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd) { + return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false); + } + int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits) { + return build_simple_optioned(cct, e, fsid, num_osd, + pg_bits, pgp_bits, true); + } + static int _build_crush_types(CrushWrapper& crush); + static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush, + int num_osd, std::ostream *ss); + static int build_simple_crush_map_from_conf(CephContext *cct, + CrushWrapper& crush, + std::ostream *ss); + static int build_simple_crush_rules( + CephContext *cct, CrushWrapper& crush, + const std::string& root, + std::ostream *ss); + + bool crush_rule_in_use(int rule_id) const; + + int validate_crush_rules(CrushWrapper *crush, std::ostream *ss) const; + + void clear_temp() { + pg_temp->clear(); + primary_temp->clear(); + } + +private: + void print_osd_line(int cur, std::ostream *out, ceph::Formatter *f) const; +public: + void print(std::ostream& out) const; + void print_osd(int id, std::ostream& out) const; + void print_osds(std::ostream& out) const; + void print_pools(std::ostream& out) const; + void print_summary(ceph::Formatter *f, std::ostream& out, + const std::string& prefix, bool extra=false) const; + void print_oneline_summary(std::ostream& out) const; + + enum { + DUMP_IN = 1, // only 'in' osds + DUMP_OUT = 2, // only 'out' osds + DUMP_UP = 4, // only 'up' osds + DUMP_DOWN = 8, // only 'down' osds + DUMP_DESTROYED = 16, // only 'destroyed' osds + }; + void print_tree(ceph::Formatter *f, std::ostream *out, + unsigned dump_flags=0, std::string bucket="") const; + + int summarize_mapping_stats( + OSDMap *newmap, + const std::set *pools, + std::string *out, + ceph::Formatter *f) const; + + std::string get_flag_string() const; + static std::string get_flag_string(unsigned flags); + static void dump_erasure_code_profiles( + const mempool::osdmap::map > &profiles, + ceph::Formatter *f); + void dump(ceph::Formatter *f) const; + void dump_osd(int id, ceph::Formatter *f) const; + void dump_osds(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + bool check_new_blocklist_entries() const { return new_blocklist_entries; } + + void check_health(CephContext *cct, health_check_map_t *checks) const; + + int parse_osd_id_list(const std::vector& ls, + std::set *out, + std::ostream *ss) const; + + float pool_raw_used_rate(int64_t poolid) const; + std::optional pending_require_osd_release() const; + +}; +WRITE_CLASS_ENCODER_FEATURES(OSDMap) +WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental) + +#ifdef WITH_SEASTAR +using OSDMapRef = boost::local_shared_ptr; +#else +using OSDMapRef = std::shared_ptr; +#endif + + +inline std::ostream& operator<<(std::ostream& out, const OSDMap& m) { + m.print_oneline_summary(out); + return out; +} + +class PGMap; + +void print_osd_utilization(const OSDMap& osdmap, + const PGMap& pgmap, + std::ostream& out, + ceph::Formatter *f, + bool tree, + const std::string& filter); + +#endif diff --git a/src/osd/OSDMapMapping.cc b/src/osd/OSDMapMapping.cc new file mode 100644 index 000000000..9cd1fbf58 --- /dev/null +++ b/src/osd/OSDMapMapping.cc @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"OSDMapMapping.h" +#include "OSDMap.h" + +#define dout_subsys ceph_subsys_mon + +#include "common/debug.h" + +using std::vector; + +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMapMapping, osdmapmapping, + osdmap_mapping); + +// ensure that we have a PoolMappings for each pool and that +// the dimensions (pg_num and size) match up. +void OSDMapMapping::_init_mappings(const OSDMap& osdmap) +{ + num_pgs = 0; + auto q = pools.begin(); + for (auto& p : osdmap.get_pools()) { + num_pgs += p.second.get_pg_num(); + // drop unneeded pools + while (q != pools.end() && q->first < p.first) { + q = pools.erase(q); + } + if (q != pools.end() && q->first == p.first) { + if (q->second.pg_num != p.second.get_pg_num() || + q->second.size != p.second.get_size()) { + // pg_num changed + q = pools.erase(q); + } else { + // keep it + ++q; + continue; + } + } + pools.emplace(p.first, PoolMapping(p.second.get_size(), + p.second.get_pg_num(), + p.second.is_erasure())); + } + pools.erase(q, pools.end()); + ceph_assert(pools.size() == osdmap.get_pools().size()); +} + +void OSDMapMapping::update(const OSDMap& osdmap) +{ + _start(osdmap); + for (auto& p : osdmap.get_pools()) { + _update_range(osdmap, p.first, 0, p.second.get_pg_num()); + } + _finish(osdmap); + //_dump(); // for debugging +} + +void OSDMapMapping::update(const OSDMap& osdmap, pg_t pgid) +{ + _update_range(osdmap, pgid.pool(), pgid.ps(), pgid.ps() + 1); +} + +void OSDMapMapping::_build_rmap(const OSDMap& osdmap) +{ + acting_rmap.resize(osdmap.get_max_osd()); + //up_rmap.resize(osdmap.get_max_osd()); + for (auto& v : acting_rmap) { + v.resize(0); + } + //for (auto& v : up_rmap) { + // v.resize(0); + //} + for (auto& p : pools) { + pg_t pgid(0, p.first); + for (unsigned ps = 0; ps < p.second.pg_num; ++ps) { + pgid.set_ps(ps); + int32_t *row = &p.second.table[p.second.row_size() * ps]; + for (int i = 0; i < row[2]; ++i) { + if (row[4 + i] != CRUSH_ITEM_NONE) { + acting_rmap[row[4 + i]].push_back(pgid); + } + } + //for (int i = 0; i < row[3]; ++i) { + //up_rmap[row[4 + p.second.size + i]].push_back(pgid); + //} + } + } +} + +void OSDMapMapping::_finish(const OSDMap& osdmap) +{ + _build_rmap(osdmap); + epoch = osdmap.get_epoch(); +} + +void OSDMapMapping::_dump() +{ + for (auto& p : pools) { + std::cout << "pool " << p.first << std::endl; + for (unsigned i = 0; i < p.second.table.size(); ++i) { + std::cout << " " << p.second.table[i]; + if (i % p.second.row_size() == p.second.row_size() - 1) + std::cout << std::endl; + } + } +} + +void OSDMapMapping::_update_range( + const OSDMap& osdmap, + int64_t pool, + unsigned pg_begin, + unsigned pg_end) +{ + auto i = pools.find(pool); + ceph_assert(i != pools.end()); + ceph_assert(pg_begin <= pg_end); + ceph_assert(pg_end <= i->second.pg_num); + for (unsigned ps = pg_begin; ps < pg_end; ++ps) { + std::vector up, acting; + int up_primary, acting_primary; + osdmap.pg_to_up_acting_osds( + pg_t(ps, pool), + &up, &up_primary, &acting, &acting_primary); + i->second.set(ps, std::move(up), up_primary, + std::move(acting), acting_primary); + } +} + +// --------------------------- + +void ParallelPGMapper::Job::finish_one() +{ + Context *fin = nullptr; + { + std::lock_guard l(lock); + if (--shards == 0) { + if (!aborted) { + finish = ceph_clock_now(); + complete(); + } + cond.notify_all(); + fin = onfinish; + onfinish = nullptr; + } + } + if (fin) { + fin->complete(0); + } +} + +void ParallelPGMapper::WQ::_process(Item *i, ThreadPool::TPHandle &h) +{ + ldout(m->cct, 20) << __func__ << " " << i->job << " pool " << i->pool + << " [" << 
i->begin << "," << i->end << ")" + << " pgs " << i->pgs + << dendl; + if (!i->pgs.empty()) + i->job->process(i->pgs); + else + i->job->process(i->pool, i->begin, i->end); + i->job->finish_one(); + delete i; +} + +void ParallelPGMapper::queue( + Job *job, + unsigned pgs_per_item, + const vector& input_pgs) +{ + bool any = false; + if (!input_pgs.empty()) { + unsigned i = 0; + vector item_pgs; + item_pgs.reserve(pgs_per_item); + for (auto& pg : input_pgs) { + if (i < pgs_per_item) { + ++i; + item_pgs.push_back(pg); + } + if (i >= pgs_per_item) { + job->start_one(); + wq.queue(new Item(job, item_pgs)); + i = 0; + item_pgs.clear(); + any = true; + } + } + if (!item_pgs.empty()) { + job->start_one(); + wq.queue(new Item(job, item_pgs)); + any = true; + } + ceph_assert(any); + return; + } + // no input pgs, load all from map + for (auto& p : job->osdmap->get_pools()) { + for (unsigned ps = 0; ps < p.second.get_pg_num(); ps += pgs_per_item) { + unsigned ps_end = std::min(ps + pgs_per_item, p.second.get_pg_num()); + job->start_one(); + wq.queue(new Item(job, p.first, ps, ps_end)); + ldout(cct, 20) << __func__ << " " << job << " " << p.first << " [" << ps + << "," << ps_end << ")" << dendl; + any = true; + } + } + ceph_assert(any); +} diff --git a/src/osd/OSDMapMapping.h b/src/osd/OSDMapMapping.h new file mode 100644 index 000000000..3274d02e4 --- /dev/null +++ b/src/osd/OSDMapMapping.h @@ -0,0 +1,352 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#ifndef CEPH_OSDMAPMAPPING_H +#define CEPH_OSDMAPMAPPING_H + +#include +#include + +#include "osd/osd_types.h" +#include "common/WorkQueue.h" +#include "common/Cond.h" + +class OSDMap; + +/// work queue to perform work on batches of pgids on multiple CPUs +class ParallelPGMapper { +public: + struct Job { + utime_t start, finish; + unsigned shards = 0; + const OSDMap *osdmap; + bool aborted = false; + Context *onfinish = nullptr; + + ceph::mutex lock = ceph::make_mutex("ParallelPGMapper::Job::lock"); + ceph::condition_variable cond; + + Job(const OSDMap *om) : start(ceph_clock_now()), osdmap(om) {} + virtual ~Job() { + ceph_assert(shards == 0); + } + + // child must implement either form of process + virtual void process(const std::vector& pgs) = 0; + virtual void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) = 0; + virtual void complete() = 0; + + void set_finish_event(Context *fin) { + lock.lock(); + if (shards == 0) { + // already done. 
+ lock.unlock(); + fin->complete(0); + } else { + // set finisher + onfinish = fin; + lock.unlock(); + } + } + bool is_done() { + std::lock_guard l(lock); + return shards == 0; + } + utime_t get_duration() { + return finish - start; + } + void wait() { + std::unique_lock l(lock); + cond.wait(l, [this] { return shards == 0; }); + } + bool wait_for(double duration) { + utime_t until = start; + until += duration; + std::unique_lock l(lock); + while (shards > 0) { + if (ceph_clock_now() >= until) { + return false; + } + cond.wait(l); + } + return true; + } + void abort() { + Context *fin = nullptr; + { + std::unique_lock l(lock); + aborted = true; + fin = onfinish; + onfinish = nullptr; + cond.wait(l, [this] { return shards == 0; }); + } + if (fin) { + fin->complete(-ECANCELED); + } + } + + void start_one() { + std::lock_guard l(lock); + ++shards; + } + void finish_one(); + }; + +protected: + CephContext *cct; + + struct Item { + Job *job; + int64_t pool; + unsigned begin, end; + std::vector pgs; + + Item(Job *j, std::vector pgs) : job(j), pgs(pgs) {} + Item(Job *j, int64_t p, unsigned b, unsigned e) + : job(j), + pool(p), + begin(b), + end(e) {} + }; + std::deque q; + + struct WQ : public ThreadPool::WorkQueue { + ParallelPGMapper *m; + + WQ(ParallelPGMapper *m_, ThreadPool *tp) + : ThreadPool::WorkQueue( + "ParallelPGMapper::WQ", + ceph::make_timespan(m_->cct->_conf->threadpool_default_timeout), + ceph::timespan::zero(), + tp), + m(m_) {} + + bool _enqueue(Item *i) override { + m->q.push_back(i); + return true; + } + void _dequeue(Item *i) override { + ceph_abort(); + } + Item *_dequeue() override { + while (!m->q.empty()) { + Item *i = m->q.front(); + m->q.pop_front(); + if (i->job->aborted) { + i->job->finish_one(); + delete i; + } else { + return i; + } + } + return nullptr; + } + + void _process(Item *i, ThreadPool::TPHandle &h) override; + + void _clear() override { + ceph_assert(_empty()); + } + + bool _empty() override { + return m->q.empty(); + } + } wq; + +public: + ParallelPGMapper(CephContext *cct, ThreadPool *tp) + : cct(cct), + wq(this, tp) {} + + void queue( + Job *job, + unsigned pgs_per_item, + const std::vector& input_pgs); + + void drain() { + wq.drain(); + } +}; + + +/// a precalculated mapping of every PG for a given OSDMap +class OSDMapMapping { +public: + MEMPOOL_CLASS_HELPERS(); +private: + + struct PoolMapping { + MEMPOOL_CLASS_HELPERS(); + + unsigned size = 0; + unsigned pg_num = 0; + bool erasure = false; + mempool::osdmap_mapping::vector table; + + size_t row_size() const { + return + 1 + // acting_primary + 1 + // up_primary + 1 + // num acting + 1 + // num up + size + // acting + size; // up + } + + PoolMapping(int s, int p, bool e) + : size(s), + pg_num(p), + erasure(e), + table(pg_num * row_size()) { + } + + void get(size_t ps, + std::vector *up, + int *up_primary, + std::vector *acting, + int *acting_primary) const { + const int32_t *row = &table[row_size() * ps]; + if (acting_primary) { + *acting_primary = row[0]; + } + if (up_primary) { + *up_primary = row[1]; + } + if (acting) { + acting->resize(row[2]); + for (int i = 0; i < row[2]; ++i) { + (*acting)[i] = row[4 + i]; + } + } + if (up) { + up->resize(row[3]); + for (int i = 0; i < row[3]; ++i) { + (*up)[i] = row[4 + size + i]; + } + } + } + + void set(size_t ps, + const std::vector& up, + int up_primary, + const std::vector& acting, + int acting_primary) { + int32_t *row = &table[row_size() * ps]; + row[0] = acting_primary; + row[1] = up_primary; + // these should always be <= the pool size, but just in 
case, avoid + // blowing out the array. Note that our mapping is not completely + // accurate in this case--this is just to avoid crashing. + row[2] = std::min(acting.size(), size); + row[3] = std::min(up.size(), size); + for (int i = 0; i < row[2]; ++i) { + row[4 + i] = acting[i]; + } + for (int i = 0; i < row[3]; ++i) { + row[4 + size + i] = up[i]; + } + } + }; + + mempool::osdmap_mapping::map pools; + mempool::osdmap_mapping::vector< + mempool::osdmap_mapping::vector> acting_rmap; // osd -> pg + //unused: mempool::osdmap_mapping::vector> up_rmap; // osd -> pg + epoch_t epoch = 0; + uint64_t num_pgs = 0; + + void _init_mappings(const OSDMap& osdmap); + void _update_range( + const OSDMap& map, + int64_t pool, + unsigned pg_begin, unsigned pg_end); + + void _build_rmap(const OSDMap& osdmap); + + void _start(const OSDMap& osdmap) { + _init_mappings(osdmap); + } + void _finish(const OSDMap& osdmap); + + void _dump(); + + friend class ParallelPGMapper; + + struct MappingJob : public ParallelPGMapper::Job { + OSDMapMapping *mapping; + MappingJob(const OSDMap *osdmap, OSDMapMapping *m) + : Job(osdmap), mapping(m) { + mapping->_start(*osdmap); + } + void process(const std::vector& pgs) override {} + void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override { + mapping->_update_range(*osdmap, pool, ps_begin, ps_end); + } + void complete() override { + mapping->_finish(*osdmap); + } + }; + +public: + void get(pg_t pgid, + std::vector *up, + int *up_primary, + std::vector *acting, + int *acting_primary) const { + auto p = pools.find(pgid.pool()); + ceph_assert(p != pools.end()); + ceph_assert(pgid.ps() < p->second.pg_num); + p->second.get(pgid.ps(), up, up_primary, acting, acting_primary); + } + + bool get_primary_and_shard(pg_t pgid, + int *acting_primary, + spg_t *spgid) { + auto p = pools.find(pgid.pool()); + ceph_assert(p != pools.end()); + ceph_assert(pgid.ps() < p->second.pg_num); + std::vector acting; + p->second.get(pgid.ps(), nullptr, nullptr, &acting, acting_primary); + if (p->second.erasure) { + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == *acting_primary) { + *spgid = spg_t(pgid, shard_id_t(i)); + return true; + } + } + return false; + } else { + *spgid = spg_t(pgid); + return true; + } + } + + const mempool::osdmap_mapping::vector& get_osd_acting_pgs(unsigned osd) { + ceph_assert(osd < acting_rmap.size()); + return acting_rmap[osd]; + } + + void update(const OSDMap& map); + void update(const OSDMap& map, pg_t pgid); + + std::unique_ptr start_update( + const OSDMap& map, + ParallelPGMapper& mapper, + unsigned pgs_per_item) { + std::unique_ptr job(new MappingJob(&map, this)); + mapper.queue(job.get(), pgs_per_item, {}); + return job; + } + + epoch_t get_epoch() const { + return epoch; + } + + uint64_t get_num_pgs() const { + return num_pgs; + } +}; + + +#endif diff --git a/src/osd/ObjectVersioner.h b/src/osd/ObjectVersioner.h new file mode 100644 index 000000000..f7d756330 --- /dev/null +++ b/src/osd/ObjectVersioner.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OSD_OBJECTVERSIONER_H +#define CEPH_OSD_OBJECTVERSIONER_H + +class ObjectVersioner { + public: + pobject_t oid; + + void get_versions(list& ls); + version_t head(); // newest + version_t committed(); // last committed + version_t tail(); // oldest + + /* + * prepare a new version, starting wit "raw" transaction t. + */ + void prepare(ObjectStore::Transaction& t, version_t v); + void rollback_to(version_t v); + void commit_to(version_t v); +}; + +#endif diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc new file mode 100644 index 000000000..0eb92c23a --- /dev/null +++ b/src/osd/OpRequest.cc @@ -0,0 +1,170 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "OpRequest.h" +#include "common/Formatter.h" +#include +#include +#include "common/debug.h" +#include "common/config.h" +#include "msg/Message.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "include/ceph_assert.h" +#include "osd/osd_types.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/oprequest.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +using std::ostream; +using std::set; +using std::string; +using std::stringstream; + +using ceph::Formatter; + +OpRequest::OpRequest(Message* req, OpTracker* tracker) + : TrackedOp(tracker, req->get_throttle_stamp()), + request(req), + hit_flag_points(0), + latest_flag_point(0), + hitset_inserted(false) { + if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) { + // don't warn as quickly for low priority ops + warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple; + } + if (req->get_type() == CEPH_MSG_OSD_OP) { + reqid = static_cast(req)->get_reqid(); + } else if (req->get_type() == MSG_OSD_REPOP) { + reqid = static_cast(req)->reqid; + } else if (req->get_type() == MSG_OSD_REPOPREPLY) { + reqid = static_cast(req)->reqid; + } + req_src_inst = req->get_source_inst(); +} + +void OpRequest::_dump(Formatter *f) const +{ + Message *m = request; + f->dump_string("flag_point", state_string()); + if (m->get_orig_source().is_client()) { + f->open_object_section("client_info"); + stringstream client_name, client_addr; + client_name << req_src_inst.name; + client_addr << req_src_inst.addr; + f->dump_string("client", client_name.str()); + f->dump_string("client_addr", client_addr.str()); + f->dump_unsigned("tid", m->get_tid()); + f->close_section(); // client_info + } + + { + f->open_array_section("events"); + std::lock_guard l(lock); + + for (auto i = events.begin(); i != events.end(); ++i) { + f->open_object_section("event"); + f->dump_string("event", i->str); + f->dump_stream("time") << i->stamp; + + auto i_next = i + 1; + + if (i_next < events.end()) { + f->dump_float("duration", i_next->stamp - i->stamp); + } else { + f->dump_float("duration", events.rbegin()->stamp - get_initiated()); + } + + f->close_section(); + } + f->close_section(); + } +} + +void OpRequest::_dump_op_descriptor_unlocked(ostream& stream) const +{ + get_req()->print(stream); +} + +void OpRequest::_unregistered() { + request->clear_data(); + request->clear_payload(); + request->release_message_throttle(); + request->set_connection(nullptr); +} + +int OpRequest::maybe_init_op_info(const OSDMap &osdmap) { + if (op_info.get_flags()) + return 0; + + auto m = get_req(); + +#ifdef WITH_LTTNG + auto old_rmw_flags = op_info.get_flags(); +#endif + 
auto ret = op_info.set_from_op(m, osdmap); + tracepoint(oprequest, set_rmw_flags, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc, + op_info.get_flags(), old_rmw_flags, op_info.get_flags()); + return ret; +} + +void OpRequest::mark_flag_point(uint8_t flag, const char *s) { +#ifdef WITH_LTTNG + uint8_t old_flags = hit_flag_points; +#endif + mark_event(s); + hit_flag_points |= flag; + latest_flag_point = flag; + tracepoint(oprequest, mark_flag_point, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc, op_info.get_flags(), + flag, s, old_flags, hit_flag_points); +} + +void OpRequest::mark_flag_point_string(uint8_t flag, const string& s) { +#ifdef WITH_LTTNG + uint8_t old_flags = hit_flag_points; +#endif + mark_event(s); + hit_flag_points |= flag; + latest_flag_point = flag; + tracepoint(oprequest, mark_flag_point, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc, op_info.get_flags(), + flag, s.c_str(), old_flags, hit_flag_points); +} + +bool OpRequest::filter_out(const set& filters) +{ + set addrs; + for (auto it = filters.begin(); it != filters.end(); it++) { + entity_addr_t addr; + if (addr.parse((*it).c_str())) { + addrs.insert(addr); + } + } + if (addrs.empty()) + return true; + + entity_addr_t cmp_addr = req_src_inst.addr; + if (addrs.count(cmp_addr)) { + return true; + } + cmp_addr.set_nonce(0); + if (addrs.count(cmp_addr)) { + return true; + } + cmp_addr.set_port(0); + if (addrs.count(cmp_addr)) { + return true; + } + + return false; +} + diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h new file mode 100644 index 000000000..daa0e1993 --- /dev/null +++ b/src/osd/OpRequest.h @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 New Dream Network/Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef OPREQUEST_H_ +#define OPREQUEST_H_ + +#include "osd/osd_op_util.h" +#include "osd/osd_types.h" +#include "common/TrackedOp.h" +#ifdef HAVE_JAEGER +#include "common/tracer.h" +#endif + +/** + * The OpRequest takes in a Message* and takes over a single reference + * to it, which it puts() when destroyed. 
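A small ownership sketch for that rule, assuming the tracker constructs tracked ops the way the rest of this tree does (the exact create_request call is an assumption here, illustrative only):

    void handle_incoming_sketch(OpTracker& tracker, Message* m) {
      OpRequestRef op = tracker.create_request<OpRequest>(m);
      // The OpRequest now owns the message reference it was handed; the caller
      // must not call m->put() itself, since ~OpRequest() releases it.
    }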
+ */ +struct OpRequest : public TrackedOp { + friend class OpTracker; + +private: + OpInfo op_info; + +public: + int maybe_init_op_info(const OSDMap &osdmap); + + auto get_flags() const { return op_info.get_flags(); } + bool op_info_needs_init() const { return op_info.get_flags() == 0; } + bool check_rmw(int flag) const { return op_info.check_rmw(flag); } + bool may_read() const { return op_info.may_read(); } + bool may_write() const { return op_info.may_write(); } + bool may_cache() const { return op_info.may_cache(); } + bool rwordered_forced() const { return op_info.rwordered_forced(); } + bool rwordered() const { return op_info.rwordered(); } + bool includes_pg_op() const { return op_info.includes_pg_op(); } + bool need_read_cap() const { return op_info.need_read_cap(); } + bool need_write_cap() const { return op_info.need_write_cap(); } + bool need_promote() const { return op_info.need_promote(); } + bool need_skip_handle_cache() const { return op_info.need_skip_handle_cache(); } + bool need_skip_promote() const { return op_info.need_skip_promote(); } + bool allows_returnvec() const { return op_info.allows_returnvec(); } + + std::vector classes() const { + return op_info.get_classes(); + } + + void _dump(ceph::Formatter *f) const override; + + bool has_feature(uint64_t f) const { + return request->get_connection()->has_feature(f); + } + +private: + Message *request; /// the logical request we are tracking + osd_reqid_t reqid; + entity_inst_t req_src_inst; + uint8_t hit_flag_points; + uint8_t latest_flag_point; + utime_t dequeued_time; + static const uint8_t flag_queued_for_pg=1 << 0; + static const uint8_t flag_reached_pg = 1 << 1; + static const uint8_t flag_delayed = 1 << 2; + static const uint8_t flag_started = 1 << 3; + static const uint8_t flag_sub_op_sent = 1 << 4; + static const uint8_t flag_commit_sent = 1 << 5; + + OpRequest(Message *req, OpTracker *tracker); + +protected: + void _dump_op_descriptor_unlocked(std::ostream& stream) const override; + void _unregistered() override; + bool filter_out(const std::set& filters) override; + +public: + ~OpRequest() override { + request->put(); + } + + bool check_send_map = true; ///< true until we check if sender needs a map + epoch_t sent_epoch = 0; ///< client's map epoch + epoch_t min_epoch = 0; ///< min epoch needed to handle this msg + + bool hitset_inserted; +#ifdef HAVE_JAEGER + jspan osd_parent_span = nullptr; + void set_osd_parent_span(jspan& span) { + if(osd_parent_span){ + jaeger_tracing::finish_span(osd_parent_span); + } + osd_parent_span = move(span); + } +#else + void set_osd_parent_span(...) 
{} +#endif + template + const T* get_req() const { return static_cast(request); } + + const Message *get_req() const { return request; } + Message *get_nonconst_req() { return request; } + + entity_name_t get_source() { + if (request) { + return request->get_source(); + } else { + return entity_name_t(); + } + } + uint8_t state_flag() const { + return latest_flag_point; + } + + std::string_view state_string() const override { + switch(latest_flag_point) { + case flag_queued_for_pg: return "queued for pg"; + case flag_reached_pg: return "reached pg"; + case flag_delayed: return "delayed"; + case flag_started: return "started"; + case flag_sub_op_sent: return "waiting for sub ops"; + case flag_commit_sent: return "commit sent; apply or cleanup"; + default: break; + } + return "no flag points reached"; + } + + static std::string get_state_string(uint8_t flag) { + std::string flag_point; + + switch(flag) { + case flag_queued_for_pg: + flag_point = "queued for pg"; + break; + case flag_reached_pg: + flag_point = "reached pg"; + break; + case flag_delayed: + flag_point = "delayed"; + break; + case flag_started: + flag_point = "started"; + break; + case flag_sub_op_sent: + flag_point = "waiting for sub ops"; + break; + case flag_commit_sent: + flag_point = "commit sent; apply or cleanup"; + break; + } + return flag_point; + } + + void mark_queued_for_pg() { + mark_flag_point(flag_queued_for_pg, "queued_for_pg"); + } + void mark_reached_pg() { + mark_flag_point(flag_reached_pg, "reached_pg"); + } + void mark_delayed(const std::string& s) { + mark_flag_point_string(flag_delayed, s); + } + void mark_started() { + mark_flag_point(flag_started, "started"); + } + void mark_sub_op_sent(const std::string& s) { + mark_flag_point_string(flag_sub_op_sent, s); + } + void mark_commit_sent() { + mark_flag_point(flag_commit_sent, "commit_sent"); + } + + utime_t get_dequeued_time() const { + return dequeued_time; + } + void set_dequeued_time(utime_t deq_time) { + dequeued_time = deq_time; + } + + osd_reqid_t get_reqid() const { + return reqid; + } + + typedef boost::intrusive_ptr Ref; + +private: + void mark_flag_point(uint8_t flag, const char *s); + void mark_flag_point_string(uint8_t flag, const std::string& s); +}; + +typedef OpRequest::Ref OpRequestRef; + +#endif /* OPREQUEST_H_ */ diff --git a/src/osd/PG.cc b/src/osd/PG.cc new file mode 100644 index 000000000..5b10f1466 --- /dev/null +++ b/src/osd/PG.cc @@ -0,0 +1,2753 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
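// --- Illustrative sketch (editor's note, not part of the patched file) ---
// OpRequest above records its progress through the OSD pipeline as flag
// points: hit_flag_points accumulates every stage as a bitmask, while
// latest_flag_point keeps only the most recent one (what state_string()
// reports). A minimal stand-alone analogue, with names local to this sketch:
#include <cstdint>
#include <iostream>
#include <string_view>

namespace sketch {
constexpr uint8_t queued_for_pg = 1 << 0;
constexpr uint8_t reached_pg    = 1 << 1;
constexpr uint8_t started       = 1 << 3;

struct TrackedOpLite {
  uint8_t hit_flag_points = 0;    // every stage the op has passed through
  uint8_t latest_flag_point = 0;  // only the most recent stage

  void mark(uint8_t flag) {
    hit_flag_points |= flag;
    latest_flag_point = flag;
  }
  std::string_view state() const {
    switch (latest_flag_point) {
    case queued_for_pg: return "queued for pg";
    case reached_pg:    return "reached pg";
    case started:       return "started";
    default:            return "no flag points reached";
    }
  }
};
} // namespace sketch

int main() {
  sketch::TrackedOpLite op;
  op.mark(sketch::queued_for_pg);
  op.mark(sketch::reached_pg);
  op.mark(sketch::started);
  std::cout << op.state() << "\n";  // prints "started"; all three bits remain set
}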
+ * + */ + +#include "PG.h" +#include "messages/MOSDRepScrub.h" + +#include "common/errno.h" +#include "common/ceph_releases.h" +#include "common/config.h" +#include "OSD.h" +#include "OpRequest.h" +#include "ScrubStore.h" +#include "pg_scrubber.h" +#include "Session.h" +#include "osd/scheduler/OpSchedulerItem.h" + +#include "common/Timer.h" +#include "common/perf_counters.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGScan.h" +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MBackfillReserve.h" +#include "messages/MRecoveryReserve.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDECSubOpWrite.h" +#include "messages/MOSDECSubOpWriteReply.h" +#include "messages/MOSDECSubOpRead.h" +#include "messages/MOSDECSubOpReadReply.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDScrubReserve.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDRepScrubMap.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +#include "common/BackTrace.h" +#include "common/EventTrace.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/pg.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#include + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +using std::list; +using std::map; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::stringstream; +using std::unique_ptr; +using std::vector; + +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::decode; +using ceph::encode; +using ceph::Formatter; + +using namespace ceph::osd::scheduler; + +template +static ostream& _prefix(std::ostream *_dout, T *t) +{ + return t->gen_prefix(*_dout); +} + +void PG::get(const char* tag) +{ + int after = ++ref; + lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " + << "tag " << (tag ? tag : "(none") << " " + << (after - 1) << " -> " << after << dendl; +#ifdef PG_DEBUG_REFS + std::lock_guard l(_ref_id_lock); + _tag_counts[tag]++; +#endif +} + +void PG::put(const char* tag) +{ +#ifdef PG_DEBUG_REFS + { + std::lock_guard l(_ref_id_lock); + auto tag_counts_entry = _tag_counts.find(tag); + ceph_assert(tag_counts_entry != _tag_counts.end()); + --tag_counts_entry->second; + if (tag_counts_entry->second == 0) { + _tag_counts.erase(tag_counts_entry); + } + } +#endif + auto local_cct = cct; + int after = --ref; + lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " " + << "tag " << (tag ? 
tag : "(none") << " " + << (after + 1) << " -> " << after + << dendl; + if (after == 0) + delete this; +} + +#ifdef PG_DEBUG_REFS +uint64_t PG::get_with_id() +{ + ref++; + std::lock_guard l(_ref_id_lock); + uint64_t id = ++_ref_id; + BackTrace bt(0); + stringstream ss; + bt.print(ss); + lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid + << " got id " << id << " " + << (ref - 1) << " -> " << ref + << dendl; + ceph_assert(!_live_ids.count(id)); + _live_ids.insert(make_pair(id, ss.str())); + return id; +} + +void PG::put_with_id(uint64_t id) +{ + int newref = --ref; + lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid + << " put id " << id << " " + << (newref + 1) << " -> " << newref + << dendl; + { + std::lock_guard l(_ref_id_lock); + ceph_assert(_live_ids.count(id)); + _live_ids.erase(id); + } + if (newref) + delete this; +} + +void PG::dump_live_ids() +{ + std::lock_guard l(_ref_id_lock); + dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl; + for (map::iterator i = _live_ids.begin(); + i != _live_ids.end(); + ++i) { + dout(0) << "\t\tid: " << *i << dendl; + } + dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl; + for (map::iterator i = _tag_counts.begin(); + i != _tag_counts.end(); + ++i) { + dout(0) << "\t\tid: " << *i << dendl; + } +} +#endif + +PG::PG(OSDService *o, OSDMapRef curmap, + const PGPool &_pool, spg_t p) : + pg_whoami(o->whoami, p.shard), + pg_id(p), + coll(p), + osd(o), + cct(o->cct), + osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()), + snap_mapper( + cct, + &osdriver, + p.ps(), + p.get_split_bits(_pool.info.get_pg_num()), + _pool.id, + p.shard), + trace_endpoint("0.0.0.0", 0, "PG"), + info_struct_v(0), + pgmeta_oid(p.make_pgmeta_oid()), + stat_queue_item(this), + recovery_queued(false), + recovery_ops_active(0), + backfill_reserving(false), + pg_stats_publish_valid(false), + finish_sync_event(NULL), + scrub_after_recovery(false), + active_pushes(0), + recovery_state( + o->cct, + pg_whoami, + p, + _pool, + curmap, + this, + this), + pool(recovery_state.get_pool()), + info(recovery_state.get_info()) +{ +#ifdef PG_DEBUG_REFS + osd->add_pgid(p, this); +#endif +#ifdef WITH_BLKIN + std::stringstream ss; + ss << "PG " << info.pgid; + trace_endpoint.copy_name(ss.str()); +#endif +} + +PG::~PG() +{ +#ifdef PG_DEBUG_REFS + osd->remove_pgid(info.pgid, this); +#endif +} + +void PG::lock(bool no_lockdep) const +{ +#ifdef CEPH_DEBUG_MUTEX + _lock.lock(no_lockdep); +#else + _lock.lock(); + locked_by = std::this_thread::get_id(); +#endif + // if we have unrecorded dirty state with the lock dropped, there is a bug + ceph_assert(!recovery_state.debug_has_dirty_state()); + + dout(30) << "lock" << dendl; +} + +bool PG::is_locked() const +{ + return ceph_mutex_is_locked(_lock); +} + +void PG::unlock() const +{ + //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl; + ceph_assert(!recovery_state.debug_has_dirty_state()); +#ifndef CEPH_DEBUG_MUTEX + locked_by = {}; +#endif + _lock.unlock(); +} + +std::ostream& PG::gen_prefix(std::ostream& out) const +{ + OSDMapRef mapref = recovery_state.get_osdmap(); +#ifdef CEPH_DEBUG_MUTEX + if (_lock.is_locked_by_me()) { +#else + if (locked_by == std::this_thread::get_id()) { +#endif + out << "osd." << osd->whoami + << " pg_epoch: " << (mapref ? mapref->get_epoch():0) + << " " << *this << " "; + } else { + out << "osd." << osd->whoami + << " pg_epoch: " << (mapref ? 
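// --- Illustrative sketch (editor's note, not part of the patched file) ---
// With PG_DEBUG_REFS, PG::get()/put() above maintain, next to the atomic
// refcount, a per-tag count of outstanding references so a leaked reference
// can be attributed to its call site (see dump_live_ids()). A simplified
// stand-alone analogue; names are local to this sketch:
#include <atomic>
#include <map>
#include <mutex>
#include <string>

namespace sketch {
struct DebugRefCounted {
  std::atomic<int> ref{0};
  std::mutex lock;
  std::map<std::string, int> tag_counts;  // call-site tag -> live references

  void get(const std::string& tag) {
    ++ref;
    std::lock_guard<std::mutex> l(lock);
    ++tag_counts[tag];
  }
  // returns true when the last reference was dropped and the object may die
  bool put(const std::string& tag) {
    {
      std::lock_guard<std::mutex> l(lock);
      auto it = tag_counts.find(tag);
      if (it != tag_counts.end() && --it->second == 0)
        tag_counts.erase(it);
    }
    return --ref == 0;
  }
};
} // namespace sketch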
mapref->get_epoch():0) + << " pg[" << pg_id.pgid << "(unlocked)] "; + } + return out; +} + +PerfCounters &PG::get_peering_perf() { + return *(osd->recoverystate_perf); +} + +PerfCounters &PG::get_perf_logger() { + return *(osd->logger); +} + +void PG::log_state_enter(const char *state) { + osd->pg_recovery_stats.log_enter(state); +} + +void PG::log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) { + osd->pg_recovery_stats.log_exit( + state_name, ceph_clock_now() - enter_time, events, event_dur); +} + +/********* PG **********/ + +void PG::remove_snap_mapped_object( + ObjectStore::Transaction &t, const hobject_t &soid) +{ + t.remove( + coll, + ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard)); + clear_object_snap_mapping(&t, soid); +} + +void PG::clear_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid) +{ + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + if (soid.snap < CEPH_MAXSNAP) { + int r = snap_mapper.remove_oid( + soid, + &_t); + if (!(r == 0 || r == -ENOENT)) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + } +} + +void PG::update_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid, const set &snaps) +{ + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + ceph_assert(soid.snap < CEPH_MAXSNAP); + int r = snap_mapper.remove_oid( + soid, + &_t); + if (!(r == 0 || r == -ENOENT)) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + snap_mapper.add_oid( + soid, + snaps, + &_t); +} + +/******* PG ***********/ +void PG::clear_primary_state() +{ + dout(20) << __func__ << dendl; + + projected_log = PGLog::IndexedLog(); + + snap_trimq.clear(); + snap_trimq_repeat.clear(); + finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread + release_pg_backoffs(); + + if (m_scrubber) { + m_scrubber->discard_replica_reservations(); + } + scrub_after_recovery = false; + + agent_clear(); +} + + +bool PG::op_has_sufficient_caps(OpRequestRef& op) +{ + // only check MOSDOp + if (op->get_req()->get_type() != CEPH_MSG_OSD_OP) + return true; + + auto req = op->get_req(); + auto priv = req->get_connection()->get_priv(); + auto session = static_cast(priv.get()); + if (!session) { + dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl; + return false; + } + OSDCap& caps = session->caps; + priv.reset(); + + const string &key = req->get_hobj().get_key().empty() ? + req->get_oid().name : + req->get_hobj().get_key(); + + bool cap = caps.is_capable(pool.name, req->get_hobj().nspace, + pool.info.application_metadata, + key, + op->need_read_cap(), + op->need_write_cap(), + op->classes(), + session->get_peer_socket_addr()); + + dout(20) << "op_has_sufficient_caps " + << "session=" << session + << " pool=" << pool.id << " (" << pool.name + << " " << req->get_hobj().nspace + << ")" + << " pool_app_metadata=" << pool.info.application_metadata + << " need_read_cap=" << op->need_read_cap() + << " need_write_cap=" << op->need_write_cap() + << " classes=" << op->classes() + << " -> " << (cap ? 
"yes" : "NO") + << dendl; + return cap; +} + +void PG::queue_recovery() +{ + if (!is_primary() || !is_peered()) { + dout(10) << "queue_recovery -- not primary or not peered " << dendl; + ceph_assert(!recovery_queued); + } else if (recovery_queued) { + dout(10) << "queue_recovery -- already queued" << dendl; + } else { + dout(10) << "queue_recovery -- queuing" << dendl; + recovery_queued = true; + osd->queue_for_recovery(this); + } +} + +void PG::queue_scrub_after_repair() +{ + dout(10) << __func__ << dendl; + ceph_assert(ceph_mutex_is_locked(_lock)); + + m_planned_scrub.must_deep_scrub = true; + m_planned_scrub.check_repair = true; + m_planned_scrub.must_scrub = true; + + if (is_scrub_queued_or_active()) { + dout(10) << __func__ << ": scrubbing already (" + << (is_scrubbing() ? "active)" : "queued)") << dendl; + return; + } + + m_scrubber->set_op_parameters(m_planned_scrub); + dout(15) << __func__ << ": queueing" << dendl; + + m_scrubber->set_queued_or_active(); + osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority); +} + +unsigned PG::get_scrub_priority() +{ + // a higher value -> a higher priority + int64_t pool_scrub_priority = + pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0); + return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority; +} + +Context *PG::finish_recovery() +{ + dout(10) << "finish_recovery" << dendl; + ceph_assert(info.last_complete == info.last_update); + + clear_recovery_state(); + + /* + * sync all this before purging strays. but don't block! + */ + finish_sync_event = new C_PG_FinishRecovery(this); + return finish_sync_event; +} + +void PG::_finish_recovery(Context* c) +{ + dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? " + << is_clean() << dendl; + + std::scoped_lock locker{*this}; + if (recovery_state.is_deleting() || !is_clean()) { + dout(10) << __func__ << " raced with delete or repair" << dendl; + return; + } + // When recovery is initiated by a repair, that flag is left on + state_clear(PG_STATE_REPAIR); + if (c == finish_sync_event) { + dout(15) << __func__ << " scrub_after_recovery? 
" << scrub_after_recovery << dendl; + finish_sync_event = 0; + recovery_state.purge_strays(); + + publish_stats_to_osd(); + + if (scrub_after_recovery) { + dout(10) << "_finish_recovery requeueing for scrub" << dendl; + scrub_after_recovery = false; + queue_scrub_after_repair(); + } + } else { + dout(10) << "_finish_recovery -- stale" << dendl; + } +} + +void PG::start_recovery_op(const hobject_t& soid) +{ + dout(10) << "start_recovery_op " << soid +#ifdef DEBUG_RECOVERY_OIDS + << " (" << recovering_oids << ")" +#endif + << dendl; + ceph_assert(recovery_ops_active >= 0); + recovery_ops_active++; +#ifdef DEBUG_RECOVERY_OIDS + recovering_oids.insert(soid); +#endif + osd->start_recovery_op(this, soid); +} + +void PG::finish_recovery_op(const hobject_t& soid, bool dequeue) +{ + dout(10) << "finish_recovery_op " << soid +#ifdef DEBUG_RECOVERY_OIDS + << " (" << recovering_oids << ")" +#endif + << dendl; + ceph_assert(recovery_ops_active > 0); + recovery_ops_active--; +#ifdef DEBUG_RECOVERY_OIDS + ceph_assert(recovering_oids.count(soid)); + recovering_oids.erase(recovering_oids.find(soid)); +#endif + osd->finish_recovery_op(this, soid, dequeue); + + if (!dequeue) { + queue_recovery(); + } +} + +void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) +{ + recovery_state.split_into(child_pgid, &child->recovery_state, split_bits); + + child->update_snap_mapper_bits(split_bits); + + child->snap_trimq = snap_trimq; + child->snap_trimq_repeat = snap_trimq_repeat; + + _split_into(child_pgid, child, split_bits); + + // release all backoffs for simplicity + release_backoffs(hobject_t(), hobject_t::get_max()); +} + +void PG::start_split_stats(const set& childpgs, vector *out) +{ + recovery_state.start_split_stats(childpgs, out); +} + +void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction &t) +{ + recovery_state.finish_split_stats(stats, t); +} + +void PG::merge_from(map& sources, PeeringCtx &rctx, + unsigned split_bits, + const pg_merge_meta_t& last_pg_merge_meta) +{ + dout(10) << __func__ << " from " << sources << " split_bits " << split_bits + << dendl; + map source_ps; + for (auto &&source : sources) { + source_ps.emplace(source.first, &source.second->recovery_state); + } + recovery_state.merge_from(source_ps, rctx, split_bits, last_pg_merge_meta); + + for (auto& i : sources) { + auto& source = i.second; + // wipe out source's pgmeta + rctx.transaction.remove(source->coll, source->pgmeta_oid); + + // merge (and destroy source collection) + rctx.transaction.merge_collection(source->coll, coll, split_bits); + } + + // merge_collection does this, but maybe all of our sources were missing. 
+ rctx.transaction.collection_set_bits(coll, split_bits); + + snap_mapper.update_bits(split_bits); +} + +void PG::add_backoff(const ceph::ref_t& s, const hobject_t& begin, const hobject_t& end) +{ + auto con = s->con; + if (!con) // OSD::ms_handle_reset clears s->con without a lock + return; + auto b = s->have_backoff(info.pgid, begin); + if (b) { + derr << __func__ << " already have backoff for " << s << " begin " << begin + << " " << *b << dendl; + ceph_abort(); + } + std::lock_guard l(backoff_lock); + b = ceph::make_ref(info.pgid, this, s, ++s->backoff_seq, begin, end); + backoffs[begin].insert(b); + s->add_backoff(b); + dout(10) << __func__ << " session " << s << " added " << *b << dendl; + con->send_message( + new MOSDBackoff( + info.pgid, + get_osdmap_epoch(), + CEPH_OSD_BACKOFF_OP_BLOCK, + b->id, + begin, + end)); +} + +void PG::release_backoffs(const hobject_t& begin, const hobject_t& end) +{ + dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl; + vector> bv; + { + std::lock_guard l(backoff_lock); + auto p = backoffs.lower_bound(begin); + while (p != backoffs.end()) { + int r = cmp(p->first, end); + dout(20) << __func__ << " ? " << r << " " << p->first + << " " << p->second << dendl; + // note: must still examine begin=end=p->first case + if (r > 0 || (r == 0 && begin < end)) { + break; + } + dout(20) << __func__ << " checking " << p->first + << " " << p->second << dendl; + auto q = p->second.begin(); + while (q != p->second.end()) { + dout(20) << __func__ << " checking " << *q << dendl; + int r = cmp((*q)->begin, begin); + if (r == 0 || (r > 0 && (*q)->end < end)) { + bv.push_back(*q); + q = p->second.erase(q); + } else { + ++q; + } + } + if (p->second.empty()) { + p = backoffs.erase(p); + } else { + ++p; + } + } + } + for (auto b : bv) { + std::lock_guard l(b->lock); + dout(10) << __func__ << " " << *b << dendl; + if (b->session) { + ceph_assert(b->pg == this); + ConnectionRef con = b->session->con; + if (con) { // OSD::ms_handle_reset clears s->con without a lock + con->send_message( + new MOSDBackoff( + info.pgid, + get_osdmap_epoch(), + CEPH_OSD_BACKOFF_OP_UNBLOCK, + b->id, + b->begin, + b->end)); + } + if (b->is_new()) { + b->state = Backoff::STATE_DELETING; + } else { + b->session->rm_backoff(b); + b->session.reset(); + } + b->pg.reset(); + } + } +} + +void PG::clear_backoffs() +{ + dout(10) << __func__ << " " << dendl; + map>> ls; + { + std::lock_guard l(backoff_lock); + ls.swap(backoffs); + } + for (auto& p : ls) { + for (auto& b : p.second) { + std::lock_guard l(b->lock); + dout(10) << __func__ << " " << *b << dendl; + if (b->session) { + ceph_assert(b->pg == this); + if (b->is_new()) { + b->state = Backoff::STATE_DELETING; + } else { + b->session->rm_backoff(b); + b->session.reset(); + } + b->pg.reset(); + } + } + } +} + +// called by Session::clear_backoffs() +void PG::rm_backoff(const ceph::ref_t& b) +{ + dout(10) << __func__ << " " << *b << dendl; + std::lock_guard l(backoff_lock); + ceph_assert(ceph_mutex_is_locked_by_me(b->lock)); + ceph_assert(b->pg == this); + auto p = backoffs.find(b->begin); + // may race with release_backoffs() + if (p != backoffs.end()) { + auto q = p->second.find(b); + if (q != p->second.end()) { + p->second.erase(q); + if (p->second.empty()) { + backoffs.erase(p); + } + } + } +} + +void PG::clear_recovery_state() +{ + dout(10) << "clear_recovery_state" << dendl; + + finish_sync_event = 0; + + hobject_t soid; + while (recovery_ops_active > 0) { +#ifdef DEBUG_RECOVERY_OIDS + soid = *recovering_oids.begin(); +#endif + 
finish_recovery_op(soid, true); + } + + backfill_info.clear(); + peer_backfill_info.clear(); + waiting_on_backfill.clear(); + _clear_recovery_state(); // pg impl specific hook +} + +void PG::cancel_recovery() +{ + dout(10) << "cancel_recovery" << dendl; + clear_recovery_state(); +} + +void PG::set_probe_targets(const set &probe_set) +{ + std::lock_guard l(heartbeat_peer_lock); + probe_targets.clear(); + for (set::iterator i = probe_set.begin(); + i != probe_set.end(); + ++i) { + probe_targets.insert(i->osd); + } +} + +void PG::send_cluster_message( + int target, MessageRef m, + epoch_t epoch, bool share_map_update=false) +{ + ConnectionRef con = osd->get_con_osd_cluster( + target, get_osdmap_epoch()); + if (!con) { + return; + } + + if (share_map_update) { + osd->maybe_share_map(con.get(), get_osdmap()); + } + osd->send_message_osd_cluster(m, con.get()); +} + +void PG::clear_probe_targets() +{ + std::lock_guard l(heartbeat_peer_lock); + probe_targets.clear(); +} + +void PG::update_heartbeat_peers(set new_peers) +{ + bool need_update = false; + heartbeat_peer_lock.lock(); + if (new_peers == heartbeat_peers) { + dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl; + } else { + dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl; + heartbeat_peers.swap(new_peers); + need_update = true; + } + heartbeat_peer_lock.unlock(); + + if (need_update) + osd->need_heartbeat_peer_update(); +} + + +bool PG::check_in_progress_op( + const osd_reqid_t &r, + eversion_t *version, + version_t *user_version, + int *return_code, + vector *op_returns + ) const +{ + return ( + projected_log.get_request(r, version, user_version, return_code, + op_returns) || + recovery_state.get_pg_log().get_log().get_request( + r, version, user_version, return_code, op_returns)); +} + +void PG::publish_stats_to_osd() +{ + if (!is_primary()) + return; + + std::lock_guard l{pg_stats_publish_lock}; + auto stats = recovery_state.prepare_stats_for_publish( + pg_stats_publish_valid, + pg_stats_publish, + unstable_stats); + if (stats) { + pg_stats_publish = stats.value(); + pg_stats_publish_valid = true; + } +} + +unsigned PG::get_target_pg_log_entries() const +{ + return osd->get_target_pg_log_entries(); +} + +void PG::clear_publish_stats() +{ + dout(15) << "clear_stats" << dendl; + std::lock_guard l{pg_stats_publish_lock}; + pg_stats_publish_valid = false; +} + +/** + * initialize a newly instantiated pg + * + * Initialize PG state, as when a PG is initially created, or when it + * is first instantiated on the current node. 
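// --- Illustrative sketch (editor's note, not part of the patched file) ---
// check_in_progress_op() above answers "have we already executed this client
// request?" by consulting first the projected (not yet persisted) log and
// then the on-disk PG log. A simplified stand-alone analogue; ReqId and
// OpResult are stand-ins for osd_reqid_t and the (version, return code) pair:
#include <cstdint>
#include <map>
#include <optional>
#include <string>

namespace sketch {
struct OpResult { uint64_t version; int return_code; };
using ReqId = std::string;

std::optional<OpResult> check_in_progress_op(
    const std::map<ReqId, OpResult>& projected_log,
    const std::map<ReqId, OpResult>& pg_log,
    const ReqId& reqid) {
  if (auto it = projected_log.find(reqid); it != projected_log.end())
    return it->second;     // still in flight, not yet written to the log
  if (auto it = pg_log.find(reqid); it != pg_log.end())
    return it->second;     // already committed and logged
  return std::nullopt;     // genuinely new request
}
} // namespace sketch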
+ * + * @param role our role/rank + * @param newup up set + * @param newacting acting set + * @param history pg history + * @param pi past_intervals + * @param backfill true if info should be marked as backfill + * @param t transaction to write out our new state in + */ +void PG::init( + int role, + const vector& newup, int new_up_primary, + const vector& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + bool backfill, + ObjectStore::Transaction &t) +{ + recovery_state.init( + role, newup, new_up_primary, newacting, + new_acting_primary, history, pi, backfill, t); +} + +void PG::shutdown() +{ + ch->flush(); + std::scoped_lock l{*this}; + recovery_state.shutdown(); + on_shutdown(); +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +void PG::upgrade(ObjectStore *store) +{ + dout(0) << __func__ << " " << info_struct_v << " -> " << pg_latest_struct_v + << dendl; + ceph_assert(info_struct_v <= 10); + ObjectStore::Transaction t; + + // + + // finished upgrade! + ceph_assert(info_struct_v == 10); + + // update infover_key + if (info_struct_v < pg_latest_struct_v) { + map v; + __u8 ver = pg_latest_struct_v; + encode(ver, v[string(infover_key)]); + t.omap_setkeys(coll, pgmeta_oid, v); + } + + recovery_state.force_write_state(t); + + ObjectStore::CollectionHandle ch = store->open_collection(coll); + int r = store->queue_transaction(ch, std::move(t)); + if (r != 0) { + derr << __func__ << ": queue_transaction returned " + << cpp_strerror(r) << dendl; + ceph_abort(); + } + ceph_assert(r == 0); + + C_SaferCond waiter; + if (!ch->flush_commit(&waiter)) { + waiter.wait(); + } +} + +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" + +void PG::prepare_write( + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ObjectStore::Transaction &t) +{ + info.stats.stats.add(unstable_stats); + unstable_stats.clear(); + map km; + string key_to_remove; + if (dirty_big_info || dirty_info) { + int ret = prepare_info_keymap( + cct, + &km, + &key_to_remove, + get_osdmap_epoch(), + info, + last_written_info, + past_intervals, + dirty_big_info, + need_write_epoch, + cct->_conf->osd_fast_info, + osd->logger, + this); + ceph_assert(ret == 0); + } + pglog.write_log_and_missing( + t, &km, coll, pgmeta_oid, pool.info.require_rollback()); + if (!km.empty()) + t.omap_setkeys(coll, pgmeta_oid, km); + if (!key_to_remove.empty()) + t.omap_rmkey(coll, pgmeta_oid, key_to_remove); +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +bool PG::_has_removal_flag(ObjectStore *store, + spg_t pgid) +{ + coll_t coll(pgid); + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + + // first try new way + set keys; + keys.insert("_remove"); + map values; + auto ch = store->open_collection(coll); + ceph_assert(ch); + if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 && + values.size() == 1) + return true; + + return false; +} + +int PG::peek_map_epoch(ObjectStore *store, + spg_t pgid, + epoch_t *pepoch) +{ + coll_t coll(pgid); + ghobject_t legacy_infos_oid(OSD::make_infos_oid()); + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + epoch_t cur_epoch = 0; + + // validate collection name + ceph_assert(coll.is_pg()); + + // try for v8 + set keys; + keys.insert(string(infover_key)); + 
keys.insert(string(epoch_key)); + map values; + auto ch = store->open_collection(coll); + ceph_assert(ch); + int r = store->omap_get_values(ch, pgmeta_oid, keys, &values); + if (r == 0) { + ceph_assert(values.size() == 2); + + // sanity check version + auto bp = values[string(infover_key)].cbegin(); + __u8 struct_v = 0; + decode(struct_v, bp); + ceph_assert(struct_v >= 8); + + // get epoch + bp = values[string(epoch_key)].begin(); + decode(cur_epoch, bp); + } else { + // probably bug 10617; see OSD::load_pgs() + return -1; + } + + *pepoch = cur_epoch; + return 0; +} + +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" + +bool PG::check_log_for_corruption(ObjectStore *store) +{ + /// TODO: this method needs to work with the omap log + return true; +} + +//! Get the name we're going to save our corrupt page log as +std::string PG::get_corrupt_pg_log_name() const +{ + const int MAX_BUF = 512; + char buf[MAX_BUF]; + struct tm tm_buf; + time_t my_time(time(NULL)); + const struct tm *t = localtime_r(&my_time, &tm_buf); + int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t); + if (ret == 0) { + dout(0) << "strftime failed" << dendl; + return "corrupt_log_unknown_time"; + } + string out(buf); + out += stringify(info.pgid); + return out; +} + +int PG::read_info( + ObjectStore *store, spg_t pgid, const coll_t &coll, + pg_info_t &info, PastIntervals &past_intervals, + __u8 &struct_v) +{ + set keys; + keys.insert(string(infover_key)); + keys.insert(string(info_key)); + keys.insert(string(biginfo_key)); + keys.insert(string(fastinfo_key)); + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + map values; + auto ch = store->open_collection(coll); + ceph_assert(ch); + int r = store->omap_get_values(ch, pgmeta_oid, keys, &values); + ceph_assert(r == 0); + ceph_assert(values.size() == 3 || + values.size() == 4); + + auto p = values[string(infover_key)].cbegin(); + decode(struct_v, p); + ceph_assert(struct_v >= 10); + + p = values[string(info_key)].begin(); + decode(info, p); + + p = values[string(biginfo_key)].begin(); + decode(past_intervals, p); + decode(info.purged_snaps, p); + + p = values[string(fastinfo_key)].begin(); + if (!p.end()) { + pg_fast_info_t fast; + decode(fast, p); + fast.try_apply_to(&info); + } + return 0; +} + +void PG::read_state(ObjectStore *store) +{ + PastIntervals past_intervals_from_disk; + pg_info_t info_from_disk; + int r = read_info( + store, + pg_id, + coll, + info_from_disk, + past_intervals_from_disk, + info_struct_v); + ceph_assert(r >= 0); + + if (info_struct_v < pg_compat_struct_v) { + derr << "PG needs upgrade, but on-disk data is too old; upgrade to" + << " an older version first." 
<< dendl; + ceph_abort_msg("PG too old to upgrade"); + } + + recovery_state.init_from_disk_state( + std::move(info_from_disk), + std::move(past_intervals_from_disk), + [this, store] (PGLog &pglog) { + ostringstream oss; + pglog.read_log_and_missing( + store, + ch, + pgmeta_oid, + info, + oss, + cct->_conf->osd_ignore_stale_divergent_priors, + cct->_conf->osd_debug_verify_missing_on_start); + + if (oss.tellp()) + osd->clog->error() << oss.str(); + return 0; + }); + + if (info_struct_v < pg_latest_struct_v) { + upgrade(store); + } + + // initialize current mapping + { + int primary, up_primary; + vector acting, up; + get_osdmap()->pg_to_up_acting_osds( + pg_id.pgid, &up, &up_primary, &acting, &primary); + recovery_state.init_primary_up_acting( + up, + acting, + up_primary, + primary); + recovery_state.set_role(OSDMap::calc_pg_role(pg_whoami, acting)); + } + + // init pool options + store->set_collection_opts(ch, pool.info.opts); + + PeeringCtx rctx(ceph_release_t::unknown); + handle_initialize(rctx); + // note: we don't activate here because we know the OSD will advance maps + // during boot. + write_if_dirty(rctx.transaction); + store->queue_transaction(ch, std::move(rctx.transaction)); +} + +void PG::update_snap_map( + const vector &log_entries, + ObjectStore::Transaction &t) +{ + for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) { + OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); + if (i->soid.snap < CEPH_MAXSNAP) { + if (i->is_delete()) { + int r = snap_mapper.remove_oid( + i->soid, + &_t); + if (r) + derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl; + // On removal tolerate missing key corruption + ceph_assert(r == 0 || r == -ENOENT); + } else if (i->is_update()) { + ceph_assert(i->snaps.length() > 0); + vector snaps; + bufferlist snapbl = i->snaps; + auto p = snapbl.cbegin(); + try { + decode(snaps, p); + } catch (...) 
{ + derr << __func__ << " decode snaps failure on " << *i << dendl; + snaps.clear(); + } + set _snaps(snaps.begin(), snaps.end()); + + if (i->is_clone() || i->is_promote()) { + snap_mapper.add_oid( + i->soid, + _snaps, + &_t); + } else if (i->is_modify()) { + int r = snap_mapper.update_snaps( + i->soid, + _snaps, + 0, + &_t); + ceph_assert(r == 0); + } else { + ceph_assert(i->is_clean()); + } + } + } + } +} + +/** + * filter trimming|trimmed snaps out of snapcontext + */ +void PG::filter_snapc(vector &snaps) +{ + // nothing needs to trim, we can return immediately + if (snap_trimq.empty() && info.purged_snaps.empty()) + return; + + bool filtering = false; + vector newsnaps; + for (vector::iterator p = snaps.begin(); + p != snaps.end(); + ++p) { + if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) { + if (!filtering) { + // start building a new vector with what we've seen so far + dout(10) << "filter_snapc filtering " << snaps << dendl; + newsnaps.insert(newsnaps.begin(), snaps.begin(), p); + filtering = true; + } + dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl; + } else { + if (filtering) + newsnaps.push_back(*p); // continue building new vector + } + } + if (filtering) { + snaps.swap(newsnaps); + dout(10) << "filter_snapc result " << snaps << dendl; + } +} + +void PG::requeue_object_waiters(map>& m) +{ + for (auto it = m.begin(); it != m.end(); ++it) + requeue_ops(it->second); + m.clear(); +} + +void PG::requeue_op(OpRequestRef op) +{ + auto p = waiting_for_map.find(op->get_source()); + if (p != waiting_for_map.end()) { + dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")" + << dendl; + p->second.push_front(op); + } else { + dout(20) << __func__ << " " << op << dendl; + osd->enqueue_front( + OpSchedulerItem( + unique_ptr(new PGOpItem(info.pgid, op)), + op->get_req()->get_cost(), + op->get_req()->get_priority(), + op->get_req()->get_recv_stamp(), + op->get_req()->get_source().num(), + get_osdmap_epoch())); + } +} + +void PG::requeue_ops(list &ls) +{ + for (list::reverse_iterator i = ls.rbegin(); + i != ls.rend(); + ++i) { + requeue_op(*i); + } + ls.clear(); +} + +void PG::requeue_map_waiters() +{ + epoch_t epoch = get_osdmap_epoch(); + auto p = waiting_for_map.begin(); + while (p != waiting_for_map.end()) { + if (epoch < p->second.front()->min_epoch) { + dout(20) << __func__ << " " << p->first << " front op " + << p->second.front() << " must still wait, doing nothing" + << dendl; + ++p; + } else { + dout(20) << __func__ << " " << p->first << " " << p->second << dendl; + for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) { + auto req = *q; + osd->enqueue_front(OpSchedulerItem( + unique_ptr(new PGOpItem(info.pgid, req)), + req->get_req()->get_cost(), + req->get_req()->get_priority(), + req->get_req()->get_recv_stamp(), + req->get_req()->get_source().num(), + epoch)); + } + p = waiting_for_map.erase(p); + } + } +} + +bool PG::get_must_scrub() const +{ + dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? 
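// --- Illustrative sketch (editor's note, not part of the patched file) ---
// filter_snapc() above drops snaps that are queued for trimming or already
// purged, and only starts copying into a new vector once the first dropped
// element is found, so the common "nothing to filter" case costs no copy.
// Stand-alone analogue using std:: types in place of snapid_t / interval_set:
#include <cstdint>
#include <set>
#include <vector>

namespace sketch {
std::vector<uint64_t> filter_snapc(const std::vector<uint64_t>& snaps,
                                   const std::set<uint64_t>& trimmed) {
  std::vector<uint64_t> out;
  bool filtering = false;
  for (auto p = snaps.begin(); p != snaps.end(); ++p) {
    if (trimmed.count(*p)) {
      if (!filtering) {
        out.assign(snaps.begin(), p);  // keep everything seen so far
        filtering = true;
      }
      // *p is dropped: it is being trimmed or was already purged
    } else if (filtering) {
      out.push_back(*p);
    }
  }
  return filtering ? out : snaps;  // input returned untouched if nothing matched
}
} // namespace sketch
// e.g. filter_snapc({8, 6, 4}, {6}) yields {8, 4}; with an empty trimmed set
// the input is returned without ever building 'out'.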
"true" : "false") << dendl; + return m_planned_scrub.must_scrub; +} + +unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const +{ + return m_scrubber->scrub_requeue_priority(with_priority); +} + +unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const +{ + return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority); +} + +// ========================================================================================== +// SCRUB + +/* + * implementation note: + * PG::sched_scrub() is called only once per a specific scrub session. + * That call commits us to the whatever choices are made (deep/shallow, etc'). + * Unless failing to start scrubbing, the 'planned scrub' flag-set is 'frozen' into + * PgScrubber's m_flags, then cleared. + */ +bool PG::sched_scrub() +{ + dout(15) << __func__ << " pg(" << info.pgid + << (is_active() ? ") " : ") ") + << (is_clean() ? " " : " ") << dendl; + ceph_assert(ceph_mutex_is_locked(_lock)); + + if (!is_primary() || !is_active() || !is_clean()) { + return false; + } + + if (is_scrub_queued_or_active()) { + return false; + } + + // analyse the combination of the requested scrub flags, the osd/pool configuration + // and the PG status to determine whether we should scrub now, and what type of scrub + // should that be. + auto updated_flags = verify_scrub_mode(); + if (!updated_flags) { + // the stars do not align for starting a scrub for this PG at this time + // (due to configuration or priority issues) + // The reason was already reported by the callee. + dout(10) << __func__ << ": failed to initiate a scrub" << dendl; + return false; + } + + // try to reserve the local OSD resources. If failing: no harm. We will + // be retried by the OSD later on. + if (!m_scrubber->reserve_local()) { + dout(10) << __func__ << ": failed to reserve locally" << dendl; + return false; + } + + // can commit to the updated flags now, as nothing will stop the scrub + m_planned_scrub = *updated_flags; + + // An interrupted recovery repair could leave this set. + state_clear(PG_STATE_REPAIR); + + // Pass control to the scrubber. It is the scrubber that handles the replicas' + // resources reservations. + m_scrubber->set_op_parameters(m_planned_scrub); + + dout(10) << __func__ << ": queueing" << dendl; + m_scrubber->set_queued_or_active(); + osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority); + return true; +} + +double PG::next_deepscrub_interval() const +{ + double deep_scrub_interval = + pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0); + if (deep_scrub_interval <= 0.0) + deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; + return info.history.last_deep_scrub_stamp + deep_scrub_interval; +} + +bool PG::is_time_for_deep(bool allow_deep_scrub, + bool allow_scrub, + bool has_deep_errors, + const requested_scrub_t& planned) const +{ + dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? " + << allow_deep_scrub << dendl; + + if (!allow_deep_scrub) + return false; + + if (planned.need_auto) { + dout(10) << __func__ << ": need repair after scrub errors" << dendl; + return true; + } + + if (ceph_clock_now() >= next_deepscrub_interval()) { + dout(20) << __func__ << ": now (" << ceph_clock_now() << ") >= time for deep (" + << next_deepscrub_interval() << ")" << dendl; + return true; + } + + if (has_deep_errors) { + osd->clog->info() << "osd." 
<< osd->whoami << " pg " << info.pgid + << " Deep scrub errors, upgrading scrub to deep-scrub"; + return true; + } + + // we only flip coins if 'allow_scrub' is asserted. Otherwise - as this function is + // called often, we will probably be deep-scrubbing most of the time. + if (allow_scrub) { + bool deep_coin_flip = + (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100; + + dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep + << " deep_coin_flip=" << deep_coin_flip << dendl; + + if (deep_coin_flip) + return true; + } + + return false; +} + +bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub, + bool try_to_auto_repair, + bool allow_regular_scrub, + bool has_deep_errors, + requested_scrub_t& planned) const + +{ + ceph_assert(!planned.must_deep_scrub && !planned.must_repair); + + if (!allow_deep_scrub && has_deep_errors) { + osd->clog->error() + << "osd." << osd->whoami << " pg " << info.pgid + << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set"; + return false; + } + + if (allow_deep_scrub) { + // Initial entry and scheduled scrubs without nodeep_scrub set get here + + planned.time_for_deep = + is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned); + + if (try_to_auto_repair) { + if (planned.time_for_deep) { + dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl; + planned.auto_repair = true; + } else if (allow_regular_scrub) { + dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" + << dendl; + planned.deep_scrub_on_error = true; + } + } + } + + dout(20) << __func__ << " updated flags: " << planned + << " allow_regular_scrub: " << allow_regular_scrub << dendl; + + // NOSCRUB so skip regular scrubs + if (!allow_regular_scrub && !planned.time_for_deep) { + return false; + } + + return true; +} + +std::optional PG::verify_scrub_mode() const +{ + dout(10) << __func__ << " processing pg " << info.pgid << dendl; + + bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || + pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)); + bool allow_regular_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || + pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)); + bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0); + bool try_to_auto_repair = + (cct->_conf->osd_scrub_auto_repair && get_pgbackend()->auto_repair_supported()); + + auto upd_flags = m_planned_scrub; + + upd_flags.time_for_deep = false; + // Clear these in case user issues the scrub/repair command during + // the scheduling of the scrub/repair (e.g. request reservation) + upd_flags.deep_scrub_on_error = false; + upd_flags.auto_repair = false; + + if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) { + osd->clog->error() << "osd." << osd->whoami << " pg " << info.pgid + << " Regular scrub request, deep-scrub details will be lost"; + } + + if (!upd_flags.must_scrub) { + // All periodic scrub handling goes here because must_scrub is + // always set for must_deep_scrub and must_repair. + + bool can_start_periodic = + verify_periodic_scrub_mode(allow_deep_scrub, try_to_auto_repair, + allow_regular_scrub, has_deep_errors, upd_flags); + if (!can_start_periodic) { + return std::nullopt; + } + } + + // scrubbing while recovering? 
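// --- Illustrative sketch (editor's note, not part of the patched file) ---
// Two of the deep-scrub triggers used above: the elapsed-time check against
// the per-pool (or, if unset, per-OSD) deep scrub interval, and the random
// promotion of an otherwise shallow scrub controlled by
// osd_deep_scrub_randomize_ratio (e.g. 0.15 promotes roughly 15% of them).
// Stand-alone analogue; timestamps are plain doubles, names local to the sketch:
#include <cstdlib>

namespace sketch {
double next_deepscrub_deadline(double last_deep_scrub_stamp,
                               double pool_deep_interval,  // <= 0 means "not set"
                               double osd_deep_interval) {
  double interval = pool_deep_interval > 0.0 ? pool_deep_interval
                                             : osd_deep_interval;
  return last_deep_scrub_stamp + interval;
}

bool time_for_deep(double now, double deadline,
                   bool allow_shallow, double randomize_ratio) {
  if (now >= deadline)
    return true;                                    // overdue for a deep scrub
  if (allow_shallow)                                // same shape as the coin flip above
    return (std::rand() % 100) < randomize_ratio * 100;
  return false;
}
} // namespace sketch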
+ + bool prevented_by_recovery = + osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery && + (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair); + + if (prevented_by_recovery) { + dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl; + return std::nullopt; + } + + upd_flags.need_auto = false; + return upd_flags; +} + +void PG::reg_next_scrub() +{ + m_scrubber->reg_next_scrub(m_planned_scrub); +} + +void PG::on_info_history_change() +{ + dout(20) << __func__ << dendl; + if (m_scrubber) { + m_scrubber->unreg_next_scrub(); + m_scrubber->reg_next_scrub(m_planned_scrub); + } +} + +void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) +{ + if (m_scrubber) { + m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub); + } +} + +void PG::clear_ready_to_merge() { + osd->clear_ready_to_merge(this); +} + +void PG::queue_want_pg_temp(const vector &wanted) { + osd->queue_want_pg_temp(get_pgid().pgid, wanted); +} + +void PG::clear_want_pg_temp() { + osd->remove_want_pg_temp(get_pgid().pgid); +} + +void PG::on_role_change() { + requeue_ops(waiting_for_peered); + plpg_on_role_change(); +} + +void PG::on_new_interval() +{ + dout(20) << __func__ << ": scrub flags on new interval: " << m_planned_scrub + << dendl; + projected_last_update = eversion_t(); + cancel_recovery(); +} + +epoch_t PG::oldest_stored_osdmap() { + return osd->get_superblock().oldest_map; +} + +OstreamTemp PG::get_clog_info() { + return osd->clog->info(); +} + +OstreamTemp PG::get_clog_debug() { + return osd->clog->debug(); +} + +OstreamTemp PG::get_clog_error() { + return osd->clog->error(); +} + +void PG::schedule_event_after( + PGPeeringEventRef event, + float delay) { + std::lock_guard lock(osd->recovery_request_lock); + osd->recovery_request_timer.add_event_after( + delay, + new QueuePeeringEvt( + this, + std::move(event))); +} + +void PG::request_local_background_io_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) { + osd->local_reserver.request_reservation( + pg_id, + on_grant ? new QueuePeeringEvt( + this, std::move(on_grant)) : nullptr, + priority, + on_preempt ? new QueuePeeringEvt( + this, std::move(on_preempt)) : nullptr); +} + +void PG::update_local_background_io_priority( + unsigned priority) { + osd->local_reserver.update_priority( + pg_id, + priority); +} + +void PG::cancel_local_background_io_reservation() { + osd->local_reserver.cancel_reservation( + pg_id); +} + +void PG::request_remote_recovery_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) { + osd->remote_reserver.request_reservation( + pg_id, + on_grant ? new QueuePeeringEvt( + this, std::move(on_grant)) : nullptr, + priority, + on_preempt ? 
new QueuePeeringEvt( + this, std::move(on_preempt)) : nullptr); +} + +void PG::cancel_remote_recovery_reservation() { + osd->remote_reserver.cancel_reservation( + pg_id); +} + +void PG::schedule_event_on_commit( + ObjectStore::Transaction &t, + PGPeeringEventRef on_commit) +{ + t.register_on_commit(new QueuePeeringEvt(this, on_commit)); +} + +void PG::on_activate(interval_set snaps) +{ + ceph_assert(!m_scrubber->are_callbacks_pending()); + ceph_assert(callbacks_for_degraded_object.empty()); + snap_trimq = snaps; + release_pg_backoffs(); + projected_last_update = info.last_update; +} + +void PG::on_active_exit() +{ + backfill_reserving = false; + agent_stop(); +} + +void PG::on_active_advmap(const OSDMapRef &osdmap) +{ + const auto& new_removed_snaps = osdmap->get_new_removed_snaps(); + auto i = new_removed_snaps.find(get_pgid().pool()); + if (i != new_removed_snaps.end()) { + bool bad = false; + for (auto j : i->second) { + if (snap_trimq.intersects(j.first, j.second)) { + decltype(snap_trimq) added, overlap; + added.insert(j.first, j.second); + overlap.intersection_of(snap_trimq, added); + derr << __func__ << " removed_snaps already contains " + << overlap << dendl; + bad = true; + snap_trimq.union_of(added); + } else { + snap_trimq.insert(j.first, j.second); + } + } + dout(10) << __func__ << " new removed_snaps " << i->second + << ", snap_trimq now " << snap_trimq << dendl; + ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps); + } + + const auto& new_purged_snaps = osdmap->get_new_purged_snaps(); + auto j = new_purged_snaps.find(get_pgid().pgid.pool()); + if (j != new_purged_snaps.end()) { + bool bad = false; + for (auto k : j->second) { + if (!recovery_state.get_info().purged_snaps.contains(k.first, k.second)) { + interval_set rm, overlap; + rm.insert(k.first, k.second); + overlap.intersection_of(recovery_state.get_info().purged_snaps, rm); + derr << __func__ << " purged_snaps does not contain " + << rm << ", only " << overlap << dendl; + recovery_state.adjust_purged_snaps( + [&overlap](auto &purged_snaps) { + purged_snaps.subtract(overlap); + }); + // This can currently happen in the normal (if unlikely) course of + // events. Because adding snaps to purged_snaps does not increase + // the pg version or add a pg log entry, we don't reliably propagate + // purged_snaps additions to other OSDs. + // One example: + // - purge S + // - primary and replicas update purged_snaps + // - no object updates + // - pg mapping changes, new primary on different node + // - new primary pg version == eversion_t(), so info is not + // propagated. 
+ //bad = true; + } else { + recovery_state.adjust_purged_snaps( + [&k](auto &purged_snaps) { + purged_snaps.erase(k.first, k.second); + }); + } + } + dout(10) << __func__ << " new purged_snaps " << j->second + << ", now " << recovery_state.get_info().purged_snaps << dendl; + ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps); + } +} + +void PG::queue_snap_retrim(snapid_t snap) +{ + if (!is_active() || + !is_primary()) { + dout(10) << __func__ << " snap " << snap << " - not active and primary" + << dendl; + return; + } + if (!snap_trimq.contains(snap)) { + snap_trimq.insert(snap); + snap_trimq_repeat.insert(snap); + dout(20) << __func__ << " snap " << snap + << ", trimq now " << snap_trimq + << ", repeat " << snap_trimq_repeat << dendl; + kick_snap_trim(); + } else { + dout(20) << __func__ << " snap " << snap + << " already in trimq " << snap_trimq << dendl; + } +} + +void PG::on_active_actmap() +{ + if (cct->_conf->osd_check_for_log_corruption) + check_log_for_corruption(osd->store); + + + if (recovery_state.is_active()) { + dout(10) << "Active: kicking snap trim" << dendl; + kick_snap_trim(); + } + + if (recovery_state.is_peered() && + !recovery_state.is_clean() && + !recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) && + (!recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || + recovery_state.is_degraded())) { + queue_recovery(); + } +} + +void PG::on_backfill_reserved() +{ + backfill_reserving = false; + queue_recovery(); +} + +void PG::on_backfill_canceled() +{ + if (!waiting_on_backfill.empty()) { + waiting_on_backfill.clear(); + finish_recovery_op(hobject_t::get_max()); + } +} + +void PG::on_recovery_reserved() +{ + queue_recovery(); +} + +void PG::set_not_ready_to_merge_target(pg_t pgid, pg_t src) +{ + osd->set_not_ready_to_merge_target(pgid, src); +} + +void PG::set_not_ready_to_merge_source(pg_t pgid) +{ + osd->set_not_ready_to_merge_source(pgid); +} + +void PG::set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) +{ + osd->set_ready_to_merge_target(this, lu, les, lec); +} + +void PG::set_ready_to_merge_source(eversion_t lu) +{ + osd->set_ready_to_merge_source(this, lu); +} + +void PG::send_pg_created(pg_t pgid) +{ + osd->send_pg_created(pgid); +} + +ceph::signedspan PG::get_mnow() +{ + return osd->get_mnow(); +} + +HeartbeatStampsRef PG::get_hb_stamps(int peer) +{ + return osd->get_hb_stamps(peer); +} + +void PG::schedule_renew_lease(epoch_t lpr, ceph::timespan delay) +{ + auto spgid = info.pgid; + auto o = osd; + osd->mono_timer.add_event( + delay, + [o, lpr, spgid]() { + o->queue_renew_lease(lpr, spgid); + }); +} + +void PG::queue_check_readable(epoch_t lpr, ceph::timespan delay) +{ + osd->queue_check_readable(info.pgid, lpr, delay); +} + +void PG::rebuild_missing_set_with_deletes(PGLog &pglog) +{ + pglog.rebuild_missing_set_with_deletes( + osd->store, + ch, + recovery_state.get_info()); +} + +void PG::on_activate_committed() +{ + if (!is_primary()) { + // waiters + if (recovery_state.needs_flush() == 0) { + requeue_ops(waiting_for_peered); + } else if (!waiting_for_peered.empty()) { + dout(10) << __func__ << " flushes in progress, moving " + << waiting_for_peered.size() << " items to waiting_for_flush" + << dendl; + ceph_assert(waiting_for_flush.empty()); + waiting_for_flush.swap(waiting_for_peered); + } + } +} + +// Compute pending backfill data +static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes) +{ + lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " + << (local_bytes >> 10) 
<< "KiB" + << " primary usage " << (bf_bytes >> 10) + << "KiB" << dendl; + + return std::max((int64_t)0, bf_bytes - local_bytes); +} + + +// We can zero the value of primary num_bytes as just an atomic. +// However, setting above zero reserves space for backfill and requires +// the OSDService::stat_lock which protects all OSD usage +bool PG::try_reserve_recovery_space( + int64_t primary_bytes, int64_t local_bytes) { + // Use tentative_bacfill_full() to make sure enough + // space is available to handle target bytes from primary. + + // TODO: If we passed num_objects from primary we could account for + // an estimate of the metadata overhead. + + // TODO: If we had compressed_allocated and compressed_original from primary + // we could compute compression ratio and adjust accordingly. + + // XXX: There is no way to get omap overhead and this would only apply + // to whatever possibly different partition that is storing the database. + + // update_osd_stat() from heartbeat will do this on a new + // statfs using ps->primary_bytes. + uint64_t pending_adjustment = 0; + if (primary_bytes) { + // For erasure coded pool overestimate by a full stripe per object + // because we don't know how each objected rounded to the nearest stripe + if (pool.info.is_erasure()) { + primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); + primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * + info.stats.stats.sum.num_objects; + local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); + local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * + info.stats.stats.sum.num_objects; + } + pending_adjustment = pending_backfill( + cct, + primary_bytes, + local_bytes); + dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10) + << "KiB" + << " local " << (local_bytes >> 10) << "KiB" + << " pending_adjustments " << (pending_adjustment >> 10) << "KiB" + << dendl; + } + + // This lock protects not only the stats OSDService but also setting the + // pg primary_bytes. That's why we don't immediately unlock + std::lock_guard l{osd->stat_lock}; + osd_stat_t cur_stat = osd->osd_stat; + if (cct->_conf->osd_debug_reject_backfill_probability > 0 && + (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) { + dout(10) << "backfill reservation rejected: failure injection" + << dendl; + return false; + } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation && + osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) { + dout(10) << "backfill reservation rejected: backfill full" + << dendl; + return false; + } else { + // Don't reserve space if skipped reservation check, this is used + // to test the other backfill full check AND in case a corruption + // of num_bytes requires ignoring that value and trying the + // backfill anyway. + if (primary_bytes && + !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { + primary_num_bytes.store(primary_bytes); + local_num_bytes.store(local_bytes); + } else { + unreserve_recovery_space(); + } + return true; + } +} + +void PG::unreserve_recovery_space() { + primary_num_bytes.store(0); + local_num_bytes.store(0); +} + +void PG::_scan_rollback_obs(const vector &rollback_obs) +{ + ObjectStore::Transaction t; + eversion_t trimmed_to = recovery_state.get_last_rollback_info_trimmed_to_applied(); + for (vector::const_iterator i = rollback_obs.begin(); + i != rollback_obs.end(); + ++i) { + if (i->generation < trimmed_to.version) { + dout(10) << __func__ << "osd." 
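// --- Illustrative sketch (editor's note, not part of the patched file) ---
// try_reserve_recovery_space() above deliberately overestimates the bytes an
// erasure-coded backfill target will need: the logical bytes are divided by
// the number of data chunks (k) and one full stripe chunk is added per
// object, because per-object rounding to the stripe size is unknown. The
// reservation is then the shortfall between the primary's estimate and what
// is already stored locally. Stand-alone analogue, names local to the sketch:
#include <algorithm>
#include <cstdint>

namespace sketch {
int64_t ec_shard_bytes(int64_t pool_bytes, int64_t num_objects,
                       int data_chunks /* k */, int64_t stripe_chunk_size) {
  return pool_bytes / data_chunks + stripe_chunk_size * num_objects;
}

int64_t pending_backfill(int64_t primary_shard_bytes, int64_t local_shard_bytes) {
  // bytes still expected to arrive on this shard during backfill
  return std::max<int64_t>(0, primary_shard_bytes - local_shard_bytes);
}
} // namespace sketch
// e.g. a 4+2 pool holding 1 GiB of logical data in 10000 objects with a
// 64 KiB stripe chunk estimates roughly 1 GiB/4 + 10000 * 64 KiB ~= 881 MiB
// per backfill-target shard, minus whatever that shard already stores.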
<< osd->whoami + << " pg " << info.pgid + << " found obsolete rollback obj " + << *i << " generation < trimmed_to " + << trimmed_to + << "...repaired" << dendl; + t.remove(coll, *i); + } + } + if (!t.empty()) { + derr << __func__ << ": queueing trans to clean up obsolete rollback objs" + << dendl; + osd->store->queue_transaction(ch, std::move(t), NULL); + } +} + + +void PG::_repair_oinfo_oid(ScrubMap &smap) +{ + for (map::reverse_iterator i = smap.objects.rbegin(); + i != smap.objects.rend(); + ++i) { + const hobject_t &hoid = i->first; + ScrubMap::object &o = i->second; + + bufferlist bl; + if (o.attrs.find(OI_ATTR) == o.attrs.end()) { + continue; + } + bl.push_back(o.attrs[OI_ATTR]); + object_info_t oi; + try { + oi.decode(bl); + } catch(...) { + continue; + } + if (oi.soid != hoid) { + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); + osd->clog->error() << "osd." << osd->whoami + << " found object info error on pg " + << info.pgid + << " oid " << hoid << " oid in object info: " + << oi.soid + << "...repaired"; + // Fix object info + oi.soid = hoid; + bl.clear(); + encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + bufferptr bp(bl.c_str(), bl.length()); + o.attrs[OI_ATTR] = bp; + + t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl); + int r = osd->store->queue_transaction(ch, std::move(t)); + if (r != 0) { + derr << __func__ << ": queue_transaction got " << cpp_strerror(r) + << dendl; + } + } + } +} + +void PG::repair_object( + const hobject_t &soid, + const list > &ok_peers, + const set &bad_peers) +{ + set ok_shards; + for (auto &&peer: ok_peers) ok_shards.insert(peer.second); + + dout(10) << "repair_object " << soid + << " bad_peers osd.{" << bad_peers << "}," + << " ok_peers osd.{" << ok_shards << "}" << dendl; + + const ScrubMap::object &po = ok_peers.back().first; + eversion_t v; + object_info_t oi; + try { + bufferlist bv; + if (po.attrs.count(OI_ATTR)) { + bv.push_back(po.attrs.find(OI_ATTR)->second); + } + auto bliter = bv.cbegin(); + decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": Need version of replica, bad object_info_t: " + << soid << dendl; + ceph_abort(); + } + + if (bad_peers.count(get_primary())) { + // We should only be scrubbing if the PG is clean. + ceph_assert(waiting_for_unreadable_object.empty()); + dout(10) << __func__ << ": primary = " << get_primary() << dendl; + } + + /* No need to pass ok_peers, they must not be missing the object, so + * force_object_missing will add them to missing_loc anyway */ + recovery_state.force_object_missing(bad_peers, soid, oi.version); +} + +void PG::forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc) +{ + dout(20) << __func__ << ": " << desc << " queued at: " << epoch_queued << dendl; + ceph_assert(m_scrubber); + if (is_active()) { + ((*m_scrubber).*fn)(epoch_queued); + } else { + // pg might be in the process of being deleted + dout(5) << __func__ << " refusing to forward. " << (is_clean() ? "(clean) " : "(not clean) ") << + (is_active() ? 
"(active) " : "(not active) ") << dendl; + } +} + +void PG::forward_scrub_event(ScrubSafeAPI fn, + epoch_t epoch_queued, + Scrub::act_token_t act_token, + std::string_view desc) +{ + dout(20) << __func__ << ": " << desc << " queued: " << epoch_queued + << " token: " << act_token << dendl; + ceph_assert(m_scrubber); + if (is_active()) { + ((*m_scrubber).*fn)(epoch_queued, act_token); + } else { + // pg might be in the process of being deleted + dout(5) << __func__ << " refusing to forward. " + << (is_clean() ? "(clean) " : "(not clean) ") + << (is_active() ? "(active) " : "(not active) ") << dendl; + } +} + +void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle) +{ + dout(10) << __func__ << " (op)" << dendl; + ceph_assert(m_scrubber); + m_scrubber->replica_scrub_op(op); +} + +void PG::replica_scrub(epoch_t epoch_queued, + Scrub::act_token_t act_token, + [[maybe_unused]] ThreadPool::TPHandle& handle) +{ + dout(10) << __func__ << " queued at: " << epoch_queued + << (is_primary() ? " (primary)" : " (replica)") << dendl; + forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, act_token, + "StartReplica/nw"); +} + +bool PG::ops_blocked_by_scrub() const +{ + return !waiting_for_scrub.empty(); +} + +Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const +{ + return waiting_for_scrub.empty() ? Scrub::scrub_prio_t::low_priority + : Scrub::scrub_prio_t::high_priority; +} + +bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) +{ + if (auto last_reset = get_last_peering_reset(); + last_reset > reply_epoch || last_reset > query_epoch) { + dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " + << query_epoch << " last_peering_reset " << last_reset << dendl; + return true; + } + return false; +} + +struct FlushState { + PGRef pg; + epoch_t epoch; + FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {} + ~FlushState() { + std::scoped_lock l{*pg}; + if (!pg->pg_has_reset_since(epoch)) { + pg->recovery_state.complete_flush(); + } + } +}; +typedef std::shared_ptr FlushStateRef; + +void PG::start_flush_on_transaction(ObjectStore::Transaction &t) +{ + // flush in progress ops + FlushStateRef flush_trigger (std::make_shared( + this, get_osdmap_epoch())); + t.register_on_applied(new ContainerContext(flush_trigger)); + t.register_on_commit(new ContainerContext(flush_trigger)); +} + +bool PG::try_flush_or_schedule_async() +{ + Context *c = new QueuePeeringEvt( + this, get_osdmap_epoch(), PeeringState::IntervalFlush()); + if (!ch->flush_commit(c)) { + return false; + } else { + delete c; + return true; + } +} + +ostream& operator<<(ostream& out, const PG& pg) +{ + out << pg.recovery_state; + + // listing all scrub-related flags - both current and "planned next scrub" + if (pg.is_scrubbing()) { + out << *pg.m_scrubber; + } + out << pg.m_planned_scrub; + + if (pg.recovery_ops_active) + out << " rops=" << pg.recovery_ops_active; + + //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]"; + if (pg.recovery_state.have_missing()) { + out << " m=" << pg.recovery_state.get_num_missing(); + if (pg.is_primary()) { + uint64_t unfound = pg.recovery_state.get_num_unfound(); + if (unfound) + out << " u=" << unfound; + } + } + if (!pg.is_clean()) { + out << " mbc=" << pg.recovery_state.get_missing_by_count(); + } + if (!pg.snap_trimq.empty()) { + out << " trimq="; + // only show a count if the set is large + if (pg.snap_trimq.num_intervals() > 16) { + out << pg.snap_trimq.size(); + if (!pg.snap_trimq_repeat.empty()) { + out << "(" << 
pg.snap_trimq_repeat.size() << ")"; + } + } else { + out << pg.snap_trimq; + if (!pg.snap_trimq_repeat.empty()) { + out << "(" << pg.snap_trimq_repeat << ")"; + } + } + } + if (!pg.recovery_state.get_info().purged_snaps.empty()) { + out << " ps="; // snap trim queue / purged snaps + if (pg.recovery_state.get_info().purged_snaps.num_intervals() > 16) { + out << pg.recovery_state.get_info().purged_snaps.size(); + } else { + out << pg.recovery_state.get_info().purged_snaps; + } + } + + out << "]"; + return out; +} + +bool PG::can_discard_op(OpRequestRef& op) +{ + auto m = op->get_req(); + if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) { + dout(20) << " discard " << *m << dendl; + return true; + } + + if (m->get_map_epoch() < info.history.same_primary_since) { + dout(7) << " changed after " << m->get_map_epoch() + << ", dropping " << *m << dendl; + return true; + } + + if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + !is_primary() && + m->get_map_epoch() < info.history.same_interval_since) { + // Note: the Objecter will resend on interval change without the primary + // changing if it actually sent to a replica. If the primary hasn't + // changed since the send epoch, we got it, and we're primary, it won't + // have resent even if the interval did change as it sent it to the primary + // (us). + return true; + } + + + if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) { + // >= luminous client + if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) { + // >= nautilus client + if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) { + dout(7) << __func__ << " sent before last_force_op_resend " + << pool.info.last_force_op_resend + << ", dropping" << *m << dendl; + return true; + } + } else { + // == < nautilus client (luminous or mimic) + if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) { + dout(7) << __func__ << " sent before last_force_op_resend_prenautilus " + << pool.info.last_force_op_resend_prenautilus + << ", dropping" << *m << dendl; + return true; + } + } + if (m->get_map_epoch() < info.history.last_epoch_split) { + dout(7) << __func__ << " pg split in " + << info.history.last_epoch_split << ", dropping" << dendl; + return true; + } + } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) { + // < luminous client + if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) { + dout(7) << __func__ << " sent before last_force_op_resend_preluminous " + << pool.info.last_force_op_resend_preluminous + << ", dropping" << *m << dendl; + return true; + } + } + + return false; +} + +template +bool PG::can_discard_replica_op(OpRequestRef& op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSGTYPE); + + int from = m->get_source().num(); + + // if a repop is replied after a replica goes down in a new osdmap, and + // before the pg advances to this new osdmap, the repop replies before this + // repop can be discarded by that replica OSD, because the primary resets the + // connection to it when handling the new osdmap marking it down, and also + // resets the messenger sesssion when the replica reconnects. to avoid the + // out-of-order replies, the messages from that replica should be discarded. 
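The three epoch checks that follow can be hard to read inline, so here is a minimal standalone paraphrase of the same discard rule. The helper name and the flattened parameters are illustrative only and are not part of the patch; they stand in for next_map->is_down(from), next_map->get_down_at(from), m->map_epoch and get_last_peering_reset().

    #include <cstdint>

    using epoch_t = uint32_t;   // Ceph's epoch_t is a 32-bit epoch counter

    // Illustrative condensation of PG::can_discard_replica_op():
    static bool should_discard_replica_op(bool sender_down_in_next_map,
                                          epoch_t sender_down_at,
                                          epoch_t msg_map_epoch,
                                          epoch_t last_peering_reset)
    {
      if (sender_down_in_next_map)             // sender already marked down
        return true;
      if (sender_down_at >= msg_map_epoch)     // sender went down at/after sending
        return true;
      return last_peering_reset > msg_map_epoch;  // PG re-peered since the send
    }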
+ OSDMapRef next_map = osd->get_next_osdmap(); + if (next_map->is_down(from)) { + dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl; + return true; + } + /* Mostly, this overlaps with the old_peering_msg + * condition. An important exception is pushes + * sent by replicas not in the acting set, since + * if such a replica goes down it does not cause + * a new interval. */ + if (next_map->get_down_at(from) >= m->map_epoch) { + dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl; + return true; + } + + // same pg? + // if pg changes _at all_, we reset and repeer! + if (old_peering_msg(m->map_epoch, m->map_epoch)) { + dout(10) << "can_discard_replica_op pg changed " << info.history + << " after " << m->map_epoch + << ", dropping" << dendl; + return true; + } + return false; +} + +bool PG::can_discard_scan(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_SCAN); + + if (old_peering_msg(m->map_epoch, m->query_epoch)) { + dout(10) << " got old scan, ignoring" << dendl; + return true; + } + return false; +} + +bool PG::can_discard_backfill(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL); + + if (old_peering_msg(m->map_epoch, m->query_epoch)) { + dout(10) << " got old backfill, ignoring" << dendl; + return true; + } + + return false; + +} + +bool PG::can_discard_request(OpRequestRef& op) +{ + switch (op->get_req()->get_type()) { + case CEPH_MSG_OSD_OP: + return can_discard_op(op); + case CEPH_MSG_OSD_BACKOFF: + return false; // never discard + case MSG_OSD_REPOP: + return can_discard_replica_op(op); + case MSG_OSD_PG_PUSH: + return can_discard_replica_op(op); + case MSG_OSD_PG_PULL: + return can_discard_replica_op(op); + case MSG_OSD_PG_PUSH_REPLY: + return can_discard_replica_op(op); + case MSG_OSD_REPOPREPLY: + return can_discard_replica_op(op); + case MSG_OSD_PG_RECOVERY_DELETE: + return can_discard_replica_op(op); + + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + return can_discard_replica_op(op); + + case MSG_OSD_EC_WRITE: + return can_discard_replica_op(op); + case MSG_OSD_EC_WRITE_REPLY: + return can_discard_replica_op(op); + case MSG_OSD_EC_READ: + return can_discard_replica_op(op); + case MSG_OSD_EC_READ_REPLY: + return can_discard_replica_op(op); + case MSG_OSD_REP_SCRUB: + return can_discard_replica_op(op); + case MSG_OSD_SCRUB_RESERVE: + return can_discard_replica_op(op); + case MSG_OSD_REP_SCRUBMAP: + return can_discard_replica_op(op); + case MSG_OSD_PG_UPDATE_LOG_MISSING: + return can_discard_replica_op< + MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op); + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + return can_discard_replica_op< + MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op); + + case MSG_OSD_PG_SCAN: + return can_discard_scan(op); + case MSG_OSD_PG_BACKFILL: + return can_discard_backfill(op); + case MSG_OSD_PG_BACKFILL_REMOVE: + return can_discard_replica_op(op); + } + return true; +} + +void PG::do_peering_event(PGPeeringEventRef evt, PeeringCtx &rctx) +{ + dout(10) << __func__ << ": " << evt->get_desc() << dendl; + ceph_assert(have_same_or_newer_map(evt->get_epoch_sent())); + if (old_peering_evt(evt)) { + dout(10) << "discard old " << evt->get_desc() << dendl; + } else { + recovery_state.handle_event(evt, &rctx); + } + // write_if_dirty regardless of path above to ensure we capture any work + // done by OSD::advance_pg(). 
+ write_if_dirty(rctx.transaction); +} + +void PG::queue_peering_event(PGPeeringEventRef evt) +{ + if (old_peering_evt(evt)) + return; + osd->osd->enqueue_peering_evt(info.pgid, evt); +} + +void PG::queue_null(epoch_t msg_epoch, + epoch_t query_epoch) +{ + dout(10) << "null" << dendl; + queue_peering_event( + PGPeeringEventRef(std::make_shared(msg_epoch, query_epoch, + NullEvt()))); +} + +void PG::find_unfound(epoch_t queued, PeeringCtx &rctx) +{ + /* + * if we couldn't start any recovery ops and things are still + * unfound, see if we can discover more missing object locations. + * It may be that our initial locations were bad and we errored + * out while trying to pull. + */ + if (!recovery_state.discover_all_missing(rctx)) { + string action; + if (state_test(PG_STATE_BACKFILLING)) { + auto evt = PGPeeringEventRef( + new PGPeeringEvent( + queued, + queued, + PeeringState::UnfoundBackfill())); + queue_peering_event(evt); + action = "in backfill"; + } else if (state_test(PG_STATE_RECOVERING)) { + auto evt = PGPeeringEventRef( + new PGPeeringEvent( + queued, + queued, + PeeringState::UnfoundRecovery())); + queue_peering_event(evt); + action = "in recovery"; + } else { + action = "already out of recovery/backfill"; + } + dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl; + } else { + dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl; + queue_recovery(); + } +} + +void PG::handle_advance_map( + OSDMapRef osdmap, OSDMapRef lastmap, + vector& newup, int up_primary, + vector& newacting, int acting_primary, + PeeringCtx &rctx) +{ + dout(10) << __func__ << ": " << osdmap->get_epoch() << dendl; + osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch()); + recovery_state.advance_map( + osdmap, + lastmap, + newup, + up_primary, + newacting, + acting_primary, + rctx); +} + +void PG::handle_activate_map(PeeringCtx &rctx) +{ + dout(10) << __func__ << ": " << get_osdmap()->get_epoch() + << dendl; + recovery_state.activate_map(rctx); + + requeue_map_waiters(); +} + +void PG::handle_initialize(PeeringCtx &rctx) +{ + dout(10) << __func__ << dendl; + PeeringState::Initialize evt; + recovery_state.handle_event(evt, &rctx); +} + + +void PG::handle_query_state(Formatter *f) +{ + dout(10) << "handle_query_state" << dendl; + PeeringState::QueryState q(f); + recovery_state.handle_event(q, 0); + + // This code has moved to after the close of recovery_state array. 
+ // I don't think that scrub is a recovery state + if (is_primary() && is_active() && m_scrubber && m_scrubber->is_scrub_active()) { + m_scrubber->handle_query_state(f); + } +} + +void PG::init_collection_pool_opts() +{ + auto r = osd->store->set_collection_opts(ch, pool.info.opts); + if (r < 0 && r != -EOPNOTSUPP) { + derr << __func__ << " set_collection_opts returns error:" << r << dendl; + } +} + +void PG::on_pool_change() +{ + init_collection_pool_opts(); + plpg_on_pool_change(); +} + +void PG::C_DeleteMore::complete(int r) { + ceph_assert(r == 0); + pg->lock(); + if (!pg->pg_has_reset_since(epoch)) { + pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch); + } + pg->unlock(); + delete this; +} + +std::pair PG::do_delete_work( + ObjectStore::Transaction &t, + ghobject_t _next) +{ + dout(10) << __func__ << dendl; + + { + float osd_delete_sleep = osd->osd->get_osd_delete_sleep(); + if (osd_delete_sleep > 0 && delete_needs_sleep) { + epoch_t e = get_osdmap()->get_epoch(); + PGRef pgref(this); + auto delete_requeue_callback = new LambdaContext([this, pgref, e](int r) { + dout(20) << "do_delete_work() [cb] wake up at " + << ceph_clock_now() + << ", re-queuing delete" << dendl; + std::scoped_lock locker{*this}; + delete_needs_sleep = false; + if (!pg_has_reset_since(e)) { + osd->queue_for_pg_delete(get_pgid(), e); + } + }); + + auto delete_schedule_time = ceph::real_clock::now(); + delete_schedule_time += ceph::make_timespan(osd_delete_sleep); + std::lock_guard l{osd->sleep_lock}; + osd->sleep_timer.add_event_at(delete_schedule_time, + delete_requeue_callback); + dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl; + return std::make_pair(_next, true); + } + } + + delete_needs_sleep = true; + + ghobject_t next; + + vector olist; + int max = std::min(osd->store->get_ideal_list_max(), + (int)cct->_conf->osd_target_transaction_size); + + osd->store->collection_list( + ch, + _next, + ghobject_t::get_max(), + max, + &olist, + &next); + dout(20) << __func__ << " " << olist << dendl; + + // make sure we've removed everything + // by one more listing from the beginning + if (_next != ghobject_t() && olist.empty()) { + next = ghobject_t(); + osd->store->collection_list( + ch, + next, + ghobject_t::get_max(), + max, + &olist, + &next); + if (!olist.empty()) { + for (auto& oid : olist) { + if (oid == pgmeta_oid) { + dout(20) << __func__ << " removing pgmeta object " << oid << dendl; + } else { + dout(0) << __func__ << " additional unexpected onode" + <<" new onode has appeared since PG removal started" + << oid << dendl; + } + } + } + } + + OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); + int64_t num = 0; + for (auto& oid : olist) { + if (oid == pgmeta_oid) { + continue; + } + if (oid.is_pgmeta()) { + osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid + << " during PG removal"; + } + int r = snap_mapper.remove_oid(oid.hobj, &_t); + if (r != 0 && r != -ENOENT) { + ceph_abort(); + } + t.remove(coll, oid); + ++num; + } + bool running = true; + if (num) { + dout(20) << __func__ << " deleting " << num << " objects" << dendl; + Context *fin = new C_DeleteMore(this, get_osdmap_epoch()); + t.register_on_commit(fin); + } else { + if (cct->_conf->osd_inject_failure_on_pg_removal) { + _exit(1); + } + + // final flush here to ensure completions drop refs. Of particular concern + // are the SnapMapper ContainerContexts. 
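In the branch below, the transaction that removes the collection is tagged with contexts that do nothing except hold a PGRef, so the PG cannot be destroyed before the removal is applied and committed. A generic sketch of that keep-alive idiom, with hypothetical names (the helper the patch itself uses here is ContainerContext):

    #include <utility>

    // Keep-alive idiom: the completion object owns a strong reference and only
    // releases it when the completion itself is destroyed after the I/O finishes.
    template <typename Ref>
    struct HoldUntilComplete {
      Ref held;
      explicit HoldUntilComplete(Ref r) : held(std::move(r)) {}
      void finish(int /*result*/) {
        // nothing to do; 'held' is dropped when this object is deleted
      }
    };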
+ { + PGRef pgref(this); + PGLog::clear_info_log(info.pgid, &t); + t.remove_collection(coll); + t.register_on_commit(new ContainerContext(pgref)); + t.register_on_applied(new ContainerContext(pgref)); + osd->store->queue_transaction(ch, std::move(t)); + } + ch->flush(); + + if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) { + dout(1) << __func__ << " raced with merge, reinstantiating" << dendl; + ch = osd->store->create_new_collection(coll); + create_pg_collection(t, + info.pgid, + info.pgid.get_split_bits(pool.info.get_pg_num())); + init_pg_ondisk(t, info.pgid, &pool.info); + recovery_state.reset_last_persisted(); + } else { + recovery_state.set_delete_complete(); + + // cancel reserver here, since the PG is about to get deleted and the + // exit() methods don't run when that happens. + osd->local_reserver.cancel_reservation(info.pgid); + + running = false; + } + } + return {next, running}; +} + +int PG::pg_stat_adjust(osd_stat_t *ns) +{ + osd_stat_t &new_stat = *ns; + if (is_primary()) { + return 0; + } + // Adjust the kb_used by adding pending backfill data + uint64_t reserved_num_bytes = get_reserved_num_bytes(); + + // For now we don't consider projected space gains here + // I suggest we have an optional 2 pass backfill that frees up + // space in a first pass. This could be triggered when at nearfull + // or near to backfillfull. + if (reserved_num_bytes > 0) { + // TODO: Handle compression by adjusting by the PGs average + // compression precentage. + dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB" + << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl; + if (new_stat.statfs.available > reserved_num_bytes) + new_stat.statfs.available -= reserved_num_bytes; + else + new_stat.statfs.available = 0; + dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl; + return 1; + } + return 0; +} + +void PG::dump_pgstate_history(Formatter *f) +{ + std::scoped_lock l{*this}; + recovery_state.dump_history(f); +} + +void PG::dump_missing(Formatter *f) +{ + for (auto& i : recovery_state.get_pg_log().get_missing().get_items()) { + f->open_object_section("object"); + f->dump_object("oid", i.first); + f->dump_object("missing_info", i.second); + if (recovery_state.get_missing_loc().needs_recovery(i.first)) { + f->dump_bool( + "unfound", + recovery_state.get_missing_loc().is_unfound(i.first)); + f->open_array_section("locations"); + for (auto l : recovery_state.get_missing_loc().get_locations(i.first)) { + f->dump_object("shard", l); + } + f->close_section(); + } + f->close_section(); + } +} + +void PG::get_pg_stats(std::function f) +{ + std::lock_guard l{pg_stats_publish_lock}; + if (pg_stats_publish_valid) { + f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean()); + } +} + +void PG::with_heartbeat_peers(std::function f) +{ + std::lock_guard l{heartbeat_peer_lock}; + for (auto p : heartbeat_peers) { + f(p); + } + for (auto p : probe_targets) { + f(p); + } +} + +uint64_t PG::get_min_alloc_size() const { + return osd->store->get_min_alloc_size(); +} diff --git a/src/osd/PG.h b/src/osd/PG.h new file mode 100644 index 000000000..61adae120 --- /dev/null +++ b/src/osd/PG.h @@ -0,0 +1,1341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General 
Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_PG_H +#define CEPH_PG_H + +#include +#include +#include "include/mempool.h" + +// re-include our assert to clobber boost's +#include "include/ceph_assert.h" +#include "include/common_fwd.h" + +#include "include/types.h" +#include "include/stringify.h" +#include "osd_types.h" +#include "include/xlist.h" +#include "SnapMapper.h" +#include "Session.h" +#include "common/Timer.h" + +#include "PGLog.h" +#include "OSDMap.h" +#include "messages/MOSDPGLog.h" +#include "include/str_list.h" +#include "PGBackend.h" +#include "PGPeeringEvent.h" +#include "PeeringState.h" +#include "recovery_types.h" +#include "MissingLoc.h" +#include "scrubber_common.h" + +#include "mgr/OSDPerfMetricTypes.h" + +#include +#include +#include +#include +#include + +//#define DEBUG_RECOVERY_OIDS // track std::set of recovering oids explicitly, to find counting bugs +//#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks + +class OSD; +class OSDService; +class OSDShard; +class OSDShardPGSlot; +class MOSDPGScan; +class MOSDPGBackfill; +class MOSDPGInfo; + +class PG; +struct OpRequest; +typedef OpRequest::Ref OpRequestRef; +class MOSDPGLog; +class DynamicPerfStats; +class PgScrubber; + +namespace Scrub { + class Store; + class ReplicaReservations; + class LocalReservation; + class ReservedByRemotePrimary; +} + +#ifdef PG_DEBUG_REFS +#include "common/tracked_int_ptr.hpp" + uint64_t get_with_id(PG *pg); + void put_with_id(PG *pg, uint64_t id); + typedef TrackedIntPtr PGRef; +#else + typedef boost::intrusive_ptr PGRef; +#endif + +class PGRecoveryStats { + struct per_state_info { + uint64_t enter, exit; // enter/exit counts + uint64_t events; + utime_t event_time; // time spent processing events + utime_t total_time; // total time in state + utime_t min_time, max_time; + + // cppcheck-suppress unreachableCode + per_state_info() : enter(0), exit(0), events(0) {} + }; + std::map info; + ceph::mutex lock = ceph::make_mutex("PGRecoverStats::lock"); + + public: + PGRecoveryStats() = default; + + void reset() { + std::lock_guard l(lock); + info.clear(); + } + void dump(ostream& out) { + std::lock_guard l(lock); + for (std::map::iterator p = info.begin(); p != info.end(); ++p) { + per_state_info& i = p->second; + out << i.enter << "\t" << i.exit << "\t" + << i.events << "\t" << i.event_time << "\t" + << i.total_time << "\t" + << i.min_time << "\t" << i.max_time << "\t" + << p->first << "\n"; + } + } + + void dump_formatted(ceph::Formatter *f) { + std::lock_guard l(lock); + f->open_array_section("pg_recovery_stats"); + for (std::map::iterator p = info.begin(); + p != info.end(); ++p) { + per_state_info& i = p->second; + f->open_object_section("recovery_state"); + f->dump_int("enter", i.enter); + f->dump_int("exit", i.exit); + f->dump_int("events", i.events); + f->dump_stream("event_time") << i.event_time; + f->dump_stream("total_time") << i.total_time; + f->dump_stream("min_time") << i.min_time; + f->dump_stream("max_time") << i.max_time; + std::vector states; + get_str_vec(p->first, "/", states); + f->open_array_section("nested_states"); + for (std::vector::iterator st = states.begin(); + st != states.end(); ++st) { + f->dump_string("state", *st); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + } + + void log_enter(const char *s) { + std::lock_guard l(lock); + info[s].enter++; + } + void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) 
{ + std::lock_guard l(lock); + per_state_info &i = info[s]; + i.exit++; + i.total_time += dur; + if (dur > i.max_time) + i.max_time = dur; + if (dur < i.min_time || i.min_time == utime_t()) + i.min_time = dur; + i.events += events; + i.event_time += event_dur; + } +}; + +/** PG - Replica Placement Group + * + */ + +class PG : public DoutPrefixProvider, public PeeringState::PeeringListener { + friend struct NamedState; + friend class PeeringState; + friend class PgScrubber; + friend class PrimaryLogScrub; + friend class Scrub::ReplicaReservations; + +public: + const pg_shard_t pg_whoami; + const spg_t pg_id; + + std::unique_ptr m_scrubber; + + /// flags detailing scheduling/operation characteristics of the next scrub + requested_scrub_t m_planned_scrub; + /// scrubbing state for both Primary & replicas + bool is_scrub_active() const { return m_scrubber->is_scrub_active(); } + + /// set when the scrub request is queued, and reset after scrubbing fully + /// cleaned up. + bool is_scrub_queued_or_active() const { return m_scrubber->is_queued_or_active(); } + +public: + // -- members -- + const coll_t coll; + + ObjectStore::CollectionHandle ch; + + // -- methods -- + std::ostream& gen_prefix(std::ostream& out) const override; + CephContext *get_cct() const override { + return cct; + } + unsigned get_subsys() const override { + return ceph_subsys_osd; + } + + const char* const get_current_state() const { + return recovery_state.get_current_state(); + } + + const OSDMapRef& get_osdmap() const { + ceph_assert(is_locked()); + return recovery_state.get_osdmap(); + } + + epoch_t get_osdmap_epoch() const override final { + return recovery_state.get_osdmap()->get_epoch(); + } + + PerfCounters &get_peering_perf() override; + PerfCounters &get_perf_logger() override; + void log_state_enter(const char *state) override; + void log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) override; + + void lock(bool no_lockdep = false) const; + void unlock() const; + bool is_locked() const; + + const spg_t& get_pgid() const { + return pg_id; + } + + const PGPool& get_pool() const { + return pool; + } + uint64_t get_last_user_version() const { + return info.last_user_version; + } + const pg_history_t& get_history() const { + return info.history; + } + bool get_need_up_thru() const { + return recovery_state.get_need_up_thru(); + } + epoch_t get_same_interval_since() const { + return info.history.same_interval_since; + } + + static void set_last_scrub_stamp( + utime_t t, pg_history_t &history, pg_stat_t &stats) { + stats.last_scrub_stamp = t; + history.last_scrub_stamp = t; + } + + void set_last_scrub_stamp(utime_t t) { + recovery_state.update_stats( + [=](auto &history, auto &stats) { + set_last_scrub_stamp(t, history, stats); + return true; + }); + } + + static void set_last_deep_scrub_stamp( + utime_t t, pg_history_t &history, pg_stat_t &stats) { + stats.last_deep_scrub_stamp = t; + history.last_deep_scrub_stamp = t; + } + + void set_last_deep_scrub_stamp(utime_t t) { + recovery_state.update_stats( + [=](auto &history, auto &stats) { + set_last_deep_scrub_stamp(t, history, stats); + return true; + }); + } + + bool is_deleting() const { + return recovery_state.is_deleting(); + } + bool is_deleted() const { + return recovery_state.is_deleted(); + } + bool is_nonprimary() const { + return recovery_state.is_nonprimary(); + } + bool is_primary() const { + return recovery_state.is_primary(); + } + bool pg_has_reset_since(epoch_t e) { + ceph_assert(is_locked()); + return 
recovery_state.pg_has_reset_since(e); + } + + bool is_ec_pg() const { + return recovery_state.is_ec_pg(); + } + int get_role() const { + return recovery_state.get_role(); + } + const std::vector get_acting() const { + return recovery_state.get_acting(); + } + const std::set &get_actingset() const { + return recovery_state.get_actingset(); + } + int get_acting_primary() const { + return recovery_state.get_acting_primary(); + } + pg_shard_t get_primary() const { + return recovery_state.get_primary(); + } + const std::vector get_up() const { + return recovery_state.get_up(); + } + int get_up_primary() const { + return recovery_state.get_up_primary(); + } + const PastIntervals& get_past_intervals() const { + return recovery_state.get_past_intervals(); + } + bool is_acting_recovery_backfill(pg_shard_t osd) const { + return recovery_state.is_acting_recovery_backfill(osd); + } + const std::set &get_acting_recovery_backfill() const { + return recovery_state.get_acting_recovery_backfill(); + } + bool is_acting(pg_shard_t osd) const { + return recovery_state.is_acting(osd); + } + bool is_up(pg_shard_t osd) const { + return recovery_state.is_up(osd); + } + static bool has_shard(bool ec, const std::vector& v, pg_shard_t osd) { + return PeeringState::has_shard(ec, v, osd); + } + + /// initialize created PG + void init( + int role, + const std::vector& up, + int up_primary, + const std::vector& acting, + int acting_primary, + const pg_history_t& history, + const PastIntervals& pim, + bool backfill, + ObjectStore::Transaction &t); + + /// read existing pg state off disk + void read_state(ObjectStore *store); + static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch); + + static int get_latest_struct_v() { + return pg_latest_struct_v; + } + static int get_compat_struct_v() { + return pg_compat_struct_v; + } + static int read_info( + ObjectStore *store, spg_t pgid, const coll_t &coll, + pg_info_t &info, PastIntervals &past_intervals, + __u8 &); + static bool _has_removal_flag(ObjectStore *store, spg_t pgid); + + void rm_backoff(const ceph::ref_t& b); + + void update_snap_mapper_bits(uint32_t bits) { + snap_mapper.update_bits(bits); + } + void start_split_stats(const std::set& childpgs, std::vector *v); + virtual void split_colls( + spg_t child, + int split_bits, + int seed, + const pg_pool_t *pool, + ObjectStore::Transaction &t) = 0; + void split_into(pg_t child_pgid, PG *child, unsigned split_bits); + void merge_from(std::map& sources, PeeringCtx &rctx, + unsigned split_bits, + const pg_merge_meta_t& last_pg_merge_meta); + void finish_split_stats(const object_stat_sum_t& stats, + ObjectStore::Transaction &t); + + void scrub(epoch_t queued, ThreadPool::TPHandle& handle) + { + // a new scrub + forward_scrub_event(&ScrubPgIF::initiate_regular_scrub, queued, "StartScrub"sv); + } + + /** + * a special version of PG::scrub(), which: + * - is initiated after repair, and + * (not true anymore:) + * - is not required to allocate local/remote OSD scrub resources + */ + void recovery_scrub(epoch_t queued, ThreadPool::TPHandle& handle) + { + // a new scrub + forward_scrub_event(&ScrubPgIF::initiate_scrub_after_repair, queued, + "AfterRepairScrub"sv); + } + + void replica_scrub(epoch_t queued, + Scrub::act_token_t act_token, + ThreadPool::TPHandle& handle); + + void replica_scrub_resched(epoch_t queued, + Scrub::act_token_t act_token, + ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_sched_replica, queued, act_token, + "SchedReplica"); + } + + void 
scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_remotes_reserved, queued, "RemotesReserved"sv); + } + + void scrub_send_resources_denied(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_reservation_failure, queued, + "ReservationFailure"sv); + } + + void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_scrub_resched, queued, "InternalSchedScrub"); + } + + void scrub_send_pushes_update(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::active_pushes_notification, queued, + "ActivePushesUpd"sv); + } + + void scrub_send_applied_update(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::update_applied_notification, queued, + "UpdatesApplied"sv); + } + + void scrub_send_unblocking(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_scrub_unblock, queued, "Unblocked"sv); + } + + void scrub_send_digest_update(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::digest_update_notification, queued, "DigestUpdate"sv); + } + + void scrub_send_local_map_ready(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_local_map_done, queued, "IntLocalMapDone"sv); + } + + void scrub_send_replmaps_ready(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_replica_maps_ready, queued, "GotReplicas"sv); + } + + void scrub_send_replica_pushes(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_replica_pushes_upd, queued, + "ReplicaPushesUpd"sv); + } + + void scrub_send_maps_compared(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_maps_compared, queued, "MapsCompared"sv); + } + + void scrub_send_get_next_chunk(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_get_next_chunk, queued, "NextChunk"sv); + } + + void scrub_send_scrub_is_finished(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_scrub_is_finished, queued, "ScrubFinished"sv); + } + + void scrub_send_chunk_free(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_chunk_free, queued, "SelectedChunkFree"sv); + } + + void scrub_send_chunk_busy(epoch_t queued, ThreadPool::TPHandle& handle) + { + forward_scrub_event(&ScrubPgIF::send_chunk_busy, queued, "ChunkIsBusy"sv); + } + + void reg_next_scrub(); + + void queue_want_pg_temp(const std::vector &wanted) override; + void clear_want_pg_temp() override; + + void on_new_interval() override; + + void on_role_change() override; + virtual void plpg_on_role_change() = 0; + + void init_collection_pool_opts(); + void on_pool_change() override; + virtual void plpg_on_pool_change() = 0; + + void on_info_history_change() override; + + void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override; + + uint64_t get_snap_trimq_size() const override { + return snap_trimq.size(); + } + unsigned get_target_pg_log_entries() const override; + + void clear_publish_stats() override; + void clear_primary_state() override; + + epoch_t oldest_stored_osdmap() override; + OstreamTemp get_clog_error() override; + OstreamTemp get_clog_info() override; + OstreamTemp get_clog_debug() override; + + void schedule_event_after( + PGPeeringEventRef event, + float 
delay) override; + void request_local_background_io_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) override; + void update_local_background_io_priority( + unsigned priority) override; + void cancel_local_background_io_reservation() override; + + void request_remote_recovery_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) override; + void cancel_remote_recovery_reservation() override; + + void schedule_event_on_commit( + ObjectStore::Transaction &t, + PGPeeringEventRef on_commit) override; + + void on_active_exit() override; + + Context *on_clean() override { + if (is_active()) { + kick_snap_trim(); + } + requeue_ops(waiting_for_clean_to_primary_repair); + return finish_recovery(); + } + + void on_activate(interval_set snaps) override; + + void on_activate_committed() override; + + void on_active_actmap() override; + void on_active_advmap(const OSDMapRef &osdmap) override; + + void queue_snap_retrim(snapid_t snap); + + void on_backfill_reserved() override; + void on_backfill_canceled() override; + void on_recovery_reserved() override; + + bool is_forced_recovery_or_backfill() const { + return recovery_state.is_forced_recovery_or_backfill(); + } + + PGLog::LogEntryHandlerRef get_log_handler( + ObjectStore::Transaction &t) override { + return std::make_unique(this, &t); + } + + std::pair do_delete_work(ObjectStore::Transaction &t, + ghobject_t _next) override; + + void clear_ready_to_merge() override; + void set_not_ready_to_merge_target(pg_t pgid, pg_t src) override; + void set_not_ready_to_merge_source(pg_t pgid) override; + void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) override; + void set_ready_to_merge_source(eversion_t lu) override; + + void send_pg_created(pg_t pgid) override; + + ceph::signedspan get_mnow() override; + HeartbeatStampsRef get_hb_stamps(int peer) override; + void schedule_renew_lease(epoch_t lpr, ceph::timespan delay) override; + void queue_check_readable(epoch_t lpr, ceph::timespan delay) override; + + void rebuild_missing_set_with_deletes(PGLog &pglog) override; + + void queue_peering_event(PGPeeringEventRef evt); + void do_peering_event(PGPeeringEventRef evt, PeeringCtx &rcx); + void queue_null(epoch_t msg_epoch, epoch_t query_epoch); + void queue_flushed(epoch_t started_at); + void handle_advance_map( + OSDMapRef osdmap, OSDMapRef lastmap, + std::vector& newup, int up_primary, + std::vector& newacting, int acting_primary, + PeeringCtx &rctx); + void handle_activate_map(PeeringCtx &rctx); + void handle_initialize(PeeringCtx &rxcx); + void handle_query_state(ceph::Formatter *f); + + /** + * @param ops_begun returns how many recovery ops the function started + * @returns true if any useful work was accomplished; false otherwise + */ + virtual bool start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, + uint64_t *ops_begun) = 0; + + // more work after the above, but with a PeeringCtx + void find_unfound(epoch_t queued, PeeringCtx &rctx); + + virtual void get_watchers(std::list *ls) = 0; + + void dump_pgstate_history(ceph::Formatter *f); + void dump_missing(ceph::Formatter *f); + + void get_pg_stats(std::function f); + void with_heartbeat_peers(std::function f); + + void shutdown(); + virtual void on_shutdown() = 0; + + bool get_must_scrub() const; + bool sched_scrub(); + + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const; + /// the version that refers to flags_.priority + 
unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const; +private: + // auxiliaries used by sched_scrub(): + double next_deepscrub_interval() const; + + /// should we perform deep scrub? + bool is_time_for_deep(bool allow_deep_scrub, + bool allow_scrub, + bool has_deep_errors, + const requested_scrub_t& planned) const; + + /** + * Verify the various 'next scrub' flags in m_planned_scrub against configuration + * and scrub-related timestamps. + * + * @returns an updated copy of the m_planned_flags (or nothing if no scrubbing) + */ + std::optional verify_scrub_mode() const; + + bool verify_periodic_scrub_mode(bool allow_deep_scrub, + bool try_to_auto_repair, + bool allow_regular_scrub, + bool has_deep_errors, + requested_scrub_t& planned) const; + + using ScrubAPI = void (ScrubPgIF::*)(epoch_t epoch_queued); + void forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc); + // and for events that carry a meaningful 'activation token' + using ScrubSafeAPI = void (ScrubPgIF::*)(epoch_t epoch_queued, + Scrub::act_token_t act_token); + void forward_scrub_event(ScrubSafeAPI fn, + epoch_t epoch_queued, + Scrub::act_token_t act_token, + std::string_view desc); + +public: + virtual void do_request( + OpRequestRef& op, + ThreadPool::TPHandle &handle + ) = 0; + virtual void clear_cache() = 0; + virtual int get_cache_obj_count() = 0; + + virtual void snap_trimmer(epoch_t epoch_queued) = 0; + virtual void do_command( + const std::string_view& prefix, + const cmdmap_t& cmdmap, + const ceph::buffer::list& idata, + std::function on_finish) = 0; + + virtual bool agent_work(int max) = 0; + virtual bool agent_work(int max, int agent_flush_quota) = 0; + virtual void agent_stop() = 0; + virtual void agent_delay() = 0; + virtual void agent_clear() = 0; + virtual void agent_choose_mode_restart() = 0; + + struct C_DeleteMore : public Context { + PGRef pg; + epoch_t epoch; + C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {} + void finish(int r) override { + ceph_abort(); + } + void complete(int r) override; + }; + + void _delete_some(ObjectStore::Transaction *t); + + virtual void set_dynamic_perf_stats_queries( + const std::list &queries) { + } + virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) { + } + + uint64_t get_min_alloc_size() const; + + // reference counting +#ifdef PG_DEBUG_REFS + uint64_t get_with_id(); + void put_with_id(uint64_t); + void dump_live_ids(); +#endif + void get(const char* tag); + void put(const char* tag); + int get_num_ref() { + return ref; + } + + // ctor + PG(OSDService *o, OSDMapRef curmap, + const PGPool &pool, spg_t p); + ~PG() override; + + // prevent copying + explicit PG(const PG& rhs) = delete; + PG& operator=(const PG& rhs) = delete; + +protected: + // ------------- + // protected + OSDService *osd; +public: + OSDShard *osd_shard = nullptr; + OSDShardPGSlot *pg_slot = nullptr; +protected: + CephContext *cct; + + // locking and reference counting. + // I destroy myself when the reference count hits zero. + // lock() should be called before doing anything. + // get() should be called on pointer copy (to another thread, etc.). + // put() should be called on destruction of some previously copied pointer. + // unlock() when done with the current pointer (_most common_). 
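The comment above describes the intended call pattern. A hypothetical helper, shown only to illustrate that pattern (it mirrors how FlushState and do_delete_work() already lock the PG through std::scoped_lock):

    #include <mutex>

    // Hold a PGRef while the pointer is shared, and hold the PG mutex while
    // touching its state; both are released when the helper returns.
    static void with_pg_locked(PGRef pg)
    {
      std::scoped_lock l{*pg};      // PG::lock() / PG::unlock()
      if (!pg->is_deleted()) {
        // ... safe to inspect or modify PG state here ...
      }
    }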
+ mutable ceph::mutex _lock = ceph::make_mutex("PG::_lock"); +#ifndef CEPH_DEBUG_MUTEX + mutable std::thread::id locked_by; +#endif + std::atomic ref{0}; + +#ifdef PG_DEBUG_REFS + ceph::mutex _ref_id_lock = ceph::make_mutex("PG::_ref_id_lock"); + std::map _live_ids; + std::map _tag_counts; + uint64_t _ref_id = 0; + + friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); } + friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); } +#endif + +private: + friend void intrusive_ptr_add_ref(PG *pg) { + pg->get("intptr"); + } + friend void intrusive_ptr_release(PG *pg) { + pg->put("intptr"); + } + + + // ===================== + +protected: + OSDriver osdriver; + SnapMapper snap_mapper; + + virtual PGBackend *get_pgbackend() = 0; + virtual const PGBackend* get_pgbackend() const = 0; + +protected: + void requeue_map_waiters(); + +protected: + + ZTracer::Endpoint trace_endpoint; + + +protected: + __u8 info_struct_v = 0; + void upgrade(ObjectStore *store); + +protected: + ghobject_t pgmeta_oid; + + // ------------------ + interval_set snap_trimq; + std::set snap_trimq_repeat; + + /* You should not use these items without taking their respective queue locks + * (if they have one) */ + xlist::item stat_queue_item; + bool recovery_queued; + + int recovery_ops_active; + std::set waiting_on_backfill; +#ifdef DEBUG_RECOVERY_OIDS + multiset recovering_oids; +#endif + +public: + bool dne() { return info.dne(); } + + void send_cluster_message( + int osd, MessageRef m, epoch_t epoch, bool share_map_update) override; + +protected: + epoch_t get_last_peering_reset() const { + return recovery_state.get_last_peering_reset(); + } + + /* heartbeat peers */ + void set_probe_targets(const std::set &probe_set) override; + void clear_probe_targets() override; + + ceph::mutex heartbeat_peer_lock = + ceph::make_mutex("PG::heartbeat_peer_lock"); + std::set heartbeat_peers; + std::set probe_targets; + +protected: + BackfillInterval backfill_info; + std::map peer_backfill_info; + bool backfill_reserving; + + // The primary's num_bytes and local num_bytes for this pg, only valid + // during backfill for non-primary shards. + // Both of these are adjusted for EC to reflect the on-disk bytes + std::atomic primary_num_bytes = 0; + std::atomic local_num_bytes = 0; + +public: + // Space reserved for backfill is primary_num_bytes - local_num_bytes + // Don't care that difference itself isn't atomic + uint64_t get_reserved_num_bytes() { + int64_t primary = primary_num_bytes.load(); + int64_t local = local_num_bytes.load(); + if (primary > local) + return primary - local; + else + return 0; + } + + bool is_remote_backfilling() { + return primary_num_bytes.load() > 0; + } + + bool try_reserve_recovery_space(int64_t primary, int64_t local) override; + void unreserve_recovery_space() override; + + // If num_bytes are inconsistent and local_num- goes negative + // it's ok, because it would then be ignored. + + // The value of num_bytes could be negative, + // but we don't let local_num_bytes go negative. 
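The two helpers that follow use the same retry loop to clamp the counter at zero. A minimal standalone version of that pattern, assuming nothing beyond the standard <atomic> header:

    #include <atomic>
    #include <cstdint>

    // Retry compare_exchange_weak until the clamped value is installed; on
    // failure 'prev' is reloaded with the current value, exactly as in
    // add_local_num_bytes()/sub_local_num_bytes() below.
    static void add_clamped_at_zero(std::atomic<int64_t>& counter, int64_t delta)
    {
      int64_t prev = counter.load();
      int64_t next;
      do {
        next = prev + delta;
        if (next < 0)
          next = 0;                 // never allow the counter to go negative
      } while (!counter.compare_exchange_weak(prev, next));
    }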
+ void add_local_num_bytes(int64_t num_bytes) { + if (num_bytes) { + int64_t prev_bytes = local_num_bytes.load(); + int64_t new_bytes; + do { + new_bytes = prev_bytes + num_bytes; + if (new_bytes < 0) + new_bytes = 0; + } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes)); + } + } + void sub_local_num_bytes(int64_t num_bytes) { + ceph_assert(num_bytes >= 0); + if (num_bytes) { + int64_t prev_bytes = local_num_bytes.load(); + int64_t new_bytes; + do { + new_bytes = prev_bytes - num_bytes; + if (new_bytes < 0) + new_bytes = 0; + } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes)); + } + } + // The value of num_bytes could be negative, + // but we don't let info.stats.stats.sum.num_bytes go negative. + void add_num_bytes(int64_t num_bytes) { + ceph_assert(ceph_mutex_is_locked_by_me(_lock)); + if (num_bytes) { + recovery_state.update_stats( + [num_bytes](auto &history, auto &stats) { + stats.stats.sum.num_bytes += num_bytes; + if (stats.stats.sum.num_bytes < 0) { + stats.stats.sum.num_bytes = 0; + } + return false; + }); + } + } + void sub_num_bytes(int64_t num_bytes) { + ceph_assert(ceph_mutex_is_locked_by_me(_lock)); + ceph_assert(num_bytes >= 0); + if (num_bytes) { + recovery_state.update_stats( + [num_bytes](auto &history, auto &stats) { + stats.stats.sum.num_bytes -= num_bytes; + if (stats.stats.sum.num_bytes < 0) { + stats.stats.sum.num_bytes = 0; + } + return false; + }); + } + } + + // Only used in testing so not worried about needing the PG lock here + int64_t get_stats_num_bytes() { + std::lock_guard l{_lock}; + int num_bytes = info.stats.stats.sum.num_bytes; + if (pool.info.is_erasure()) { + num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); + // Round up each object by a stripe + num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects; + } + int64_t lnb = local_num_bytes.load(); + if (lnb && lnb != num_bytes) { + lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch " + << lnb << " vs stats " + << info.stats.stats.sum.num_bytes << " / chunk " + << get_pgbackend()->get_ec_data_chunk_count() + << dendl; + } + return num_bytes; + } + +protected: + + /* + * blocked request wait hierarchy + * + * In order to preserve request ordering we need to be careful about the + * order in which blocked requests get requeued. Generally speaking, we + * push the requests back up to the op_wq in reverse order (most recent + * request first) so that they come back out again in the original order. + * However, because there are multiple wait queues, we need to requeue + * waitlists in order. Generally speaking, we requeue the wait lists + * that are checked first. + * + * Here are the various wait lists, in the order they are used during + * request processing, with notes: + * + * - waiting_for_map + * - may start or stop blocking at any time (depending on client epoch) + * - waiting_for_peered + * - !is_peered() + * - only starts blocking on interval change; never restarts + * - waiting_for_flush + * - flushes_in_progress + * - waiting for final flush during activate + * - waiting_for_active + * - !is_active() + * - only starts blocking on interval change; never restarts + * - waiting_for_readable + * - now > readable_until + * - unblocks when we get fresh(er) osd_pings + * - waiting_for_scrub + * - starts and stops blocking for varying intervals during scrub + * - waiting_for_unreadable_object + * - never restarts once object is readable (* except for EIO?) 
+ * - waiting_for_degraded_object + * - never restarts once object is writeable (* except for EIO?) + * - waiting_for_blocked_object + * - starts and stops based on proxied op activity + * - obc rwlocks + * - starts and stops based on read/write activity + * + * Notes: + * + * 1. During and interval change, we requeue *everything* in the above order. + * + * 2. When an obc rwlock is released, we check for a scrub block and requeue + * the op there if it applies. We ignore the unreadable/degraded/blocked + * queues because we assume they cannot apply at that time (this is + * probably mostly true). + * + * 3. The requeue_ops helper will push ops onto the waiting_for_map std::list if + * it is non-empty. + * + * These three behaviors are generally sufficient to maintain ordering, with + * the possible exception of cases where we make an object degraded or + * unreadable that was previously okay, e.g. when scrub or op processing + * encounter an unexpected error. FIXME. + */ + + // ops with newer maps than our (or blocked behind them) + // track these by client, since inter-request ordering doesn't otherwise + // matter. + std::unordered_map> waiting_for_map; + + // ops waiting on peered + std::list waiting_for_peered; + + /// ops waiting on readble + std::list waiting_for_readable; + + // ops waiting on active (require peered as well) + std::list waiting_for_active; + std::list waiting_for_flush; + std::list waiting_for_scrub; + + std::list waiting_for_cache_not_full; + std::list waiting_for_clean_to_primary_repair; + std::map> waiting_for_unreadable_object, + waiting_for_degraded_object, + waiting_for_blocked_object; + + std::set objects_blocked_on_cache_full; + std::map objects_blocked_on_degraded_snap; + std::map objects_blocked_on_snap_promotion; + + // Callbacks should assume pg (and nothing else) is locked + std::map> callbacks_for_degraded_object; + + std::map>>> waiting_for_ondisk; + + void requeue_object_waiters(std::map>& m); + void requeue_op(OpRequestRef op); + void requeue_ops(std::list &l); + + // stats that persist lazily + object_stat_collection_t unstable_stats; + + // publish stats + ceph::mutex pg_stats_publish_lock = + ceph::make_mutex("PG::pg_stats_publish_lock"); + bool pg_stats_publish_valid; + pg_stat_t pg_stats_publish; + + friend class TestOpsSocketHook; + void publish_stats_to_osd() override; + + bool needs_recovery() const { + return recovery_state.needs_recovery(); + } + bool needs_backfill() const { + return recovery_state.needs_backfill(); + } + + bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const; + + struct PGLogEntryHandler : public PGLog::LogEntryHandler { + PG *pg; + ObjectStore::Transaction *t; + PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {} + + // LogEntryHandler + void remove(const hobject_t &hoid) override { + pg->get_pgbackend()->remove(hoid, t); + } + void try_stash(const hobject_t &hoid, version_t v) override { + pg->get_pgbackend()->try_stash(hoid, v, t); + } + void rollback(const pg_log_entry_t &entry) override { + ceph_assert(entry.can_rollback()); + pg->get_pgbackend()->rollback(entry, t); + } + void rollforward(const pg_log_entry_t &entry) override { + pg->get_pgbackend()->rollforward(entry, t); + } + void trim(const pg_log_entry_t &entry) override { + pg->get_pgbackend()->trim(entry, t); + } + }; + + void update_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid, + const std::set &snaps); + void clear_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid); + 
void remove_snap_mapped_object( + ObjectStore::Transaction& t, const hobject_t& soid); + + bool have_unfound() const { + return recovery_state.have_unfound(); + } + uint64_t get_num_unfound() const { + return recovery_state.get_num_unfound(); + } + + virtual void check_local() = 0; + + void purge_strays(); + + void update_heartbeat_peers(std::set peers) override; + + Context *finish_sync_event; + + Context *finish_recovery(); + void _finish_recovery(Context *c); + struct C_PG_FinishRecovery : public Context { + PGRef pg; + explicit C_PG_FinishRecovery(PG *p) : pg(p) {} + void finish(int r) override { + pg->_finish_recovery(this); + } + }; + void cancel_recovery(); + void clear_recovery_state(); + virtual void _clear_recovery_state() = 0; + void start_recovery_op(const hobject_t& soid); + void finish_recovery_op(const hobject_t& soid, bool dequeue=false); + + virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0; + + friend class C_OSD_RepModify_Commit; + friend struct C_DeleteMore; + + // -- backoff -- + ceph::mutex backoff_lock = // orders inside Backoff::lock + ceph::make_mutex("PG::backoff_lock"); + std::map>> backoffs; + + void add_backoff(const ceph::ref_t& s, const hobject_t& begin, const hobject_t& end); + void release_backoffs(const hobject_t& begin, const hobject_t& end); + void release_backoffs(const hobject_t& o) { + release_backoffs(o, o); + } + void clear_backoffs(); + + void add_pg_backoff(const ceph::ref_t& s) { + hobject_t begin = info.pgid.pgid.get_hobj_start(); + hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + add_backoff(s, begin, end); + } +public: + void release_pg_backoffs() { + hobject_t begin = info.pgid.pgid.get_hobj_start(); + hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + release_backoffs(begin, end); + } + + // -- scrub -- +protected: + bool scrub_after_recovery; + + int active_pushes; + + void repair_object( + const hobject_t &soid, + const std::list > &ok_peers, + const std::set &bad_peers); + + [[nodiscard]] bool ops_blocked_by_scrub() const; + [[nodiscard]] Scrub::scrub_prio_t is_scrub_blocking_ops() const; + + void _repair_oinfo_oid(ScrubMap &map); + void _scan_rollback_obs(const std::vector &rollback_obs); + /** + * returns true if [begin, end) is good to scrub at this time + * a false return value obliges the implementer to requeue scrub when the + * condition preventing scrub clears + */ + virtual bool _range_available_for_scrub( + const hobject_t &begin, const hobject_t &end) = 0; + + /** + * Initiate the process that will create our scrub map for the Primary. 
+ * (triggered by MSG_OSD_REP_SCRUB) + */ + void replica_scrub(OpRequestRef op, ThreadPool::TPHandle &handle); + + // -- recovery state -- + + struct QueuePeeringEvt : Context { + PGRef pg; + PGPeeringEventRef evt; + + template + QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) : + pg(pg), evt(std::make_shared(epoch, epoch, evt)) {} + + QueuePeeringEvt(PG *pg, PGPeeringEventRef evt) : + pg(pg), evt(std::move(evt)) {} + + void finish(int r) override { + pg->lock(); + pg->queue_peering_event(std::move(evt)); + pg->unlock(); + } + }; + + +public: + int pg_stat_adjust(osd_stat_t *new_stat); +protected: + bool delete_needs_sleep = false; + +protected: + bool state_test(uint64_t m) const { return recovery_state.state_test(m); } + void state_set(uint64_t m) { recovery_state.state_set(m); } + void state_clear(uint64_t m) { recovery_state.state_clear(m); } + + bool is_complete() const { + return recovery_state.is_complete(); + } + bool should_send_notify() const { + return recovery_state.should_send_notify(); + } + + bool is_active() const { return recovery_state.is_active(); } + bool is_activating() const { return recovery_state.is_activating(); } + bool is_peering() const { return recovery_state.is_peering(); } + bool is_down() const { return recovery_state.is_down(); } + bool is_recovery_unfound() const { return recovery_state.is_recovery_unfound(); } + bool is_backfill_unfound() const { return recovery_state.is_backfill_unfound(); } + bool is_incomplete() const { return recovery_state.is_incomplete(); } + bool is_clean() const { return recovery_state.is_clean(); } + bool is_degraded() const { return recovery_state.is_degraded(); } + bool is_undersized() const { return recovery_state.is_undersized(); } + bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } // Primary only + bool is_remapped() const { return recovery_state.is_remapped(); } + bool is_peered() const { return recovery_state.is_peered(); } + bool is_recovering() const { return recovery_state.is_recovering(); } + bool is_premerge() const { return recovery_state.is_premerge(); } + bool is_repair() const { return recovery_state.is_repair(); } + bool is_laggy() const { return state_test(PG_STATE_LAGGY); } + bool is_wait() const { return state_test(PG_STATE_WAIT); } + + bool is_empty() const { return recovery_state.is_empty(); } + + // pg on-disk state + void do_pending_flush(); + +public: + void prepare_write( + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ObjectStore::Transaction &t) override; + + void write_if_dirty(PeeringCtx &rctx) { + write_if_dirty(rctx.transaction); + } +protected: + void write_if_dirty(ObjectStore::Transaction& t) { + recovery_state.write_if_dirty(t); + } + + PGLog::IndexedLog projected_log; + bool check_in_progress_op( + const osd_reqid_t &r, + eversion_t *version, + version_t *user_version, + int *return_code, + std::vector *op_returns) const; + eversion_t projected_last_update; + eversion_t get_next_version() const { + eversion_t at_version( + get_osdmap_epoch(), + projected_last_update.version+1); + ceph_assert(at_version > info.last_update); + ceph_assert(at_version > recovery_state.get_pg_log().get_head()); + ceph_assert(at_version > projected_last_update); + return at_version; + } + + bool check_log_for_corruption(ObjectStore *store); + + std::string get_corrupt_pg_log_name() const; + + void update_snap_map( + const std::vector &log_entries, + ObjectStore::Transaction& 
t); + + void filter_snapc(std::vector &snaps); + + virtual void kick_snap_trim() = 0; + virtual void snap_trimmer_scrub_complete() = 0; + + void queue_recovery(); + void queue_scrub_after_repair(); + unsigned int get_scrub_priority(); + + bool try_flush_or_schedule_async() override; + void start_flush_on_transaction( + ObjectStore::Transaction &t) override; + + void update_history(const pg_history_t& history) { + recovery_state.update_history(history); + } + + // OpRequest queueing + bool can_discard_op(OpRequestRef& op); + bool can_discard_scan(OpRequestRef op); + bool can_discard_backfill(OpRequestRef op); + bool can_discard_request(OpRequestRef& op); + + template + bool can_discard_replica_op(OpRequestRef& op); + + bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch); + bool old_peering_evt(PGPeeringEventRef evt) { + return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested()); + } + bool have_same_or_newer_map(epoch_t e) { + return e <= get_osdmap_epoch(); + } + + bool op_has_sufficient_caps(OpRequestRef& op); + + // abstract bits + friend struct FlushState; + + friend ostream& operator<<(ostream& out, const PG& pg); + +protected: + PeeringState recovery_state; + + // ref to recovery_state.pool + const PGPool &pool; + + // ref to recovery_state.info + const pg_info_t &info; +}; + +#endif diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc new file mode 100644 index 000000000..ef2eb5381 --- /dev/null +++ b/src/osd/PGBackend.cc @@ -0,0 +1,1324 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013,2014 Inktank Storage, Inc. + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "common/errno.h" +#include "common/scrub_types.h" +#include "ReplicatedBackend.h" +#include "ScrubStore.h" +#include "ECBackend.h" +#include "PGBackend.h" +#include "OSD.h" +#include "erasure-code/ErasureCodePlugin.h" +#include "OSDMap.h" +#include "PGLog.h" +#include "common/LogClient.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +using std::list; +using std::make_pair; +using std::map; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::stringstream; +using std::vector; + +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::ErasureCodeProfile; +using ceph::ErasureCodeInterfaceRef; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) { + return pgb->get_parent()->gen_dbg_prefix(*_dout); +} + +void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v, + RecoveryHandle *h) +{ + ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0); + for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { + if (shard == get_parent()->whoami_shard()) + continue; + if (get_parent()->get_shard_missing(shard).is_missing(oid)) { + dout(20) << __func__ << " will remove " << oid << " " << v << " from " + << shard << dendl; + h->deletes[shard].push_back(make_pair(oid, v)); + get_parent()->begin_peer_recover(shard, oid); + } + } +} + +void PGBackend::send_recovery_deletes(int prio, + const map > > &deletes) +{ + epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch(); + for (const auto& p : deletes) { + const auto& shard = p.first; + const auto& objects = p.second; + ConnectionRef con = get_parent()->get_con_osd_cluster( + shard.osd, + get_osdmap_epoch()); + if (!con) + continue; + auto it = objects.begin(); + while (it != objects.end()) { + uint64_t cost = 0; + uint64_t deletes = 0; + spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard); + MOSDPGRecoveryDelete *msg = + new MOSDPGRecoveryDelete(get_parent()->whoami_shard(), + target_pg, + get_osdmap_epoch(), + min_epoch); + msg->set_priority(prio); + + while (it != objects.end() && + cost < cct->_conf->osd_max_push_cost && + deletes < cct->_conf->osd_max_push_objects) { + dout(20) << __func__ << ": sending recovery delete << " << it->first + << " " << it->second << " to osd." 
<< shard << dendl; + msg->objects.push_back(*it); + cost += cct->_conf->osd_push_per_object_cost; + ++deletes; + ++it; + } + + msg->set_cost(cost); + get_parent()->send_message_osd_cluster(msg, con); + } + } +} + +bool PGBackend::handle_message(OpRequestRef op) +{ + switch (op->get_req()->get_type()) { + case MSG_OSD_PG_RECOVERY_DELETE: + handle_recovery_delete(op); + return true; + + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + handle_recovery_delete_reply(op); + return true; + + default: + break; + } + + return _handle_message(op); +} + +void PGBackend::handle_recovery_delete(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE); + dout(20) << __func__ << " " << op << dendl; + + op->mark_started(); + + C_GatherBuilder gather(cct); + for (const auto &p : m->objects) { + get_parent()->remove_missing_object(p.first, p.second, gather.new_sub()); + } + + auto reply = make_message(); + reply->from = get_parent()->whoami_shard(); + reply->set_priority(m->get_priority()); + reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard); + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->objects = m->objects; + ConnectionRef conn = m->get_connection(); + + gather.set_finisher(new LambdaContext( + [=](int r) { + if (r != -EAGAIN) { + get_parent()->send_message_osd_cluster(reply, conn.get()); + } + })); + gather.activate(); +} + +void PGBackend::handle_recovery_delete_reply(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY); + dout(20) << __func__ << " " << op << dendl; + + for (const auto &p : m->objects) { + ObjectRecoveryInfo recovery_info; + hobject_t oid = p.first; + recovery_info.version = p.second; + get_parent()->on_peer_recover(m->from, oid, recovery_info); + bool peers_recovered = true; + for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { + if (shard == get_parent()->whoami_shard()) + continue; + if (get_parent()->get_shard_missing(shard).is_missing(oid)) { + dout(20) << __func__ << " " << oid << " still missing on at least " + << shard << dendl; + peers_recovered = false; + break; + } + } + if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) { + dout(20) << __func__ << " completed recovery, local_missing = " + << get_parent()->get_local_missing() << dendl; + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + get_parent()->on_global_recover(p.first, stat_diff, true); + } + } +} + +void PGBackend::rollback( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t) +{ + + struct RollbackVisitor : public ObjectModDesc::Visitor { + const hobject_t &hoid; + PGBackend *pg; + ObjectStore::Transaction t; + RollbackVisitor( + const hobject_t &hoid, + PGBackend *pg) : hoid(hoid), pg(pg) {} + void append(uint64_t old_size) override { + ObjectStore::Transaction temp; + pg->rollback_append(hoid, old_size, &temp); + temp.append(t); + temp.swap(t); + } + void setattrs(map > &attrs) override { + ObjectStore::Transaction temp; + pg->rollback_setattrs(hoid, attrs, &temp); + temp.append(t); + temp.swap(t); + } + void rmobject(version_t old_version) override { + ObjectStore::Transaction temp; + pg->rollback_stash(hoid, old_version, &temp); + temp.append(t); + temp.swap(t); + } + void try_rmobject(version_t old_version) override { + ObjectStore::Transaction temp; + pg->rollback_try_stash(hoid, old_version, &temp); + temp.append(t); + temp.swap(t); + } + void create() override { + 
ObjectStore::Transaction temp; + pg->rollback_create(hoid, &temp); + temp.append(t); + temp.swap(t); + } + void update_snaps(const set &snaps) override { + ObjectStore::Transaction temp; + pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp); + temp.append(t); + temp.swap(t); + } + void rollback_extents( + version_t gen, + const vector > &extents) override { + ObjectStore::Transaction temp; + pg->rollback_extents(gen, extents, hoid, &temp); + temp.append(t); + temp.swap(t); + } + }; + + ceph_assert(entry.mod_desc.can_rollback()); + RollbackVisitor vis(entry.soid, this); + entry.mod_desc.visit(&vis); + t->append(vis.t); +} + +struct Trimmer : public ObjectModDesc::Visitor { + const hobject_t &soid; + PGBackend *pg; + ObjectStore::Transaction *t; + Trimmer( + const hobject_t &soid, + PGBackend *pg, + ObjectStore::Transaction *t) + : soid(soid), pg(pg), t(t) {} + void rmobject(version_t old_version) override { + pg->trim_rollback_object( + soid, + old_version, + t); + } + // try_rmobject defaults to rmobject + void rollback_extents( + version_t gen, + const vector > &extents) override { + pg->trim_rollback_object( + soid, + gen, + t); + } +}; + +void PGBackend::rollforward( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t) +{ + auto dpp = get_parent()->get_dpp(); + ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl; + if (!entry.can_rollback()) + return; + Trimmer trimmer(entry.soid, this, t); + entry.mod_desc.visit(&trimmer); +} + +void PGBackend::trim( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t) +{ + if (!entry.can_rollback()) + return; + Trimmer trimmer(entry.soid, this, t); + entry.mod_desc.visit(&trimmer); +} + +void PGBackend::try_stash( + const hobject_t &hoid, + version_t v, + ObjectStore::Transaction *t) +{ + t->try_rename( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ghobject_t(hoid, v, get_parent()->whoami_shard().shard)); +} + +void PGBackend::remove( + const hobject_t &hoid, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + get_parent()->pgb_clear_object_snap_mapping(hoid, t); +} + +void PGBackend::on_change_cleanup(ObjectStore::Transaction *t) +{ + dout(10) << __func__ << dendl; + // clear temp + for (set::iterator i = temp_contents.begin(); + i != temp_contents.end(); + ++i) { + dout(10) << __func__ << ": Removing oid " + << *i << " from the temp collection" << dendl; + t->remove( + coll, + ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + } + temp_contents.clear(); +} + +int PGBackend::objects_list_partial( + const hobject_t &begin, + int min, + int max, + vector *ls, + hobject_t *next) +{ + ceph_assert(ls); + // Starts with the smallest generation to make sure the result list + // has the marker object (it might have multiple generations + // though, which would be filtered). 
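+  // Roughly: the loop below pages through the collection from `begin`, asking
+  // the store for up to (max - ls->size()) entries per pass, until either the
+  // store reports the end (_next.is_max()) or at least `min` head objects have
+  // been collected; pgmeta and temp objects are skipped and only no-gen
+  // entries are returned.  A hypothetical caller (names here are illustrative,
+  // not from this file) would resume from *next, e.g.:
+  //
+  //   hobject_t pos, next;                  // default hobject_t is the min object
+  //   std::vector<hobject_t> batch;
+  //   do {
+  //     batch.clear();
+  //     if (backend->objects_list_partial(pos, 64, 256, &batch, &next) < 0)
+  //       break;                            // storage error
+  //     /* consume batch ... */
+  //     pos = next;
+  //   } while (!next.is_max());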
+ ghobject_t _next; + if (!begin.is_min()) + _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard); + ls->reserve(max); + int r = 0; + + if (min > max) + min = max; + + while (!_next.is_max() && ls->size() < (unsigned)min) { + vector objects; + if (HAVE_FEATURE(parent->min_upacting_features(), + OSD_FIXED_COLLECTION_LIST)) { + r = store->collection_list( + ch, + _next, + ghobject_t::get_max(), + max - ls->size(), + &objects, + &_next); + } else { + r = store->collection_list_legacy( + ch, + _next, + ghobject_t::get_max(), + max - ls->size(), + &objects, + &_next); + } + if (r != 0) { + derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl; + break; + } + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (i->is_pgmeta() || i->hobj.is_temp()) { + continue; + } + if (i->is_no_gen()) { + ls->push_back(i->hobj); + } + } + } + if (r == 0) + *next = _next.hobj; + return r; +} + +int PGBackend::objects_list_range( + const hobject_t &start, + const hobject_t &end, + vector *ls, + vector *gen_obs) +{ + ceph_assert(ls); + vector objects; + int r; + if (HAVE_FEATURE(parent->min_upacting_features(), + OSD_FIXED_COLLECTION_LIST)) { + r = store->collection_list( + ch, + ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + INT_MAX, + &objects, + NULL); + } else { + r = store->collection_list_legacy( + ch, + ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + INT_MAX, + &objects, + NULL); + } + ls->reserve(objects.size()); + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (i->is_pgmeta() || i->hobj.is_temp()) { + continue; + } + if (i->is_no_gen()) { + ls->push_back(i->hobj); + } else if (gen_obs) { + gen_obs->push_back(*i); + } + } + return r; +} + +int PGBackend::objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out) +{ + bufferptr bp; + int r = store->getattr( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + attr.c_str(), + bp); + if (r >= 0 && out) { + out->clear(); + out->push_back(std::move(bp)); + } + return r; +} + +int PGBackend::objects_get_attrs( + const hobject_t &hoid, + map *out) +{ + return store->getattrs( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + *out); +} + +void PGBackend::rollback_setattrs( + const hobject_t &hoid, + map > &old_attrs, + ObjectStore::Transaction *t) { + map to_set; + ceph_assert(!hoid.is_temp()); + for (map >::iterator i = old_attrs.begin(); + i != old_attrs.end(); + ++i) { + if (i->second) { + to_set[i->first] = *(i->second); + } else { + t->rmattr( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + i->first); + } + } + t->setattrs( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + to_set); +} + +void PGBackend::rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->truncate( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + old_size); +} + +void PGBackend::rollback_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + 
t->collection_move_rename( + coll, + ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); +} + +void PGBackend::rollback_try_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + t->try_rename( + coll, + ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); +} + +void PGBackend::rollback_extents( + version_t gen, + const vector > &extents, + const hobject_t &hoid, + ObjectStore::Transaction *t) { + auto shard = get_parent()->whoami_shard().shard; + for (auto &&extent: extents) { + t->clone_range( + coll, + ghobject_t(hoid, gen, shard), + ghobject_t(hoid, ghobject_t::NO_GEN, shard), + extent.first, + extent.second, + extent.first); + } + t->remove( + coll, + ghobject_t(hoid, gen, shard)); +} + +void PGBackend::trim_rollback_object( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard)); +} + +PGBackend *PGBackend::build_pg_backend( + const pg_pool_t &pool, + const map& profile, + Listener *l, + coll_t coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct) +{ + ErasureCodeProfile ec_profile = profile; + switch (pool.type) { + case pg_pool_t::TYPE_REPLICATED: { + return new ReplicatedBackend(l, coll, ch, store, cct); + } + case pg_pool_t::TYPE_ERASURE: { + ErasureCodeInterfaceRef ec_impl; + stringstream ss; + ceph::ErasureCodePluginRegistry::instance().factory( + profile.find("plugin")->second, + cct->_conf.get_val("erasure_code_dir"), + ec_profile, + &ec_impl, + &ss); + ceph_assert(ec_impl); + return new ECBackend( + l, + coll, + ch, + store, + cct, + ec_impl, + pool.stripe_width); + } + default: + ceph_abort(); + return NULL; + } +} + +int PGBackend::be_scan_list( + ScrubMap &map, + ScrubMapBuilder &pos) +{ + dout(10) << __func__ << " " << pos << dendl; + ceph_assert(!pos.done()); + ceph_assert(pos.pos < pos.ls.size()); + hobject_t& poid = pos.ls[pos.pos]; + + struct stat st; + int r = store->stat( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + &st, + true); + if (r == 0) { + ScrubMap::object &o = map.objects[poid]; + o.size = st.st_size; + ceph_assert(!o.negative); + store->getattrs( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + o.attrs); + + if (pos.deep) { + r = be_deep_scrub(poid, map, pos, o); + } + dout(25) << __func__ << " " << poid << dendl; + } else if (r == -ENOENT) { + dout(25) << __func__ << " " << poid << " got " << r + << ", skipping" << dendl; + } else if (r == -EIO) { + dout(25) << __func__ << " " << poid << " got " << r + << ", stat_error" << dendl; + ScrubMap::object &o = map.objects[poid]; + o.stat_error = true; + } else { + derr << __func__ << " got: " << cpp_strerror(r) << dendl; + ceph_abort(); + } + if (r == -EINPROGRESS) { + return -EINPROGRESS; + } + pos.next_object(); + return 0; +} + +bool PGBackend::be_compare_scrub_objects( + pg_shard_t auth_shard, + const ScrubMap::object &auth, + const object_info_t& auth_oi, + const ScrubMap::object &candidate, + shard_info_wrapper &shard_result, + inconsistent_obj_wrapper &obj_result, + ostream &errorstream, + bool 
has_snapset) +{ + enum { CLEAN, FOUND_ERROR } error = CLEAN; + if (auth.digest_present && candidate.digest_present) { + if (auth.digest != candidate.digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "data_digest 0x" << std::hex << candidate.digest + << " != data_digest 0x" << auth.digest << std::dec + << " from shard " << auth_shard; + obj_result.set_data_digest_mismatch(); + } + } + if (auth.omap_digest_present && candidate.omap_digest_present) { + if (auth.omap_digest != candidate.omap_digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest + << " != omap_digest 0x" << auth.omap_digest << std::dec + << " from shard " << auth_shard; + obj_result.set_omap_digest_mismatch(); + } + } + if (parent->get_pool().is_replicated()) { + if (auth_oi.is_data_digest() && candidate.digest_present) { + if (auth_oi.data_digest != candidate.digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "data_digest 0x" << std::hex << candidate.digest + << " != data_digest 0x" << auth_oi.data_digest << std::dec + << " from auth oi " << auth_oi; + shard_result.set_data_digest_mismatch_info(); + } + } + if (auth_oi.is_omap_digest() && candidate.omap_digest_present) { + if (auth_oi.omap_digest != candidate.omap_digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest + << " != omap_digest 0x" << auth_oi.omap_digest << std::dec + << " from auth oi " << auth_oi; + shard_result.set_omap_digest_mismatch_info(); + } + } + } + if (candidate.stat_error) + return error == FOUND_ERROR; + if (!shard_result.has_info_missing() + && !shard_result.has_info_corrupted()) { + bufferlist can_bl, auth_bl; + auto can_attr = candidate.attrs.find(OI_ATTR); + auto auth_attr = auth.attrs.find(OI_ATTR); + + ceph_assert(auth_attr != auth.attrs.end()); + ceph_assert(can_attr != candidate.attrs.end()); + + can_bl.push_back(can_attr->second); + auth_bl.push_back(auth_attr->second); + if (!can_bl.contents_equal(auth_bl)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + obj_result.set_object_info_inconsistency(); + errorstream << "object info inconsistent "; + } + } + if (has_snapset) { + if (!shard_result.has_snapset_missing() + && !shard_result.has_snapset_corrupted()) { + bufferlist can_bl, auth_bl; + auto can_attr = candidate.attrs.find(SS_ATTR); + auto auth_attr = auth.attrs.find(SS_ATTR); + + ceph_assert(auth_attr != auth.attrs.end()); + ceph_assert(can_attr != candidate.attrs.end()); + + can_bl.push_back(can_attr->second); + auth_bl.push_back(auth_attr->second); + if (!can_bl.contents_equal(auth_bl)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + obj_result.set_snapset_inconsistency(); + errorstream << "snapset inconsistent "; + } + } + } + if (parent->get_pool().is_erasure()) { + if (!shard_result.has_hinfo_missing() + && !shard_result.has_hinfo_corrupted()) { + bufferlist can_bl, auth_bl; + auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key()); + auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key()); + + ceph_assert(auth_hi != auth.attrs.end()); + ceph_assert(can_hi != candidate.attrs.end()); + + can_bl.push_back(can_hi->second); + auth_bl.push_back(auth_hi->second); + if (!can_bl.contents_equal(auth_bl)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + obj_result.set_hinfo_inconsistency(); + 
errorstream << "hinfo inconsistent "; + } + } + } + uint64_t oi_size = be_get_ondisk_size(auth_oi.size); + if (oi_size != candidate.size) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "size " << candidate.size + << " != size " << oi_size + << " from auth oi " << auth_oi; + shard_result.set_size_mismatch_info(); + } + if (auth.size != candidate.size) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "size " << candidate.size + << " != size " << auth.size + << " from shard " << auth_shard; + obj_result.set_size_mismatch(); + } + // If the replica is too large and we didn't already count it for this object + // + if (candidate.size > cct->_conf->osd_max_object_size + && !obj_result.has_size_too_large()) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "size " << candidate.size + << " > " << cct->_conf->osd_max_object_size + << " is too large"; + obj_result.set_size_too_large(); + } + for (map::const_iterator i = auth.attrs.begin(); + i != auth.attrs.end(); + ++i) { + // We check system keys seperately + if (i->first == OI_ATTR || i->first[0] != '_') + continue; + if (!candidate.attrs.count(i->first)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "attr name mismatch '" << i->first << "'"; + obj_result.set_attr_name_mismatch(); + } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "attr value mismatch '" << i->first << "'"; + obj_result.set_attr_value_mismatch(); + } + } + for (map::const_iterator i = candidate.attrs.begin(); + i != candidate.attrs.end(); + ++i) { + // We check system keys seperately + if (i->first == OI_ATTR || i->first[0] != '_') + continue; + if (!auth.attrs.count(i->first)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "attr name mismatch '" << i->first << "'"; + obj_result.set_attr_name_mismatch(); + } + } + return error == FOUND_ERROR; +} + +static int dcount(const object_info_t &oi) +{ + int count = 0; + if (oi.is_data_digest()) + count++; + if (oi.is_omap_digest()) + count++; + return count; +} + +map::const_iterator + PGBackend::be_select_auth_object( + const hobject_t &obj, + const map &maps, + object_info_t *auth_oi, + map &shard_map, + bool &digest_match, + spg_t pgid, + ostream &errorstream) +{ + eversion_t auth_version; + + // Create list of shards with primary first so it will be auth copy all + // other things being equal. 
+ list shards; + for (map::const_iterator j = maps.begin(); + j != maps.end(); + ++j) { + if (j->first == get_parent()->whoami_shard()) + continue; + shards.push_back(j->first); + } + shards.push_front(get_parent()->whoami_shard()); + + map::const_iterator auth = maps.end(); + digest_match = true; + for (auto &l : shards) { + ostringstream shard_errorstream; + bool error = false; + map::const_iterator j = maps.find(l); + map::iterator i = + j->second->objects.find(obj); + if (i == j->second->objects.end()) { + continue; + } + auto& shard_info = shard_map[j->first]; + if (j->first == get_parent()->whoami_shard()) + shard_info.primary = true; + if (i->second.read_error) { + shard_info.set_read_error(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a read error"; + } + if (i->second.ec_hash_mismatch) { + shard_info.set_ec_hash_mismatch(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had an ec hash mismatch"; + } + if (i->second.ec_size_mismatch) { + shard_info.set_ec_size_mismatch(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had an ec size mismatch"; + } + + object_info_t oi; + bufferlist bl; + map::iterator k; + SnapSet ss; + bufferlist ss_bl, hk_bl; + + if (i->second.stat_error) { + shard_info.set_stat_error(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a stat error"; + // With stat_error no further checking + // We don't need to also see a missing_object_info_attr + goto out; + } + + // We won't pick an auth copy if the snapset is missing or won't decode. + ceph_assert(!obj.is_snapdir()); + if (obj.is_head()) { + k = i->second.attrs.find(SS_ATTR); + if (k == i->second.attrs.end()) { + shard_info.set_snapset_missing(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a missing snapset key"; + } else { + ss_bl.push_back(k->second); + try { + auto bliter = ss_bl.cbegin(); + decode(ss, bliter); + } catch (...) { + // invalid snapset, probably corrupt + shard_info.set_snapset_corrupted(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a corrupt snapset"; + } + } + } + + if (parent->get_pool().is_erasure()) { + ECUtil::HashInfo hi; + k = i->second.attrs.find(ECUtil::get_hinfo_key()); + if (k == i->second.attrs.end()) { + shard_info.set_hinfo_missing(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a missing hinfo key"; + } else { + hk_bl.push_back(k->second); + try { + auto bliter = hk_bl.cbegin(); + decode(hi, bliter); + } catch (...) { + // invalid snapset, probably corrupt + shard_info.set_hinfo_corrupted(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a corrupt hinfo"; + } + } + } + + k = i->second.attrs.find(OI_ATTR); + if (k == i->second.attrs.end()) { + // no object info on object, probably corrupt + shard_info.set_info_missing(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a missing info key"; + goto out; + } + bl.push_back(k->second); + try { + auto bliter = bl.cbegin(); + decode(oi, bliter); + } catch (...) 
{ + // invalid object info, probably corrupt + shard_info.set_info_corrupted(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a corrupt info"; + goto out; + } + + // This is automatically corrected in PG::_repair_oinfo_oid() + ceph_assert(oi.soid == obj); + + if (i->second.size != be_get_ondisk_size(oi.size)) { + shard_info.set_obj_size_info_mismatch(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate size " << i->second.size << " info size " + << oi.size << " mismatch"; + } + + // digest_match will only be true if computed digests are the same + if (auth_version != eversion_t() + && auth->second->objects[obj].digest_present + && i->second.digest_present + && auth->second->objects[obj].digest != i->second.digest) { + digest_match = false; + dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest + << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec + << dendl; + } + + // Don't use this particular shard due to previous errors + // XXX: For now we can't pick one shard for repair and another's object info or snapset + if (shard_info.errors) + goto out; + + if (auth_version == eversion_t() || oi.version > auth_version || + (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) { + auth = j; + *auth_oi = oi; + auth_version = oi.version; + } + +out: + if (error) + errorstream << pgid.pgid << " shard " << l << " soid " << obj + << " : " << shard_errorstream.str() << "\n"; + // Keep scanning other shards + } + dout(10) << __func__ << ": selecting osd " << auth->first + << " for obj " << obj + << " with oi " << *auth_oi + << dendl; + return auth; +} + +void PGBackend::be_compare_scrubmaps( + const map &maps, + const set &master_set, + bool repair, + map> &missing, + map> &inconsistent, + map> &authoritative, + map, + std::optional>> &missing_digest, + int &shallow_errors, int &deep_errors, + Scrub::Store *store, + const spg_t& pgid, + const vector &acting, + ostream &errorstream) +{ + utime_t now = ceph_clock_now(); + + // Check maps against master set and each other + for (set::const_iterator k = master_set.begin(); + k != master_set.end(); + ++k) { + object_info_t auth_oi; + map shard_map; + + inconsistent_obj_wrapper object_error{*k}; + + bool digest_match; + map::const_iterator auth = + be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match, + pgid, errorstream); + + list auth_list; + set object_errors; + if (auth == maps.end()) { + object_error.set_version(0); + object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, + deep_errors, get_parent()->whoami_shard()); + if (object_error.has_deep_errors()) + ++deep_errors; + else if (object_error.has_shallow_errors()) + ++shallow_errors; + store->add_object_error(k->pool, object_error); + errorstream << pgid.pgid << " soid " << *k + << " : failed to pick suitable object info\n"; + continue; + } + object_error.set_version(auth_oi.user_version); + ScrubMap::object& auth_object = auth->second->objects[*k]; + set cur_missing; + set cur_inconsistent; + bool fix_digest = false; + + for (auto j = maps.cbegin(); j != maps.cend(); ++j) { + if (j == auth) + shard_map[auth->first].selected_oi = true; + if (j->second->objects.count(*k)) { + shard_map[j->first].set_object(j->second->objects[*k]); + // Compare + stringstream ss; + bool found = be_compare_scrub_objects(auth->first, + auth_object, + auth_oi, + j->second->objects[*k], + shard_map[j->first], + object_error, 
+ ss, + k->has_snapset()); + + dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "") + << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ") + << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "") + << dendl; + // If all replicas match, but they don't match object_info we can + // repair it by using missing_digest mechanism + if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1 + && digest_match && shard_map[j->first].only_data_digest_mismatch_info() + && auth_object.digest_present) { + // Set in missing_digests + fix_digest = true; + // Clear the error + shard_map[j->first].clear_data_digest_mismatch_info(); + errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n"; + } + // Some errors might have already been set in be_select_auth_object() + if (shard_map[j->first].errors != 0) { + cur_inconsistent.insert(j->first); + if (shard_map[j->first].has_deep_errors()) + ++deep_errors; + else + ++shallow_errors; + // Only true if be_compare_scrub_objects() found errors and put something + // in ss. + if (found) + errorstream << pgid << " shard " << j->first << " soid " << *k + << " : " << ss.str() << "\n"; + } else if (found) { + // Track possible shard to use as authoritative, if needed + // There are errors, without identifying the shard + object_errors.insert(j->first); + errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n"; + } else { + // XXX: The auth shard might get here that we don't know + // that it has the "correct" data. + auth_list.push_back(j->first); + } + } else { + cur_missing.insert(j->first); + shard_map[j->first].set_missing(); + shard_map[j->first].primary = (j->first == get_parent()->whoami_shard()); + // Can't have any other errors if there is no information available + ++shallow_errors; + errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n"; + } + object_error.add_shard(j->first, shard_map[j->first]); + } + + if (auth_list.empty()) { + if (object_errors.empty()) { + errorstream << pgid.pgid << " soid " << *k + << " : failed to pick suitable auth object\n"; + goto out; + } + // Object errors exist and nothing in auth_list + // Prefer the auth shard otherwise take first from list. + pg_shard_t shard; + if (object_errors.count(auth->first)) { + shard = auth->first; + } else { + shard = *(object_errors.begin()); + } + auth_list.push_back(shard); + object_errors.erase(shard); + } + // At this point auth_list is populated, so we add the object errors shards + // as inconsistent. 
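+ // What follows (sketch): shards collected in object_errors are folded into
+ // cur_inconsistent, the per-object missing/inconsistent sets feed the
+ // authoritative map used for repair, and for replicated pools with an
+ // otherwise clean auth copy the recorded digests may be (re)filled through
+ // missing_digest -- immediately when repairing, or lazily once the object is
+ // older than osd_deep_scrub_update_digest_min_age.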
+ cur_inconsistent.insert(object_errors.begin(), object_errors.end()); + if (!cur_missing.empty()) { + missing[*k] = cur_missing; + } + if (!cur_inconsistent.empty()) { + inconsistent[*k] = cur_inconsistent; + } + + if (fix_digest) { + std::optional data_digest, omap_digest; + ceph_assert(auth_object.digest_present); + data_digest = auth_object.digest; + if (auth_object.omap_digest_present) { + omap_digest = auth_object.omap_digest; + } + missing_digest[*k] = make_pair(data_digest, omap_digest); + } + if (!cur_inconsistent.empty() || !cur_missing.empty()) { + authoritative[*k] = auth_list; + } else if (!fix_digest && parent->get_pool().is_replicated()) { + enum { + NO = 0, + MAYBE = 1, + FORCE = 2, + } update = NO; + + if (auth_object.digest_present && !auth_oi.is_data_digest()) { + dout(20) << __func__ << " missing data digest on " << *k << dendl; + update = MAYBE; + } + if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) { + dout(20) << __func__ << " missing omap digest on " << *k << dendl; + update = MAYBE; + } + + // recorded digest != actual digest? + if (auth_oi.is_data_digest() && auth_object.digest_present && + auth_oi.data_digest != auth_object.digest) { + ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info()); + errorstream << pgid << " recorded data digest 0x" + << std::hex << auth_oi.data_digest << " != on disk 0x" + << auth_object.digest << std::dec << " on " << auth_oi.soid + << "\n"; + if (repair) + update = FORCE; + } + if (auth_oi.is_omap_digest() && auth_object.omap_digest_present && + auth_oi.omap_digest != auth_object.omap_digest) { + ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info()); + errorstream << pgid << " recorded omap digest 0x" + << std::hex << auth_oi.omap_digest << " != on disk 0x" + << auth_object.omap_digest << std::dec + << " on " << auth_oi.soid << "\n"; + if (repair) + update = FORCE; + } + + if (update != NO) { + utime_t age = now - auth_oi.local_mtime; + if (update == FORCE || + age > cct->_conf->osd_deep_scrub_update_digest_min_age) { + std::optional data_digest, omap_digest; + if (auth_object.digest_present) { + data_digest = auth_object.digest; + dout(20) << __func__ << " will update data digest on " << *k << dendl; + } + if (auth_object.omap_digest_present) { + omap_digest = auth_object.omap_digest; + dout(20) << __func__ << " will update omap digest on " << *k << dendl; + } + missing_digest[*k] = make_pair(data_digest, omap_digest); + } else { + dout(20) << __func__ << " missing digest but age " << age + << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age + << " on " << *k << dendl; + } + } + } +out: + if (object_error.has_deep_errors()) + ++deep_errors; + else if (object_error.has_shallow_errors()) + ++shallow_errors; + if (object_error.errors || object_error.union_shards.errors) { + store->add_object_error(k->pool, object_error); + } + } +} + +void PGBackend::be_omap_checks(const map &maps, + const set &master_set, + omap_stat_t& omap_stats, + ostream &warnstream) const +{ + bool needs_omap_check = false; + for (const auto& map : maps) { + if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) { + needs_omap_check = true; + break; + } + } + + if (!needs_omap_check) { + return; // Nothing to do + } + + // Iterate through objects and update omap stats + for (const auto& k : master_set) { + for (const auto& map : maps) { + if (map.first != get_parent()->primary_shard()) { + // Only set omap stats for the primary + continue; + } + auto it = map.second->objects.find(k); + if (it 
== map.second->objects.end()) + continue; + ScrubMap::object& obj = it->second; + omap_stats.omap_bytes += obj.object_omap_bytes; + omap_stats.omap_keys += obj.object_omap_keys; + if (obj.large_omap_object_found) { + pg_t pg; + auto osdmap = get_osdmap(); + osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg); + pg_t mpg = osdmap->raw_pg_to_pg(pg); + omap_stats.large_omap_objects++; + warnstream << "Large omap object found. Object: " << k + << " PG: " << pg << " (" << mpg << ")" + << " Key count: " << obj.large_omap_object_key_count + << " Size (bytes): " << obj.large_omap_object_value_size + << '\n'; + break; + } + } + } +} diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h new file mode 100644 index 000000000..12bdfc0d1 --- /dev/null +++ b/src/osd/PGBackend.h @@ -0,0 +1,641 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013,2014 Inktank Storage, Inc. + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PGBACKEND_H +#define PGBACKEND_H + +#include "osd_types.h" +#include "common/WorkQueue.h" +#include "include/Context.h" +#include "os/ObjectStore.h" +#include "common/LogClient.h" +#include +#include "PGTransaction.h" +#include "common/ostream_temp.h" + +namespace Scrub { + class Store; +} +struct shard_info_wrapper; +struct inconsistent_obj_wrapper; + +//forward declaration +class OSDMap; +class PGLog; +typedef std::shared_ptr OSDMapRef; + + /** + * PGBackend + * + * PGBackend defines an interface for logic handling IO and + * replication on RADOS objects. The PGBackend implementation + * is responsible for: + * + * 1) Handling client operations + * 2) Handling object recovery + * 3) Handling object access + * 4) Handling scrub, deep-scrub, repair + */ + class PGBackend { + public: + CephContext* cct; + protected: + ObjectStore *store; + const coll_t coll; + ObjectStore::CollectionHandle &ch; + public: + /** + * Provides interfaces for PGBackend callbacks + * + * The intention is that the parent calls into the PGBackend + * implementation holding a lock and that the callbacks are + * called under the same locks. 
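+ * In practice the parent is the owning PG (PrimaryLogPG in this tree); a
+ * hypothetical sketch of the calling convention this interface assumes
+ * (illustrative, not a verbatim call site):
+ *
+ *   pg->lock();
+ *   backend->handle_message(op);  // may call back into Listener methods
+ *                                 // synchronously, on this thread
+ *   pg->unlock();
+ *
+ * so callback implementations must not try to retake the PG lock themselves.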
+ */ + class Listener { + public: + /// Debugging + virtual DoutPrefixProvider *get_dpp() = 0; + + /// Recovery + + /** + * Called with the transaction recovering oid + */ + virtual void on_local_recover( + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info, + ObjectContextRef obc, + bool is_delete, + ObjectStore::Transaction *t + ) = 0; + + /** + * Called when transaction recovering oid is durable and + * applied on all replicas + */ + virtual void on_global_recover( + const hobject_t &oid, + const object_stat_sum_t &stat_diff, + bool is_delete + ) = 0; + + /** + * Called when peer is recovered + */ + virtual void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info + ) = 0; + + virtual void begin_peer_recover( + pg_shard_t peer, + const hobject_t oid) = 0; + + virtual void apply_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) = 0; + + /** + * Called when a read from a std::set of replicas/primary fails + */ + virtual void on_failed_pull( + const std::set &from, + const hobject_t &soid, + const eversion_t &v + ) = 0; + + /** + * Called when a pull on soid cannot be completed due to + * down peers + */ + virtual void cancel_pull( + const hobject_t &soid) = 0; + + /** + * Called to remove an object. + */ + virtual void remove_missing_object( + const hobject_t &oid, + eversion_t v, + Context *on_complete) = 0; + + /** + * Bless a context + * + * Wraps a context in whatever outer layers the parent usually + * uses to call into the PGBackend + */ + virtual Context *bless_context(Context *c) = 0; + virtual GenContext *bless_gencontext( + GenContext *c) = 0; + virtual GenContext *bless_unlocked_gencontext( + GenContext *c) = 0; + + virtual void send_message(int to_osd, Message *m) = 0; + virtual void queue_transaction( + ObjectStore::Transaction&& t, + OpRequestRef op = OpRequestRef() + ) = 0; + virtual void queue_transactions( + std::vector& tls, + OpRequestRef op = OpRequestRef() + ) = 0; + virtual epoch_t get_interval_start_epoch() const = 0; + virtual epoch_t get_last_peering_reset_epoch() const = 0; + + virtual const std::set &get_acting_recovery_backfill_shards() const = 0; + virtual const std::set &get_acting_shards() const = 0; + virtual const std::set &get_backfill_shards() const = 0; + + virtual std::ostream& gen_dbg_prefix(std::ostream& out) const = 0; + + virtual const std::map> &get_missing_loc_shards() + const = 0; + + virtual const pg_missing_tracker_t &get_local_missing() const = 0; + virtual void add_local_next_event(const pg_log_entry_t& e) = 0; + virtual const std::map &get_shard_missing() + const = 0; + virtual const pg_missing_const_i * maybe_get_shard_missing( + pg_shard_t peer) const { + if (peer == primary_shard()) { + return &get_local_missing(); + } else { + std::map::const_iterator i = + get_shard_missing().find(peer); + if (i == get_shard_missing().end()) { + return nullptr; + } else { + return &(i->second); + } + } + } + virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const { + auto m = maybe_get_shard_missing(peer); + ceph_assert(m); + return *m; + } + + virtual const std::map &get_shard_info() const = 0; + virtual const pg_info_t &get_shard_info(pg_shard_t peer) const { + if (peer == primary_shard()) { + return get_info(); + } else { + std::map::const_iterator i = + get_shard_info().find(peer); + ceph_assert(i != get_shard_info().end()); + return i->second; + } + } + + virtual const PGLog &get_log() const = 0; + virtual bool pgb_is_primary() const = 0; + 
virtual const OSDMapRef& pgb_get_osdmap() const = 0; + virtual epoch_t pgb_get_osdmap_epoch() const = 0; + virtual const pg_info_t &get_info() const = 0; + virtual const pg_pool_t &get_pool() const = 0; + + virtual ObjectContextRef get_obc( + const hobject_t &hoid, + const std::map &attrs) = 0; + + virtual bool try_lock_for_read( + const hobject_t &hoid, + ObcLockManager &manager) = 0; + + virtual void release_locks(ObcLockManager &manager) = 0; + + virtual void op_applied( + const eversion_t &applied_version) = 0; + + virtual bool should_send_op( + pg_shard_t peer, + const hobject_t &hoid) = 0; + + virtual bool pg_is_undersized() const = 0; + virtual bool pg_is_repair() const = 0; + + virtual void log_operation( + std::vector&& logv, + const std::optional &hset_history, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const eversion_t &min_last_complete_ondisk, + bool transaction_applied, + ObjectStore::Transaction &t, + bool async = false) = 0; + + virtual void pgb_set_object_snap_mapping( + const hobject_t &soid, + const std::set &snaps, + ObjectStore::Transaction *t) = 0; + + virtual void pgb_clear_object_snap_mapping( + const hobject_t &soid, + ObjectStore::Transaction *t) = 0; + + virtual void update_peer_last_complete_ondisk( + pg_shard_t fromosd, + eversion_t lcod) = 0; + + virtual void update_last_complete_ondisk( + eversion_t lcod) = 0; + + virtual void update_stats( + const pg_stat_t &stat) = 0; + + virtual void schedule_recovery_work( + GenContext *c) = 0; + + virtual pg_shard_t whoami_shard() const = 0; + int whoami() const { + return whoami_shard().osd; + } + spg_t whoami_spg_t() const { + return get_info().pgid; + } + + virtual spg_t primary_spg_t() const = 0; + virtual pg_shard_t primary_shard() const = 0; + virtual uint64_t min_peer_features() const = 0; + virtual uint64_t min_upacting_features() const = 0; + virtual hobject_t get_temp_recovery_object(const hobject_t& target, + eversion_t version) = 0; + + virtual void send_message_osd_cluster( + int peer, Message *m, epoch_t from_epoch) = 0; + virtual void send_message_osd_cluster( + std::vector>& messages, epoch_t from_epoch) = 0; + virtual void send_message_osd_cluster( + MessageRef, Connection *con) = 0; + virtual void send_message_osd_cluster( + Message *m, const ConnectionRef& con) = 0; + virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0; + virtual entity_name_t get_cluster_msgr_name() = 0; + + virtual PerfCounters *get_logger() = 0; + + virtual ceph_tid_t get_tid() = 0; + + virtual OstreamTemp clog_error() = 0; + virtual OstreamTemp clog_warn() = 0; + + virtual bool check_failsafe_full() = 0; + + virtual bool pg_is_repair() = 0; + virtual void inc_osd_stat_repaired() = 0; + virtual bool pg_is_remote_backfilling() = 0; + virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0; + virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0; + virtual void pg_add_num_bytes(int64_t num_bytes) = 0; + virtual void pg_sub_num_bytes(int64_t num_bytes) = 0; + virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0; + virtual ~Listener() {} + }; + Listener *parent; + Listener *get_parent() const { return parent; } + PGBackend(CephContext* cct, Listener *l, ObjectStore *store, const coll_t &coll, + ObjectStore::CollectionHandle &ch) : + cct(cct), + store(store), + coll(coll), + ch(ch), + parent(l) {} + bool is_primary() const { return get_parent()->pgb_is_primary(); } + const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); } + epoch_t 
get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); } + const pg_info_t &get_info() { return get_parent()->get_info(); } + + std::ostream& gen_prefix(std::ostream& out) const { + return parent->gen_dbg_prefix(out); + } + + /** + * RecoveryHandle + * + * We may want to recover multiple objects in the same std::set of + * messages. RecoveryHandle is an interface for the opaque + * object used by the implementation to store the details of + * the pending recovery operations. + */ + struct RecoveryHandle { + bool cache_dont_need; + std::map > > deletes; + + RecoveryHandle(): cache_dont_need(false) {} + virtual ~RecoveryHandle() {} + }; + + /// Get a fresh recovery operation + virtual RecoveryHandle *open_recovery_op() = 0; + + /// run_recovery_op: finish the operation represented by h + virtual void run_recovery_op( + RecoveryHandle *h, ///< [in] op to finish + int priority ///< [in] msg priority + ) = 0; + + void recover_delete_object(const hobject_t &oid, eversion_t v, + RecoveryHandle *h); + void send_recovery_deletes(int prio, + const std::map > > &deletes); + + /** + * recover_object + * + * Triggers a recovery operation on the specified hobject_t + * onreadable must be called before onwriteable + * + * On each replica (primary included), get_parent()->on_not_missing() + * must be called when the transaction finalizing the recovery + * is queued. Similarly, get_parent()->on_readable() must be called + * when the transaction is applied in the backing store. + * + * get_parent()->on_not_degraded() should be called on the primary + * when writes can resume on the object. + * + * obc may be NULL if the primary lacks the object. + * + * head may be NULL only if the head/snapdir is missing + * + * @param missing [in] std::set of info, missing pairs for queried nodes + * @param overlaps [in] mapping of object to file offset overlaps + */ + virtual int recover_object( + const hobject_t &hoid, ///< [in] object to recover + eversion_t v, ///< [in] version to recover + ObjectContextRef head, ///< [in] context of the head/snapdir object + ObjectContextRef obc, ///< [in] context of the object + RecoveryHandle *h ///< [in,out] handle to attach recovery op to + ) = 0; + + /** + * true if PGBackend can handle this message while inactive + * + * If it returns true, handle_message *must* also return true + */ + virtual bool can_handle_while_inactive(OpRequestRef op) = 0; + + /// gives PGBackend a crack at an incoming message + bool handle_message( + OpRequestRef op ///< [in] message received + ); ///< @return true if the message was handled + + /// the variant of handle_message that is overridden by child classes + virtual bool _handle_message(OpRequestRef op) = 0; + + virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0; + + + /** + * clean up any temporary on-disk state due to a pg interval change + */ + void on_change_cleanup(ObjectStore::Transaction *t); + /** + * implementation should clear itself, contexts blessed prior to on_change + * won't be called after on_change() + */ + virtual void on_change() = 0; + virtual void clear_recovery_state() = 0; + + virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0; + virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0; + virtual int get_ec_data_chunk_count() const { return 0; }; + virtual int get_ec_stripe_chunk_size() const { return 0; }; + + virtual void dump_recovery_info(ceph::Formatter *f) const = 0; + + private: + std::set temp_contents; + public: + // Track contents of temp 
collection, clear on reset + void add_temp_obj(const hobject_t &oid) { + temp_contents.insert(oid); + } + void add_temp_objs(const std::set &oids) { + temp_contents.insert(oids.begin(), oids.end()); + } + void clear_temp_obj(const hobject_t &oid) { + temp_contents.erase(oid); + } + void clear_temp_objs(const std::set &oids) { + for (std::set::const_iterator i = oids.begin(); + i != oids.end(); + ++i) { + temp_contents.erase(*i); + } + } + + virtual ~PGBackend() {} + + /// execute implementation specific transaction + virtual void submit_transaction( + const hobject_t &hoid, ///< [in] object + const object_stat_sum_t &delta_stats,///< [in] stat change + const eversion_t &at_version, ///< [in] version + PGTransactionUPtr &&t, ///< [in] trans to execute (move) + const eversion_t &trim_to, ///< [in] trim log to here + const eversion_t &min_last_complete_ondisk, ///< [in] lower bound on + /// committed version + std::vector&& log_entries, ///< [in] log entries for t + /// [in] hitset history (if updated with this transaction) + std::optional &hset_history, + Context *on_all_commit, ///< [in] called when all commit + ceph_tid_t tid, ///< [in] tid + osd_reqid_t reqid, ///< [in] reqid + OpRequestRef op ///< [in] op + ) = 0; + + /// submit callback to be called in order with pending writes + virtual void call_write_ordered(std::function &&cb) = 0; + + void try_stash( + const hobject_t &hoid, + version_t v, + ObjectStore::Transaction *t); + + void rollback( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t); + + friend class LRBTrimmer; + void rollforward( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t); + + void trim( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t); + + void remove( + const hobject_t &hoid, + ObjectStore::Transaction *t); + + protected: + + void handle_recovery_delete(OpRequestRef op); + void handle_recovery_delete_reply(OpRequestRef op); + + /// Reapply old attributes + void rollback_setattrs( + const hobject_t &hoid, + std::map > &old_attrs, + ObjectStore::Transaction *t); + + /// Truncate object to rollback append + virtual void rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t); + + /// Unstash object to rollback stash + void rollback_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t); + + /// Unstash object to rollback stash + void rollback_try_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t); + + /// Delete object to rollback create + void rollback_create( + const hobject_t &hoid, + ObjectStore::Transaction *t) { + remove(hoid, t); + } + + /// Clone the extents back into place + void rollback_extents( + version_t gen, + const std::vector > &extents, + const hobject_t &hoid, + ObjectStore::Transaction *t); + public: + + /// Trim object stashed at version + void trim_rollback_object( + const hobject_t &hoid, + version_t gen, + ObjectStore::Transaction *t); + + /// Std::list objects in collection + int objects_list_partial( + const hobject_t &begin, + int min, + int max, + std::vector *ls, + hobject_t *next); + + int objects_list_range( + const hobject_t &start, + const hobject_t &end, + std::vector *ls, + std::vector *gen_obs=0); + + int objects_get_attr( + const hobject_t &hoid, + const std::string &attr, + ceph::buffer::list *out); + + virtual int objects_get_attrs( + const hobject_t &hoid, + std::map *out); + + virtual int objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + 
ceph::buffer::list *bl) = 0; + + virtual int objects_readv_sync( + const hobject_t &hoid, + std::map&& m, + uint32_t op_flags, + ceph::buffer::list *bl) { + return -EOPNOTSUPP; + } + + virtual void objects_read_async( + const hobject_t &hoid, + const std::list, + std::pair > > &to_read, + Context *on_complete, bool fast_read = false) = 0; + + virtual bool auto_repair_supported() const = 0; + int be_scan_list( + ScrubMap &map, + ScrubMapBuilder &pos); + bool be_compare_scrub_objects( + pg_shard_t auth_shard, + const ScrubMap::object &auth, + const object_info_t& auth_oi, + const ScrubMap::object &candidate, + shard_info_wrapper& shard_error, + inconsistent_obj_wrapper &result, + std::ostream &errorstream, + bool has_snapset); + std::map::const_iterator be_select_auth_object( + const hobject_t &obj, + const std::map &maps, + object_info_t *auth_oi, + std::map &shard_map, + bool &digest_match, + spg_t pgid, + std::ostream &errorstream); + void be_compare_scrubmaps( + const std::map &maps, + const std::set &master_set, + bool repair, + std::map> &missing, + std::map> &inconsistent, + std::map> &authoritative, + std::map, + std::optional>> &missing_digest, + int &shallow_errors, int &deep_errors, + Scrub::Store *store, + const spg_t& pgid, + const std::vector &acting, + std::ostream &errorstream); + virtual uint64_t be_get_ondisk_size( + uint64_t logical_size) = 0; + virtual int be_deep_scrub( + const hobject_t &oid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) = 0; + void be_omap_checks( + const std::map &maps, + const std::set &master_set, + omap_stat_t& omap_stats, + std::ostream &warnstream) const; + + static PGBackend *build_pg_backend( + const pg_pool_t &pool, + const std::map& profile, + Listener *l, + coll_t coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct); +}; + +#endif diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc new file mode 100644 index 000000000..c881dbabe --- /dev/null +++ b/src/osd/PGLog.cc @@ -0,0 +1,1189 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "PGLog.h" +#include "include/unordered_map.h" +#include "common/ceph_context.h" + +using std::make_pair; +using std::map; +using std::ostream; +using std::set; +using std::string; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +static ostream& _prefix(std::ostream *_dout, const PGLog *pglog) +{ + return pglog->gen_prefix(*_dout); +} + +//////////////////// PGLog::IndexedLog //////////////////// + +void PGLog::IndexedLog::split_out_child( + pg_t child_pgid, + unsigned split_bits, + PGLog::IndexedLog *target) +{ + unindex(); + *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits)); + index(); + target->index(); + reset_rollback_info_trimmed_to_riter(); +} + +void PGLog::IndexedLog::trim( + CephContext* cct, + eversion_t s, + set *trimmed, + set* trimmed_dups, + eversion_t *write_from_dups) +{ + lgeneric_subdout(cct, osd, 10) << "IndexedLog::trim s=" << s << dendl; + ceph_assert(s <= can_rollback_to); + if (complete_to != log.end()) + lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl; + + auto earliest_dup_version = + log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked + ? 0u + : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked + 1; + + lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl; + while (!log.empty()) { + const pg_log_entry_t &e = *log.begin(); + if (e.version > s) + break; + lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl; + if (trimmed) + trimmed->emplace(e.version); + + unindex(e); // remove from index, + + // add to dup list + if (e.version.version >= earliest_dup_version) { + if (write_from_dups != nullptr && *write_from_dups > e.version) { + lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl; + *write_from_dups = e.version; + } + dups.push_back(pg_log_dup_t(e)); + index(dups.back()); + uint32_t idx = 0; + for (const auto& extra : e.extra_reqids) { + int return_code = e.return_code; + if (return_code >= 0) { + auto it = e.extra_reqid_return_codes.find(idx); + if (it != e.extra_reqid_return_codes.end()) { + return_code = it->second; + // FIXME: we aren't setting op_returns for these extra_reqids + } + } + ++idx; + + // note: extras have the same version as outer op + dups.push_back(pg_log_dup_t(e.version, extra.second, + extra.first, return_code)); + index(dups.back()); + } + } + + bool reset_complete_to = false; + // we are trimming past complete_to, so reset complete_to + if (complete_to != log.end() && e.version >= complete_to->version) + reset_complete_to = true; + if (rollback_info_trimmed_to_riter == log.rend() || + e.version == rollback_info_trimmed_to_riter->version) { + log.pop_front(); + rollback_info_trimmed_to_riter = log.rend(); + } else { + log.pop_front(); + } + + // reset complete_to to the beginning of the log + if (reset_complete_to) { + complete_to = log.begin(); + if (complete_to != log.end()) { + lgeneric_subdout(cct, osd, 20) << " moving complete_to to " + << log.begin()->version << dendl; + } else { + lgeneric_subdout(cct, osd, 20) << " log is now empty" << dendl; + } + } + } + + // we can hit an inflated `dups` b/c of https://tracker.ceph.com/issues/53729 + // the idea is to slowly trim them over a prolonged period of time and mix + // omap deletes with writes (if we're here, a new log 
entry got added) to + // neither: 1) blow size of single Transaction nor 2) generate-n-accumulate + // large amount of tombstones in BlueStore's RocksDB. + // if trimming immediately is a must, then the ceph-objectstore-tool is + // the way to go. + const size_t max_dups = cct->_conf->osd_pg_log_dups_tracked; + for (size_t max_dups_to_trim = cct->_conf->osd_pg_log_trim_max; + max_dups_to_trim > 0 && dups.size() > max_dups; + max_dups_to_trim--) { + const auto& e = *dups.begin(); + lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl; + if (trimmed_dups) + trimmed_dups->insert(e.get_key_name()); + unindex(e); + dups.pop_front(); + } + + // raise tail? + if (tail < s) + tail = s; + lgeneric_subdout(cct, osd, 20) << "IndexedLog::trim after trim" + << " dups.size()=" << dups.size() + << " tail=" << tail + << " s=" << s << dendl; +} + +ostream& PGLog::IndexedLog::print(ostream& out) const +{ + out << *this << std::endl; + for (auto p = log.begin(); p != log.end(); ++p) { + out << *p << " " << + (logged_object(p->soid) ? "indexed" : "NOT INDEXED") << + std::endl; + ceph_assert(!p->reqid_is_indexed() || logged_req(p->reqid)); + } + + for (auto p = dups.begin(); p != dups.end(); ++p) { + out << *p << std::endl; + } + + return out; +} + +//////////////////// PGLog //////////////////// + +void PGLog::reset_backfill() +{ + missing.clear(); +} + +void PGLog::clear() { + missing.clear(); + log.clear(); + log_keys_debug.clear(); + undirty(); +} + +void PGLog::clear_info_log( + spg_t pgid, + ObjectStore::Transaction *t) { + coll_t coll(pgid); + t->remove(coll, pgid.make_pgmeta_oid()); +} + +void PGLog::trim( + eversion_t trim_to, + pg_info_t &info, + bool transaction_applied, + bool async) +{ + dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl; + // trim? + if (trim_to > log.tail) { + dout(10) << __func__ << " missing = " << missing.num_missing() << dendl; + // Don't assert for async_recovery_targets or backfill_targets + // or whenever there are missing items + if (transaction_applied && !async && (missing.num_missing() == 0)) + ceph_assert(trim_to <= info.last_complete); + + dout(10) << "trim " << log << " to " << trim_to << dendl; + log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups); + info.log_tail = log.tail; + if (log.complete_to != log.log.end()) + dout(10) << " after trim complete_to " << log.complete_to->version << dendl; + } +} + +void PGLog::proc_replica_log( + pg_info_t &oinfo, + const pg_log_t &olog, + pg_missing_t& omissing, + pg_shard_t from) const +{ + dout(10) << "proc_replica_log for osd." << from << ": " + << oinfo << " " << olog << " " << omissing << dendl; + + if (olog.head < log.tail) { + dout(10) << __func__ << ": osd." << from << " does not overlap, not looking " + << "for divergent objects" << dendl; + return; + } + if (olog.head == log.head) { + dout(10) << __func__ << ": osd." << from << " same log head, not looking " + << "for divergent objects" << dendl; + return; + } + + /* + basically what we're doing here is rewinding the remote log, + dropping divergent entries, until we find something that matches + our master log. we then reset last_update to reflect the new + point up to which missing is accurate. + + later, in activate(), missing will get wound forward again and + we will send the peer enough log to arrive at the same state. 
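To make the rewind concrete, here is a minimal standalone sketch, not the real pg_log_t interface: an (epoch, version) std::pair stands in for eversion_t and a plain std::list for the log. It shows how the merge point, the newest update both sides share, is found by scanning back from the local head, with the two tails as a lower bound:

#include <algorithm>
#include <iostream>
#include <list>
#include <utility>

// (epoch, version) pair standing in for eversion_t; comparison is lexicographic
using ev = std::pair<unsigned, unsigned>;

// Walk the local log from the head and return the newest version the replica
// also has; never return anything below max(local_tail, replica_tail).
ev find_merge_point(const std::list<ev>& local_log,   // oldest .. newest
                    ev replica_head,
                    ev local_tail, ev replica_tail)
{
  const ev limit = std::max(local_tail, replica_tail);
  for (auto it = local_log.rbegin(); it != local_log.rend(); ++it) {
    if (*it <= replica_head)           // first (newest) entry the replica shares
      return std::max(*it, limit);
  }
  return limit;                        // no shared entry left in the local log
}

int main()
{
  std::list<ev> local{{1, 1}, {1, 2}, {2, 3}, {2, 4}};
  ev lu = find_merge_point(local, /*replica_head=*/{1, 2},
                           /*local_tail=*/{0, 0}, /*replica_tail=*/{0, 0});
  std::cout << lu.first << "'" << lu.second << std::endl;   // prints 1'2
}

The first_non_divergent loop below performs the same backward scan against olog.head, and lu is then clamped by max(olog.tail, log.tail) exactly as limit is here.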
+ */ + + for (auto i = omissing.get_items().begin(); + i != omissing.get_items().end(); + ++i) { + dout(20) << " before missing " << i->first << " need " << i->second.need + << " have " << i->second.have << dendl; + } + + auto first_non_divergent = log.log.rbegin(); + while (1) { + if (first_non_divergent == log.log.rend()) + break; + if (first_non_divergent->version <= olog.head) { + dout(20) << "merge_log point (usually last shared) is " + << *first_non_divergent << dendl; + break; + } + ++first_non_divergent; + } + + /* Because olog.head >= log.tail, we know that both pgs must at least have + * the event represented by log.tail. Similarly, because log.head >= olog.tail, + * we know that the event represented by olog.tail must be common to both logs. + * Furthermore, the event represented by a log tail was necessarily trimmed, + * thus neither olog.tail nor log.tail can be divergent. It's + * possible that olog/log contain no actual events between olog.head and + * max(log.tail, olog.tail), however, since they might have been split out. + * Thus, if we cannot find an event e such that + * log.tail <= e.version <= log.head, the last_update must actually be + * max(log.tail, olog.tail). + */ + eversion_t limit = std::max(olog.tail, log.tail); + eversion_t lu = + (first_non_divergent == log.log.rend() || + first_non_divergent->version < limit) ? + limit : + first_non_divergent->version; + + // we merge and adjust the replica's log, rollback the rollbackable divergent entry, + // remove the unrollbackable divergent entry and mark the according object as missing. + // the rollback boundary must choose crt of the olog which going to be merged. + // The replica log's(olog) crt will not be modified, so it could get passed + // to _merge_divergent_entries() directly. + IndexedLog folog(olog); + auto divergent = folog.rewind_from_head(lu); + _merge_divergent_entries( + folog, + divergent, + oinfo, + olog.get_can_rollback_to(), + omissing, + 0, + this); + + if (lu < oinfo.last_update) { + dout(10) << " peer osd." << from << " last_update now " << lu << dendl; + oinfo.last_update = lu; + } + + if (omissing.have_missing()) { + eversion_t first_missing = + omissing.get_items().at(omissing.get_rmissing().begin()->second).need; + oinfo.last_complete = eversion_t(); + for (auto i = olog.log.begin(); i != olog.log.end(); ++i) { + if (i->version < first_missing) + oinfo.last_complete = i->version; + else + break; + } + } else { + oinfo.last_complete = oinfo.last_update; + } +} // proc_replica_log + +/** + * rewind divergent entries at the head of the log + * + * This rewinds entries off the head of our log that are divergent. + * This is used by replicas during activation. + * + * @param newhead new head to rewind to + */ +void PGLog::rewind_divergent_log(eversion_t newhead, + pg_info_t &info, LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info) +{ + dout(10) << "rewind_divergent_log truncate divergent future " << + newhead << dendl; + + // We need to preserve the original crt before it gets updated in rewind_from_head(). + // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback + // a divergent entry or not. 
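+ // note: it is this saved original_crt, not the post-rewind value, that is
+ // passed to _merge_divergent_entries() below as the rollback boundary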
+ eversion_t original_crt = log.get_can_rollback_to(); + dout(20) << __func__ << " original_crt = " << original_crt << dendl; + if (info.last_complete > newhead) + info.last_complete = newhead; + + auto divergent = log.rewind_from_head(newhead); + if (!divergent.empty()) { + mark_dirty_from(divergent.front().version); + } + for (auto &&entry: divergent) { + dout(10) << "rewind_divergent_log future divergent " << entry << dendl; + } + info.last_update = newhead; + + _merge_divergent_entries( + log, + divergent, + info, + original_crt, + missing, + rollbacker, + this); + + dirty_info = true; + dirty_big_info = true; +} + +void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd, + pg_info_t &info, LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info) +{ + dout(10) << "merge_log " << olog << " from osd." << fromosd + << " into " << log << dendl; + + // Check preconditions + + // If our log is empty, the incoming log needs to have not been trimmed. + ceph_assert(!log.null() || olog.tail == eversion_t()); + // The logs must overlap. + ceph_assert(log.head >= olog.tail && olog.head >= log.tail); + + for (auto i = missing.get_items().begin(); + i != missing.get_items().end(); + ++i) { + dout(20) << "pg_missing_t sobject: " << i->first << dendl; + } + + bool changed = false; + + // extend on tail? + // this is just filling in history. it does not affect our + // missing set, as that should already be consistent with our + // current log. + eversion_t orig_tail = log.tail; + if (olog.tail < log.tail) { + dout(10) << "merge_log extending tail to " << olog.tail << dendl; + auto from = olog.log.begin(); + auto to = from; + eversion_t last; + for (; to != olog.log.end(); ++to) { + if (to->version > log.tail) + break; + log.index(*to); + dout(15) << *to << dendl; + last = to->version; + } + mark_dirty_to(last); + + // splice into our log. + log.log.splice(log.log.begin(), + std::move(olog.log), from, to); + + info.log_tail = log.tail = olog.tail; + changed = true; + } + + if (oinfo.stats.reported_seq < info.stats.reported_seq || // make sure reported always increases + oinfo.stats.reported_epoch < info.stats.reported_epoch) { + oinfo.stats.reported_seq = info.stats.reported_seq; + oinfo.stats.reported_epoch = info.stats.reported_epoch; + } + if (info.last_backfill.is_max()) + info.stats = oinfo.stats; + info.hit_set = oinfo.hit_set; + + // do we have divergent entries to throw out? + if (olog.head < log.head) { + rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info); + changed = true; + } + + // extend on head? + if (olog.head > log.head) { + dout(10) << "merge_log extending head to " << olog.head << dendl; + + // find start point in olog + auto to = olog.log.end(); + auto from = olog.log.end(); + eversion_t lower_bound = std::max(olog.tail, orig_tail); + while (1) { + if (from == olog.log.begin()) + break; + --from; + dout(20) << " ? " << *from << dendl; + if (from->version <= log.head) { + lower_bound = std::max(lower_bound, from->version); + ++from; + break; + } + } + dout(20) << "merge_log cut point (usually last shared) is " + << lower_bound << dendl; + mark_dirty_from(lower_bound); + + // We need to preserve the original crt before it gets updated in rewind_from_head(). + // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback + // a divergent entry or not. 
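+ // (everything in our log newer than lower_bound is divergent; rewind_from_head()
+ // below detaches those entries, the replica's newer entries are spliced in, and
+ // _merge_divergent_entries() then reconciles the detached ones per object)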
+ eversion_t original_crt = log.get_can_rollback_to(); + dout(20) << __func__ << " original_crt = " << original_crt << dendl; + auto divergent = log.rewind_from_head(lower_bound); + // move aside divergent items + for (auto &&oe: divergent) { + dout(10) << "merge_log divergent " << oe << dendl; + } + log.roll_forward_to(log.head, rollbacker); + + mempool::osd_pglog::list new_entries; + new_entries.splice(new_entries.end(), olog.log, from, to); + append_log_entries_update_missing( + info.last_backfill, + new_entries, + false, + &log, + missing, + rollbacker, + this); + + _merge_divergent_entries( + log, + divergent, + info, + original_crt, + missing, + rollbacker, + this); + + info.last_update = log.head = olog.head; + + // We cannot rollback into the new log entries + log.skip_can_rollback_to_to_head(); + + info.last_user_version = oinfo.last_user_version; + info.purged_snaps = oinfo.purged_snaps; + // update num_missing too + // we might have appended some more missing objects above + info.stats.stats.sum.num_objects_missing = missing.num_missing(); + + changed = true; + } + + // now handle dups + if (merge_log_dups(olog)) { + changed = true; + } + + dout(10) << "merge_log result " << log << " " << missing << + " changed=" << changed << dendl; + + if (changed) { + dirty_info = true; + dirty_big_info = true; + } +} + + +// returns true if any changes were made to log.dups +bool PGLog::merge_log_dups(const pg_log_t& olog) { + dout(5) << __func__ + << " log.dups.size()=" << log.dups.size() + << "olog.dups.size()=" << olog.dups.size() << dendl; + bool changed = false; + + if (!olog.dups.empty()) { + if (log.dups.empty()) { + dout(10) << "merge_log copying olog dups to log " << + olog.dups.front().version << " to " << + olog.dups.back().version << dendl; + changed = true; + dirty_from_dups = eversion_t(); + dirty_to_dups = eversion_t::max(); + // since our log.dups is empty just copy them + for (const auto& i : olog.dups) { + log.dups.push_back(i); + log.index(log.dups.back()); + } + } else { + // since our log.dups is not empty try to extend on each end + + if (olog.dups.back().version > log.dups.back().version) { + // extend the dups's tail (i.e., newer dups) + dout(10) << "merge_log extending dups tail to " << + olog.dups.back().version << dendl; + changed = true; + + auto log_tail_version = log.dups.back().version; + + auto insert_cursor = log.dups.end(); + eversion_t last_shared = eversion_t::max(); + for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) { + if (i->version <= log_tail_version) break; + log.dups.insert(insert_cursor, *i); + last_shared = i->version; + + auto prev = insert_cursor; + --prev; + // be sure to pass reference of copy in log.dups + log.index(*prev); + + --insert_cursor; // make sure we insert in reverse order + } + mark_dirty_from_dups(last_shared); + } + + if (olog.dups.front().version < log.dups.front().version) { + // extend the dups's head (i.e., older dups) + dout(10) << "merge_log extending dups head to " << + olog.dups.front().version << dendl; + changed = true; + + eversion_t last; + auto insert_cursor = log.dups.begin(); + for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) { + if (i->version >= insert_cursor->version) break; + log.dups.insert(insert_cursor, *i); + last = i->version; + auto prev = insert_cursor; + --prev; + // be sure to pass address of copy in log.dups + log.index(*prev); + } + mark_dirty_to_dups(last); + } + } + } + + // remove any dup entries that overlap with pglog + if (!log.dups.empty() && 
log.dups.back().version > log.tail) { + dout(10) << "merge_log removed dups overlapping log entries (" << + log.tail << "," << log.dups.back().version << "]" << dendl; + changed = true; + + while (!log.dups.empty() && log.dups.back().version > log.tail) { + log.unindex(log.dups.back()); + mark_dirty_from_dups(log.dups.back().version); + log.dups.pop_back(); + } + } + + dout(5) << "end of " << __func__ << " changed=" << changed + << " log.dups.size()=" << log.dups.size() + << " olog.dups.size()=" << olog.dups.size() << dendl; + + return changed; +} + +void PGLog::check() { + if (!pg_log_debug) + return; + if (log.log.size() != log_keys_debug.size()) { + derr << "log.log.size() != log_keys_debug.size()" << dendl; + derr << "actual log:" << dendl; + for (auto i = log.log.begin(); i != log.log.end(); ++i) { + derr << " " << *i << dendl; + } + derr << "log_keys_debug:" << dendl; + for (auto i = log_keys_debug.begin(); + i != log_keys_debug.end(); + ++i) { + derr << " " << *i << dendl; + } + } + ceph_assert(log.log.size() == log_keys_debug.size()); + for (auto i = log.log.begin(); i != log.log.end(); ++i) { + ceph_assert(log_keys_debug.count(i->get_key_name())); + } +} + +// non-static +void PGLog::write_log_and_missing( + ObjectStore::Transaction& t, + map *km, + const coll_t& coll, + const ghobject_t &log_oid, + bool require_rollback) +{ + if (needs_write()) { + dout(6) << "write_log_and_missing with: " + << "dirty_to: " << dirty_to + << ", dirty_from: " << dirty_from + << ", writeout_from: " << writeout_from + << ", trimmed: " << trimmed + << ", trimmed_dups: " << trimmed_dups + << ", clear_divergent_priors: " << clear_divergent_priors + << dendl; + _write_log_and_missing( + t, km, log, coll, log_oid, + dirty_to, + dirty_from, + writeout_from, + std::move(trimmed), + std::move(trimmed_dups), + missing, + !touched_log, + require_rollback, + clear_divergent_priors, + dirty_to_dups, + dirty_from_dups, + write_from_dups, + &may_include_deletes_in_missing_dirty, + (pg_log_debug ? 
&log_keys_debug : nullptr), + this); + undirty(); + } else { + dout(10) << "log is not dirty" << dendl; + } +} + +// static +void PGLog::write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map *km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + map &divergent_priors, + bool require_rollback, + const DoutPrefixProvider *dpp + ) +{ + _write_log_and_missing_wo_missing( + t, km, log, coll, log_oid, + divergent_priors, eversion_t::max(), eversion_t(), eversion_t(), + true, true, require_rollback, + eversion_t::max(), eversion_t(), eversion_t(), nullptr, dpp); +} + +// static +void PGLog::write_log_and_missing( + ObjectStore::Transaction& t, + map *km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, + const pg_missing_tracker_t &missing, + bool require_rollback, + bool *may_include_deletes_in_missing_dirty, + const DoutPrefixProvider *dpp) +{ + _write_log_and_missing( + t, km, log, coll, log_oid, + eversion_t::max(), + eversion_t(), + eversion_t(), + set(), + set(), + missing, + true, require_rollback, false, + eversion_t::max(), + eversion_t(), + eversion_t(), + may_include_deletes_in_missing_dirty, nullptr, dpp); +} + +// static +void PGLog::_write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map *km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + map &divergent_priors, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + bool dirty_divergent_priors, + bool touch_log, + bool require_rollback, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + set *log_keys_debug, + const DoutPrefixProvider *dpp + ) +{ + ldpp_dout(dpp, 10) << "_write_log_and_missing_wo_missing, clearing up to " << dirty_to + << " dirty_to_dups=" << dirty_to_dups + << " dirty_from_dups=" << dirty_from_dups + << " write_from_dups=" << write_from_dups << dendl; + if (touch_log) + t.touch(coll, log_oid); + if (dirty_to != eversion_t()) { + t.omap_rmkeyrange( + coll, log_oid, + eversion_t().get_key_name(), dirty_to.get_key_name()); + clear_up_to(log_keys_debug, dirty_to.get_key_name()); + } + if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) { + // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from.get_key_name(), eversion_t::max().get_key_name()); + clear_after(log_keys_debug, dirty_from.get_key_name()); + } + + for (auto p = log.log.begin(); + p != log.log.end() && p->version <= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + (*km)[p->get_key_name()] = std::move(bl); + } + + for (auto p = log.log.rbegin(); + p != log.log.rend() && + (p->version >= dirty_from || p->version >= writeout_from) && + p->version >= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + (*km)[p->get_key_name()] = std::move(bl); + } + + if (log_keys_debug) { + for (auto i = (*km).begin(); + i != (*km).end(); + ++i) { + if (i->first[0] == '_') + continue; + ceph_assert(!log_keys_debug->count(i->first)); + log_keys_debug->insert(i->first); + } + } + + // process dups after log_keys_debug is filled, so dups do not + // end up in that set + if (dirty_to_dups != eversion_t()) { + pg_log_dup_t min, dirty_to_dup; + dirty_to_dup.version = dirty_to_dups; + ldpp_dout(dpp, 10) << __func__ << " remove dups min=" << min.get_key_name() + << " to dirty_to_dup=" << dirty_to_dup.get_key_name() << dendl; + t.omap_rmkeyrange( + coll, log_oid, + 
min.get_key_name(), dirty_to_dup.get_key_name()); + } + if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) { + pg_log_dup_t max, dirty_from_dup; + max.version = eversion_t::max(); + dirty_from_dup.version = dirty_from_dups; + ldpp_dout(dpp, 10) << __func__ << " remove dups dirty_from_dup=" + << dirty_from_dup.get_key_name() + << " to max=" << max.get_key_name() << dendl; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from_dup.get_key_name(), max.get_key_name()); + } + + ldpp_dout(dpp, 10) << __func__ << " going to encode log.dups.size()=" + << log.dups.size() << dendl; + for (const auto& entry : log.dups) { + if (entry.version > dirty_to_dups) + break; + bufferlist bl; + encode(entry, bl); + (*km)[entry.get_key_name()] = std::move(bl); + } + ldpp_dout(dpp, 10) << __func__ << " 1st round encoded log.dups.size()=" + << log.dups.size() << dendl; + for (auto p = log.dups.rbegin(); + p != log.dups.rend() && + (p->version >= dirty_from_dups || p->version >= write_from_dups) && + p->version >= dirty_to_dups; + ++p) { + bufferlist bl; + encode(*p, bl); + (*km)[p->get_key_name()] = std::move(bl); + } + ldpp_dout(dpp, 10) << __func__ << " 2st round encoded log.dups.size()=" + << log.dups.size() << dendl; + + if (dirty_divergent_priors) { + ldpp_dout(dpp, 10) << "write_log_and_missing: writing divergent_priors" + << dendl; + encode(divergent_priors, (*km)["divergent_priors"]); + } + if (require_rollback) { + encode( + log.get_can_rollback_to(), + (*km)["can_rollback_to"]); + encode( + log.get_rollback_info_trimmed_to(), + (*km)["rollback_info_trimmed_to"]); + } + ldpp_dout(dpp, 10) << "end of " << __func__ << dendl; +} + +// static +void PGLog::_write_log_and_missing( + ObjectStore::Transaction& t, + map* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + set &&trimmed, + set &&trimmed_dups, + const pg_missing_tracker_t &missing, + bool touch_log, + bool require_rollback, + bool clear_divergent_priors, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + bool *may_include_deletes_in_missing_dirty, // in/out param + set *log_keys_debug, + const DoutPrefixProvider *dpp + ) { + ldpp_dout(dpp, 10) << __func__ << " clearing up to " << dirty_to + << " dirty_to_dups=" << dirty_to_dups + << " dirty_from_dups=" << dirty_from_dups + << " write_from_dups=" << write_from_dups + << " trimmed_dups.size()=" << trimmed_dups.size() << dendl; + set to_remove; + to_remove.swap(trimmed_dups); + for (auto& t : trimmed) { + string key = t.get_key_name(); + if (log_keys_debug) { + auto it = log_keys_debug->find(key); + ceph_assert(it != log_keys_debug->end()); + log_keys_debug->erase(it); + } + to_remove.emplace(std::move(key)); + } + trimmed.clear(); + + if (touch_log) + t.touch(coll, log_oid); + if (dirty_to != eversion_t()) { + t.omap_rmkeyrange( + coll, log_oid, + eversion_t().get_key_name(), dirty_to.get_key_name()); + clear_up_to(log_keys_debug, dirty_to.get_key_name()); + } + if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) { + ldpp_dout(dpp, 10) << "write_log_and_missing, clearing from " + << dirty_from << dendl; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from.get_key_name(), eversion_t::max().get_key_name()); + clear_after(log_keys_debug, dirty_from.get_key_name()); + } + + for (auto p = log.log.begin(); + p != log.log.end() && p->version <= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + 
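+ // stage the re-encoded entry in *km under its version-derived key for the omap write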
(*km)[p->get_key_name()] = std::move(bl); + } + + for (auto p = log.log.rbegin(); + p != log.log.rend() && + (p->version >= dirty_from || p->version >= writeout_from) && + p->version >= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + (*km)[p->get_key_name()] = std::move(bl); + } + + if (log_keys_debug) { + for (auto i = (*km).begin(); + i != (*km).end(); + ++i) { + if (i->first[0] == '_') + continue; + ceph_assert(!log_keys_debug->count(i->first)); + log_keys_debug->insert(i->first); + } + } + + // process dups after log_keys_debug is filled, so dups do not + // end up in that set + if (dirty_to_dups != eversion_t()) { + pg_log_dup_t min, dirty_to_dup; + dirty_to_dup.version = dirty_to_dups; + ldpp_dout(dpp, 10) << __func__ << " remove dups min=" << min.get_key_name() + << " to dirty_to_dup=" << dirty_to_dup.get_key_name() << dendl; + t.omap_rmkeyrange( + coll, log_oid, + min.get_key_name(), dirty_to_dup.get_key_name()); + } + if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) { + pg_log_dup_t max, dirty_from_dup; + max.version = eversion_t::max(); + dirty_from_dup.version = dirty_from_dups; + ldpp_dout(dpp, 10) << __func__ << " remove dups dirty_from_dup=" + << dirty_from_dup.get_key_name() + << " to max=" << max.get_key_name() << dendl; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from_dup.get_key_name(), max.get_key_name()); + } + + ldpp_dout(dpp, 10) << __func__ << " going to encode log.dups.size()=" + << log.dups.size() << dendl; + for (const auto& entry : log.dups) { + if (entry.version > dirty_to_dups) + break; + bufferlist bl; + encode(entry, bl); + (*km)[entry.get_key_name()] = std::move(bl); + } + ldpp_dout(dpp, 10) << __func__ << " 1st round encoded log.dups.size()=" + << log.dups.size() << dendl; + + for (auto p = log.dups.rbegin(); + p != log.dups.rend() && + (p->version >= dirty_from_dups || p->version >= write_from_dups) && + p->version >= dirty_to_dups; + ++p) { + bufferlist bl; + encode(*p, bl); + (*km)[p->get_key_name()] = std::move(bl); + } + ldpp_dout(dpp, 10) << __func__ << " 2st round encoded log.dups.size()=" + << log.dups.size() << dendl; + + if (clear_divergent_priors) { + ldpp_dout(dpp, 10) << "write_log_and_missing: writing divergent_priors" + << dendl; + to_remove.insert("divergent_priors"); + } + // since we encode individual missing items instead of a whole + // missing set, we need another key to store this bit of state + if (*may_include_deletes_in_missing_dirty) { + (*km)["may_include_deletes_in_missing"] = bufferlist(); + *may_include_deletes_in_missing_dirty = false; + } + missing.get_changed( + [&](const hobject_t &obj) { + string key = string("missing/") + obj.to_str(); + pg_missing_item item; + if (!missing.is_missing(obj, &item)) { + to_remove.insert(key); + } else { + encode(make_pair(obj, item), (*km)[key], CEPH_FEATUREMASK_SERVER_OCTOPUS); + } + }); + if (require_rollback) { + encode( + log.get_can_rollback_to(), + (*km)["can_rollback_to"]); + encode( + log.get_rollback_info_trimmed_to(), + (*km)["rollback_info_trimmed_to"]); + } + + if (!to_remove.empty()) + t.omap_rmkeys(coll, log_oid, to_remove); + ldpp_dout(dpp, 10) << "end of " << __func__ << dendl; +} + +void PGLog::rebuild_missing_set_with_deletes( + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + const pg_info_t &info) +{ + // save entries not generated from the current log (e.g. added due + // to repair, EIO handling, or divergent_priors). 
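+ // they are stashed in extra_missing, the missing set is rebuilt from the log
+ // against the object versions found on disk, and the stashed items are added
+ // back at the end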
+ map extra_missing; + for (const auto& p : missing.get_items()) { + if (!log.logged_object(p.first)) { + dout(20) << __func__ << " extra missing entry: " << p.first + << " " << p.second << dendl; + extra_missing[p.first] = p.second; + } + } + missing.clear(); + + // go through the log and add items that are not present or older + // versions on disk, just as if we were reading the log + metadata + // off disk originally + set did; + for (auto i = log.log.rbegin(); + i != log.log.rend(); + ++i) { + if (i->version <= info.last_complete) + break; + if (i->soid > info.last_backfill || + i->is_error() || + did.find(i->soid) != did.end()) + continue; + did.insert(i->soid); + + bufferlist bv; + int r = store->getattr( + ch, + ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl; + + if (r >= 0) { + object_info_t oi(bv); + dout(20) << __func__ << " store version = " << oi.version << dendl; + if (oi.version < i->version) { + missing.add(i->soid, i->version, oi.version, i->is_delete()); + } + } else { + missing.add(i->soid, i->version, eversion_t(), i->is_delete()); + } + } + + for (const auto& p : extra_missing) { + missing.add(p.first, p.second.need, p.second.have, p.second.is_delete()); + } + + set_missing_may_contain_deletes(); +} + +#ifdef WITH_SEASTAR + +namespace { + struct FuturizedStoreLogReader { + crimson::os::FuturizedStore &store; + const pg_info_t &info; + PGLog::IndexedLog &log; + std::set* log_keys_debug = NULL; + pg_missing_tracker_t &missing; + const DoutPrefixProvider *dpp; + + eversion_t on_disk_can_rollback_to; + eversion_t on_disk_rollback_info_trimmed_to; + + std::map divergent_priors; + bool must_rebuild = false; + std::list entries; + std::list dups; + + std::optional next; + + void process_entry(crimson::os::FuturizedStore::OmapIteratorRef &p) { + if (p->key()[0] == '_') + return; + //Copy ceph::buffer::list before creating iterator + auto bl = p->value(); + auto bp = bl.cbegin(); + if (p->key() == "divergent_priors") { + decode(divergent_priors, bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size() + << " divergent_priors" << dendl; + ceph_assert("crimson shouldn't have had divergent_priors" == 0); + } else if (p->key() == "can_rollback_to") { + decode(on_disk_can_rollback_to, bp); + } else if (p->key() == "rollback_info_trimmed_to") { + decode(on_disk_rollback_info_trimmed_to, bp); + } else if (p->key() == "may_include_deletes_in_missing") { + missing.may_include_deletes = true; + } else if (p->key().substr(0, 7) == std::string("missing")) { + hobject_t oid; + pg_missing_item item; + decode(oid, bp); + decode(item, bp); + if (item.is_delete()) { + ceph_assert(missing.may_include_deletes); + } + missing.add(oid, std::move(item)); + } else if (p->key().substr(0, 4) == std::string("dup_")) { + pg_log_dup_t dup; + decode(dup, bp); + if (!dups.empty()) { + ceph_assert(dups.back().version < dup.version); + } + dups.push_back(dup); + } else { + pg_log_entry_t e; + e.decode_with_checksum(bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl; + if (!entries.empty()) { + pg_log_entry_t last_e(entries.back()); + ceph_assert(last_e.version.version < e.version.version); + ceph_assert(last_e.version.epoch <= e.version.epoch); + } + entries.push_back(e); + if (log_keys_debug) + log_keys_debug->insert(e.get_key_name()); + } + } + + seastar::future<> read(crimson::os::CollectionRef ch, + ghobject_t pgmeta_oid) { + // will get overridden if recorded 
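The defaults set just below stand until process_entry() above sees the corresponding keys; that routine dispatches purely on the omap key namespace written by the _write_log_and_missing*() helpers earlier in this file. A small standalone sketch of the same dispatch follows; the classify() helper and the sample key strings are illustrative only, and the real decode()/decode_with_checksum() of the values is omitted:

#include <iostream>
#include <string>
#include <string_view>

// Rough classification of a pgmeta omap key, mirroring process_entry().
std::string classify(std::string_view key) {
  if (!key.empty() && key[0] == '_')
    return "internal pgmeta key (skipped)";
  if (key == "divergent_priors")
    return "legacy divergent_priors map";
  if (key == "can_rollback_to" || key == "rollback_info_trimmed_to")
    return "rollback boundary";
  if (key == "may_include_deletes_in_missing")
    return "missing-set feature marker";
  if (key.substr(0, 7) == "missing")
    return "missing/<oid> item";
  if (key.substr(0, 4) == "dup_")
    return "pg_log_dup_t entry";
  return "pg_log_entry_t keyed by version";
}

int main() {
  for (auto k : {"_info", "can_rollback_to", "missing/1:abcd",
                 "dup_0000000001.00000000000000000002",
                 "0000000002.00000000000000000005"})
    std::cout << k << " -> " << classify(k) << "\n";
}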
+ on_disk_can_rollback_to = info.last_update; + missing.may_include_deletes = false; + + return store.get_omap_iterator(ch, pgmeta_oid).then([this](auto iter) { + return seastar::do_until([iter] { return !iter->valid(); }, + [iter, this]() mutable { + process_entry(iter); + return iter->next(); + }); + }).then([this] { + log = PGLog::IndexedLog( + info.last_update, + info.log_tail, + on_disk_can_rollback_to, + on_disk_rollback_info_trimmed_to, + std::move(entries), + std::move(dups)); + }); + } + }; +} + +seastar::future<> PGLog::read_log_and_missing_crimson( + crimson::os::FuturizedStore &store, + crimson::os::CollectionRef ch, + const pg_info_t &info, + IndexedLog &log, + std::set* log_keys_debug, + pg_missing_tracker_t &missing, + ghobject_t pgmeta_oid, + const DoutPrefixProvider *dpp) +{ + ldpp_dout(dpp, 20) << "read_log_and_missing coll " + << ch->get_cid() + << " " << pgmeta_oid << dendl; + return seastar::do_with(FuturizedStoreLogReader{ + store, info, log, log_keys_debug, + missing, dpp}, + [ch, pgmeta_oid](FuturizedStoreLogReader& reader) { + return reader.read(ch, pgmeta_oid); + }); +} + +#endif diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h new file mode 100644 index 000000000..69ca1d20c --- /dev/null +++ b/src/osd/PGLog.h @@ -0,0 +1,1697 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#pragma once + +// re-include our assert to clobber boost's +#include "include/ceph_assert.h" +#include "include/common_fwd.h" +#include "osd_types.h" +#include "os/ObjectStore.h" +#include + +#ifdef WITH_SEASTAR +#include +#include "crimson/os/futurized_store.h" +#include "crimson/os/cyanstore/cyan_collection.h" +#endif + +/** @name PG Log + * + * The pg log serves three primary purposes: + * + * 1) improving recovery speed + * + * 2) detecting duplicate ops + * + * 3) making erasure coded updates safe + * + * For (1), the main data type is pg_log_entry_t. this is indexed in + * memory by the IndexedLog class - this is where most of the logic + * surrounding pg log is kept, even though the low level types are in + * src/osd/osd_types.h + * + * (2) uses a type which is a subset of the full log entry, containing + * just the pieces we need to identify and respond to a duplicate + * request. + * + * As we trim the log, we convert pg_log_entry_t to smaller + * pg_log_dup_t, and finally remove them once we reach a higher + * limit. This is controlled by a few options: + * + * osd_min_pg_log_entries osd_max_pg_log_entries + * osd_pg_log_dups_tracked + * + * For example, with a min of 100, max of 1000, and dups tracked of + * 3000, the log entries and dups stored would span the following + * versions, assuming the current earliest is version 1: + * + * version: 3000 2001 2000 1 [ pg log entries ] [ pg log dups ] + * + * after osd_pg_log_trim_min subsequent writes to this PG, the log + * would be trimmed to look like: + * + * version: 3100 2101 2100 101 [ pg log entries ] [ pg log dups ] + * + * (3) means tracking the previous state of an object, so that we can + * rollback to that prior state if necessary. It's only used for + * erasure coding. 
Consider an erasure code of 4+2, for example. + * + * This means we split the object into 4 pieces (called shards) and + * compute 2 parity shards. Each of these shards is stored on a + * separate OSD. As long as 4 shards are the same version, we can + * recover the remaining 2 by computation. Imagine during a write, 3 + * of the osds go down and restart, resulting in shards 0,1,2 + * reflecting version A and shards 3,4,5 reflecting version B, after + * the write. + * + * If we had no way to reconstruct version A for another shard, we + * would have lost the object. + * + * The actual data for rollback is stored in a look-aside object and + * is removed once the EC write commits on all shards. The pg log just + * stores the versions so we can tell how far we can rollback, and a + * description of the type of operation for each log entry. Beyond + * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor + * for more details on this. + * + * An important implication of this is that although the pg log length + * is normally bounded, under extreme conditions, with many EC I/Os + * outstanding, the log may grow beyond that point because we need to + * keep the rollback information for all outstanding EC I/O. + * + * For more on pg log bounds, see where it is calculated in + * PeeringState::calc_trim_to_aggressive(). + * + * For more details on how peering uses the pg log, and architectural + * reasons for its existence, see: + * + * doc/dev/osd_internals/log_based_pg.rst + * + */ + +constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0; +constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1; +constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2; +constexpr auto PGLOG_INDEXED_DUPS = 1 << 3; +constexpr auto PGLOG_INDEXED_ALL = PGLOG_INDEXED_OBJECTS + | PGLOG_INDEXED_CALLER_OPS + | PGLOG_INDEXED_EXTRA_CALLER_OPS + | PGLOG_INDEXED_DUPS; + +struct PGLog : DoutPrefixProvider { + std::ostream& gen_prefix(std::ostream& out) const override { + return out; + } + unsigned get_subsys() const override { + return static_cast(ceph_subsys_osd); + } + CephContext *get_cct() const override { + return cct; + } + + ////////////////////////////// sub classes ////////////////////////////// + struct LogEntryHandler { + virtual void rollback( + const pg_log_entry_t &entry) = 0; + virtual void rollforward( + const pg_log_entry_t &entry) = 0; + virtual void trim( + const pg_log_entry_t &entry) = 0; + virtual void remove( + const hobject_t &hoid) = 0; + virtual void try_stash( + const hobject_t &hoid, + version_t v) = 0; + virtual ~LogEntryHandler() {} + }; + using LogEntryHandlerRef = std::unique_ptr; + +public: + /** + * IndexLog - adds in-memory index of the log, by oid. + * plus some methods to manipulate it all. + */ + struct IndexedLog : public pg_log_t { + mutable ceph::unordered_map objects; // ptrs into log. be careful! + mutable ceph::unordered_map caller_ops; + mutable ceph::unordered_multimap extra_caller_ops; + mutable ceph::unordered_map dup_index; + + // recovery pointers + std::list::iterator complete_to; // not inclusive of referenced item + version_t last_requested = 0; // last object requested by primary + + // + private: + mutable __u16 indexed_data = 0; + /** + * rollback_info_trimmed_to_riter points to the first log entry <= + * rollback_info_trimmed_to + * + * It's a reverse_iterator because rend() is a natural representation for + * tail, and rbegin() works nicely for head. 
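A tiny standalone illustration of that convention, with plain ints standing in for log entry versions ordered oldest first:

#include <cassert>
#include <list>

int main() {
  // log entries ordered oldest (tail side) .. newest (head)
  std::list<int> log{1, 2, 3, 4};

  // rbegin() refers to the newest entry (the head) ...
  assert(*log.rbegin() == 4);

  // ... and rend() is the natural "past the tail" position, so "nothing
  // trimmed" is simply riter == rbegin() and "everything trimmed" is
  // riter == rend(), with no special cases at either end.
  auto riter = log.rbegin();
  int trimmed_through = 2;               // pretend rollback info trimmed to version 2
  while (riter != log.rend() && *riter > trimmed_through)
    ++riter;                             // walk back toward the tail
  assert(riter != log.rend() && *riter == 2);
}

reset_rollback_info_trimmed_to_riter() below performs essentially this walk, stopping at the first entry whose version is no longer above rollback_info_trimmed_to.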
+ */ + mempool::osd_pglog::list::reverse_iterator + rollback_info_trimmed_to_riter; + + /* + * return true if we need to mark the pglog as dirty + */ + template + bool advance_can_rollback_to(eversion_t to, F &&f) { + bool dirty_log = to > can_rollback_to || to > rollback_info_trimmed_to; + if (dirty_log) { + if (to > can_rollback_to) + can_rollback_to = to; + + if (to > rollback_info_trimmed_to) + rollback_info_trimmed_to = to; + } + + while (rollback_info_trimmed_to_riter != log.rbegin()) { + --rollback_info_trimmed_to_riter; + if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) { + ++rollback_info_trimmed_to_riter; + break; + } + f(*rollback_info_trimmed_to_riter); + } + + return dirty_log; + } + + void reset_rollback_info_trimmed_to_riter() { + rollback_info_trimmed_to_riter = log.rbegin(); + while (rollback_info_trimmed_to_riter != log.rend() && + rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) + ++rollback_info_trimmed_to_riter; + } + + // indexes objects, caller ops and extra caller ops + public: + IndexedLog() : + complete_to(log.end()), + last_requested(0), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { } + + template + explicit IndexedLog(Args&&... args) : + pg_log_t(std::forward(args)...), + complete_to(log.end()), + last_requested(0), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { + reset_rollback_info_trimmed_to_riter(); + index(); + } + + IndexedLog(const IndexedLog &rhs) : + pg_log_t(rhs), + complete_to(log.end()), + last_requested(rhs.last_requested), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { + reset_rollback_info_trimmed_to_riter(); + index(rhs.indexed_data); + } + + IndexedLog &operator=(const IndexedLog &rhs) { + this->~IndexedLog(); + new (this) IndexedLog(rhs); + return *this; + } + + void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) { + advance_can_rollback_to( + to, + [&](pg_log_entry_t &entry) { + h->trim(entry); + }); + } + bool roll_forward_to(eversion_t to, LogEntryHandler *h) { + return advance_can_rollback_to( + to, + [&](pg_log_entry_t &entry) { + h->rollforward(entry); + }); + } + + void skip_can_rollback_to_to_head() { + advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {}); + } + + mempool::osd_pglog::list rewind_from_head(eversion_t newhead) { + auto divergent = pg_log_t::rewind_from_head(newhead); + index(); + reset_rollback_info_trimmed_to_riter(); + return divergent; + } + + template + void scan_log_after( + const eversion_t &bound, ///< [in] scan entries > bound + T &&f) const { + auto iter = log.rbegin(); + while (iter != log.rend() && iter->version > bound) + ++iter; + + while (true) { + if (iter == log.rbegin()) + break; + f(*(--iter)); + } + } + + /****/ + void claim_log_and_clear_rollback_info(const pg_log_t& o) { + // we must have already trimmed the old entries + ceph_assert(rollback_info_trimmed_to == head); + ceph_assert(rollback_info_trimmed_to_riter == log.rbegin()); + + *this = IndexedLog(o); + + skip_can_rollback_to_to_head(); + index(); + } + + void split_out_child( + pg_t child_pgid, + unsigned split_bits, + IndexedLog *target); + + void zero() { + // we must have already trimmed the old entries + ceph_assert(rollback_info_trimmed_to == head); + ceph_assert(rollback_info_trimmed_to_riter == log.rbegin()); + + unindex(); + pg_log_t::clear(); + rollback_info_trimmed_to_riter = log.rbegin(); + reset_recovery_pointers(); + } + void clear() { + skip_can_rollback_to_to_head(); + zero(); + } + void 
reset_recovery_pointers() { + complete_to = log.end(); + last_requested = 0; + } + + bool logged_object(const hobject_t& oid) const { + if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { + index_objects(); + } + return objects.count(oid); + } + + bool logged_req(const osd_reqid_t &r) const { + if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { + index_caller_ops(); + } + if (!caller_ops.count(r)) { + if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { + index_extra_caller_ops(); + } + return extra_caller_ops.count(r); + } + return true; + } + + bool get_request( + const osd_reqid_t &r, + eversion_t *version, + version_t *user_version, + int *return_code, + std::vector *op_returns) const + { + ceph_assert(version); + ceph_assert(user_version); + ceph_assert(return_code); + if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { + index_caller_ops(); + } + auto p = caller_ops.find(r); + if (p != caller_ops.end()) { + *version = p->second->version; + *user_version = p->second->user_version; + *return_code = p->second->return_code; + *op_returns = p->second->op_returns; + return true; + } + + // warning: we will return *a* request for this reqid, but not + // necessarily the most recent. + if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { + index_extra_caller_ops(); + } + p = extra_caller_ops.find(r); + if (p != extra_caller_ops.end()) { + uint32_t idx = 0; + for (auto i = p->second->extra_reqids.begin(); + i != p->second->extra_reqids.end(); + ++idx, ++i) { + if (i->first == r) { + *version = p->second->version; + *user_version = i->second; + *return_code = p->second->return_code; + *op_returns = p->second->op_returns; + if (*return_code >= 0) { + auto it = p->second->extra_reqid_return_codes.find(idx); + if (it != p->second->extra_reqid_return_codes.end()) { + *return_code = it->second; + } + } + return true; + } + } + ceph_abort_msg("in extra_caller_ops but not extra_reqids"); + } + + if (!(indexed_data & PGLOG_INDEXED_DUPS)) { + index_dups(); + } + auto q = dup_index.find(r); + if (q != dup_index.end()) { + *version = q->second->version; + *user_version = q->second->user_version; + *return_code = q->second->return_code; + *op_returns = q->second->op_returns; + return true; + } + + return false; + } + + bool has_write_since(const hobject_t &oid, const eversion_t &bound) const { + for (auto i = log.rbegin(); i != log.rend(); ++i) { + if (i->version <= bound) + return false; + if (i->soid.get_head() == oid.get_head()) + return true; + } + return false; + } + + /// get a (bounded) std::list of recent reqids for the given object + void get_object_reqids(const hobject_t& oid, unsigned max, + mempool::osd_pglog::vector > *pls, + mempool::osd_pglog::map *return_codes) const { + // make sure object is present at least once before we do an + // O(n) search. 
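The indexed_data bitmask gates which of the lazily built indexes are currently valid. A small self-contained sketch of the same build-on-first-use pattern; the names (LazyIndexedLog, ensure_objects_index) are hypothetical and a plain std::map stands in for the ceph::unordered_map members:

#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct LazyIndexedLog {
  std::vector<std::pair<std::string, int>> log;   // (oid, version), append-only
  mutable std::map<std::string, int> by_oid;      // oid -> newest version
  mutable uint16_t indexed = 0;                   // bitmask of valid indexes

  static constexpr uint16_t OBJECTS = 1 << 0;     // mirrors PGLOG_INDEXED_OBJECTS

  void ensure_objects_index() const {
    if (indexed & OBJECTS)
      return;                                     // already built
    by_oid.clear();
    for (const auto& [oid, ver] : log)
      by_oid[oid] = ver;                          // later entries overwrite older ones
    indexed |= OBJECTS;                           // mark this index valid
  }

  bool logged_object(const std::string& oid) const {
    ensure_objects_index();                       // build on first use only
    return by_oid.count(oid) != 0;
  }
};

int main() {
  LazyIndexedLog l;
  l.log = {{"a", 1}, {"b", 2}, {"a", 3}};
  return l.logged_object("a") ? 0 : 1;            // index built here, reused afterwards
}

logged_object(), get_object_reqids() and the other lookups here all follow this shape: test the bit, call index(...) to build just the map they need, then do the lookup.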
+ if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { + index_objects(); + } + if (objects.count(oid) == 0) + return; + + for (auto i = log.rbegin(); i != log.rend(); ++i) { + if (i->soid == oid) { + if (i->reqid_is_indexed()) { + if (i->op == pg_log_entry_t::ERROR) { + // propagate op errors to the cache tier's PG log + return_codes->emplace(pls->size(), i->return_code); + } + pls->push_back(std::make_pair(i->reqid, i->user_version)); + } + + pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end()); + if (pls->size() >= max) { + if (pls->size() > max) { + pls->resize(max); + } + return; + } + } + } + } + + void index(__u16 to_index = PGLOG_INDEXED_ALL) const { + // if to_index is 0, no need to run any of this code, especially + // loop below; this can happen with copy constructor for + // IndexedLog (and indirectly through assignment operator) + if (!to_index) return; + + if (to_index & PGLOG_INDEXED_OBJECTS) + objects.clear(); + if (to_index & PGLOG_INDEXED_CALLER_OPS) + caller_ops.clear(); + if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) + extra_caller_ops.clear(); + if (to_index & PGLOG_INDEXED_DUPS) { + dup_index.clear(); + for (auto& i : dups) { + dup_index[i.reqid] = const_cast(&i); + } + } + + constexpr __u16 any_log_entry_index = + PGLOG_INDEXED_OBJECTS | + PGLOG_INDEXED_CALLER_OPS | + PGLOG_INDEXED_EXTRA_CALLER_OPS; + + if (to_index & any_log_entry_index) { + for (auto i = log.begin(); i != log.end(); ++i) { + if (to_index & PGLOG_INDEXED_OBJECTS) { + if (i->object_is_indexed()) { + objects[i->soid] = const_cast(&(*i)); + } + } + + if (to_index & PGLOG_INDEXED_CALLER_OPS) { + if (i->reqid_is_indexed()) { + caller_ops[i->reqid] = const_cast(&(*i)); + } + } + + if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = i->extra_reqids.begin(); + j != i->extra_reqids.end(); + ++j) { + extra_caller_ops.insert( + std::make_pair(j->first, const_cast(&(*i)))); + } + } + } + } + + indexed_data |= to_index; + } + + void index_objects() const { + index(PGLOG_INDEXED_OBJECTS); + } + + void index_caller_ops() const { + index(PGLOG_INDEXED_CALLER_OPS); + } + + void index_extra_caller_ops() const { + index(PGLOG_INDEXED_EXTRA_CALLER_OPS); + } + + void index_dups() const { + index(PGLOG_INDEXED_DUPS); + } + + void index(pg_log_entry_t& e) { + if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { + if (objects.count(e.soid) == 0 || + objects[e.soid]->version < e.version) + objects[e.soid] = &e; + } + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + // divergent merge_log indexes new before unindexing old + if (e.reqid_is_indexed()) { + caller_ops[e.reqid] = &e; + } + } + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + extra_caller_ops.insert(std::make_pair(j->first, &e)); + } + } + } + + void unindex() { + objects.clear(); + caller_ops.clear(); + extra_caller_ops.clear(); + dup_index.clear(); + indexed_data = 0; + } + + void unindex(const pg_log_entry_t& e) { + // NOTE: this only works if we remove from the _tail_ of the log! 
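+ // (i.e. only from the oldest end, as trim() does; the checks below erase an
+ // index slot only if it still points at this exact entry, so a newer entry
+ // for the same object or reqid keeps its index intact)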
+ if (indexed_data & PGLOG_INDEXED_OBJECTS) { + auto it = objects.find(e.soid); + if (it != objects.end() && it->second->version == e.version) + objects.erase(it); + } + if (e.reqid_is_indexed()) { + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + auto it = caller_ops.find(e.reqid); + // divergent merge_log indexes new before unindexing old + if (it != caller_ops.end() && it->second == &e) + caller_ops.erase(it); + } + } + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + for (auto k = extra_caller_ops.find(j->first); + k != extra_caller_ops.end() && k->first == j->first; + ++k) { + if (k->second == &e) { + extra_caller_ops.erase(k); + break; + } + } + } + } + } + + void index(pg_log_dup_t& e) { + if (indexed_data & PGLOG_INDEXED_DUPS) { + dup_index[e.reqid] = &e; + } + } + + void unindex(const pg_log_dup_t& e) { + if (indexed_data & PGLOG_INDEXED_DUPS) { + auto i = dup_index.find(e.reqid); + if (i != dup_index.end()) { + dup_index.erase(i); + } + } + } + + // actors + void add(const pg_log_entry_t& e, bool applied = true) { + if (!applied) { + ceph_assert(get_can_rollback_to() == head); + } + + // make sure our buffers don't pin bigger buffers + e.mod_desc.trim_bl(); + + // add to log + log.push_back(e); + + // riter previously pointed to the previous entry + if (rollback_info_trimmed_to_riter == log.rbegin()) + ++rollback_info_trimmed_to_riter; + + ceph_assert(e.version > head); + ceph_assert(head.version == 0 || e.version.version > head.version); + head = e.version; + + // to our index + if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { + objects[e.soid] = &(log.back()); + } + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + if (e.reqid_is_indexed()) { + caller_ops[e.reqid] = &(log.back()); + } + } + + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + extra_caller_ops.insert(std::make_pair(j->first, &(log.back()))); + } + } + + if (!applied) { + skip_can_rollback_to_to_head(); + } + } // add + + void trim( + CephContext* cct, + eversion_t s, + std::set *trimmed, + std::set* trimmed_dups, + eversion_t *write_from_dups); + + std::ostream& print(std::ostream& out) const; + }; // IndexedLog + + +protected: + //////////////////// data members //////////////////// + + pg_missing_tracker_t missing; + IndexedLog log; + + eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to + eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from + eversion_t writeout_from; ///< must writout keys >= writeout_from + std::set trimmed; ///< must clear keys in trimmed + eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups + eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups + eversion_t write_from_dups; ///< must write keys >= write_from_dups + std::set trimmed_dups; ///< must clear keys in trimmed_dups + CephContext *cct; + bool pg_log_debug; + /// Log is clean on [dirty_to, dirty_from) + bool touched_log; + bool dirty_log; + bool clear_divergent_priors; + bool may_include_deletes_in_missing_dirty = false; + + void mark_dirty_to(eversion_t to) { + if (to > dirty_to) + dirty_to = to; + } + void mark_dirty_from(eversion_t from) { + if (from < dirty_from) + dirty_from = from; + } + void mark_writeout_from(eversion_t from) { + if (from < writeout_from) + writeout_from = from; + } + void mark_dirty_to_dups(eversion_t to) { + if (to > dirty_to_dups) + 
dirty_to_dups = to; + } + void mark_dirty_from_dups(eversion_t from) { + if (from < dirty_from_dups) + dirty_from_dups = from; + } +public: + bool needs_write() const { + return !touched_log || is_dirty(); + } + + bool is_dirty() const { + return dirty_log || + (dirty_to != eversion_t()) || + (dirty_from != eversion_t::max()) || + (writeout_from != eversion_t::max()) || + !(trimmed.empty()) || + !missing.is_clean() || + !(trimmed_dups.empty()) || + (dirty_to_dups != eversion_t()) || + (dirty_from_dups != eversion_t::max()) || + (write_from_dups != eversion_t::max()) || + may_include_deletes_in_missing_dirty; + } + + void mark_log_for_rewrite() { + mark_dirty_to(eversion_t::max()); + mark_dirty_from(eversion_t()); + mark_dirty_to_dups(eversion_t::max()); + mark_dirty_from_dups(eversion_t()); + touched_log = false; + } + bool get_may_include_deletes_in_missing_dirty() const { + return may_include_deletes_in_missing_dirty; + } +protected: + + /// DEBUG + std::set log_keys_debug; + static void clear_after(std::set *log_keys_debug, const std::string &lb) { + if (!log_keys_debug) + return; + for (auto i = log_keys_debug->lower_bound(lb); + i != log_keys_debug->end(); + log_keys_debug->erase(i++)); + } + static void clear_up_to(std::set *log_keys_debug, const std::string &ub) { + if (!log_keys_debug) + return; + for (auto i = log_keys_debug->begin(); + i != log_keys_debug->end() && *i < ub; + log_keys_debug->erase(i++)); + } + + void check(); + void undirty() { + dirty_to = eversion_t(); + dirty_from = eversion_t::max(); + touched_log = true; + dirty_log = false; + trimmed.clear(); + trimmed_dups.clear(); + writeout_from = eversion_t::max(); + check(); + missing.flush(); + dirty_to_dups = eversion_t(); + dirty_from_dups = eversion_t::max(); + write_from_dups = eversion_t::max(); + } +public: + + // cppcheck-suppress noExplicitConstructor + PGLog(CephContext *cct) : + dirty_from(eversion_t::max()), + writeout_from(eversion_t::max()), + dirty_from_dups(eversion_t::max()), + write_from_dups(eversion_t::max()), + cct(cct), + pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))), + touched_log(false), + dirty_log(false), + clear_divergent_priors(false) + { } + + void reset_backfill(); + + void clear(); + + //////////////////// get or std::set missing //////////////////// + + const pg_missing_tracker_t& get_missing() const { return missing; } + + void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) { + missing.add(oid, need, have, is_delete); + } + + void missing_add_next_entry(const pg_log_entry_t& e) { + missing.add_next_event(e); + } + + //////////////////// get or std::set log //////////////////// + + const IndexedLog &get_log() const { return log; } + + const eversion_t &get_tail() const { return log.tail; } + + void set_tail(eversion_t tail) { log.tail = tail; } + + const eversion_t &get_head() const { return log.head; } + + void set_head(eversion_t head) { log.head = head; } + + void set_last_requested(version_t last_requested) { + log.last_requested = last_requested; + } + + void index() { log.index(); } + + void unindex() { log.unindex(); } + + void add(const pg_log_entry_t& e, bool applied = true) { + mark_writeout_from(e.version); + log.add(e, applied); + } + + void reset_recovery_pointers() { log.reset_recovery_pointers(); } + + static void clear_info_log( + spg_t pgid, + ObjectStore::Transaction *t); + + void trim( + eversion_t trim_to, + pg_info_t &info, + bool transaction_applied = true, + bool async = false); + + void roll_forward_to( 
+ eversion_t roll_forward_to, + LogEntryHandler *h) { + if (log.roll_forward_to( + roll_forward_to, + h)) + dirty_log = true; + } + + eversion_t get_can_rollback_to() const { + return log.get_can_rollback_to(); + } + + void roll_forward(LogEntryHandler *h) { + roll_forward_to( + log.head, + h); + } + + void skip_rollforward() { + log.skip_can_rollback_to_to_head(); + } + + //////////////////// get or std::set log & missing //////////////////// + + void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) { + log.trim_rollback_info_to(log.head, h); + log.claim_log_and_clear_rollback_info(o); + missing.clear(); + mark_dirty_to(eversion_t::max()); + mark_dirty_to_dups(eversion_t::max()); + } + + void split_into( + pg_t child_pgid, + unsigned split_bits, + PGLog *opg_log) { + log.split_out_child(child_pgid, split_bits, &opg_log->log); + missing.split_into(child_pgid, split_bits, &(opg_log->missing)); + opg_log->mark_dirty_to(eversion_t::max()); + opg_log->mark_dirty_to_dups(eversion_t::max()); + mark_dirty_to(eversion_t::max()); + mark_dirty_to_dups(eversion_t::max()); + if (missing.may_include_deletes) { + opg_log->set_missing_may_contain_deletes(); + } + } + + void merge_from( + const std::vector& sources, + eversion_t last_update) { + unindex(); + missing.clear(); + + std::vector slogs; + for (auto s : sources) { + slogs.push_back(&s->log); + } + log.merge_from(slogs, last_update); + + index(); + + mark_log_for_rewrite(); + } + + void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) { + if (missing.is_missing(oid, v)) { + missing.got(oid, v); + info.stats.stats.sum.num_objects_missing = missing.num_missing(); + + // raise last_complete? + if (missing.get_items().empty()) { + log.complete_to = log.log.end(); + info.last_complete = info.last_update; + } + auto oldest_need = missing.get_oldest_need(); + while (log.complete_to != log.log.end()) { + if (oldest_need <= log.complete_to->version) + break; + if (info.last_complete < log.complete_to->version) + info.last_complete = log.complete_to->version; + ++log.complete_to; + } + } + + ceph_assert(log.get_can_rollback_to() >= v); + } + + void reset_complete_to(pg_info_t *info) { + if (log.log.empty()) // caller is split_into() + return; + log.complete_to = log.log.begin(); + ceph_assert(log.complete_to != log.log.end()); + auto oldest_need = missing.get_oldest_need(); + if (oldest_need != eversion_t()) { + while (log.complete_to->version < oldest_need) { + ++log.complete_to; + ceph_assert(log.complete_to != log.log.end()); + } + } + if (!info) + return; + if (log.complete_to == log.log.begin()) { + info->last_complete = eversion_t(); + } else { + --log.complete_to; + info->last_complete = log.complete_to->version; + ++log.complete_to; + } + } + + void activate_not_complete(pg_info_t &info) { + reset_complete_to(&info); + log.last_requested = 0; + } + + void proc_replica_log(pg_info_t &oinfo, + const pg_log_t &olog, + pg_missing_t& omissing, pg_shard_t from) const; + + void set_missing_may_contain_deletes() { + missing.may_include_deletes = true; + may_include_deletes_in_missing_dirty = true; + } + + void rebuild_missing_set_with_deletes(ObjectStore *store, + ObjectStore::CollectionHandle& ch, + const pg_info_t &info); + +protected: + static void split_by_object( + mempool::osd_pglog::list &entries, + std::map> *out_entries) { + while (!entries.empty()) { + auto &out_list = (*out_entries)[entries.front().soid]; + out_list.splice(out_list.end(), entries, entries.begin()); + } + } + + /** + * _merge_object_divergent_entries + * + 
* There are 5 distinct cases: + * 1) There is a more recent update: in this case we assume we adjusted the + * store and missing during merge_log + * 2) The first entry in the divergent sequence is a create. This might + * either be because the object is a clone or because prior_version is + * eversion_t(). In this case the object does not exist and we must + * adjust missing and the store to match. + * 3) We are currently missing the object. In this case, we adjust the + * missing to our prior_version taking care to add a divergent_prior + * if necessary + * 4) We can rollback all of the entries. In this case, we do so using + * the rollbacker and return -- the object does not go into missing. + * 5) We cannot rollback at least 1 of the entries. In this case, we + * clear the object out of the store and add a missing entry at + * prior_version taking care to add a divergent_prior if + * necessary. + */ + template + static void _merge_object_divergent_entries( + const IndexedLog &log, ///< [in] log to merge against + const hobject_t &hoid, ///< [in] object we are merging + const mempool::osd_pglog::list &orig_entries, ///< [in] entries for hoid to merge + const pg_info_t &info, ///< [in] info for merging entries + eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input InedexedLog + missing_type &missing, ///< [in,out] missing to adjust, use + LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + const DoutPrefixProvider *dpp ///< [in] logging provider + ) { + ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid + << " entries: " << orig_entries << dendl; + + if (hoid > info.last_backfill) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill" + << dendl; + return; + } + + // entries is non-empty + ceph_assert(!orig_entries.empty()); + // strip out and ignore ERROR entries + mempool::osd_pglog::list entries; + eversion_t last; + bool seen_non_error = false; + for (auto i = orig_entries.begin(); + i != orig_entries.end(); + ++i) { + // all entries are on hoid + ceph_assert(i->soid == hoid); + // did not see error entries before this entry and this entry is not error + // then this entry is the first non error entry + bool first_non_error = ! seen_non_error && ! i->is_error(); + if (! i->is_error() ) { + // see a non error entry now + seen_non_error = true; + } + + // No need to check the first entry since it prior_version is unavailable + // in the std::list + // No need to check if the prior_version is the minimal version + // No need to check the first non-error entry since the leading error + // entries are not its prior version + if (i != orig_entries.begin() && i->prior_version != eversion_t() && + ! 
first_non_error) { + // in increasing order of version + ceph_assert(i->version > last); + // prior_version correct (unless it is an ERROR entry) + ceph_assert(i->prior_version == last || i->is_error()); + } + if (i->is_error()) { + ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl; + } else { + ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl; + entries.push_back(*i); + last = i->version; + } + } + if (entries.empty()) { + ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl; + return; + } + + const eversion_t prior_version = entries.begin()->prior_version; + const eversion_t first_divergent_update = entries.begin()->version; + const eversion_t last_divergent_update = entries.rbegin()->version; + const bool object_not_in_store = + !missing.is_missing(hoid) && + entries.rbegin()->is_delete(); + ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: " + << object_not_in_store << dendl; + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version: " << prior_version + << " first_divergent_update: " << first_divergent_update + << " last_divergent_update: " << last_divergent_update + << dendl; + + auto objiter = log.objects.find(hoid); + if (objiter != log.objects.end() && + objiter->second->version >= first_divergent_update) { + /// Case 1) + ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: " + << *objiter->second << ", already merged" << dendl; + + ceph_assert(objiter->second->version > last_divergent_update); + + // ensure missing has been updated appropriately + if (objiter->second->is_update() || + (missing.may_include_deletes && objiter->second->is_delete())) { + ceph_assert(missing.is_missing(hoid) && + missing.get_items().at(hoid).need == objiter->second->version); + } else { + ceph_assert(!missing.is_missing(hoid)); + } + missing.revise_have(hoid, eversion_t()); + missing.mark_fully_dirty(hoid); + if (rollbacker) { + if (!object_not_in_store) { + rollbacker->remove(hoid); + } + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + return; + } + + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + <<" has no more recent entries in log" << dendl; + if (prior_version == eversion_t() || entries.front().is_clone()) { + /// Case 2) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version or op type indicates creation," + << " deleting" + << dendl; + if (missing.is_missing(hoid)) + missing.rm(missing.get_items().find(hoid)); + if (rollbacker) { + if (!object_not_in_store) { + rollbacker->remove(hoid); + } + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + return; + } + + if (missing.is_missing(hoid)) { + /// Case 3) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing, " << missing.get_items().at(hoid) + << " adjusting" << dendl; + + if (missing.get_items().at(hoid).have == prior_version) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing.have is prior_version " << prior_version + << " removing from missing" << dendl; + missing.rm(missing.get_items().find(hoid)); + } else { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing.have is " << missing.get_items().at(hoid).have + << ", adjusting" << dendl; + missing.revise_need(hoid, prior_version, false); + if (prior_version <= info.log_tail) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version " << prior_version + << " <= info.log_tail " + << info.log_tail << dendl; + } + } + if (rollbacker) { + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + 
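// Case 3 complete: the object remains missing, now needing prior_version + // (or it was dropped above when have already equaled prior_version). + 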
return; + } + + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " must be rolled back or recovered," + << " attempting to rollback" + << dendl; + bool can_rollback = true; + // We are going to make an important decision based on the + // olog_can_rollback_to value we have received, better known it. + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " olog_can_rollback_to: " + << olog_can_rollback_to << dendl; + /// Distinguish between 4) and 5) + for (auto i = entries.rbegin(); i != entries.rend(); ++i) { + if (!i->can_rollback() || i->version <= olog_can_rollback_to) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback " + << *i << dendl; + can_rollback = false; + break; + } + } + + if (can_rollback) { + /// Case 4) + for (auto i = entries.rbegin(); i != entries.rend(); ++i) { + ceph_assert(i->can_rollback() && i->version > olog_can_rollback_to); + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " rolling back " << *i << dendl; + if (rollbacker) + rollbacker->rollback(*i); + } + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " rolled back" << dendl; + return; + } else { + /// Case 5) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, " + << "removing and adding to missing" << dendl; + if (rollbacker) { + if (!object_not_in_store) + rollbacker->remove(hoid); + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + missing.add(hoid, prior_version, eversion_t(), false); + if (prior_version <= info.log_tail) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version " << prior_version + << " <= info.log_tail " + << info.log_tail << dendl; + } + } + } + + /// Merge all entries using above + template + static void _merge_divergent_entries( + const IndexedLog &log, ///< [in] log to merge against + mempool::osd_pglog::list &entries, ///< [in] entries to merge + const pg_info_t &oinfo, ///< [in] info for merging entries + eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog + missing_type &omissing, ///< [in,out] missing to adjust, use + LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + const DoutPrefixProvider *dpp ///< [in] logging provider + ) { + std::map > split; + split_by_object(entries, &split); + for (auto i = split.begin(); i != split.end(); ++i) { + _merge_object_divergent_entries( + log, + i->first, + i->second, + oinfo, + olog_can_rollback_to, + omissing, + rollbacker, + dpp); + } + } + + /** + * Exists for use in TestPGLog for simply testing single divergent log + * cases + */ + void merge_old_entry( + ObjectStore::Transaction& t, + const pg_log_entry_t& oe, + const pg_info_t& info, + LogEntryHandler *rollbacker) { + mempool::osd_pglog::list entries; + entries.push_back(oe); + _merge_object_divergent_entries( + log, + oe.soid, + entries, + info, + log.get_can_rollback_to(), + missing, + rollbacker, + this); + } + + bool merge_log_dups(const pg_log_t& olog); + +public: + + void rewind_divergent_log(eversion_t newhead, + pg_info_t &info, + LogEntryHandler *rollbacker, + bool &dirty_info, + bool &dirty_big_info); + + void merge_log(pg_info_t &oinfo, + pg_log_t&& olog, + pg_shard_t from, + pg_info_t &info, LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info); + + template + static bool append_log_entries_update_missing( + const hobject_t &last_backfill, + const mempool::osd_pglog::list &entries, + bool maintain_rollback, + IndexedLog *log, + missing_type &missing, + LogEntryHandler *rollbacker, + const 
DoutPrefixProvider *dpp) { + bool invalidate_stats = false; + if (log && !entries.empty()) { + ceph_assert(log->head < entries.begin()->version); + } + for (auto p = entries.begin(); p != entries.end(); ++p) { + invalidate_stats = invalidate_stats || !p->is_error(); + if (log) { + ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl; + log->add(*p); + } + if (p->soid <= last_backfill && + !p->is_error()) { + if (missing.may_include_deletes) { + missing.add_next_event(*p); + } else { + if (p->is_delete()) { + missing.rm(p->soid, p->version); + } else { + missing.add_next_event(*p); + } + if (rollbacker) { + // hack to match PG::mark_all_unfound_lost + if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) { + rollbacker->try_stash(p->soid, p->version.version); + } else if (p->is_delete()) { + rollbacker->remove(p->soid); + } + } + } + } + } + return invalidate_stats; + } + bool append_new_log_entries( + const hobject_t &last_backfill, + const mempool::osd_pglog::list &entries, + LogEntryHandler *rollbacker) { + bool invalidate_stats = append_log_entries_update_missing( + last_backfill, + entries, + true, + &log, + missing, + rollbacker, + this); + if (!entries.empty()) { + mark_writeout_from(entries.begin()->version); + if (entries.begin()->is_lost_delete()) { + // hack: since lost deletes queue recovery directly, and don't + // go through activate_not_complete() again, our complete_to + // iterator may still point at log.end(). Reset it to point + // before these new lost_delete entries. This only occurs + // when lost+delete entries are initially added, which is + // always in a std::list of solely lost_delete entries, so it is + // sufficient to check whether the first entry is a + // lost_delete + reset_complete_to(nullptr); + } + } + return invalidate_stats; + } + + void write_log_and_missing( + ObjectStore::Transaction& t, + std::map *km, + const coll_t& coll, + const ghobject_t &log_oid, + bool require_rollback); + + static void write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + std::map* km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, std::map &divergent_priors, + bool require_rollback, + const DoutPrefixProvider *dpp = nullptr); + + static void write_log_and_missing( + ObjectStore::Transaction& t, + std::map* km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, + const pg_missing_tracker_t &missing, + bool require_rollback, + bool *rebuilt_missing_set_with_deletes, + const DoutPrefixProvider *dpp = nullptr); + + static void _write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + std::map* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + std::map &divergent_priors, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + bool dirty_divergent_priors, + bool touch_log, + bool require_rollback, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + std::set *log_keys_debug, + const DoutPrefixProvider *dpp = nullptr + ); + + static void _write_log_and_missing( + ObjectStore::Transaction& t, + std::map* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + std::set &&trimmed, + std::set &&trimmed_dups, + const pg_missing_tracker_t &missing, + bool touch_log, + bool require_rollback, + bool clear_divergent_priors, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + bool 
*may_include_deletes_in_missing_dirty, + std::set *log_keys_debug, + const DoutPrefixProvider *dpp = nullptr + ); + + void read_log_and_missing( + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t pgmeta_oid, + const pg_info_t &info, + std::ostringstream &oss, + bool tolerate_divergent_missing_log, + bool debug_verify_stored_missing = false + ) { + return read_log_and_missing( + cct, store, ch, pgmeta_oid, info, + log, missing, oss, + tolerate_divergent_missing_log, + &clear_divergent_priors, + this, + (pg_log_debug ? &log_keys_debug : nullptr), + debug_verify_stored_missing); + } + + template + static void read_log_and_missing( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle &ch, + ghobject_t pgmeta_oid, + const pg_info_t &info, + IndexedLog &log, + missing_type &missing, + std::ostringstream &oss, + bool tolerate_divergent_missing_log, + bool *clear_divergent_priors = nullptr, + const DoutPrefixProvider *dpp = nullptr, + std::set *log_keys_debug = nullptr, + bool debug_verify_stored_missing = false + ) { + ldpp_dout(dpp, 10) << "read_log_and_missing coll " << ch->cid + << " " << pgmeta_oid << dendl; + size_t total_dups = 0; + + // legacy? + struct stat st; + int r = store->stat(ch, pgmeta_oid, &st); + ceph_assert(r == 0); + ceph_assert(st.st_size == 0); + + // will get overridden below if it had been recorded + eversion_t on_disk_can_rollback_to = info.last_update; + eversion_t on_disk_rollback_info_trimmed_to = eversion_t(); + ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, + pgmeta_oid); + std::map divergent_priors; + bool must_rebuild = false; + missing.may_include_deletes = false; + std::list entries; + std::list dups; + const auto NUM_DUPS_WARN_THRESHOLD = 2*cct->_conf->osd_pg_log_dups_tracked; + if (p) { + using ceph::decode; + for (p->seek_to_first(); p->valid() ; p->next()) { + // non-log pgmeta_oid keys are prefixed with _; skip those + if (p->key()[0] == '_') + continue; + auto bl = p->value();//Copy ceph::buffer::list before creating iterator + auto bp = bl.cbegin(); + if (p->key() == "divergent_priors") { + decode(divergent_priors, bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size() + << " divergent_priors" << dendl; + must_rebuild = true; + debug_verify_stored_missing = false; + } else if (p->key() == "can_rollback_to") { + decode(on_disk_can_rollback_to, bp); + } else if (p->key() == "rollback_info_trimmed_to") { + decode(on_disk_rollback_info_trimmed_to, bp); + } else if (p->key() == "may_include_deletes_in_missing") { + missing.may_include_deletes = true; + } else if (p->key().substr(0, 7) == std::string("missing")) { + hobject_t oid; + pg_missing_item item; + decode(oid, bp); + decode(item, bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << item << dendl; + if (item.is_delete()) { + ceph_assert(missing.may_include_deletes); + } + missing.add(oid, std::move(item)); + } else if (p->key().substr(0, 4) == std::string("dup_")) { + ++total_dups; + pg_log_dup_t dup; + decode(dup, bp); + if (!dups.empty()) { + ceph_assert(dups.back().version < dup.version); + } + if (dups.size() == NUM_DUPS_WARN_THRESHOLD) { + ldpp_dout(dpp, 0) << "read_log_and_missing WARN num of dups exceeded " + << NUM_DUPS_WARN_THRESHOLD << "." + << " You can be hit by THE DUPS BUG" + << " https://tracker.ceph.com/issues/53729." 
+ << " Consider ceph-objectstore-tool --op trim-pg-log-dups" + << dendl; + } + dups.push_back(dup); + } else { + pg_log_entry_t e; + e.decode_with_checksum(bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl; + if (!entries.empty()) { + pg_log_entry_t last_e(entries.back()); + ceph_assert(last_e.version.version < e.version.version); + ceph_assert(last_e.version.epoch <= e.version.epoch); + } + entries.push_back(e); + if (log_keys_debug) + log_keys_debug->insert(e.get_key_name()); + } + } + } + log = IndexedLog( + info.last_update, + info.log_tail, + on_disk_can_rollback_to, + on_disk_rollback_info_trimmed_to, + std::move(entries), + std::move(dups)); + + if (must_rebuild || debug_verify_stored_missing) { + // build missing + if (debug_verify_stored_missing || info.last_complete < info.last_update) { + ldpp_dout(dpp, 10) + << "read_log_and_missing checking for missing items over interval (" + << info.last_complete + << "," << info.last_update << "]" << dendl; + + std::set did; + std::set checked; + std::set skipped; + for (auto i = log.log.rbegin(); i != log.log.rend(); ++i) { + if (i->soid > info.last_backfill) + continue; + if (i->is_error()) + continue; + if (did.count(i->soid)) continue; + did.insert(i->soid); + + if (!missing.may_include_deletes && i->is_delete()) + continue; + + ceph::buffer::list bv; + int r = store->getattr( + ch, + ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + if (oi.version < i->version) { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i + << " (have " << oi.version << ")" + << " clean_regions " << i->clean_regions << dendl; + + if (debug_verify_stored_missing) { + auto miter = missing.get_items().find(i->soid); + ceph_assert(miter != missing.get_items().end()); + ceph_assert(miter->second.need == i->version); + // the 'have' version is reset if an object is deleted, + // then created again + ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t()); + checked.insert(i->soid); + } else { + missing.add(i->soid, i->version, oi.version, i->is_delete()); + } + } + } else { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; + if (debug_verify_stored_missing) { + auto miter = missing.get_items().find(i->soid); + if (i->is_delete()) { + ceph_assert(miter == missing.get_items().end() || + (miter->second.need == i->version && + miter->second.have == eversion_t())); + } else { + ceph_assert(miter != missing.get_items().end()); + ceph_assert(miter->second.need == i->version); + ceph_assert(miter->second.have == eversion_t()); + } + checked.insert(i->soid); + } else { + missing.add(i->soid, i->version, eversion_t(), i->is_delete()); + } + } + } + if (debug_verify_stored_missing) { + for (auto &&i: missing.get_items()) { + if (checked.count(i.first)) + continue; + if (i.first > info.last_backfill) { + ldpp_dout(dpp, -1) << __func__ << ": invalid missing std::set entry " + << "found before last_backfill: " + << i.first << " " << i.second + << " last_backfill = " << info.last_backfill + << dendl; + ceph_abort_msg("invalid missing std::set entry found"); + } + ceph::buffer::list bv; + int r = store->getattr( + ch, + ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have); + } else { + ceph_assert(i.second.is_delete() || eversion_t() == i.second.have); + } + } + } else { + ceph_assert(must_rebuild); + 
for (auto i = divergent_priors.rbegin(); + i != divergent_priors.rend(); + ++i) { + if (i->first <= info.last_complete) break; + if (i->second > info.last_backfill) + continue; + if (did.count(i->second)) continue; + did.insert(i->second); + ceph::buffer::list bv; + int r = store->getattr( + ch, + ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + /** + * 1) we see this entry in the divergent priors mapping + * 2) we didn't see an entry for this object in the log + * + * From 1 & 2 we know that either the object does not exist + * or it is at the version specified in the divergent_priors + * map since the object would have been deleted atomically + * with the addition of the divergent_priors entry, an older + * version would not have been recovered, and a newer version + * would show up in the log above. + */ + /** + * Unfortunately the assessment above is incorrect because of + * http://tracker.ceph.com/issues/17916 (we were incorrectly + * not removing the divergent_priors std::set from disk state!), + * so let's check that. + */ + if (oi.version > i->first && tolerate_divergent_missing_log) { + ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i + << ") inconsistent with disk state (" << oi + << "), assuming it is tracker.ceph.com/issues/17916" + << dendl; + } else { + ceph_assert(oi.version == i->first); + } + } else { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; + missing.add(i->second, i->first, eversion_t(), false); + } + } + } + if (clear_divergent_priors) + (*clear_divergent_priors) = true; + } + } + + if (!must_rebuild) { + if (clear_divergent_priors) + (*clear_divergent_priors) = false; + missing.flush(); + } + ldpp_dout(dpp, 10) << "read_log_and_missing done coll " << ch->cid + << " total_dups=" << total_dups + << " log.dups.size()=" << log.dups.size() << dendl; + } // static read_log_and_missing + +#ifdef WITH_SEASTAR + seastar::future<> read_log_and_missing_crimson( + crimson::os::FuturizedStore &store, + crimson::os::CollectionRef ch, + const pg_info_t &info, + ghobject_t pgmeta_oid + ) { + return read_log_and_missing_crimson( + store, ch, info, + log, (pg_log_debug ? 
&log_keys_debug : nullptr), + missing, pgmeta_oid, this); + } + + static seastar::future<> read_log_and_missing_crimson( + crimson::os::FuturizedStore &store, + crimson::os::CollectionRef ch, + const pg_info_t &info, + IndexedLog &log, + std::set* log_keys_debug, + pg_missing_tracker_t &missing, + ghobject_t pgmeta_oid, + const DoutPrefixProvider *dpp = nullptr); + +#endif + +}; // struct PGLog diff --git a/src/osd/PGPeeringEvent.cc b/src/osd/PGPeeringEvent.cc new file mode 100644 index 000000000..2d28c6f84 --- /dev/null +++ b/src/osd/PGPeeringEvent.cc @@ -0,0 +1,17 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/mempool.h" +#include "osd/PGPeeringEvent.h" +#include "messages/MOSDPGLog.h" + +MEMPOOL_DEFINE_OBJECT_FACTORY(PGPeeringEvent, pg_peering_evt, osd); + +MLogRec::MLogRec(pg_shard_t from, MOSDPGLog *msg) + : from(from), msg(msg) {} + +void MLogRec::print(std::ostream *out) const +{ + *out << "MLogRec from " << from << " "; + msg->inner_print(*out); +} diff --git a/src/osd/PGPeeringEvent.h b/src/osd/PGPeeringEvent.h new file mode 100644 index 000000000..2828880f6 --- /dev/null +++ b/src/osd/PGPeeringEvent.h @@ -0,0 +1,220 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "osd/osd_types.h" + +class MOSDPGLog; + +/// what we need to instantiate a pg +struct PGCreateInfo { + spg_t pgid; + epoch_t epoch = 0; + pg_history_t history; + PastIntervals past_intervals; + bool by_mon; + PGCreateInfo(spg_t p, epoch_t e, + const pg_history_t& h, + const PastIntervals& pi, + bool mon) + : pgid(p), epoch(e), history(h), past_intervals(pi), by_mon(mon) {} +}; + +class PGPeeringEvent { + epoch_t epoch_sent; + epoch_t epoch_requested; + std::string desc; +public: + boost::intrusive_ptr< const boost::statechart::event_base > evt; + bool requires_pg; + std::unique_ptr create_info; + MEMPOOL_CLASS_HELPERS(); + template + PGPeeringEvent( + epoch_t epoch_sent, + epoch_t epoch_requested, + const T &evt_, + bool req = true, + PGCreateInfo *ci = 0) + : epoch_sent(epoch_sent), + epoch_requested(epoch_requested), + evt(evt_.intrusive_from_this()), + requires_pg(req), + create_info(ci) { + std::stringstream out; + out << "epoch_sent: " << epoch_sent + << " epoch_requested: " << epoch_requested << " "; + evt_.print(&out); + if (create_info) { + out << " +create_info"; + } + desc = out.str(); + } + epoch_t get_epoch_sent() const { + return epoch_sent; + } + epoch_t get_epoch_requested() const { + return epoch_requested; + } + const boost::statechart::event_base &get_event() const { + return *evt; + } + const std::string& get_desc() const { + return desc; + } +}; +typedef std::shared_ptr PGPeeringEventRef; +typedef std::unique_ptr PGPeeringEventURef; + +struct MInfoRec : boost::statechart::event< MInfoRec > { + pg_shard_t from; + pg_info_t info; + epoch_t msg_epoch; + std::optional lease; + std::optional lease_ack; + MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch, + std::optional l = {}, + std::optional la = {}) + : from(from), info(info), msg_epoch(msg_epoch), + lease(l), lease_ack(la) {} + void print(std::ostream *out) const { + *out << "MInfoRec from " << from << " info: " << info; + if (lease) { + *out << " " << *lease; + } + if (lease_ack) { + *out << " " << *lease_ack; + } + } +}; + +struct MLogRec : boost::statechart::event< MLogRec > { + pg_shard_t from; + boost::intrusive_ptr msg; + MLogRec(pg_shard_t from, MOSDPGLog 
*msg); + void print(std::ostream *out) const; +}; + +struct MNotifyRec : boost::statechart::event< MNotifyRec > { + spg_t pgid; + pg_shard_t from; + pg_notify_t notify; + uint64_t features; + MNotifyRec(spg_t p, pg_shard_t from, const pg_notify_t ¬ify, uint64_t f) + : pgid(p), from(from), notify(notify), features(f) {} + void print(std::ostream *out) const { + *out << "MNotifyRec " << pgid << " from " << from << " notify: " << notify + << " features: 0x" << std::hex << features << std::dec; + } +}; + +struct MQuery : boost::statechart::event< MQuery > { + spg_t pgid; + pg_shard_t from; + pg_query_t query; + epoch_t query_epoch; + MQuery(spg_t p, pg_shard_t from, const pg_query_t &query, epoch_t query_epoch) + : pgid(p), from(from), query(query), query_epoch(query_epoch) {} + void print(std::ostream *out) const { + *out << "MQuery " << pgid << " from " << from + << " query_epoch " << query_epoch + << " query: " << query; + } +}; + +struct MTrim : boost::statechart::event { + epoch_t epoch; + int from; + shard_id_t shard; + eversion_t trim_to; + MTrim(epoch_t epoch, int from, shard_id_t shard, eversion_t trim_to) + : epoch(epoch), from(from), shard(shard), trim_to(trim_to) {} + void print(std::ostream *out) const { + *out << "MTrim epoch " << epoch << " from " << from << " shard " << shard + << " trim_to " << trim_to; + } +}; + +struct MLease : boost::statechart::event { + epoch_t epoch; + int from; + pg_lease_t lease; + MLease(epoch_t epoch, int from, pg_lease_t l) + : epoch(epoch), from(from), lease(l) {} + void print(std::ostream *out) const { + *out << "MLease epoch " << epoch << " from osd." << from << " " << lease; + } +}; + +struct MLeaseAck : boost::statechart::event { + epoch_t epoch; + int from; + pg_lease_ack_t lease_ack; + MLeaseAck(epoch_t epoch, int from, pg_lease_ack_t l) + : epoch(epoch), from(from), lease_ack(l) {} + void print(std::ostream *out) const { + *out << "MLeaseAck epoch " << epoch << " from osd." 
<< from + << " " << lease_ack; + } +}; + +struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > { + unsigned priority; + int64_t primary_num_bytes; + int64_t local_num_bytes; + explicit RequestBackfillPrio(unsigned prio, int64_t pbytes, int64_t lbytes) : + boost::statechart::event< RequestBackfillPrio >(), + priority(prio), primary_num_bytes(pbytes), local_num_bytes(lbytes) {} + void print(std::ostream *out) const { + *out << "RequestBackfillPrio: priority " << priority + << " primary bytes " << primary_num_bytes + << " local bytes " << local_num_bytes; + } +}; + +struct RequestRecoveryPrio : boost::statechart::event< RequestRecoveryPrio > { + unsigned priority; + explicit RequestRecoveryPrio(unsigned prio) : + boost::statechart::event< RequestRecoveryPrio >(), + priority(prio) {} + void print(std::ostream *out) const { + *out << "RequestRecoveryPrio: priority " << priority; + } +}; + +#define TrivialEvent(T) struct T : boost::statechart::event< T > { \ + T() : boost::statechart::event< T >() {} \ + void print(std::ostream *out) const { \ + *out << #T; \ + } \ + }; + +TrivialEvent(NullEvt) +TrivialEvent(RemoteBackfillReserved) +TrivialEvent(RemoteReservationRejectedTooFull) +TrivialEvent(RemoteReservationRevokedTooFull) +TrivialEvent(RemoteReservationRevoked) +TrivialEvent(RemoteReservationCanceled) +TrivialEvent(RemoteRecoveryReserved) +TrivialEvent(RecoveryDone) + +struct DeferRecovery : boost::statechart::event { + float delay; + explicit DeferRecovery(float delay) : delay(delay) {} + void print(std::ostream *out) const { + *out << "DeferRecovery: delay " << delay; + } +}; + +struct DeferBackfill : boost::statechart::event { + float delay; + explicit DeferBackfill(float delay) : delay(delay) {} + void print(std::ostream *out) const { + *out << "DeferBackfill: delay " << delay; + } +}; + +TrivialEvent(RenewLease) diff --git a/src/osd/PGStateUtils.cc b/src/osd/PGStateUtils.cc new file mode 100644 index 000000000..5dbe78eb7 --- /dev/null +++ b/src/osd/PGStateUtils.cc @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PGStateUtils.h" +#include "common/Clock.h" + +using ceph::Formatter; + +/*------NamedState----*/ +NamedState::NamedState(PGStateHistory *pgsh, const char *state_name) + : pgsh(pgsh), state_name(state_name), enter_time(ceph_clock_now()) { + if(pgsh) { + pgsh->enter(enter_time, state_name); + } +} + +NamedState::~NamedState() { + if(pgsh) { + pgsh->exit(state_name); + } +} + +/*---------PGStateHistory---------*/ +void PGStateHistory::enter(const utime_t entime, const char* state) +{ + if (pi == nullptr) { + pi = std::make_unique(); + } + pi->enter_state(entime, state); +} + +void PGStateHistory::exit(const char* state) { + pi->setepoch(es.get_osdmap_epoch()); + pi->exit_state(ceph_clock_now()); + if (pi->empty()) { + reset(); + } +} + +void PGStateHistory::dump(Formatter* f) const { + f->open_array_section("history"); + for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) { + f->open_object_section("epochs"); + f->dump_stream("epoch") << (*pi)->this_epoch; + f->open_array_section("states"); + for (auto she : (*pi)->state_history) { + f->open_object_section("state"); + f->dump_string("state", std::get<2>(she)); + f->dump_stream("enter") << std::get<0>(she); + f->dump_stream("exit") << std::get<1>(she); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); +} diff --git a/src/osd/PGStateUtils.h b/src/osd/PGStateUtils.h new file mode 
100644 index 000000000..952464641 --- /dev/null +++ b/src/osd/PGStateUtils.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/utime.h" +#include "common/Formatter.h" + +#include +#include +#include + +class PGStateHistory; + +struct EpochSource { + virtual epoch_t get_osdmap_epoch() const = 0; + virtual ~EpochSource() {} +}; + +struct NamedState { + PGStateHistory *pgsh; + const char *state_name; + utime_t enter_time; + const char *get_state_name() { return state_name; } + NamedState( + PGStateHistory *pgsh, + const char *state_name_); + virtual ~NamedState(); +}; + +using state_history_entry = std::tuple; +using embedded_state = std::pair; + +struct PGStateInstance { + // Time spent in pg states + + void setepoch(const epoch_t current_epoch) { + this_epoch = current_epoch; + } + + void enter_state(const utime_t entime, const char* state) { + embedded_states.push(std::make_pair(entime, state)); + } + + void exit_state(const utime_t extime) { + embedded_state this_state = embedded_states.top(); + state_history.push_back(state_history_entry{ + this_state.first, extime, this_state.second}); + embedded_states.pop(); + } + + bool empty() const { + return embedded_states.empty(); + } + + epoch_t this_epoch; + std::vector state_history; + std::stack embedded_states; +}; + +class PGStateHistory { +public: + PGStateHistory(const EpochSource &es) : buffer(10), es(es) {} + + void enter(const utime_t entime, const char* state); + + void exit(const char* state); + + void reset() { + buffer.push_back(std::move(pi)); + pi = nullptr; + } + + void dump(ceph::Formatter* f) const; + + const char *get_current_state() const { + if (pi == nullptr) return "unknown"; + return std::get<1>(pi->embedded_states.top()); + } + +private: + std::unique_ptr pi; + boost::circular_buffer> buffer; + const EpochSource &es; +}; diff --git a/src/osd/PGTransaction.h b/src/osd/PGTransaction.h new file mode 100644 index 000000000..3b5b9e72c --- /dev/null +++ b/src/osd/PGTransaction.h @@ -0,0 +1,601 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef PGTRANSACTION_H +#define PGTRANSACTION_H + +#include +#include +#include + +#include "common/hobject.h" +#include "osd/osd_types.h" +#include "osd/osd_internal_types.h" +#include "common/interval_map.h" +#include "common/inline_variant.h" + +/** + * This class represents transactions which can be submitted to + * a PGBackend. For expediency, there are some constraints on + * the operations submitted: + * 1) Rename sources may only be referenced prior to the rename + * operation to the destination. + * 2) The graph formed by edges of source->destination for clones + * (Create) and Renames must be acyclic. 
+ * 3) clone_range sources must not be modified by the same + * transaction + */ +class PGTransaction { +public: + std::map obc_map; + + class ObjectOperation { + public: + struct Init + { + struct None {}; + struct Create {}; + struct Clone { + hobject_t source; + }; + struct Rename { + hobject_t source; // must be temp object + }; + }; + using InitType = boost::variant< + Init::None, + Init::Create, + Init::Clone, + Init::Rename>; + + InitType init_type = Init::None(); + bool delete_first = false; + + /** + * is_none() && is_delete() indicates that we are deleting an + * object which already exists and not recreating it. delete_first means + * that the transaction logically removes the object. + + * There are really 4 cases: + + * 1) We are modifying an existing object (is_none() && + * !is_delete()) + * a) If it's an append, we just write into the log entry the old size + * b) If it's an actual overwrite, we save the old versions of the + * extents being overwritten and write those offsets into the log + * entry + * 2) We are removing and then recreating an object (!is_none() && is_delete()) + * -- stash + * 3) We are removing an object (is_none() && is_delete()) -- stash + * 4) We are creating an object (!is_none() && !is_delete()) -- create (no + * stash) + * + * Create, Clone, Rename are the three ways we can recreate it. + * ECBackend transaction planning needs this context + * to figure out how to perform the transaction. + */ + bool deletes_first() const { + return delete_first; + } + bool is_delete() const { + return boost::get(&init_type) != nullptr && delete_first; + } + bool is_none() const { + return boost::get(&init_type) != nullptr && !delete_first; + } + bool is_fresh_object() const { + return boost::get(&init_type) == nullptr; + } + bool is_rename() const { + return boost::get(&init_type) != nullptr; + } + bool has_source(hobject_t *source = nullptr) const { + return match( + init_type, + [&](const Init::Clone &op) -> bool { + if (source) + *source = op.source; + return true; + }, + [&](const Init::Rename &op) -> bool { + if (source) + *source = op.source; + return true; + }, + [&](const Init::None &) -> bool { return false; }, + [&](const Init::Create &) -> bool { return false; }); + } + + bool clear_omap = false; + + /** + * truncate + * ? + * + * truncate is represented as a pair because in the event of + * multiple truncates within a single transaction we need to + * remember the lowest truncate and the final object size + * (the last truncate). 
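For example, truncating to 4096 and then to 100 within a single transaction + * leaves truncate = {100, 100}, while the reverse order leaves {100, 4096} + * (see the truncate() helper below). 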
We also adjust the buffers map + * to account for truncates overriding previous writes */ + std::optional > truncate = std::nullopt; + + std::map > attr_updates; + + enum class OmapUpdateType {Remove, Insert, RemoveRange}; + std::vector > omap_updates; + + std::optional omap_header; + + /// (old, new) -- only valid with no truncate or buffer updates + std::optional, std::set>> updated_snaps; + + struct alloc_hint_t { + uint64_t expected_object_size; + uint64_t expected_write_size; + uint32_t flags; + }; + std::optional alloc_hint; + + struct BufferUpdate { + struct Write { + ceph::buffer::list buffer; + uint32_t fadvise_flags; + }; + struct Zero { + uint64_t len; + }; + struct CloneRange { + hobject_t from; + uint64_t offset; + uint64_t len; + }; + }; + using BufferUpdateType = boost::variant< + BufferUpdate::Write, + BufferUpdate::Zero, + BufferUpdate::CloneRange>; + + private: + struct SplitMerger { + BufferUpdateType split( + uint64_t offset, + uint64_t len, + const BufferUpdateType &bu) const { + return match( + bu, + [&](const BufferUpdate::Write &w) -> BufferUpdateType { + ceph::buffer::list bl; + bl.substr_of(w.buffer, offset, len); + return BufferUpdate::Write{bl, w.fadvise_flags}; + }, + [&](const BufferUpdate::Zero &) -> BufferUpdateType { + return BufferUpdate::Zero{len}; + }, + [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType { + return BufferUpdate::CloneRange{c.from, c.offset + offset, len}; + }); + } + uint64_t length( + const BufferUpdateType &left) const { + return match( + left, + [&](const BufferUpdate::Write &w) -> uint64_t { + return w.buffer.length(); + }, + [&](const BufferUpdate::Zero &z) -> uint64_t { + return z.len; + }, + [&](const BufferUpdate::CloneRange &c) -> uint64_t { + return c.len; + }); + } + bool can_merge( + const BufferUpdateType &left, + const BufferUpdateType &right) const { + return match( + left, + [&](const BufferUpdate::Write &w) -> bool { + auto r = boost::get(&right); + return r != nullptr && (w.fadvise_flags == r->fadvise_flags); + }, + [&](const BufferUpdate::Zero &) -> bool { + auto r = boost::get(&right); + return r != nullptr; + }, + [&](const BufferUpdate::CloneRange &c) -> bool { + return false; + }); + } + BufferUpdateType merge( + BufferUpdateType &&left, + BufferUpdateType &&right) const { + return match( + left, + [&](const BufferUpdate::Write &w) -> BufferUpdateType { + auto r = boost::get(&right); + ceph_assert(r && w.fadvise_flags == r->fadvise_flags); + ceph::buffer::list bl = w.buffer; + bl.append(r->buffer); + return BufferUpdate::Write{bl, w.fadvise_flags}; + }, + [&](const BufferUpdate::Zero &z) -> BufferUpdateType { + auto r = boost::get(&right); + ceph_assert(r); + return BufferUpdate::Zero{z.len + r->len}; + }, + [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType { + ceph_abort_msg("violates can_merge condition"); + return left; + }); + } + }; + public: + using buffer_update_type = interval_map< + uint64_t, BufferUpdateType, SplitMerger>; + buffer_update_type buffer_updates; + + friend class PGTransaction; + }; + std::map op_map; +private: + ObjectOperation &get_object_op_for_modify(const hobject_t &hoid) { + auto &op = op_map[hoid]; + ceph_assert(!op.is_delete()); + return op; + } + ObjectOperation &get_object_op(const hobject_t &hoid) { + return op_map[hoid]; + } +public: + void add_obc( + ObjectContextRef obc) { + ceph_assert(obc); + obc_map[obc->obs.oi.soid] = obc; + } + /// Sets up state for new object + void create( + const hobject_t &hoid + ) { + auto &op = op_map[hoid]; + ceph_assert(op.is_none() 
|| op.is_delete()); + op.init_type = ObjectOperation::Init::Create(); + } + + /// Sets up state for target cloned from source + void clone( + const hobject_t &target, ///< [in] obj to clone to + const hobject_t &source ///< [in] obj to clone from + ) { + auto &op = op_map[target]; + ceph_assert(op.is_none() || op.is_delete()); + op.init_type = ObjectOperation::Init::Clone{source}; + } + + /// Sets up state for target renamed from source + void rename( + const hobject_t &target, ///< [in] to, must not exist, be non-temp + const hobject_t &source ///< [in] source (must be a temp object) + ) { + ceph_assert(source.is_temp()); + ceph_assert(!target.is_temp()); + auto &op = op_map[target]; + ceph_assert(op.is_none() || op.is_delete()); + + bool del_first = op.is_delete(); + auto iter = op_map.find(source); + if (iter != op_map.end()) { + op = iter->second; + op_map.erase(iter); + op.delete_first = del_first; + } + + op.init_type = ObjectOperation::Init::Rename{source}; + } + + /// Remove -- must not be called on rename target + void remove( + const hobject_t &hoid ///< [in] obj to remove + ) { + auto &op = get_object_op_for_modify(hoid); + if (!op.is_fresh_object()) { + ceph_assert(!op.updated_snaps); + op = ObjectOperation(); + op.delete_first = true; + } else { + ceph_assert(!op.is_rename()); + op_map.erase(hoid); // make it a noop if it's a fresh object + } + } + + void update_snaps( + const hobject_t &hoid, ///< [in] object for snaps + const std::set &old_snaps,///< [in] old snaps value + const std::set &new_snaps ///< [in] new snaps value + ) { + auto &op = get_object_op(hoid); + ceph_assert(!op.updated_snaps); + ceph_assert(op.buffer_updates.empty()); + ceph_assert(!op.truncate); + op.updated_snaps = make_pair( + old_snaps, + new_snaps); + } + + /// Clears, truncates + void omap_clear( + const hobject_t &hoid ///< [in] object to clear omap + ) { + auto &op = get_object_op_for_modify(hoid); + op.clear_omap = true; + op.omap_updates.clear(); + op.omap_header = std::nullopt; + } + void truncate( + const hobject_t &hoid, ///< [in] object + uint64_t off ///< [in] offset to truncate to + ) { + auto &op = get_object_op_for_modify(hoid); + ceph_assert(!op.updated_snaps); + op.buffer_updates.erase( + off, + std::numeric_limits::max() - off); + if (!op.truncate || off < op.truncate->first) { + op.truncate = std::pair(off, off); + } else { + op.truncate->second = off; + } + } + + /// Attr ops + void setattrs( + const hobject_t &hoid, ///< [in] object to write + std::map &attrs ///< [in] attrs, may be cleared + ) { + auto &op = get_object_op_for_modify(hoid); + for (auto &&i: attrs) { + auto& d = op.attr_updates[i.first]; + d = i.second; + d->rebuild(); + } + } + void setattr( + const hobject_t &hoid, ///< [in] object to write + const std::string &attrname, ///< [in] attr to write + ceph::buffer::list &bl ///< [in] val to write, may be claimed + ) { + auto &op = get_object_op_for_modify(hoid); + auto& d = op.attr_updates[attrname]; + d = bl; + d->rebuild(); + } + void rmattr( + const hobject_t &hoid, ///< [in] object to write + const std::string &attrname ///< [in] attr to remove + ) { + auto &op = get_object_op_for_modify(hoid); + op.attr_updates[attrname] = std::nullopt; + } + + /// set alloc hint + void set_alloc_hint( + const hobject_t &hoid, ///< [in] object (must exist) + uint64_t expected_object_size, ///< [in] + uint64_t expected_write_size, + uint32_t flags + ) { + auto &op = get_object_op_for_modify(hoid); + op.alloc_hint = ObjectOperation::alloc_hint_t{ + expected_object_size, 
expected_write_size, flags}; + } + + /// Buffer updates + void write( + const hobject_t &hoid, ///< [in] object to write + uint64_t off, ///< [in] off at which to write + uint64_t len, ///< [in] len to write from bl + ceph::buffer::list &bl, ///< [in] bl to write will be claimed to len + uint32_t fadvise_flags = 0 ///< [in] fadvise hint + ) { + auto &op = get_object_op_for_modify(hoid); + ceph_assert(!op.updated_snaps); + ceph_assert(len > 0); + ceph_assert(len == bl.length()); + op.buffer_updates.insert( + off, + len, + ObjectOperation::BufferUpdate::Write{bl, fadvise_flags}); + } + void clone_range( + const hobject_t &from, ///< [in] from + const hobject_t &to, ///< [in] to + uint64_t fromoff, ///< [in] offset + uint64_t len, ///< [in] len + uint64_t tooff ///< [in] offset + ) { + auto &op = get_object_op_for_modify(to); + ceph_assert(!op.updated_snaps); + op.buffer_updates.insert( + tooff, + len, + ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len}); + } + void zero( + const hobject_t &hoid, ///< [in] object + uint64_t off, ///< [in] offset to start zeroing at + uint64_t len ///< [in] amount to zero + ) { + auto &op = get_object_op_for_modify(hoid); + ceph_assert(!op.updated_snaps); + op.buffer_updates.insert( + off, + len, + ObjectOperation::BufferUpdate::Zero{len}); + } + + /// Omap updates + void omap_setkeys( + const hobject_t &hoid, ///< [in] object to write + ceph::buffer::list &keys_bl ///< [in] encoded map + ) { + auto &op = get_object_op_for_modify(hoid); + op.omap_updates.emplace_back( + std::make_pair( + ObjectOperation::OmapUpdateType::Insert, + keys_bl)); + } + void omap_setkeys( + const hobject_t &hoid, ///< [in] object to write + std::map &keys ///< [in] omap keys, may be cleared + ) { + using ceph::encode; + ceph::buffer::list bl; + encode(keys, bl); + omap_setkeys(hoid, bl); + } + void omap_rmkeys( + const hobject_t &hoid, ///< [in] object to write + ceph::buffer::list &keys_bl ///< [in] encode set + ) { + auto &op = get_object_op_for_modify(hoid); + op.omap_updates.emplace_back( + std::make_pair( + ObjectOperation::OmapUpdateType::Remove, + keys_bl)); + } + void omap_rmkeys( + const hobject_t &hoid, ///< [in] object to write + std::set &keys ///< [in] omap keys, may be cleared + ) { + using ceph::encode; + ceph::buffer::list bl; + encode(keys, bl); + omap_rmkeys(hoid, bl); + } + void omap_rmkeyrange( + const hobject_t &hoid, ///< [in] object to write + ceph::buffer::list &range_bl ///< [in] encode string[2] + ) { + auto &op = get_object_op_for_modify(hoid); + op.omap_updates.emplace_back( + std::make_pair( + ObjectOperation::OmapUpdateType::RemoveRange, + range_bl)); + } + void omap_rmkeyrange( + const hobject_t &hoid, ///< [in] object to write + std::string& key_begin, ///< [in] first key in range + std::string& key_end ///< [in] first key past range, range is [first,last) + ) { + ceph::buffer::list bl; + ::encode(key_begin, bl); + ::encode(key_end, bl); + omap_rmkeyrange(hoid, bl); + } + void omap_setheader( + const hobject_t &hoid, ///< [in] object to write + ceph::buffer::list &header ///< [in] header + ) { + auto &op = get_object_op_for_modify(hoid); + op.omap_header = header; + } + + bool empty() const { + return op_map.empty(); + } + + uint64_t get_bytes_written() const { + uint64_t ret = 0; + for (auto &&i: op_map) { + for (auto &&j: i.second.buffer_updates) { + ret += j.get_len(); + } + } + return ret; + } + + void nop( + const hobject_t &hoid ///< [in] obj to which we are doing nothing + ) { + get_object_op_for_modify(hoid); + } + + /* Calls t() on 
all pair & such that clone/rename + * sinks are always called before clone sources + * + * TODO: add a fast path for the single object case and possibly the single + * object clone from source case (make_writeable made a clone). + * + * This structure only requires that the source->sink graph be acyclic. + * This is much more general than is actually required by PrimaryLogPG. + * Only 4 flavors of multi-object transactions actually happen: + * 1) rename temp -> object for copyfrom + * 2) clone head -> clone, modify head for make_writeable on normal head write + * 3) clone clone -> head for rollback + * 4) 2 + 3 + * + * We can bypass the below logic for single object transactions trivially + * (including case 1 above since temp doesn't show up again). + * For 2-3, we could add something ad-hoc to ensure that they happen in the + * right order, but it actually seems easier to just do the graph construction. + */ + template + void safe_create_traverse(T &&t) { + std::map> dgraph; + std::list stack; + + // Populate stack with roots, dgraph with edges + for (auto &&opair: op_map) { + hobject_t source; + if (opair.second.has_source(&source)) { + auto &l = dgraph[source]; + if (l.empty() && !op_map.count(source)) { + /* Source oids not in op_map need to be added as roots + * (but only once!) */ + stack.push_back(source); + } + l.push_back(opair.first); + } else { + stack.push_back(opair.first); + } + } + + /* Why don't we need to worry about accessing the same node + * twice? dgraph nodes always have in-degree at most 1 because + * the inverse graph nodes (source->dest) can have out-degree + * at most 1 (only one possible source). We do a post-order + * depth-first traversal here to ensure we call f on children + * before parents. + */ + while (!stack.empty()) { + hobject_t &cur = stack.front(); + auto diter = dgraph.find(cur); + if (diter == dgraph.end()) { + /* Leaf: pop and call t() */ + auto opiter = op_map.find(cur); + if (opiter != op_map.end()) + t(*opiter); + stack.pop_front(); + } else { + /* Internal node: push children onto stack, remove edge, + * recurse. 
When this node is encountered again, it'll + * be a leaf */ + ceph_assert(!diter->second.empty()); + stack.splice(stack.begin(), diter->second); + dgraph.erase(diter); + } + } + } +}; +using PGTransactionUPtr = std::unique_ptr; + +#endif diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc new file mode 100644 index 000000000..9709f3ce1 --- /dev/null +++ b/src/osd/PeeringState.cc @@ -0,0 +1,7607 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PGPeeringEvent.h" +#include "common/ceph_releases.h" +#include "common/dout.h" +#include "PeeringState.h" + +#include "messages/MOSDPGRemove.h" +#include "messages/MBackfillReserve.h" +#include "messages/MRecoveryReserve.h" +#include "messages/MOSDScrubReserve.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGInfo2.h" +#include "messages/MOSDPGTrim.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGNotify2.h" +#include "messages/MOSDPGQuery.h" +#include "messages/MOSDPGQuery2.h" +#include "messages/MOSDPGLease.h" +#include "messages/MOSDPGLeaseAck.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd + +using std::dec; +using std::hex; +using std::make_pair; +using std::map; +using std::ostream; +using std::pair; +using std::set; +using std::stringstream; +using std::vector; + +using ceph::Formatter; +using ceph::make_message; + +BufferedRecoveryMessages::BufferedRecoveryMessages( + ceph_release_t r, + PeeringCtx &ctx) + : require_osd_release(r) { + // steal messages from ctx + message_map.swap(ctx.message_map); +} + +void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n) +{ + if (require_osd_release >= ceph_release_t::octopus) { + spg_t pgid(n.info.pgid.pgid, n.to); + send_osd_message(to, make_message(pgid, n)); + } else { + send_osd_message(to, make_message(n.epoch_sent, vector{n})); + } +} + +void BufferedRecoveryMessages::send_query( + int to, + spg_t to_spgid, + const pg_query_t &q) +{ + if (require_osd_release >= ceph_release_t::octopus) { + send_osd_message(to, + make_message(to_spgid, q)); + } else { + auto m = make_message( + q.epoch_sent, + MOSDPGQuery::pg_list_t{{to_spgid, q}}); + send_osd_message(to, m); + } +} + +void BufferedRecoveryMessages::send_info( + int to, + spg_t to_spgid, + epoch_t min_epoch, + epoch_t cur_epoch, + const pg_info_t &info, + std::optional lease, + std::optional lease_ack) +{ + if (require_osd_release >= ceph_release_t::octopus) { + send_osd_message( + to, + make_message( + to_spgid, + info, + cur_epoch, + min_epoch, + lease, + lease_ack) + ); + } else { + send_osd_message( + to, + make_message( + cur_epoch, + vector{pg_notify_t{to_spgid.shard, + info.pgid.shard, + min_epoch, cur_epoch, + info, PastIntervals{}}}) + ); + } +} + +void PGPool::update(OSDMapRef map) +{ + const pg_pool_t *pi = map->get_pg_pool(id); + if (!pi) { + return; // pool has been deleted + } + info = *pi; + name = map->get_pool_name(id); + + bool updated = false; + if ((map->get_epoch() != cached_epoch + 1) || + (pi->get_snap_epoch() == map->get_epoch())) { + updated = true; + } + + if (info.is_pool_snaps_mode() && updated) { + snapc = pi->get_snap_context(); + } + cached_epoch = map->get_epoch(); +} + +/*-------------Peering State Helpers----------------*/ +#undef dout_prefix +#define dout_prefix (dpp->gen_prefix(*_dout)) +#undef psdout +#define psdout(x) ldout(cct, x) + +PeeringState::PeeringState( + CephContext *cct, + pg_shard_t pg_whoami, + spg_t spgid, + const PGPool 
&_pool, + OSDMapRef curmap, + DoutPrefixProvider *dpp, + PeeringListener *pl) + : state_history(*pl), + cct(cct), + spgid(spgid), + dpp(dpp), + pl(pl), + orig_ctx(0), + osdmap_ref(curmap), + pool(_pool), + pg_whoami(pg_whoami), + info(spgid), + pg_log(cct), + missing_loc(spgid, this, dpp, cct), + machine(this, cct, spgid, dpp, pl, &state_history) +{ + machine.initiate(); +} + +void PeeringState::start_handle(PeeringCtx *new_ctx) { + ceph_assert(!rctx); + ceph_assert(!orig_ctx); + orig_ctx = new_ctx; + if (new_ctx) { + if (messages_pending_flush) { + rctx.emplace(*messages_pending_flush, *new_ctx); + } else { + rctx.emplace(*new_ctx); + } + rctx->start_time = ceph_clock_now(); + } +} + +void PeeringState::begin_block_outgoing() { + ceph_assert(!messages_pending_flush); + ceph_assert(orig_ctx); + ceph_assert(rctx); + messages_pending_flush = BufferedRecoveryMessages( + orig_ctx->require_osd_release); + rctx.emplace(*messages_pending_flush, *orig_ctx); +} + +void PeeringState::clear_blocked_outgoing() { + ceph_assert(orig_ctx); + ceph_assert(rctx); + messages_pending_flush = std::optional(); +} + +void PeeringState::end_block_outgoing() { + ceph_assert(messages_pending_flush); + ceph_assert(orig_ctx); + ceph_assert(rctx); + + orig_ctx->accept_buffered_messages(*messages_pending_flush); + rctx.emplace(*orig_ctx); + messages_pending_flush = std::optional(); +} + +void PeeringState::end_handle() { + if (rctx) { + utime_t dur = ceph_clock_now() - rctx->start_time; + machine.event_time += dur; + } + + machine.event_count++; + rctx = std::nullopt; + orig_ctx = NULL; +} + +void PeeringState::check_recovery_sources(const OSDMapRef& osdmap) +{ + /* + * check that any peers we are planning to (or currently) pulling + * objects from are dealt with. + */ + missing_loc.check_recovery_sources(osdmap); + pl->check_recovery_sources(osdmap); + + for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) { + if (!osdmap->is_up(i->osd)) { + psdout(10) << "peer_log_requested removing " << *i << dendl; + peer_log_requested.erase(i++); + } else { + ++i; + } + } + + for (auto i = peer_missing_requested.begin(); + i != peer_missing_requested.end();) { + if (!osdmap->is_up(i->osd)) { + psdout(10) << "peer_missing_requested removing " << *i << dendl; + peer_missing_requested.erase(i++); + } else { + ++i; + } + } +} + +void PeeringState::update_history(const pg_history_t& new_history) +{ + auto mnow = pl->get_mnow(); + info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub); + if (info.history.merge(new_history)) { + psdout(20) << __func__ << " advanced history from " << new_history << dendl; + dirty_info = true; + if (info.history.last_epoch_clean >= info.history.same_interval_since) { + psdout(20) << __func__ << " clearing past_intervals" << dendl; + past_intervals.clear(); + dirty_big_info = true; + } + prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow); + if (prior_readable_until_ub != ceph::signedspan::zero()) { + dout(20) << __func__ + << " prior_readable_until_ub " << prior_readable_until_ub + << " (mnow " << mnow << " + " + << info.history.prior_readable_until_ub << ")" << dendl; + } + } + pl->on_info_history_change(); +} + +hobject_t PeeringState::earliest_backfill() const +{ + hobject_t e = hobject_t::get_max(); + for (const pg_shard_t& bt : get_backfill_targets()) { + const pg_info_t &pi = get_peer_info(bt); + e = std::min(pi.last_backfill, e); + } + return e; +} + +void PeeringState::purge_strays() +{ + if (is_premerge()) { + psdout(10) << 
"purge_strays " << stray_set << " but premerge, doing nothing" + << dendl; + return; + } + if (cct->_conf.get_val("osd_debug_no_purge_strays")) { + return; + } + psdout(10) << "purge_strays " << stray_set << dendl; + + bool removed = false; + for (auto p = stray_set.begin(); p != stray_set.end(); ++p) { + ceph_assert(!is_acting_recovery_backfill(*p)); + if (get_osdmap()->is_up(p->osd)) { + psdout(10) << "sending PGRemove to osd." << *p << dendl; + vector to_remove; + to_remove.push_back(spg_t(info.pgid.pgid, p->shard)); + auto m = make_message( + get_osdmap_epoch(), + to_remove); + pl->send_cluster_message(p->osd, m, get_osdmap_epoch()); + } else { + psdout(10) << "not sending PGRemove to down osd." << *p << dendl; + } + peer_missing.erase(*p); + peer_info.erase(*p); + missing_loc.remove_stray_recovery_sources(*p); + peer_purged.insert(*p); + removed = true; + } + + // if we removed anyone, update peers (which include peer_info) + if (removed) + update_heartbeat_peers(); + + stray_set.clear(); + + // clear _requested maps; we may have to peer() again if we discover + // (more) stray content + peer_log_requested.clear(); + peer_missing_requested.clear(); +} + +void PeeringState::query_unfound(Formatter *f, string state) +{ + psdout(20) << "Enter PeeringState common QueryUnfound" << dendl; + { + f->dump_string("state", state); + f->dump_bool("available_might_have_unfound", true); + f->open_array_section("might_have_unfound"); + for (auto p = might_have_unfound.begin(); + p != might_have_unfound.end(); + ++p) { + if (peer_missing.count(*p)) { + ; // Ignore already probed OSDs + } else { + f->open_object_section("osd"); + f->dump_stream("osd") << *p; + if (peer_missing_requested.count(*p)) { + f->dump_string("status", "querying"); + } else if (!get_osdmap()->is_up(p->osd)) { + f->dump_string("status", "osd is down"); + } else { + f->dump_string("status", "not queried"); + } + f->close_section(); + } + } + f->close_section(); + } + psdout(20) << "Exit PeeringState common QueryUnfound" << dendl; + return; +} + +bool PeeringState::proc_replica_info( + pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch) +{ + auto p = peer_info.find(from); + if (p != peer_info.end() && p->second.last_update == oinfo.last_update) { + psdout(10) << " got dup osd." << from << " info " + << oinfo << ", identical to ours" << dendl; + return false; + } + + if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) { + psdout(10) << " got info " << oinfo << " from down osd." << from + << " discarding" << dendl; + return false; + } + + psdout(10) << " got osd." << from << " " << oinfo << dendl; + ceph_assert(is_primary()); + peer_info[from] = oinfo; + might_have_unfound.insert(from); + + update_history(oinfo.history); + + // stray? + if (!is_up(from) && !is_acting(from)) { + psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl; + stray_set.insert(from); + if (is_clean()) { + purge_strays(); + } + } + + // was this a new info? if so, update peers! + if (p == peer_info.end()) + update_heartbeat_peers(); + + return true; +} + + +void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap) +{ + // Remove any downed osds from peer_info + bool removed = false; + auto p = peer_info.begin(); + while (p != peer_info.end()) { + if (!osdmap->is_up(p->first.osd)) { + psdout(10) << " dropping down osd." 
<< p->first << " info " << p->second << dendl; + peer_missing.erase(p->first); + peer_log_requested.erase(p->first); + peer_missing_requested.erase(p->first); + peer_info.erase(p++); + removed = true; + } else + ++p; + } + + // Remove any downed osds from peer_purged so we can re-purge if necessary + auto it = peer_purged.begin(); + while (it != peer_purged.end()) { + if (!osdmap->is_up(it->osd)) { + psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl; + peer_purged.erase(it++); + } else { + ++it; + } + } + + // if we removed anyone, update peers (which include peer_info) + if (removed) + update_heartbeat_peers(); + + check_recovery_sources(osdmap); +} + +void PeeringState::update_heartbeat_peers() +{ + if (!is_primary()) + return; + + set new_peers; + for (unsigned i=0; ifirst.osd); + } + pl->update_heartbeat_peers(std::move(new_peers)); +} + +void PeeringState::write_if_dirty(ObjectStore::Transaction& t) +{ + pl->prepare_write( + info, + last_written_info, + past_intervals, + pg_log, + dirty_info, + dirty_big_info, + last_persisted_osdmap < get_osdmap_epoch(), + t); + if (dirty_info || dirty_big_info) { + last_persisted_osdmap = get_osdmap_epoch(); + last_written_info = info; + dirty_info = false; + dirty_big_info = false; + } +} + +void PeeringState::advance_map( + OSDMapRef osdmap, OSDMapRef lastmap, + vector& newup, int up_primary, + vector& newacting, int acting_primary, + PeeringCtx &rctx) +{ + ceph_assert(lastmap == osdmap_ref); + psdout(10) << "handle_advance_map " + << newup << "/" << newacting + << " -- " << up_primary << "/" << acting_primary + << dendl; + + update_osdmap_ref(osdmap); + pool.update(osdmap); + + AdvMap evt( + osdmap, lastmap, newup, up_primary, + newacting, acting_primary); + handle_event(evt, &rctx); + if (pool.info.last_change == osdmap_ref->get_epoch()) { + pl->on_pool_change(); + } + readable_interval = pool.get_readable_interval(cct->_conf); + last_require_osd_release = osdmap->require_osd_release; +} + +void PeeringState::activate_map(PeeringCtx &rctx) +{ + psdout(10) << __func__ << dendl; + ActMap evt; + handle_event(evt, &rctx); + if (osdmap_ref->get_epoch() - last_persisted_osdmap > + cct->_conf->osd_pg_epoch_persisted_max_stale) { + psdout(20) << __func__ << ": Dirtying info: last_persisted is " + << last_persisted_osdmap + << " while current is " << osdmap_ref->get_epoch() << dendl; + dirty_info = true; + } else { + psdout(20) << __func__ << ": Not dirtying info: last_persisted is " + << last_persisted_osdmap + << " while current is " << osdmap_ref->get_epoch() << dendl; + } + write_if_dirty(rctx.transaction); + + if (get_osdmap()->check_new_blocklist_entries()) { + pl->check_blocklisted_watchers(); + } +} + +void PeeringState::set_last_peering_reset() +{ + psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl; + if (last_peering_reset != get_osdmap_epoch()) { + last_peering_reset = get_osdmap_epoch(); + psdout(10) << "Clearing blocked outgoing recovery messages" << dendl; + clear_blocked_outgoing(); + if (!pl->try_flush_or_schedule_async()) { + psdout(10) << "Beginning to block outgoing recovery messages" << dendl; + begin_block_outgoing(); + } else { + psdout(10) << "Not blocking outgoing recovery messages" << dendl; + } + } +} + +void PeeringState::complete_flush() +{ + flushes_in_progress--; + if (flushes_in_progress == 0) { + pl->on_flushed(); + } +} + +void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap) +{ + const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool()); + if (!pi) { 
+ return; // pool deleted + } + bool changed = false; + if (pi->has_flag(pg_pool_t::FLAG_FULL)) { + const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool()); + if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) { + psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl; + changed = true; + } + } + if (changed) { + info.history.last_epoch_marked_full = osdmap->get_epoch(); + dirty_info = true; + } +} + +bool PeeringState::should_restart_peering( + int newupprimary, + int newactingprimary, + const vector& newup, + const vector& newacting, + OSDMapRef lastmap, + OSDMapRef osdmap) +{ + if (PastIntervals::is_new_interval( + primary.osd, + newactingprimary, + acting, + newacting, + up_primary.osd, + newupprimary, + up, + newup, + osdmap.get(), + lastmap.get(), + info.pgid.pgid)) { + psdout(20) << "new interval newup " << newup + << " newacting " << newacting << dendl; + return true; + } + if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) { + psdout(10) << __func__ << " osd transitioned from down -> up" + << dendl; + return true; + } + return false; +} + +/* Called before initializing peering during advance_map */ +void PeeringState::start_peering_interval( + const OSDMapRef lastmap, + const vector& newup, int new_up_primary, + const vector& newacting, int new_acting_primary, + ObjectStore::Transaction &t) +{ + const OSDMapRef osdmap = get_osdmap(); + + set_last_peering_reset(); + + vector oldacting, oldup; + int oldrole = get_role(); + + if (is_primary()) { + pl->clear_ready_to_merge(); + } + + + pg_shard_t old_acting_primary = get_primary(); + pg_shard_t old_up_primary = up_primary; + bool was_old_primary = is_primary(); + bool was_old_nonprimary = is_nonprimary(); + + acting.swap(oldacting); + up.swap(oldup); + init_primary_up_acting( + newup, + newacting, + new_up_primary, + new_acting_primary); + + if (info.stats.up != up || + info.stats.acting != acting || + info.stats.up_primary != new_up_primary || + info.stats.acting_primary != new_acting_primary) { + info.stats.up = up; + info.stats.up_primary = new_up_primary; + info.stats.acting = acting; + info.stats.acting_primary = new_acting_primary; + info.stats.mapping_epoch = osdmap->get_epoch(); + } + + pl->clear_publish_stats(); + + // This will now be remapped during a backfill in cases + // that it would not have been before. + if (up != acting) + state_set(PG_STATE_REMAPPED); + else + state_clear(PG_STATE_REMAPPED); + + int role = osdmap->calc_pg_role(pg_whoami, acting); + set_role(role); + + // did acting, up, primary|acker change? 
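+  // Illustrative sketch (simplified; the authoritative check is
+  // PastIntervals::check_new_interval() just below): a new interval is
+  // declared roughly when a predicate like
+  //
+  //   oldup != newup || oldacting != newacting ||
+  //   old_up_primary.osd != new_up_primary ||
+  //   old_acting_primary.osd != new_acting_primary
+  //
+  // holds, or when pool-level changes such as a pg_num split occur; when it
+  // does, same_interval_since is bumped to the current osdmap epoch below.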
+ if (!lastmap) { + psdout(10) << " no lastmap" << dendl; + dirty_info = true; + dirty_big_info = true; + info.history.same_interval_since = osdmap->get_epoch(); + } else { + std::stringstream debug; + ceph_assert(info.history.same_interval_since != 0); + bool new_interval = PastIntervals::check_new_interval( + old_acting_primary.osd, + new_acting_primary, + oldacting, newacting, + old_up_primary.osd, + new_up_primary, + oldup, newup, + info.history.same_interval_since, + info.history.last_epoch_clean, + osdmap.get(), + lastmap.get(), + info.pgid.pgid, + missing_loc.get_recoverable_predicate(), + &past_intervals, + &debug); + psdout(10) << __func__ << ": check_new_interval output: " + << debug.str() << dendl; + if (new_interval) { + if (osdmap->get_epoch() == pl->oldest_stored_osdmap() && + info.history.last_epoch_clean < osdmap->get_epoch()) { + psdout(10) << " map gap, clearing past_intervals and faking" << dendl; + // our information is incomplete and useless; someone else was clean + // after everything we know if osdmaps were trimmed. + past_intervals.clear(); + } else { + psdout(10) << " noting past " << past_intervals << dendl; + } + dirty_info = true; + dirty_big_info = true; + info.history.same_interval_since = osdmap->get_epoch(); + if (osdmap->have_pg_pool(info.pgid.pgid.pool()) && + info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()), + osdmap->get_pg_num(info.pgid.pgid.pool()), + nullptr)) { + info.history.last_epoch_split = osdmap->get_epoch(); + } + } + } + + if (old_up_primary != up_primary || + oldup != up) { + info.history.same_up_since = osdmap->get_epoch(); + } + // this comparison includes primary rank via pg_shard_t + if (old_acting_primary != get_primary()) { + info.history.same_primary_since = osdmap->get_epoch(); + } + + on_new_interval(); + pl->on_info_history_change(); + + psdout(1) << __func__ << " up " << oldup << " -> " << up + << ", acting " << oldacting << " -> " << acting + << ", acting_primary " << old_acting_primary << " -> " + << new_acting_primary + << ", up_primary " << old_up_primary << " -> " << new_up_primary + << ", role " << oldrole << " -> " << role + << ", features acting " << acting_features + << " upacting " << upacting_features + << dendl; + + // deactivate. + state_clear(PG_STATE_ACTIVE); + state_clear(PG_STATE_PEERED); + state_clear(PG_STATE_PREMERGE); + state_clear(PG_STATE_DOWN); + state_clear(PG_STATE_RECOVERY_WAIT); + state_clear(PG_STATE_RECOVERY_TOOFULL); + state_clear(PG_STATE_RECOVERING); + + peer_purged.clear(); + acting_recovery_backfill.clear(); + + // reset primary/replica state? + if (was_old_primary || is_primary()) { + pl->clear_want_pg_temp(); + } else if (was_old_nonprimary || is_nonprimary()) { + pl->clear_want_pg_temp(); + } + clear_primary_state(); + + pl->on_change(t); + + ceph_assert(!deleting); + + // should we tell the primary we are here? + send_notify = !is_primary(); + + if (role != oldrole || + was_old_primary != is_primary()) { + // did primary change? + if (was_old_primary != is_primary()) { + state_clear(PG_STATE_CLEAN); + } + + pl->on_role_change(); + } else { + // no role change. + // did primary change? + if (get_primary() != old_acting_primary) { + psdout(10) << oldacting << " -> " << acting + << ", acting primary " + << old_acting_primary << " -> " << get_primary() + << dendl; + } else { + // primary is the same. + if (is_primary()) { + // i am (still) primary. but my replica set changed. 
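+        // A changed replica set means some member may now be missing objects
+        // or log entries, so the PG cannot stay CLEAN until peering and any
+        // needed recovery or backfill re-establish completeness.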
+ state_clear(PG_STATE_CLEAN); + + psdout(10) << oldacting << " -> " << acting + << ", replicas changed" << dendl; + } + } + } + + if (acting.empty() && !up.empty() && up_primary == pg_whoami) { + psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl; + pl->queue_want_pg_temp(acting); + } +} + +void PeeringState::on_new_interval() +{ + dout(20) << __func__ << dendl; + const OSDMapRef osdmap = get_osdmap(); + + // initialize features + acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + for (auto p = acting.begin(); p != acting.end(); ++p) { + if (*p == CRUSH_ITEM_NONE) + continue; + uint64_t f = osdmap->get_xinfo(*p).features; + acting_features &= f; + upacting_features &= f; + } + for (auto p = up.begin(); p != up.end(); ++p) { + if (*p == CRUSH_ITEM_NONE) + continue; + upacting_features &= osdmap->get_xinfo(*p).features; + } + psdout(20) << __func__ << " upacting_features 0x" << std::hex + << upacting_features << std::dec + << " from " << acting << "+" << up << dendl; + + psdout(20) << __func__ << " checking missing set deletes flag. missing = " + << get_pg_log().get_missing() << dendl; + + if (!pg_log.get_missing().may_include_deletes && + !perform_deletes_during_peering()) { + pl->rebuild_missing_set_with_deletes(pg_log); + } + ceph_assert( + pg_log.get_missing().may_include_deletes == + !perform_deletes_during_peering()); + + init_hb_stamps(); + + // update lease bounds for a new interval + auto mnow = pl->get_mnow(); + prior_readable_until_ub = std::max(prior_readable_until_ub, + readable_until_ub); + prior_readable_until_ub = info.history.refresh_prior_readable_until_ub( + mnow, prior_readable_until_ub); + psdout(10) << __func__ << " prior_readable_until_ub " + << prior_readable_until_ub << " (mnow " << mnow << " + " + << info.history.prior_readable_until_ub << ")" << dendl; + prior_readable_down_osds.clear(); // we populate this when we build the priorset + + readable_until = + readable_until_ub = + readable_until_ub_sent = + readable_until_ub_from_primary = ceph::signedspan::zero(); + + acting_readable_until_ub.clear(); + if (is_primary()) { + acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero()); + } + + pl->on_new_interval(); +} + +void PeeringState::init_primary_up_acting( + const vector &newup, + const vector &newacting, + int new_up_primary, + int new_acting_primary) +{ + actingset.clear(); + acting = newacting; + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] != CRUSH_ITEM_NONE) + actingset.insert( + pg_shard_t( + acting[i], + pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + upset.clear(); + up = newup; + for (uint8_t i = 0; i < up.size(); ++i) { + if (up[i] != CRUSH_ITEM_NONE) + upset.insert( + pg_shard_t( + up[i], + pool.info.is_erasure() ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + if (!pool.info.is_erasure()) { + // replicated + up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD); + primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD); + } else { + // erasure + up_primary = pg_shard_t(); + primary = pg_shard_t(); + for (uint8_t i = 0; i < up.size(); ++i) { + if (up[i] == new_up_primary) { + up_primary = pg_shard_t(up[i], shard_id_t(i)); + break; + } + } + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == new_acting_primary) { + primary = pg_shard_t(acting[i], shard_id_t(i)); + break; + } + } + ceph_assert(up_primary.osd == new_up_primary); + ceph_assert(primary.osd == new_acting_primary); + } +} + +void PeeringState::init_hb_stamps() +{ + if (is_primary()) { + // we care about all other osds in the acting set + hb_stamps.resize(acting.size() - 1); + unsigned i = 0; + for (auto p : acting) { + if (p == CRUSH_ITEM_NONE || p == get_primary().osd) { + continue; + } + hb_stamps[i++] = pl->get_hb_stamps(p); + } + hb_stamps.resize(i); + } else if (is_nonprimary()) { + // we care about just the primary + hb_stamps.resize(1); + hb_stamps[0] = pl->get_hb_stamps(get_primary().osd); + } else { + hb_stamps.clear(); + } + dout(10) << __func__ << " now " << hb_stamps << dendl; +} + + +void PeeringState::clear_recovery_state() +{ + async_recovery_targets.clear(); + backfill_targets.clear(); +} + +void PeeringState::clear_primary_state() +{ + psdout(10) << "clear_primary_state" << dendl; + + // clear peering state + stray_set.clear(); + peer_log_requested.clear(); + peer_missing_requested.clear(); + peer_info.clear(); + peer_bytes.clear(); + peer_missing.clear(); + peer_last_complete_ondisk.clear(); + peer_activated.clear(); + min_last_complete_ondisk = eversion_t(); + pg_trim_to = eversion_t(); + might_have_unfound.clear(); + need_up_thru = false; + missing_loc.clear(); + pg_log.reset_recovery_pointers(); + + clear_recovery_state(); + + last_update_ondisk = eversion_t(); + missing_loc.clear(); + pl->clear_primary_state(); +} + +/// return [start,end) bounds for required past_intervals +static pair get_required_past_interval_bounds( + const pg_info_t &info, + epoch_t oldest_map) { + epoch_t start = std::max( + info.history.last_epoch_clean ? info.history.last_epoch_clean : + info.history.epoch_pool_created, + oldest_map); + epoch_t end = std::max( + info.history.same_interval_since, + info.history.epoch_pool_created); + return make_pair(start, end); +} + + +void PeeringState::check_past_interval_bounds() const +{ + auto oldest_epoch = pl->oldest_stored_osdmap(); + auto rpib = get_required_past_interval_bounds( + info, + oldest_epoch); + if (rpib.first >= rpib.second) { + // do not warn if the start bound is dictated by oldest_map; the + // past intervals are presumably appropriate given the pg info. 
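+    // Illustrative example (epoch numbers invented): with
+    // last_epoch_clean = 880, same_interval_since = 900 and oldest_map = 905,
+    // the required bounds collapse to the empty range [905, 900), so an
+    // empty past_intervals is acceptable here; with oldest_map = 890 the
+    // bounds would be [890, 900) and the else branch below requires
+    // past_intervals to cover them.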
+ if (!past_intervals.empty() && + rpib.first > oldest_epoch) { + pl->get_clog_error() << info.pgid << " required past_interval bounds are" + << " empty [" << rpib << ") but past_intervals is not: " + << past_intervals; + derr << info.pgid << " required past_interval bounds are" + << " empty [" << rpib << ") but past_intervals is not: " + << past_intervals << dendl; + } + } else { + if (past_intervals.empty()) { + pl->get_clog_error() << info.pgid << " required past_interval bounds are" + << " not empty [" << rpib << ") but past_intervals " + << past_intervals << " is empty"; + derr << info.pgid << " required past_interval bounds are" + << " not empty [" << rpib << ") but past_intervals " + << past_intervals << " is empty" << dendl; + ceph_assert(!past_intervals.empty()); + } + + auto apib = past_intervals.get_bounds(); + if (apib.first > rpib.first) { + pl->get_clog_error() << info.pgid << " past_intervals [" << apib + << ") start interval does not contain the required" + << " bound [" << rpib << ") start"; + derr << info.pgid << " past_intervals [" << apib + << ") start interval does not contain the required" + << " bound [" << rpib << ") start" << dendl; + ceph_abort_msg("past_interval start interval mismatch"); + } + if (apib.second != rpib.second) { + pl->get_clog_error() << info.pgid << " past_interal bound [" << apib + << ") end does not match required [" << rpib + << ") end"; + derr << info.pgid << " past_interal bound [" << apib + << ") end does not match required [" << rpib + << ") end" << dendl; + ceph_abort_msg("past_interval end mismatch"); + } + } +} + +int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max) +{ + static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range"); + static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type"); + + ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX); + + // User can't set this too high anymore, but might be a legacy value + if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX) + pool_recovery_priority = OSD_POOL_PRIORITY_MAX; + if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN) + pool_recovery_priority = OSD_POOL_PRIORITY_MIN; + // Shift range from min to max to 0 to max - min + pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN); + ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN)); + + priority += pool_recovery_priority; + + // Clamp to valid range + if (priority > max) { + return max; + } else if (priority < OSD_RECOVERY_PRIORITY_MIN) { + return OSD_RECOVERY_PRIORITY_MIN; + } else { + return priority; + } +} + +unsigned PeeringState::get_recovery_priority() +{ + // a higher value -> a higher priority + int ret = OSD_RECOVERY_PRIORITY_BASE; + int base = ret; + + if (state & PG_STATE_FORCED_RECOVERY) { + ret = OSD_RECOVERY_PRIORITY_FORCED; + } else { + // XXX: This priority boost isn't so much about inactive, but about data-at-risk + if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) { + base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE; + // inactive: no. 
of replicas < min_size, highest priority since it blocks IO + ret = base + (pool.info.min_size - info.stats.avail_no_missing.size()); + } + + int64_t pool_recovery_priority = 0; + pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); + + ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]); + } + psdout(20) << __func__ << " recovery priority is " << ret << dendl; + return static_cast(ret); +} + +unsigned PeeringState::get_backfill_priority() +{ + // a higher value -> a higher priority + int ret = OSD_BACKFILL_PRIORITY_BASE; + int base = ret; + + if (state & PG_STATE_FORCED_BACKFILL) { + ret = OSD_BACKFILL_PRIORITY_FORCED; + } else { + if (actingset.size() < pool.info.min_size) { + base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE; + // inactive: no. of replicas < min_size, highest priority since it blocks IO + ret = base + (pool.info.min_size - actingset.size()); + + } else if (is_undersized()) { + // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas + ceph_assert(pool.info.size > actingset.size()); + base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE; + ret = base + (pool.info.size - actingset.size()); + + } else if (is_degraded()) { + // degraded: baseline degraded + base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE; + } + + // Adjust with pool's recovery priority + int64_t pool_recovery_priority = 0; + pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); + + ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]); + } + + psdout(20) << __func__ << " backfill priority is " << ret << dendl; + return static_cast(ret); +} + +unsigned PeeringState::get_delete_priority() +{ + auto state = get_osdmap()->get_state(pg_whoami.osd); + if (state & (CEPH_OSD_BACKFILLFULL | + CEPH_OSD_FULL)) { + return OSD_DELETE_PRIORITY_FULL; + } else if (state & CEPH_OSD_NEARFULL) { + return OSD_DELETE_PRIORITY_FULLISH; + } else { + return OSD_DELETE_PRIORITY_NORMAL; + } +} + +bool PeeringState::set_force_recovery(bool b) +{ + bool did = false; + if (b) { + if (!(state & PG_STATE_FORCED_RECOVERY) && + (state & (PG_STATE_DEGRADED | + PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERING))) { + psdout(20) << __func__ << " set" << dendl; + state_set(PG_STATE_FORCED_RECOVERY); + pl->publish_stats_to_osd(); + did = true; + } + } else if (state & PG_STATE_FORCED_RECOVERY) { + psdout(20) << __func__ << " clear" << dendl; + state_clear(PG_STATE_FORCED_RECOVERY); + pl->publish_stats_to_osd(); + did = true; + } + if (did) { + psdout(20) << __func__ << " state " << get_current_state() + << dendl; + pl->update_local_background_io_priority(get_recovery_priority()); + } + return did; +} + +bool PeeringState::set_force_backfill(bool b) +{ + bool did = false; + if (b) { + if (!(state & PG_STATE_FORCED_BACKFILL) && + (state & (PG_STATE_DEGRADED | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILLING))) { + psdout(10) << __func__ << " set" << dendl; + state_set(PG_STATE_FORCED_BACKFILL); + pl->publish_stats_to_osd(); + did = true; + } + } else if (state & PG_STATE_FORCED_BACKFILL) { + psdout(10) << __func__ << " clear" << dendl; + state_clear(PG_STATE_FORCED_BACKFILL); + pl->publish_stats_to_osd(); + did = true; + } + if (did) { + psdout(20) << __func__ << " state " << get_current_state() + << dendl; + pl->update_local_background_io_priority(get_backfill_priority()); + } + return did; +} + +void PeeringState::schedule_renew_lease() +{ + pl->schedule_renew_lease( + last_peering_reset, + readable_interval / 2); +} + +void 
PeeringState::send_lease() +{ + epoch_t epoch = pl->get_osdmap_epoch(); + for (auto peer : actingset) { + if (peer == pg_whoami) { + continue; + } + pl->send_cluster_message( + peer.osd, + make_message(epoch, + spg_t(spgid.pgid, peer.shard), + get_lease()), + epoch); + } +} + +void PeeringState::proc_lease(const pg_lease_t& l) +{ + if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { + psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex + << upacting_features << std::dec + << " does not include SERVER_OCTOPUS" << dendl; + return; + } + if (!is_nonprimary()) { + psdout(20) << __func__ << " no-op, !nonprimary" << dendl; + return; + } + psdout(10) << __func__ << " " << l << dendl; + if (l.readable_until_ub > readable_until_ub_from_primary) { + readable_until_ub_from_primary = l.readable_until_ub; + } + + ceph::signedspan ru = ceph::signedspan::zero(); + if (l.readable_until != ceph::signedspan::zero() && + hb_stamps[0]->peer_clock_delta_ub) { + ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub; + psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub + << " -> ru " << ru << dendl; + } + if (ru > readable_until) { + readable_until = ru; + psdout(20) << __func__ << " readable_until now " << readable_until << dendl; + // NOTE: if we ever decide to block/queue ops on the replica, + // we'll need to wake them up here. + } + + ceph::signedspan ruub; + if (hb_stamps[0]->peer_clock_delta_lb) { + ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb; + psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb + << " -> ruub " << ruub << dendl; + } else { + ruub = pl->get_mnow() + l.interval; + psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl; + } + if (ruub > readable_until_ub) { + readable_until_ub = ruub; + psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub + << dendl; + } +} + +void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a) +{ + if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { + return; + } + auto now = pl->get_mnow(); + bool was_min = false; + for (unsigned i = 0; i < acting.size(); ++i) { + if (from == acting[i]) { + // the lease_ack value is based on the primary's clock + if (a.readable_until_ub > acting_readable_until_ub[i]) { + if (acting_readable_until_ub[i] == readable_until) { + was_min = true; + } + acting_readable_until_ub[i] = a.readable_until_ub; + break; + } + } + } + if (was_min) { + auto old_ru = readable_until; + recalc_readable_until(); + if (now < old_ru) { + pl->recheck_readable(); + } + } +} + +void PeeringState::proc_renew_lease() +{ + if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { + return; + } + renew_lease(pl->get_mnow()); + send_lease(); + schedule_renew_lease(); +} + +void PeeringState::recalc_readable_until() +{ + assert(is_primary()); + ceph::signedspan min = readable_until_ub_sent; + for (unsigned i = 0; i < acting.size(); ++i) { + if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) { + continue; + } + dout(20) << __func__ << " peer osd." 
<< acting[i] + << " ruub " << acting_readable_until_ub[i] << dendl; + if (acting_readable_until_ub[i] < min) { + min = acting_readable_until_ub[i]; + } + } + readable_until = min; + readable_until_ub = min; + dout(20) << __func__ << " readable_until[_ub] " << readable_until + << " (sent " << readable_until_ub_sent << ")" << dendl; +} + +bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map) +{ + if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { + return false; + } + bool changed = false; + auto p = prior_readable_down_osds.begin(); + while (p != prior_readable_down_osds.end()) { + if (map->is_dead(*p)) { + dout(10) << __func__ << " prior_readable_down_osds osd." << *p + << " is dead as of epoch " << map->get_epoch() + << dendl; + p = prior_readable_down_osds.erase(p); + changed = true; + } else { + ++p; + } + } + if (changed && prior_readable_down_osds.empty()) { + psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl; + clear_prior_readable_until_ub(); + return true; + } + return false; +} + +bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap) +{ + epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd); + if (need_up_thru && + up_thru >= info.history.same_interval_since) { + psdout(10) << "adjust_need_up_thru now " + << up_thru << ", need_up_thru now false" << dendl; + need_up_thru = false; + return true; + } + return false; +} + +PastIntervals::PriorSet PeeringState::build_prior() +{ + if (1) { + // sanity check + for (auto it = peer_info.begin(); it != peer_info.end(); ++it) { + ceph_assert(info.history.last_epoch_started >= + it->second.history.last_epoch_started); + } + } + + const OSDMap &osdmap = *get_osdmap(); + PastIntervals::PriorSet prior = past_intervals.get_prior_set( + pool.info.is_erasure(), + info.history.last_epoch_started, + &missing_loc.get_recoverable_predicate(), + [&](epoch_t start, int osd, epoch_t *lost_at) { + const osd_info_t *pinfo = 0; + if (osdmap.exists(osd)) { + pinfo = &osdmap.get_info(osd); + if (lost_at) + *lost_at = pinfo->lost_at; + } + + if (osdmap.is_up(osd)) { + return PastIntervals::UP; + } else if (!pinfo) { + return PastIntervals::DNE; + } else if (pinfo->lost_at > start) { + return PastIntervals::LOST; + } else { + return PastIntervals::DOWN; + } + }, + up, + acting, + dpp); + + if (prior.pg_down) { + state_set(PG_STATE_DOWN); + } + + if (get_osdmap()->get_up_thru(pg_whoami.osd) < + info.history.same_interval_since) { + psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd) + << " < same_since " << info.history.same_interval_since + << ", must notify monitor" << dendl; + need_up_thru = true; + } else { + psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd) + << " >= same_since " << info.history.same_interval_since + << ", all is well" << dendl; + need_up_thru = false; + } + pl->set_probe_targets(prior.probe); + return prior; +} + +bool PeeringState::needs_recovery() const +{ + ceph_assert(is_primary()); + + auto &missing = pg_log.get_missing(); + + if (missing.num_missing()) { + psdout(10) << __func__ << " primary has " << missing.num_missing() + << " missing" << dendl; + return true; + } + + ceph_assert(!acting_recovery_backfill.empty()); + for (const pg_shard_t& peer : acting_recovery_backfill) { + if (peer == get_primary()) { + continue; + } + auto pm = peer_missing.find(peer); + if (pm == peer_missing.end()) { + psdout(10) << __func__ << " osd." 
<< peer << " doesn't have missing set" + << dendl; + continue; + } + if (pm->second.num_missing()) { + psdout(10) << __func__ << " osd." << peer << " has " + << pm->second.num_missing() << " missing" << dendl; + return true; + } + } + + psdout(10) << __func__ << " is recovered" << dendl; + return false; +} + +bool PeeringState::needs_backfill() const +{ + ceph_assert(is_primary()); + + // We can assume that only possible osds that need backfill + // are on the backfill_targets vector nodes. + for (const pg_shard_t& peer : backfill_targets) { + auto pi = peer_info.find(peer); + ceph_assert(pi != peer_info.end()); + if (!pi->second.last_backfill.is_max()) { + psdout(10) << __func__ << " osd." << peer + << " has last_backfill " << pi->second.last_backfill << dendl; + return true; + } + } + + psdout(10) << __func__ << " does not need backfill" << dendl; + return false; +} + +/* + * Returns true unless there is a non-lost OSD in might_have_unfound. + */ +bool PeeringState::all_unfound_are_queried_or_lost( + const OSDMapRef osdmap) const +{ + ceph_assert(is_primary()); + + auto peer = might_have_unfound.begin(); + auto mend = might_have_unfound.end(); + for (; peer != mend; ++peer) { + if (peer_missing.count(*peer)) + continue; + auto iter = peer_info.find(*peer); + if (iter != peer_info.end() && + (iter->second.is_empty() || iter->second.dne())) + continue; + if (!osdmap->exists(peer->osd)) + continue; + const osd_info_t &osd_info(osdmap->get_info(peer->osd)); + if (osd_info.lost_at <= osd_info.up_from) { + // If there is even one OSD in might_have_unfound that isn't lost, we + // still might retrieve our unfound. + return false; + } + } + psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " + << might_have_unfound + << " have been queried or are marked lost" << dendl; + return true; +} + + +void PeeringState::reject_reservation() +{ + pl->unreserve_recovery_space(); + pl->send_cluster_message( + primary.osd, + make_message( + MBackfillReserve::REJECT_TOOFULL, + spg_t(info.pgid.pgid, primary.shard), + get_osdmap_epoch()), + get_osdmap_epoch()); +} + +/** + * find_best_info + * + * Returns an iterator to the best info in infos sorted by: + * 1) Prefer newer last_update + * 2) Prefer longer tail if it brings another info into contiguity + * 3) Prefer current primary + */ +map::const_iterator PeeringState::find_best_info( + const map &infos, + bool restrict_to_up_acting, + bool *history_les_bound) const +{ + ceph_assert(history_les_bound); + /* See doc/dev/osd_internals/last_epoch_started.rst before attempting + * to make changes to this process. Also, make sure to update it + * when you find bugs! 
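+ *
+ * Illustrative example (values invented): with three candidate infos whose
+ * last_update is 20'45, 20'45 and 20'40, the 20'40 peer loses on criterion
+ * 1); the two 20'45 peers then tie, so the one with the older log_tail
+ * (i.e. the longer log) is preferred, and if the tails match as well the
+ * current primary wins the remaining tie.  (For pools that require
+ * rollback, i.e. EC pools, the last_update comparison is inverted and the
+ * oldest last_update wins.)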
*/ + epoch_t max_last_epoch_started_found = 0; + for (auto i = infos.begin(); i != infos.end(); ++i) { + if (!cct->_conf->osd_find_best_info_ignore_history_les && + max_last_epoch_started_found < i->second.history.last_epoch_started) { + *history_les_bound = true; + max_last_epoch_started_found = i->second.history.last_epoch_started; + } + if (!i->second.is_incomplete() && + max_last_epoch_started_found < i->second.last_epoch_started) { + *history_les_bound = false; + max_last_epoch_started_found = i->second.last_epoch_started; + } + } + eversion_t min_last_update_acceptable = eversion_t::max(); + for (auto i = infos.begin(); i != infos.end(); ++i) { + if (max_last_epoch_started_found <= i->second.last_epoch_started) { + if (min_last_update_acceptable > i->second.last_update) + min_last_update_acceptable = i->second.last_update; + } + } + if (min_last_update_acceptable == eversion_t::max()) + return infos.end(); + + auto best = infos.end(); + // find osd with newest last_update (oldest for ec_pool). + // if there are multiples, prefer + // - a longer tail, if it brings another peer into log contiguity + // - the current primary + for (auto p = infos.begin(); p != infos.end(); ++p) { + if (restrict_to_up_acting && !is_up(p->first) && + !is_acting(p->first)) + continue; + // Only consider peers with last_update >= min_last_update_acceptable + if (p->second.last_update < min_last_update_acceptable) + continue; + // Disqualify anyone with a too old last_epoch_started + if (p->second.last_epoch_started < max_last_epoch_started_found) + continue; + // Disqualify anyone who is incomplete (not fully backfilled) + if (p->second.is_incomplete()) + continue; + if (best == infos.end()) { + best = p; + continue; + } + // Prefer newer last_update + if (pool.info.require_rollback()) { + if (p->second.last_update > best->second.last_update) + continue; + if (p->second.last_update < best->second.last_update) { + best = p; + continue; + } + } else { + if (p->second.last_update < best->second.last_update) + continue; + if (p->second.last_update > best->second.last_update) { + best = p; + continue; + } + } + + // Prefer longer tail + if (p->second.log_tail > best->second.log_tail) { + continue; + } else if (p->second.log_tail < best->second.log_tail) { + best = p; + continue; + } + + if (!p->second.has_missing() && best->second.has_missing()) { + psdout(10) << __func__ << " prefer osd." << p->first + << " because it is complete while best has missing" + << dendl; + best = p; + continue; + } else if (p->second.has_missing() && !best->second.has_missing()) { + psdout(10) << __func__ << " skipping osd." << p->first + << " because it has missing while best is complete" + << dendl; + continue; + } else { + // both are complete or have missing + // fall through + } + + // prefer current primary (usually the caller), all things being equal + if (p->first == pg_whoami) { + psdout(10) << "calc_acting prefer osd." 
<< p->first + << " because it is current primary" << dendl; + best = p; + continue; + } + } + return best; +} + +void PeeringState::calc_ec_acting( + map::const_iterator auth_log_shard, + unsigned size, + const vector &acting, + const vector &up, + const map &all_info, + bool restrict_to_up_acting, + vector *_want, + set *backfill, + set *acting_backfill, + ostream &ss) +{ + vector want(size, CRUSH_ITEM_NONE); + map > all_info_by_shard; + for (auto i = all_info.begin(); + i != all_info.end(); + ++i) { + all_info_by_shard[i->first.shard].insert(i->first); + } + for (uint8_t i = 0; i < want.size(); ++i) { + ss << "For position " << (unsigned)i << ": "; + if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE && + !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() && + all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >= + auth_log_shard->second.log_tail) { + ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl; + want[i] = up[i]; + continue; + } + if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) { + ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i)) + << " and "; + backfill->insert(pg_shard_t(up[i], shard_id_t(i))); + } + + if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE && + !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() && + all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >= + auth_log_shard->second.log_tail) { + ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl; + want[i] = acting[i]; + } else if (!restrict_to_up_acting) { + for (auto j = all_info_by_shard[shard_id_t(i)].begin(); + j != all_info_by_shard[shard_id_t(i)].end(); + ++j) { + ceph_assert(j->shard == i); + if (!all_info.find(*j)->second.is_incomplete() && + all_info.find(*j)->second.last_update >= + auth_log_shard->second.log_tail) { + ss << " selecting stray: " << *j << std::endl; + want[i] = j->osd; + break; + } + } + if (want[i] == CRUSH_ITEM_NONE) + ss << " failed to fill position " << (int)i << std::endl; + } + } + + for (uint8_t i = 0; i < want.size(); ++i) { + if (want[i] != CRUSH_ITEM_NONE) { + acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i))); + } + } + acting_backfill->insert(backfill->begin(), backfill->end()); + _want->swap(want); +} + +std::pair::const_iterator, eversion_t> +PeeringState::select_replicated_primary( + map::const_iterator auth_log_shard, + uint64_t force_auth_primary_missing_objects, + const std::vector &up, + pg_shard_t up_primary, + const map &all_info, + const OSDMapRef osdmap, + ostream &ss) +{ + pg_shard_t auth_log_shard_id = auth_log_shard->first; + + ss << __func__ << " newest update on osd." 
<< auth_log_shard_id + << " with " << auth_log_shard->second << std::endl; + + // select primary + auto primary = all_info.find(up_primary); + if (up.size() && + !primary->second.is_incomplete() && + primary->second.last_update >= + auth_log_shard->second.log_tail) { + if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { + auto approx_missing_objects = + primary->second.stats.stats.sum.num_objects_missing; + auto auth_version = auth_log_shard->second.last_update.version; + auto primary_version = primary->second.last_update.version; + if (auth_version > primary_version) { + approx_missing_objects += auth_version - primary_version; + } else { + approx_missing_objects += primary_version - auth_version; + } + if ((uint64_t)approx_missing_objects > + force_auth_primary_missing_objects) { + primary = auth_log_shard; + ss << "up_primary: " << up_primary << ") has approximate " + << approx_missing_objects + << "(>" << force_auth_primary_missing_objects <<") " + << "missing objects, osd." << auth_log_shard_id + << " selected as primary instead" + << std::endl; + } else { + ss << "up_primary: " << up_primary << ") selected as primary" + << std::endl; + } + } else { + ss << "up_primary: " << up_primary << ") selected as primary" << std::endl; + } + } else { + ceph_assert(!auth_log_shard->second.is_incomplete()); + ss << "up[0] needs backfill, osd." << auth_log_shard_id + << " selected as primary instead" << std::endl; + primary = auth_log_shard; + } + + ss << __func__ << " primary is osd." << primary->first + << " with " << primary->second << std::endl; + + /* We include auth_log_shard->second.log_tail because in GetLog, + * we will request logs back to the min last_update over our + * acting_backfill set, which will result in our log being extended + * as far backwards as necessary to pick up any peers which can + * be log recovered by auth_log_shard's log */ + eversion_t oldest_auth_log_entry = + std::min(primary->second.log_tail, auth_log_shard->second.log_tail); + + return std::make_pair(primary, oldest_auth_log_entry); +} + + +/** + * calculate the desired acting set. + * + * Choose an appropriate acting set. Prefer up[0], unless it is + * incomplete, or another osd has a longer tail that allows us to + * bring other up nodes up to date. + */ +void PeeringState::calc_replicated_acting( + map::const_iterator primary, + eversion_t oldest_auth_log_entry, + unsigned size, + const vector &acting, + const vector &up, + pg_shard_t up_primary, + const map &all_info, + bool restrict_to_up_acting, + vector *want, + set *backfill, + set *acting_backfill, + const OSDMapRef osdmap, + const PGPool& pool, + ostream &ss) +{ + ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "") + << std::endl; + + want->push_back(primary->first.osd); + acting_backfill->insert(primary->first); + + // select replicas that have log contiguity with primary. + // prefer up, then acting, then any peer_info osds + for (auto i : up) { + pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD); + if (up_cand == primary->first) + continue; + const pg_info_t &cur_info = all_info.find(up_cand)->second; + if (cur_info.is_incomplete() || + cur_info.last_update < oldest_auth_log_entry) { + ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl; + backfill->insert(up_cand); + acting_backfill->insert(up_cand); + } else { + want->push_back(i); + acting_backfill->insert(up_cand); + ss << " osd." 
<< i << " (up) accepted " << cur_info << std::endl; + } + } + + if (want->size() >= size) { + return; + } + + std::vector> candidate_by_last_update; + candidate_by_last_update.reserve(acting.size()); + // This no longer has backfill OSDs, but they are covered above. + for (auto i : acting) { + pg_shard_t acting_cand(i, shard_id_t::NO_SHARD); + // skip up osds we already considered above + if (acting_cand == primary->first) + continue; + auto up_it = find(up.begin(), up.end(), i); + if (up_it != up.end()) + continue; + + const pg_info_t &cur_info = all_info.find(acting_cand)->second; + if (cur_info.is_incomplete() || + cur_info.last_update < oldest_auth_log_entry) { + ss << " shard " << acting_cand << " (acting) REJECTED " + << cur_info << std::endl; + } else { + candidate_by_last_update.emplace_back(cur_info.last_update, i); + } + } + + auto sort_by_eversion =[](const std::pair &lhs, + const std::pair &rhs) { + return lhs.first > rhs.first; + }; + // sort by last_update, in descending order. + std::sort(candidate_by_last_update.begin(), + candidate_by_last_update.end(), sort_by_eversion); + for (auto &p: candidate_by_last_update) { + ceph_assert(want->size() < size); + want->push_back(p.second); + pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD); + acting_backfill->insert(s); + ss << " shard " << s << " (acting) accepted " + << all_info.find(s)->second << std::endl; + if (want->size() >= size) { + return; + } + } + + if (restrict_to_up_acting) { + return; + } + candidate_by_last_update.clear(); + candidate_by_last_update.reserve(all_info.size()); // overestimate but fine + // continue to search stray to find more suitable peers + for (auto &i : all_info) { + // skip up osds we already considered above + if (i.first == primary->first) + continue; + auto up_it = find(up.begin(), up.end(), i.first.osd); + if (up_it != up.end()) + continue; + auto acting_it = find( + acting.begin(), acting.end(), i.first.osd); + if (acting_it != acting.end()) + continue; + + if (i.second.is_incomplete() || + i.second.last_update < oldest_auth_log_entry) { + ss << " shard " << i.first << " (stray) REJECTED " << i.second + << std::endl; + } else { + candidate_by_last_update.emplace_back( + i.second.last_update, i.first.osd); + } + } + + if (candidate_by_last_update.empty()) { + // save us some effort + return; + } + + // sort by last_update, in descending order. + std::sort(candidate_by_last_update.begin(), + candidate_by_last_update.end(), sort_by_eversion); + + for (auto &p: candidate_by_last_update) { + ceph_assert(want->size() < size); + want->push_back(p.second); + pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD); + acting_backfill->insert(s); + ss << " shard " << s << " (stray) accepted " + << all_info.find(s)->second << std::endl; + if (want->size() >= size) { + return; + } + } +} + +// Defines osd preference order: acting set, then larger last_update +using osd_ord_t = std::tuple; // +using osd_id_t = int; + +class bucket_candidates_t { + std::deque> osds; + int selected = 0; + +public: + void add_osd(osd_ord_t ord, osd_id_t osd) { + // osds will be added in smallest to largest order + assert(osds.empty() || osds.back().first <= ord); + osds.push_back(std::make_pair(ord, osd)); + } + osd_id_t pop_osd() { + ceph_assert(!is_empty()); + auto ret = osds.front(); + osds.pop_front(); + return ret.second; + } + + void inc_selected() { selected++; } + unsigned get_num_selected() const { return selected; } + + osd_ord_t get_ord() const { + return osds.empty() ? 
std::make_tuple(false, eversion_t()) + : osds.front().first; + } + + bool is_empty() const { return osds.empty(); } + + bool operator<(const bucket_candidates_t &rhs) const { + return std::make_tuple(-selected, get_ord()) < + std::make_tuple(-rhs.selected, rhs.get_ord()); + } + + friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &); +}; + +std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand) +{ + return lhs << "candidates[" << cand.osds << "]"; +} + +class bucket_heap_t { + using elem_t = std::reference_wrapper; + std::vector heap; + + // Max heap -- should emit buckets in order of preference + struct comp { + bool operator()(const elem_t &lhs, const elem_t &rhs) { + return lhs.get() < rhs.get(); + } + }; +public: + void push_if_nonempty(elem_t e) { + if (!e.get().is_empty()) { + heap.push_back(e); + std::push_heap(heap.begin(), heap.end(), comp()); + } + } + elem_t pop() { + std::pop_heap(heap.begin(), heap.end(), comp()); + auto ret = heap.back(); + heap.pop_back(); + return ret; + } + + bool is_empty() const { return heap.empty(); } +}; + +/** + * calc_replicated_acting_stretch + * + * Choose an acting set using as much of the up set as possible; filling + * in the remaining slots so as to maximize the number of crush buckets at + * level pool.info.peering_crush_bucket_barrier represented. + * + * Stretch clusters are a bit special: while they have a "size" the + * same way as normal pools, if we happen to lose a data center + * (we call it a "stretch bucket", but really it'll be a data center or + * a cloud availability zone), we don't actually want to shove + * 2 DC's worth of replication into a single site -- it won't fit! + * So we locally calculate a bucket_max, based + * on the targeted number of stretch buckets for the pool and + * its size. Then we won't pull more than bucket_max from any + * given ancestor even if it leaves us undersized. + + * There are two distinct phases: (commented below) + */ +void PeeringState::calc_replicated_acting_stretch( + map::const_iterator primary, + eversion_t oldest_auth_log_entry, + unsigned size, + const vector &acting, + const vector &up, + pg_shard_t up_primary, + const map &all_info, + bool restrict_to_up_acting, + vector *want, + set *backfill, + set *acting_backfill, + const OSDMapRef osdmap, + const PGPool& pool, + ostream &ss) +{ + ceph_assert(want); + ceph_assert(acting_backfill); + ceph_assert(backfill); + ss << __func__ << (restrict_to_up_acting ? 
" restrict_to_up_acting" : "") + << std::endl; + + auto used = [want](int osd) { + return std::find(want->begin(), want->end(), osd) != want->end(); + }; + + auto usable_info = [&](const auto &cur_info) mutable { + return !(cur_info.is_incomplete() || + cur_info.last_update < oldest_auth_log_entry); + }; + + auto osd_info = [&](int osd) mutable -> const pg_info_t & { + pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD); + const pg_info_t &cur_info = all_info.find(cand)->second; + return cur_info; + }; + + auto usable_osd = [&](int osd) mutable { + return usable_info(osd_info(osd)); + }; + + std::map ancestors; + auto get_ancestor = [&](int osd) mutable { + int ancestor = osdmap->crush->get_parent_of_type( + osd, + pool.info.peering_crush_bucket_barrier, + pool.info.crush_rule); + return &ancestors[ancestor]; + }; + + unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target; + if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) { + ++bucket_max; + } + + /* 1) Select all usable osds from the up set as well as the primary + * + * We also stash any unusable osds from up into backfill. + */ + auto add_required = [&](int osd) { + if (!used(osd)) { + want->push_back(osd); + acting_backfill->insert( + pg_shard_t(osd, shard_id_t::NO_SHARD)); + get_ancestor(osd)->inc_selected(); + } + }; + add_required(primary->first.osd); + ss << " osd " << primary->first.osd << " primary accepted " + << osd_info(primary->first.osd) << std::endl; + for (auto upcand: up) { + auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD); + auto &curinfo = osd_info(upcand); + if (usable_osd(upcand)) { + ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl; + add_required(upcand); + } else { + ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl; + backfill->insert(upshard); + acting_backfill->insert(upshard); + } + } + + if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid + ss << " up set sufficient" << std::endl; + return; + } + ss << " up set insufficient, considering remaining osds" << std::endl; + + /* 2) Fill out remaining slots from usable osds in all_info + * while maximizing the number of ancestor nodes at the + * barrier_id crush level. 
+ */ + { + std::vector> candidates; + /* To do this, we first filter the set of usable osd into an ordered + * list of usable osds + */ + auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t { + return std::make_tuple( + !is_acting /* acting should sort first */, + info.last_update); + }; + for (auto &cand : acting) { + auto &cand_info = osd_info(cand); + if (!used(cand) && usable_info(cand_info)) { + ss << " acting candidate " << cand << " " << cand_info << std::endl; + candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand)); + } + } + if (!restrict_to_up_acting) { + for (auto &[cand, info] : all_info) { + if (!used(cand.osd) && usable_info(info) && + (std::find(acting.begin(), acting.end(), cand.osd) + == acting.end())) { + ss << " other candidate " << cand << " " << info << std::endl; + candidates.push_back( + std::make_pair(get_osd_ord(false, info), cand.osd)); + } + } + } + std::sort(candidates.begin(), candidates.end()); + + // We then filter these candidates by ancestor + std::for_each(candidates.begin(), candidates.end(), [&](auto cand) { + get_ancestor(cand.second)->add_osd(cand.first, cand.second); + }); + } + + auto pop_ancestor = [&](auto &ancestor) { + ceph_assert(!ancestor.is_empty()); + auto osd = ancestor.pop_osd(); + + ss << " accepting candidate " << osd << std::endl; + + ceph_assert(!used(osd)); + ceph_assert(usable_osd(osd)); + + want->push_back(osd); + acting_backfill->insert( + pg_shard_t(osd, shard_id_t::NO_SHARD)); + ancestor.inc_selected(); + }; + + /* Next, we use the ancestors map to grab a descendant of the + * peering_crush_mandatory_member if not already represented. + * + * TODO: using 0 here to match other users. Prior to merge, I + * expect that this and other users should instead check against + * CRUSH_ITEM_NONE. + */ + if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) { + auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member); + if (aiter != ancestors.end() && + !aiter->second.get_num_selected()) { + ss << " adding required ancestor " << aiter->first << std::endl; + ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise + pop_ancestor(aiter->second); + } + } + + /* We then place the ancestors in a heap ordered by fewest selected + * and then by the ordering token of the next osd */ + bucket_heap_t aheap; + std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) { + aheap.push_if_nonempty(anc.second); + }); + + /* and pull from this heap until it's empty or we have enough. + * "We have enough" is a sufficient check here for + * stretch_set_can_peer() because our heap sorting always + * pulls from ancestors with the least number of included OSDs, + * so if it is possible to satisfy the bucket_count constraints we + * will do so. + */ + while (!aheap.is_empty() && want->size() < pool.info.size) { + auto next = aheap.pop(); + pop_ancestor(next.get()); + if (next.get().get_num_selected() < bucket_max) { + aheap.push_if_nonempty(next); + } + } + + /* The end result is that we should have as many buckets covered as + * possible while respecting up, the primary selection, + * the pool size (given bucket count constraints), + * and the mandatory member. + */ +} + + +bool PeeringState::recoverable(const vector &want) const +{ + unsigned num_want_acting = 0; + set have; + for (int i = 0; i < (int)want.size(); ++i) { + if (want[i] != CRUSH_ITEM_NONE) { + ++num_want_acting; + have.insert( + pg_shard_t( + want[i], + pool.info.is_erasure() ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + } + + if (num_want_acting < pool.info.min_size) { + const bool recovery_ec_pool_below_min_size= + HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS); + + if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) { + psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl; + return false; + } else if (!cct->_conf.get_val("osd_allow_recovery_below_min_size")) { + psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl; + return false; + } + } + if (missing_loc.get_recoverable_predicate()(have)) { + return true; + } else { + psdout(10) << __func__ << " failed, not recoverable " << dendl; + return false; + } +} + +void PeeringState::choose_async_recovery_ec( + const map &all_info, + const pg_info_t &auth_info, + vector *want, + set *async_recovery, + const OSDMapRef osdmap) const +{ + set > candidates_by_cost; + for (uint8_t i = 0; i < want->size(); ++i) { + if ((*want)[i] == CRUSH_ITEM_NONE) + continue; + + // Considering log entries to recover is accurate enough for + // now. We could use minimum_to_decode_with_cost() later if + // necessary. + pg_shard_t shard_i((*want)[i], shard_id_t(i)); + // do not include strays + if (stray_set.find(shard_i) != stray_set.end()) + continue; + // Do not include an osd that is not up, since choosing it as + // an async_recovery_target will move it out of the acting set. + // This results in it being identified as a stray during peering, + // because it is no longer in the up or acting set. + if (!is_up(shard_i)) + continue; + auto shard_info = all_info.find(shard_i)->second; + // for ec pools we rollback all entries past the authoritative + // last_update *before* activation. This is relatively inexpensive + // compared to recovery, since it is purely local, so treat shards + // past the authoritative last_update the same as those equal to it. 
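+    // Illustrative cost estimate (numbers invented): a shard reporting 3
+    // missing objects whose last_update.version trails the authoritative
+    // version by 120 entries gets an approximate cost of 3 + 120 = 123; if
+    // that exceeds osd_async_recovery_min_cost it becomes a candidate, and
+    // it is only moved to async recovery further down when the remaining
+    // want set is still recoverable.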
+ version_t auth_version = auth_info.last_update.version; + version_t candidate_version = shard_info.last_update.version; + if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { + auto approx_missing_objects = + shard_info.stats.stats.sum.num_objects_missing; + if (auth_version > candidate_version) { + approx_missing_objects += auth_version - candidate_version; + } + if (static_cast(approx_missing_objects) > + cct->_conf.get_val("osd_async_recovery_min_cost")) { + candidates_by_cost.emplace(approx_missing_objects, shard_i); + } + } else { + if (auth_version > candidate_version && + (auth_version - candidate_version) > cct->_conf.get_val("osd_async_recovery_min_cost")) { + candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i)); + } + } + } + + psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost + << dendl; + + // take out as many osds as we can for async recovery, in order of cost + for (auto rit = candidates_by_cost.rbegin(); + rit != candidates_by_cost.rend(); ++rit) { + pg_shard_t cur_shard = rit->second; + vector candidate_want(*want); + candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE; + if (recoverable(candidate_want)) { + want->swap(candidate_want); + async_recovery->insert(cur_shard); + } + } + psdout(20) << __func__ << " result want=" << *want + << " async_recovery=" << *async_recovery << dendl; +} + +void PeeringState::choose_async_recovery_replicated( + const map &all_info, + const pg_info_t &auth_info, + vector *want, + set *async_recovery, + const OSDMapRef osdmap) const +{ + set > candidates_by_cost; + for (auto osd_num : *want) { + pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD); + // do not include strays + if (stray_set.find(shard_i) != stray_set.end()) + continue; + // Do not include an osd that is not up, since choosing it as + // an async_recovery_target will move it out of the acting set. + // This results in it being identified as a stray during peering, + // because it is no longer in the up or acting set. 
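+    // Note: unlike the EC variant above, candidates here are whole OSDs
+    // (shard NO_SHARD), and the eviction loop further down additionally
+    // stops once removing another candidate would shrink want below the
+    // pool's min_size.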
+ if (!is_up(shard_i)) + continue; + auto shard_info = all_info.find(shard_i)->second; + // use the approximate magnitude of the difference in length of + // logs plus historical missing objects as the cost of recovery + version_t auth_version = auth_info.last_update.version; + version_t candidate_version = shard_info.last_update.version; + if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { + auto approx_missing_objects = + shard_info.stats.stats.sum.num_objects_missing; + if (auth_version > candidate_version) { + approx_missing_objects += auth_version - candidate_version; + } else { + approx_missing_objects += candidate_version - auth_version; + } + if (static_cast(approx_missing_objects) > + cct->_conf.get_val("osd_async_recovery_min_cost")) { + candidates_by_cost.emplace(approx_missing_objects, shard_i); + } + } else { + size_t approx_entries; + if (auth_version > candidate_version) { + approx_entries = auth_version - candidate_version; + } else { + approx_entries = candidate_version - auth_version; + } + if (approx_entries > cct->_conf.get_val("osd_async_recovery_min_cost")) { + candidates_by_cost.insert(make_pair(approx_entries, shard_i)); + } + } + } + + psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost + << dendl; + // take out as many osds as we can for async recovery, in order of cost + for (auto rit = candidates_by_cost.rbegin(); + rit != candidates_by_cost.rend(); ++rit) { + if (want->size() <= pool.info.min_size) { + break; + } + pg_shard_t cur_shard = rit->second; + vector candidate_want(*want); + for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) { + if (*it == cur_shard.osd) { + candidate_want.erase(it); + if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) { + // if we're in stretch mode, we can only remove the osd if it doesn't + // break peering limits. + want->swap(candidate_want); + async_recovery->insert(cur_shard); + } + break; + } + } + } + + psdout(20) << __func__ << " result want=" << *want + << " async_recovery=" << *async_recovery << dendl; +} + +/** + * choose acting + * + * calculate the desired acting, and request a change with the monitor + * if it differs from the current acting. + * + * if restrict_to_up_acting=true, we filter out anything that's not in + * up/acting. in order to lift this restriction, we need to + * 1) check whether it's worth switching the acting set any time we get + * a new pg info (not just here, when recovery finishes) + * 2) check whether anything in want_acting went down on each new map + * (and, if so, calculate a new want_acting) + * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap) + * TODO! + */ +bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, + bool restrict_to_up_acting, + bool *history_les_bound, + bool request_pg_temp_change_only) +{ + map all_info(peer_info.begin(), peer_info.end()); + all_info[pg_whoami] = info; + + if (cct->_conf->subsys.should_gather()) { + for (auto p = all_info.begin(); p != all_info.end(); ++p) { + psdout(10) << __func__ << " all_info osd." 
<< p->first << " " + << p->second << dendl; + } + } + + auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting, + history_les_bound); + + if (auth_log_shard == all_info.end()) { + if (up != acting) { + psdout(10) << __func__ << " no suitable info found (incomplete backfills?)," + << " reverting to up" << dendl; + want_acting = up; + vector empty; + pl->queue_want_pg_temp(empty); + } else { + psdout(10) << __func__ << " failed" << dendl; + ceph_assert(want_acting.empty()); + } + return false; + } + + ceph_assert(!auth_log_shard->second.is_incomplete()); + auth_log_shard_id = auth_log_shard->first; + + set want_backfill, want_acting_backfill; + vector want; + stringstream ss; + if (pool.info.is_replicated()) { + auto [primary_shard, oldest_log] = select_replicated_primary( + auth_log_shard, + cct->_conf.get_val( + "osd_force_auth_primary_missing_objects"), + up, + up_primary, + all_info, + get_osdmap(), + ss); + if (pool.info.is_stretch_pool()) { + calc_replicated_acting_stretch( + primary_shard, + oldest_log, + get_osdmap()->get_pg_size(info.pgid.pgid), + acting, + up, + up_primary, + all_info, + restrict_to_up_acting, + &want, + &want_backfill, + &want_acting_backfill, + get_osdmap(), + pool, + ss); + } else { + calc_replicated_acting( + primary_shard, + oldest_log, + get_osdmap()->get_pg_size(info.pgid.pgid), + acting, + up, + up_primary, + all_info, + restrict_to_up_acting, + &want, + &want_backfill, + &want_acting_backfill, + get_osdmap(), + pool, + ss); + } + } else { + calc_ec_acting( + auth_log_shard, + get_osdmap()->get_pg_size(info.pgid.pgid), + acting, + up, + all_info, + restrict_to_up_acting, + &want, + &want_backfill, + &want_acting_backfill, + ss); + } + psdout(10) << ss.str() << dendl; + + if (!recoverable(want)) { + want_acting.clear(); + return false; + } + + set want_async_recovery; + if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) { + if (pool.info.is_erasure()) { + choose_async_recovery_ec( + all_info, auth_log_shard->second, &want, &want_async_recovery, + get_osdmap()); + } else { + choose_async_recovery_replicated( + all_info, auth_log_shard->second, &want, &want_async_recovery, + get_osdmap()); + } + } + while (want.size() > pool.info.size) { + // async recovery should have taken out as many osds as it can. + // if not, then always evict the last peer + // (will get synchronously recovered later) + psdout(10) << __func__ << " evicting osd." << want.back() + << " from oversized want " << want << dendl; + want.pop_back(); + } + if (want != acting) { + psdout(10) << __func__ << " want " << want << " != acting " << acting + << ", requesting pg_temp change" << dendl; + want_acting = want; + + if (!cct->_conf->osd_debug_no_acting_change) { + if (want_acting == up) { + // There can't be any pending backfill if + // want is the same as crush map up OSDs. 
+ ceph_assert(want_backfill.empty()); + vector empty; + pl->queue_want_pg_temp(empty); + } else + pl->queue_want_pg_temp(want); + } + return false; + } + + if (request_pg_temp_change_only) + return true; + want_acting.clear(); + acting_recovery_backfill = want_acting_backfill; + psdout(10) << "acting_recovery_backfill is " + << acting_recovery_backfill << dendl; + ceph_assert( + backfill_targets.empty() || + backfill_targets == want_backfill); + if (backfill_targets.empty()) { + // Caller is GetInfo + backfill_targets = want_backfill; + } + // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete + ceph_assert( + async_recovery_targets.empty() || + async_recovery_targets == want_async_recovery || + !needs_recovery()); + if (async_recovery_targets.empty() || !needs_recovery()) { + async_recovery_targets = want_async_recovery; + } + // Will not change if already set because up would have had to change + // Verify that nothing in backfill is in stray_set + for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) { + ceph_assert(stray_set.find(*i) == stray_set.end()); + } + psdout(10) << "choose_acting want=" << want << " backfill_targets=" + << want_backfill << " async_recovery_targets=" + << async_recovery_targets << dendl; + return true; +} + +void PeeringState::log_weirdness() +{ + if (pg_log.get_tail() != info.log_tail) + pl->get_clog_error() << info.pgid + << " info mismatch, log.tail " << pg_log.get_tail() + << " != info.log_tail " << info.log_tail; + if (pg_log.get_head() != info.last_update) + pl->get_clog_error() << info.pgid + << " info mismatch, log.head " << pg_log.get_head() + << " != info.last_update " << info.last_update; + + if (!pg_log.get_log().empty()) { + // sloppy check + if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail())) + pl->get_clog_error() << info.pgid + << " log bound mismatch, info (tail,head] (" + << pg_log.get_tail() << "," + << pg_log.get_head() << "]" + << " actual [" + << pg_log.get_log().log.begin()->version << "," + << pg_log.get_log().log.rbegin()->version << "]"; + } + + if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) { + pl->get_clog_error() << info.pgid + << " caller_ops.size " + << pg_log.get_log().caller_ops.size() + << " > log size " << pg_log.get_log().log.size(); + } +} + +/* + * Process information from a replica to determine if it could have any + * objects that i need. + * + * TODO: if the missing set becomes very large, this could get expensive. + * Instead, we probably want to just iterate over our unfound set. + */ +bool PeeringState::search_for_missing( + const pg_info_t &oinfo, const pg_missing_t &omissing, + pg_shard_t from, + PeeringCtxWrapper &ctx) +{ + uint64_t num_unfound_before = missing_loc.num_unfound(); + bool found_missing = missing_loc.add_source_info( + from, oinfo, omissing, ctx.handle); + if (found_missing && num_unfound_before != missing_loc.num_unfound()) + pl->publish_stats_to_osd(); + // avoid doing this if the peer is empty. This is abit of paranoia + // to avoid doing something rash if add_source_info() above + // incorrectly decided we found something new. (if the peer has + // last_update=0'0 that's impossible.) + if (found_missing && + oinfo.last_update != eversion_t()) { + pg_info_t tinfo(oinfo); + tinfo.pgid.shard = pg_whoami.shard; + ctx.send_info( + from.osd, + spg_t(info.pgid.pgid, from.shard), + get_osdmap_epoch(), // fixme: use lower epoch? 
+ get_osdmap_epoch(), + tinfo); + } + return found_missing; +} + +bool PeeringState::discover_all_missing( + BufferedRecoveryMessages &rctx) +{ + auto &missing = pg_log.get_missing(); + uint64_t unfound = get_num_unfound(); + bool any = false; // did we start any queries + + psdout(10) << __func__ << " " + << missing.num_missing() << " missing, " + << unfound << " unfound" + << dendl; + + auto m = might_have_unfound.begin(); + auto mend = might_have_unfound.end(); + for (; m != mend; ++m) { + pg_shard_t peer(*m); + + if (!get_osdmap()->is_up(peer.osd)) { + psdout(20) << __func__ << " skipping down osd." << peer << dendl; + continue; + } + + if (peer_purged.count(peer)) { + psdout(20) << __func__ << " skipping purged osd." << peer << dendl; + continue; + } + + auto iter = peer_info.find(peer); + if (iter != peer_info.end() && + (iter->second.is_empty() || iter->second.dne())) { + // ignore empty peers + continue; + } + + // If we've requested any of this stuff, the pg_missing_t information + // should be on its way. + // TODO: coalsce requested_* into a single data structure + if (peer_missing.find(peer) != peer_missing.end()) { + psdout(20) << __func__ << ": osd." << peer + << ": we already have pg_missing_t" << dendl; + continue; + } + if (peer_log_requested.find(peer) != peer_log_requested.end()) { + psdout(20) << __func__ << ": osd." << peer + << ": in peer_log_requested" << dendl; + continue; + } + if (peer_missing_requested.find(peer) != peer_missing_requested.end()) { + psdout(20) << __func__ << ": osd." << peer + << ": in peer_missing_requested" << dendl; + continue; + } + + // Request missing + psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t" + << dendl; + peer_missing_requested.insert(peer); + rctx.send_query( + peer.osd, + spg_t(info.pgid.pgid, peer.shard), + pg_query_t( + pg_query_t::FULLLOG, + peer.shard, pg_whoami.shard, + info.history, get_osdmap_epoch())); + any = true; + } + return any; +} + +/* Build the might_have_unfound set. + * + * This is used by the primary OSD during recovery. + * + * This set tracks the OSDs which might have unfound objects that the primary + * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we + * will remove the OSD from the set. 
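+ *
+ * The set is seeded below from the OSDs that past_intervals records as
+ * possibly holding data, plus any peers we currently have info for
+ * (including strays); discover_all_missing() then asks each of them for
+ * its pg_missing_t.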
+ */ +void PeeringState::build_might_have_unfound() +{ + ceph_assert(might_have_unfound.empty()); + ceph_assert(is_primary()); + + psdout(10) << __func__ << dendl; + + check_past_interval_bounds(); + + might_have_unfound = past_intervals.get_might_have_unfound( + pg_whoami, + pool.info.is_erasure()); + + // include any (stray) peers + for (auto p = peer_info.begin(); p != peer_info.end(); ++p) + might_have_unfound.insert(p->first); + + psdout(15) << __func__ << ": built " << might_have_unfound << dendl; +} + +void PeeringState::activate( + ObjectStore::Transaction& t, + epoch_t activation_epoch, + PeeringCtxWrapper &ctx) +{ + ceph_assert(!is_peered()); + + // twiddle pg state + state_clear(PG_STATE_DOWN); + + send_notify = false; + + if (is_primary()) { + // only update primary last_epoch_started if we will go active + if (acting_set_writeable()) { + ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les || + info.last_epoch_started <= activation_epoch); + info.last_epoch_started = activation_epoch; + info.last_interval_started = info.history.same_interval_since; + } + } else if (is_acting(pg_whoami)) { + /* update last_epoch_started on acting replica to whatever the primary sent + * unless it's smaller (could happen if we are going peered rather than + * active, see doc/dev/osd_internals/last_epoch_started.rst) */ + if (info.last_epoch_started < activation_epoch) { + info.last_epoch_started = activation_epoch; + info.last_interval_started = info.history.same_interval_since; + } + } + + auto &missing = pg_log.get_missing(); + + min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)! + if (is_primary()) { + last_update_ondisk = info.last_update; + } + last_update_applied = info.last_update; + last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to(); + + need_up_thru = false; + + // write pg info, log + dirty_info = true; + dirty_big_info = true; // maybe + + pl->schedule_event_on_commit( + t, + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + ActivateCommitted( + get_osdmap_epoch(), + activation_epoch))); + + // init complete pointer + if (missing.num_missing() == 0) { + psdout(10) << "activate - no missing, moving last_complete " << info.last_complete + << " -> " << info.last_update << dendl; + info.last_complete = info.last_update; + info.stats.stats.sum.num_objects_missing = 0; + pg_log.reset_recovery_pointers(); + } else { + psdout(10) << "activate - not complete, " << missing << dendl; + info.stats.stats.sum.num_objects_missing = missing.num_missing(); + pg_log.activate_not_complete(info); + } + + log_weirdness(); + + if (is_primary()) { + // initialize snap_trimq + interval_set to_trim; + auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue(); + auto p = removed_snaps_queue.find(info.pgid.pgid.pool()); + if (p != removed_snaps_queue.end()) { + dout(20) << "activate - purged_snaps " << info.purged_snaps + << " removed_snaps " << p->second + << dendl; + for (auto q : p->second) { + to_trim.insert(q.first, q.second); + } + } + interval_set purged; + purged.intersection_of(to_trim, info.purged_snaps); + to_trim.subtract(purged); + + if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { + renew_lease(pl->get_mnow()); + // do not schedule until we are actually activated + } + + // adjust purged_snaps: PG may have been inactive while snaps were pruned + // from the removed_snaps_queue in the osdmap. update local purged_snaps + // reflect only those snaps that we thought were pruned and were still in + // the queue. 
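+ // For example (hypothetical values): if the removed_snaps_queue for this
+ // pool still lists snaps 4..5 while our recorded purged_snaps is {4},
+ // then purged becomes {4} and to_trim becomes {5}: snap 4 stays marked
+ // purged and only snap 5 is handed to the trimmer.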
+ info.purged_snaps.swap(purged); + + // start up replicas + if (prior_readable_down_osds.empty()) { + dout(10) << __func__ << " no prior_readable_down_osds to wait on, clearing ub" + << dendl; + clear_prior_readable_until_ub(); + } + info.history.refresh_prior_readable_until_ub(pl->get_mnow(), + prior_readable_until_ub); + + ceph_assert(!acting_recovery_backfill.empty()); + for (auto i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == pg_whoami) continue; + pg_shard_t peer = *i; + ceph_assert(peer_info.count(peer)); + pg_info_t& pi = peer_info[peer]; + + psdout(10) << "activate peer osd." << peer << " " << pi << dendl; + + MRef m; + ceph_assert(peer_missing.count(peer)); + pg_missing_t& pm = peer_missing[peer]; + + bool needs_past_intervals = pi.dne(); + + // Save num_bytes for backfill reservation request, can't be negative + peer_bytes[peer] = std::max(0, pi.stats.stats.sum.num_bytes); + + if (pi.last_update == info.last_update) { + // empty log + if (!pi.last_backfill.is_max()) + pl->get_clog_info() << info.pgid << " continuing backfill to osd." + << peer + << " from (" << pi.log_tail << "," << pi.last_update + << "] " << pi.last_backfill + << " to " << info.last_update; + if (!pi.is_empty()) { + psdout(10) << "activate peer osd." << peer + << " is up to date, queueing in pending_activators" << dendl; + ctx.send_info( + peer.osd, + spg_t(info.pgid.pgid, peer.shard), + get_osdmap_epoch(), // fixme: use lower epoch? + get_osdmap_epoch(), + info, + get_lease()); + } else { + psdout(10) << "activate peer osd." << peer + << " is up to date, but sending pg_log anyway" << dendl; + m = make_message( + i->shard, pg_whoami.shard, + get_osdmap_epoch(), info, + last_peering_reset); + } + } else if ( + pg_log.get_tail() > pi.last_update || + pi.last_backfill == hobject_t() || + (backfill_targets.count(*i) && pi.last_backfill.is_max())) { + /* ^ This last case covers a situation where a replica is not contiguous + * with the auth_log, but is contiguous with this replica. Reshuffling + * the active set to handle this would be tricky, so instead we just go + * ahead and backfill it anyway. This is probably preferrable in any + * case since the replica in question would have to be significantly + * behind. + */ + // backfill + pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer + << " from (" << pi.log_tail << "," << pi.last_update + << "] " << pi.last_backfill + << " to " << info.last_update; + + pi.last_update = info.last_update; + pi.last_complete = info.last_update; + pi.set_last_backfill(hobject_t()); + pi.last_epoch_started = info.last_epoch_started; + pi.last_interval_started = info.last_interval_started; + pi.history = info.history; + pi.hit_set = info.hit_set; + pi.stats.stats.clear(); + pi.stats.stats.sum.num_bytes = peer_bytes[peer]; + + // initialize peer with our purged_snaps. + pi.purged_snaps = info.purged_snaps; + + m = make_message( + i->shard, pg_whoami.shard, + get_osdmap_epoch(), pi, + last_peering_reset /* epoch to create pg at */); + + // send some recent log, so that op dup detection works well. + m->log.copy_up_to(cct, pg_log.get_log(), + cct->_conf->osd_max_pg_log_entries); + m->info.log_tail = m->log.tail; + pi.log_tail = m->log.tail; // sigh... 
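+ // the peer was just marked for a full backfill (last_backfill reset to
+ // the minimum object), so log-based recovery no longer applies to it and
+ // its stale missing set is cleared below.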
+ + pm.clear(); + } else { + // catch up + ceph_assert(pg_log.get_tail() <= pi.last_update); + m = make_message( + i->shard, pg_whoami.shard, + get_osdmap_epoch(), info, + last_peering_reset /* epoch to create pg at */); + // send new stuff to append to replicas log + m->log.copy_after(cct, pg_log.get_log(), pi.last_update); + } + + // share past_intervals if we are creating the pg on the replica + // based on whether our info for that peer was dne() *before* + // updating pi.history in the backfill block above. + if (m && needs_past_intervals) + m->past_intervals = past_intervals; + + // update local version of peer's missing list! + if (m && pi.last_backfill != hobject_t()) { + for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) { + if (p->soid <= pi.last_backfill && + !p->is_error()) { + if (perform_deletes_during_peering() && p->is_delete()) { + pm.rm(p->soid, p->version); + } else { + pm.add_next_event(*p); + } + } + } + } + + if (m) { + dout(10) << "activate peer osd." << peer << " sending " << m->log + << dendl; + m->lease = get_lease(); + pl->send_cluster_message(peer.osd, m, get_osdmap_epoch()); + } + + // peer now has + pi.last_update = info.last_update; + + // update our missing + if (pm.num_missing() == 0) { + pi.last_complete = pi.last_update; + psdout(10) << "activate peer osd." << peer << " " << pi + << " uptodate" << dendl; + } else { + psdout(10) << "activate peer osd." << peer << " " << pi + << " missing " << pm << dendl; + } + } + + // Set up missing_loc + set complete_shards; + for (auto i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + psdout(20) << __func__ << " setting up missing_loc from shard " << *i + << " " << dendl; + if (*i == get_primary()) { + missing_loc.add_active_missing(missing); + if (!missing.have_missing()) + complete_shards.insert(*i); + } else { + auto peer_missing_entry = peer_missing.find(*i); + ceph_assert(peer_missing_entry != peer_missing.end()); + missing_loc.add_active_missing(peer_missing_entry->second); + if (!peer_missing_entry->second.have_missing() && + peer_info[*i].last_backfill.is_max()) + complete_shards.insert(*i); + } + } + + // If necessary, create might_have_unfound to help us find our unfound objects. + // NOTE: It's important that we build might_have_unfound before trimming the + // past intervals. 
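+ // (past_intervals is what records which OSDs may have held data in
+ // earlier intervals, so trimming it first would throw away candidate
+ // locations for unfound objects.)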
+ might_have_unfound.clear(); + if (needs_recovery()) { + // If only one shard has missing, we do a trick to add all others as recovery + // source, this is considered safe since the PGLogs have been merged locally, + // and covers vast majority of the use cases, like one OSD/host is down for + // a while for hardware repairing + if (complete_shards.size() + 1 == acting_recovery_backfill.size()) { + missing_loc.add_batch_sources_info(complete_shards, ctx.handle); + } else { + missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(), + ctx.handle); + for (auto i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == pg_whoami) continue; + psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl; + ceph_assert(peer_missing.count(*i)); + ceph_assert(peer_info.count(*i)); + missing_loc.add_source_info( + *i, + peer_info[*i], + peer_missing[*i], + ctx.handle); + } + } + for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) { + if (is_acting_recovery_backfill(i->first)) + continue; + ceph_assert(peer_info.count(i->first)); + search_for_missing( + peer_info[i->first], + i->second, + i->first, + ctx); + } + + build_might_have_unfound(); + + // Always call now so update_calc_stats() will be accurate + discover_all_missing(ctx.msgs); + + } + + // num_objects_degraded if calculated should reflect this too, unless no + // missing and we are about to go clean. + if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) { + state_set(PG_STATE_UNDERSIZED); + } + + state_set(PG_STATE_ACTIVATING); + pl->on_activate(std::move(to_trim)); + } + if (acting_set_writeable()) { + PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; + pg_log.roll_forward(rollbacker.get()); + } +} + +void PeeringState::share_pg_info() +{ + psdout(10) << "share_pg_info" << dendl; + + info.history.refresh_prior_readable_until_ub(pl->get_mnow(), + prior_readable_until_ub); + + // share new pg_info_t with replicas + ceph_assert(!acting_recovery_backfill.empty()); + for (auto pg_shard : acting_recovery_backfill) { + if (pg_shard == pg_whoami) continue; + if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) { + peer->second.last_epoch_started = info.last_epoch_started; + peer->second.last_interval_started = info.last_interval_started; + peer->second.history.merge(info.history); + } + MessageRef m; + if (last_require_osd_release >= ceph_release_t::octopus) { + m = make_message(spg_t{info.pgid.pgid, pg_shard.shard}, + info, + get_osdmap_epoch(), + get_osdmap_epoch(), + std::optional{get_lease()}, + std::nullopt); + } else { + m = make_message(get_osdmap_epoch(), + MOSDPGInfo::pg_list_t{ + pg_notify_t{pg_shard.shard, + pg_whoami.shard, + get_osdmap_epoch(), + get_osdmap_epoch(), + info, + past_intervals}}); + } + pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch()); + } +} + +void PeeringState::merge_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog, + pg_shard_t from) +{ + PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; + pg_log.merge_log( + oinfo, std::move(olog), from, info, rollbacker.get(), + dirty_info, dirty_big_info); +} + +void PeeringState::rewind_divergent_log( + ObjectStore::Transaction& t, eversion_t newhead) +{ + PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; + pg_log.rewind_divergent_log( + newhead, info, rollbacker.get(), dirty_info, dirty_big_info); +} + + +void PeeringState::proc_primary_info( + ObjectStore::Transaction &t, const pg_info_t &oinfo) +{ + 
ceph_assert(!is_primary()); + + update_history(oinfo.history); + if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) { + info.stats.stats.sum.num_scrub_errors = 0; + info.stats.stats.sum.num_shallow_scrub_errors = 0; + info.stats.stats.sum.num_deep_scrub_errors = 0; + dirty_info = true; + } + + if (!(info.purged_snaps == oinfo.purged_snaps)) { + psdout(10) << __func__ << " updating purged_snaps to " + << oinfo.purged_snaps + << dendl; + info.purged_snaps = oinfo.purged_snaps; + dirty_info = true; + dirty_big_info = true; + } +} + +void PeeringState::proc_master_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, + pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from) +{ + psdout(10) << "proc_master_log for osd." << from << ": " + << olog << " " << omissing << dendl; + ceph_assert(!is_peered() && is_primary()); + + // merge log into our own log to build master log. no need to + // make any adjustments to their missing map; we are taking their + // log to be authoritative (i.e., their entries are by definitely + // non-divergent). + merge_log(t, oinfo, std::move(olog), from); + peer_info[from] = oinfo; + psdout(10) << " peer osd." << from << " now " << oinfo + << " " << omissing << dendl; + might_have_unfound.insert(from); + + // See doc/dev/osd_internals/last_epoch_started + if (oinfo.last_epoch_started > info.last_epoch_started) { + info.last_epoch_started = oinfo.last_epoch_started; + dirty_info = true; + } + if (oinfo.last_interval_started > info.last_interval_started) { + info.last_interval_started = oinfo.last_interval_started; + dirty_info = true; + } + update_history(oinfo.history); + ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les || + info.last_epoch_started >= info.history.last_epoch_started); + + peer_missing[from].claim(std::move(omissing)); +} + +void PeeringState::proc_replica_log( + pg_info_t &oinfo, + const pg_log_t &olog, + pg_missing_t&& omissing, + pg_shard_t from) +{ + psdout(10) << "proc_replica_log for osd." << from << ": " + << oinfo << " " << olog << " " << omissing << dendl; + + pg_log.proc_replica_log(oinfo, olog, omissing, from); + + peer_info[from] = oinfo; + psdout(10) << " peer osd." 
<< from << " now "
+ << oinfo << " " << omissing << dendl;
+ might_have_unfound.insert(from);
+
+ for (auto i = omissing.get_items().begin();
+ i != omissing.get_items().end();
+ ++i) {
+ psdout(20) << " after missing " << i->first
+ << " need " << i->second.need
+ << " have " << i->second.have << dendl;
+ }
+ peer_missing[from].claim(std::move(omissing));
+}
+
+void PeeringState::fulfill_info(
+ pg_shard_t from, const pg_query_t &query,
+ pair<pg_shard_t, pg_info_t> &notify_info)
+{
+ ceph_assert(from == primary);
+ ceph_assert(query.type == pg_query_t::INFO);
+
+ // info
+ psdout(10) << "sending info" << dendl;
+ notify_info = make_pair(from, info);
+}
+
+void PeeringState::fulfill_log(
+ pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
+{
+ psdout(10) << "log request from " << from << dendl;
+ ceph_assert(from == primary);
+ ceph_assert(query.type != pg_query_t::INFO);
+
+ auto mlog = make_message<MOSDPGLog>(
+ from.shard, pg_whoami.shard,
+ get_osdmap_epoch(),
+ info, query_epoch);
+ mlog->missing = pg_log.get_missing();
+
+ // primary -> other, when building master log
+ if (query.type == pg_query_t::LOG) {
+ psdout(10) << " sending info+missing+log since " << query.since
+ << dendl;
+ if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
+ pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
+ << query.since
+ << " when my log.tail is " << pg_log.get_tail()
+ << ", sending full log instead";
+ mlog->log = pg_log.get_log(); // primary should not have requested this!!
+ } else
+ mlog->log.copy_after(cct, pg_log.get_log(), query.since);
+ }
+ else if (query.type == pg_query_t::FULLLOG) {
+ psdout(10) << " sending info+missing+full log" << dendl;
+ mlog->log = pg_log.get_log();
+ }
+
+ psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
+
+ pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
+}
+
+void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
+{
+ if (query.query.type == pg_query_t::INFO) {
+ pair<pg_shard_t, pg_info_t> notify_info;
+ // note this refreshes our prior_readable_until_ub value
+ update_history(query.query.history);
+ fulfill_info(query.from, query.query, notify_info);
+ rctx.send_notify(
+ notify_info.first.osd,
+ pg_notify_t(
+ notify_info.first.shard, pg_whoami.shard,
+ query.query_epoch,
+ get_osdmap_epoch(),
+ notify_info.second,
+ past_intervals));
+ } else {
+ update_history(query.query.history);
+ fulfill_log(query.from, query.query, query.query_epoch);
+ }
+}
+
+void PeeringState::try_mark_clean()
+{
+ if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
+ state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+ state_set(PG_STATE_CLEAN);
+ info.history.last_epoch_clean = get_osdmap_epoch();
+ info.history.last_interval_clean = info.history.same_interval_since;
+ past_intervals.clear();
+ dirty_big_info = true;
+ dirty_info = true;
+ }
+
+ if (!is_active() && is_peered()) {
+ if (is_clean()) {
+ bool target;
+ if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
+ if (target) {
+ psdout(10) << "ready to merge (target)" << dendl;
+ pl->set_ready_to_merge_target(
+ info.last_update,
+ info.history.last_epoch_started,
+ info.history.last_epoch_clean);
+ } else {
+ psdout(10) << "ready to merge (source)" << dendl;
+ pl->set_ready_to_merge_source(info.last_update);
+ }
+ }
+ } else {
+ psdout(10) << "not clean, not ready to merge" << dendl;
+ // we should have notified OSD in Active state entry point
+ }
+ }
+
+ state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+
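+ // whether or not we just marked the PG clean above, push the (possibly
+ // updated) history and stats to the replicas and drop any leftover
+ // recovery state.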
+ share_pg_info(); + pl->publish_stats_to_osd(); + clear_recovery_state(); +} + +void PeeringState::split_into( + pg_t child_pgid, PeeringState *child, unsigned split_bits) +{ + child->update_osdmap_ref(get_osdmap()); + child->pool = pool; + + // Log + pg_log.split_into(child_pgid, split_bits, &(child->pg_log)); + child->info.last_complete = info.last_complete; + + info.last_update = pg_log.get_head(); + child->info.last_update = child->pg_log.get_head(); + + child->info.last_user_version = info.last_user_version; + + info.log_tail = pg_log.get_tail(); + child->info.log_tail = child->pg_log.get_tail(); + + // reset last_complete, we might have modified pg_log & missing above + pg_log.reset_complete_to(&info); + child->pg_log.reset_complete_to(&child->info); + + // Info + child->info.history = info.history; + child->info.history.epoch_created = get_osdmap_epoch(); + child->info.purged_snaps = info.purged_snaps; + + if (info.last_backfill.is_max()) { + child->info.set_last_backfill(hobject_t::get_max()); + } else { + // restart backfill on parent and child to be safe. we could + // probably do better in the bitwise sort case, but it's more + // fragile (there may be special work to do on backfill completion + // in the future). + info.set_last_backfill(hobject_t()); + child->info.set_last_backfill(hobject_t()); + // restarting backfill implies that the missing set is empty, + // since it is only used for objects prior to last_backfill + pg_log.reset_backfill(); + child->pg_log.reset_backfill(); + } + + child->info.stats = info.stats; + child->info.stats.parent_split_bits = split_bits; + info.stats.stats_invalid = true; + child->info.stats.stats_invalid = true; + child->info.last_epoch_started = info.last_epoch_started; + child->info.last_interval_started = info.last_interval_started; + + // There can't be recovery/backfill going on now + int primary, up_primary; + vector newup, newacting; + get_osdmap()->pg_to_up_acting_osds( + child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary); + child->init_primary_up_acting( + newup, + newacting, + up_primary, + primary); + child->role = OSDMap::calc_pg_role(pg_whoami, child->acting); + + // this comparison includes primary rank via pg_shard_t + if (get_primary() != child->get_primary()) + child->info.history.same_primary_since = get_osdmap_epoch(); + + child->info.stats.up = newup; + child->info.stats.up_primary = up_primary; + child->info.stats.acting = newacting; + child->info.stats.acting_primary = primary; + child->info.stats.mapping_epoch = get_osdmap_epoch(); + + // History + child->past_intervals = past_intervals; + + child->on_new_interval(); + + child->send_notify = !child->is_primary(); + + child->dirty_info = true; + child->dirty_big_info = true; + dirty_info = true; + dirty_big_info = true; +} + +void PeeringState::merge_from( + map& sources, + PeeringCtx &rctx, + unsigned split_bits, + const pg_merge_meta_t& last_pg_merge_meta) +{ + bool incomplete = false; + if (info.last_complete != info.last_update || + info.is_incomplete() || + info.dne()) { + psdout(10) << __func__ << " target incomplete" << dendl; + incomplete = true; + } + if (last_pg_merge_meta.source_pgid != pg_t()) { + if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) { + psdout(10) << __func__ << " target doesn't match expected parent " + << last_pg_merge_meta.source_pgid.get_parent() + << " of source_pgid " << last_pg_merge_meta.source_pgid + << dendl; + incomplete = true; + } + if (info.last_update != last_pg_merge_meta.target_version) { + 
psdout(10) << __func__ << " target version doesn't match expected " + << last_pg_merge_meta.target_version << dendl; + incomplete = true; + } + } + + PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)}; + pg_log.roll_forward(handler.get()); + + info.last_complete = info.last_update; // to fake out trim() + pg_log.reset_recovery_pointers(); + pg_log.trim(info.last_update, info); + + vector log_from; + for (auto& i : sources) { + auto& source = i.second; + if (!source) { + psdout(10) << __func__ << " source " << i.first << " missing" << dendl; + incomplete = true; + continue; + } + if (source->info.last_complete != source->info.last_update || + source->info.is_incomplete() || + source->info.dne()) { + psdout(10) << __func__ << " source " << source->pg_whoami + << " incomplete" + << dendl; + incomplete = true; + } + if (last_pg_merge_meta.source_pgid != pg_t()) { + if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) { + dout(10) << __func__ << " source " << source->info.pgid.pgid + << " doesn't match expected source pgid " + << last_pg_merge_meta.source_pgid << dendl; + incomplete = true; + } + if (source->info.last_update != last_pg_merge_meta.source_version) { + dout(10) << __func__ << " source version doesn't match expected " + << last_pg_merge_meta.target_version << dendl; + incomplete = true; + } + } + + // prepare log + PGLog::LogEntryHandlerRef handler{ + source->pl->get_log_handler(rctx.transaction)}; + source->pg_log.roll_forward(handler.get()); + source->info.last_complete = source->info.last_update; // to fake out trim() + source->pg_log.reset_recovery_pointers(); + source->pg_log.trim(source->info.last_update, source->info); + log_from.push_back(&source->pg_log); + + // combine stats + info.stats.add(source->info.stats); + + // pull up last_update + info.last_update = std::max(info.last_update, source->info.last_update); + + // adopt source's PastIntervals if target has none. we can do this since + // pgp_num has been reduced prior to the merge, so the OSD mappings for + // the PGs are identical. + if (past_intervals.empty() && !source->past_intervals.empty()) { + psdout(10) << __func__ << " taking source's past_intervals" << dendl; + past_intervals = source->past_intervals; + } + } + + info.last_complete = info.last_update; + info.log_tail = info.last_update; + if (incomplete) { + info.last_backfill = hobject_t(); + } + + // merge logs + pg_log.merge_from(log_from, info.last_update); + + // make sure we have a meaningful last_epoch_started/clean (if we were a + // placeholder) + if (info.history.epoch_created == 0) { + // start with (a) source's history, since these PGs *should* have been + // remapped in concert with each other... + info.history = sources.begin()->second->info.history; + + // we use the last_epoch_{started,clean} we got from + // the caller, which are the epochs that were reported by the PGs were + // found to be ready for merge. 
+ info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean; + info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started; + info.last_epoch_started = last_pg_merge_meta.last_epoch_started; + psdout(10) << __func__ + << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/" + << last_pg_merge_meta.last_epoch_clean + << " from pool last_dec_*, source pg history was " + << sources.begin()->second->info.history + << dendl; + + // above we have pulled down source's history and we need to check + // history.epoch_created again to confirm that source is not a placeholder + // too. (peering requires a sane history.same_interval_since value for any + // non-newly created pg and below here we know we are basically iterating + // back a series of past maps to fake a merge process, hence we need to + // fix history.same_interval_since first so that start_peering_interval() + // will not complain) + if (info.history.epoch_created == 0) { + dout(10) << __func__ << " both merge target and source are placeholders," + << " set sis to lec " << info.history.last_epoch_clean + << dendl; + info.history.same_interval_since = info.history.last_epoch_clean; + } + + // if the past_intervals start is later than last_epoch_clean, it + // implies the source repeered again but the target didn't, or + // that the source became clean in a later epoch than the target. + // avoid the discrepancy but adjusting the interval start + // backwards to match so that check_past_interval_bounds() will + // not complain. + auto pib = past_intervals.get_bounds(); + if (info.history.last_epoch_clean < pib.first) { + psdout(10) << __func__ << " last_epoch_clean " + << info.history.last_epoch_clean << " < past_interval start " + << pib.first << ", adjusting start backwards" << dendl; + past_intervals.adjust_start_backwards(info.history.last_epoch_clean); + } + + // Similarly, if the same_interval_since value is later than + // last_epoch_clean, the next interval change will result in a + // past_interval start that is later than last_epoch_clean. This + // can happen if we use the pg_history values from the merge + // source. Adjust the same_interval_since value backwards if that + // happens. (We trust the les and lec values more because they came from + // the real target, whereas the history value we stole from the source.) + if (info.history.last_epoch_started < info.history.same_interval_since) { + psdout(10) << __func__ << " last_epoch_started " + << info.history.last_epoch_started << " < same_interval_since " + << info.history.same_interval_since + << ", adjusting pg_history backwards" << dendl; + info.history.same_interval_since = info.history.last_epoch_clean; + // make sure same_{up,primary}_since are <= same_interval_since + info.history.same_up_since = std::min( + info.history.same_up_since, info.history.same_interval_since); + info.history.same_primary_since = std::min( + info.history.same_primary_since, info.history.same_interval_since); + } + } + + dirty_info = true; + dirty_big_info = true; +} + +void PeeringState::start_split_stats( + const set& childpgs, vector *out) +{ + out->resize(childpgs.size() + 1); + info.stats.stats.sum.split(*out); +} + +void PeeringState::finish_split_stats( + const object_stat_sum_t& stats, ObjectStore::Transaction &t) +{ + info.stats.stats.sum = stats; + write_if_dirty(t); +} + +void PeeringState::update_blocked_by() +{ + // set a max on the number of blocking peers we report. if we go + // over, report a random subset. keep the result sorted. 
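+ // e.g. (hypothetical numbers) with 5 blocking peers and
+ // osd_max_pg_blocked_by = 3: keep = 3, skip = 2, and each peer visited is
+ // dropped with probability skip/(skip+keep) at that step, so exactly
+ // three entries survive, still in sorted order.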
+ unsigned keep = std::min( + blocked_by.size(), cct->_conf->osd_max_pg_blocked_by); + unsigned skip = blocked_by.size() - keep; + info.stats.blocked_by.clear(); + info.stats.blocked_by.resize(keep); + unsigned pos = 0; + for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) { + if (skip > 0 && (rand() % (skip + keep) < skip)) { + --skip; + } else { + info.stats.blocked_by[pos++] = *p; + --keep; + } + } +} + +static bool find_shard(const set & pgs, shard_id_t shard) +{ + for (auto&p : pgs) + if (p.shard == shard) + return true; + return false; +} + +static pg_shard_t get_another_shard(const set & pgs, pg_shard_t skip, shard_id_t shard) +{ + for (auto&p : pgs) { + if (p == skip) + continue; + if (p.shard == shard) + return p; + } + return pg_shard_t(); +} + +void PeeringState::update_calc_stats() +{ + info.stats.version = info.last_update; + info.stats.created = info.history.epoch_created; + info.stats.last_scrub = info.history.last_scrub; + info.stats.last_scrub_stamp = info.history.last_scrub_stamp; + info.stats.last_deep_scrub = info.history.last_deep_scrub; + info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp; + info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp; + info.stats.last_epoch_clean = info.history.last_epoch_clean; + + info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version; + info.stats.ondisk_log_size = info.stats.log_size; + info.stats.log_start = pg_log.get_tail(); + info.stats.ondisk_log_start = pg_log.get_tail(); + info.stats.snaptrimq_len = pl->get_snap_trimq_size(); + + unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid); + + // In rare case that upset is too large (usually transient), use as target + // for calculations below. + unsigned target = std::max(num_shards, (unsigned)upset.size()); + // For undersized actingset may be larger with OSDs out + unsigned nrep = std::max(actingset.size(), upset.size()); + // calc num_object_copies + info.stats.stats.calc_copies(std::max(target, nrep)); + info.stats.stats.sum.num_objects_degraded = 0; + info.stats.stats.sum.num_objects_unfound = 0; + info.stats.stats.sum.num_objects_misplaced = 0; + info.stats.avail_no_missing.clear(); + info.stats.object_location_counts.clear(); + + // We should never hit this condition, but if end up hitting it, + // make sure to update num_objects and set PG_STATE_INCONSISTENT. + if (info.stats.stats.sum.num_objects < 0) { + psdout(0) << __func__ << " negative num_objects = " + << info.stats.stats.sum.num_objects << " setting it to 0 " + << dendl; + info.stats.stats.sum.num_objects = 0; + state_set(PG_STATE_INCONSISTENT); + } + + if ((is_remapped() || is_undersized() || !is_clean()) && + (is_peered()|| is_activating())) { + psdout(20) << __func__ << " actingset " << actingset << " upset " + << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl; + + ceph_assert(!acting_recovery_backfill.empty()); + + bool estimate = false; + + // NOTE: we only generate degraded, misplaced and unfound + // values for the summation, not individual stat categories. + int64_t num_objects = info.stats.stats.sum.num_objects; + + // Objects missing from up nodes, sorted by # objects. 
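+ // (each entry is a (missing-object count, shard) pair, so both sets order
+ // ascending by count; the loops further down use rbegin() on the target
+ // set to start from the shard with the most missing objects.)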
+ boost::container::flat_set> missing_target_objects; + // Objects missing from nodes not in up, sort by # objects + boost::container::flat_set> acting_source_objects; + + // Fill missing_target_objects/acting_source_objects + + { + int64_t missing; + + // Primary first + missing = pg_log.get_missing().num_missing(); + ceph_assert(acting_recovery_backfill.count(pg_whoami)); + if (upset.count(pg_whoami)) { + missing_target_objects.emplace(missing, pg_whoami); + } else { + acting_source_objects.emplace(missing, pg_whoami); + } + info.stats.stats.sum.num_objects_missing_on_primary = missing; + if (missing == 0) + info.stats.avail_no_missing.push_back(pg_whoami); + psdout(20) << __func__ << " shard " << pg_whoami + << " primary objects " << num_objects + << " missing " << missing + << dendl; + } + + // All other peers + for (auto& peer : peer_info) { + // Primary should not be in the peer_info, skip if it is. + if (peer.first == pg_whoami) continue; + int64_t missing = 0; + int64_t peer_num_objects = + std::max((int64_t)0, peer.second.stats.stats.sum.num_objects); + // Backfill targets always track num_objects accurately + // all other peers track missing accurately. + if (is_backfill_target(peer.first)) { + missing = std::max((int64_t)0, num_objects - peer_num_objects); + } else { + if (peer_missing.count(peer.first)) { + missing = peer_missing[peer.first].num_missing(); + } else { + psdout(20) << __func__ << " no peer_missing found for " + << peer.first << dendl; + if (is_recovering()) { + estimate = true; + } + missing = std::max((int64_t)0, num_objects - peer_num_objects); + } + } + if (upset.count(peer.first)) { + missing_target_objects.emplace(missing, peer.first); + } else if (actingset.count(peer.first)) { + acting_source_objects.emplace(missing, peer.first); + } + peer.second.stats.stats.sum.num_objects_missing = missing; + if (missing == 0) + info.stats.avail_no_missing.push_back(peer.first); + psdout(20) << __func__ << " shard " << peer.first + << " objects " << peer_num_objects + << " missing " << missing + << dendl; + } + + // Compute object_location_counts + for (auto& ml: missing_loc.get_missing_locs()) { + info.stats.object_location_counts[ml.second]++; + psdout(30) << __func__ << " " << ml.first << " object_location_counts[" + << ml.second << "]=" << info.stats.object_location_counts[ml.second] + << dendl; + } + int64_t not_missing = num_objects - missing_loc.get_missing_locs().size(); + if (not_missing) { + // During recovery we know upset == actingset and is being populated + // During backfill we know that all non-missing objects are in the actingset + info.stats.object_location_counts[actingset] = not_missing; + } + psdout(30) << __func__ << " object_location_counts[" + << upset << "]=" << info.stats.object_location_counts[upset] + << dendl; + psdout(20) << __func__ << " object_location_counts " + << info.stats.object_location_counts << dendl; + + // A misplaced object is not stored on the correct OSD + int64_t misplaced = 0; + // a degraded objects has fewer replicas or EC shards than the pool specifies. 
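+ // e.g. (hypothetical) in a size=3 replicated pool, an object with only two
+ // current copies counts once as degraded, while a third copy that lives on
+ // an OSD outside the up set keeps the object fully replicated but counts
+ // as misplaced.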
+ int64_t degraded = 0; + + if (is_recovering()) { + for (auto& sml: missing_loc.get_missing_by_count()) { + for (auto& ml: sml.second) { + int missing_shards; + if (sml.first == shard_id_t::NO_SHARD) { + psdout(20) << __func__ << " ml " << ml.second + << " upset size " << upset.size() + << " up " << ml.first.up << dendl; + missing_shards = (int)upset.size() - ml.first.up; + } else { + // Handle shards not even in upset below + if (!find_shard(upset, sml.first)) + continue; + missing_shards = std::max(0, 1 - ml.first.up); + psdout(20) << __func__ + << " shard " << sml.first + << " ml " << ml.second + << " missing shards " << missing_shards << dendl; + } + int odegraded = ml.second * missing_shards; + // Copies on other osds but limited to the possible degraded + int more_osds = std::min(missing_shards, ml.first.other); + int omisplaced = ml.second * more_osds; + ceph_assert(omisplaced <= odegraded); + odegraded -= omisplaced; + + misplaced += omisplaced; + degraded += odegraded; + } + } + + psdout(20) << __func__ << " missing based degraded " + << degraded << dendl; + psdout(20) << __func__ << " missing based misplaced " + << misplaced << dendl; + + // Handle undersized case + if (pool.info.is_replicated()) { + // Add degraded for missing targets (num_objects missing) + ceph_assert(target >= upset.size()); + unsigned needed = target - upset.size(); + degraded += num_objects * needed; + } else { + for (unsigned i = 0 ; i < num_shards; ++i) { + shard_id_t shard(i); + + if (!find_shard(upset, shard)) { + pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard); + + if (pgs != pg_shard_t()) { + int64_t missing; + + if (pgs == pg_whoami) + missing = info.stats.stats.sum.num_objects_missing_on_primary; + else + missing = peer_info[pgs].stats.stats.sum.num_objects_missing; + + degraded += missing; + misplaced += std::max((int64_t)0, num_objects - missing); + } else { + // No shard anywhere + degraded += num_objects; + } + } + } + } + goto out; + } + + // Handle undersized case + if (pool.info.is_replicated()) { + // Add to missing_target_objects + ceph_assert(target >= missing_target_objects.size()); + unsigned needed = target - missing_target_objects.size(); + if (needed) + missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)); + } else { + for (unsigned i = 0 ; i < num_shards; ++i) { + shard_id_t shard(i); + bool found = false; + for (const auto& t : missing_target_objects) { + if (std::get<1>(t).shard == shard) { + found = true; + break; + } + } + if (!found) + missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)); + } + } + + for (const auto& item : missing_target_objects) + psdout(20) << __func__ << " missing shard " << std::get<1>(item) + << " missing= " << std::get<0>(item) << dendl; + for (const auto& item : acting_source_objects) + psdout(20) << __func__ << " acting shard " << std::get<1>(item) + << " missing= " << std::get<0>(item) << dendl; + + // Handle all objects not in missing for remapped + // or backfill + for (auto m = missing_target_objects.rbegin(); + m != missing_target_objects.rend(); ++m) { + + int64_t extra_missing = -1; + + if (pool.info.is_replicated()) { + if (!acting_source_objects.empty()) { + auto extra_copy = acting_source_objects.begin(); + extra_missing = std::get<0>(*extra_copy); + acting_source_objects.erase(extra_copy); + } + } else { // Erasure coded + // Use corresponding shard + for (const auto& a : acting_source_objects) { + if (std::get<1>(a).shard == std::get<1>(*m).shard) { + 
extra_missing = std::get<0>(a); + acting_source_objects.erase(a); + break; + } + } + } + + if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) { + // We don't know which of the objects on the target + // are part of extra_missing so assume are all degraded. + misplaced += std::get<0>(*m) - extra_missing; + degraded += extra_missing; + } else { + // 1. extra_missing == -1, more targets than sources so degraded + // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing + // previously degraded are now present on the target. + degraded += std::get<0>(*m); + } + } + // If there are still acting that haven't been accounted for + // then they are misplaced + for (const auto& a : acting_source_objects) { + int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a)); + psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced + << dendl; + misplaced += extra_misplaced; + } +out: + // NOTE: Tests use these messages to verify this code + psdout(20) << __func__ << " degraded " << degraded + << (estimate ? " (est)": "") << dendl; + psdout(20) << __func__ << " misplaced " << misplaced + << (estimate ? " (est)": "")<< dendl; + + info.stats.stats.sum.num_objects_degraded = degraded; + info.stats.stats.sum.num_objects_unfound = get_num_unfound(); + info.stats.stats.sum.num_objects_misplaced = misplaced; + } +} + +std::optional PeeringState::prepare_stats_for_publish( + bool pg_stats_publish_valid, + const pg_stat_t &pg_stats_publish, + const object_stat_collection_t &unstable_stats) +{ + if (info.stats.stats.sum.num_scrub_errors) { + psdout(10) << __func__ << " inconsistent due to " << + info.stats.stats.sum.num_scrub_errors << " scrub errors" << dendl; + state_set(PG_STATE_INCONSISTENT); + } else { + state_clear(PG_STATE_INCONSISTENT); + state_clear(PG_STATE_FAILED_REPAIR); + } + + utime_t now = ceph_clock_now(); + if (info.stats.state != state) { + info.stats.last_change = now; + // Optimistic estimation, if we just find out an inactive PG, + // assumt it is active till now. + if (!(state & PG_STATE_ACTIVE) && + (info.stats.state & PG_STATE_ACTIVE)) + info.stats.last_active = now; + + if ((state & PG_STATE_ACTIVE) && + !(info.stats.state & PG_STATE_ACTIVE)) + info.stats.last_became_active = now; + if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) && + !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))) + info.stats.last_became_peered = now; + info.stats.state = state; + } + + update_calc_stats(); + if (info.stats.stats.sum.num_objects_degraded) { + state_set(PG_STATE_DEGRADED); + } else { + state_clear(PG_STATE_DEGRADED); + } + update_blocked_by(); + + pg_stat_t pre_publish = info.stats; + pre_publish.stats.add(unstable_stats); + utime_t cutoff = now; + cutoff -= cct->_conf->osd_pg_stat_report_interval_max; + + // share (some of) our purged_snaps via the pg_stats. limit # of intervals + // because we don't want to make the pg_stat_t structures too expensive. 
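+ // (only the first `max` intervals, per the config option read below, go
+ // into any single report; the rest are simply left out of this report.)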
+ unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch; + unsigned num = 0; + auto i = info.purged_snaps.begin(); + while (num < max && i != info.purged_snaps.end()) { + pre_publish.purged_snaps.insert(i.get_start(), i.get_len()); + ++num; + ++i; + } + psdout(20) << __func__ << " reporting purged_snaps " + << pre_publish.purged_snaps << dendl; + + if (pg_stats_publish_valid && pre_publish == pg_stats_publish && + info.stats.last_fresh > cutoff) { + psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch + << ": no change since " << info.stats.last_fresh << dendl; + return std::nullopt; + } else { + // update our stat summary and timestamps + info.stats.reported_epoch = get_osdmap_epoch(); + ++info.stats.reported_seq; + + info.stats.last_fresh = now; + + if (info.stats.state & PG_STATE_CLEAN) + info.stats.last_clean = now; + if (info.stats.state & PG_STATE_ACTIVE) + info.stats.last_active = now; + if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) + info.stats.last_peered = now; + info.stats.last_unstale = now; + if ((info.stats.state & PG_STATE_DEGRADED) == 0) + info.stats.last_undegraded = now; + if ((info.stats.state & PG_STATE_UNDERSIZED) == 0) + info.stats.last_fullsized = now; + + psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch + << ":" << pg_stats_publish.reported_seq << dendl; + return std::make_optional(std::move(pre_publish)); + } +} + +void PeeringState::init( + int role, + const vector& newup, int new_up_primary, + const vector& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + bool backfill, + ObjectStore::Transaction &t) +{ + psdout(10) << "init role " << role << " up " + << newup << " acting " << newacting + << " history " << history + << " past_intervals " << pi + << dendl; + + set_role(role); + init_primary_up_acting( + newup, + newacting, + new_up_primary, + new_acting_primary); + + info.history = history; + past_intervals = pi; + + info.stats.up = up; + info.stats.up_primary = new_up_primary; + info.stats.acting = acting; + info.stats.acting_primary = new_acting_primary; + info.stats.mapping_epoch = info.history.same_interval_since; + + if (!perform_deletes_during_peering()) { + pg_log.set_missing_may_contain_deletes(); + } + + if (backfill) { + psdout(10) << __func__ << ": Setting backfill" << dendl; + info.set_last_backfill(hobject_t()); + info.last_complete = info.last_update; + pg_log.mark_log_for_rewrite(); + } + + on_new_interval(); + + dirty_info = true; + dirty_big_info = true; + write_if_dirty(t); +} + +void PeeringState::dump_peering_state(Formatter *f) +{ + f->dump_string("state", get_pg_state_string()); + f->dump_unsigned("epoch", get_osdmap_epoch()); + f->open_array_section("up"); + for (auto p = up.begin(); p != up.end(); ++p) + f->dump_unsigned("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (auto p = acting.begin(); p != acting.end(); ++p) + f->dump_unsigned("osd", *p); + f->close_section(); + if (!backfill_targets.empty()) { + f->open_array_section("backfill_targets"); + for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p) + f->dump_stream("shard") << *p; + f->close_section(); + } + if (!async_recovery_targets.empty()) { + f->open_array_section("async_recovery_targets"); + for (auto p = async_recovery_targets.begin(); + p != async_recovery_targets.end(); + ++p) + f->dump_stream("shard") << *p; + f->close_section(); + } + if (!acting_recovery_backfill.empty()) { + 
f->open_array_section("acting_recovery_backfill"); + for (auto p = acting_recovery_backfill.begin(); + p != acting_recovery_backfill.end(); + ++p) + f->dump_stream("shard") << *p; + f->close_section(); + } + f->open_object_section("info"); + update_calc_stats(); + info.dump(f); + f->close_section(); + + f->open_array_section("peer_info"); + for (auto p = peer_info.begin(); p != peer_info.end(); ++p) { + f->open_object_section("info"); + f->dump_stream("peer") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PeeringState::update_stats( + std::function f, + ObjectStore::Transaction *t) { + if (f(info.history, info.stats)) { + pl->publish_stats_to_osd(); + } + pl->on_info_history_change(); + + if (t) { + dirty_info = true; + write_if_dirty(*t); + } +} + +bool PeeringState::append_log_entries_update_missing( + const mempool::osd_pglog::list &entries, + ObjectStore::Transaction &t, std::optional trim_to, + std::optional roll_forward_to) +{ + ceph_assert(!entries.empty()); + ceph_assert(entries.begin()->version > info.last_update); + + PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; + bool invalidate_stats = + pg_log.append_new_log_entries( + info.last_backfill, + entries, + rollbacker.get()); + + if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) { + pg_log.roll_forward(rollbacker.get()); + } + if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) { + pg_log.roll_forward_to(*roll_forward_to, rollbacker.get()); + last_rollback_info_trimmed_to_applied = *roll_forward_to; + } + + info.last_update = pg_log.get_head(); + + if (pg_log.get_missing().num_missing() == 0) { + // advance last_complete since nothing else is missing! + info.last_complete = info.last_update; + } + info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats; + + psdout(20) << __func__ << " trim_to bool = " << bool(trim_to) + << " trim_to = " << (trim_to ? 
*trim_to : eversion_t()) << dendl; + if (trim_to) + pg_log.trim(*trim_to, info); + dirty_info = true; + write_if_dirty(t); + return invalidate_stats; +} + +void PeeringState::merge_new_log_entries( + const mempool::osd_pglog::list &entries, + ObjectStore::Transaction &t, + std::optional trim_to, + std::optional roll_forward_to) +{ + psdout(10) << __func__ << " " << entries << dendl; + ceph_assert(is_primary()); + + bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to); + for (auto i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + pg_shard_t peer(*i); + if (peer == pg_whoami) continue; + ceph_assert(peer_missing.count(peer)); + ceph_assert(peer_info.count(peer)); + pg_missing_t& pmissing(peer_missing[peer]); + psdout(20) << __func__ << " peer_missing for " << peer + << " = " << pmissing << dendl; + pg_info_t& pinfo(peer_info[peer]); + bool invalidate_stats = PGLog::append_log_entries_update_missing( + pinfo.last_backfill, + entries, + true, + NULL, + pmissing, + NULL, + dpp); + pinfo.last_update = info.last_update; + pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats; + rebuild_missing = rebuild_missing || invalidate_stats; + } + + if (!rebuild_missing) { + return; + } + + for (auto &&i: entries) { + missing_loc.rebuild( + i.soid, + pg_whoami, + acting_recovery_backfill, + info, + pg_log.get_missing(), + peer_missing, + peer_info); + } +} + +void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied) +{ + // raise last_complete only if we were previously up to date + if (info.last_complete == info.last_update) + info.last_complete = e.version; + + // raise last_update. + ceph_assert(e.version > info.last_update); + info.last_update = e.version; + + // raise user_version, if it increased (it may have not get bumped + // by all logged updates) + if (e.user_version > info.last_user_version) + info.last_user_version = e.user_version; + + // log mutation + pg_log.add(e, applied); + psdout(10) << "add_log_entry " << e << dendl; +} + + +void PeeringState::append_log( + vector&& logv, + eversion_t trim_to, + eversion_t roll_forward_to, + eversion_t mlcod, + ObjectStore::Transaction &t, + bool transaction_applied, + bool async) +{ + /* The primary has sent an info updating the history, but it may not + * have arrived yet. We want to make sure that we cannot remember this + * write without remembering that it happened in an interval which went + * active in epoch history.last_epoch_started. + */ + if (info.last_epoch_started != info.history.last_epoch_started) { + info.history.last_epoch_started = info.last_epoch_started; + } + if (info.last_interval_started != info.history.last_interval_started) { + info.history.last_interval_started = info.last_interval_started; + } + psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl; + + PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)}; + if (!transaction_applied) { + /* We must be a backfill or async recovery peer, so it's ok if we apply + * out-of-turn since we won't be considered when + * determining a min possible last_update. + * + * We skip_rollforward() here, which advances the crt, without + * doing an actual rollforward. This avoids cleaning up entries + * from the backend and we do not end up in a situation, where the + * object is deleted before we can _merge_object_divergent_entries(). 
+ */ + pg_log.skip_rollforward(); + } + + for (auto p = logv.begin(); p != logv.end(); ++p) { + add_log_entry(*p, transaction_applied); + + /* We don't want to leave the rollforward artifacts around + * here past last_backfill. It's ok for the same reason as + * above */ + if (transaction_applied && + p->soid > info.last_backfill) { + pg_log.roll_forward(handler.get()); + } + } + if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) { + pg_log.roll_forward_to( + roll_forward_to, + handler.get()); + last_rollback_info_trimmed_to_applied = roll_forward_to; + } + + psdout(10) << __func__ << " approx pg log length = " + << pg_log.get_log().approx_size() << dendl; + psdout(10) << __func__ << " dups pg log length = " + << pg_log.get_log().dups.size() << dendl; + psdout(10) << __func__ << " transaction_applied = " + << transaction_applied << dendl; + if (!transaction_applied || async) + psdout(10) << __func__ << " " << pg_whoami + << " is async_recovery or backfill target" << dendl; + pg_log.trim(trim_to, info, transaction_applied, async); + + // update the local pg, pg log + dirty_info = true; + write_if_dirty(t); + + if (!is_primary()) + min_last_complete_ondisk = mlcod; +} + +void PeeringState::recover_got( + const hobject_t &oid, eversion_t v, + bool is_delete, + ObjectStore::Transaction &t) +{ + if (v > pg_log.get_can_rollback_to()) { + /* This can only happen during a repair, and even then, it would + * be one heck of a race. If we are repairing the object, the + * write in question must be fully committed, so it's not valid + * to roll it back anyway (and we'll be rolled forward shortly + * anyway) */ + PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)}; + pg_log.roll_forward_to(v, handler.get()); + } + + psdout(10) << "got missing " << oid << " v " << v << dendl; + pg_log.recover_got(oid, v, info); + if (pg_log.get_log().log.empty()) { + psdout(10) << "last_complete now " << info.last_complete + << " while log is empty" << dendl; + } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) { + psdout(10) << "last_complete now " << info.last_complete + << " log.complete_to " << pg_log.get_log().complete_to->version + << dendl; + } else { + psdout(10) << "last_complete now " << info.last_complete + << " log.complete_to at end" << dendl; + //below is not true in the repair case. + //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong. 
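+  // With complete_to at the end of the log, no remaining log entry is waiting
+  // on a missing object, so last_complete must have caught up with last_update;
+  // the stronger num_missing()==0 check above stays disabled because it does
+  // not hold while repairing.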
+ ceph_assert(info.last_complete == info.last_update); + } + + if (is_primary()) { + ceph_assert(missing_loc.needs_recovery(oid)); + if (!is_delete) + missing_loc.add_location(oid, pg_whoami); + } + + // update pg + dirty_info = true; + write_if_dirty(t); +} + +void PeeringState::update_backfill_progress( + const hobject_t &updated_backfill, + const pg_stat_t &updated_stats, + bool preserve_local_num_bytes, + ObjectStore::Transaction &t) { + info.set_last_backfill(updated_backfill); + if (preserve_local_num_bytes) { + psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes + << " local " << info.stats.stats.sum.num_bytes << dendl; + int64_t bytes = info.stats.stats.sum.num_bytes; + info.stats = updated_stats; + info.stats.stats.sum.num_bytes = bytes; + } else { + psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes + << " replaces local " << info.stats.stats.sum.num_bytes << dendl; + info.stats = updated_stats; + } + + dirty_info = true; + write_if_dirty(t); +} + +void PeeringState::adjust_purged_snaps( + std::function &snaps)> f) { + f(info.purged_snaps); + dirty_info = true; + dirty_big_info = true; +} + +void PeeringState::on_peer_recover( + pg_shard_t peer, + const hobject_t &soid, + const eversion_t &version) +{ + pl->publish_stats_to_osd(); + // done! + peer_missing[peer].got(soid, version); + missing_loc.add_location(soid, peer); +} + +void PeeringState::begin_peer_recover( + pg_shard_t peer, + const hobject_t soid) +{ + peer_missing[peer].revise_have(soid, eversion_t()); +} + +void PeeringState::force_object_missing( + const set &peers, + const hobject_t &soid, + eversion_t version) +{ + for (auto &&peer : peers) { + if (peer != primary) { + peer_missing[peer].add(soid, version, eversion_t(), false); + } else { + pg_log.missing_add(soid, version, eversion_t()); + pg_log.reset_complete_to(&info); + pg_log.set_last_requested(0); + } + } + + missing_loc.rebuild( + soid, + pg_whoami, + acting_recovery_backfill, + info, + pg_log.get_missing(), + peer_missing, + peer_info); +} + +void PeeringState::pre_submit_op( + const hobject_t &hoid, + const vector& logv, + eversion_t at_version) +{ + if (at_version > eversion_t()) { + for (auto &&i : get_acting_recovery_backfill()) { + if (i == primary) continue; + pg_info_t &pinfo = peer_info[i]; + // keep peer_info up to date + if (pinfo.last_complete == pinfo.last_update) + pinfo.last_complete = at_version; + pinfo.last_update = at_version; + } + } + + bool requires_missing_loc = false; + for (auto &&i : get_async_recovery_targets()) { + if (i == primary || !get_peer_missing(i).is_missing(hoid)) + continue; + requires_missing_loc = true; + for (auto &&entry: logv) { + peer_missing[i].add_next_event(entry); + } + } + + if (requires_missing_loc) { + for (auto &&entry: logv) { + psdout(30) << __func__ << " missing_loc before: " + << missing_loc.get_locations(entry.soid) << dendl; + missing_loc.add_missing(entry.soid, entry.version, + eversion_t(), entry.is_delete()); + // clear out missing_loc + missing_loc.clear_location(entry.soid); + for (auto &i: get_actingset()) { + if (!get_peer_missing(i).is_missing(entry.soid)) + missing_loc.add_location(entry.soid, i); + } + psdout(30) << __func__ << " missing_loc after: " + << missing_loc.get_locations(entry.soid) << dendl; + } + } +} + +void PeeringState::recovery_committed_to(eversion_t version) +{ + psdout(10) << __func__ << " version " << version + << " now ondisk" << dendl; + last_complete_ondisk = version; + + if (last_complete_ondisk == info.last_update) { + if 
(!is_primary()) { + // Either we are a replica or backfill target. + // we are fully up to date. tell the primary! + pl->send_cluster_message( + get_primary().osd, + make_message( + get_osdmap_epoch(), + spg_t(info.pgid.pgid, primary.shard), + last_complete_ondisk), + get_osdmap_epoch()); + } else { + calc_min_last_complete_ondisk(); + } + } +} + +void PeeringState::complete_write(eversion_t v, eversion_t lc) +{ + last_update_ondisk = v; + last_complete_ondisk = lc; + calc_min_last_complete_ondisk(); +} + +void PeeringState::calc_trim_to() +{ + size_t target = pl->get_target_pg_log_entries(); + + eversion_t limit = std::min( + min_last_complete_ondisk, + pg_log.get_can_rollback_to()); + if (limit != eversion_t() && + limit != pg_trim_to && + pg_log.get_log().approx_size() > target) { + size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target, + cct->_conf->osd_pg_log_trim_max); + if (num_to_trim < cct->_conf->osd_pg_log_trim_min && + cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) { + return; + } + auto it = pg_log.get_log().log.begin(); + eversion_t new_trim_to; + for (size_t i = 0; i < num_to_trim; ++i) { + new_trim_to = it->version; + ++it; + if (new_trim_to > limit) { + new_trim_to = limit; + psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl; + break; + } + } + psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl; + pg_trim_to = new_trim_to; + assert(pg_trim_to <= pg_log.get_head()); + assert(pg_trim_to <= min_last_complete_ondisk); + } +} + +void PeeringState::calc_trim_to_aggressive() +{ + size_t target = pl->get_target_pg_log_entries(); + + // limit pg log trimming up to the can_rollback_to value + eversion_t limit = std::min({ + pg_log.get_head(), + pg_log.get_can_rollback_to(), + last_update_ondisk}); + psdout(10) << __func__ << " limit = " << limit << dendl; + + if (limit != eversion_t() && + limit != pg_trim_to && + pg_log.get_log().approx_size() > target) { + psdout(10) << __func__ << " approx pg log length = " + << pg_log.get_log().approx_size() << dendl; + uint64_t num_to_trim = std::min(pg_log.get_log().approx_size() - target, + cct->_conf->osd_pg_log_trim_max); + psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl; + if (num_to_trim < cct->_conf->osd_pg_log_trim_min && + cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) { + return; + } + auto it = pg_log.get_log().log.begin(); // oldest log entry + auto rit = pg_log.get_log().log.rbegin(); + eversion_t by_n_to_keep; // start from tail + eversion_t by_n_to_trim = eversion_t::max(); // start from head + for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) { + i++; + if (i > target && by_n_to_keep == eversion_t()) { + by_n_to_keep = rit->version; + } + if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) { + by_n_to_trim = it->version; + } + if (by_n_to_keep != eversion_t() && + by_n_to_trim != eversion_t::max()) { + break; + } + } + + if (by_n_to_keep == eversion_t()) { + return; + } + + pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit}); + psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl; + ceph_assert(pg_trim_to <= pg_log.get_head()); + } +} + +void PeeringState::apply_op_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) +{ + info.stats.stats.add(delta_stats); + info.stats.stats.floor(0); + + for (auto i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + pg_info_t& pinfo = peer_info[bt]; + 
if (soid <= pinfo.last_backfill) + pinfo.stats.stats.add(delta_stats); + } +} + +void PeeringState::update_complete_backfill_object_stats( + const hobject_t &hoid, + const pg_stat_t &stats) +{ + for (auto &&bt: get_backfill_targets()) { + pg_info_t& pinfo = peer_info[bt]; + //Add stats to all peers that were missing object + if (hoid > pinfo.last_backfill) + pinfo.stats.add(stats); + } +} + +void PeeringState::update_peer_last_backfill( + pg_shard_t peer, + const hobject_t &new_last_backfill) +{ + pg_info_t &pinfo = peer_info[peer]; + pinfo.last_backfill = new_last_backfill; + if (new_last_backfill.is_max()) { + /* pinfo.stats might be wrong if we did log-based recovery on the + * backfilled portion in addition to continuing backfill. + */ + pinfo.stats = info.stats; + } +} + +void PeeringState::set_revert_with_targets( + const hobject_t &soid, + const set &good_peers) +{ + for (auto &&peer: good_peers) { + missing_loc.add_location(soid, peer); + } +} + +void PeeringState::prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &version, + const vector &targets) { + for (auto &&peer: targets) { + peer_missing[peer].add(soid, version, eversion_t(), false); + } +} + +void PeeringState::update_hset(const pg_hit_set_history_t &hset_history) +{ + info.hit_set = hset_history; +} + +/*------------ Peering State Machine----------------*/ +#undef dout_prefix +#define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \ + << "state<" << get_state_name() << ">: ") +#undef psdout +#define psdout(x) ldout(context< PeeringMachine >().cct, x) + +#define DECLARE_LOCALS \ + PeeringState *ps = context< PeeringMachine >().state; \ + std::ignore = ps; \ + PeeringListener *pl = context< PeeringMachine >().pl; \ + std::ignore = pl + + +/*------Crashed-------*/ +PeeringState::Crashed::Crashed(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Crashed") +{ + context< PeeringMachine >().log_enter(state_name); + ceph_abort_msg("we got a bad state machine event"); +} + + +/*------Initial-------*/ +PeeringState::Initial::Initial(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Initial") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify) +{ + DECLARE_LOCALS; + ps->proc_replica_info( + notify.from, notify.notify.info, notify.notify.epoch_sent); + ps->set_last_peering_reset(); + return transit< Primary >(); +} + +boost::statechart::result PeeringState::Initial::react(const MInfoRec& i) +{ + DECLARE_LOCALS; + ceph_assert(!ps->is_primary()); + post_event(i); + return transit< Stray >(); +} + +boost::statechart::result PeeringState::Initial::react(const MLogRec& i) +{ + DECLARE_LOCALS; + ceph_assert(!ps->is_primary()); + post_event(i); + return transit< Stray >(); +} + +void PeeringState::Initial::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_initial_latency, dur); +} + +/*------Started-------*/ +PeeringState::Started::Started(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result +PeeringState::Started::react(const IntervalFlush&) +{ + psdout(10) << "Ending blocked outgoing recovery messages" << dendl; + context< PeeringMachine >().state->end_block_outgoing(); + 
return discard_event(); +} + +boost::statechart::result PeeringState::Started::react(const AdvMap& advmap) +{ + DECLARE_LOCALS; + psdout(10) << "Started advmap" << dendl; + ps->check_full_transition(advmap.lastmap, advmap.osdmap); + if (ps->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + psdout(10) << "should_restart_peering, transitioning to Reset" + << dendl; + post_event(advmap); + return transit< Reset >(); + } + ps->remove_down_peer_info(advmap.osdmap); + return discard_event(); +} + +boost::statechart::result PeeringState::Started::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->close_section(); + return discard_event(); +} + +boost::statechart::result PeeringState::Started::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "Started"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::Started::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_started_latency, dur); + ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY); +} + +/*--------Reset---------*/ +PeeringState::Reset::Reset(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Reset") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + + ps->flushes_in_progress = 0; + ps->set_last_peering_reset(); + ps->log_weirdness(); +} + +boost::statechart::result +PeeringState::Reset::react(const IntervalFlush&) +{ + psdout(10) << "Ending blocked outgoing recovery messages" << dendl; + context< PeeringMachine >().state->end_block_outgoing(); + return discard_event(); +} + +boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap) +{ + DECLARE_LOCALS; + psdout(10) << "Reset advmap" << dendl; + + ps->check_full_transition(advmap.lastmap, advmap.osdmap); + + if (ps->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + psdout(10) << "should restart peering, calling start_peering_interval again" + << dendl; + ps->start_peering_interval( + advmap.lastmap, + advmap.newup, advmap.up_primary, + advmap.newacting, advmap.acting_primary, + context< PeeringMachine >().get_cur_transaction()); + } + ps->remove_down_peer_info(advmap.osdmap); + ps->check_past_interval_bounds(); + return discard_event(); +} + +boost::statechart::result PeeringState::Reset::react(const ActMap&) +{ + DECLARE_LOCALS; + if (ps->should_send_notify() && ps->get_primary().osd >= 0) { + ps->info.history.refresh_prior_readable_until_ub( + pl->get_mnow(), + ps->prior_readable_until_ub); + context< PeeringMachine >().send_notify( + ps->get_primary().osd, + pg_notify_t( + ps->get_primary().shard, ps->pg_whoami.shard, + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + ps->info, + ps->past_intervals)); + } + + ps->update_heartbeat_peers(); + + return transit< Started >(); +} + +boost::statechart::result PeeringState::Reset::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->close_section(); + return discard_event(); +} + +boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q) +{ + 
q.f->dump_string("state", "Reset"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::Reset::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_reset_latency, dur); +} + +/*-------Start---------*/ +PeeringState::Start::Start(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Start") +{ + context< PeeringMachine >().log_enter(state_name); + + DECLARE_LOCALS; + if (ps->is_primary()) { + psdout(1) << "transitioning to Primary" << dendl; + post_event(MakePrimary()); + } else { //is_stray + psdout(1) << "transitioning to Stray" << dendl; + post_event(MakeStray()); + } +} + +void PeeringState::Start::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_start_latency, dur); +} + +/*---------Primary--------*/ +PeeringState::Primary::Primary(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + ceph_assert(ps->want_acting.empty()); + + // set CREATING bit until we have peered for the first time. + if (ps->info.history.last_epoch_started == 0) { + ps->state_set(PG_STATE_CREATING); + // use the history timestamp, which ultimately comes from the + // monitor in the create case. + utime_t t = ps->info.history.last_scrub_stamp; + ps->info.stats.last_fresh = t; + ps->info.stats.last_active = t; + ps->info.stats.last_change = t; + ps->info.stats.last_peered = t; + ps->info.stats.last_clean = t; + ps->info.stats.last_unstale = t; + ps->info.stats.last_undegraded = t; + ps->info.stats.last_fullsized = t; + ps->info.stats.last_scrub_stamp = t; + ps->info.stats.last_deep_scrub_stamp = t; + ps->info.stats.last_clean_scrub_stamp = t; + } +} + +boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt) +{ + DECLARE_LOCALS; + psdout(7) << "handle_pg_notify from osd." 
<< notevt.from << dendl; + ps->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react(const ActMap&) +{ + DECLARE_LOCALS; + psdout(7) << "handle ActMap primary" << dendl; + pl->publish_stats_to_osd(); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const SetForceRecovery&) +{ + DECLARE_LOCALS; + ps->set_force_recovery(true); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const UnsetForceRecovery&) +{ + DECLARE_LOCALS; + ps->set_force_recovery(false); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const RequestScrub& evt) +{ + DECLARE_LOCALS; + if (ps->is_primary()) { + pl->scrub_requested(evt.deep, evt.repair); + psdout(10) << "marking for scrub" << dendl; + } + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const SetForceBackfill&) +{ + DECLARE_LOCALS; + ps->set_force_backfill(true); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const UnsetForceBackfill&) +{ + DECLARE_LOCALS; + ps->set_force_backfill(false); + return discard_event(); +} + +void PeeringState::Primary::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + ps->want_acting.clear(); + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_primary_latency, dur); + pl->clear_primary_state(); + ps->state_clear(PG_STATE_CREATING); +} + +/*---------Peering--------*/ +PeeringState::Peering::Peering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"), + history_les_bound(false) +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + + ceph_assert(!ps->is_peered()); + ceph_assert(!ps->is_peering()); + ceph_assert(ps->is_primary()); + ps->state_set(PG_STATE_PEERING); +} + +boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap) +{ + DECLARE_LOCALS; + psdout(10) << "Peering advmap" << dendl; + if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) { + psdout(1) << "Peering, affected_by_map, going to Reset" << dendl; + post_event(advmap); + return transit< Reset >(); + } + + ps->adjust_need_up_thru(advmap.osdmap); + ps->check_prior_readable_down_osds(advmap.osdmap); + + return forward_event(); +} + +boost::statechart::result PeeringState::Peering::react(const QueryState& q) +{ + DECLARE_LOCALS; + + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("past_intervals"); + ps->past_intervals.dump(q.f); + q.f->close_section(); + + q.f->open_array_section("probing_osds"); + for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p) + q.f->dump_stream("osd") << *p; + q.f->close_section(); + + if (prior_set.pg_down) + q.f->dump_string("blocked", "peering is blocked due to down osds"); + + q.f->open_array_section("down_osds_we_would_probe"); + for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p) + q.f->dump_int("osd", *p); + q.f->close_section(); + + q.f->open_array_section("peering_blocked_by"); + for (auto p = prior_set.blocked_by.begin(); + p != prior_set.blocked_by.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_int("osd", p->first); + q.f->dump_int("current_lost_at", p->second); + q.f->dump_string("comment", 
"starting or marking this osd lost may let us proceed"); + q.f->close_section(); + } + q.f->close_section(); + + if (history_les_bound) { + q.f->open_array_section("peering_blocked_by_detail"); + q.f->open_object_section("item"); + q.f->dump_string("detail","peering_blocked_by_history_les_bound"); + q.f->close_section(); + q.f->close_section(); + } + + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "Peering"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::Peering::exit() +{ + + DECLARE_LOCALS; + psdout(10) << "Leaving Peering" << dendl; + context< PeeringMachine >().log_exit(state_name, enter_time); + ps->state_clear(PG_STATE_PEERING); + pl->clear_probe_targets(); + + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_peering_latency, dur); +} + + +/*------Backfilling-------*/ +PeeringState::Backfilling::Backfilling(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling") +{ + context< PeeringMachine >().log_enter(state_name); + + + DECLARE_LOCALS; + ps->backfill_reserved = true; + pl->on_backfill_reserved(); + ps->state_clear(PG_STATE_BACKFILL_TOOFULL); + ps->state_clear(PG_STATE_BACKFILL_WAIT); + ps->state_set(PG_STATE_BACKFILLING); + pl->publish_stats_to_osd(); +} + +void PeeringState::Backfilling::backfill_release_reservations() +{ + DECLARE_LOCALS; + pl->cancel_local_background_io_reservation(); + for (auto it = ps->backfill_targets.begin(); + it != ps->backfill_targets.end(); + ++it) { + ceph_assert(*it != ps->pg_whoami); + pl->send_cluster_message( + it->osd, + make_message( + MBackfillReserve::RELEASE, + spg_t(ps->info.pgid.pgid, it->shard), + ps->get_osdmap_epoch()), + ps->get_osdmap_epoch()); + } +} + +void PeeringState::Backfilling::cancel_backfill() +{ + DECLARE_LOCALS; + backfill_release_reservations(); + pl->on_backfill_canceled(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const Backfilled &c) +{ + backfill_release_reservations(); + return transit(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const DeferBackfill &c) +{ + DECLARE_LOCALS; + + psdout(10) << "defer backfill, retry delay " << c.delay << dendl; + ps->state_set(PG_STATE_BACKFILL_WAIT); + ps->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + + pl->schedule_event_after( + std::make_shared( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + RequestBackfill()), + c.delay); + return transit(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const UnfoundBackfill &c) +{ + DECLARE_LOCALS; + psdout(10) << "backfill has unfound, can't continue" << dendl; + ps->state_set(PG_STATE_BACKFILL_UNFOUND); + ps->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + return transit(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &) +{ + DECLARE_LOCALS; + + ps->state_set(PG_STATE_BACKFILL_TOOFULL); + ps->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + + pl->schedule_event_after( + std::make_shared( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + RequestBackfill()), + ps->cct->_conf->osd_backfill_retry_interval); + + return transit(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const RemoteReservationRevoked &) +{ + DECLARE_LOCALS; + ps->state_set(PG_STATE_BACKFILL_WAIT); + cancel_backfill(); 
+  if (ps->needs_backfill()) {
+    return transit<WaitLocalBackfillReserved>();
+  } else {
+    // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
+    return discard_event();
+  }
+}
+
+void PeeringState::Backfilling::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  DECLARE_LOCALS;
+  ps->backfill_reserved = false;
+  ps->state_clear(PG_STATE_BACKFILLING);
+  ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+  utime_t dur = ceph_clock_now() - enter_time;
+  pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
+}
+
+/*--WaitRemoteBackfillReserved--*/
+
+PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
+    backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
+{
+  context< PeeringMachine >().log_enter(state_name);
+  DECLARE_LOCALS;
+
+  ps->state_set(PG_STATE_BACKFILL_WAIT);
+  pl->publish_stats_to_osd();
+  post_event(RemoteBackfillReserved());
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
+{
+  DECLARE_LOCALS;
+
+  int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
+  psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
+  if (backfill_osd_it !=
+      context< Active >().remote_shards_to_reserve_backfill.end()) {
+    // The primary never backfills itself
+    ceph_assert(*backfill_osd_it != ps->pg_whoami);
+    pl->send_cluster_message(
+      backfill_osd_it->osd,
+      make_message<MBackfillReserve>(
+        MBackfillReserve::REQUEST,
+        spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
+        ps->get_osdmap_epoch(),
+        ps->get_backfill_priority(),
+        num_bytes,
+        ps->peer_bytes[*backfill_osd_it]),
+      ps->get_osdmap_epoch());
+    ++backfill_osd_it;
+  } else {
+    ps->peer_bytes.clear();
+    post_event(AllBackfillsReserved());
+  }
+  return discard_event();
+}
+
+void PeeringState::WaitRemoteBackfillReserved::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  DECLARE_LOCALS;
+
+  utime_t dur = ceph_clock_now() - enter_time;
+  pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
+}
+
+void PeeringState::WaitRemoteBackfillReserved::retry()
+{
+  DECLARE_LOCALS;
+  pl->cancel_local_background_io_reservation();
+
+  // Send CANCEL to all previously acquired reservations
+  set<pg_shard_t>::const_iterator it, begin, end;
+  begin = context< Active >().remote_shards_to_reserve_backfill.begin();
+  end = context< Active >().remote_shards_to_reserve_backfill.end();
+  ceph_assert(begin != end);
+  for (it = begin; it != backfill_osd_it; ++it) {
+    // The primary never backfills itself
+    ceph_assert(*it != ps->pg_whoami);
+    pl->send_cluster_message(
+      it->osd,
+      make_message<MBackfillReserve>(
+        MBackfillReserve::RELEASE,
+        spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
+        ps->get_osdmap_epoch()),
+      ps->get_osdmap_epoch());
+  }
+
+  ps->state_clear(PG_STATE_BACKFILL_WAIT);
+  pl->publish_stats_to_osd();
+
+  pl->schedule_event_after(
+    std::make_shared<PGPeeringEvent>(
+      ps->get_osdmap_epoch(),
+      ps->get_osdmap_epoch(),
+      RequestBackfill()),
+    ps->cct->_conf->osd_backfill_retry_interval);
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
+{
+  DECLARE_LOCALS;
+  ps->state_set(PG_STATE_BACKFILL_TOOFULL);
+  retry();
+  return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
+{
+  retry();
+  return transit<NotBackfilling>();
+}
+
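The reservation walk above is easy to miss in the state-machine boilerplate: the constructor self-posts a RemoteBackfillReserved, and every grant either sends the next MBackfillReserve::REQUEST or, once the iterator runs off the end of the target list, posts AllBackfillsReserved. The following is a minimal, self-contained sketch of that pattern (hypothetical types and names, not the Ceph message or event API):

#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for pg_shard_t (assumption, for illustration only).
struct Shard { int osd; };

struct BackfillReservationWalk {
  std::vector<Shard> targets;   // plays the role of remote_shards_to_reserve_backfill
  std::size_t next = 0;         // plays the role of backfill_osd_it

  // Invoked once at state entry (self-posted event) and once per remote grant.
  void on_remote_backfill_reserved() {
    if (next < targets.size()) {
      std::cout << "REQUEST backfill reservation from osd." << targets[next].osd << "\n";
      ++next;                   // wait for this shard's grant before asking the next one
    } else {
      std::cout << "AllBackfillsReserved\n";   // every remote shard is now reserved
    }
  }
};

int main() {
  BackfillReservationWalk walk{{{1}, {4}, {7}}};
  walk.on_remote_backfill_reserved();  // entry kick-off: request osd.1
  walk.on_remote_backfill_reserved();  // osd.1 granted:  request osd.4
  walk.on_remote_backfill_reserved();  // osd.4 granted:  request osd.7
  walk.on_remote_backfill_reserved();  // osd.7 granted:  all reserved
  return 0;
}

Because the first event is self-posted, one handler covers both the initial kick-off and every later grant, and an empty target list immediately yields AllBackfillsReserved.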
+/*--WaitLocalBackfillReserved--*/ +PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + + ps->state_set(PG_STATE_BACKFILL_WAIT); + pl->request_local_background_io_reservation( + ps->get_backfill_priority(), + std::make_unique( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + LocalBackfillReserved()), + std::make_unique( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + DeferBackfill(0.0))); + pl->publish_stats_to_osd(); +} + +void PeeringState::WaitLocalBackfillReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur); +} + +/*----NotBackfilling------*/ +PeeringState::NotBackfilling::NotBackfilling(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + ps->state_clear(PG_STATE_REPAIR); + pl->publish_stats_to_osd(); +} + +boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q) +{ + DECLARE_LOCALS; + + ps->query_unfound(q.f, "NotBackfilling"); + return discard_event(); +} + +boost::statechart::result +PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt) +{ + return discard_event(); +} + +boost::statechart::result +PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt) +{ + return discard_event(); +} + +void PeeringState::NotBackfilling::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + ps->state_clear(PG_STATE_BACKFILL_UNFOUND); + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur); +} + +/*----NotRecovering------*/ +PeeringState::NotRecovering::NotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + ps->state_clear(PG_STATE_REPAIR); + pl->publish_stats_to_osd(); +} + +boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q) +{ + DECLARE_LOCALS; + + ps->query_unfound(q.f, "NotRecovering"); + return discard_event(); +} + +void PeeringState::NotRecovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + ps->state_clear(PG_STATE_RECOVERY_UNFOUND); + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_notrecovering_latency, dur); +} + +/*---RepNotRecovering----*/ +PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result +PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt) +{ + DECLARE_LOCALS; + ps->reject_reservation(); + post_event(RemoteReservationRejectedTooFull()); + return discard_event(); +} + +void PeeringState::RepNotRecovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + 
pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
+}
+
+/*---RepWaitRecoveryReserved--*/
+PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
+{
+  context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
+{
+  DECLARE_LOCALS;
+  pl->send_cluster_message(
+    ps->primary.osd,
+    make_message<MRecoveryReserve>(
+      MRecoveryReserve::GRANT,
+      spg_t(ps->info.pgid.pgid, ps->primary.shard),
+      ps->get_osdmap_epoch()),
+    ps->get_osdmap_epoch());
+  return transit<RepRecovering>();
+}
+
+boost::statechart::result
+PeeringState::RepWaitRecoveryReserved::react(
+  const RemoteReservationCanceled &evt)
+{
+  DECLARE_LOCALS;
+  pl->unreserve_recovery_space();
+
+  pl->cancel_remote_recovery_reservation();
+  return transit<RepNotRecovering>();
+}
+
+void PeeringState::RepWaitRecoveryReserved::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  DECLARE_LOCALS;
+  utime_t dur = ceph_clock_now() - enter_time;
+  pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
+}
+
+/*-RepWaitBackfillReserved*/
+PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
+{
+  context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
+{
+
+  DECLARE_LOCALS;
+
+  if (!pl->try_reserve_recovery_space(
+        evt.primary_num_bytes, evt.local_num_bytes)) {
+    post_event(RejectTooFullRemoteReservation());
+  } else {
+    PGPeeringEventURef preempt;
+    if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
+      // older peers will interpret preemption as TOOFULL
+      preempt = std::make_unique<PGPeeringEvent>(
+        pl->get_osdmap_epoch(),
+        pl->get_osdmap_epoch(),
+        RemoteBackfillPreempted());
+    }
+    pl->request_remote_recovery_reservation(
+      evt.priority,
+      std::make_unique<PGPeeringEvent>(
+        pl->get_osdmap_epoch(),
+        pl->get_osdmap_epoch(),
+        RemoteBackfillReserved()),
+      std::move(preempt));
+  }
+  return transit<RepWaitBackfillReserved>();
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
+{
+  DECLARE_LOCALS;
+
+  // fall back to a local reckoning of the priority if the primary doesn't pass one
+  // (pre-mimic compat)
+  int prio = evt.priority ?
evt.priority : ps->get_recovery_priority(); + + PGPeeringEventURef preempt; + if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) { + // older peers can't handle this + preempt = std::make_unique( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + RemoteRecoveryPreempted()); + } + + pl->request_remote_recovery_reservation( + prio, + std::make_unique( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + RemoteRecoveryReserved()), + std::move(preempt)); + return transit(); +} + +void PeeringState::RepWaitBackfillReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt) +{ + DECLARE_LOCALS; + + + pl->send_cluster_message( + ps->primary.osd, + make_message( + MBackfillReserve::GRANT, + spg_t(ps->info.pgid.pgid, ps->primary.shard), + ps->get_osdmap_epoch()), + ps->get_osdmap_epoch()); + return transit(); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react( + const RejectTooFullRemoteReservation &evt) +{ + DECLARE_LOCALS; + ps->reject_reservation(); + post_event(RemoteReservationRejectedTooFull()); + return discard_event(); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react( + const RemoteReservationRejectedTooFull &evt) +{ + DECLARE_LOCALS; + pl->unreserve_recovery_space(); + + pl->cancel_remote_recovery_reservation(); + return transit(); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react( + const RemoteReservationCanceled &evt) +{ + DECLARE_LOCALS; + pl->unreserve_recovery_space(); + + pl->cancel_remote_recovery_reservation(); + return transit(); +} + +/*---RepRecovering-------*/ +PeeringState::RepRecovering::RepRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result +PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &) +{ + DECLARE_LOCALS; + + + pl->unreserve_recovery_space(); + pl->send_cluster_message( + ps->primary.osd, + make_message( + MRecoveryReserve::REVOKE, + spg_t(ps->info.pgid.pgid, ps->primary.shard), + ps->get_osdmap_epoch()), + ps->get_osdmap_epoch()); + return discard_event(); +} + +boost::statechart::result +PeeringState::RepRecovering::react(const BackfillTooFull &) +{ + DECLARE_LOCALS; + + + pl->unreserve_recovery_space(); + pl->send_cluster_message( + ps->primary.osd, + make_message( + MBackfillReserve::REVOKE_TOOFULL, + spg_t(ps->info.pgid.pgid, ps->primary.shard), + ps->get_osdmap_epoch()), + ps->get_osdmap_epoch()); + return discard_event(); +} + +boost::statechart::result +PeeringState::RepRecovering::react(const RemoteBackfillPreempted &) +{ + DECLARE_LOCALS; + + + pl->unreserve_recovery_space(); + pl->send_cluster_message( + ps->primary.osd, + make_message( + MBackfillReserve::REVOKE, + spg_t(ps->info.pgid.pgid, ps->primary.shard), + ps->get_osdmap_epoch()), + ps->get_osdmap_epoch()); + return discard_event(); +} + +void PeeringState::RepRecovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + pl->unreserve_recovery_space(); + + pl->cancel_remote_recovery_reservation(); + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_reprecovering_latency, 
dur); +} + +/*------Activating--------*/ +PeeringState::Activating::Activating(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating") +{ + context< PeeringMachine >().log_enter(state_name); +} + +void PeeringState::Activating::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_activating_latency, dur); +} + +PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + + // Make sure all nodes that part of the recovery aren't full + if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery && + ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) { + post_event(RecoveryTooFull()); + return; + } + + ps->state_clear(PG_STATE_RECOVERY_TOOFULL); + ps->state_set(PG_STATE_RECOVERY_WAIT); + pl->request_local_background_io_reservation( + ps->get_recovery_priority(), + std::make_unique( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + LocalRecoveryReserved()), + std::make_unique( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + DeferRecovery(0.0))); + pl->publish_stats_to_osd(); +} + +boost::statechart::result +PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt) +{ + DECLARE_LOCALS; + ps->state_set(PG_STATE_RECOVERY_TOOFULL); + pl->schedule_event_after( + std::make_shared( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + DoRecovery()), + ps->cct->_conf->osd_recovery_retry_interval); + return transit(); +} + +void PeeringState::WaitLocalRecoveryReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur); +} + +PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"), + remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin()) +{ + context< PeeringMachine >().log_enter(state_name); + post_event(RemoteRecoveryReserved()); +} + +boost::statechart::result +PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) { + DECLARE_LOCALS; + + if (remote_recovery_reservation_it != + context< Active >().remote_shards_to_reserve_recovery.end()) { + ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami); + pl->send_cluster_message( + remote_recovery_reservation_it->osd, + make_message( + MRecoveryReserve::REQUEST, + spg_t(context< PeeringMachine >().spgid.pgid, + remote_recovery_reservation_it->shard), + ps->get_osdmap_epoch(), + ps->get_recovery_priority()), + ps->get_osdmap_epoch()); + ++remote_recovery_reservation_it; + } else { + post_event(AllRemotesReserved()); + } + return discard_event(); +} + +void PeeringState::WaitRemoteRecoveryReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur); +} + +PeeringState::Recovering::Recovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine 
>().state_history, "Started/Primary/Active/Recovering") +{ + context< PeeringMachine >().log_enter(state_name); + + DECLARE_LOCALS; + ps->state_clear(PG_STATE_RECOVERY_WAIT); + ps->state_clear(PG_STATE_RECOVERY_TOOFULL); + ps->state_set(PG_STATE_RECOVERING); + pl->on_recovery_reserved(); + ceph_assert(!ps->state_test(PG_STATE_ACTIVATING)); + pl->publish_stats_to_osd(); +} + +void PeeringState::Recovering::release_reservations(bool cancel) +{ + DECLARE_LOCALS; + ceph_assert(cancel || !ps->pg_log.get_missing().have_missing()); + + // release remote reservations + for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin(); + i != context< Active >().remote_shards_to_reserve_recovery.end(); + ++i) { + if (*i == ps->pg_whoami) // skip myself + continue; + pl->send_cluster_message( + i->osd, + make_message( + MRecoveryReserve::RELEASE, + spg_t(ps->info.pgid.pgid, i->shard), + ps->get_osdmap_epoch()), + ps->get_osdmap_epoch()); + } +} + +boost::statechart::result +PeeringState::Recovering::react(const AllReplicasRecovered &evt) +{ + DECLARE_LOCALS; + ps->state_clear(PG_STATE_FORCED_RECOVERY); + release_reservations(); + pl->cancel_local_background_io_reservation(); + return transit(); +} + +boost::statechart::result +PeeringState::Recovering::react(const RequestBackfill &evt) +{ + DECLARE_LOCALS; + + release_reservations(); + + ps->state_clear(PG_STATE_FORCED_RECOVERY); + pl->cancel_local_background_io_reservation(); + pl->publish_stats_to_osd(); + // transit any async_recovery_targets back into acting + // so pg won't have to stay undersized for long + // as backfill might take a long time to complete.. + if (!ps->async_recovery_targets.empty()) { + pg_shard_t auth_log_shard; + bool history_les_bound = false; + // FIXME: Uh-oh we have to check this return value; choose_acting can fail! + ps->choose_acting(auth_log_shard, true, &history_les_bound); + } + return transit(); +} + +boost::statechart::result +PeeringState::Recovering::react(const DeferRecovery &evt) +{ + DECLARE_LOCALS; + if (!ps->state_test(PG_STATE_RECOVERING)) { + // we may have finished recovery and have an AllReplicasRecovered + // event queued to move us to the next state. 
+ psdout(10) << "got defer recovery but not recovering" << dendl; + return discard_event(); + } + psdout(10) << "defer recovery, retry delay " << evt.delay << dendl; + ps->state_set(PG_STATE_RECOVERY_WAIT); + pl->cancel_local_background_io_reservation(); + release_reservations(true); + pl->schedule_event_after( + std::make_shared( + ps->get_osdmap_epoch(), + ps->get_osdmap_epoch(), + DoRecovery()), + evt.delay); + return transit(); +} + +boost::statechart::result +PeeringState::Recovering::react(const UnfoundRecovery &evt) +{ + DECLARE_LOCALS; + psdout(10) << "recovery has unfound, can't continue" << dendl; + ps->state_set(PG_STATE_RECOVERY_UNFOUND); + pl->cancel_local_background_io_reservation(); + release_reservations(true); + return transit(); +} + +void PeeringState::Recovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + ps->state_clear(PG_STATE_RECOVERING); + pl->get_peering_perf().tinc(rs_recovering_latency, dur); +} + +PeeringState::Recovered::Recovered(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered") +{ + pg_shard_t auth_log_shard; + + context< PeeringMachine >().log_enter(state_name); + + DECLARE_LOCALS; + + ceph_assert(!ps->needs_recovery()); + + // if we finished backfill, all acting are active; recheck if + // DEGRADED | UNDERSIZED is appropriate. + ceph_assert(!ps->acting_recovery_backfill.empty()); + if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <= + ps->acting_recovery_backfill.size()) { + ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); + pl->publish_stats_to_osd(); + } + + // adjust acting set? (e.g. because backfill completed...) + bool history_les_bound = false; + if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard, + true, &history_les_bound)) { + ceph_assert(ps->want_acting.size()); + } else if (!ps->async_recovery_targets.empty()) { + // FIXME: Uh-oh we have to check this return value; choose_acting can fail! 
+ ps->choose_acting(auth_log_shard, true, &history_les_bound); + } + + if (context< Active >().all_replicas_activated && + ps->async_recovery_targets.empty()) + post_event(GoClean()); +} + +void PeeringState::Recovered::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_recovered_latency, dur); +} + +PeeringState::Clean::Clean(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean") +{ + context< PeeringMachine >().log_enter(state_name); + + DECLARE_LOCALS; + + if (ps->info.last_complete != ps->info.last_update) { + ceph_abort(); + } + + + ps->try_mark_clean(); + + context< PeeringMachine >().get_cur_transaction().register_on_commit( + pl->on_clean()); +} + +void PeeringState::Clean::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + ps->state_clear(PG_STATE_CLEAN); + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_clean_latency, dur); +} + +template +set unique_osd_shard_set(const pg_shard_t & skip, const T &in) +{ + set osds_found; + set out; + for (auto i = in.begin(); i != in.end(); ++i) { + if (*i != skip && !osds_found.count(i->osd)) { + osds_found.insert(i->osd); + out.insert(*i); + } + } + return out; +} + +/*---------Active---------*/ +PeeringState::Active::Active(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"), + remote_shards_to_reserve_recovery( + unique_osd_shard_set( + context< PeeringMachine >().state->pg_whoami, + context< PeeringMachine >().state->acting_recovery_backfill)), + remote_shards_to_reserve_backfill( + unique_osd_shard_set( + context< PeeringMachine >().state->pg_whoami, + context< PeeringMachine >().state->backfill_targets)), + all_replicas_activated(false) +{ + context< PeeringMachine >().log_enter(state_name); + + + DECLARE_LOCALS; + + ceph_assert(!ps->backfill_reserved); + ceph_assert(ps->is_primary()); + psdout(10) << "In Active, about to call activate" << dendl; + ps->start_flush(context< PeeringMachine >().get_cur_transaction()); + ps->activate(context< PeeringMachine >().get_cur_transaction(), + ps->get_osdmap_epoch(), + context< PeeringMachine >().get_recovery_ctx()); + + // everyone has to commit/ack before we are truly active + ps->blocked_by.clear(); + for (auto p = ps->acting_recovery_backfill.begin(); + p != ps->acting_recovery_backfill.end(); + ++p) { + if (p->shard != ps->pg_whoami.shard) { + ps->blocked_by.insert(p->shard); + } + } + pl->publish_stats_to_osd(); + psdout(10) << "Activate Finished" << dendl; +} + +boost::statechart::result PeeringState::Active::react(const AdvMap& advmap) +{ + DECLARE_LOCALS; + + if (ps->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + psdout(10) << "Active advmap interval change, fast return" << dendl; + return forward_event(); + } + psdout(10) << "Active advmap" << dendl; + bool need_publish = false; + + pl->on_active_advmap(advmap.osdmap); + if (ps->dirty_big_info) { + // share updated purged_snaps to mgr/mon so that we (a) stop reporting + // purged snaps and (b) perhaps share more snaps that we have purged + // but didn't fit in pg_stat_t. 
+ need_publish = true; + ps->share_pg_info(); + } + + bool need_acting_change = false; + for (size_t i = 0; i < ps->want_acting.size(); i++) { + int osd = ps->want_acting[i]; + if (!advmap.osdmap->is_up(osd)) { + pg_shard_t osd_with_shard(osd, shard_id_t(i)); + if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) { + psdout(10) << "Active stray osd." << osd << " in want_acting is down" + << dendl; + need_acting_change = true; + } + } + } + if (need_acting_change) { + psdout(10) << "Active need acting change, call choose_acting again" + << dendl; + // possibly because we re-add some strays into the acting set and + // some of them then go down in a subsequent map before we could see + // the map changing the pg temp. + // call choose_acting again to clear them out. + // note that we leave restrict_to_up_acting to false in order to + // not overkill any chosen stray that is still alive. + pg_shard_t auth_log_shard; + bool history_les_bound = false; + ps->remove_down_peer_info(advmap.osdmap); + ps->choose_acting(auth_log_shard, false, &history_les_bound, true); + } + + /* Check for changes in pool size (if the acting set changed as a result, + * this does not matter) */ + if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) != + ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) { + if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <= + ps->actingset.size()) { + ps->state_clear(PG_STATE_UNDERSIZED); + } else { + ps->state_set(PG_STATE_UNDERSIZED); + } + // degraded changes will be detected by call from publish_stats_to_osd() + need_publish = true; + } + + // if we haven't reported our PG stats in a long time, do so now. + if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) { + psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch) + << " epochs" << dendl; + need_publish = true; + } + + if (need_publish) + pl->publish_stats_to_osd(); + + if (ps->check_prior_readable_down_osds(advmap.osdmap)) { + pl->recheck_readable(); + } + + return forward_event(); +} + +boost::statechart::result PeeringState::Active::react(const ActMap&) +{ + DECLARE_LOCALS; + psdout(10) << "Active: handling ActMap" << dendl; + ceph_assert(ps->is_primary()); + + pl->on_active_actmap(); + + if (ps->have_unfound()) { + // object may have become unfound + ps->discover_all_missing(context().get_recovery_ctx().msgs); + } + + uint64_t unfound = ps->missing_loc.num_unfound(); + if (unfound > 0 && + ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) { + if (ps->cct->_conf->osd_auto_mark_unfound_lost) { + pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound + << " objects unfound and apparently lost, would automatically " + << "mark these objects lost but this feature is not yet implemented " + << "(osd_auto_mark_unfound_lost)"; + } else + pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " + << unfound << " objects unfound and apparently lost"; + } + + return forward_event(); +} + +boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt) +{ + + DECLARE_LOCALS; + ceph_assert(ps->is_primary()); + if (ps->peer_info.count(notevt.from)) { + psdout(10) << "Active: got notify from " << notevt.from + << ", already have info from that osd, ignoring" + << dendl; + } else if (ps->peer_purged.count(notevt.from)) { + psdout(10) << "Active: got notify from " << notevt.from + << ", already purged that peer, ignoring" + << dendl; + } 
else { + psdout(10) << "Active: got notify from " << notevt.from + << ", calling proc_replica_info and discover_all_missing" + << dendl; + ps->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent); + if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) { + ps->discover_all_missing( + context().get_recovery_ctx().msgs); + } + // check if it is a previous down acting member that's coming back. + // if so, request pg_temp change to trigger a new interval transition + pg_shard_t auth_log_shard; + bool history_les_bound = false; + // FIXME: Uh-oh we have to check this return value; choose_acting can fail! + ps->choose_acting(auth_log_shard, false, &history_les_bound, true); + if (!ps->want_acting.empty() && ps->want_acting != ps->acting) { + psdout(10) << "Active: got notify from previous acting member " + << notevt.from << ", requesting pg_temp change" + << dendl; + } + } + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const MTrim& trim) +{ + DECLARE_LOCALS; + ceph_assert(ps->is_primary()); + + // peer is informing us of their last_complete_ondisk + ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl; + ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard}, + trim.trim_to); + // trim log when the pg is recovered + ps->calc_min_last_complete_ondisk(); + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt) +{ + DECLARE_LOCALS; + ceph_assert(ps->is_primary()); + + ceph_assert(!ps->acting_recovery_backfill.empty()); + if (infoevt.lease_ack) { + ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack); + } + // don't update history (yet) if we are active and primary; the replica + // may be telling us they have activated (and committed) but we can't + // share that until _everyone_ does the same. + if (ps->is_acting_recovery_backfill(infoevt.from) && + ps->peer_activated.count(infoevt.from) == 0) { + psdout(10) << " peer osd." << infoevt.from + << " activated and committed" << dendl; + ps->peer_activated.insert(infoevt.from); + ps->blocked_by.erase(infoevt.from.shard); + pl->publish_stats_to_osd(); + if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) { + all_activated_and_committed(); + } + } + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const MLogRec& logevt) +{ + DECLARE_LOCALS; + psdout(10) << "searching osd." 
<< logevt.from + << " log for unfound items" << dendl; + ps->proc_replica_log( + logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from); + bool got_missing = ps->search_for_missing( + ps->peer_info[logevt.from], + ps->peer_missing[logevt.from], + logevt.from, + context< PeeringMachine >().get_recovery_ctx()); + // If there are missing AND we are "fully" active then start recovery now + if (got_missing && ps->state_test(PG_STATE_ACTIVE)) { + post_event(DoRecovery()); + } + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const QueryState& q) +{ + DECLARE_LOCALS; + + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + { + q.f->open_array_section("might_have_unfound"); + for (auto p = ps->might_have_unfound.begin(); + p != ps->might_have_unfound.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_stream("osd") << *p; + if (ps->peer_missing.count(*p)) { + q.f->dump_string("status", "already probed"); + } else if (ps->peer_missing_requested.count(*p)) { + q.f->dump_string("status", "querying"); + } else if (!ps->get_osdmap()->is_up(p->osd)) { + q.f->dump_string("status", "osd is down"); + } else { + q.f->dump_string("status", "not queried"); + } + q.f->close_section(); + } + q.f->close_section(); + } + { + q.f->open_object_section("recovery_progress"); + q.f->open_array_section("backfill_targets"); + for (auto p = ps->backfill_targets.begin(); + p != ps->backfill_targets.end(); ++p) + q.f->dump_stream("replica") << *p; + q.f->close_section(); + pl->dump_recovery_info(q.f); + q.f->close_section(); + } + + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::Active::react(const QueryUnfound& q) +{ + DECLARE_LOCALS; + + ps->query_unfound(q.f, "Active"); + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react( + const ActivateCommitted &evt) +{ + DECLARE_LOCALS; + ceph_assert(!ps->peer_activated.count(ps->pg_whoami)); + ps->peer_activated.insert(ps->pg_whoami); + psdout(10) << "_activate_committed " << evt.epoch + << " peer_activated now " << ps->peer_activated + << " last_interval_started " + << ps->info.history.last_interval_started + << " last_epoch_started " + << ps->info.history.last_epoch_started + << " same_interval_since " + << ps->info.history.same_interval_since + << dendl; + ceph_assert(!ps->acting_recovery_backfill.empty()); + if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) + all_activated_and_committed(); + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt) +{ + + DECLARE_LOCALS; + pg_t pgid = context< PeeringMachine >().spgid.pgid; + + all_replicas_activated = true; + + ps->state_clear(PG_STATE_ACTIVATING); + ps->state_clear(PG_STATE_CREATING); + ps->state_clear(PG_STATE_PREMERGE); + + bool merge_target; + if (ps->pool.info.is_pending_merge(pgid, &merge_target)) { + ps->state_set(PG_STATE_PEERED); + ps->state_set(PG_STATE_PREMERGE); + + if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) { + if (merge_target) { + pg_t src = pgid; + src.set_ps(ps->pool.info.get_pg_num_pending()); + assert(src.get_parent() == pgid); + pl->set_not_ready_to_merge_target(pgid, src); + } else { + pl->set_not_ready_to_merge_source(pgid); + } + } + } else if (!ps->acting_set_writeable()) { + ps->state_set(PG_STATE_PEERED); + } else { + ps->state_set(PG_STATE_ACTIVE); + } + + auto mnow = 
pl->get_mnow(); + if (ps->prior_readable_until_ub > mnow) { + psdout(10) << " waiting for prior_readable_until_ub " + << ps->prior_readable_until_ub << " > mnow " << mnow << dendl; + ps->state_set(PG_STATE_WAIT); + pl->queue_check_readable( + ps->last_peering_reset, + ps->prior_readable_until_ub - mnow); + } else { + psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub " + << ps->prior_readable_until_ub << dendl; + } + + if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) { + pl->send_pg_created(pgid); + } + + ps->info.history.last_epoch_started = ps->info.last_epoch_started; + ps->info.history.last_interval_started = ps->info.last_interval_started; + ps->dirty_info = true; + + ps->share_pg_info(); + pl->publish_stats_to_osd(); + + pl->on_activate_complete(); + + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const RenewLease& rl) +{ + DECLARE_LOCALS; + ps->proc_renew_lease(); + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const MLeaseAck& la) +{ + DECLARE_LOCALS; + ps->proc_lease_ack(la.from, la.lease_ack); + return discard_event(); +} + + +boost::statechart::result PeeringState::Active::react(const CheckReadable &evt) +{ + DECLARE_LOCALS; + pl->recheck_readable(); + return discard_event(); +} + +/* + * update info.history.last_epoch_started ONLY after we and all + * replicas have activated AND committed the activate transaction + * (i.e. the peering results are stable on disk). + */ +void PeeringState::Active::all_activated_and_committed() +{ + DECLARE_LOCALS; + psdout(10) << "all_activated_and_committed" << dendl; + ceph_assert(ps->is_primary()); + ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size()); + ceph_assert(!ps->acting_recovery_backfill.empty()); + ceph_assert(ps->blocked_by.empty()); + + if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) { + // this is overkill when the activation is quick, but when it is slow it + // is important, because the lease was renewed by the activate itself but we + // don't know how long ago that was, and simply scheduling now may leave + // a gap in lease coverage. keep it simple and aggressively renew. + ps->renew_lease(pl->get_mnow()); + ps->send_lease(); + ps->schedule_renew_lease(); + } + + // Degraded? 
+  ps->update_calc_stats();
+  if (ps->info.stats.stats.sum.num_objects_degraded) {
+    ps->state_set(PG_STATE_DEGRADED);
+  } else {
+    ps->state_clear(PG_STATE_DEGRADED);
+  }
+
+  post_event(PeeringState::AllReplicasActivated());
+}
+
+
+void PeeringState::Active::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+
+
+  DECLARE_LOCALS;
+  pl->cancel_local_background_io_reservation();
+
+  ps->blocked_by.clear();
+  ps->backfill_reserved = false;
+  ps->state_clear(PG_STATE_ACTIVATING);
+  ps->state_clear(PG_STATE_DEGRADED);
+  ps->state_clear(PG_STATE_UNDERSIZED);
+  ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
+  ps->state_clear(PG_STATE_BACKFILL_WAIT);
+  ps->state_clear(PG_STATE_RECOVERY_WAIT);
+  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
+  utime_t dur = ceph_clock_now() - enter_time;
+  pl->get_peering_perf().tinc(rs_active_latency, dur);
+  pl->on_active_exit();
+}
+
+/*------ReplicaActive-----*/
+PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+  DECLARE_LOCALS;
+  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+}
+
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+  const Activate& actevt) {
+  DECLARE_LOCALS;
+  psdout(10) << "In ReplicaActive, about to call activate" << dendl;
+  ps->activate(
+    context< PeeringMachine >().get_cur_transaction(),
+    actevt.activation_epoch,
+    context< PeeringMachine >().get_recovery_ctx());
+  psdout(10) << "Activate Finished" << dendl;
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+  const ActivateCommitted &evt)
+{
+  DECLARE_LOCALS;
+  psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
+
+  auto &rctx = context<PeeringMachine>().get_recovery_ctx();
+  auto epoch = ps->get_osdmap_epoch();
+  pg_info_t i = ps->info;
+  i.history.last_epoch_started = evt.activation_epoch;
+  i.history.last_interval_started = i.history.same_interval_since;
+  rctx.send_info(
+    ps->get_primary().osd,
+    spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
+    epoch,
+    epoch,
+    i,
+    {}, /* lease */
+    ps->get_lease_ack());
+
+  if (ps->acting_set_writeable()) {
+    ps->state_set(PG_STATE_ACTIVE);
+  } else {
+    ps->state_set(PG_STATE_PEERED);
+  }
+  pl->on_activate_committed();
+
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
+{
+  DECLARE_LOCALS;
+  spg_t spgid = context< PeeringMachine >().spgid;
+  epoch_t epoch = pl->get_osdmap_epoch();
+
+  ps->proc_lease(l.lease);
+  pl->send_cluster_message(
+    ps->get_primary().osd,
+    make_message<MOSDPGLeaseAck>(epoch,
+      spg_t(spgid.pgid, ps->get_primary().shard),
+      ps->get_lease_ack()),
+    epoch);
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
+{
+  DECLARE_LOCALS;
+  ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
+    infoevt.info);
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
+{
+  DECLARE_LOCALS;
+  psdout(10) << "received log from " << logevt.from << dendl;
+  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+  ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
+  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+  if (logevt.msg->lease) {
+    ps->proc_lease(*logevt.msg->lease);
+  }
+
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
+{
+  DECLARE_LOCALS;
+  // primary is instructing us to trim
+  ps->pg_log.trim(trim.trim_to, ps->info);
+  ps->dirty_info = true;
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
+{
+  DECLARE_LOCALS;
+  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+    ps->info.history.refresh_prior_readable_until_ub(
+      pl->get_mnow(), ps->prior_readable_until_ub);
+    context< PeeringMachine >().send_notify(
+      ps->get_primary().osd,
+      pg_notify_t(
+        ps->get_primary().shard, ps->pg_whoami.shard,
+        ps->get_osdmap_epoch(),
+        ps->get_osdmap_epoch(),
+        ps->info,
+        ps->past_intervals));
+  }
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+  const MQuery& query)
+{
+  DECLARE_LOCALS;
+  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
+{
+  q.f->open_object_section("state");
+  q.f->dump_string("name", state_name);
+  q.f->dump_stream("enter_time") << enter_time;
+  q.f->close_section();
+  return forward_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
+{
+  q.f->dump_string("state", "ReplicaActive");
+  q.f->dump_bool("available_might_have_unfound", false);
+  return discard_event();
+}
+
+void PeeringState::ReplicaActive::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  DECLARE_LOCALS;
+  pl->unreserve_recovery_space();
+
+  pl->cancel_remote_recovery_reservation();
+  utime_t dur = ceph_clock_now() - enter_time;
+  pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
+
+  ps->min_last_complete_ondisk = eversion_t();
+}
+
+/*-------Stray---*/
+PeeringState::Stray::Stray(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/Stray")
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+
+  DECLARE_LOCALS;
+  ceph_assert(!ps->is_peered());
+  ceph_assert(!ps->is_peering());
+  ceph_assert(!ps->is_primary());
+
+  if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
+    ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
+    post_event(DeleteStart());
+  } else {
+    ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+  }
+}
+
+boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
+{
+  DECLARE_LOCALS;
+  MOSDPGLog *msg = logevt.msg.get();
+  psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
+
+  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+  if (msg->info.last_backfill == hobject_t()) {
+    // restart backfill
+    ps->info = msg->info;
+    pl->on_info_history_change();
+    ps->dirty_info = true;
+    ps->dirty_big_info = true;  // maybe.
+
+    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+    ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
+
+    ps->pg_log.reset_backfill();
+  } else {
+    ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
+  }
+  if (logevt.msg->lease) {
+    ps->proc_lease(*logevt.msg->lease);
+  }
+
+  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+
+  post_event(Activate(logevt.msg->info.last_epoch_started));
+  return transit<ReplicaActive>();
+}
+
+boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
+{
+  DECLARE_LOCALS;
+  psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
+
+  if (ps->info.last_update > infoevt.info.last_update) {
+    // rewind divergent log entries
+    ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+    ps->rewind_divergent_log(t, infoevt.info.last_update);
+    ps->info.stats = infoevt.info.stats;
+    ps->info.hit_set = infoevt.info.hit_set;
+  }
+
+  if (infoevt.lease) {
+    ps->proc_lease(*infoevt.lease);
+  }
+
+  ceph_assert(infoevt.info.last_update == ps->info.last_update);
+  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+
+  post_event(Activate(infoevt.info.last_epoch_started));
+  return transit<ReplicaActive>();
+}
+
+boost::statechart::result PeeringState::Stray::react(const MQuery& query)
+{
+  DECLARE_LOCALS;
+  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::Stray::react(const ActMap&)
+{
+  DECLARE_LOCALS;
+  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+    ps->info.history.refresh_prior_readable_until_ub(
+      pl->get_mnow(), ps->prior_readable_until_ub);
+    context< PeeringMachine >().send_notify(
+      ps->get_primary().osd,
+      pg_notify_t(
+        ps->get_primary().shard, ps->pg_whoami.shard,
+        ps->get_osdmap_epoch(),
+        ps->get_osdmap_epoch(),
+        ps->info,
+        ps->past_intervals));
+  }
+  return discard_event();
+}
+
+void PeeringState::Stray::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  DECLARE_LOCALS;
+  utime_t dur = ceph_clock_now() - enter_time;
+  pl->get_peering_perf().tinc(rs_stray_latency, dur);
+}
+
+
+/*--------ToDelete----------*/
+PeeringState::ToDelete::ToDelete(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
+{
+  context< PeeringMachine >().log_enter(state_name);
+  DECLARE_LOCALS;
+  pl->get_perf_logger().inc(l_osd_pg_removing);
+}
+
+void PeeringState::ToDelete::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  DECLARE_LOCALS;
+  // note: on a successful removal, this path doesn't execute. see
+  // _delete_some().
+  pl->get_perf_logger().dec(l_osd_pg_removing);
+
+  pl->cancel_local_background_io_reservation();
+}
+
+/*----WaitDeleteReserved----*/
+PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history,
+               "Started/ToDelete/WaitDeleteReseved")
+{
+  context< PeeringMachine >().log_enter(state_name);
+  DECLARE_LOCALS;
+  context< ToDelete >().priority = ps->get_delete_priority();
+
+  pl->cancel_local_background_io_reservation();
+  pl->request_local_background_io_reservation(
+    context<ToDelete>().priority,
+    std::make_unique<PGPeeringEvent>(
+      ps->get_osdmap_epoch(),
+      ps->get_osdmap_epoch(),
+      DeleteReserved()),
+    std::make_unique<PGPeeringEvent>(
+      ps->get_osdmap_epoch(),
+      ps->get_osdmap_epoch(),
+      DeleteInterrupted()));
+}
+
+boost::statechart::result PeeringState::ToDelete::react(
+  const ActMap& evt)
+{
+  DECLARE_LOCALS;
+  if (ps->get_delete_priority() != priority) {
+    psdout(10) << __func__ << " delete priority changed, resetting"
+               << dendl;
+    return transit<ToDelete>();
+  }
+  return discard_event();
+}
+
+void PeeringState::WaitDeleteReserved::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+}
+
+/*----Deleting-----*/
+PeeringState::Deleting::Deleting(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+  DECLARE_LOCALS;
+  ps->deleting = true;
+  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+
+  // clear log
+  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+  ps->pg_log.roll_forward(rollbacker.get());
+
+  // adjust info to backfill
+  ps->info.set_last_backfill(hobject_t());
+  ps->pg_log.reset_backfill();
+  ps->dirty_info = true;
+
+  pl->on_removal(t);
+}
+
+boost::statechart::result PeeringState::Deleting::react(
+  const DeleteSome& evt)
+{
+  DECLARE_LOCALS;
+  std::pair<ghobject_t, bool> p;
+  p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
+    next);
+  next = p.first;
+  return p.second ? discard_event() : terminate();
+}
+
+void PeeringState::Deleting::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  DECLARE_LOCALS;
+  ps->deleting = false;
+  pl->cancel_local_background_io_reservation();
+}
+
+/*--------GetInfo---------*/
+PeeringState::GetInfo::GetInfo(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+
+  DECLARE_LOCALS;
+  ps->check_past_interval_bounds();
+  ps->log_weirdness();
+  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
+  ceph_assert(ps->blocked_by.empty());
+
+  prior_set = ps->build_prior();
+  ps->prior_readable_down_osds = prior_set.down;
+
+  if (ps->prior_readable_down_osds.empty()) {
+    psdout(10) << " no prior_set down osds, will clear prior_readable_until_ub before activating"
+               << dendl;
+  }
+
+  ps->reset_min_peer_features();
+  get_infos();
+  if (prior_set.pg_down) {
+    post_event(IsDown());
+  } else if (peer_info_requested.empty()) {
+    post_event(GotInfo());
+  }
+}
+
+void PeeringState::GetInfo::get_infos()
+{
+  DECLARE_LOCALS;
+  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
+  ps->blocked_by.clear();
+  for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
+    pg_shard_t peer = *it;
+    if (peer == ps->pg_whoami) {
+      continue;
+    }
+    if (ps->peer_info.count(peer)) {
+      psdout(10) << " have osd."
<< peer << " info " << ps->peer_info[peer] << dendl; + continue; + } + if (peer_info_requested.count(peer)) { + psdout(10) << " already requested info from osd." << peer << dendl; + ps->blocked_by.insert(peer.osd); + } else if (!ps->get_osdmap()->is_up(peer.osd)) { + psdout(10) << " not querying info from down osd." << peer << dendl; + } else { + psdout(10) << " querying info from osd." << peer << dendl; + context< PeeringMachine >().send_query( + peer.osd, + pg_query_t(pg_query_t::INFO, + it->shard, ps->pg_whoami.shard, + ps->info.history, + ps->get_osdmap_epoch())); + peer_info_requested.insert(peer); + ps->blocked_by.insert(peer.osd); + } + } + + ps->check_prior_readable_down_osds(ps->get_osdmap()); + + pl->publish_stats_to_osd(); +} + +boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt) +{ + + DECLARE_LOCALS; + + auto p = peer_info_requested.find(infoevt.from); + if (p != peer_info_requested.end()) { + peer_info_requested.erase(p); + ps->blocked_by.erase(infoevt.from.osd); + } + + epoch_t old_start = ps->info.history.last_epoch_started; + if (ps->proc_replica_info( + infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) { + // we got something new ... + PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; + if (old_start < ps->info.history.last_epoch_started) { + psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl; + prior_set = ps->build_prior(); + ps->prior_readable_down_osds = prior_set.down; + + // filter out any osds that got dropped from the probe set from + // peer_info_requested. this is less expensive than restarting + // peering (which would re-probe everyone). + auto p = peer_info_requested.begin(); + while (p != peer_info_requested.end()) { + if (prior_set.probe.count(*p) == 0) { + psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl; + peer_info_requested.erase(p++); + } else { + ++p; + } + } + get_infos(); + } + psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: " + << hex << infoevt.features << dec << dendl; + ps->apply_peer_features(infoevt.features); + + // are we done getting everything? 
+ if (peer_info_requested.empty() && !prior_set.pg_down) { + psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl; + psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl; + psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl; + post_event(GotInfo()); + } + } + return discard_event(); +} + +boost::statechart::result PeeringState::GetInfo::react(const QueryState& q) +{ + DECLARE_LOCALS; + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("requested_info_from"); + for (auto p = peer_info_requested.begin(); + p != peer_info_requested.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_stream("osd") << *p; + if (ps->peer_info.count(*p)) { + q.f->open_object_section("got_info"); + ps->peer_info[*p].dump(q.f); + q.f->close_section(); + } + q.f->close_section(); + } + q.f->close_section(); + + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "GetInfo"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::GetInfo::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_getinfo_latency, dur); + ps->blocked_by.clear(); +} + +/*------GetLog------------*/ +PeeringState::GetLog::GetLog(my_context ctx) + : my_base(ctx), + NamedState( + context< PeeringMachine >().state_history, + "Started/Primary/Peering/GetLog"), + msg(0) +{ + context< PeeringMachine >().log_enter(state_name); + + DECLARE_LOCALS; + + ps->log_weirdness(); + + // adjust acting? + if (!ps->choose_acting(auth_log_shard, false, + &context< Peering >().history_les_bound)) { + if (!ps->want_acting.empty()) { + post_event(NeedActingChange()); + } else { + post_event(IsIncomplete()); + } + return; + } + + // am i the best? + if (auth_log_shard == ps->pg_whoami) { + post_event(GotLog()); + return; + } + + const pg_info_t& best = ps->peer_info[auth_log_shard]; + + // am i broken? + if (ps->info.last_update < best.log_tail) { + psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl; + post_event(IsIncomplete()); + return; + } + + // how much log to request? + eversion_t request_log_from = ps->info.last_update; + ceph_assert(!ps->acting_recovery_backfill.empty()); + for (auto p = ps->acting_recovery_backfill.begin(); + p != ps->acting_recovery_backfill.end(); + ++p) { + if (*p == ps->pg_whoami) continue; + pg_info_t& ri = ps->peer_info[*p]; + if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail && + ri.last_update < request_log_from) + request_log_from = ri.last_update; + } + + // how much? + psdout(10) << " requesting log from osd." << auth_log_shard << dendl; + context().send_query( + auth_log_shard.osd, + pg_query_t( + pg_query_t::LOG, + auth_log_shard.shard, ps->pg_whoami.shard, + request_log_from, ps->info.history, + ps->get_osdmap_epoch())); + + ceph_assert(ps->blocked_by.empty()); + ps->blocked_by.insert(auth_log_shard.osd); + pl->publish_stats_to_osd(); +} + +boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap) +{ + // make sure our log source didn't go down. 
we need to check + // explicitly because it may not be part of the prior set, which + // means the Peering state check won't catch it going down. + if (!advmap.osdmap->is_up(auth_log_shard.osd)) { + psdout(10) << "GetLog: auth_log_shard osd." + << auth_log_shard.osd << " went down" << dendl; + post_event(advmap); + return transit< Reset >(); + } + + // let the Peering state do its checks. + return forward_event(); +} + +boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt) +{ + ceph_assert(!msg); + if (logevt.from != auth_log_shard) { + psdout(10) << "GetLog: discarding log from " + << "non-auth_log_shard osd." << logevt.from << dendl; + return discard_event(); + } + psdout(10) << "GetLog: received master log from osd." + << logevt.from << dendl; + msg = logevt.msg; + post_event(GotLog()); + return discard_event(); +} + +boost::statechart::result PeeringState::GetLog::react(const GotLog&) +{ + + DECLARE_LOCALS; + psdout(10) << "leaving GetLog" << dendl; + if (msg) { + psdout(10) << "processing master log" << dendl; + ps->proc_master_log(context().get_cur_transaction(), + msg->info, std::move(msg->log), std::move(msg->missing), + auth_log_shard); + } + ps->start_flush(context< PeeringMachine >().get_cur_transaction()); + return transit< GetMissing >(); +} + +boost::statechart::result PeeringState::GetLog::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_stream("auth_log_shard") << auth_log_shard; + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "GetLog"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::GetLog::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_getlog_latency, dur); + ps->blocked_by.clear(); +} + +/*------WaitActingChange--------*/ +PeeringState::WaitActingChange::WaitActingChange(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap) +{ + DECLARE_LOCALS; + OSDMapRef osdmap = advmap.osdmap; + + psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl; + for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) { + if (!osdmap->is_up(*p)) { + psdout(10) << " want_acting target osd." 
<< *p << " went down, resetting" << dendl; + post_event(advmap); + return transit< Reset >(); + } + } + return forward_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt) +{ + psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt) +{ + psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt) +{ + psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "waiting for pg acting set to change"); + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "WaitActingChange"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::WaitActingChange::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur); +} + +/*------Down--------*/ +PeeringState::Down::Down(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + + ps->state_clear(PG_STATE_PEERING); + ps->state_set(PG_STATE_DOWN); + + auto &prior_set = context< Peering >().prior_set; + ceph_assert(ps->blocked_by.empty()); + ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); + pl->publish_stats_to_osd(); +} + +void PeeringState::Down::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + + ps->state_clear(PG_STATE_DOWN); + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_down_latency, dur); + + ps->blocked_by.clear(); +} + +boost::statechart::result PeeringState::Down::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", + "not enough up instances of this PG to go active"); + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::Down::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "Down"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt) +{ + DECLARE_LOCALS; + + ceph_assert(ps->is_primary()); + epoch_t old_start = ps->info.history.last_epoch_started; + if (!ps->peer_info.count(infoevt.from) && + ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) { + ps->update_history(infoevt.notify.info.history); + } + // if we got something new to make pg escape down state + if (ps->info.history.last_epoch_started > old_start) { + psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl; + ps->state_clear(PG_STATE_DOWN); + ps->state_set(PG_STATE_PEERING); + return transit< GetInfo >(); + } + + return 
discard_event(); +} + + +/*------Incomplete--------*/ +PeeringState::Incomplete::Incomplete(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete") +{ + context< PeeringMachine >().log_enter(state_name); + DECLARE_LOCALS; + + ps->state_clear(PG_STATE_PEERING); + ps->state_set(PG_STATE_INCOMPLETE); + + PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; + ceph_assert(ps->blocked_by.empty()); + ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); + pl->publish_stats_to_osd(); +} + +boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) { + DECLARE_LOCALS; + int64_t poolnum = ps->info.pgid.pool(); + + // Reset if min_size turn smaller than previous value, pg might now be able to go active + if (!advmap.osdmap->have_pg_pool(poolnum) || + advmap.lastmap->get_pools().find(poolnum)->second.min_size > + advmap.osdmap->get_pools().find(poolnum)->second.min_size) { + post_event(advmap); + return transit< Reset >(); + } + + return forward_event(); +} + +boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) { + DECLARE_LOCALS; + psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl; + if (ps->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) { + // We got something new, try again! + return transit< GetLog >(); + } else { + return discard_event(); + } +} + +boost::statechart::result PeeringState::Incomplete::react( + const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "not enough complete instances of this PG"); + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "Incomplete"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::Incomplete::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + + ps->state_clear(PG_STATE_INCOMPLETE); + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_incomplete_latency, dur); + + ps->blocked_by.clear(); +} + +/*------GetMissing--------*/ +PeeringState::GetMissing::GetMissing(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing") +{ + context< PeeringMachine >().log_enter(state_name); + + DECLARE_LOCALS; + ps->log_weirdness(); + ceph_assert(!ps->acting_recovery_backfill.empty()); + eversion_t since; + for (auto i = ps->acting_recovery_backfill.begin(); + i != ps->acting_recovery_backfill.end(); + ++i) { + if (*i == ps->get_primary()) continue; + const pg_info_t& pi = ps->peer_info[*i]; + // reset this so to make sure the pg_missing_t is initialized and + // has the correct semantics even if we don't need to get a + // missing set from a shard. This way later additions due to + // lost+unfound delete work properly. + ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering(); + + if (pi.is_empty()) + continue; // no pg data, nothing divergent + + if (pi.last_update < ps->pg_log.get_tail()) { + psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl; + ps->peer_missing[*i].clear(); + continue; + } + if (pi.last_backfill == hobject_t()) { + psdout(10) << " osd." 
<< *i << " will fully backfill; can infer empty missing set" << dendl; + ps->peer_missing[*i].clear(); + continue; + } + + if (pi.last_update == pi.last_complete && // peer has no missing + pi.last_update == ps->info.last_update) { // peer is up to date + // replica has no missing and identical log as us. no need to + // pull anything. + // FIXME: we can do better here. if last_update==last_complete we + // can infer the rest! + psdout(10) << " osd." << *i << " has no missing, identical log" << dendl; + ps->peer_missing[*i].clear(); + continue; + } + + // We pull the log from the peer's last_epoch_started to ensure we + // get enough log to detect divergent updates. + since.epoch = pi.last_epoch_started; + ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing + if (pi.log_tail <= since) { + psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl; + context< PeeringMachine >().send_query( + i->osd, + pg_query_t( + pg_query_t::LOG, + i->shard, ps->pg_whoami.shard, + since, ps->info.history, + ps->get_osdmap_epoch())); + } else { + psdout(10) << " requesting fulllog+missing from osd." << *i + << " (want since " << since << " < log.tail " + << pi.log_tail << ")" << dendl; + context< PeeringMachine >().send_query( + i->osd, pg_query_t( + pg_query_t::FULLLOG, + i->shard, ps->pg_whoami.shard, + ps->info.history, ps->get_osdmap_epoch())); + } + peer_missing_requested.insert(*i); + ps->blocked_by.insert(i->osd); + } + + if (peer_missing_requested.empty()) { + if (ps->need_up_thru) { + psdout(10) << " still need up_thru update before going active" + << dendl; + post_event(NeedUpThru()); + return; + } + + // all good! + post_event(Activate(ps->get_osdmap_epoch())); + } else { + pl->publish_stats_to_osd(); + } +} + +boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt) +{ + DECLARE_LOCALS; + + peer_missing_requested.erase(logevt.from); + ps->proc_replica_log(logevt.msg->info, + logevt.msg->log, + std::move(logevt.msg->missing), + logevt.from); + + if (peer_missing_requested.empty()) { + if (ps->need_up_thru) { + psdout(10) << " still need up_thru update before going active" + << dendl; + post_event(NeedUpThru()); + } else { + psdout(10) << "Got last missing, don't need missing " + << "posting Activate" << dendl; + post_event(Activate(ps->get_osdmap_epoch())); + } + } + return discard_event(); +} + +boost::statechart::result PeeringState::GetMissing::react(const QueryState& q) +{ + DECLARE_LOCALS; + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("peer_missing_requested"); + for (auto p = peer_missing_requested.begin(); + p != peer_missing_requested.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_stream("osd") << *p; + if (ps->peer_missing.count(*p)) { + q.f->open_object_section("got_missing"); + ps->peer_missing[*p].dump(q.f); + q.f->close_section(); + } + q.f->close_section(); + } + q.f->close_section(); + + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "GetMising"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::GetMissing::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + 
pl->get_peering_perf().tinc(rs_getmissing_latency, dur); + ps->blocked_by.clear(); +} + +/*------WaitUpThru--------*/ +PeeringState::WaitUpThru::WaitUpThru(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am) +{ + DECLARE_LOCALS; + if (!ps->need_up_thru) { + post_event(Activate(ps->get_osdmap_epoch())); + } + return forward_event(); +} + +boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt) +{ + DECLARE_LOCALS; + psdout(10) << "Noting missing from osd." << logevt.from << dendl; + ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing)); + ps->peer_info[logevt.from] = logevt.msg->info; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd"); + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q) +{ + q.f->dump_string("state", "WaitUpThru"); + q.f->dump_bool("available_might_have_unfound", false); + return discard_event(); +} + +void PeeringState::WaitUpThru::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + pl->get_peering_perf().tinc(rs_waitupthru_latency, dur); +} + +/*----PeeringState::PeeringMachine Methods-----*/ +#undef dout_prefix +#define dout_prefix dpp->gen_prefix(*_dout) + +void PeeringState::PeeringMachine::log_enter(const char *state_name) +{ + DECLARE_LOCALS; + psdout(5) << "enter " << state_name << dendl; + pl->log_state_enter(state_name); +} + +void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time) +{ + DECLARE_LOCALS; + utime_t dur = ceph_clock_now() - enter_time; + psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl; + pl->log_state_exit(state_name, enter_time, event_count, event_time); + event_count = 0; + event_time = utime_t(); +} + +ostream &operator<<(ostream &out, const PeeringState &ps) { + out << "pg[" << ps.info + << " " << pg_vector_string(ps.up); + if (ps.acting != ps.up) + out << "/" << pg_vector_string(ps.acting); + if (ps.is_ec_pg()) + out << "p" << ps.get_primary(); + if (!ps.async_recovery_targets.empty()) + out << " async=[" << ps.async_recovery_targets << "]"; + if (!ps.backfill_targets.empty()) + out << " backfill=[" << ps.backfill_targets << "]"; + out << " r=" << ps.get_role(); + out << " lpr=" << ps.get_last_peering_reset(); + + if (ps.deleting) + out << " DELETING"; + + if (!ps.past_intervals.empty()) { + out << " pi=[" << ps.past_intervals.get_bounds() + << ")/" << ps.past_intervals.size(); + } + + if (ps.is_peered()) { + if (ps.last_update_ondisk != ps.info.last_update) + out << " luod=" << ps.last_update_ondisk; + if (ps.last_update_applied != ps.info.last_update) + out << " lua=" << ps.last_update_applied; + } + + if (ps.pg_log.get_tail() != ps.info.log_tail || + ps.pg_log.get_head() != ps.info.last_update) + out << " (info mismatch, " << ps.pg_log.get_log() << ")"; + + if (!ps.pg_log.get_log().empty()) { + if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) { + 
out << " (log bound mismatch, actual=[" + << ps.pg_log.get_log().log.begin()->version << "," + << ps.pg_log.get_log().log.rbegin()->version << "]"; + out << ")"; + } + } + + out << " crt=" << ps.pg_log.get_can_rollback_to(); + + if (ps.last_complete_ondisk != ps.info.last_complete) + out << " lcod " << ps.last_complete_ondisk; + + out << " mlcod " << ps.min_last_complete_ondisk; + + out << " " << pg_state_string(ps.get_state()); + if (ps.should_send_notify()) + out << " NOTIFY"; + + if (ps.prior_readable_until_ub != ceph::signedspan::zero()) { + out << " pruub " << ps.prior_readable_until_ub + << "@" << ps.get_prior_readable_down_osds(); + } + return out; +} + +std::vector PeeringState::get_replica_recovery_order() const +{ + std::vector> replicas_by_num_missing, + async_by_num_missing; + replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1); + for (auto &p : get_acting_recovery_backfill()) { + if (p == get_primary()) { + continue; + } + auto pm = get_peer_missing().find(p); + assert(pm != get_peer_missing().end()); + auto nm = pm->second.num_missing(); + if (nm != 0) { + if (is_async_recovery_target(p)) { + async_by_num_missing.push_back(make_pair(nm, p)); + } else { + replicas_by_num_missing.push_back(make_pair(nm, p)); + } + } + } + // sort by number of missing objects, in ascending order. + auto func = [](const std::pair &lhs, + const std::pair &rhs) { + return lhs.first < rhs.first; + }; + // acting goes first + std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func); + // then async_recovery_targets + std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func); + replicas_by_num_missing.insert(replicas_by_num_missing.end(), + async_by_num_missing.begin(), async_by_num_missing.end()); + + std::vector ret; + ret.reserve(replicas_by_num_missing.size()); + for (auto p : replicas_by_num_missing) { + ret.push_back(p.second); + } + return ret; +} + + diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h new file mode 100644 index 000000000..2cc340cb9 --- /dev/null +++ b/src/osd/PeeringState.h @@ -0,0 +1,2442 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/ceph_assert.h" +#include "include/common_fwd.h" + +#include "PGLog.h" +#include "PGStateUtils.h" +#include "PGPeeringEvent.h" +#include "osd_types.h" +#include "os/ObjectStore.h" +#include "OSDMap.h" +#include "MissingLoc.h" +#include "osd/osd_perf_counters.h" +#include "common/ostream_temp.h" + +struct PGPool { + epoch_t cached_epoch; + int64_t id; + std::string name; + + pg_pool_t info; + SnapContext snapc; // the default pool snapc, ready to go. 
+ + PGPool(OSDMapRef map, int64_t i, const pg_pool_t& info, + const std::string& name) + : cached_epoch(map->get_epoch()), + id(i), + name(name), + info(info) { + snapc = info.get_snap_context(); + } + + void update(OSDMapRef map); + + ceph::timespan get_readable_interval(ConfigProxy &conf) const { + double v = 0; + if (info.opts.get(pool_opts_t::READ_LEASE_INTERVAL, &v)) { + return ceph::make_timespan(v); + } else { + auto hbi = conf->osd_heartbeat_grace; + auto fac = conf->osd_pool_default_read_lease_ratio; + return ceph::make_timespan(hbi * fac); + } + } +}; + +struct PeeringCtx; + +// [primary only] content recovery state +struct BufferedRecoveryMessages { + ceph_release_t require_osd_release; + std::map> message_map; + + BufferedRecoveryMessages(ceph_release_t r) + : require_osd_release(r) { + } + BufferedRecoveryMessages(ceph_release_t r, PeeringCtx &ctx); + + void accept_buffered_messages(BufferedRecoveryMessages &m) { + for (auto &[target, ls] : m.message_map) { + auto &ovec = message_map[target]; + // put buffered messages in front + ls.reserve(ls.size() + ovec.size()); + ls.insert(ls.end(), ovec.begin(), ovec.end()); + ovec.clear(); + ovec.swap(ls); + } + } + + void send_osd_message(int target, MessageRef m) { + message_map[target].push_back(std::move(m)); + } + void send_notify(int to, const pg_notify_t &n); + void send_query(int to, spg_t spgid, const pg_query_t &q); + void send_info(int to, spg_t to_spgid, + epoch_t min_epoch, epoch_t cur_epoch, + const pg_info_t &info, + std::optional lease = {}, + std::optional lease_ack = {}); +}; + +struct HeartbeatStamps : public RefCountedObject { + mutable ceph::mutex lock = ceph::make_mutex("HeartbeatStamps::lock"); + + const int osd; + + // we maintain an upper and lower bound on the delta between our local + // mono_clock time (minus the startup_time) to the peer OSD's mono_clock + // time (minus its startup_time). + // + // delta is (remote_clock_time - local_clock_time), so that + // local_time + delta -> peer_time, and peer_time - delta -> local_time. + // + // we have an upper and lower bound value on this delta, meaning the + // value of the remote clock is somewhere between [my_time + lb, my_time + ub] + // + // conversely, if we have a remote timestamp T, then that is + // [T - ub, T - lb] in terms of the local clock. i.e., if you are + // substracting the delta, then take care that you swap the role of the + // lb and ub values. + + /// lower bound on peer clock - local clock + std::optional peer_clock_delta_lb; + + /// upper bound on peer clock - local clock + std::optional peer_clock_delta_ub; + + /// highest up_from we've seen from this rank + epoch_t up_from = 0; + + void print(std::ostream& out) const { + std::lock_guard l(lock); + out << "hbstamp(osd." << osd << " up_from " << up_from + << " peer_clock_delta ["; + if (peer_clock_delta_lb) { + out << *peer_clock_delta_lb; + } + out << ","; + if (peer_clock_delta_ub) { + out << *peer_clock_delta_ub; + } + out << "])"; + } + + void sent_ping(std::optional *delta_ub) { + std::lock_guard l(lock); + // the non-primaries need a lower bound on remote clock - local clock. if + // we assume the transit for the last ping_reply was + // instantaneous, that would be (the negative of) our last + // peer_clock_delta_lb value. 
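+    // Worked example of the bound above (illustrative values only): if the
+    // last ping_reply measured peer_clock_delta_lb = -3ms, i.e.
+    // (peer_clock - local_clock) >= -3ms, then negating gives
+    // (local_clock - peer_clock) <= +3ms.  That +3ms is what gets handed back
+    // as delta_ub: from the peer's point of view it is an upper bound on
+    // (remote_clock - local_clock), which it records in got_ping() as its
+    // peer_clock_delta_ub for this osd.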
+ if (peer_clock_delta_lb) { + *delta_ub = - *peer_clock_delta_lb; + } + } + + void got_ping(epoch_t this_up_from, + ceph::signedspan now, + ceph::signedspan peer_send_stamp, + std::optional delta_ub, + ceph::signedspan *out_delta_ub) { + std::lock_guard l(lock); + if (this_up_from < up_from) { + return; + } + if (this_up_from > up_from) { + up_from = this_up_from; + } + peer_clock_delta_lb = peer_send_stamp - now; + peer_clock_delta_ub = delta_ub; + *out_delta_ub = - *peer_clock_delta_lb; + } + + void got_ping_reply(ceph::signedspan now, + ceph::signedspan peer_send_stamp, + std::optional delta_ub) { + std::lock_guard l(lock); + peer_clock_delta_lb = peer_send_stamp - now; + peer_clock_delta_ub = delta_ub; + } + +private: + FRIEND_MAKE_REF(HeartbeatStamps); + HeartbeatStamps(int o) + : RefCountedObject(NULL), + osd(o) {} +}; +using HeartbeatStampsRef = ceph::ref_t; + +inline std::ostream& operator<<(std::ostream& out, const HeartbeatStamps& hb) +{ + hb.print(out); + return out; +} + + +struct PeeringCtx : BufferedRecoveryMessages { + ObjectStore::Transaction transaction; + HBHandle* handle = nullptr; + + PeeringCtx(ceph_release_t r) + : BufferedRecoveryMessages(r) {} + + PeeringCtx(const PeeringCtx &) = delete; + PeeringCtx &operator=(const PeeringCtx &) = delete; + + PeeringCtx(PeeringCtx &&) = default; + PeeringCtx &operator=(PeeringCtx &&) = default; + + void reset_transaction() { + transaction = ObjectStore::Transaction(); + } +}; + +/** + * Wraps PeeringCtx to hide the difference between buffering messages to + * be sent after flush or immediately. + */ +struct PeeringCtxWrapper { + utime_t start_time; + BufferedRecoveryMessages &msgs; + ObjectStore::Transaction &transaction; + HBHandle * const handle = nullptr; + + PeeringCtxWrapper(PeeringCtx &wrapped) : + msgs(wrapped), + transaction(wrapped.transaction), + handle(wrapped.handle) {} + + PeeringCtxWrapper(BufferedRecoveryMessages &buf, PeeringCtx &wrapped) + : msgs(buf), + transaction(wrapped.transaction), + handle(wrapped.handle) {} + + PeeringCtxWrapper(PeeringCtxWrapper &&ctx) = default; + + void send_osd_message(int target, MessageRef m) { + msgs.send_osd_message(target, std::move(m)); + } + void send_notify(int to, const pg_notify_t &n) { + msgs.send_notify(to, n); + } + void send_query(int to, spg_t spgid, const pg_query_t &q) { + msgs.send_query(to, spgid, q); + } + void send_info(int to, spg_t to_spgid, + epoch_t min_epoch, epoch_t cur_epoch, + const pg_info_t &info, + std::optional lease = {}, + std::optional lease_ack = {}) { + msgs.send_info(to, to_spgid, min_epoch, cur_epoch, info, + lease, lease_ack); + } +}; + +/* Encapsulates PG recovery process */ +class PeeringState : public MissingLoc::MappingInfo { +public: + struct PeeringListener : public EpochSource { + /// Prepare t with written information + virtual void prepare_write( + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ObjectStore::Transaction &t) = 0; + + /// Notify that info/history changed (generally to update scrub registration) + virtual void on_info_history_change() = 0; + /// Notify that a scrub has been requested + virtual void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) = 0; + + /// Return current snap_trimq size + virtual uint64_t get_snap_trimq_size() const = 0; + + /// Send cluster message to osd + virtual void send_cluster_message( + int osd, MessageRef m, epoch_t epoch, bool share_map_update=false) = 0; + 
/// Send pg_created to mon + virtual void send_pg_created(pg_t pgid) = 0; + + virtual ceph::signedspan get_mnow() = 0; + virtual HeartbeatStampsRef get_hb_stamps(int peer) = 0; + virtual void schedule_renew_lease(epoch_t plr, ceph::timespan delay) = 0; + virtual void queue_check_readable(epoch_t lpr, ceph::timespan delay) = 0; + virtual void recheck_readable() = 0; + + virtual unsigned get_target_pg_log_entries() const = 0; + + // ============ Flush state ================== + /** + * try_flush_or_schedule_async() + * + * If true, caller may assume all past operations on this pg + * have been flushed. Else, caller will receive an on_flushed() + * call once the flush has completed. + */ + virtual bool try_flush_or_schedule_async() = 0; + /// Arranges for a commit on t to call on_flushed() once flushed. + virtual void start_flush_on_transaction( + ObjectStore::Transaction &t) = 0; + /// Notification that all outstanding flushes for interval have completed + virtual void on_flushed() = 0; + + //============= Recovery ==================== + /// Arrange for even to be queued after delay + virtual void schedule_event_after( + PGPeeringEventRef event, + float delay) = 0; + /** + * request_local_background_io_reservation + * + * Request reservation at priority with on_grant queued on grant + * and on_preempt on preempt + */ + virtual void request_local_background_io_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) = 0; + /// Modify pending local background reservation request priority + virtual void update_local_background_io_priority( + unsigned priority) = 0; + /// Cancel pending local background reservation request + virtual void cancel_local_background_io_reservation() = 0; + + /** + * request_remote_background_io_reservation + * + * Request reservation at priority with on_grant queued on grant + * and on_preempt on preempt + */ + virtual void request_remote_recovery_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) = 0; + /// Cancel pending remote background reservation request + virtual void cancel_remote_recovery_reservation() = 0; + + /// Arrange for on_commit to be queued upon commit of t + virtual void schedule_event_on_commit( + ObjectStore::Transaction &t, + PGPeeringEventRef on_commit) = 0; + + //============================ HB ============================= + /// Update hb set to peers + virtual void update_heartbeat_peers(std::set peers) = 0; + + /// Std::set targets being probed in this interval + virtual void set_probe_targets(const std::set &probe_set) = 0; + /// Clear targets being probed in this interval + virtual void clear_probe_targets() = 0; + + /// Queue for a pg_temp of wanted + virtual void queue_want_pg_temp(const std::vector &wanted) = 0; + /// Clear queue for a pg_temp of wanted + virtual void clear_want_pg_temp() = 0; + + /// Arrange for stats to be shipped to mon to be updated for this pg + virtual void publish_stats_to_osd() = 0; + /// Clear stats to be shipped to mon for this pg + virtual void clear_publish_stats() = 0; + + /// Notification to check outstanding operation targets + virtual void check_recovery_sources(const OSDMapRef& newmap) = 0; + /// Notification to check outstanding blocklist + virtual void check_blocklisted_watchers() = 0; + /// Notification to clear state associated with primary + virtual void clear_primary_state() = 0; + + // =================== Event notification ==================== + virtual void on_pool_change() = 0; + virtual void 
on_role_change() = 0; + virtual void on_change(ObjectStore::Transaction &t) = 0; + virtual void on_activate(interval_set to_trim) = 0; + virtual void on_activate_complete() = 0; + virtual void on_new_interval() = 0; + virtual Context *on_clean() = 0; + virtual void on_activate_committed() = 0; + virtual void on_active_exit() = 0; + + // ====================== PG deletion ======================= + /// Notification of removal complete, t must be populated to complete removal + virtual void on_removal(ObjectStore::Transaction &t) = 0; + /// Perform incremental removal work + virtual std::pair do_delete_work( + ObjectStore::Transaction &t, ghobject_t _next) = 0; + + // ======================= PG Merge ========================= + virtual void clear_ready_to_merge() = 0; + virtual void set_not_ready_to_merge_target(pg_t pgid, pg_t src) = 0; + virtual void set_not_ready_to_merge_source(pg_t pgid) = 0; + virtual void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) = 0; + virtual void set_ready_to_merge_source(eversion_t lu) = 0; + + // ==================== Std::map notifications =================== + virtual void on_active_actmap() = 0; + virtual void on_active_advmap(const OSDMapRef &osdmap) = 0; + virtual epoch_t oldest_stored_osdmap() = 0; + + // ============ recovery reservation notifications ========== + virtual void on_backfill_reserved() = 0; + virtual void on_backfill_canceled() = 0; + virtual void on_recovery_reserved() = 0; + + // ================recovery space accounting ================ + virtual bool try_reserve_recovery_space( + int64_t primary_num_bytes, int64_t local_num_bytes) = 0; + virtual void unreserve_recovery_space() = 0; + + // ================== Peering log events ==================== + /// Get handler for rolling forward/back log entries + virtual PGLog::LogEntryHandlerRef get_log_handler( + ObjectStore::Transaction &t) = 0; + + // ============ On disk representation changes ============== + virtual void rebuild_missing_set_with_deletes(PGLog &pglog) = 0; + + // ======================= Logging ========================== + virtual PerfCounters &get_peering_perf() = 0; + virtual PerfCounters &get_perf_logger() = 0; + virtual void log_state_enter(const char *state) = 0; + virtual void log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) = 0; + virtual void dump_recovery_info(ceph::Formatter *f) const = 0; + + virtual OstreamTemp get_clog_info() = 0; + virtual OstreamTemp get_clog_error() = 0; + virtual OstreamTemp get_clog_debug() = 0; + + virtual ~PeeringListener() {} + }; + + struct QueryState : boost::statechart::event< QueryState > { + ceph::Formatter *f; + explicit QueryState(ceph::Formatter *f) : f(f) {} + void print(std::ostream *out) const { + *out << "Query"; + } + }; + + struct QueryUnfound : boost::statechart::event< QueryUnfound > { + ceph::Formatter *f; + explicit QueryUnfound(ceph::Formatter *f) : f(f) {} + void print(std::ostream *out) const { + *out << "QueryUnfound"; + } + }; + + struct AdvMap : boost::statechart::event< AdvMap > { + OSDMapRef osdmap; + OSDMapRef lastmap; + std::vector newup, newacting; + int up_primary, acting_primary; + AdvMap( + OSDMapRef osdmap, OSDMapRef lastmap, + std::vector& newup, int up_primary, + std::vector& newacting, int acting_primary): + osdmap(osdmap), lastmap(lastmap), + newup(newup), + newacting(newacting), + up_primary(up_primary), + acting_primary(acting_primary) {} + void print(std::ostream *out) const { + *out << "AdvMap"; + } + }; + + struct ActMap : 
boost::statechart::event< ActMap > { + ActMap() : boost::statechart::event< ActMap >() {} + void print(std::ostream *out) const { + *out << "ActMap"; + } + }; + struct Activate : boost::statechart::event< Activate > { + epoch_t activation_epoch; + explicit Activate(epoch_t q) : boost::statechart::event< Activate >(), + activation_epoch(q) {} + void print(std::ostream *out) const { + *out << "Activate from " << activation_epoch; + } + }; + struct ActivateCommitted : boost::statechart::event< ActivateCommitted > { + epoch_t epoch; + epoch_t activation_epoch; + explicit ActivateCommitted(epoch_t e, epoch_t ae) + : boost::statechart::event< ActivateCommitted >(), + epoch(e), + activation_epoch(ae) {} + void print(std::ostream *out) const { + *out << "ActivateCommitted from " << activation_epoch + << " processed at " << epoch; + } + }; +public: + struct UnfoundBackfill : boost::statechart::event { + explicit UnfoundBackfill() {} + void print(std::ostream *out) const { + *out << "UnfoundBackfill"; + } + }; + struct UnfoundRecovery : boost::statechart::event { + explicit UnfoundRecovery() {} + void print(std::ostream *out) const { + *out << "UnfoundRecovery"; + } + }; + + struct RequestScrub : boost::statechart::event { + scrub_level_t deep; + scrub_type_t repair; + explicit RequestScrub(bool d, bool r) : deep(scrub_level_t(d)), repair(scrub_type_t(r)) {} + void print(std::ostream *out) const { + *out << "RequestScrub(" << ((deep==scrub_level_t::deep) ? "deep" : "shallow") + << ((repair==scrub_type_t::do_repair) ? " repair)" : ")"); + } + }; + + TrivialEvent(Initialize) + TrivialEvent(GotInfo) + TrivialEvent(NeedUpThru) + TrivialEvent(Backfilled) + TrivialEvent(LocalBackfillReserved) + TrivialEvent(RejectTooFullRemoteReservation) + TrivialEvent(RequestBackfill) + TrivialEvent(RemoteRecoveryPreempted) + TrivialEvent(RemoteBackfillPreempted) + TrivialEvent(BackfillTooFull) + TrivialEvent(RecoveryTooFull) + + TrivialEvent(MakePrimary) + TrivialEvent(MakeStray) + TrivialEvent(NeedActingChange) + TrivialEvent(IsIncomplete) + TrivialEvent(IsDown) + + TrivialEvent(AllReplicasRecovered) + TrivialEvent(DoRecovery) + TrivialEvent(LocalRecoveryReserved) + TrivialEvent(AllRemotesReserved) + TrivialEvent(AllBackfillsReserved) + TrivialEvent(GoClean) + + TrivialEvent(AllReplicasActivated) + + TrivialEvent(IntervalFlush) + + TrivialEvent(DeleteStart) + TrivialEvent(DeleteSome) + + TrivialEvent(SetForceRecovery) + TrivialEvent(UnsetForceRecovery) + TrivialEvent(SetForceBackfill) + TrivialEvent(UnsetForceBackfill) + + TrivialEvent(DeleteReserved) + TrivialEvent(DeleteInterrupted) + + TrivialEvent(CheckReadable) + + void start_handle(PeeringCtx *new_ctx); + void end_handle(); + void begin_block_outgoing(); + void end_block_outgoing(); + void clear_blocked_outgoing(); + private: + + /* States */ + struct Initial; + class PeeringMachine : public boost::statechart::state_machine< PeeringMachine, Initial > { + public: + PeeringState *state; + PGStateHistory *state_history; + CephContext *cct; + spg_t spgid; + DoutPrefixProvider *dpp; + PeeringListener *pl; + + utime_t event_time; + uint64_t event_count; + + void clear_event_counters() { + event_time = utime_t(); + event_count = 0; + } + + void log_enter(const char *state_name); + void log_exit(const char *state_name, utime_t duration); + + PeeringMachine( + PeeringState *state, CephContext *cct, + spg_t spgid, + DoutPrefixProvider *dpp, + PeeringListener *pl, + PGStateHistory *state_history) : + state(state), + state_history(state_history), + cct(cct), spgid(spgid), + 
dpp(dpp), pl(pl), + event_count(0) {} + + /* Accessor functions for state methods */ + ObjectStore::Transaction& get_cur_transaction() { + ceph_assert(state->rctx); + return state->rctx->transaction; + } + + PeeringCtxWrapper &get_recovery_ctx() { + assert(state->rctx); + return *(state->rctx); + } + + void send_notify(int to, const pg_notify_t &n) { + ceph_assert(state->rctx); + state->rctx->send_notify(to, n); + } + void send_query(int to, const pg_query_t &query) { + state->rctx->send_query( + to, + spg_t(spgid.pgid, query.to), + query); + } + }; + friend class PeeringMachine; + + /* States */ + // Initial + // Reset + // Start + // Started + // Primary + // WaitActingChange + // Peering + // GetInfo + // GetLog + // GetMissing + // WaitUpThru + // Incomplete + // Active + // Activating + // Clean + // Recovered + // Backfilling + // WaitRemoteBackfillReserved + // WaitLocalBackfillReserved + // NotBackfilling + // NotRecovering + // Recovering + // WaitRemoteRecoveryReserved + // WaitLocalRecoveryReserved + // ReplicaActive + // RepNotRecovering + // RepRecovering + // RepWaitBackfillReserved + // RepWaitRecoveryReserved + // Stray + // ToDelete + // WaitDeleteReserved + // Deleting + // Crashed + + struct Crashed : boost::statechart::state< Crashed, PeeringMachine >, NamedState { + explicit Crashed(my_context ctx); + }; + + struct Reset; + + struct Initial : boost::statechart::state< Initial, PeeringMachine >, NamedState { + explicit Initial(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::transition< Initialize, Reset >, + boost::statechart::custom_reaction< NullEvt >, + boost::statechart::transition< boost::statechart::event_base, Crashed > + > reactions; + + boost::statechart::result react(const MNotifyRec&); + boost::statechart::result react(const MInfoRec&); + boost::statechart::result react(const MLogRec&); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Reset : boost::statechart::state< Reset, PeeringMachine >, NamedState { + explicit Reset(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< NullEvt >, + boost::statechart::custom_reaction< IntervalFlush >, + boost::statechart::transition< boost::statechart::event_base, Crashed > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const IntervalFlush&); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Start; + + struct Started : boost::statechart::state< Started, PeeringMachine, Start >, NamedState { + explicit Started(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< IntervalFlush >, + // ignored + boost::statechart::custom_reaction< NullEvt >, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction, + 
boost::statechart::custom_reaction, + boost::statechart::custom_reaction, + // crash + boost::statechart::transition< boost::statechart::event_base, Crashed > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const IntervalFlush&); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Primary; + struct Stray; + + struct Start : boost::statechart::state< Start, Started >, NamedState { + explicit Start(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::transition< MakePrimary, Primary >, + boost::statechart::transition< MakeStray, Stray > + > reactions; + }; + + struct Peering; + struct WaitActingChange; + struct Incomplete; + struct Down; + + struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState { + explicit Primary(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction + > reactions; + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const MNotifyRec&); + boost::statechart::result react(const SetForceRecovery&); + boost::statechart::result react(const UnsetForceRecovery&); + boost::statechart::result react(const SetForceBackfill&); + boost::statechart::result react(const UnsetForceBackfill&); + boost::statechart::result react(const RequestScrub&); + }; + + struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>, + NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MNotifyRec > + > reactions; + explicit WaitActingChange(my_context ctx); + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const MLogRec&); + boost::statechart::result react(const MInfoRec&); + boost::statechart::result react(const MNotifyRec&); + void exit(); + }; + + struct GetInfo; + struct Active; + + struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState { + PastIntervals::PriorSet prior_set; + bool history_les_bound; //< need osd_find_best_info_ignore_history_les + + explicit Peering(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::transition< Activate, Active >, + boost::statechart::custom_reaction< AdvMap > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const AdvMap &advmap); + }; + + struct WaitLocalRecoveryReserved; + struct Activating; + struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState { + explicit Active(my_context ctx); + void exit(); + + const 
std::set remote_shards_to_reserve_recovery; + const std::set remote_shards_to_reserve_backfill; + bool all_replicas_activated; + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MTrim >, + boost::statechart::custom_reaction< Backfilled >, + boost::statechart::custom_reaction< ActivateCommitted >, + boost::statechart::custom_reaction< AllReplicasActivated >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, + boost::statechart::custom_reaction< RemoteReservationRevoked>, + boost::statechart::custom_reaction< DoRecovery>, + boost::statechart::custom_reaction< RenewLease>, + boost::statechart::custom_reaction< MLeaseAck>, + boost::statechart::custom_reaction< CheckReadable> + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const MNotifyRec& notevt); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MTrim& trimevt); + boost::statechart::result react(const Backfilled&) { + return discard_event(); + } + boost::statechart::result react(const ActivateCommitted&); + boost::statechart::result react(const AllReplicasActivated&); + boost::statechart::result react(const RenewLease&); + boost::statechart::result react(const MLeaseAck&); + boost::statechart::result react(const DeferRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const DeferBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteReservationRevokedTooFull&) { + return discard_event(); + } + boost::statechart::result react(const RemoteReservationRevoked&) { + return discard_event(); + } + boost::statechart::result react(const DoRecovery&) { + return discard_event(); + } + boost::statechart::result react(const CheckReadable&); + void all_activated_and_committed(); + }; + + struct Clean : boost::statechart::state< Clean, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction + > reactions; + explicit Clean(my_context ctx); + void exit(); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Recovered : boost::statechart::state< Recovered, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< GoClean, Clean >, + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction< 
AllReplicasActivated > + > reactions; + explicit Recovered(my_context ctx); + void exit(); + boost::statechart::result react(const AllReplicasActivated&) { + post_event(GoClean()); + return forward_event(); + } + }; + + struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< Backfilled >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, + boost::statechart::custom_reaction< RemoteReservationRevoked> + > reactions; + explicit Backfilling(my_context ctx); + boost::statechart::result react(const RemoteReservationRejectedTooFull& evt) { + // for compat with old peers + post_event(RemoteReservationRevokedTooFull()); + return discard_event(); + } + void backfill_release_reservations(); + boost::statechart::result react(const Backfilled& evt); + boost::statechart::result react(const RemoteReservationRevokedTooFull& evt); + boost::statechart::result react(const RemoteReservationRevoked& evt); + boost::statechart::result react(const DeferBackfill& evt); + boost::statechart::result react(const UnfoundBackfill& evt); + void cancel_backfill(); + void exit(); + }; + + struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationRevoked >, + boost::statechart::transition< AllBackfillsReserved, Backfilling > + > reactions; + std::set::const_iterator backfill_osd_it; + explicit WaitRemoteBackfillReserved(my_context ctx); + void retry(); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved& evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull& evt); + boost::statechart::result react(const RemoteReservationRevoked& evt); + }; + + struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteBackfillReserved > + > reactions; + explicit WaitLocalBackfillReserved(my_context ctx); + boost::statechart::result react(const RemoteBackfillReserved& evt) { + /* no-op */ + return discard_event(); + } + void exit(); + }; + + struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>, + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull > + > reactions; + explicit NotBackfilling(my_context ctx); + void exit(); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const RemoteBackfillReserved& evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull& evt); + }; + + struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::transition< DoRecovery, 
WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< UnfoundRecovery > + > reactions; + explicit NotRecovering(my_context ctx); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const DeferRecovery& evt) { + /* no-op */ + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + /* no-op */ + return discard_event(); + } + void exit(); + }; + + struct ToDelete; + struct RepNotRecovering; + struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState { + explicit ReplicaActive(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MQuery >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MTrim >, + boost::statechart::custom_reaction< Activate >, + boost::statechart::custom_reaction< ActivateCommitted >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteBackfillPreempted >, + boost::statechart::custom_reaction< RemoteRecoveryPreempted >, + boost::statechart::custom_reaction< RecoveryDone >, + boost::statechart::transition, + boost::statechart::custom_reaction< MLease > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MTrim& trimevt); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const MQuery&); + boost::statechart::result react(const Activate&); + boost::statechart::result react(const ActivateCommitted&); + boost::statechart::result react(const MLease&); + boost::statechart::result react(const RecoveryDone&) { + return discard_event(); + } + boost::statechart::result react(const DeferRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const DeferBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteBackfillPreempted& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteRecoveryPreempted& evt) { + return discard_event(); + } + }; + + struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< RecoveryDone, RepNotRecovering >, + // for compat with old peers + boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >, + boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, + boost::statechart::custom_reaction< BackfillTooFull >, + boost::statechart::custom_reaction< RemoteRecoveryPreempted >, + boost::statechart::custom_reaction< RemoteBackfillPreempted > + > reactions; + explicit RepRecovering(my_context ctx); + 
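// [Editor's note] The reaction tables in the surrounding states all follow the
// same boost::statechart pattern: transition<E, S> switches state on event E,
// custom_reaction<E> routes E to a react() overload, and discard_event()
// swallows it. A minimal, self-contained toy machine showing that pattern is
// sketched below; the names (Machine, Idle, Running, EvGo, EvStop) are made up
// for illustration and are not part of this header, so the sketch is kept
// inside #if 0.
#if 0
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/custom_reaction.hpp>
#include <boost/mpl/list.hpp>

namespace sc = boost::statechart;

struct EvGo   : sc::event<EvGo> {};
struct EvStop : sc::event<EvStop> {};

struct Idle;   // initial state
struct Machine : sc::state_machine<Machine, Idle> {};

struct Running;
struct Idle : sc::simple_state<Idle, Machine> {
  typedef boost::mpl::list<
    sc::transition<EvGo, Running>,   // EvGo -> unconditionally enter Running
    sc::custom_reaction<EvStop>      // EvStop -> handled by react() below
  > reactions;
  sc::result react(const EvStop&) { return discard_event(); }  // ignore it
};

struct Running : sc::simple_state<Running, Machine> {
  typedef boost::mpl::list< sc::transition<EvStop, Idle> > reactions;
};

// usage: Machine m; m.initiate(); m.process_event(EvGo()); m.process_event(EvStop());
#endif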
boost::statechart::result react(const RemoteRecoveryPreempted &evt); + boost::statechart::result react(const BackfillTooFull &evt); + boost::statechart::result react(const RemoteBackfillPreempted &evt); + void exit(); + }; + + struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RejectTooFullRemoteReservation >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationCanceled > + > reactions; + explicit RepWaitBackfillReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved &evt); + boost::statechart::result react(const RejectTooFullRemoteReservation &evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull &evt); + boost::statechart::result react(const RemoteReservationCanceled &evt); + }; + + struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + // for compat with old peers + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationCanceled > + > reactions; + explicit RepWaitRecoveryReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteRecoveryReserved &evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull &evt) { + // for compat with old peers + post_event(RemoteReservationCanceled()); + return discard_event(); + } + boost::statechart::result react(const RemoteReservationCanceled &evt); + }; + + struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RequestRecoveryPrio >, + boost::statechart::custom_reaction< RequestBackfillPrio >, + boost::statechart::custom_reaction< RejectTooFullRemoteReservation >, + boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >, + boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers + > reactions; + explicit RepNotRecovering(my_context ctx); + boost::statechart::result react(const RequestRecoveryPrio &evt); + boost::statechart::result react(const RequestBackfillPrio &evt); + boost::statechart::result react(const RemoteBackfillReserved &evt) { + // my reservation completion raced with a RELEASE from primary + return discard_event(); + } + boost::statechart::result react(const RemoteRecoveryReserved &evt) { + // my reservation completion raced with a RELEASE from primary + return discard_event(); + } + boost::statechart::result react(const RejectTooFullRemoteReservation &evt); + void exit(); + }; + + struct Recovering : boost::statechart::state< Recovering, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< AllReplicasRecovered >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< RequestBackfill > + > reactions; + explicit 
Recovering(my_context ctx); + void exit(); + void release_reservations(bool cancel = false); + boost::statechart::result react(const AllReplicasRecovered &evt); + boost::statechart::result react(const DeferRecovery& evt); + boost::statechart::result react(const UnfoundRecovery& evt); + boost::statechart::result react(const RequestBackfill &evt); + }; + + struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + boost::statechart::transition< AllRemotesReserved, Recovering > + > reactions; + std::set::const_iterator remote_recovery_reservation_it; + explicit WaitRemoteRecoveryReserved(my_context ctx); + boost::statechart::result react(const RemoteRecoveryReserved &evt); + void exit(); + }; + + struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >, + boost::statechart::custom_reaction< RecoveryTooFull > + > reactions; + explicit WaitLocalRecoveryReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RecoveryTooFull &evt); + }; + + struct Activating : boost::statechart::state< Activating, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::transition< AllReplicasRecovered, Recovered >, + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved > + > reactions; + explicit Activating(my_context ctx); + void exit(); + }; + + struct Stray : boost::statechart::state< Stray, Started >, + NamedState { + explicit Stray(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< MQuery >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< RecoveryDone >, + boost::statechart::transition + > reactions; + boost::statechart::result react(const MQuery& query); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const RecoveryDone&) { + return discard_event(); + } + }; + + struct WaitDeleteReserved; + struct ToDelete : boost::statechart::state, NamedState { + unsigned priority = 0; + typedef boost::mpl::list < + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< ActivateCommitted >, + boost::statechart::custom_reaction< DeleteSome > + > reactions; + explicit ToDelete(my_context ctx); + boost::statechart::result react(const ActMap &evt); + boost::statechart::result react(const DeleteSome &evt) { + // happens if we drop out of Deleting due to reprioritization etc. + return discard_event(); + } + boost::statechart::result react(const ActivateCommitted&) { + // Can happens if we were activated as a stray but not actually pulled + // from prior to the pg going clean and sending a delete. 
+ return discard_event(); + } + void exit(); + }; + + struct Deleting; + struct WaitDeleteReserved : boost::statechart::state, NamedState { + typedef boost::mpl::list < + boost::statechart::transition + > reactions; + explicit WaitDeleteReserved(my_context ctx); + void exit(); + }; + + struct Deleting : boost::statechart::state, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< DeleteSome >, + boost::statechart::transition + > reactions; + ghobject_t next; + explicit Deleting(my_context ctx); + boost::statechart::result react(const DeleteSome &evt); + void exit(); + }; + + struct GetLog; + + struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState { + std::set peer_info_requested; + + explicit GetInfo(my_context ctx); + void exit(); + void get_infos(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::transition< GotInfo, GetLog >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::transition< IsDown, Down > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const MNotifyRec& infoevt); + }; + + struct GotLog : boost::statechart::event< GotLog > { + GotLog() : boost::statechart::event< GotLog >() {} + }; + + struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState { + pg_shard_t auth_log_shard; + boost::intrusive_ptr msg; + + explicit GetLog(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< GotLog >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::transition< NeedActingChange, WaitActingChange >, + boost::statechart::transition< IsIncomplete, Incomplete > + > reactions; + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const GotLog&); + }; + + struct WaitUpThru; + + struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState { + std::set peer_missing_requested; + + explicit GetMissing(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::transition< NeedUpThru, WaitUpThru > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const MLogRec& logevt); + }; + + struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState { + explicit WaitUpThru(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MLogRec > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const ActMap& am); + boost::statechart::result react(const MLogRec& logrec); + 
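// [Editor's note] The hand-written GotLog event above has the same shape that
// the TrivialEvent(...) invocations earlier in this header presumably expand
// to: an argument-free statechart event plus a print() helper. The macro's
// definition is not part of this hunk, so the expansion below is an
// assumption, kept in #if 0 and using a made-up event name.
#if 0
struct ExampleTrivialEvt : boost::statechart::event< ExampleTrivialEvt > {
  ExampleTrivialEvt() : boost::statechart::event< ExampleTrivialEvt >() {}
  void print(std::ostream *out) const {
    *out << "ExampleTrivialEvt";          // same pattern as ActMap/GotLog above
  }
};
#endif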
}; + + struct Down : boost::statechart::state< Down, Peering>, NamedState { + explicit Down(my_context ctx); + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< MNotifyRec > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const MNotifyRec& infoevt); + void exit(); + }; + + struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction< QueryUnfound >, + boost::statechart::custom_reaction< QueryState > + > reactions; + explicit Incomplete(my_context ctx); + boost::statechart::result react(const AdvMap &advmap); + boost::statechart::result react(const MNotifyRec& infoevt); + boost::statechart::result react(const QueryUnfound& q); + boost::statechart::result react(const QueryState& q); + void exit(); + }; + + PGStateHistory state_history; + CephContext* cct; + spg_t spgid; + DoutPrefixProvider *dpp; + PeeringListener *pl; + + /// context passed in by state machine caller + PeeringCtx *orig_ctx; + + /// populated if we are buffering messages pending a flush + std::optional messages_pending_flush; + + /** + * populated between start_handle() and end_handle(), points into + * the message lists for messages_pending_flush while blocking messages + * or into orig_ctx otherwise + */ + std::optional rctx; + + /** + * OSDMap state + */ + OSDMapRef osdmap_ref; ///< Reference to current OSDMap + PGPool pool; ///< Current pool state + epoch_t last_persisted_osdmap = 0; ///< Last osdmap epoch persisted + + + /** + * Peering state information + */ + int role = -1; ///< 0 = primary, 1 = replica, -1=none. 
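// [Editor's note] The orig_ctx / messages_pending_flush / rctx members above
// only point at valid data between start_handle() and end_handle(); the public
// handle_event() wrappers declared later in this class do that bracketing for
// the caller. A hedged caller-side sketch follows; the helper name and its
// arguments are illustrative, not upstream code, so it is kept in #if 0.
#if 0
void drive_one_event(PeeringState &ps,
                     PeeringCtx &ctx,
                     const boost::statechart::event_base &evt) {
  // handle_event() calls start_handle(&ctx), feeds the event to the
  // PeeringMachine, then calls end_handle(); while outgoing messages are
  // blocked for a flush they are staged in messages_pending_flush instead
  // of ctx.
  ps.handle_event(evt, &ctx);
}
#endif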
+ uint64_t state = 0; ///< PG_STATE_* + + pg_shard_t primary; ///< id/shard of primary + pg_shard_t pg_whoami; ///< my id/shard + pg_shard_t up_primary; ///< id/shard of primary of up set + std::vector up; ///< crush mapping without temp pgs + std::set upset; ///< up in set form + std::vector acting; ///< actual acting set for the current interval + std::set actingset; ///< acting in set form + + /// union of acting, recovery, and backfill targets + std::set acting_recovery_backfill; + + std::vector hb_stamps; + + ceph::signedspan readable_interval = ceph::signedspan::zero(); + + /// how long we can service reads in this interval + ceph::signedspan readable_until = ceph::signedspan::zero(); + + /// upper bound on any acting OSDs' readable_until in this interval + ceph::signedspan readable_until_ub = ceph::signedspan::zero(); + + /// upper bound from prior interval(s) + ceph::signedspan prior_readable_until_ub = ceph::signedspan::zero(); + + /// pg instances from prior interval(s) that may still be readable + std::set prior_readable_down_osds; + + /// [replica] upper bound we got from the primary (primary's clock) + ceph::signedspan readable_until_ub_from_primary = ceph::signedspan::zero(); + + /// [primary] last upper bound shared by primary to replicas + ceph::signedspan readable_until_ub_sent = ceph::signedspan::zero(); + + /// [primary] readable ub acked by acting set members + std::vector acting_readable_until_ub; + + bool send_notify = false; ///< True if a notify needs to be sent to the primary + + bool dirty_info = false; ///< small info structu on disk out of date + bool dirty_big_info = false; ///< big info structure on disk out of date + + pg_info_t info; ///< current pg info + pg_info_t last_written_info; ///< last written info + PastIntervals past_intervals; ///< information about prior pg mappings + PGLog pg_log; ///< pg log + + epoch_t last_peering_reset = 0; ///< epoch of last peering reset + + /// last_update that has committed; ONLY DEFINED WHEN is_active() + eversion_t last_update_ondisk; + eversion_t last_complete_ondisk; ///< last_complete that has committed. + eversion_t last_update_applied; ///< last_update readable + /// last version to which rollback_info trimming has been applied + eversion_t last_rollback_info_trimmed_to_applied; + + /// Counter to determine when pending flushes have completed + unsigned flushes_in_progress = 0; + + /** + * Primary state + */ + std::set stray_set; ///< non-acting osds that have PG data. + std::map peer_info; ///< info from peers (stray or prior) + std::map peer_bytes; ///< Peer's num_bytes from peer_info + std::set peer_purged; ///< peers purged + std::map peer_missing; ///< peer missing sets + std::set peer_log_requested; ///< logs i've requested (and start stamps) + std::set peer_missing_requested; ///< missing sets requested + + /// features supported by all peers + uint64_t peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + /// features supported by acting set + uint64_t acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + /// features supported by up and acting + uint64_t upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + + /// most recently consumed osdmap's require_osd_version + ceph_release_t last_require_osd_release = ceph_release_t::unknown; + + std::vector want_acting; ///< non-empty while peering needs a new acting set + + // acting_recovery_backfill contains shards that are acting, + // async recovery targets, or backfill targets. 
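// [Editor's note] Rough shape of the read-lease bookkeeping implied by the
// readable_* fields above (a sketch only; renew_lease() and
// recalc_readable_until() later in this header hold the real logic): the
// primary proposes now + readable_interval as the new upper bound, and the
// interval stays readable only up to the smallest bound acknowledged by the
// acting set. Function and variable names here are illustrative, so the
// sketch is kept in #if 0.
#if 0
ceph::signedspan example_readable_until(
  ceph::signedspan now,
  ceph::signedspan readable_interval,
  const std::vector<ceph::signedspan> &acked_ubs)  // acting members' acks
{
  ceph::signedspan ub = now + readable_interval;   // what the primary sends
  for (const auto &a : acked_ubs) {
    if (a < ub)
      ub = a;                                      // readable only while every
  }                                                // acting member agrees
  return ub;
}
#endif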
+ std::map peer_last_complete_ondisk; + + /// up: min over last_complete_ondisk, peer_last_complete_ondisk + eversion_t min_last_complete_ondisk; + /// point to which the log should be trimmed + eversion_t pg_trim_to; + + std::set blocked_by; ///< osds we are blocked by (for pg stats) + + bool need_up_thru = false; ///< true if osdmap with updated up_thru needed + + /// I deleted these strays; ignore racing PGInfo from them + std::set peer_activated; + + std::set backfill_targets; ///< osds to be backfilled + std::set async_recovery_targets; ///< osds to be async recovered + + /// osds which might have objects on them which are unfound on the primary + std::set might_have_unfound; + + bool deleting = false; /// true while in removing or OSD is shutting down + std::atomic deleted = {false}; /// true once deletion complete + + MissingLoc missing_loc; ///< information about missing objects + + bool backfill_reserved = false; + bool backfill_reserving = false; + + PeeringMachine machine; + + void update_osdmap_ref(OSDMapRef newmap) { + osdmap_ref = std::move(newmap); + } + + void update_heartbeat_peers(); + void query_unfound(Formatter *f, string state); + bool proc_replica_info( + pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch); + void remove_down_peer_info(const OSDMapRef &osdmap); + void check_recovery_sources(const OSDMapRef& map); + void set_last_peering_reset(); + void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap); + bool should_restart_peering( + int newupprimary, + int newactingprimary, + const std::vector& newup, + const std::vector& newacting, + OSDMapRef lastmap, + OSDMapRef osdmap); + void start_peering_interval( + const OSDMapRef lastmap, + const std::vector& newup, int up_primary, + const std::vector& newacting, int acting_primary, + ObjectStore::Transaction &t); + void on_new_interval(); + void clear_recovery_state(); + void clear_primary_state(); + void check_past_interval_bounds() const; + bool set_force_recovery(bool b); + bool set_force_backfill(bool b); + + /// clip calculated priority to reasonable range + int clamp_recovery_priority(int prio, int pool_recovery_prio, int max); + /// get log recovery reservation priority + unsigned get_recovery_priority(); + /// get backfill reservation priority + unsigned get_backfill_priority(); + /// get priority for pg deletion + unsigned get_delete_priority(); + + bool check_prior_readable_down_osds(const OSDMapRef& map); + + bool adjust_need_up_thru(const OSDMapRef osdmap); + PastIntervals::PriorSet build_prior(); + + void reject_reservation(); + + // acting std::set + std::map::const_iterator find_best_info( + const std::map &infos, + bool restrict_to_up_acting, + bool *history_les_bound) const; + + static void calc_ec_acting( + std::map::const_iterator auth_log_shard, + unsigned size, + const std::vector &acting, + const std::vector &up, + const std::map &all_info, + bool restrict_to_up_acting, + std::vector *want, + std::set *backfill, + std::set *acting_backfill, + std::ostream &ss); + + static std::pair::const_iterator, eversion_t> + select_replicated_primary( + map::const_iterator auth_log_shard, + uint64_t force_auth_primary_missing_objects, + const std::vector &up, + pg_shard_t up_primary, + const map &all_info, + const OSDMapRef osdmap, + ostream &ss); + + static void calc_replicated_acting( + map::const_iterator primary_shard, + eversion_t oldest_auth_log_entry, + unsigned size, + const std::vector &acting, + const std::vector &up, + pg_shard_t up_primary, + const std::map &all_info, + bool 
restrict_to_up_acting, + std::vector *want, + std::set *backfill, + std::set *acting_backfill, + const OSDMapRef osdmap, + const PGPool& pool, + std::ostream &ss); + static void calc_replicated_acting_stretch( + map::const_iterator primary_shard, + eversion_t oldest_auth_log_entry, + unsigned size, + const std::vector &acting, + const std::vector &up, + pg_shard_t up_primary, + const std::map &all_info, + bool restrict_to_up_acting, + std::vector *want, + std::set *backfill, + std::set *acting_backfill, + const OSDMapRef osdmap, + const PGPool& pool, + std::ostream &ss); + + void choose_async_recovery_ec( + const std::map &all_info, + const pg_info_t &auth_info, + std::vector *want, + std::set *async_recovery, + const OSDMapRef osdmap) const; + void choose_async_recovery_replicated( + const std::map &all_info, + const pg_info_t &auth_info, + std::vector *want, + std::set *async_recovery, + const OSDMapRef osdmap) const; + + bool recoverable(const std::vector &want) const; + bool choose_acting(pg_shard_t &auth_log_shard, + bool restrict_to_up_acting, + bool *history_les_bound, + bool request_pg_temp_change_only = false); + + bool search_for_missing( + const pg_info_t &oinfo, const pg_missing_t &omissing, + pg_shard_t fromosd, + PeeringCtxWrapper &rctx); + void build_might_have_unfound(); + void log_weirdness(); + void activate( + ObjectStore::Transaction& t, + epoch_t activation_epoch, + PeeringCtxWrapper &ctx); + + void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead); + void merge_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, + pg_log_t&& olog, pg_shard_t from); + + void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info); + void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, + pg_log_t&& olog, pg_missing_t&& omissing, + pg_shard_t from); + void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog, + pg_missing_t&& omissing, pg_shard_t from); + + void calc_min_last_complete_ondisk() { + eversion_t min = last_complete_ondisk; + ceph_assert(!acting_recovery_backfill.empty()); + for (std::set::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == get_primary()) continue; + if (peer_last_complete_ondisk.count(*i) == 0) + return; // we don't have complete info + eversion_t a = peer_last_complete_ondisk[*i]; + if (a < min) + min = a; + } + if (min == min_last_complete_ondisk) + return; + min_last_complete_ondisk = min; + return; + } + + void fulfill_info( + pg_shard_t from, const pg_query_t &query, + std::pair ¬ify_info); + void fulfill_log( + pg_shard_t from, const pg_query_t &query, epoch_t query_epoch); + void fulfill_query(const MQuery& q, PeeringCtxWrapper &rctx); + + void try_mark_clean(); + + void update_blocked_by(); + void update_calc_stats(); + + void add_log_entry(const pg_log_entry_t& e, bool applied); + + void calc_trim_to(); + void calc_trim_to_aggressive(); + +public: + PeeringState( + CephContext *cct, + pg_shard_t pg_whoami, + spg_t spgid, + const PGPool &pool, + OSDMapRef curmap, + DoutPrefixProvider *dpp, + PeeringListener *pl); + + /// Process evt + void handle_event(const boost::statechart::event_base &evt, + PeeringCtx *rctx) { + start_handle(rctx); + machine.process_event(evt); + end_handle(); + } + + /// Process evt + void handle_event(PGPeeringEventRef evt, + PeeringCtx *rctx) { + start_handle(rctx); + machine.process_event(evt->get_event()); + end_handle(); + } + + /// Init fresh instance of PG + void init( + int role, + const std::vector& newup, int 
new_up_primary, + const std::vector& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + bool backfill, + ObjectStore::Transaction &t); + + /// Init pg instance from disk state + template + auto init_from_disk_state( + pg_info_t &&info_from_disk, + PastIntervals &&past_intervals_from_disk, + F &&pg_log_init) { + info = std::move(info_from_disk); + last_written_info = info; + past_intervals = std::move(past_intervals_from_disk); + auto ret = pg_log_init(pg_log); + log_weirdness(); + return ret; + } + + /// Std::set initial primary/acting + void init_primary_up_acting( + const std::vector &newup, + const std::vector &newacting, + int new_up_primary, + int new_acting_primary); + void init_hb_stamps(); + + /// Std::set initial role + void set_role(int r) { + role = r; + } + + /// Std::set predicates used for determining readable and recoverable + void set_backend_predicates( + IsPGReadablePredicate *is_readable, + IsPGRecoverablePredicate *is_recoverable) { + missing_loc.set_backend_predicates(is_readable, is_recoverable); + } + + /// Send current pg_info to peers + void share_pg_info(); + + /// Get stats for child pgs + void start_split_stats( + const std::set& childpgs, std::vector *out); + + /// Update new child with stats + void finish_split_stats( + const object_stat_sum_t& stats, ObjectStore::Transaction &t); + + /// Split state for child_pgid into *child + void split_into( + pg_t child_pgid, PeeringState *child, unsigned split_bits); + + /// Merge state from sources + void merge_from( + std::map& sources, + PeeringCtx &rctx, + unsigned split_bits, + const pg_merge_meta_t& last_pg_merge_meta); + + /// Permit stray replicas to purge now unnecessary state + void purge_strays(); + + /** + * update_stats + * + * Mechanism for updating stats and/or history. Pass t to mark + * dirty and write out. Return true if stats should be published + * to the osd. + */ + void update_stats( + std::function f, + ObjectStore::Transaction *t = nullptr); + + /** + * adjust_purged_snaps + * + * Mechanism for updating purged_snaps. Marks dirty_info, big_dirty_info. + */ + void adjust_purged_snaps( + std::function &snaps)> f); + + /// Updates info.hit_set to hset_history, does not dirty + void update_hset(const pg_hit_set_history_t &hset_history); + + /// Get all pg_shards that needs recovery + std::vector get_replica_recovery_order() const; + + /** + * update_history + * + * Merges new_history into info.history clearing past_intervals and + * dirtying as needed. + * + * Calls PeeringListener::on_info_history_change() + */ + void update_history(const pg_history_t& new_history); + + /** + * prepare_stats_for_publish + * + * Returns updated pg_stat_t if stats have changed since + * pg_stats_publish adding in unstable_stats. 
+ */ + std::optional prepare_stats_for_publish( + bool pg_stats_publish_valid, + const pg_stat_t &pg_stats_publish, + const object_stat_collection_t &unstable_stats); + + /** + * Merge entries updating missing as necessary on all + * acting_recovery_backfill logs and missings (also missing_loc) + */ + bool append_log_entries_update_missing( + const mempool::osd_pglog::list &entries, + ObjectStore::Transaction &t, + std::optional trim_to, + std::optional roll_forward_to); + + void append_log_with_trim_to_updated( + std::vector&& log_entries, + eversion_t roll_forward_to, + ObjectStore::Transaction &t, + bool transaction_applied, + bool async) { + update_trim_to(); + append_log(std::move(log_entries), pg_trim_to, roll_forward_to, + min_last_complete_ondisk, t, transaction_applied, async); + } + + /** + * Updates local log to reflect new write from primary. + */ + void append_log( + std::vector&& logv, + eversion_t trim_to, + eversion_t roll_forward_to, + eversion_t min_last_complete_ondisk, + ObjectStore::Transaction &t, + bool transaction_applied, + bool async); + + /** + * retrieve the min last_backfill among backfill targets + */ + hobject_t earliest_backfill() const; + + + /** + * Updates local log/missing to reflect new oob log update from primary + */ + void merge_new_log_entries( + const mempool::osd_pglog::list &entries, + ObjectStore::Transaction &t, + std::optional trim_to, + std::optional roll_forward_to); + + /// Update missing set to reflect e (TODOSAM: not sure why this is needed) + void add_local_next_event(const pg_log_entry_t& e) { + pg_log.missing_add_next_entry(e); + } + + /// Update log trim boundary + void update_trim_to() { + bool hard_limit = (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT)); + if (hard_limit) + calc_trim_to_aggressive(); + else + calc_trim_to(); + } + + /// Pre-process pending update on hoid represented by logv + void pre_submit_op( + const hobject_t &hoid, + const std::vector& logv, + eversion_t at_version); + + /// Signal that oid has been locally recovered to version v + void recover_got( + const hobject_t &oid, eversion_t v, + bool is_delete, + ObjectStore::Transaction &t); + + /// Signal that oid has been recovered on peer to version + void on_peer_recover( + pg_shard_t peer, + const hobject_t &soid, + const eversion_t &version); + + /// Notify that soid is being recovered on peer + void begin_peer_recover( + pg_shard_t peer, + const hobject_t soid); + + /// Pull missing sets from all candidate peers + bool discover_all_missing( + BufferedRecoveryMessages &rctx); + + /// Notify that hoid has been fully recocovered + void object_recovered( + const hobject_t &hoid, + const object_stat_sum_t &stat_diff) { + info.stats.stats.sum.add(stat_diff); + missing_loc.recovered(hoid); + } + + /// Update info/stats to reflect backfill progress + void update_backfill_progress( + const hobject_t &updated_backfill, + const pg_stat_t &updated_stats, + bool preserve_local_num_bytes, + ObjectStore::Transaction &t); + + /// Update info/stats to reflect completed backfill on hoid + void update_complete_backfill_object_stats( + const hobject_t &hoid, + const pg_stat_t &stats); + + /// Update last_backfill for peer to new_last_backfill + void update_peer_last_backfill( + pg_shard_t peer, + const hobject_t &new_last_backfill); + + /// Update info.stats with delta_stats for operation on soid + void apply_op_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats); + + /** + * force_object_missing + * + * Force oid on peer to be missing at version. 
If the object does not + * currently need recovery, either candidates if provided or the remainder + * of the acting std::set will be deemed to have the object. + */ + void force_object_missing( + const pg_shard_t &peer, + const hobject_t &oid, + eversion_t version) { + force_object_missing(std::set{peer}, oid, version); + } + void force_object_missing( + const std::set &peer, + const hobject_t &oid, + eversion_t version); + + /// Update state prior to backfilling soid on targets + void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &version, + const std::vector &targets); + + /// Std::set targets with the right version for revert (see recover_primary) + void set_revert_with_targets( + const hobject_t &soid, + const std::set &good_peers); + + /// Update lcod for fromosd + void update_peer_last_complete_ondisk( + pg_shard_t fromosd, + eversion_t lcod) { + peer_last_complete_ondisk[fromosd] = lcod; + } + + /// Update lcod + void update_last_complete_ondisk( + eversion_t lcod) { + last_complete_ondisk = lcod; + } + + /// Update state to reflect recovery up to version + void recovery_committed_to(eversion_t version); + + /// Mark recovery complete + void local_recovery_complete() { + info.last_complete = info.last_update; + } + + /// Update last_requested pointer to v + void set_last_requested(version_t v) { + pg_log.set_last_requested(v); + } + + /// Write dirty state to t + void write_if_dirty(ObjectStore::Transaction& t); + + /// Mark write completed to v with persisted lc + void complete_write(eversion_t v, eversion_t lc); + + /// Update local write applied pointer + void local_write_applied(eversion_t v) { + last_update_applied = v; + } + + /// Updates peering state with new map + void advance_map( + OSDMapRef osdmap, ///< [in] new osdmap + OSDMapRef lastmap, ///< [in] prev osdmap + std::vector& newup, ///< [in] new up set + int up_primary, ///< [in] new up primary + std::vector& newacting, ///< [in] new acting + int acting_primary, ///< [in] new acting primary + PeeringCtx &rctx ///< [out] recovery context + ); + + /// Activates most recently updated map + void activate_map( + PeeringCtx &rctx ///< [out] recovery context + ); + + /// resets last_persisted_osdmap + void reset_last_persisted() { + last_persisted_osdmap = 0; + dirty_info = true; + dirty_big_info = true; + } + + /// Signal shutdown beginning + void shutdown() { + deleting = true; + } + + /// Signal shutdown complete + void set_delete_complete() { + deleted = true; + } + + /// Dirty info and write out + void force_write_state(ObjectStore::Transaction &t) { + dirty_info = true; + dirty_big_info = true; + write_if_dirty(t); + } + + /// Get current interval's readable_until + ceph::signedspan get_readable_until() const { + return readable_until; + } + + /// Get prior intervals' readable_until upper bound + ceph::signedspan get_prior_readable_until_ub() const { + return prior_readable_until_ub; + } + + /// Get prior intervals' readable_until down OSDs of note + const std::set& get_prior_readable_down_osds() const { + return prior_readable_down_osds; + } + + /// Reset prior intervals' readable_until upper bound (e.g., bc it passed) + void clear_prior_readable_until_ub() { + prior_readable_until_ub = ceph::signedspan::zero(); + prior_readable_down_osds.clear(); + info.history.prior_readable_until_ub = ceph::signedspan::zero(); + } + + void renew_lease(ceph::signedspan now) { + bool was_min = (readable_until_ub == readable_until); + readable_until_ub_sent = now + readable_interval; + if (was_min) { + 
recalc_readable_until(); + } + } + + void send_lease(); + void schedule_renew_lease(); + + pg_lease_t get_lease() { + return pg_lease_t(readable_until, readable_until_ub_sent, readable_interval); + } + + void proc_lease(const pg_lease_t& l); + void proc_lease_ack(int from, const pg_lease_ack_t& la); + void proc_renew_lease(); + + pg_lease_ack_t get_lease_ack() { + return pg_lease_ack_t(readable_until_ub_from_primary); + } + + /// [primary] recalc readable_until[_ub] for the current interval + void recalc_readable_until(); + + //============================ const helpers ================================ + const char *get_current_state() const { + return state_history.get_current_state(); + } + epoch_t get_last_peering_reset() const { + return last_peering_reset; + } + eversion_t get_last_rollback_info_trimmed_to_applied() const { + return last_rollback_info_trimmed_to_applied; + } + /// Returns stable reference to internal pool structure + const PGPool &get_pool() const { + return pool; + } + /// Returns reference to current osdmap + const OSDMapRef &get_osdmap() const { + ceph_assert(osdmap_ref); + return osdmap_ref; + } + /// Returns epoch of current osdmap + epoch_t get_osdmap_epoch() const { + return get_osdmap()->get_epoch(); + } + + bool is_ec_pg() const override { + return pool.info.is_erasure(); + } + int get_pg_size() const override { + return pool.info.size; + } + bool is_deleting() const { + return deleting; + } + bool is_deleted() const { + return deleted; + } + const std::set &get_upset() const override { + return upset; + } + bool is_acting_recovery_backfill(pg_shard_t osd) const { + return acting_recovery_backfill.count(osd); + } + bool is_acting(pg_shard_t osd) const { + return has_shard(pool.info.is_erasure(), acting, osd); + } + bool is_up(pg_shard_t osd) const { + return has_shard(pool.info.is_erasure(), up, osd); + } + static bool has_shard(bool ec, const std::vector& v, pg_shard_t osd) { + if (ec) { + return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd; + } else { + return std::find(v.begin(), v.end(), osd.osd) != v.end(); + } + } + const PastIntervals& get_past_intervals() const { + return past_intervals; + } + /// acting osd that is not the primary + bool is_nonprimary() const { + return role >= 0 && pg_whoami != primary; + } + /// primary osd + bool is_primary() const { + return pg_whoami == primary; + } + bool pg_has_reset_since(epoch_t e) const { + return deleted || e < get_last_peering_reset(); + } + + int get_role() const { + return role; + } + const std::vector &get_acting() const { + return acting; + } + const std::set &get_actingset() const { + return actingset; + } + int get_acting_primary() const { + return primary.osd; + } + pg_shard_t get_primary() const { + return primary; + } + const std::vector &get_up() const { + return up; + } + int get_up_primary() const { + return up_primary.osd; + } + + bool is_backfill_target(pg_shard_t osd) const { + return backfill_targets.count(osd); + } + const std::set &get_backfill_targets() const { + return backfill_targets; + } + bool is_async_recovery_target(pg_shard_t peer) const { + return async_recovery_targets.count(peer); + } + const std::set &get_async_recovery_targets() const { + return async_recovery_targets; + } + const std::set &get_acting_recovery_backfill() const { + return acting_recovery_backfill; + } + + const PGLog &get_pg_log() const { + return pg_log; + } + + bool state_test(uint64_t m) const { return (state & m) != 0; } + void state_set(uint64_t m) { state |= m; } + void 
state_clear(uint64_t m) { state &= ~m; } + + bool is_complete() const { return info.last_complete == info.last_update; } + bool should_send_notify() const { return send_notify; } + + uint64_t get_state() const { return state; } + bool is_active() const { return state_test(PG_STATE_ACTIVE); } + bool is_activating() const { return state_test(PG_STATE_ACTIVATING); } + bool is_peering() const { return state_test(PG_STATE_PEERING); } + bool is_down() const { return state_test(PG_STATE_DOWN); } + bool is_recovery_unfound() const { + return state_test(PG_STATE_RECOVERY_UNFOUND); + } + bool is_backfilling() const { + return state_test(PG_STATE_BACKFILLING); + } + bool is_backfill_unfound() const { + return state_test(PG_STATE_BACKFILL_UNFOUND); + } + bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); } + bool is_clean() const { return state_test(PG_STATE_CLEAN); } + bool is_degraded() const { return state_test(PG_STATE_DEGRADED); } + bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); } + bool is_remapped() const { return state_test(PG_STATE_REMAPPED); } + bool is_peered() const { + return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED); + } + bool is_recovering() const { return state_test(PG_STATE_RECOVERING); } + bool is_premerge() const { return state_test(PG_STATE_PREMERGE); } + bool is_repair() const { return state_test(PG_STATE_REPAIR); } + bool is_empty() const { return info.last_update == eversion_t(0,0); } + + bool get_need_up_thru() const { + return need_up_thru; + } + + bool is_forced_recovery_or_backfill() const { + return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL); + } + + bool is_backfill_reserved() const { + return backfill_reserved; + } + + bool is_backfill_reserving() const { + return backfill_reserving; + } + + ceph_release_t get_last_require_osd_release() const { + return last_require_osd_release; + } + + const pg_info_t &get_info() const { + return info; + } + + const decltype(peer_info) &get_peer_info() const { + return peer_info; + } + const decltype(peer_missing) &get_peer_missing() const { + return peer_missing; + } + const pg_missing_const_i &get_peer_missing(const pg_shard_t &peer) const { + if (peer == pg_whoami) { + return pg_log.get_missing(); + } else { + assert(peer_missing.count(peer)); + return peer_missing.find(peer)->second; + } + } + const pg_info_t&get_peer_info(pg_shard_t peer) const { + assert(peer_info.count(peer)); + return peer_info.find(peer)->second; + } + bool has_peer_info(pg_shard_t peer) const { + return peer_info.count(peer); + } + + bool needs_recovery() const; + bool needs_backfill() const; + + /** + * Returns whether a particular object can be safely read on this replica + */ + bool can_serve_replica_read(const hobject_t &hoid) { + ceph_assert(!is_primary()); + return !pg_log.get_log().has_write_since( + hoid, get_min_last_complete_ondisk()); + } + + /** + * Returns whether the current acting set is able to go active + * and serve writes. It needs to satisfy min_size and any + * applicable stretch cluster constraints. + */ + bool acting_set_writeable() { + return (actingset.size() >= pool.info.min_size) && + (pool.info.stretch_set_can_peer(acting, *get_osdmap(), NULL)); + } + + /** + * Returns whether all peers which might have unfound objects have been + * queried or marked lost. 
+ */ + bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const; + bool all_missing_unfound() const { + const auto& missing = pg_log.get_missing(); + if (!missing.have_missing()) + return false; + for (auto& m : missing.get_items()) { + if (!missing_loc.is_unfound(m.first)) + return false; + } + return true; + } + + bool perform_deletes_during_peering() const { + return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)); + } + + + bool have_unfound() const { + return missing_loc.have_unfound(); + } + uint64_t get_num_unfound() const { + return missing_loc.num_unfound(); + } + + bool have_missing() const { + return pg_log.get_missing().num_missing() > 0; + } + unsigned int get_num_missing() const { + return pg_log.get_missing().num_missing(); + } + + const MissingLoc &get_missing_loc() const { + return missing_loc; + } + + const MissingLoc::missing_by_count_t &get_missing_by_count() const { + return missing_loc.get_missing_by_count(); + } + + eversion_t get_min_last_complete_ondisk() const { + return min_last_complete_ondisk; + } + + eversion_t get_pg_trim_to() const { + return pg_trim_to; + } + + eversion_t get_last_update_applied() const { + return last_update_applied; + } + + eversion_t get_last_update_ondisk() const { + return last_update_ondisk; + } + + bool debug_has_dirty_state() const { + return dirty_info || dirty_big_info; + } + + std::string get_pg_state_string() const { + return pg_state_string(state); + } + + /// Dump representation of past_intervals to out + void print_past_intervals(std::ostream &out) const { + out << "[" << past_intervals.get_bounds() + << ")/" << past_intervals.size(); + } + + void dump_history(ceph::Formatter *f) const { + state_history.dump(f); + } + + /// Dump formatted peering status + void dump_peering_state(ceph::Formatter *f); + +private: + /// Mask feature vector with feature set from new peer + void apply_peer_features(uint64_t f) { peer_features &= f; } + + /// Reset feature vector to default + void reset_min_peer_features() { + peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + } +public: + /// Get feature vector common to all known peers with this pg + uint64_t get_min_peer_features() const { return peer_features; } + + /// Get feature vector common to acting set + uint64_t get_min_acting_features() const { return acting_features; } + + /// Get feature vector common to up/acting set + uint64_t get_min_upacting_features() const { return upacting_features; } + + + // Flush control interface +private: + /** + * Start additional flush (blocks needs_flush/activation until + * complete_flush is called once for each start_flush call as + * required by start_flush_on_transaction). 
+ */ + void start_flush(ObjectStore::Transaction &t) { + flushes_in_progress++; + pl->start_flush_on_transaction(t); + } +public: + /// True if there are outstanding flushes + bool needs_flush() const { + return flushes_in_progress > 0; + } + /// Must be called once per start_flush + void complete_flush(); + + friend std::ostream &operator<<(std::ostream &out, const PeeringState &ps); +}; + +std::ostream &operator<<(std::ostream &out, const PeeringState &ps); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc new file mode 100644 index 000000000..c1673bf70 --- /dev/null +++ b/src/osd/PrimaryLogPG.cc @@ -0,0 +1,15470 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "boost/tuple/tuple.hpp" +#include "boost/intrusive_ptr.hpp" +#include "PG.h" +#include "pg_scrubber.h" +#include "PrimaryLogPG.h" +#include "OSD.h" +#include "PrimaryLogScrub.h" +#include "OpRequest.h" +#include "ScrubStore.h" +#include "Session.h" +#include "objclass/objclass.h" +#include "osd/ClassHandler.h" + +#include "cls/cas/cls_cas_ops.h" +#include "common/ceph_crypto.h" +#include "common/errno.h" +#include "common/scrub_types.h" +#include "common/perf_counters.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDPGTrim.h" +#include "messages/MOSDPGScan.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" +#include "messages/MCommandReply.h" +#include "messages/MOSDScrubReserve.h" +#include "common/EventTrace.h" + +#include "common/config.h" +#include "include/compat.h" +#include "mon/MonClient.h" +#include "osdc/Objecter.h" +#include "json_spirit/json_spirit_value.h" +#include "json_spirit/json_spirit_reader.h" +#include "include/ceph_assert.h" // json_spirit clobbers it +#include "include/rados/rados_types.hpp" + +#ifdef WITH_LTTNG +#include "tracing/osd.h" +#else +#define tracepoint(...) +#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap() +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +#include +#include + +#include +#ifdef HAVE_JAEGER +#include "common/tracer.h" +#endif + +#include + +MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd); + +using std::list; +using std::ostream; +using std::pair; +using std::make_pair; +using std::map; +using std::ostringstream; +using std::set; +using std::string; +using std::string_view; +using std::stringstream; +using std::unique_ptr; +using std::vector; + +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::Formatter; +using ceph::decode; +using ceph::decode_noclear; +using ceph::encode; +using ceph::encode_destructively; + +using namespace ceph::osd::scheduler; +using TOPNSPC::common::cmd_getval; + +template +static ostream& _prefix(std::ostream *_dout, T *pg) { + return pg->gen_prefix(*_dout); +} + +/** + * The CopyCallback class defines an interface for completions to the + * copy_start code. 
Users of the copy infrastructure must implement + * one and give an instance of the class to start_copy. + * + * The implementer is responsible for making sure that the CopyCallback + * can associate itself with the correct copy operation. + */ +class PrimaryLogPG::CopyCallback : public GenContext { +protected: + CopyCallback() {} + /** + * results.get<0>() is the return code: 0 for success; -ECANCELED if + * the operation was cancelled by the local OSD; -errno for other issues. + * results.get<1>() is a pointer to a CopyResults object, which you are + * responsible for deleting. + */ + void finish(CopyCallbackResults results_) override = 0; + +public: + /// Provide the final size of the copied object to the CopyCallback + ~CopyCallback() override {} +}; + +template +class PrimaryLogPG::BlessedGenContext : public GenContext { + PrimaryLogPGRef pg; + unique_ptr> c; + epoch_t e; +public: + BlessedGenContext(PrimaryLogPG *pg, GenContext *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(T t) override { + std::scoped_lock locker{*pg}; + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(t); + } + bool sync_finish(T t) { + // we assume here all blessed/wrapped Contexts can complete synchronously. + c.release()->complete(t); + return true; + } +}; + +GenContext *PrimaryLogPG::bless_gencontext( + GenContext *c) { + return new BlessedGenContext( + this, c, get_osdmap_epoch()); +} + +template +class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext { + PrimaryLogPGRef pg; + unique_ptr> c; + epoch_t e; +public: + UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(T t) override { + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(t); + } + bool sync_finish(T t) { + // we assume here all blessed/wrapped Contexts can complete synchronously. + c.release()->complete(t); + return true; + } +}; + +GenContext *PrimaryLogPG::bless_unlocked_gencontext( + GenContext *c) { + return new UnlockedBlessedGenContext( + this, c, get_osdmap_epoch()); +} + +class PrimaryLogPG::BlessedContext : public Context { + PrimaryLogPGRef pg; + unique_ptr c; + epoch_t e; +public: + BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(int r) override { + std::scoped_lock locker{*pg}; + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(r); + } + bool sync_finish(int r) override { + // we assume here all blessed/wrapped Contexts can complete synchronously. 
+ c.release()->complete(r); + return true; + } +}; + +Context *PrimaryLogPG::bless_context(Context *c) { + return new BlessedContext(this, c, get_osdmap_epoch()); +} + +class PrimaryLogPG::C_PG_ObjectContext : public Context { + PrimaryLogPGRef pg; + ObjectContext *obc; + public: + C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) : + pg(p), obc(o) {} + void finish(int r) override { + pg->object_context_destructor_callback(obc); + } +}; + +struct OnReadComplete : public Context { + PrimaryLogPG *pg; + PrimaryLogPG::OpContext *opcontext; + OnReadComplete( + PrimaryLogPG *pg, + PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {} + void finish(int r) override { + opcontext->finish_read(pg); + } + ~OnReadComplete() override {} +}; + +class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context { + PrimaryLogPGRef pg; + ObjectContextRef obc; + public: + C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) : + pg(p), obc(o) {} + bool sync_finish(int r) override { + pg->_applied_recovered_object(obc); + return true; + } + void finish(int r) override { + std::scoped_lock locker{*pg}; + pg->_applied_recovered_object(obc); + } +}; + +class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context { + PrimaryLogPGRef pg; + epoch_t epoch; + eversion_t last_complete; + public: + C_OSD_CommittedPushedObject( + PrimaryLogPG *p, epoch_t epoch, eversion_t lc) : + pg(p), epoch(epoch), last_complete(lc) { + } + void finish(int r) override { + pg->_committed_pushed_object(epoch, last_complete); + } +}; + +class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context { + PrimaryLogPGRef pg; + public: + explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) : + pg(p) {} + bool sync_finish(int r) override { + pg->_applied_recovered_object_replica(); + return true; + } + void finish(int r) override { + std::scoped_lock locker{*pg}; + pg->_applied_recovered_object_replica(); + } +}; + +// OpContext +void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg) +{ + inflightreads = 1; + list, + pair > > in; + in.swap(pending_async_reads); + pg->pgbackend->objects_read_async( + obc->obs.oi.soid, + in, + new OnReadComplete(pg, this), pg->get_pool().fast_read); +} +void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg) +{ + ceph_assert(inflightreads > 0); + --inflightreads; + if (async_reads_complete()) { + ceph_assert(pg->in_progress_async_reads.size()); + ceph_assert(pg->in_progress_async_reads.front().second == this); + pg->in_progress_async_reads.pop_front(); + + // Restart the op context now that all reads have been + // completed. 
Read failures will be handled by the op finisher + pg->execute_ctx(this); + } +} + +class CopyFromCallback : public PrimaryLogPG::CopyCallback { +public: + PrimaryLogPG::CopyResults *results = nullptr; + PrimaryLogPG::OpContext *ctx; + OSDOp &osd_op; + uint32_t truncate_seq; + uint64_t truncate_size; + bool have_truncate = false; + + CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op) + : ctx(ctx), osd_op(osd_op) { + } + ~CopyFromCallback() override {} + + void finish(PrimaryLogPG::CopyCallbackResults results_) override { + results = results_.get<1>(); + int r = results_.get<0>(); + + // Only use truncate_{seq,size} from the original object if the client + // did not sent us these parameters + if (!have_truncate) { + truncate_seq = results->truncate_seq; + truncate_size = results->truncate_size; + } + + // for finish_copyfrom + ctx->user_at_version = results->user_version; + + if (r >= 0) { + ctx->pg->execute_ctx(ctx); + } else { + if (r != -ECANCELED) { // on cancel just toss it out; client resends + if (ctx->op) + ctx->pg->osd->reply_op_error(ctx->op, r); + } else if (results->should_requeue) { + if (ctx->op) + ctx->pg->requeue_op(ctx->op); + } + ctx->pg->close_op_ctx(ctx); + } + } + + bool is_temp_obj_used() { + return results->started_temp_obj; + } + uint64_t get_data_size() { + return results->object_size; + } + void set_truncate(uint32_t seq, uint64_t size) { + truncate_seq = seq; + truncate_size = size; + have_truncate = true; + } +}; + +struct CopyFromFinisher : public PrimaryLogPG::OpFinisher { + CopyFromCallback *copy_from_callback; + + explicit CopyFromFinisher(CopyFromCallback *copy_from_callback) + : copy_from_callback(copy_from_callback) { + } + + int execute() override { + // instance will be destructed after this method completes + copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback); + return 0; + } +}; + +// ====================== +// PGBackend::Listener + +void PrimaryLogPG::on_local_recover( + const hobject_t &hoid, + const ObjectRecoveryInfo &_recovery_info, + ObjectContextRef obc, + bool is_delete, + ObjectStore::Transaction *t + ) +{ + dout(10) << __func__ << ": " << hoid << dendl; + + ObjectRecoveryInfo recovery_info(_recovery_info); + clear_object_snap_mapping(t, hoid); + if (!is_delete && recovery_info.soid.is_snap()) { + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + set snaps; + dout(20) << " snapset " << recovery_info.ss << dendl; + auto p = recovery_info.ss.clone_snaps.find(hoid.snap); + if (p != recovery_info.ss.clone_snaps.end()) { + snaps.insert(p->second.begin(), p->second.end()); + dout(20) << " snaps " << snaps << dendl; + snap_mapper.add_oid( + recovery_info.soid, + snaps, + &_t); + } else { + derr << __func__ << " " << hoid << " had no clone_snaps" << dendl; + } + } + if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) && + recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) { + ceph_assert(is_primary()); + const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second; + if (latest->op == pg_log_entry_t::LOST_REVERT && + latest->reverting_to == recovery_info.version) { + dout(10) << " got old revert version " << recovery_info.version + << " for " << *latest << dendl; + recovery_info.version = latest->version; + // update the attr to the revert event version + recovery_info.oi.prior_version = recovery_info.oi.version; + recovery_info.oi.version = latest->version; + 
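// re-encode the updated object_info and write it back to the OI_ATTR xattr (and the obc attr cache) +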
bufferlist bl; + encode(recovery_info.oi, bl, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + ceph_assert(!pool.info.is_erasure()); + t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl); + if (obc) + obc->attr_cache[OI_ATTR] = bl; + } + } + + // keep track of active pushes for scrub + ++active_pushes; + + recovery_state.recover_got( + recovery_info.soid, + recovery_info.version, + is_delete, + *t); + + if (is_primary()) { + if (!is_delete) { + obc->obs.exists = true; + + bool got = obc->get_recovery_read(); + ceph_assert(got); + + ceph_assert(recovering.count(obc->obs.oi.soid)); + recovering[obc->obs.oi.soid] = obc; + obc->obs.oi = recovery_info.oi; // may have been updated above + } + + t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc)); + + publish_stats_to_osd(); + release_backoffs(hoid); + if (!is_unreadable_object(hoid)) { + auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid); + if (unreadable_object_entry != waiting_for_unreadable_object.end()) { + dout(20) << " kicking unreadable waiters on " << hoid << dendl; + requeue_ops(unreadable_object_entry->second); + waiting_for_unreadable_object.erase(unreadable_object_entry); + } + } + } else { + t->register_on_applied( + new C_OSD_AppliedRecoveredObjectReplica(this)); + + } + + t->register_on_commit( + new C_OSD_CommittedPushedObject( + this, + get_osdmap_epoch(), + info.last_complete)); +} + +void PrimaryLogPG::on_global_recover( + const hobject_t &soid, + const object_stat_sum_t &stat_diff, + bool is_delete) +{ + recovery_state.object_recovered(soid, stat_diff); + publish_stats_to_osd(); + dout(10) << "pushed " << soid << " to all replicas" << dendl; + auto i = recovering.find(soid); + ceph_assert(i != recovering.end()); + + if (i->second && i->second->rwstate.recovery_read_marker) { + // recover missing won't have had an obc, but it gets filled in + // during on_local_recover + ceph_assert(i->second); + list requeue_list; + i->second->drop_recovery_read(&requeue_list); + requeue_ops(requeue_list); + } + + backfills_in_flight.erase(soid); + + recovering.erase(i); + finish_recovery_op(soid); + release_backoffs(soid); + auto degraded_object_entry = waiting_for_degraded_object.find(soid); + if (degraded_object_entry != waiting_for_degraded_object.end()) { + dout(20) << " kicking degraded waiters on " << soid << dendl; + requeue_ops(degraded_object_entry->second); + waiting_for_degraded_object.erase(degraded_object_entry); + } + auto unreadable_object_entry = waiting_for_unreadable_object.find(soid); + if (unreadable_object_entry != waiting_for_unreadable_object.end()) { + dout(20) << " kicking unreadable waiters on " << soid << dendl; + requeue_ops(unreadable_object_entry->second); + waiting_for_unreadable_object.erase(unreadable_object_entry); + } + finish_degraded_object(soid); +} + +void PrimaryLogPG::schedule_recovery_work( + GenContext *c) +{ + osd->queue_recovery_context(this, c); +} + +void PrimaryLogPG::replica_clear_repop_obc( + const vector &logv, + ObjectStore::Transaction &t) +{ + for (auto &&e: logv) { + /* Have to blast all clones, they share a snapset */ + object_contexts.clear_range( + e.soid.get_object_boundary(), e.soid.get_head()); + ceph_assert( + snapset_contexts.find(e.soid.get_head()) == + snapset_contexts.end()); + } +} + +bool PrimaryLogPG::should_send_op( + pg_shard_t peer, + const hobject_t &hoid) { + if (peer == get_primary()) + return true; + ceph_assert(recovery_state.has_peer_info(peer)); + bool should_send = + hoid.pool != 
(int64_t)info.pgid.pool() || + hoid <= last_backfill_started || + hoid <= recovery_state.get_peer_info(peer).last_backfill; + if (!should_send) { + ceph_assert(is_backfill_target(peer)); + dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer + << ", object " << hoid + << " beyond std::max(last_backfill_started " + << ", peer_info[peer].last_backfill " + << recovery_state.get_peer_info(peer).last_backfill + << ")" << dendl; + return should_send; + } + if (is_async_recovery_target(peer) && + recovery_state.get_peer_missing(peer).is_missing(hoid)) { + should_send = false; + dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer + << ", object " << hoid + << " which is pending recovery in async_recovery_targets" << dendl; + } + return should_send; +} + + +ConnectionRef PrimaryLogPG::get_con_osd_cluster( + int peer, epoch_t from_epoch) +{ + return osd->get_con_osd_cluster(peer, from_epoch); +} + +PerfCounters *PrimaryLogPG::get_logger() +{ + return osd->logger; +} + + +// ==================== +// missing objects + +bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const +{ + return recovery_state.get_pg_log().get_missing().get_items().count(soid); +} + +void PrimaryLogPG::maybe_kick_recovery( + const hobject_t &soid) +{ + eversion_t v; + bool work_started = false; + if (!recovery_state.get_missing_loc().needs_recovery(soid, &v)) + return; + + map::const_iterator p = recovering.find(soid); + if (p != recovering.end()) { + dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl; + } else if (recovery_state.get_missing_loc().is_unfound(soid)) { + dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl; + } else { + dout(7) << "object " << soid << " v " << v << ", recovering." << dendl; + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + if (is_missing_object(soid)) { + recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h); + } else if (recovery_state.get_missing_loc().is_deleted(soid)) { + prep_object_replica_deletes(soid, v, h, &work_started); + } else { + prep_object_replica_pushes(soid, v, h, &work_started); + } + pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH); + } +} + +void PrimaryLogPG::wait_for_unreadable_object( + const hobject_t& soid, OpRequestRef op) +{ + ceph_assert(is_unreadable_object(soid)); + maybe_kick_recovery(soid); + waiting_for_unreadable_object[soid].push_back(op); + op->mark_delayed("waiting for missing object"); +} + +bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid) +{ + /* The conditions below may clear (on_local_recover, before we queue + * the transaction) before we actually requeue the degraded waiters + * in on_global_recover after the transaction completes. + */ + if (waiting_for_degraded_object.count(soid)) + return true; + if (recovery_state.get_pg_log().get_missing().get_items().count(soid)) + return true; + ceph_assert(!get_acting_recovery_backfill().empty()); + for (set::iterator i = get_acting_recovery_backfill().begin(); + i != get_acting_recovery_backfill().end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + auto peer_missing_entry = recovery_state.get_peer_missing().find(peer); + // If an object is missing on an async_recovery_target, return false. + // This will not block the op and the object is async recovered later. 
+ if (peer_missing_entry != recovery_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + if (is_async_recovery_target(peer)) + continue; + else + return true; + } + // Object is degraded if after last_backfill AND + // we are backfilling it + if (is_backfill_target(peer) && + recovery_state.get_peer_info(peer).last_backfill <= soid && + last_backfill_started >= soid && + backfills_in_flight.count(soid)) + return true; + } + return false; +} + +bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid) +{ + for (auto &i: get_async_recovery_targets()) { + auto peer_missing_entry = recovery_state.get_peer_missing().find(i); + if (peer_missing_entry != recovery_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + dout(30) << __func__ << " " << soid << dendl; + return true; + } + } + return false; +} + +void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op) +{ + ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid)); + + maybe_kick_recovery(soid); + waiting_for_degraded_object[soid].push_back(op); + op->mark_delayed("waiting for degraded object"); +} + +void PrimaryLogPG::block_write_on_full_cache( + const hobject_t& _oid, OpRequestRef op) +{ + const hobject_t oid = _oid.get_head(); + dout(20) << __func__ << ": blocking object " << oid + << " on full cache" << dendl; + objects_blocked_on_cache_full.insert(oid); + waiting_for_cache_not_full.push_back(op); + op->mark_delayed("waiting for cache not full"); +} + +void PrimaryLogPG::block_for_clean( + const hobject_t& oid, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << oid + << " on primary repair" << dendl; + waiting_for_clean_to_primary_repair.push_back(op); + op->mark_delayed("waiting for clean to repair"); +} + +void PrimaryLogPG::block_write_on_snap_rollback( + const hobject_t& oid, ObjectContextRef obc, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << oid.get_head() + << " on snap promotion " << obc->obs.oi.soid << dendl; + // otherwise, we'd have blocked in do_op + ceph_assert(oid.is_head()); + ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0); + objects_blocked_on_snap_promotion[oid] = obc; + wait_for_blocked_object(obc->obs.oi.soid, op); +} + +void PrimaryLogPG::block_write_on_degraded_snap( + const hobject_t& snap, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << snap.get_head() + << " on degraded snap " << snap << dendl; + // otherwise, we'd have blocked in do_op + ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0); + objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap; + wait_for_degraded_object(snap, op); +} + +bool PrimaryLogPG::maybe_await_blocked_head( + const hobject_t &hoid, + OpRequestRef op) +{ + ObjectContextRef obc; + obc = object_contexts.lookup(hoid.get_head()); + if (obc) { + if (obc->is_blocked()) { + wait_for_blocked_object(obc->obs.oi.soid, op); + return true; + } else { + return false; + } + } + return false; +} + +void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op) +{ + dout(10) << __func__ << " " << soid << " " << op << dendl; + waiting_for_blocked_object[soid].push_back(op); + op->mark_delayed("waiting for blocked object"); +} + +void PrimaryLogPG::maybe_force_recovery() +{ + // no force if not in degraded/recovery/backfill states + if (!is_degraded() && + !state_test(PG_STATE_RECOVERING | + 
PG_STATE_RECOVERY_WAIT | + PG_STATE_BACKFILLING | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILL_TOOFULL)) + return; + + if (recovery_state.get_pg_log().get_log().approx_size() < + cct->_conf->osd_max_pg_log_entries * + cct->_conf->osd_force_recovery_pg_log_entries_factor) + return; + + // find the oldest missing object + version_t min_version = recovery_state.get_pg_log().get_log().head.version; + hobject_t soid; + if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) { + min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first; + soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second; + } + ceph_assert(!get_acting_recovery_backfill().empty()); + for (set::iterator it = get_acting_recovery_backfill().begin(); + it != get_acting_recovery_backfill().end(); + ++it) { + if (*it == get_primary()) continue; + pg_shard_t peer = *it; + auto it_missing = recovery_state.get_peer_missing().find(peer); + if (it_missing != recovery_state.get_peer_missing().end() && + !it_missing->second.get_rmissing().empty()) { + const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin(); + dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first + << " oid " << min_obj->second << dendl; + if (min_version > min_obj->first) { + min_version = min_obj->first; + soid = min_obj->second; + } + } + } + + // recover it + if (soid != hobject_t()) + maybe_kick_recovery(soid); +} + +bool PrimaryLogPG::check_laggy(OpRequestRef& op) +{ + if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(), + SERVER_OCTOPUS)) { + dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl; + return true; + } + if (state_test(PG_STATE_WAIT)) { + dout(10) << __func__ << " PG is WAIT state" << dendl; + } else if (!state_test(PG_STATE_LAGGY)) { + auto mnow = osd->get_mnow(); + auto ru = recovery_state.get_readable_until(); + if (mnow <= ru) { + // not laggy + return true; + } + dout(10) << __func__ + << " mnow " << mnow + << " > readable_until " << ru << dendl; + + if (!is_primary()) { + osd->reply_op_error(op, -EAGAIN); + return false; + } + + // go to laggy state + state_set(PG_STATE_LAGGY); + publish_stats_to_osd(); + } + dout(10) << __func__ << " not readable" << dendl; + waiting_for_readable.push_back(op); + op->mark_delayed("waiting for readable"); + return false; +} + +bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op) +{ + if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(), + SERVER_OCTOPUS)) { + return true; + } + if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) { + return true; // not laggy + } + dout(10) << __func__ << " not readable" << dendl; + waiting_for_readable.push_front(op); + op->mark_delayed("waiting for readable"); + return false; +} + +void PrimaryLogPG::recheck_readable() +{ + if (!is_wait() && !is_laggy()) { + dout(20) << __func__ << " wasn't wait or laggy" << dendl; + return; + } + auto mnow = osd->get_mnow(); + bool pub = false; + if (is_wait()) { + auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub(); + if (mnow < prior_readable_until_ub) { + dout(10) << __func__ << " still wait (mnow " << mnow + << " < prior_readable_until_ub " << prior_readable_until_ub + << ")" << dendl; + } else { + dout(10) << __func__ << " no longer wait (mnow " << mnow + << " >= prior_readable_until_ub " << prior_readable_until_ub + << ")" << dendl; + state_clear(PG_STATE_WAIT); + recovery_state.clear_prior_readable_until_ub(); + pub = true; + } + } + if 
(is_laggy()) { + auto ru = recovery_state.get_readable_until(); + if (ru == ceph::signedspan::zero()) { + dout(10) << __func__ << " still laggy (mnow " << mnow + << ", readable_until zero)" << dendl; + } else if (mnow >= ru) { + dout(10) << __func__ << " still laggy (mnow " << mnow + << " >= readable_until " << ru << ")" << dendl; + } else { + dout(10) << __func__ << " no longer laggy (mnow " << mnow + << " < readable_until " << ru << ")" << dendl; + state_clear(PG_STATE_LAGGY); + pub = true; + } + } + if (pub) { + publish_stats_to_osd(); + } + if (!is_laggy() && !is_wait()) { + requeue_ops(waiting_for_readable); + } +} + +bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj) +{ + bufferlist bl; + + // If filter has expressed an interest in an xattr, load it. + if (!filter.get_xattr().empty()) { + int ret = pgbackend->objects_get_attr( + sobj, + filter.get_xattr(), + &bl); + dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl; + if (ret < 0) { + if (ret != -ENODATA || filter.reject_empty_xattr()) { + return false; + } + } + } + + return filter.filter(sobj, bl); +} + +std::pair> +PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter) +{ + string type; + // storing non-const PGLSFilter for the sake of ::init() + std::unique_ptr filter; + + try { + decode(type, iter); + } + catch (ceph::buffer::error& e) { + return { -EINVAL, nullptr }; + } + + if (type.compare("plain") == 0) { + filter = std::make_unique(); + } else { + std::size_t dot = type.find("."); + if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) { + return { -EINVAL, nullptr }; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = NULL; + int r = ClassHandler::get_instance().open_class(class_name, &cls); + if (r != 0) { + derr << "Error opening class '" << class_name << "': " + << cpp_strerror(r) << dendl; + if (r != -EPERM) // propagate permission error + r = -EINVAL; + return { r, nullptr }; + } else { + ceph_assert(cls); + } + + ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name); + if (class_filter == NULL) { + derr << "Error finding filter '" << filter_name << "' in class " + << class_name << dendl; + return { -EINVAL, nullptr }; + } + filter.reset(class_filter->fn()); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + derr << "Buggy class " << class_name << " failed to construct " + "filter " << filter_name << dendl; + return { -EINVAL, nullptr }; + } + } + + ceph_assert(filter); + int r = filter->init(iter); + if (r < 0) { + derr << "Error initializing filter " << type << ": " + << cpp_strerror(r) << dendl; + return { -EINVAL, nullptr }; + } else { + // Successfully constructed and initialized, return it. 
+ return std::make_pair(0, std::move(filter)); + } +} + + +// ========================================================== + +void PrimaryLogPG::do_command( + const string_view& orig_prefix, + const cmdmap_t& cmdmap, + const bufferlist& idata, + std::function on_finish) +{ + string format; + cmd_getval(cmdmap, "format", format); + std::unique_ptr f(Formatter::create( + format, "json-pretty", "json-pretty")); + int ret = 0; + stringstream ss; // stderr error message stream + bufferlist outbl; // if empty at end, we'll dump formatter as output + + // get final prefix: + // - ceph pg foo -> prefix=pg, cmd=foo + // - ceph tell foo -> prefix=foo + string prefix(orig_prefix); + string command; + cmd_getval(cmdmap, "cmd", command); + if (command.size()) { + prefix = command; + } + + if (prefix == "query") { + f->open_object_section("pg"); + f->dump_stream("snap_trimq") << snap_trimq; + f->dump_unsigned("snap_trimq_len", snap_trimq.size()); + recovery_state.dump_peering_state(f.get()); + + f->open_array_section("recovery_state"); + handle_query_state(f.get()); + f->close_section(); + + if (is_primary() && is_active() && m_scrubber) { + m_scrubber->dump(f.get()); + } + + f->open_object_section("agent_state"); + if (agent_state) + agent_state->dump(f.get()); + f->close_section(); + + f->close_section(); + } + + else if (prefix == "mark_unfound_lost") { + string mulcmd; + cmd_getval(cmdmap, "mulcmd", mulcmd); + int mode = -1; + if (mulcmd == "revert") { + if (pool.info.is_erasure()) { + ss << "mode must be 'delete' for ec pool"; + ret = -EINVAL; + goto out; + } + mode = pg_log_entry_t::LOST_REVERT; + } else if (mulcmd == "delete") { + mode = pg_log_entry_t::LOST_DELETE; + } else { + ss << "mode must be 'revert' or 'delete'; mark not yet implemented"; + ret = -EINVAL; + goto out; + } + ceph_assert(mode == pg_log_entry_t::LOST_REVERT || + mode == pg_log_entry_t::LOST_DELETE); + + if (!is_primary()) { + ss << "not primary"; + ret = -EROFS; + goto out; + } + + uint64_t unfound = recovery_state.get_missing_loc().num_unfound(); + if (!unfound) { + ss << "pg has no unfound objects"; + goto out; // make command idempotent + } + + if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) { + ss << "pg has " << unfound + << " unfound objects but we haven't probed all sources, not marking lost"; + ret = -EINVAL; + goto out; + } + + mark_all_unfound_lost(mode, on_finish); + return; + } + + else if (prefix == "list_unfound") { + hobject_t offset; + string offset_json; + bool show_offset = false; + if (cmd_getval(cmdmap, "offset", offset_json)) { + json_spirit::Value v; + try { + if (!json_spirit::read(offset_json, v)) + throw std::runtime_error("bad json"); + offset.decode(v); + } catch (std::runtime_error& e) { + ss << "error parsing offset: " << e.what(); + ret = -EINVAL; + goto out; + } + show_offset = true; + } + f->open_object_section("missing"); + if (show_offset) { + f->open_object_section("offset"); + offset.dump(f.get()); + f->close_section(); + } + auto &needs_recovery_map = recovery_state.get_missing_loc() + .get_needs_recovery(); + f->dump_int("num_missing", needs_recovery_map.size()); + f->dump_int("num_unfound", get_num_unfound()); + map::const_iterator p = + needs_recovery_map.upper_bound(offset); + { + f->open_array_section("objects"); + int32_t num = 0; + for (; p != needs_recovery_map.end() && + num < cct->_conf->osd_command_max_records; + ++p) { + if (recovery_state.get_missing_loc().is_unfound(p->first)) { + f->open_object_section("object"); + { + f->open_object_section("oid"); + 
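// dump this unfound object's id, then its have/need versions and the shards known to hold it +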
p->first.dump(f.get()); + f->close_section(); + } + p->second.dump(f.get()); // have, need keys + { + f->open_array_section("locations"); + for (auto &&r : recovery_state.get_missing_loc().get_locations( + p->first)) { + f->dump_stream("shard") << r; + } + f->close_section(); + } + f->close_section(); + num++; + } + } + f->close_section(); + } + // Get possible locations of missing objects from pg information + PeeringState::QueryUnfound q(f.get()); + recovery_state.handle_event(q, 0); + f->dump_bool("more", p != needs_recovery_map.end()); + f->close_section(); + } + + else if (prefix == "scrub" || + prefix == "deep_scrub") { + bool deep = (prefix == "deep_scrub"); + int64_t time; + cmd_getval(cmdmap, "time", time, (int64_t)0); + + if (is_primary()) { + const pg_pool_t *p = &pool.info; + double pool_scrub_max_interval = 0; + double scrub_max_interval; + if (deep) { + p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval; + } else { + p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf()->osd_scrub_max_interval; + } + // Instead of marking must_scrub force a schedule scrub + utime_t stamp = ceph_clock_now(); + if (time == 0) + stamp -= scrub_max_interval; + else + stamp -= (float)time; + stamp -= 100.0; // push back last scrub more for good measure + if (deep) { + set_last_deep_scrub_stamp(stamp); + } else { + set_last_scrub_stamp(stamp); + } + f->open_object_section("result"); + f->dump_bool("deep", deep); + f->dump_stream("stamp") << stamp; + f->close_section(); + } else { + ss << "Not primary"; + ret = -EPERM; + } + outbl.append(ss.str()); + } + + else { + ret = -ENOSYS; + ss << "prefix '" << prefix << "' not implemented"; + } + + out: + if (ret >= 0 && outbl.length() == 0) { + f->flush(outbl); + } + on_finish(ret, ss.str(), outbl); +} + + +// ========================================================== + +void PrimaryLogPG::do_pg_op(OpRequestRef op) +{ + const MOSDOp *m = static_cast(op->get_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + dout(10) << "do_pg_op " << *m << dendl; + + op->mark_started(); + + int result = 0; + string cname, mname; + + snapid_t snapid = m->get_snapid(); + + vector ops = m->ops; + + for (vector::iterator p = ops.begin(); p != ops.end(); ++p) { + std::unique_ptr filter; + OSDOp& osd_op = *p; + auto bp = p->indata.cbegin(); + switch (p->op.op) { + case CEPH_OSD_OP_PGNLS_FILTER: + try { + decode(cname, bp); + decode(mname, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl; + result = -EINVAL; + break; + } + std::tie(result, filter) = get_pgls_filter(bp); + if (result < 0) + break; + + ceph_assert(filter); + + // fall through + + case CEPH_OSD_OP_PGNLS: + if (snapid != CEPH_NOSNAP) { + result = -EINVAL; + break; + } + if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) { + dout(10) << " pgnls pg=" << m->get_pg() + << " " << get_osdmap()->raw_pg_to_pg(m->get_pg()) + << " != " << info.pgid << dendl; + result = 0; // hmm? 
+ } else { + unsigned list_size = std::min(cct->_conf->osd_max_pgls, + p->op.pgls.count); + + dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size + << dendl; + // read into a buffer + vector sentries; + pg_nls_response_t response; + try { + decode(response.handle, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGNLS handle in " << *m << dendl; + result = -EINVAL; + break; + } + + hobject_t next; + hobject_t lower_bound = response.handle; + hobject_t pg_start = info.pgid.pgid.get_hobj_start(); + hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + dout(10) << " pgnls lower_bound " << lower_bound + << " pg_end " << pg_end << dendl; + if (((!lower_bound.is_max() && lower_bound >= pg_end) || + (lower_bound != hobject_t() && lower_bound < pg_start))) { + // this should only happen with a buggy client. + dout(10) << "outside of PG bounds " << pg_start << " .. " + << pg_end << dendl; + result = -EINVAL; + break; + } + + hobject_t current = lower_bound; + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + &sentries, + &next); + if (r != 0) { + result = -EINVAL; + break; + } + + map::const_iterator missing_iter = + recovery_state.get_pg_log().get_missing().get_items().lower_bound(current); + vector::iterator ls_iter = sentries.begin(); + hobject_t _max = hobject_t::get_max(); + while (1) { + const hobject_t &mcand = + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ? + _max : + missing_iter->first; + const hobject_t &lcand = + ls_iter == sentries.end() ? + _max : + *ls_iter; + + hobject_t candidate; + if (mcand == lcand) { + candidate = mcand; + if (!mcand.is_max()) { + ++ls_iter; + ++missing_iter; + } + } else if (mcand < lcand) { + candidate = mcand; + ceph_assert(!mcand.is_max()); + ++missing_iter; + } else { + candidate = lcand; + ceph_assert(!lcand.is_max()); + ++ls_iter; + } + + dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash() + << " vs lower bound 0x" << lower_bound.get_hash() + << std::dec << dendl; + + if (candidate >= next) { + break; + } + + if (response.entries.size() == list_size) { + next = candidate; + break; + } + + if (candidate.snap != CEPH_NOSNAP) + continue; + + // skip internal namespace + if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace) + continue; + + if (recovery_state.get_missing_loc().is_deleted(candidate)) + continue; + + // skip wrong namespace + if (m->get_hobj().nspace != librados::all_nspaces && + candidate.get_namespace() != m->get_hobj().nspace) + continue; + + if (filter && !pgls_filter(*filter, candidate)) + continue; + + dout(20) << "pgnls item 0x" << std::hex + << candidate.get_hash() + << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash()) + << std::dec << " " + << candidate.oid.name << dendl; + + librados::ListObjectImpl item; + item.nspace = candidate.get_namespace(); + item.oid = candidate.oid.name; + item.locator = candidate.get_key(); + response.entries.push_back(item); + } + + if (next.is_max() && + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() && + ls_iter == sentries.end()) { + result = 1; + + // Set response.handle to the start of the next PG according + // to the object sort order. 
+ response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + } else { + response.handle = next; + } + dout(10) << "pgnls handle=" << response.handle << dendl; + encode(response, osd_op.outdata); + dout(10) << " pgnls result=" << result << " outdata.length()=" + << osd_op.outdata.length() << dendl; + } + break; + + case CEPH_OSD_OP_PGLS_FILTER: + try { + decode(cname, bp); + decode(mname, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl; + result = -EINVAL; + break; + } + std::tie(result, filter) = get_pgls_filter(bp); + if (result < 0) + break; + + ceph_assert(filter); + + // fall through + + case CEPH_OSD_OP_PGLS: + if (snapid != CEPH_NOSNAP) { + result = -EINVAL; + break; + } + if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) { + dout(10) << " pgls pg=" << m->get_pg() + << " " << get_osdmap()->raw_pg_to_pg(m->get_pg()) + << " != " << info.pgid << dendl; + result = 0; // hmm? + } else { + unsigned list_size = std::min(cct->_conf->osd_max_pgls, + p->op.pgls.count); + + dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl; + // read into a buffer + vector sentries; + pg_ls_response_t response; + try { + decode(response.handle, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGLS handle in " << *m << dendl; + result = -EINVAL; + break; + } + + hobject_t next; + hobject_t current = response.handle; + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + &sentries, + &next); + if (r != 0) { + result = -EINVAL; + break; + } + + ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty()); + + map::const_iterator missing_iter = + recovery_state.get_pg_log().get_missing().get_items().lower_bound(current); + vector::iterator ls_iter = sentries.begin(); + hobject_t _max = hobject_t::get_max(); + while (1) { + const hobject_t &mcand = + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ? + _max : + missing_iter->first; + const hobject_t &lcand = + ls_iter == sentries.end() ? 
+ _max : + *ls_iter; + + hobject_t candidate; + if (mcand == lcand) { + candidate = mcand; + if (!mcand.is_max()) { + ++ls_iter; + ++missing_iter; + } + } else if (mcand < lcand) { + candidate = mcand; + ceph_assert(!mcand.is_max()); + ++missing_iter; + } else { + candidate = lcand; + ceph_assert(!lcand.is_max()); + ++ls_iter; + } + + if (candidate >= next) { + break; + } + + if (response.entries.size() == list_size) { + next = candidate; + break; + } + + if (candidate.snap != CEPH_NOSNAP) + continue; + + // skip wrong namespace + if (candidate.get_namespace() != m->get_hobj().nspace) + continue; + + if (recovery_state.get_missing_loc().is_deleted(candidate)) + continue; + + if (filter && !pgls_filter(*filter, candidate)) + continue; + + response.entries.push_back(make_pair(candidate.oid, + candidate.get_key())); + } + if (next.is_max() && + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() && + ls_iter == sentries.end()) { + result = 1; + } + response.handle = next; + encode(response, osd_op.outdata); + dout(10) << " pgls result=" << result << " outdata.length()=" + << osd_op.outdata.length() << dendl; + } + break; + + case CEPH_OSD_OP_PG_HITSET_LS: + { + list< pair > ls; + for (list::const_iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) + ls.push_back(make_pair(p->begin, p->end)); + if (hit_set) + ls.push_back(make_pair(hit_set_start_stamp, utime_t())); + encode(ls, osd_op.outdata); + } + break; + + case CEPH_OSD_OP_PG_HITSET_GET: + { + utime_t stamp(osd_op.op.hit_set_get.stamp); + if (hit_set_start_stamp && stamp >= hit_set_start_stamp) { + // read the current in-memory HitSet, not the version we've + // checkpointed. + if (!hit_set) { + result= -ENOENT; + break; + } + encode(*hit_set, osd_op.outdata); + result = osd_op.outdata.length(); + } else { + // read an archived HitSet. + hobject_t oid; + for (list::const_iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + if (stamp >= p->begin && stamp <= p->end) { + oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + break; + } + } + if (oid == hobject_t()) { + result = -ENOENT; + break; + } + if (!pool.info.is_replicated()) { + // FIXME: EC not supported yet + result = -EOPNOTSUPP; + break; + } + if (is_unreadable_object(oid)) { + wait_for_unreadable_object(oid, op); + return; + } + result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata); + } + } + break; + + case CEPH_OSD_OP_SCRUBLS: + result = do_scrub_ls(m, &osd_op); + break; + + default: + result = -EINVAL; + break; + } + + if (result < 0) + break; + } + + // reply + MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + false); + reply->claim_op_out_data(ops); + reply->set_result(result); + reply->set_reply_versions(info.last_update, info.last_user_version); + osd->send_message_osd_client(reply, m->get_connection()); +} + +int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op) +{ + if (m->get_pg() != info.pgid.pgid) { + dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl; + return -EINVAL; // hmm? 
+ } + auto bp = osd_op->indata.cbegin(); + scrub_ls_arg_t arg; + try { + arg.decode(bp); + } catch (ceph::buffer::error&) { + dout(10) << " corrupted scrub_ls_arg_t" << dendl; + return -EINVAL; + } + + int r = 0; + scrub_ls_result_t result = {.interval = info.history.same_interval_since}; + + if (arg.interval != 0 && arg.interval != info.history.same_interval_since) { + r = -EAGAIN; + } else { + bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result); + if (store_queried) { + encode(result, osd_op->outdata); + } else { + // the scrubber's store is not initialized + r = -ENOENT; + } + } + + return r; +} + +/** + * Releases locks + * + * @param manager [in] manager with locks to release + */ +void PrimaryLogPG::release_object_locks( + ObcLockManager &lock_manager) { + std::list > > to_req; + bool requeue_recovery = false; + bool requeue_snaptrim = false; + lock_manager.put_locks( + &to_req, + &requeue_recovery, + &requeue_snaptrim); + if (requeue_recovery) + queue_recovery(); + if (requeue_snaptrim) + snap_trimmer_machine.process_event(TrimWriteUnblocked()); + + if (!to_req.empty()) { + // requeue at front of scrub blocking queue if we are blocked by scrub + for (auto &&p: to_req) { + if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) { + for (auto& op : p.second) { + op->mark_delayed("waiting for scrub"); + } + + waiting_for_scrub.splice( + waiting_for_scrub.begin(), + p.second, + p.second.begin(), + p.second.end()); + } else if (is_laggy()) { + for (auto& op : p.second) { + op->mark_delayed("waiting for readable"); + } + waiting_for_readable.splice( + waiting_for_readable.begin(), + p.second, + p.second.begin(), + p.second.end()); + } else { + requeue_ops(p.second); + } + } + } +} + +PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap, + const PGPool &_pool, + const map& ec_profile, spg_t p) : + PG(o, curmap, _pool, p), + pgbackend( + PGBackend::build_pg_backend( + _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)), + object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count), + new_backfill(false), + temp_seq(0), + snap_trimmer_machine(this) +{ + recovery_state.set_backend_predicates( + pgbackend->get_is_readable_predicate(), + pgbackend->get_is_recoverable_predicate()); + snap_trimmer_machine.initiate(); + + m_scrubber = make_unique(this); +} + +PrimaryLogPG::~PrimaryLogPG() +{ + m_scrubber.reset(); +} + +void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc) +{ + src_oloc = oloc; + if (oloc.key.empty()) + src_oloc.key = oid.name; +} + +void PrimaryLogPG::handle_backoff(OpRequestRef& op) +{ + auto m = op->get_req(); + auto session = ceph::ref_cast(m->get_connection()->get_priv()); + if (!session) + return; // drop it. 
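+ // ack only the intersection of the message's backoff range with this PG's hobject_t range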
+ hobject_t begin = info.pgid.pgid.get_hobj_start(); + hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + if (begin < m->begin) { + begin = m->begin; + } + if (end > m->end) { + end = m->end; + } + dout(10) << __func__ << " backoff ack id " << m->id + << " [" << begin << "," << end << ")" << dendl; + session->ack_backoff(cct, m->pgid, m->id, begin, end); +} + +void PrimaryLogPG::do_request( + OpRequestRef& op, + ThreadPool::TPHandle &handle) +{ + if (op->osd_trace) { + op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace); + op->pg_trace.event("do request"); + } +#ifdef HAVE_JAEGER + if (op->osd_parent_span) { + auto do_req_span = jaeger_tracing::child_span(__func__, op->osd_parent_span); + } +#endif +// make sure we have a new enough map + auto p = waiting_for_map.find(op->get_source()); + if (p != waiting_for_map.end()) { + // preserve ordering + dout(20) << __func__ << " waiting_for_map " + << p->first << " not empty, queueing" << dendl; + p->second.push_back(op); + op->mark_delayed("waiting_for_map not empty"); + return; + } + if (!have_same_or_newer_map(op->min_epoch)) { + dout(20) << __func__ << " min " << op->min_epoch + << ", queue on waiting_for_map " << op->get_source() << dendl; + waiting_for_map[op->get_source()].push_back(op); + op->mark_delayed("op must wait for map"); + osd->request_osdmap_update(op->min_epoch); + return; + } + + if (can_discard_request(op)) { + return; + } + + // pg-wide backoffs + const Message *m = op->get_req(); + int msg_type = m->get_type(); + if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) { + auto session = ceph::ref_cast(m->get_connection()->get_priv()); + if (!session) + return; // drop it. + if (msg_type == CEPH_MSG_OSD_OP) { + if (session->check_backoff(cct, info.pgid, + info.pgid.pgid.get_hobj_start(), m)) { + return; + } + + bool backoff = + is_down() || + is_incomplete() || + (!is_active() && is_peered()); + if (g_conf()->osd_backoff_on_peering && !backoff) { + if (is_peering()) { + backoff = true; + } + } + if (backoff) { + add_pg_backoff(session); + return; + } + } + // pg backoff acks at pg-level + if (msg_type == CEPH_MSG_OSD_BACKOFF) { + const MOSDBackoff *ba = static_cast(m); + if (ba->begin != ba->end) { + handle_backoff(op); + return; + } + } + } + + if (!is_peered()) { + // Delay unless PGBackend says it's ok + if (pgbackend->can_handle_while_inactive(op)) { + bool handled = pgbackend->handle_message(op); + ceph_assert(handled); + return; + } else { + waiting_for_peered.push_back(op); + op->mark_delayed("waiting for peered"); + return; + } + } + + if (recovery_state.needs_flush()) { + dout(20) << "waiting for flush on " << op << dendl; + waiting_for_flush.push_back(op); + op->mark_delayed("waiting for flush"); + return; + } + + ceph_assert(is_peered() && !recovery_state.needs_flush()); + if (pgbackend->handle_message(op)) + return; + + switch (msg_type) { + case CEPH_MSG_OSD_OP: + case CEPH_MSG_OSD_BACKOFF: + if (!is_active()) { + dout(20) << " peered, not active, waiting for active on " << op << dendl; + waiting_for_active.push_back(op); + op->mark_delayed("waiting for active"); + return; + } + switch (msg_type) { + case CEPH_MSG_OSD_OP: + // verify client features + if ((pool.info.has_tiers() || pool.info.is_tier()) && + !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) { + osd->reply_op_error(op, -EOPNOTSUPP); + return; + } + do_op(op); + break; + case CEPH_MSG_OSD_BACKOFF: + // object-level backoff acks handled in osdop context + handle_backoff(op); + break; + } + break; + + case 
MSG_OSD_PG_SCAN: + do_scan(op, handle); + break; + + case MSG_OSD_PG_BACKFILL: + do_backfill(op); + break; + + case MSG_OSD_PG_BACKFILL_REMOVE: + do_backfill_remove(op); + break; + + case MSG_OSD_SCRUB_RESERVE: + { + if (!m_scrubber) { + osd->reply_op_error(op, -EAGAIN); + return; + } + auto m = op->get_req(); + switch (m->type) { + case MOSDScrubReserve::REQUEST: + m_scrubber->handle_scrub_reserve_request(op); + break; + case MOSDScrubReserve::GRANT: + m_scrubber->handle_scrub_reserve_grant(op, m->from); + break; + case MOSDScrubReserve::REJECT: + m_scrubber->handle_scrub_reserve_reject(op, m->from); + break; + case MOSDScrubReserve::RELEASE: + m_scrubber->handle_scrub_reserve_release(op); + break; + } + } + break; + + case MSG_OSD_REP_SCRUB: + replica_scrub(op, handle); + break; + + case MSG_OSD_REP_SCRUBMAP: + do_replica_scrub_map(op); + break; + + case MSG_OSD_PG_UPDATE_LOG_MISSING: + do_update_log_missing(op); + break; + + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + do_update_log_missing_reply(op); + break; + + default: + ceph_abort_msg("bad message type in do_request"); + } +} + +/** do_op - do an op + * pg lock will be held (if multithreaded) + * osd_lock NOT held. + */ +void PrimaryLogPG::do_op(OpRequestRef& op) +{ + FUNCTRACE(cct); + // NOTE: take a non-const pointer here; we must be careful not to + // change anything that will break other reads on m (operator<<). + MOSDOp *m = static_cast(op->get_nonconst_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + if (m->finish_decode()) { + op->reset_desc(); // for TrackedOp + m->clear_payload(); + } + + dout(20) << __func__ << ": op " << *m << dendl; + + const hobject_t head = m->get_hobj().get_head(); + + if (!info.pgid.pgid.contains( + info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) { + derr << __func__ << " " << info.pgid.pgid << " does not contain " + << head << " pg_num " << pool.info.get_pg_num() << " hash " + << std::hex << head.get_hash() << std::dec << dendl; + osd->clog->warn() << info.pgid.pgid << " does not contain " << head + << " op " << *m; + ceph_assert(!cct->_conf->osd_debug_misdirected_ops); + return; + } + + bool can_backoff = + m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF); + ceph::ref_t session; + if (can_backoff) { + session = static_cast(m->get_connection()->get_priv().get()); + if (!session.get()) { + dout(10) << __func__ << " no session" << dendl; + return; + } + + if (session->check_backoff(cct, info.pgid, head, m)) { + return; + } + } + + if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) { + // not implemented. + dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + { + int r = op->maybe_init_op_info(*get_osdmap()); + if (r) { + osd->reply_op_error(op, r); + return; + } + } + + if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + op->may_read() && + !(op->may_write() || op->may_cache())) { + // balanced reads; any replica will do + if (!(is_primary() || is_nonprimary())) { + osd->handle_misdirected_op(this, op); + return; + } + } else { + // normal case; must be primary + if (!is_primary()) { + osd->handle_misdirected_op(this, op); + return; + } + } + + if (!check_laggy(op)) { + return; + } + + if (!op_has_sufficient_caps(op)) { + osd->reply_op_error(op, -EPERM); + return; + } + + if (op->includes_pg_op()) { + return do_pg_op(op); + } + + // object name too long? 
+ if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) { + dout(4) << "do_op name is longer than " + << cct->_conf->osd_max_object_name_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) { + dout(4) << "do_op locator is longer than " + << cct->_conf->osd_max_object_name_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) { + dout(4) << "do_op namespace is longer than " + << cct->_conf->osd_max_object_namespace_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().oid.name.empty()) { + dout(4) << "do_op empty oid name is not allowed" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + if (int r = osd->store->validate_hobject_key(head)) { + dout(4) << "do_op object " << head << " invalid for backing store: " + << r << dendl; + osd->reply_op_error(op, r); + return; + } + + // blocklisted? + if (get_osdmap()->is_blocklisted(m->get_source_addr())) { + dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl; + osd->reply_op_error(op, -EBLOCKLISTED); + return; + } + + // order this op as a write? + bool write_ordered = op->rwordered(); + + // discard due to cluster full transition? (we discard any op that + // originates before the cluster or pool is marked full; the client + // will resend after the full flag is removed or if they expect the + // op to succeed despite being full). The except is FULL_FORCE and + // FULL_TRY ops, which there is no reason to discard because they + // bypass all full checks anyway. If this op isn't write or + // read-ordered, we skip. + // FIXME: we exclude mds writes for now. + if (write_ordered && !(m->get_source().is_mds() || + m->has_flag(CEPH_OSD_FLAG_FULL_TRY) || + m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) && + info.history.last_epoch_marked_full > m->get_map_epoch()) { + dout(10) << __func__ << " discarding op sent before full " << m << " " + << *m << dendl; + return; + } + // mds should have stopped writing before this point. + // We can't allow OSD to become non-startable even if mds + // could be writing as part of file removals. + if (write_ordered && osd->check_failsafe_full(get_dpp()) && + !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl; + return; + } + int64_t poolid = get_pgid().pool(); + if (op->may_write()) { + + const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid); + if (!pi) { + return; + } + + // invalid? + if (m->get_snapid() != CEPH_NOSNAP) { + dout(20) << __func__ << ": write to clone not valid " << *m << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + // too big? + if (cct->_conf->osd_max_write_size && + m->get_data_len() > cct->_conf->osd_max_write_size << 20) { + // journal can't hold commit! + derr << "do_op msg data len " << m->get_data_len() + << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20) + << " on " << *m << dendl; + osd->reply_op_error(op, -OSD_WRITETOOBIG); + return; + } + } + + dout(10) << "do_op " << *m + << (op->may_write() ? " may_write" : "") + << (op->may_read() ? " may_read" : "") + << (op->may_cache() ? " may_cache" : "") + << " -> " << (write_ordered ? 
"write-ordered" : "read-ordered") + << " flags " << ceph_osd_flag_string(m->get_flags()) + << dendl; + +#ifdef HAVE_JAEGER + if (op->osd_parent_span) { + auto do_op_span = jaeger_tracing::child_span(__func__, op->osd_parent_span); + } +#endif + // missing object? + if (is_unreadable_object(head)) { + if (!is_primary()) { + osd->reply_op_error(op, -EAGAIN); + return; + } + if (can_backoff && + (g_conf()->osd_backoff_on_degraded || + (g_conf()->osd_backoff_on_unfound && + recovery_state.get_missing_loc().is_unfound(head)))) { + add_backoff(session, head, head); + maybe_kick_recovery(head); + } else { + wait_for_unreadable_object(head, op); + } + return; + } + + if (write_ordered) { + // degraded object? + if (is_degraded_or_backfilling_object(head)) { + if (can_backoff && g_conf()->osd_backoff_on_degraded) { + add_backoff(session, head, head); + maybe_kick_recovery(head); + } else { + wait_for_degraded_object(head, op); + } + return; + } + + if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) { + dout(20) << __func__ << ": waiting for scrub" << dendl; + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + return; + } + if (!check_laggy_requeue(op)) { + return; + } + + // blocked on snap? + if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head); + blocked_iter != std::end(objects_blocked_on_degraded_snap)) { + hobject_t to_wait_on(head); + to_wait_on.snap = blocked_iter->second; + wait_for_degraded_object(to_wait_on, op); + return; + } + if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head); + blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) { + wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op); + return; + } + if (objects_blocked_on_cache_full.count(head)) { + block_write_on_full_cache(head, op); + return; + } + } + + // dup/resent? + if (op->may_write() || op->may_cache()) { + // warning: we will get back *a* request for this reqid, but not + // necessarily the most recent. this happens with flush and + // promote ops, but we can't possible have both in our log where + // the original request is still not stable on disk, so for our + // purposes here it doesn't matter which one we get. + eversion_t version; + version_t user_version; + int return_code = 0; + vector op_returns; + bool got = check_in_progress_op( + m->get_reqid(), &version, &user_version, &return_code, &op_returns); + if (got) { + dout(3) << __func__ << " dup " << m->get_reqid() + << " version " << version << dendl; + if (already_complete(version)) { + osd->reply_op_error(op, return_code, version, user_version, op_returns); + } else { + dout(10) << " waiting for " << version << " to commit" << dendl; + // always queue ondisk waiters, so that we can requeue if needed + waiting_for_ondisk[version].emplace_back(op, user_version, return_code, + op_returns); + op->mark_delayed("waiting for ondisk"); + } + return; + } + } + + ObjectContextRef obc; + bool can_create = op->may_write(); + hobject_t missing_oid; + + // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS + const hobject_t& oid = + m->get_snapid() == CEPH_SNAPDIR ? 
head : m->get_hobj(); + + // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else + for (vector::iterator p = m->ops.begin(); p != m->ops.end(); ++p) { + OSDOp& osd_op = *p; + + if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) { + if (m->get_snapid() != CEPH_SNAPDIR) { + dout(10) << "LIST_SNAPS with incorrect context" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + } else { + if (m->get_snapid() == CEPH_SNAPDIR) { + dout(10) << "non-LIST_SNAPS on snapdir" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + } + } + + // io blocked on obc? + if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) && + maybe_await_blocked_head(oid, op)) { + return; + } + + if (!is_primary()) { + if (!recovery_state.can_serve_replica_read(oid)) { + dout(20) << __func__ + << ": unstable write on replica, bouncing to primary " + << *m << dendl; + osd->reply_op_error(op, -EAGAIN); + return; + } + dout(20) << __func__ << ": serving replica read on oid " << oid + << dendl; + } + + int r = find_object_context( + oid, &obc, can_create, + m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE), + &missing_oid); + + // LIST_SNAPS needs the ssc too + if (obc && + m->get_snapid() == CEPH_SNAPDIR && + !obc->ssc) { + obc->ssc = get_snapset_context(oid, true); + } + + if (r == -EAGAIN) { + // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise, + // we have to wait for the object. + if (is_primary()) { + // missing the specific snap we need; requeue and wait. + ceph_assert(!op->may_write()); // only happens on a read/cache + wait_for_unreadable_object(missing_oid, op); + return; + } + } else if (r == 0) { + if (is_unreadable_object(obc->obs.oi.soid)) { + dout(10) << __func__ << ": clone " << obc->obs.oi.soid + << " is unreadable, waiting" << dendl; + wait_for_unreadable_object(obc->obs.oi.soid, op); + return; + } + + // degraded object? 
(the check above was for head; this could be a clone) + if (write_ordered && + obc->obs.oi.soid.snap != CEPH_NOSNAP && + is_degraded_or_backfilling_object(obc->obs.oi.soid)) { + dout(10) << __func__ << ": clone " << obc->obs.oi.soid + << " is degraded, waiting" << dendl; + wait_for_degraded_object(obc->obs.oi.soid, op); + return; + } + } + + bool in_hit_set = false; + if (hit_set) { + if (obc.get()) { + if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid)) + in_hit_set = true; + } else { + if (missing_oid != hobject_t() && hit_set->contains(missing_oid)) + in_hit_set = true; + } + if (!op->hitset_inserted) { + hit_set->insert(oid); + op->hitset_inserted = true; + if (hit_set->is_full() || + hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) { + hit_set_persist(); + } + } + } + + if (agent_state) { + if (agent_choose_mode(false, op)) + return; + } + + if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) { + if (recover_adjacent_clones(obc, op)) { + return; + } + if (maybe_handle_manifest(op, + write_ordered, + obc)) + return; + } + + if (maybe_handle_cache(op, + write_ordered, + obc, + r, + missing_oid, + false, + in_hit_set)) + return; + + if (r && (r != -ENOENT || !obc)) { + // copy the reqids for copy get on ENOENT + if (r == -ENOENT && + (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) { + fill_in_copy_get_noent(op, oid, m->ops[0]); + return; + } + dout(20) << __func__ << ": find_object_context got error " << r << dendl; + if (op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + record_write_error(op, oid, nullptr, r); + } else { + osd->reply_op_error(op, r); + } + return; + } + + // make sure locator is consistent + object_locator_t oloc(obc->obs.oi.soid); + if (m->get_object_locator() != oloc) { + dout(10) << " provided locator " << m->get_object_locator() + << " != object's " << obc->obs.oi.soid << dendl; + osd->clog->warn() << "bad locator " << m->get_object_locator() + << " on object " << oloc + << " op " << *m; + } + + // io blocked on obc? + if (obc->is_blocked() && + !m->has_flag(CEPH_OSD_FLAG_FLUSH)) { + wait_for_blocked_object(obc->obs.oi.soid, op); + return; + } + + dout(25) << __func__ << " oi " << obc->obs.oi << dendl; + + OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this); + + if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) { + dout(20) << __func__ << ": skipping rw locks" << dendl; + } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) { + dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl; + + // verify there is in fact a flush in progress + // FIXME: we could make this a stronger test. + map::iterator p = flush_ops.find(obc->obs.oi.soid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush in progress, aborting" << dendl; + reply_ctx(ctx, -EINVAL); + return; + } + } else if (!get_rw_locks(write_ordered, ctx)) { + dout(20) << __func__ << " waiting for rw locks " << dendl; + op->mark_delayed("waiting for rw locks"); + close_op_ctx(ctx); + return; + } + dout(20) << __func__ << " obc " << *obc << dendl; + + if (r) { + dout(20) << __func__ << " returned an error: " << r << dendl; + if (op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + record_write_error(op, oid, nullptr, r, + ctx->op->allows_returnvec() ? 
ctx : nullptr); + } else { + osd->reply_op_error(op, r); + } + close_op_ctx(ctx); + return; + } + + if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) { + ctx->ignore_cache = true; + } + + if ((op->may_read()) && (obc->obs.oi.is_lost())) { + // This object is lost. Reading from it returns an error. + dout(20) << __func__ << ": object " << obc->obs.oi.soid + << " is lost" << dendl; + reply_ctx(ctx, -ENFILE); + return; + } + if (!op->may_write() && + !op->may_cache() && + (!obc->obs.exists || + ((m->get_snapid() != CEPH_SNAPDIR) && + obc->obs.oi.is_whiteout()))) { + // copy the reqids for copy get on ENOENT + if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) { + fill_in_copy_get_noent(op, oid, m->ops[0]); + close_op_ctx(ctx); + return; + } + reply_ctx(ctx, -ENOENT); + return; + } + + op->mark_started(); + + execute_ctx(ctx); + utime_t prepare_latency = ceph_clock_now(); + prepare_latency -= op->get_dequeued_time(); + osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency); + if (op->may_read() && op->may_write()) { + osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency); + } else if (op->may_read()) { + osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency); + } else if (op->may_write() || op->may_cache()) { + osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency); + } + + // force recovery of the oldest missing object if too many logs + maybe_force_recovery(); +} + +PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail( + OpRequestRef op, + bool write_ordered, + ObjectContextRef obc) +{ + ceph_assert(obc); + if (op->get_req()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) { + dout(20) << __func__ << ": ignoring redirect due to flag" << dendl; + return cache_result_t::NOOP; + } + + // if it is write-ordered and blocked, stop now + if (obc->is_blocked() && write_ordered) { + // we're already doing something with this object + dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl; + return cache_result_t::NOOP; + } + + vector ops = op->get_req()->ops; + for (vector::iterator p = ops.begin(); p != ops.end(); ++p) { + OSDOp& osd_op = *p; + ceph_osd_op& op = osd_op.op; + if (op.op == CEPH_OSD_OP_SET_REDIRECT || + op.op == CEPH_OSD_OP_SET_CHUNK || + op.op == CEPH_OSD_OP_UNSET_MANIFEST || + op.op == CEPH_OSD_OP_TIER_PROMOTE || + op.op == CEPH_OSD_OP_TIER_FLUSH || + op.op == CEPH_OSD_OP_TIER_EVICT) { + return cache_result_t::NOOP; + } + } + + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + if (op->may_write() || write_ordered) { + do_proxy_write(op, obc); + } else { + // promoted object + if (obc->obs.oi.size != 0) { + return cache_result_t::NOOP; + } + do_proxy_read(op, obc); + } + return cache_result_t::HANDLED_PROXY; + case object_manifest_t::TYPE_CHUNKED: + { + if (can_proxy_chunked_read(op, obc)) { + map::iterator p = flush_ops.find(obc->obs.oi.soid); + if (p != flush_ops.end()) { + do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true); + return cache_result_t::HANDLED_PROXY; + } + do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered); + return cache_result_t::HANDLED_PROXY; + } + + MOSDOp *m = static_cast(op->get_nonconst_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + hobject_t head = m->get_hobj(); + + if (is_degraded_or_backfilling_object(head)) { + dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl; + wait_for_degraded_object(head, op); + return cache_result_t::BLOCKED_RECOVERY; + } + + if (m_scrubber->write_blocked_by_scrub(head)) { + dout(20) << __func__ << ": waiting for scrub" << 
dendl; + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + return cache_result_t::BLOCKED_RECOVERY; + } + if (!check_laggy_requeue(op)) { + return cache_result_t::BLOCKED_RECOVERY; + } + + for (auto& p : obc->obs.oi.manifest.chunk_map) { + if (p.second.is_missing()) { + auto m = op->get_req(); + const object_locator_t oloc = m->get_object_locator(); + promote_object(obc, obc->obs.oi.soid, oloc, op, NULL); + return cache_result_t::BLOCKED_PROMOTE; + } + } + return cache_result_t::NOOP; + } + default: + ceph_abort_msg("unrecognized manifest type"); + } + + return cache_result_t::NOOP; +} + +void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid, + MOSDOpReply *orig_reply, int r, + OpContext *ctx_for_op_returns) +{ + dout(20) << __func__ << " r=" << r << dendl; + ceph_assert(op->may_write()); + const osd_reqid_t &reqid = op->get_req()->get_reqid(); + mempool::osd_pglog::list entries; + entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid, + get_next_version(), eversion_t(), 0, + reqid, utime_t(), r)); + if (ctx_for_op_returns) { + entries.back().set_op_returns(*ctx_for_op_returns->ops); + dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl; + } + + struct OnComplete { + PrimaryLogPG *pg; + OpRequestRef op; + boost::intrusive_ptr orig_reply; + int r; + OnComplete( + PrimaryLogPG *pg, + OpRequestRef op, + MOSDOpReply *orig_reply, + int r) + : pg(pg), op(op), + orig_reply(orig_reply, false /* take over ref */), r(r) + {} + void operator()() { + ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl; + auto m = op->get_req(); + MOSDOpReply *reply = orig_reply.detach(); + ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl; + pg->osd->send_message_osd_client(reply, m->get_connection()); + } + }; + + ObcLockManager lock_manager; + submit_log_entries( + entries, + std::move(lock_manager), + std::optional >( + OnComplete(this, op, orig_reply, r)), + op, + r); +} + +PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail( + OpRequestRef op, + bool write_ordered, + ObjectContextRef obc, + int r, hobject_t missing_oid, + bool must_promote, + bool in_hit_set, + ObjectContextRef *promote_obc) +{ + // return quickly if caching is not enabled + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) + return cache_result_t::NOOP; + + if (op && + op->get_req() && + op->get_req()->get_type() == CEPH_MSG_OSD_OP && + (op->get_req()->get_flags() & + CEPH_OSD_FLAG_IGNORE_CACHE)) { + dout(20) << __func__ << ": ignoring cache due to flag" << dendl; + return cache_result_t::NOOP; + } + + must_promote = must_promote || op->need_promote(); + + if (obc) + dout(25) << __func__ << " " << obc->obs.oi << " " + << (obc->obs.exists ? 
"exists" : "DNE") + << " missing_oid " << missing_oid + << " must_promote " << (int)must_promote + << " in_hit_set " << (int)in_hit_set + << dendl; + else + dout(25) << __func__ << " (no obc)" + << " missing_oid " << missing_oid + << " must_promote " << (int)must_promote + << " in_hit_set " << (int)in_hit_set + << dendl; + + // if it is write-ordered and blocked, stop now + if (obc.get() && obc->is_blocked() && write_ordered) { + // we're already doing something with this object + dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl; + return cache_result_t::NOOP; + } + + if (r == -ENOENT && missing_oid == hobject_t()) { + // we know this object is logically absent (e.g., an undefined clone) + return cache_result_t::NOOP; + } + + if (obc.get() && obc->obs.exists) { + osd->logger->inc(l_osd_op_cache_hit); + return cache_result_t::NOOP; + } + if (!is_primary()) { + dout(20) << __func__ << " cache miss; ask the primary" << dendl; + osd->reply_op_error(op, -EAGAIN); + return cache_result_t::REPLIED_WITH_EAGAIN; + } + + if (missing_oid == hobject_t() && obc.get()) { + missing_oid = obc->obs.oi.soid; + } + + auto m = op->get_req(); + const object_locator_t oloc = m->get_object_locator(); + + if (op->need_skip_handle_cache()) { + return cache_result_t::NOOP; + } + + OpRequestRef promote_op; + + switch (pool.info.cache_mode) { + case pg_pool_t::CACHEMODE_WRITEBACK: + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + if (!op->may_write() && !op->may_cache() && + !write_ordered && !must_promote) { + dout(20) << __func__ << " cache pool full, proxying read" << dendl; + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + } + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + + if (must_promote || (!hit_set && !op->need_skip_promote())) { + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + + if (op->may_write() || op->may_cache()) { + do_proxy_write(op); + + // Promote too? + if (!op->need_skip_promote() && + maybe_promote(obc, missing_oid, oloc, in_hit_set, + pool.info.min_write_recency_for_promote, + OpRequestRef(), + promote_obc)) { + return cache_result_t::BLOCKED_PROMOTE; + } + return cache_result_t::HANDLED_PROXY; + } else { + do_proxy_read(op); + + // Avoid duplicate promotion + if (obc.get() && obc->is_blocked()) { + if (promote_obc) + *promote_obc = obc; + return cache_result_t::BLOCKED_PROMOTE; + } + + // Promote too? 
+ if (!op->need_skip_promote()) { + (void)maybe_promote(obc, missing_oid, oloc, in_hit_set, + pool.info.min_read_recency_for_promote, + promote_op, promote_obc); + } + + return cache_result_t::HANDLED_PROXY; + } + ceph_abort_msg("unreachable"); + return cache_result_t::NOOP; + + case pg_pool_t::CACHEMODE_READONLY: + // TODO: clean this case up + if (!obc.get() && r == -ENOENT) { + // we don't have the object and op's a read + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + if (!r) { // it must be a write + do_cache_redirect(op); + return cache_result_t::HANDLED_REDIRECT; + } + // crap, there was a failure of some kind + return cache_result_t::NOOP; + + case pg_pool_t::CACHEMODE_FORWARD: + // this mode is deprecated; proxy instead + case pg_pool_t::CACHEMODE_PROXY: + if (!must_promote) { + if (op->may_write() || op->may_cache() || write_ordered) { + do_proxy_write(op); + return cache_result_t::HANDLED_PROXY; + } else { + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + } + } + // ugh, we're forced to promote. + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + + case pg_pool_t::CACHEMODE_READFORWARD: + // this mode is deprecated; proxy instead + case pg_pool_t::CACHEMODE_READPROXY: + // Do writeback to the cache tier for writes + if (op->may_write() || write_ordered || must_promote) { + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + + // If it is a read, we can read, we need to proxy it + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + + default: + ceph_abort_msg("unrecognized cache_mode"); + } + return cache_result_t::NOOP; +} + +bool PrimaryLogPG::maybe_promote(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + bool in_hit_set, + uint32_t recency, + OpRequestRef promote_op, + ObjectContextRef *promote_obc) +{ + dout(20) << __func__ << " missing_oid " << missing_oid + << " in_hit_set " << in_hit_set << dendl; + + switch (recency) { + case 0: + break; + case 1: + // Check if in the current hit set + if (in_hit_set) { + break; + } else { + // not promoting + return false; + } + break; + default: + { + unsigned count = (int)in_hit_set; + if (count) { + // Check if in other hit sets + const hobject_t& oid = obc.get() ? 
obc->obs.oi.soid : missing_oid; + for (map::reverse_iterator itor = + agent_state->hit_set_map.rbegin(); + itor != agent_state->hit_set_map.rend(); + ++itor) { + if (!itor->second->contains(oid)) { + break; + } + ++count; + if (count >= recency) { + break; + } + } + } + if (count >= recency) { + break; + } + return false; // not promoting + } + break; + } + + if (osd->promote_throttle()) { + dout(10) << __func__ << " promote throttled" << dendl; + return false; + } + promote_object(obc, missing_oid, oloc, promote_op, promote_obc); + return true; +} + +void PrimaryLogPG::do_cache_redirect(OpRequestRef op) +{ + auto m = op->get_req(); + int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); + MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(), + flags, false); + request_redirect_t redir(m->get_object_locator(), pool.info.tier_of); + reply->set_redirect(redir); + dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op " + << op << dendl; + m->get_connection()->send_message(reply); + return; +} + +struct C_ProxyRead : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyReadOpRef prdop; + utime_t start; + C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyReadOpRef& prd) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), prdop(prd), start(ceph_clock_now()) + {} + void finish(int r) override { + if (prdop->canceled) + return; + std::scoped_lock locker{*pg}; + if (prdop->canceled) { + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_proxy_read(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start); + } + } +}; + +struct C_ProxyChunkRead : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyReadOpRef prdop; + utime_t start; + ObjectOperation *obj_op; + int op_index = 0; + uint64_t req_offset = 0; + ObjectContextRef obc; + uint64_t req_total_len = 0; + C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyReadOpRef& prd) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL) + {} + void finish(int r) override { + if (prdop->canceled) + return; + std::scoped_lock locker{*pg}; + if (prdop->canceled) { + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + if (r >= 0) { + if (!prdop->ops[op_index].outdata.length()) { + ceph_assert(req_total_len); + bufferlist list; + bufferptr bptr(req_total_len); + list.push_back(std::move(bptr)); + prdop->ops[op_index].outdata.append(list); + } + ceph_assert(obj_op); + uint64_t copy_offset; + if (req_offset >= prdop->ops[op_index].op.extent.offset) { + copy_offset = req_offset - prdop->ops[op_index].op.extent.offset; + } else { + copy_offset = 0; + } + prdop->ops[op_index].outdata.begin(copy_offset).copy_in( + obj_op->ops[0].outdata.length(), + obj_op->ops[0].outdata.c_str()); + } + + pg->finish_proxy_read(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start); + if (obj_op) { + delete obj_op; + } + } + } +}; + +void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc) +{ + // NOTE: non-const here because the ProxyReadOp needs mutable refs to + // stash the result in the request's OSDOp vector + MOSDOp *m = static_cast(op->get_nonconst_req()); + object_locator_t oloc; + hobject_t soid; + /* extensible tier */ + if (obc && obc->obs.exists 
&& obc->obs.oi.has_manifest()) { + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + oloc = object_locator_t(obc->obs.oi.manifest.redirect_target); + soid = obc->obs.oi.manifest.redirect_target; + break; + default: + ceph_abort_msg("unrecognized manifest type"); + } + } else { + /* proxy */ + soid = m->get_hobj(); + oloc = object_locator_t(m->get_object_locator()); + oloc.pool = pool.info.tier_of; + } + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + + // pass through some original flags that make sense. + // - leave out redirection and balancing flags since we are + // already proxying through the primary + // - leave off read/write/exec flags that are derived from the op + flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC | + CEPH_OSD_FLAG_MAP_SNAP_CLONE); + + dout(10) << __func__ << " Start proxy read for " << *m << dendl; + + ProxyReadOpRef prdop(std::make_shared(op, soid, m->ops)); + + ObjectOperation obj_op; + obj_op.dup(prdop->ops); + + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK && + (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) { + for (unsigned i = 0; i < obj_op.ops.size(); i++) { + ceph_osd_op op = obj_op.ops[i].op; + switch (op.op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_CHECKSUM: + case CEPH_OSD_OP_CMPEXT: + op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) & + ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + } + } + } + + C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(), + prdop); + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, obj_op, + m->get_snapid(), NULL, + flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + &prdop->user_version, + &prdop->data_offset, + m->get_features()); + fin->tid = tid; + prdop->objecter_tid = tid; + proxyread_ops[tid] = prdop; + in_progress_proxy_ops[soid].push_back(op); +} + +void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + + map::iterator p = proxyread_ops.find(tid); + if (p == proxyread_ops.end()) { + dout(10) << __func__ << " no proxyread_op found" << dendl; + return; + } + ProxyReadOpRef prdop = p->second; + if (tid != prdop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != prdop " << prdop + << " tid " << prdop->objecter_tid << dendl; + return; + } + if (oid != prdop->soid) { + dout(10) << __func__ << " oid " << oid << " != prdop " << prdop + << " soid " << prdop->soid << dendl; + return; + } + proxyread_ops.erase(tid); + + map>::iterator q = in_progress_proxy_ops.find(oid); + if (q == in_progress_proxy_ops.end()) { + dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl; + return; + } + ceph_assert(q->second.size()); + list::iterator it = std::find(q->second.begin(), + q->second.end(), + prdop->op); + ceph_assert(it != q->second.end()); + OpRequestRef op = *it; + q->second.erase(it); + if (q->second.size() == 0) { + in_progress_proxy_ops.erase(oid); + } else if (std::find(q->second.begin(), + q->second.end(), + prdop->op) != q->second.end()) { + /* multiple read case */ + dout(20) << __func__ << " " << oid << " is not completed " << dendl; + return; + } + + osd->logger->inc(l_osd_tier_proxy_read); + + auto m = op->get_req(); + OpContext *ctx = new OpContext(op, 
m->get_reqid(), &prdop->ops, this); + ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + ctx->user_at_version = prdop->user_version; + ctx->data_off = prdop->data_offset; + ctx->ignore_log_op_stats = true; + complete_read_ctx(r, ctx); +} + +void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid) +{ + map>::iterator p = in_progress_proxy_ops.find(soid); + if (p == in_progress_proxy_ops.end()) + return; + + list& ls = p->second; + dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl; + requeue_ops(ls); + in_progress_proxy_ops.erase(p); +} + +void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop, + vector *tids) +{ + dout(10) << __func__ << " " << prdop->soid << dendl; + prdop->canceled = true; + + // cancel objecter op, if we can + if (prdop->objecter_tid) { + tids->push_back(prdop->objecter_tid); + for (uint32_t i = 0; i < prdop->ops.size(); i++) { + prdop->ops[i].outdata.clear(); + } + proxyread_ops.erase(prdop->objecter_tid); + prdop->objecter_tid = 0; + } +} + +void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector *tids) +{ + dout(10) << __func__ << dendl; + + // cancel proxy reads + map::iterator p = proxyread_ops.begin(); + while (p != proxyread_ops.end()) { + cancel_proxy_read((p++)->second, tids); + } + + // cancel proxy writes + map::iterator q = proxywrite_ops.begin(); + while (q != proxywrite_ops.end()) { + cancel_proxy_write((q++)->second, tids); + } + + if (requeue) { + map>::iterator p = + in_progress_proxy_ops.begin(); + while (p != in_progress_proxy_ops.end()) { + list& ls = p->second; + dout(10) << __func__ << " " << p->first << " requeuing " << ls.size() + << " requests" << dendl; + requeue_ops(ls); + in_progress_proxy_ops.erase(p++); + } + } else { + in_progress_proxy_ops.clear(); + } +} + +struct C_ProxyWrite_Commit : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyWriteOpRef pwop; + C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyWriteOpRef& pw) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), pwop(pw) + {} + void finish(int r) override { + if (pwop->canceled) + return; + std::scoped_lock locker{*pg}; + if (pwop->canceled) { + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_proxy_write(oid, tid, r); + } + } +}; + +void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc) +{ + // NOTE: non-const because ProxyWriteOp takes a mutable ref + MOSDOp *m = static_cast(op->get_nonconst_req()); + object_locator_t oloc; + SnapContext snapc(m->get_snap_seq(), m->get_snaps()); + hobject_t soid; + /* extensible tier */ + if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) { + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + oloc = object_locator_t(obc->obs.oi.manifest.redirect_target); + soid = obc->obs.oi.manifest.redirect_target; + break; + default: + ceph_abort_msg("unrecognized manifest type"); + } + } else { + /* proxy */ + soid = m->get_hobj(); + oloc = object_locator_t(m->get_object_locator()); + oloc.pool = pool.info.tier_of; + } + + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (!(op->may_write() || op->may_cache())) { + flags |= CEPH_OSD_FLAG_RWORDERED; + } + if (op->allows_returnvec()) { + flags |= CEPH_OSD_FLAG_RETURNVEC; + } + + dout(10) << __func__ << " Start proxy write for " << *m << dendl; + + ProxyWriteOpRef pwop(std::make_shared(op, soid, m->ops, 
m->get_reqid())); + pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this); + pwop->mtime = m->get_mtime(); + + ObjectOperation obj_op; + obj_op.dup(pwop->ops); + + C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit( + this, soid, get_last_peering_reset(), pwop); + ceph_tid_t tid = osd->objecter->mutate( + soid.oid, oloc, obj_op, snapc, + ceph::real_clock::from_ceph_timespec(pwop->mtime), + flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + &pwop->user_version, pwop->reqid); + fin->tid = tid; + pwop->objecter_tid = tid; + proxywrite_ops[tid] = pwop; + in_progress_proxy_ops[soid].push_back(op); +} + +void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid, + ObjectContextRef obc, bool write_ordered) +{ + MOSDOp *m = static_cast(op->get_nonconst_req()); + OSDOp *osd_op = NULL; + for (unsigned int i = 0; i < m->ops.size(); i++) { + osd_op = &m->ops[i]; + uint64_t cursor = osd_op->op.extent.offset; + uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length; + uint64_t chunk_length = 0, chunk_index = 0, req_len = 0; + object_manifest_t *manifest = &obc->obs.oi.manifest; + map > chunk_read; + + while (cursor < op_length) { + chunk_index = 0; + chunk_length = 0; + /* find the right chunk position for cursor */ + for (auto &p : manifest->chunk_map) { + if (p.first <= cursor && p.first + p.second.length > cursor) { + chunk_length = p.second.length; + chunk_index = p.first; + break; + } + } + /* no index */ + if (!chunk_index && !chunk_length) { + if (cursor == osd_op->op.extent.offset) { + OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this); + ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + ctx->data_off = osd_op->op.extent.offset; + ctx->ignore_log_op_stats = true; + complete_read_ctx(0, ctx); + } + break; + } + uint64_t next_length = chunk_length; + /* the size to read -> | op length | */ + /* | a chunk | */ + if (cursor + next_length > op_length) { + next_length = op_length - cursor; + } + /* the size to read -> | op length | */ + /* | a chunk | */ + if (cursor + next_length > chunk_index + chunk_length) { + next_length = chunk_index + chunk_length - cursor; + } + + chunk_read[cursor] = {{chunk_index, next_length}}; + cursor += next_length; + } + + req_len = cursor - osd_op->op.extent.offset; + for (auto &p : chunk_read) { + auto chunks = p.second.begin(); + dout(20) << __func__ << " chunk_index: " << chunks->first + << " next_length: " << chunks->second << " cursor: " + << p.first << dendl; + do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered); + } + } +} + +struct RefCountCallback : public Context { +public: + PrimaryLogPG::OpContext *ctx; + OSDOp& osd_op; + bool requeue = false; + + RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op) + : ctx(ctx), osd_op(osd_op) {} + void finish(int r) override { + // NB: caller must already have pg->lock held + ctx->obc->stop_block(); + ctx->pg->kick_object_context_blocked(ctx->obc); + if (r >= 0) { + osd_op.rval = 0; + ctx->pg->execute_ctx(ctx); + } else { + // on cancel simply toss op out, + // or requeue as requested + if (r != -ECANCELED) { + if (ctx->op) + ctx->pg->osd->reply_op_error(ctx->op, r); + } else if (requeue) { + if (ctx->op) + ctx->pg->requeue_op(ctx->op); + } + ctx->pg->close_op_ctx(ctx); + } + } + void set_requeue(bool rq) { + requeue = rq; + } +}; + +struct SetManifestFinisher : public PrimaryLogPG::OpFinisher { + OSDOp& osd_op; + + explicit SetManifestFinisher(OSDOp& 
osd_op) : osd_op(osd_op) { + } + + int execute() override { + return osd_op.rval; + } +}; + +struct C_SetManifestRefCountDone : public Context { + PrimaryLogPGRef pg; + PrimaryLogPG::ManifestOpRef mop; + hobject_t soid; + C_SetManifestRefCountDone(PrimaryLogPG *p, + PrimaryLogPG::ManifestOpRef mop, hobject_t soid) : + pg(p), mop(mop), soid(soid) {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock locker{*pg}; + auto it = pg->manifest_ops.find(soid); + if (it == pg->manifest_ops.end()) { + // raced with cancel_manifest_ops + return; + } + if (it->second->cb) { + it->second->cb->complete(r); + } + pg->manifest_ops.erase(it); + mop.reset(); + } +}; + +struct C_SetDedupChunks : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + uint64_t offset; + + C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), offset(offset) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock locker{*pg}; + if (last_peering_reset != pg->get_last_peering_reset()) { + return; + } + pg->finish_set_dedup(oid, r, tid, offset); + } +}; + +void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector *tids) +{ + dout(10) << __func__ << dendl; + auto p = manifest_ops.begin(); + while (p != manifest_ops.end()) { + auto mop = p->second; + // cancel objecter op, if we can + if (mop->objecter_tid) { + tids->push_back(mop->objecter_tid); + mop->objecter_tid = 0; + } + if (mop->cb) { + mop->cb->set_requeue(requeue); + mop->cb->complete(-ECANCELED); + } + manifest_ops.erase(p++); + } +} + +int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op) +{ + int cnt = 0; + // head + for (auto &p : obc->obs.oi.manifest.chunk_map) { + if (p.second.oid.oid.name == fp_oid) { + cnt++; + } + } + // snap + SnapSet& ss = obc->ssc->snapset; + const OSDMapRef& osdmap = get_osdmap(); + for (vector::const_reverse_iterator p = ss.clones.rbegin(); + p != ss.clones.rend(); + ++p) { + object_ref_delta_t refs; + ObjectContextRef obc_l = nullptr; + ObjectContextRef obc_g = nullptr; + hobject_t clone_oid = obc->obs.oi.soid; + clone_oid.snap = *p; + if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) { + return -EBUSY; + } + ObjectContextRef clone_obc = get_object_context(clone_oid, false); + if (!clone_obc) { + break; + } + if (recover_adjacent_clones(clone_obc, op)) { + return -EAGAIN; + } + get_adjacent_clones(clone_obc, obc_l, obc_g); + clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set( + obc_g ? 
&(obc_g->obs.oi.manifest) : nullptr , + nullptr, + refs); + for (auto p = refs.begin(); p != refs.end(); ++p) { + if (p->first.oid.name == fp_oid && p->second > 0) { + cnt += p->second; + } + } + } + + return cnt; +} + +bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op) +{ + if (!obc->ssc || !obc->ssc->snapset.clones.size()) { + return false; + } + MOSDOp *m = static_cast(op->get_nonconst_req()); + bool has_manifest_op = std::any_of( + begin(m->ops), + end(m->ops), + [](const auto& osd_op) { + return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK; + }); + if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) { + return false; + } + ceph_assert(op); + + const SnapSet& snapset = obc->ssc->snapset; + auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap); + auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool { + hobject_t cid = obc->obs.oi.soid; + cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter; + if (is_unreadable_object(cid)) { + dout(10) << __func__ << ": clone " << cid + << " is unreadable, waiting" << dendl; + wait_for_unreadable_object(cid, op); + return true; + } + return false; + }; + if (s != snapset.clones.begin()) { + if (is_unreadable_snap(s - 1)) { + return true; + } + } + if (s != snapset.clones.end()) { + if (is_unreadable_snap(s + 1)) { + return true; + } + } + return false; +} + +ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc) +{ + auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(), + obc->obs.oi.soid.snap); + if (s != obc->ssc->snapset.clones.begin()) { + auto s_iter = s - 1; + hobject_t cid = obc->obs.oi.soid; + object_ref_delta_t refs; + cid.snap = *s_iter; + ObjectContextRef cobc = get_object_context(cid, false, NULL); + ceph_assert(cobc); + return cobc; + } + return nullptr; +} + +void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs) +{ + for (auto p = refs.begin(); p != refs.end(); ++p) { + int dec_ref_count = p->second; + ceph_assert(dec_ref_count < 0); + while (dec_ref_count < 0) { + dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl; + refcount_manifest(soid, p->first, + refcount_t::DECREMENT_REF, NULL, std::nullopt); + dec_ref_count++; + } + } +} + + +void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc, + ObjectContextRef& _l, ObjectContextRef& _g) +{ + const SnapSet& snapset = src_obc->ssc->snapset; + const object_info_t& oi = src_obc->obs.oi; + + auto get_context = [this, &oi, &snapset](auto iter) + -> ObjectContextRef { + hobject_t cid = oi.soid; + cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter; + ObjectContextRef obc = get_object_context(cid, false, NULL); + ceph_assert(obc); + return obc; + }; + + // check adjacent clones + auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap); + + // We *must* find the clone iff it's not head, + // let s == snapset.clones.end() mean head + ceph_assert((s == snapset.clones.end()) == oi.soid.is_head()); + + if (s != snapset.clones.begin()) { + _l = get_context(s - 1); + } + + if (s != snapset.clones.end()) { + _g = get_context(s + 1); + } +} + +bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk, + OSDOp& osd_op) +{ + object_ref_delta_t refs; + ObjectContextRef obc_l, obc_g; + get_adjacent_clones(ctx->obc, obc_l, obc_g); + set_chunk.calc_refs_to_inc_on_set( + obc_l ? 
&(obc_l->obs.oi.manifest) : nullptr, + obc_g ? &(obc_g->obs.oi.manifest) : nullptr, + refs); + if (!refs.is_empty()) { + /* This is called by set-chunk, so we only consider a single chunk for the time being */ + ceph_assert(refs.size() == 1); + auto p = refs.begin(); + int inc_ref_count = p->second; + if (inc_ref_count > 0) { + /* + * In set-chunk case, the first thing we should do is to increment + * the reference the targe object has prior to update object_manifest in object_info_t. + * So, call directly refcount_manifest. + */ + ManifestOpRef mop = std::make_shared(new RefCountCallback(ctx, osd_op)); + C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, ctx->obs->oi.soid); + ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, p->first, + refcount_t::INCREMENT_REF, fin, std::nullopt); + mop->objecter_tid = tid; + manifest_ops[ctx->obs->oi.soid] = mop; + ctx->obc->start_block(); + return true; + } else if (inc_ref_count < 0) { + hobject_t src = ctx->obs->oi.soid; + hobject_t tgt = p->first; + ctx->register_on_commit( + [src, tgt, this](){ + refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt); + }); + return false; + } + } + + return false; +} + +void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx) +{ + object_ref_delta_t refs; + ObjectContextRef cobc = nullptr; + ObjectContextRef obc = ctx->obc; + for (auto &p : ctx->obs->oi.manifest.chunk_map) { + if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) { + ctx->new_obs.oi.manifest.chunk_map.erase(p.first); + if (ctx->new_obs.oi.manifest.chunk_map.empty()) { + ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST); + ctx->delta_stats.num_objects_manifest--; + } + } + } + // Look over previous snapshot, then figure out whether updated chunk needs to be deleted + cobc = get_prev_clone_obc(obc); + obc->obs.oi.manifest.calc_refs_to_drop_on_modify( + cobc ? &cobc->obs.oi.manifest : nullptr, + ctx->clean_regions, + refs); + if (!refs.is_empty()) { + hobject_t soid = obc->obs.oi.soid; + ctx->register_on_commit( + [soid, this, refs](){ + dec_refcount(soid, refs); + }); + } +} + +void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx) +{ + ceph_assert(oi.has_manifest()); + ceph_assert(ctx->obc->ssc); + + if (oi.manifest.is_chunked()) { + object_ref_delta_t refs; + ObjectContextRef obc_l, obc_g; + get_adjacent_clones(ctx->obc, obc_l, obc_g); + oi.manifest.calc_refs_to_drop_on_removal( + obc_l ? &(obc_l->obs.oi.manifest) : nullptr, + obc_g ? 
&(obc_g->obs.oi.manifest) : nullptr, + refs); + + if (!refs.is_empty()) { + hobject_t soid = ctx->obc->obs.oi.soid; + ctx->register_on_commit( + [soid, this, refs](){ + dec_refcount(soid, refs); + }); + } + } else if (oi.manifest.is_redirect() && + oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) { + ctx->register_on_commit( + [oi, this](){ + refcount_manifest(oi.soid, oi.manifest.redirect_target, + refcount_t::DECREMENT_REF, NULL, std::nullopt); + }); + } +} + +ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type, + Context *cb, std::optional chunk) +{ + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_RWORDERED; + + dout(10) << __func__ << " Start refcount from " << src_soid + << " to " << tgt_soid << dendl; + + ObjectOperation obj_op; + bufferlist in; + if (type == refcount_t::INCREMENT_REF) { + cls_cas_chunk_get_ref_op call; + call.source = src_soid.get_head(); + ::encode(call, in); + obj_op.call("cas", "chunk_get_ref", in); + } else if (type == refcount_t::DECREMENT_REF) { + cls_cas_chunk_put_ref_op call; + call.source = src_soid.get_head(); + ::encode(call, in); + obj_op.call("cas", "chunk_put_ref", in); + } else if (type == refcount_t::CREATE_OR_GET_REF) { + cls_cas_chunk_create_or_get_ref_op get_call; + get_call.source = src_soid.get_head(); + ceph_assert(chunk); + get_call.data = move(*chunk); + ::encode(get_call, in); + obj_op.call("cas", "chunk_create_or_get_ref", in); + } else { + ceph_assert(0 == "unrecognized type"); + } + + Context *c = nullptr; + if (cb) { + c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard())); + } + + object_locator_t oloc(tgt_soid); + ObjectContextRef src_obc = get_object_context(src_soid, false, NULL); + ceph_assert(src_obc); + auto tid = osd->objecter->mutate( + tgt_soid.oid, oloc, obj_op, SnapContext(), + ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime), + flags, c); + return tid; +} + +void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index, + uint64_t chunk_index, uint64_t req_offset, uint64_t req_length, + uint64_t req_total_len, bool write_ordered) +{ + MOSDOp *m = static_cast(op->get_nonconst_req()); + object_manifest_t *manifest = &obc->obs.oi.manifest; + if (!manifest->chunk_map.count(chunk_index)) { + return; + } + uint64_t chunk_length = manifest->chunk_map[chunk_index].length; + hobject_t soid = manifest->chunk_map[chunk_index].oid; + hobject_t ori_soid = m->get_hobj(); + object_locator_t oloc(soid); + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (write_ordered) { + flags |= CEPH_OSD_FLAG_RWORDERED; + } + + if (!chunk_length || soid == hobject_t()) { + return; + } + + /* same as do_proxy_read() */ + flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC | + CEPH_OSD_FLAG_MAP_SNAP_CLONE); + + dout(10) << __func__ << " Start do chunk proxy read for " << *m + << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset + << " req_length: " << req_length << dendl; + + ProxyReadOpRef prdop(std::make_shared(op, ori_soid, m->ops)); + + ObjectOperation *pobj_op = new ObjectOperation; + OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op); + + if (chunk_index <= req_offset) { + osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index; + } else { + ceph_abort_msg("chunk_index > req_offset"); + } + osd_op.op.extent.length = req_length; + 
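+  // The extent set up above has been translated from the logical object
+  // offset into the backing chunk's own offset; the completion callback
+  // (C_ProxyChunkRead) copies the returned data back into the original op's
+  // outdata at the matching position.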
+ ObjectOperation obj_op; + obj_op.dup(pobj_op->ops); + + C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(), + prdop); + fin->obj_op = pobj_op; + fin->op_index = op_index; + fin->req_offset = req_offset; + fin->obc = obc; + fin->req_total_len = req_total_len; + + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, obj_op, + m->get_snapid(), NULL, + flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + &prdop->user_version, + &prdop->data_offset, + m->get_features()); + fin->tid = tid; + prdop->objecter_tid = tid; + proxyread_ops[tid] = prdop; + in_progress_proxy_ops[ori_soid].push_back(op); +} + +bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc) +{ + MOSDOp *m = static_cast(op->get_nonconst_req()); + OSDOp *osd_op = NULL; + bool ret = true; + for (unsigned int i = 0; i < m->ops.size(); i++) { + osd_op = &m->ops[i]; + ceph_osd_op op = osd_op->op; + switch (op.op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: { + uint64_t cursor = osd_op->op.extent.offset; + uint64_t remain = osd_op->op.extent.length; + + /* requested chunks exist in chunk_map ? */ + for (auto &p : obc->obs.oi.manifest.chunk_map) { + if (p.first <= cursor && p.first + p.second.length > cursor) { + if (!p.second.is_missing()) { + return false; + } + if (p.second.length >= remain) { + remain = 0; + break; + } else { + remain = remain - p.second.length; + } + cursor += p.second.length; + } + } + + if (remain) { + dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl; + return false; + } + continue; + } + default: + return false; + } + } + return ret; +} + +void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + + map::iterator p = proxywrite_ops.find(tid); + if (p == proxywrite_ops.end()) { + dout(10) << __func__ << " no proxywrite_op found" << dendl; + return; + } + ProxyWriteOpRef pwop = p->second; + ceph_assert(tid == pwop->objecter_tid); + ceph_assert(oid == pwop->soid); + + proxywrite_ops.erase(tid); + + map >::iterator q = in_progress_proxy_ops.find(oid); + if (q == in_progress_proxy_ops.end()) { + dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl; + delete pwop->ctx; + pwop->ctx = NULL; + return; + } + list& in_progress_op = q->second; + ceph_assert(in_progress_op.size()); + list::iterator it = std::find(in_progress_op.begin(), + in_progress_op.end(), + pwop->op); + ceph_assert(it != in_progress_op.end()); + in_progress_op.erase(it); + if (in_progress_op.size() == 0) { + in_progress_proxy_ops.erase(oid); + } else if (std::find(in_progress_op.begin(), + in_progress_op.end(), + pwop->op) != in_progress_op.end()) { + if (pwop->ctx) + delete pwop->ctx; + pwop->ctx = NULL; + dout(20) << __func__ << " " << oid << " tid " << tid + << " in_progress_op size: " + << in_progress_op.size() << dendl; + return; + } + + osd->logger->inc(l_osd_tier_proxy_write); + + auto m = pwop->op->get_req(); + ceph_assert(m != NULL); + + if (!pwop->sent_reply) { + // send commit. 
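+    // The commit reply carries ACK|ONDISK, the user version returned by the
+    // proxied write, and the per-op output data claimed from pwop->ops.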
+ assert(pwop->ctx->reply == nullptr); + MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0, + true /* we claim it below */); + reply->set_reply_versions(eversion_t(), pwop->user_version); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + reply->claim_op_out_data(pwop->ops); + dout(10) << " sending commit on " << pwop << " " << reply << dendl; + osd->send_message_osd_client(reply, m->get_connection()); + pwop->sent_reply = true; + pwop->ctx->op->mark_commit_sent(); + } + + delete pwop->ctx; + pwop->ctx = NULL; +} + +void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop, + vector *tids) +{ + dout(10) << __func__ << " " << pwop->soid << dendl; + pwop->canceled = true; + + // cancel objecter op, if we can + if (pwop->objecter_tid) { + tids->push_back(pwop->objecter_tid); + delete pwop->ctx; + pwop->ctx = NULL; + proxywrite_ops.erase(pwop->objecter_tid); + pwop->objecter_tid = 0; + } +} + +class PromoteCallback: public PrimaryLogPG::CopyCallback { + ObjectContextRef obc; + PrimaryLogPG *pg; + utime_t start; +public: + PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_) + : obc(obc_), + pg(pg_), + start(ceph_clock_now()) {} + + void finish(PrimaryLogPG::CopyCallbackResults results) override { + PrimaryLogPG::CopyResults *results_data = results.get<1>(); + int r = results.get<0>(); + pg->finish_promote(r, results_data, obc); + pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start); + } +}; + +class PromoteManifestCallback: public PrimaryLogPG::CopyCallback { + ObjectContextRef obc; + PrimaryLogPG *pg; + utime_t start; + PrimaryLogPG::OpContext *ctx; + PrimaryLogPG::CopyCallbackResults promote_results; +public: + PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL) + : obc(obc_), + pg(pg_), + start(ceph_clock_now()), ctx(ctx) {} + + void finish(PrimaryLogPG::CopyCallbackResults results) override { + PrimaryLogPG::CopyResults *results_data = results.get<1>(); + int r = results.get<0>(); + if (ctx) { + promote_results = results; + pg->execute_ctx(ctx); + } else { + pg->finish_promote_manifest(r, results_data, obc); + } + pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start); + } + friend struct PromoteFinisher; +}; + +struct PromoteFinisher : public PrimaryLogPG::OpFinisher { + PromoteManifestCallback *promote_callback; + + explicit PromoteFinisher(PromoteManifestCallback *promote_callback) + : promote_callback(promote_callback) { + } + + int execute() override { + if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) { + promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(), + promote_callback->promote_results.get<1>(), + promote_callback->obc); + } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) { + promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(), + promote_callback->promote_results.get<1>(), + promote_callback->obc); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + return 0; + } +}; + +void PrimaryLogPG::promote_object(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + OpRequestRef op, + ObjectContextRef *promote_obc) +{ + hobject_t hoid = obc ? 
obc->obs.oi.soid : missing_oid; + ceph_assert(hoid != hobject_t()); + if (m_scrubber->write_blocked_by_scrub(hoid)) { + dout(10) << __func__ << " " << hoid + << " blocked by scrub" << dendl; + if (op) { + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + dout(10) << __func__ << " " << hoid + << " placing op in waiting_for_scrub" << dendl; + } else { + dout(10) << __func__ << " " << hoid + << " no op, dropping on the floor" << dendl; + } + return; + } + if (op && !check_laggy_requeue(op)) { + return; + } + if (!obc) { // we need to create an ObjectContext + ceph_assert(missing_oid != hobject_t()); + obc = get_object_context(missing_oid, true); + } + if (promote_obc) + *promote_obc = obc; + + /* + * Before promote complete, if there are proxy-reads for the object, + * for this case we don't use DONTNEED. + */ + unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL; + map>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid); + if (q == in_progress_proxy_ops.end()) { + src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED; + } + + CopyCallback *cb; + object_locator_t my_oloc; + hobject_t src_hoid; + if (!obc->obs.oi.has_manifest()) { + my_oloc = oloc; + my_oloc.pool = pool.info.tier_of; + src_hoid = obc->obs.oi.soid; + cb = new PromoteCallback(obc, this); + } else { + if (obc->obs.oi.manifest.is_chunked()) { + src_hoid = obc->obs.oi.soid; + cb = new PromoteManifestCallback(obc, this); + } else if (obc->obs.oi.manifest.is_redirect()) { + object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target); + my_oloc = src_oloc; + src_hoid = obc->obs.oi.manifest.redirect_target; + cb = new PromoteCallback(obc, this); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } + + unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | + CEPH_OSD_COPY_FROM_FLAG_RWORDERED; + start_copy(cb, obc, src_hoid, my_oloc, 0, flags, + obc->obs.oi.soid.snap == CEPH_NOSNAP, + src_fadvise_flags, 0); + + ceph_assert(obc->is_blocked()); + + if (op) + wait_for_blocked_object(obc->obs.oi.soid, op); + + recovery_state.update_stats( + [](auto &history, auto &stats) { + stats.stats.sum.num_promote++; + return false; + }); +} + +void PrimaryLogPG::execute_ctx(OpContext *ctx) +{ + FUNCTRACE(cct); + dout(10) << __func__ << " " << ctx << dendl; + ctx->reset_obs(ctx->obc); + ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx + OpRequestRef op = ctx->op; + auto m = op->get_req(); + ObjectContextRef obc = ctx->obc; + const hobject_t& soid = obc->obs.oi.soid; + + // this method must be idempotent since we may call it several times + // before we finally apply the resulting transaction. 
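+  // A fresh PGTransaction is allocated on every attempt so that a re-run of
+  // this method starts from a clean slate rather than appending to the
+  // previous attempt's transaction.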
+ ctx->op_t.reset(new PGTransaction); + + if (op->may_write() || op->may_cache()) { + // snap + if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) && + pool.info.is_pool_snaps_mode()) { + // use pool's snapc + ctx->snapc = pool.snapc; + } else { + // client specified snapc + ctx->snapc.seq = m->get_snap_seq(); + ctx->snapc.snaps = m->get_snaps(); + filter_snapc(ctx->snapc.snaps); + } + if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) && + ctx->snapc.seq < obc->ssc->snapset.seq) { + dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq + << " < snapset seq " << obc->ssc->snapset.seq + << " on " << obc->obs.oi.soid << dendl; + reply_ctx(ctx, -EOLDSNAPC); + return; + } + + // version + ctx->at_version = get_next_version(); + ctx->mtime = m->get_mtime(); + + dout(10) << __func__ << " " << soid << " " << *ctx->ops + << " ov " << obc->obs.oi.version << " av " << ctx->at_version + << " snapc " << ctx->snapc + << " snapset " << obc->ssc->snapset + << dendl; + } else { + dout(10) << __func__ << " " << soid << " " << *ctx->ops + << " ov " << obc->obs.oi.version + << dendl; + } + + if (!ctx->user_at_version) + ctx->user_at_version = obc->obs.oi.user_version; + dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl; + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = ctx->op->get_reqid(); +#endif + tracepoint(osd, prepare_tx_enter, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } +#ifdef HAVE_JAEGER + if (ctx->op->osd_parent_span) { + auto execute_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span); + } +#endif + + int result = prepare_transaction(ctx); + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = ctx->op->get_reqid(); +#endif + tracepoint(osd, prepare_tx_exit, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + bool pending_async_reads = !ctx->pending_async_reads.empty(); + if (result == -EINPROGRESS || pending_async_reads) { + // come back later. + if (pending_async_reads) { + ceph_assert(pool.info.is_erasure()); + in_progress_async_reads.push_back(make_pair(op, ctx)); + ctx->start_async_reads(this); + } + return; + } + + if (result == -EAGAIN) { + // clean up after the ctx + close_op_ctx(ctx); + return; + } + + bool ignore_out_data = false; + if (!ctx->op_t->empty() && + op->may_write() && + result >= 0) { + // successful update + if (ctx->op->allows_returnvec()) { + // enforce reasonable bound on the return buffer sizes + for (auto& i : *ctx->ops) { + if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) { + dout(10) << __func__ << " op " << i << " outdata overflow" << dendl; + result = -EOVERFLOW; // overall result is overflow + i.rval = -EOVERFLOW; + i.outdata.clear(); + } + } + } else { + // legacy behavior -- zero result and return data etc. + ignore_out_data = true; + result = 0; + } + } + + // prepare the reply + ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0, + ignore_out_data); + dout(20) << __func__ << " alloc reply " << ctx->reply + << " result " << result << dendl; + + // read or error? + if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) { + // finish side-effects + if (result >= 0) + do_osd_op_effects(ctx, m->get_connection()); + + complete_read_ctx(result, ctx); + return; + } + + ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version); + + ceph_assert(op->may_write() || op->may_cache()); + + // trim log? + recovery_state.update_trim_to(); + + // verify that we are doing this in order? 
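+  // (osd_debug_op_order is a config-gated debug check that a client's tids never
+  // go backwards; an out-of-order tid aborts the OSD.)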
+  if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
+      !pool.info.is_tier() && !pool.info.has_tiers()) {
+    map<client_t, ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
+    ceph_tid_t t = m->get_tid();
+    client_t n = m->get_source().num();
+    map<client_t, ceph_tid_t>::iterator p = cm.find(n);
+    if (p == cm.end()) {
+      dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
+      cm[n] = t;
+    } else {
+      dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
+      if (p->second > t) {
+        derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
+        ceph_abort_msg("out of order op");
+      }
+      p->second = t;
+    }
+  }
+
+  if (ctx->update_log_only) {
+    if (result >= 0)
+      do_osd_op_effects(ctx, m->get_connection());
+
+    dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
+    // save just what we need from ctx
+    MOSDOpReply *reply = ctx->reply;
+    ctx->reply = nullptr;
+    reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
+
+    if (result == -ENOENT) {
+      reply->set_enoent_reply_versions(info.last_update,
+                                       info.last_user_version);
+    }
+    reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+    // append to pg log for dup detection - don't save buffers for now
+    record_write_error(op, soid, reply, result,
+                       ctx->op->allows_returnvec() ? ctx : nullptr);
+    close_op_ctx(ctx);
+    return;
+  }
+
+  // no need to capture PG ref, repop cancel will handle that
+  // Can capture the ctx by pointer, it's owned by the repop
+  ctx->register_on_commit(
+    [m, ctx, this](){
+      if (ctx->op)
+        log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
+
+      if (m && !ctx->sent_reply) {
+        MOSDOpReply *reply = ctx->reply;
+        ctx->reply = nullptr;
+        reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+        dout(10) << " sending reply on " << *m << " " << reply << dendl;
+        osd->send_message_osd_client(reply, m->get_connection());
+        ctx->sent_reply = true;
+        ctx->op->mark_commit_sent();
+      }
+    });
+  ctx->register_on_success(
+    [ctx, this]() {
+      do_osd_op_effects(
+        ctx,
+        ctx->op ?
ctx->op->get_req()->get_connection() : + ConnectionRef()); + }); + ctx->register_on_finish( + [ctx]() { + delete ctx; + }); + + // issue replica writes + ceph_tid_t rep_tid = osd->get_tid(); + + RepGather *repop = new_repop(ctx, obc, rep_tid); + + issue_repop(repop, ctx); + eval_repop(repop); + repop->put(); +} + +void PrimaryLogPG::close_op_ctx(OpContext *ctx) { + release_object_locks(ctx->lock_manager); + + ctx->op_t.reset(); + + for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end(); + ctx->on_finish.erase(p++)) { + (*p)(); + } + delete ctx; +} + +void PrimaryLogPG::reply_ctx(OpContext *ctx, int r) +{ + if (ctx->op) + osd->reply_op_error(ctx->op, r); + close_op_ctx(ctx); +} + +void PrimaryLogPG::log_op_stats(const OpRequest& op, + const uint64_t inb, + const uint64_t outb) +{ + auto m = op.get_req(); + const utime_t now = ceph_clock_now(); + + const utime_t latency = now - m->get_recv_stamp(); + const utime_t process_latency = now - op.get_dequeued_time(); + + osd->logger->inc(l_osd_op); + + osd->logger->inc(l_osd_op_outb, outb); + osd->logger->inc(l_osd_op_inb, inb); + osd->logger->tinc(l_osd_op_lat, latency); + osd->logger->tinc(l_osd_op_process_lat, process_latency); + + if (op.may_read() && op.may_write()) { + osd->logger->inc(l_osd_op_rw); + osd->logger->inc(l_osd_op_rw_inb, inb); + osd->logger->inc(l_osd_op_rw_outb, outb); + osd->logger->tinc(l_osd_op_rw_lat, latency); + osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb); + osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb); + osd->logger->tinc(l_osd_op_rw_process_lat, process_latency); + } else if (op.may_read()) { + osd->logger->inc(l_osd_op_r); + osd->logger->inc(l_osd_op_r_outb, outb); + osd->logger->tinc(l_osd_op_r_lat, latency); + osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb); + osd->logger->tinc(l_osd_op_r_process_lat, process_latency); + } else if (op.may_write() || op.may_cache()) { + osd->logger->inc(l_osd_op_w); + osd->logger->inc(l_osd_op_w_inb, inb); + osd->logger->tinc(l_osd_op_w_lat, latency); + osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb); + osd->logger->tinc(l_osd_op_w_process_lat, process_latency); + } else { + ceph_abort(); + } + + dout(15) << "log_op_stats " << *m + << " inb " << inb + << " outb " << outb + << " lat " << latency << dendl; + + if (m_dynamic_perf_stats.is_enabled()) { + m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency); + } +} + +void PrimaryLogPG::set_dynamic_perf_stats_queries( + const std::list &queries) +{ + m_dynamic_perf_stats.set_queries(queries); +} + +void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats) +{ + std::swap(m_dynamic_perf_stats, *stats); +} + +void PrimaryLogPG::do_scan( + OpRequestRef op, + ThreadPool::TPHandle &handle) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_SCAN); + dout(10) << "do_scan " << *m << dendl; + + op->mark_started(); + + switch (m->op) { + case MOSDPGScan::OP_SCAN_GET_DIGEST: + { + auto dpp = get_dpp(); + if (osd->check_backfill_full(dpp)) { + dout(1) << __func__ << ": Canceling backfill: Full." 
<< dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::BackfillTooFull()))); + return; + } + + BackfillInterval bi; + bi.begin = m->begin; + // No need to flush, there won't be any in progress writes occuring + // past m->begin + scan_range( + cct->_conf->osd_backfill_scan_min, + cct->_conf->osd_backfill_scan_max, + &bi, + handle); + MOSDPGScan *reply = new MOSDPGScan( + MOSDPGScan::OP_SCAN_DIGEST, + pg_whoami, + get_osdmap_epoch(), m->query_epoch, + spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end); + encode(bi.objects, reply->get_data()); + osd->send_message_osd_cluster(reply, m->get_connection()); + } + break; + + case MOSDPGScan::OP_SCAN_DIGEST: + { + pg_shard_t from = m->from; + + // Check that from is in backfill_targets vector + ceph_assert(is_backfill_target(from)); + + BackfillInterval& bi = peer_backfill_info[from]; + bi.begin = m->begin; + bi.end = m->end; + auto p = m->get_data().cbegin(); + + // take care to preserve ordering! + bi.clear_objects(); + decode_noclear(bi.objects, p); + dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end + << " bi.objects.size()=" << bi.objects.size() << dendl; + + if (waiting_on_backfill.erase(from)) { + if (waiting_on_backfill.empty()) { + ceph_assert( + peer_backfill_info.size() == + get_backfill_targets().size()); + finish_recovery_op(hobject_t::get_max()); + } + } else { + // we canceled backfill for a while due to a too full, and this + // is an extra response from a non-too-full peer + dout(20) << __func__ << " canceled backfill (too full?)" << dendl; + } + } + break; + } +} + +void PrimaryLogPG::do_backfill(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL); + dout(10) << "do_backfill " << *m << dendl; + + op->mark_started(); + + switch (m->op) { + case MOSDPGBackfill::OP_BACKFILL_FINISH: + { + ceph_assert(cct->_conf->osd_kill_backfill_at != 1); + + MOSDPGBackfill *reply = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + get_osdmap_epoch(), + m->query_epoch, + spg_t(info.pgid.pgid, get_primary().shard)); + reply->set_priority(get_recovery_op_priority()); + osd->send_message_osd_cluster(reply, m->get_connection()); + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + RecoveryDone()))); + } + // fall-thru + + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + { + ceph_assert(cct->_conf->osd_kill_backfill_at != 2); + + ObjectStore::Transaction t; + recovery_state.update_backfill_progress( + m->last_backfill, + m->stats, + m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS, + t); + + int tr = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(tr == 0); + } + break; + + case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK: + { + ceph_assert(is_primary()); + ceph_assert(cct->_conf->osd_kill_backfill_at != 3); + finish_recovery_op(hobject_t::get_max()); + } + break; + } +} + +void PrimaryLogPG::do_backfill_remove(OpRequestRef op) +{ + const MOSDPGBackfillRemove *m = static_cast( + op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE); + dout(7) << __func__ << " " << m->ls << dendl; + + op->mark_started(); + + ObjectStore::Transaction t; + for (auto& p : m->ls) { + if (is_remote_backfilling()) { + struct stat st; + int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN, + pg_whoami.shard) , &st); + if (r == 0) { + sub_local_num_bytes(st.st_size); + int64_t usersize; + if 
(pool.info.is_erasure()) { + bufferlist bv; + int r = osd->store->getattr( + ch, + ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + usersize = oi.size * pgbackend->get_ec_data_chunk_count(); + } else { + dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) + << " can't get object info" << dendl; + usersize = 0; + } + } else { + usersize = st.st_size; + } + sub_num_bytes(usersize); + dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) + << " sub actual data by " << st.st_size + << " sub num_bytes by " << usersize + << dendl; + } + } + remove_snap_mapped_object(t, p.first); + } + int r = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(r == 0); +} + +int PrimaryLogPG::trim_object( + bool first, const hobject_t &coid, snapid_t snap_to_trim, + PrimaryLogPG::OpContextUPtr *ctxp) +{ + *ctxp = NULL; + + // load clone info + bufferlist bl; + ObjectContextRef obc = get_object_context(coid, false, NULL); + if (!obc || !obc->ssc || !obc->ssc->exists) { + osd->clog->error() << __func__ << ": Can not trim " << coid + << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)"); + return -ENOENT; + } + + hobject_t head_oid = coid.get_head(); + ObjectContextRef head_obc = get_object_context(head_oid, false); + if (!head_obc) { + osd->clog->error() << __func__ << ": Can not trim " << coid + << " repair needed, no snapset obc for " << head_oid; + return -ENOENT; + } + + SnapSet& snapset = obc->ssc->snapset; + + object_info_t &coi = obc->obs.oi; + auto citer = snapset.clone_snaps.find(coid.snap); + if (citer == snapset.clone_snaps.end()) { + osd->clog->error() << "No clone_snaps in snapset " << snapset + << " for object " << coid << "\n"; + return -ENOENT; + } + set old_snaps(citer->second.begin(), citer->second.end()); + if (old_snaps.empty()) { + osd->clog->error() << "No object info snaps for object " << coid; + return -ENOENT; + } + + dout(10) << coid << " old_snaps " << old_snaps + << " old snapset " << snapset << dendl; + if (snapset.seq == 0) { + osd->clog->error() << "No snapset.seq for object " << coid; + return -ENOENT; + } + + set new_snaps; + const OSDMapRef& osdmap = get_osdmap(); + for (set::iterator i = old_snaps.begin(); + i != old_snaps.end(); + ++i) { + if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) && + *i != snap_to_trim) { + new_snaps.insert(*i); + } + } + + vector::iterator p = snapset.clones.end(); + + if (new_snaps.empty()) { + p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap); + if (p == snapset.clones.end()) { + osd->clog->error() << "Snap " << coid.snap << " not in clones"; + return -ENOENT; + } + } + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->head_obc = head_obc; + + if (!ctx->lock_manager.get_snaptrimmer_write( + coid, + obc, + first)) { + close_op_ctx(ctx.release()); + dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl; + return -ENOLCK; + } + + if (!ctx->lock_manager.get_snaptrimmer_write( + head_oid, + head_obc, + first)) { + close_op_ctx(ctx.release()); + dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl; + return -ENOLCK; + } + + ctx->at_version = get_next_version(); + + PGTransaction *t = ctx->op_t.get(); + + if (new_snaps.empty()) { + // remove clone + dout(10) << coid << " snaps " << old_snaps << " -> " + << new_snaps << " ... 
deleting" << dendl; + + // ...from snapset + ceph_assert(p != snapset.clones.end()); + + snapid_t last = coid.snap; + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last); + + if (p != snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + vector::iterator n = p - 1; + hobject_t prev_coid = coid; + prev_coid.snap = *n; + bool adjust_prev_bytes = is_present_clone(prev_coid); + + if (adjust_prev_bytes) + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n); + + snapset.clone_overlap[*n].intersection_of( + snapset.clone_overlap[*p]); + + if (adjust_prev_bytes) + ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n); + } + ctx->delta_stats.num_objects--; + if (coi.is_dirty()) + ctx->delta_stats.num_objects_dirty--; + if (coi.is_omap()) + ctx->delta_stats.num_objects_omap--; + if (coi.is_whiteout()) { + dout(20) << __func__ << " trimming whiteout on " << coid << dendl; + ctx->delta_stats.num_whiteouts--; + } + ctx->delta_stats.num_object_clones--; + if (coi.is_cache_pinned()) + ctx->delta_stats.num_objects_pinned--; + if (coi.has_manifest()) { + dec_all_refcount_manifest(coi, ctx.get()); + ctx->delta_stats.num_objects_manifest--; + } + obc->obs.exists = false; + + snapset.clones.erase(p); + snapset.clone_overlap.erase(last); + snapset.clone_size.erase(last); + snapset.clone_snaps.erase(last); + + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::DELETE, + coid, + ctx->at_version, + ctx->obs->oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + t->remove(coid); + t->update_snaps( + coid, + old_snaps, + new_snaps); + + coi = object_info_t(coid); + + ctx->at_version.version++; + } else { + // save adjusted snaps for this object + dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl; + snapset.clone_snaps[coid.snap] = + vector(new_snaps.rbegin(), new_snaps.rend()); + // we still do a 'modify' event on this object just to trigger a + // snapmapper.update ... :( + + coi.prior_version = coi.version; + coi.version = ctx->at_version; + bl.clear(); + encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + t->setattr(coid, OI_ATTR, bl); + + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + coid, + coi.version, + coi.prior_version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + ctx->at_version.version++; + + t->update_snaps( + coid, + old_snaps, + new_snaps); + } + + // save head snapset + dout(10) << coid << " new snapset " << snapset << " on " + << head_obc->obs.oi << dendl; + if (snapset.clones.empty() && + (head_obc->obs.oi.is_whiteout() && + !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) && + !head_obc->obs.oi.is_cache_pinned())) { + // NOTE: this arguably constitutes minor interference with the + // tiering agent if this is a cache tier since a snap trim event + // is effectively evicting a whiteout we might otherwise want to + // keep around. 
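+      // No clones remain and the head is only a superfluous whiteout, so the
+      // head object is deleted as part of this trim.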
+ dout(10) << coid << " removing " << head_oid << dendl; + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::DELETE, + head_oid, + ctx->at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + dout(10) << "removing snap head" << dendl; + object_info_t& oi = head_obc->obs.oi; + ctx->delta_stats.num_objects--; + if (oi.is_dirty()) { + ctx->delta_stats.num_objects_dirty--; + } + if (oi.is_omap()) + ctx->delta_stats.num_objects_omap--; + if (oi.is_whiteout()) { + dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl; + ctx->delta_stats.num_whiteouts--; + } + if (oi.is_cache_pinned()) { + ctx->delta_stats.num_objects_pinned--; + } + if (oi.has_manifest()) { + ctx->delta_stats.num_objects_manifest--; + dec_all_refcount_manifest(oi, ctx.get()); + } + head_obc->obs.exists = false; + head_obc->obs.oi = object_info_t(head_oid); + t->remove(head_oid); + } else { + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + // filter SnapSet::snaps for the benefit of pre-octopus + // peers. This is perhaps overly conservative in that I'm not + // certain they need this, but let's be conservative here. + dout(10) << coid << " filtering snapset on " << head_oid << dendl; + snapset.filter(pool.info); + } else { + snapset.snaps.clear(); + } + dout(10) << coid << " writing updated snapset on " << head_oid + << ", snapset is " << snapset << dendl; + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + head_oid, + ctx->at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + + head_obc->obs.oi.prior_version = head_obc->obs.oi.version; + head_obc->obs.oi.version = ctx->at_version; + + map attrs; + bl.clear(); + encode(snapset, bl); + attrs[SS_ATTR] = std::move(bl); + + bl.clear(); + encode(head_obc->obs.oi, bl, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR] = std::move(bl); + t->setattrs(head_oid, attrs); + } + + *ctxp = std::move(ctx); + return 0; +} + +void PrimaryLogPG::kick_snap_trim() +{ + ceph_assert(is_active()); + ceph_assert(is_primary()); + if (is_clean() && + !state_test(PG_STATE_PREMERGE) && + !snap_trimq.empty()) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) { + dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl; + } else { + dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl; + snap_trimmer_machine.process_event(KickTrim()); + } + } +} + +void PrimaryLogPG::snap_trimmer_scrub_complete() +{ + if (is_primary() && is_active() && is_clean() && !snap_trimq.empty()) { + dout(10) << "scrub finished - requeuing snap_trimmer" << dendl; + snap_trimmer_machine.process_event(ScrubComplete()); + } +} + +void PrimaryLogPG::snap_trimmer(epoch_t queued) +{ + if (recovery_state.is_deleting() || pg_has_reset_since(queued)) { + return; + } + + ceph_assert(is_primary()); + + dout(10) << "snap_trimmer posting" << dendl; + snap_trimmer_machine.process_event(DoSnapWork()); + dout(10) << "snap_trimmer complete" << dendl; + return; +} + +int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr) +{ + __u64 v2; + + string v2s(xattr.c_str(), xattr.length()); + if (v2s.length()) + v2 = strtoull(v2s.c_str(), NULL, 10); + else + v2 = 0; + + dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl; + + switch (op) { + case CEPH_OSD_CMPXATTR_OP_EQ: + return (v1 == v2); + case CEPH_OSD_CMPXATTR_OP_NE: + return (v1 != v2); + case CEPH_OSD_CMPXATTR_OP_GT: + return (v1 > v2); + case CEPH_OSD_CMPXATTR_OP_GTE: + 
return (v1 >= v2); + case CEPH_OSD_CMPXATTR_OP_LT: + return (v1 < v2); + case CEPH_OSD_CMPXATTR_OP_LTE: + return (v1 <= v2); + default: + return -EINVAL; + } +} + +int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr) +{ + string v2s(xattr.c_str(), xattr.length()); + + dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl; + + switch (op) { + case CEPH_OSD_CMPXATTR_OP_EQ: + return (v1s.compare(v2s) == 0); + case CEPH_OSD_CMPXATTR_OP_NE: + return (v1s.compare(v2s) != 0); + case CEPH_OSD_CMPXATTR_OP_GT: + return (v1s.compare(v2s) > 0); + case CEPH_OSD_CMPXATTR_OP_GTE: + return (v1s.compare(v2s) >= 0); + case CEPH_OSD_CMPXATTR_OP_LT: + return (v1s.compare(v2s) < 0); + case CEPH_OSD_CMPXATTR_OP_LTE: + return (v1s.compare(v2s) <= 0); + default: + return -EINVAL; + } +} + +int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op) +{ + ceph_osd_op& op = osd_op.op; + vector write_ops(1); + OSDOp& write_op = write_ops[0]; + uint64_t write_length = op.writesame.length; + int result = 0; + + if (!write_length) + return 0; + + if (!op.writesame.data_length || write_length % op.writesame.data_length) + return -EINVAL; + + if (op.writesame.data_length != osd_op.indata.length()) { + derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl; + return -EINVAL; + } + + while (write_length) { + write_op.indata.append(osd_op.indata); + write_length -= op.writesame.data_length; + } + + write_op.op.op = CEPH_OSD_OP_WRITE; + write_op.op.extent.offset = op.writesame.offset; + write_op.op.extent.length = op.writesame.length; + result = do_osd_ops(ctx, write_ops); + if (result < 0) + derr << "do_writesame do_osd_ops failed " << result << dendl; + + return result; +} + +// ======================================================================== +// low level osd ops + +int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags) +{ + dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl; + bufferlist header, vals; + int r = _get_tmap(ctx, &header, &vals); + if (r < 0) { + if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK)) + r = 0; + return r; + } + + vector ops(3); + + ops[0].op.op = CEPH_OSD_OP_TRUNCATE; + ops[0].op.extent.offset = 0; + ops[0].op.extent.length = 0; + + ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER; + ops[1].indata = std::move(header); + + ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS; + ops[2].indata = std::move(vals); + + return do_osd_ops(ctx, ops); +} + +int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp, + OSDOp& osd_op, bufferlist& bl) +{ + // decode + bufferlist header; + map m; + if (bl.length()) { + auto p = bl.cbegin(); + decode(header, p); + decode(m, p); + ceph_assert(p.end()); + } + + // do the update(s) + while (!bp.end()) { + __u8 op; + string key; + decode(op, bp); + + switch (op) { + case CEPH_OSD_TMAP_SET: // insert key + { + decode(key, bp); + bufferlist data; + decode(data, bp); + m[key] = data; + } + break; + case CEPH_OSD_TMAP_RM: // remove key + decode(key, bp); + if (!m.count(key)) { + return -ENOENT; + } + m.erase(key); + break; + case CEPH_OSD_TMAP_RMSLOPPY: // remove key + decode(key, bp); + m.erase(key); + break; + case CEPH_OSD_TMAP_HDR: // update header + { + decode(header, bp); + } + break; + default: + return -EINVAL; + } + } + + // reencode + bufferlist obl; + encode(header, obl); + encode(m, obl); + + // write it out + vector nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + 
newop.op.extent.offset = 0; + newop.op.extent.length = obl.length(); + newop.indata = obl; + do_osd_ops(ctx, nops); + return 0; +} + +int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op) +{ + bufferlist::const_iterator orig_bp = bp; + int result = 0; + if (bp.end()) { + dout(10) << "tmapup is a no-op" << dendl; + } else { + // read the whole object + vector nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_READ; + newop.op.extent.offset = 0; + newop.op.extent.length = 0; + result = do_osd_ops(ctx, nops); + + dout(10) << "tmapup read " << newop.outdata.length() << dendl; + + dout(30) << " starting is \n"; + newop.outdata.hexdump(*_dout); + *_dout << dendl; + + auto ip = newop.outdata.cbegin(); + bufferlist obl; + + dout(30) << "the update command is: \n"; + osd_op.indata.hexdump(*_dout); + *_dout << dendl; + + // header + bufferlist header; + __u32 nkeys = 0; + if (newop.outdata.length()) { + decode(header, ip); + decode(nkeys, ip); + } + dout(10) << "tmapup header " << header.length() << dendl; + + if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) { + ++bp; + decode(header, bp); + dout(10) << "tmapup new header " << header.length() << dendl; + } + + encode(header, obl); + + dout(20) << "tmapup initial nkeys " << nkeys << dendl; + + // update keys + bufferlist newkeydata; + string nextkey, last_in_key; + bufferlist nextval; + bool have_next = false; + if (!ip.end()) { + have_next = true; + decode(nextkey, ip); + decode(nextval, ip); + } + while (!bp.end() && !result) { + __u8 op; + string key; + try { + decode(op, bp); + decode(key, bp); + } + catch (ceph::buffer::error& e) { + return -EINVAL; + } + if (key < last_in_key) { + dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key + << "', falling back to an inefficient (unsorted) update" << dendl; + bp = orig_bp; + return do_tmapup_slow(ctx, bp, osd_op, newop.outdata); + } + last_in_key = key; + + dout(10) << "tmapup op " << (int)op << " key " << key << dendl; + + // skip existing intervening keys + bool key_exists = false; + while (have_next && !key_exists) { + dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl; + if (nextkey > key) + break; + if (nextkey < key) { + // copy untouched. + encode(nextkey, newkeydata); + encode(nextval, newkeydata); + dout(20) << " keep " << nextkey << " " << nextval.length() << dendl; + } else { + // don't copy; discard old value. and stop. + dout(20) << " drop " << nextkey << " " << nextval.length() << dendl; + key_exists = true; + nkeys--; + } + if (!ip.end()) { + decode(nextkey, ip); + decode(nextval, ip); + } else { + have_next = false; + } + } + + if (op == CEPH_OSD_TMAP_SET) { + bufferlist val; + try { + decode(val, bp); + } + catch (ceph::buffer::error& e) { + return -EINVAL; + } + encode(key, newkeydata); + encode(val, newkeydata); + dout(20) << " set " << key << " " << val.length() << dendl; + nkeys++; + } else if (op == CEPH_OSD_TMAP_CREATE) { + if (key_exists) { + return -EEXIST; + } + bufferlist val; + try { + decode(val, bp); + } + catch (ceph::buffer::error& e) { + return -EINVAL; + } + encode(key, newkeydata); + encode(val, newkeydata); + dout(20) << " create " << key << " " << val.length() << dendl; + nkeys++; + } else if (op == CEPH_OSD_TMAP_RM) { + // do nothing. 
+ if (!key_exists) { + return -ENOENT; + } + } else if (op == CEPH_OSD_TMAP_RMSLOPPY) { + // do nothing + } else { + dout(10) << " invalid tmap op " << (int)op << dendl; + return -EINVAL; + } + } + + // copy remaining + if (have_next) { + encode(nextkey, newkeydata); + encode(nextval, newkeydata); + dout(20) << " keep " << nextkey << " " << nextval.length() << dendl; + } + if (!ip.end()) { + bufferlist rest; + rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off()); + dout(20) << " keep trailing " << rest.length() + << " at " << newkeydata.length() << dendl; + newkeydata.claim_append(rest); + } + + // encode final key count + key data + dout(20) << "tmapup final nkeys " << nkeys << dendl; + encode(nkeys, obl); + obl.claim_append(newkeydata); + + if (0) { + dout(30) << " final is \n"; + obl.hexdump(*_dout); + *_dout << dendl; + + // sanity check + auto tp = obl.cbegin(); + bufferlist h; + decode(h, tp); + map d; + decode(d, tp); + ceph_assert(tp.end()); + dout(0) << " **** debug sanity check, looks ok ****" << dendl; + } + + // write it out + if (!result) { + dout(20) << "tmapput write " << obl.length() << dendl; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = obl.length(); + newop.indata = obl; + do_osd_ops(ctx, nops); + } + } + return result; +} + +static int check_offset_and_length(uint64_t offset, uint64_t length, + uint64_t max, DoutPrefixProvider *dpp) +{ + if (offset >= max || + length > max || + offset + length > max) { + ldpp_dout(dpp, 10) << __func__ << " " + << "osd_max_object_size: " << max + << "; Hard limit of object size is 4GB." << dendl; + return -EFBIG; + } + + return 0; +} + +struct FillInVerifyExtent : public Context { + ceph_le64 *r; + int32_t *rval; + bufferlist *outdatap; + std::optional maybe_crc; + uint64_t size; + OSDService *osd; + hobject_t soid; + uint32_t flags; + FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp, + std::optional mc, uint64_t size, + OSDService *osd, hobject_t soid, uint32_t flags) : + r(r), rval(rv), outdatap(blp), maybe_crc(mc), + size(size), osd(osd), soid(soid), flags(flags) {} + void finish(int len) override { + if (len < 0) { + *rval = len; + return; + } + *r = len; + *rval = 0; + + // whole object? can we verify the checksum? 
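+    // maybe_crc carries the expected full-object digest; a mismatch fails the
+    // read with -EIO unless CEPH_OSD_OP_FLAG_FAILOK was set by the client.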
+ if (maybe_crc && *r == size) { + uint32_t crc = outdatap->crc32c(-1); + if (maybe_crc != crc) { + osd->clog->error() << std::hex << " full-object read crc 0x" << crc + << " != expected 0x" << *maybe_crc + << std::dec << " on " << soid; + if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) { + *rval = -EIO; + *r = 0; + } + } + } + } +}; + +struct ToSparseReadResult : public Context { + int* result; + bufferlist* data_bl; + uint64_t data_offset; + ceph_le64* len; + ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset, + ceph_le64* len) + : result(result), data_bl(bl), data_offset(offset),len(len) {} + void finish(int r) override { + if (r < 0) { + *result = r; + return; + } + *result = 0; + *len = r; + bufferlist outdata; + map extents = {{data_offset, r}}; + encode(extents, outdata); + encode_destructively(*data_bl, outdata); + data_bl->swap(outdata); + } +}; + +template +static string list_keys(const map& m) { + string s; + for (typename map::const_iterator itr = m.begin(); itr != m.end(); ++itr) { + if (!s.empty()) { + s.push_back(','); + } + s.append(itr->first); + } + return s; +} + +template +static string list_entries(const T& m) { + string s; + for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) { + if (!s.empty()) { + s.push_back(','); + } + s.append(*itr); + } + return s; +} + +void PrimaryLogPG::maybe_create_new_object( + OpContext *ctx, + bool ignore_transaction) +{ + ObjectState& obs = ctx->new_obs; + if (!obs.exists) { + ctx->delta_stats.num_objects++; + obs.exists = true; + ceph_assert(!obs.oi.is_whiteout()); + obs.oi.new_object(); + if (!ignore_transaction) + ctx->op_t->create(obs.oi.soid); + } else if (obs.oi.is_whiteout()) { + dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + --ctx->delta_stats.num_whiteouts; + } +} + +struct ReadFinisher : public PrimaryLogPG::OpFinisher { + OSDOp& osd_op; + + explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) { + } + + int execute() override { + return osd_op.rval; + } +}; + +struct C_ChecksumRead : public Context { + PrimaryLogPG *primary_log_pg; + OSDOp &osd_op; + Checksummer::CSumType csum_type; + bufferlist init_value_bl; + ceph_le64 read_length; + bufferlist read_bl; + Context *fill_extent_ctx; + + C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op, + Checksummer::CSumType csum_type, bufferlist &&init_value_bl, + std::optional maybe_crc, uint64_t size, + OSDService *osd, hobject_t soid, uint32_t flags) + : primary_log_pg(primary_log_pg), osd_op(osd_op), + csum_type(csum_type), init_value_bl(std::move(init_value_bl)), + fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval, + &read_bl, maybe_crc, size, + osd, soid, flags)) { + } + ~C_ChecksumRead() override { + delete fill_extent_ctx; + } + + void finish(int r) override { + fill_extent_ctx->complete(r); + fill_extent_ctx = nullptr; + + if (osd_op.rval >= 0) { + bufferlist::const_iterator init_value_bl_it = init_value_bl.begin(); + osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type, + &init_value_bl_it, read_bl); + } + } +}; + +int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op, + bufferlist::const_iterator *bl_it) +{ + dout(20) << __func__ << dendl; + + auto& op = osd_op.op; + if (op.checksum.chunk_size > 0) { + if (op.checksum.length == 0) { + dout(10) << __func__ << ": length required when chunk size provided" + << dendl; + return -EINVAL; + } + if (op.checksum.length % op.checksum.chunk_size != 0) { + dout(10) << __func__ << ": 
length not aligned to chunk size" << dendl; + return -EINVAL; + } + } + + auto& oi = ctx->new_obs.oi; + if (op.checksum.offset == 0 && op.checksum.length == 0) { + // zeroed offset+length implies checksum whole object + op.checksum.length = oi.size; + } else if (op.checksum.offset >= oi.size) { + // read size was trimmed to zero, do nothing + // see PrimaryLogPG::do_read + return 0; + } else if (op.extent.offset + op.extent.length > oi.size) { + op.extent.length = oi.size - op.extent.offset; + if (op.checksum.chunk_size > 0 && + op.checksum.length % op.checksum.chunk_size != 0) { + dout(10) << __func__ << ": length (trimmed to 0x" + << std::hex << op.checksum.length + << ") not aligned to chunk size 0x" + << op.checksum.chunk_size << std::dec + << dendl; + return -EINVAL; + } + } + + Checksummer::CSumType csum_type; + switch (op.checksum.type) { + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32: + csum_type = Checksummer::CSUM_XXHASH32; + break; + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64: + csum_type = Checksummer::CSUM_XXHASH64; + break; + case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C: + csum_type = Checksummer::CSUM_CRC32C; + break; + default: + dout(10) << __func__ << ": unknown crc type (" + << static_cast(op.checksum.type) << ")" << dendl; + return -EINVAL; + } + + size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type); + if (bl_it->get_remaining() < csum_init_value_size) { + dout(10) << __func__ << ": init value not provided" << dendl; + return -EINVAL; + } + + bufferlist init_value_bl; + init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(), + csum_init_value_size); + *bl_it += csum_init_value_size; + + if (pool.info.is_erasure() && op.checksum.length > 0) { + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. 
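+    // On an erasure-coded pool the read cannot be done synchronously here: it is
+    // queued on ctx->pending_async_reads and C_ChecksumRead::finish() computes the
+    // checksum once the data arrives; -EINPROGRESS makes execute_ctx() defer the
+    // reply until the async read completes.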
+ std::optional maybe_crc; + if (oi.is_data_digest() && op.checksum.offset == 0 && + op.checksum.length >= oi.size) { + maybe_crc = oi.data_digest; + } + + // async read + auto& soid = oi.soid; + auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type, + std::move(init_value_bl), maybe_crc, + oi.size, osd, soid, op.flags); + + ctx->pending_async_reads.push_back({ + {op.checksum.offset, op.checksum.length, op.flags}, + {&checksum_ctx->read_bl, checksum_ctx}}); + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + return -EINPROGRESS; + } + + // sync read + std::vector read_ops(1); + auto& read_op = read_ops[0]; + if (op.checksum.length > 0) { + read_op.op.op = CEPH_OSD_OP_READ; + read_op.op.flags = op.flags; + read_op.op.extent.offset = op.checksum.offset; + read_op.op.extent.length = op.checksum.length; + read_op.op.extent.truncate_size = 0; + read_op.op.extent.truncate_seq = 0; + + int r = do_osd_ops(ctx, read_ops); + if (r < 0) { + derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl; + return r; + } + } + + bufferlist::const_iterator init_value_bl_it = init_value_bl.begin(); + return finish_checksum(osd_op, csum_type, &init_value_bl_it, + read_op.outdata); +} + +int PrimaryLogPG::finish_checksum(OSDOp& osd_op, + Checksummer::CSumType csum_type, + bufferlist::const_iterator *init_value_bl_it, + const bufferlist &read_bl) { + dout(20) << __func__ << dendl; + + auto& op = osd_op.op; + + if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) { + derr << __func__ << ": bytes read " << read_bl.length() << " != " + << op.checksum.length << dendl; + return -EINVAL; + } + + size_t csum_chunk_size = (op.checksum.chunk_size != 0 ? + op.checksum.chunk_size : read_bl.length()); + uint32_t csum_count = (csum_chunk_size > 0 ? 
+ read_bl.length() / csum_chunk_size : 0); + + bufferlist csum; + bufferptr csum_data; + if (csum_count > 0) { + size_t csum_value_size = Checksummer::get_csum_value_size(csum_type); + csum_data = ceph::buffer::create(csum_value_size * csum_count); + csum_data.zero(); + csum.append(csum_data); + + switch (csum_type) { + case Checksummer::CSUM_XXHASH32: + { + Checksummer::xxhash32::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + case Checksummer::CSUM_XXHASH64: + { + Checksummer::xxhash64::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + case Checksummer::CSUM_CRC32C: + { + Checksummer::crc32c::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + default: + break; + } + } + + encode(csum_count, osd_op.outdata); + osd_op.outdata.claim_append(csum); + return 0; +} + +struct C_ExtentCmpRead : public Context { + PrimaryLogPG *primary_log_pg; + OSDOp &osd_op; + ceph_le64 read_length{}; + bufferlist read_bl; + Context *fill_extent_ctx; + + C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op, + std::optional maybe_crc, uint64_t size, + OSDService *osd, hobject_t soid, uint32_t flags) + : primary_log_pg(primary_log_pg), osd_op(osd_op), + fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval, + &read_bl, maybe_crc, size, + osd, soid, flags)) { + } + ~C_ExtentCmpRead() override { + delete fill_extent_ctx; + } + + void finish(int r) override { + if (r == -ENOENT) { + osd_op.rval = 0; + read_bl.clear(); + delete fill_extent_ctx; + } else { + fill_extent_ctx->complete(r); + } + fill_extent_ctx = nullptr; + + if (osd_op.rval >= 0) { + osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl); + } + } +}; + +int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op) +{ + dout(20) << __func__ << dendl; + ceph_osd_op& op = osd_op.op; + + auto& oi = ctx->new_obs.oi; + uint64_t size = oi.size; + if ((oi.truncate_seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size)) { + size = op.extent.truncate_size; + } + + if (op.extent.offset >= size) { + op.extent.length = 0; + } else if (op.extent.offset + op.extent.length > size) { + op.extent.length = size - op.extent.offset; + } + + if (op.extent.length == 0) { + dout(20) << __func__ << " zero length extent" << dendl; + return finish_extent_cmp(osd_op, bufferlist{}); + } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) { + dout(20) << __func__ << " object DNE" << dendl; + return finish_extent_cmp(osd_op, {}); + } else if (pool.info.is_erasure()) { + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. 
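+  // CMPEXT on erasure-coded pools follows the same async-read pattern below; the
+  // actual byte-wise comparison happens in finish_extent_cmp(), which returns
+  // -MAX_ERRNO minus the offset of the first mismatching byte.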
+    std::optional<uint32_t> maybe_crc;
+    if (oi.is_data_digest() && op.checksum.offset == 0 &&
+        op.checksum.length >= oi.size) {
+      maybe_crc = oi.data_digest;
+    }
+
+    // async read
+    auto& soid = oi.soid;
+    auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
+                                              osd, soid, op.flags);
+    ctx->pending_async_reads.push_back({
+      {op.extent.offset, op.extent.length, op.flags},
+      {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
+
+    dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+
+    ctx->op_finishers[ctx->current_osd_subop_num].reset(
+      new ReadFinisher(osd_op));
+    return -EINPROGRESS;
+  }
+
+  // sync read
+  vector<OSDOp> read_ops(1);
+  OSDOp& read_op = read_ops[0];
+
+  read_op.op.op = CEPH_OSD_OP_SYNC_READ;
+  read_op.op.extent.offset = op.extent.offset;
+  read_op.op.extent.length = op.extent.length;
+  read_op.op.extent.truncate_seq = op.extent.truncate_seq;
+  read_op.op.extent.truncate_size = op.extent.truncate_size;
+
+  int result = do_osd_ops(ctx, read_ops);
+  if (result < 0) {
+    derr << __func__ << " failed " << result << dendl;
+    return result;
+  }
+  return finish_extent_cmp(osd_op, read_op.outdata);
+}
+
+int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
+{
+  for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
+    char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
+    if (osd_op.indata[idx] != read_byte) {
+      return (-MAX_ERRNO - idx);
+    }
+  }
+
+  return 0;
+}
+
+int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
+  dout(20) << __func__ << dendl;
+  auto& op = osd_op.op;
+  auto& oi = ctx->new_obs.oi;
+  auto& soid = oi.soid;
+  __u32 seq = oi.truncate_seq;
+  uint64_t size = oi.size;
+  bool trimmed_read = false;
+
+  dout(30) << __func__ << " oi.size: " << oi.size << dendl;
+  dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
+  dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
+  dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
+
+  // are we beyond truncate_size?
+  if ( (seq < op.extent.truncate_seq) &&
+       (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
+       (size > op.extent.truncate_size) )
+    size = op.extent.truncate_size;
+
+  if (op.extent.length == 0) // length of zero means read the whole object
+    op.extent.length = size;
+
+  if (op.extent.offset >= size) {
+    op.extent.length = 0;
+    trimmed_read = true;
+  } else if (op.extent.offset + op.extent.length > size) {
+    op.extent.length = size - op.extent.offset;
+    trimmed_read = true;
+  }
+
+  dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
+
+  // read into a buffer
+  int result = 0;
+  if (trimmed_read && op.extent.length == 0) {
+    // read size was trimmed to zero and it is expected to do nothing
+    // a read operation of 0 bytes does *not* do nothing, this is why
+    // the trimmed_read boolean is needed
+  } else if (pool.info.is_erasure()) {
+    // The initialisation below is required to silence a false positive
+    // -Wmaybe-uninitialized warning
+    std::optional<uint32_t> maybe_crc;
+    // If there is a data digest and it is possible we are reading
+    // entire object, pass the digest.  FillInVerifyExtent will
+    // check the oi.size again.
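+    // (maybe_crc stays unset for partial reads, so no digest check is attempted
+    // in that case.)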
+    if (oi.is_data_digest() && op.extent.offset == 0 &&
+        op.extent.length >= oi.size)
+      maybe_crc = oi.data_digest;
+    ctx->pending_async_reads.push_back(
+      make_pair(
+        boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
+        make_pair(&osd_op.outdata,
+                  new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
+                                         &osd_op.outdata, maybe_crc, oi.size,
+                                         osd, soid, op.flags))));
+    dout(10) << " async_read noted for " << soid << dendl;
+
+    ctx->op_finishers[ctx->current_osd_subop_num].reset(
+      new ReadFinisher(osd_op));
+  } else {
+    int r = pgbackend->objects_read_sync(
+      soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
+    // whole object? can we verify the checksum?
+    if (r >= 0 && op.extent.offset == 0 &&
+        (uint64_t)r == oi.size && oi.is_data_digest()) {
+      uint32_t crc = osd_op.outdata.crc32c(-1);
+      if (oi.data_digest != crc) {
+        osd->clog->error() << info.pgid << std::hex
+                           << " full-object read crc 0x" << crc
+                           << " != expected 0x" << oi.data_digest
+                           << std::dec << " on " << soid;
+        r = -EIO; // try repair later
+      }
+    }
+    if (r == -EIO) {
+      r = rep_repair_primary_object(soid, ctx);
+    }
+    if (r >= 0)
+      op.extent.length = r;
+    else if (r == -EAGAIN) {
+      result = -EAGAIN;
+    } else {
+      result = r;
+      op.extent.length = 0;
+    }
+    dout(10) << " read got " << r << " / " << op.extent.length
+             << " bytes from obj " << soid << dendl;
+  }
+  if (result >= 0) {
+    ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
+    ctx->delta_stats.num_rd++;
+  }
+  return result;
+}
+
+int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
+  dout(20) << __func__ << dendl;
+  auto& op = osd_op.op;
+  auto& oi = ctx->new_obs.oi;
+  auto& soid = oi.soid;
+
+  if (op.extent.truncate_seq) {
+    dout(0) << "sparse_read does not support truncation sequence " << dendl;
+    return -EINVAL;
+  }
+
+  ++ctx->num_read;
+  if (pool.info.is_erasure()) {
+    // translate sparse read to a normal one if not supported
+    uint64_t offset = op.extent.offset;
+    uint64_t length = op.extent.length;
+    if (offset > oi.size) {
+      length = 0;
+    } else if (offset + length > oi.size) {
+      length = oi.size - offset;
+    }
+
+    if (length > 0) {
+      ctx->pending_async_reads.push_back(
+        make_pair(
+          boost::make_tuple(offset, length, op.flags),
+          make_pair(
+            &osd_op.outdata,
+            new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
+                                   &op.extent.length))));
+      dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
+
+      ctx->op_finishers[ctx->current_osd_subop_num].reset(
+        new ReadFinisher(osd_op));
+    } else {
+      dout(10) << " sparse read ended up empty for " << soid << dendl;
+      map<uint64_t, uint64_t> extents;
+      encode(extents, osd_op.outdata);
+    }
+  } else {
+    // read into a buffer
+    map<uint64_t, uint64_t> m;
+    int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
+                                              info.pgid.shard),
+                               op.extent.offset, op.extent.length, m);
+    if (r < 0) {
+      return r;
+    }
+
+    bufferlist data_bl;
+    r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
+    if (r == -EIO) {
+      r = rep_repair_primary_object(soid, ctx);
+    }
+    if (r < 0) {
+      return r;
+    }
+
+    // Why does SPARSE_READ need a checksum?  In practice librbd always uses
+    // sparse-read.  At first there may not be many whole objects, but with
+    // continued use more and more whole objects exist, so verifying the
+    // checksum for sparse-read makes sense.
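+    // (Only a full-object result can be verified: r must equal oi.size below.)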
+ if ((uint64_t)r == oi.size && oi.is_data_digest()) { + uint32_t crc = data_bl.crc32c(-1); + if (oi.data_digest != crc) { + osd->clog->error() << info.pgid << std::hex + << " full-object read crc 0x" << crc + << " != expected 0x" << oi.data_digest + << std::dec << " on " << soid; + r = rep_repair_primary_object(soid, ctx); + if (r < 0) { + return r; + } + } + } + + op.extent.length = r; + + encode(m, osd_op.outdata); // re-encode since it might be modified + ::encode_destructively(data_bl, osd_op.outdata); + + dout(10) << " sparse_read got " << r << " bytes from object " + << soid << dendl; + } + + ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10); + ctx->delta_stats.num_rd++; + return 0; +} + +int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) +{ + int result = 0; + SnapSetContext *ssc = ctx->obc->ssc; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + const bool skip_data_digest = osd->store->has_builtin_csum() && + osd->osd_skip_data_digest; + + PGTransaction* t = ctx->op_t.get(); + + dout(10) << "do_osd_op " << soid << " " << ops << dendl; +#ifdef HAVE_JAEGER + if (ctx->op->osd_parent_span) { + auto do_osd_op_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span); + } +#endif + + ctx->current_osd_subop_num = 0; + for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) { + OSDOp& osd_op = *p; + ceph_osd_op& op = osd_op.op; + + OpFinisher* op_finisher = nullptr; + { + auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num); + if (op_finisher_it != ctx->op_finishers.end()) { + op_finisher = op_finisher_it->second.get(); + } + } + + // TODO: check endianness (ceph_le32 vs uint32_t, etc.) + // The fields in ceph_osd_op are little-endian (according to the definition in rados.h), + // but the code in this function seems to treat them as native-endian. What should the + // tracepoints do? + tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags); + + dout(10) << "do_osd_op " << osd_op << dendl; + + auto bp = osd_op.indata.cbegin(); + + // user-visible modifcation? + switch (op.op) { + // non user-visible modifications + case CEPH_OSD_OP_WATCH: + case CEPH_OSD_OP_CACHE_EVICT: + case CEPH_OSD_OP_CACHE_FLUSH: + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + case CEPH_OSD_OP_UNDIRTY: + case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly + case CEPH_OSD_OP_COPY_FROM2: + case CEPH_OSD_OP_CACHE_PIN: + case CEPH_OSD_OP_CACHE_UNPIN: + case CEPH_OSD_OP_SET_REDIRECT: + case CEPH_OSD_OP_SET_CHUNK: + case CEPH_OSD_OP_TIER_PROMOTE: + case CEPH_OSD_OP_TIER_FLUSH: + case CEPH_OSD_OP_TIER_EVICT: + break; + default: + if (op.op & CEPH_OSD_OP_MODE_WR) + ctx->user_modify = true; + } + + // munge -1 truncate to 0 truncate + if (ceph_osd_op_uses_extent(op.op) && + op.extent.truncate_seq == 1 && + op.extent.truncate_size == (-1ULL)) { + op.extent.truncate_size = 0; + op.extent.truncate_seq = 0; + } + + // munge ZERO -> TRUNCATE? 
(don't munge to DELETE or we risk hosing attributes) + if (op.op == CEPH_OSD_OP_ZERO && + obs.exists && + op.extent.offset < static_cast(osd->osd_max_object_size) && + op.extent.length >= 1 && + op.extent.length <= static_cast(osd->osd_max_object_size) && + op.extent.offset + op.extent.length >= oi.size) { + if (op.extent.offset >= oi.size) { + // no-op + goto fail; + } + dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length + << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl; + op.op = CEPH_OSD_OP_TRUNCATE; + } + + switch (op.op) { + + // --- READS --- + + case CEPH_OSD_OP_CMPEXT: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + + if (op_finisher == nullptr) { + result = do_extent_cmp(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_SYNC_READ: + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + // fall through + case CEPH_OSD_OP_READ: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + if (op_finisher == nullptr) { + if (!ctx->data_off) { + ctx->data_off = op.extent.offset; + } + result = do_read(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_CHECKSUM: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type, + op.checksum.offset, op.checksum.length, + op.checksum.chunk_size); + + if (op_finisher == nullptr) { + result = do_checksum(ctx, osd_op, &bp); + } else { + result = op_finisher->execute(); + } + } + break; + + /* map extents */ + case CEPH_OSD_OP_MAPEXT: + tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_read; + { + // read into a buffer + bufferlist bl; + int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN, + info.pgid.shard), + op.extent.offset, op.extent.length, bl); + osd_op.outdata = std::move(bl); + if (r < 0) + result = r; + else + ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + ctx->delta_stats.num_rd++; + dout(10) << " map_extents done on object " << soid << dendl; + } + break; + + /* map extents */ + case CEPH_OSD_OP_SPARSE_READ: + tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + if (op_finisher == nullptr) { + result = do_sparse_read(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_CALL: + { + string cname, mname; + bufferlist indata; + try { + bp.copy(op.cls.class_len, cname); + bp.copy(op.cls.method_len, mname); + bp.copy(op.cls.indata_len, indata); + } catch (ceph::buffer::error& e) { + dout(10) << "call unable to decode class + method + indata" << dendl; + dout(30) << "in dump: "; + osd_op.indata.hexdump(*_dout); + *_dout << dendl; + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???"); + break; + } + tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), 
soid.snap.val, cname.c_str(), mname.c_str()); + + ClassHandler::ClassData *cls; + result = ClassHandler::get_instance().open_class(cname, &cls); + ceph_assert(result == 0); // init_op_flags() already verified this works. + + ClassHandler::ClassMethod *method = cls->get_method(mname); + if (!method) { + dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl; + result = -EOPNOTSUPP; + break; + } + + int flags = method->get_flags(); + if (flags & CLS_METHOD_WR) + ctx->user_modify = true; + + bufferlist outdata; + dout(10) << "call method " << cname << "." << mname << dendl; + int prev_rd = ctx->num_read; + int prev_wr = ctx->num_write; + result = method->exec((cls_method_context_t)&ctx, indata, outdata); + + if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) { + derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl; + result = -EIO; + break; + } + if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) { + derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl; + result = -EIO; + break; + } + + dout(10) << "method called response length=" << outdata.length() << dendl; + op.extent.length = outdata.length(); + osd_op.outdata.claim_append(outdata); + dout(30) << "out dump: "; + osd_op.outdata.hexdump(*_dout); + *_dout << dendl; + } + break; + + case CEPH_OSD_OP_STAT: + // note: stat does not require RD + { + tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val); + + if (obs.exists && !oi.is_whiteout()) { + encode(oi.size, osd_op.outdata); + encode(oi.mtime, osd_op.outdata); + dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl; + } else { + result = -ENOENT; + dout(10) << "stat oi object does not exist" << dendl; + } + + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_ISDIRTY: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val); + bool is_dirty = obs.oi.is_dirty(); + encode(is_dirty, osd_op.outdata); + ctx->delta_stats.num_rd++; + result = 0; + } + break; + + case CEPH_OSD_OP_UNDIRTY: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val); + if (oi.is_dirty()) { + ctx->undirty = true; // see make_writeable() + ctx->modify = true; + ctx->delta_stats.num_wr++; + } + } + break; + + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val); + if (ctx->lock_type != RWState::RWNONE) { + dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl; + result = -EINVAL; + break; + } + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + if (oi.is_dirty()) { + result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + } + break; + + case CEPH_OSD_OP_CACHE_FLUSH: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val); + if (ctx->lock_type == RWState::RWNONE) { + dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl; + result = -EINVAL; + break; + } + if (pool.info.cache_mode == 
pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + hobject_t missing; + if (oi.is_dirty()) { + result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + // Check special return value which has set missing_return + if (result == -ENOENT) { + dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl; + ceph_assert(!missing.is_min()); + wait_for_unreadable_object(missing, ctx->op); + // Error code which is used elsewhere when wait_for_unreadable_object() is used + result = -EAGAIN; + } + } + break; + + case CEPH_OSD_OP_CACHE_EVICT: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + if (oi.is_dirty()) { + result = -EBUSY; + break; + } + if (!oi.watchers.empty()) { + result = -EBUSY; + break; + } + if (soid.snap == CEPH_NOSNAP) { + result = _verify_no_head_clones(soid, ssc->snapset); + if (result < 0) + break; + } + result = _delete_oid(ctx, true, false); + if (result >= 0) { + // mark that this is a cache eviction to avoid triggering normal + // make_writeable() clone creation in finish_ctx() + ctx->cache_operation = true; + } + osd->logger->inc(l_osd_tier_evict); + } + break; + + case CEPH_OSD_OP_GETXATTR: + ++ctx->num_read; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + int r = getattr_maybe_cache( + ctx->obc, + name, + &(osd_op.outdata)); + if (r >= 0) { + op.xattr.value_len = osd_op.outdata.length(); + result = 0; + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + } else + result = r; + + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_GETXATTRS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val); + map out; + result = getattrs_maybe_cache( + ctx->obc, + &out); + + bufferlist bl; + encode(out, bl); + ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + ctx->delta_stats.num_rd++; + osd_op.outdata.claim_append(bl); + } + break; + + case CEPH_OSD_OP_CMPXATTR: + ++ctx->num_read; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + name[op.xattr.name_len + 1] = 0; + + bufferlist xattr; + result = getattr_maybe_cache( + ctx->obc, + name, + &xattr); + if (result < 0 && result != -EEXIST && result != -ENODATA) + break; + + ctx->delta_stats.num_rd++; + ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10); + + switch (op.xattr.cmp_mode) { + case CEPH_OSD_CMPXATTR_MODE_STRING: + { + string val; + bp.copy(op.xattr.value_len, val); + val[op.xattr.value_len] = 0; + dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val + << " op=" << (int)op.xattr.cmp_op << " mode=" << 
(int)op.xattr.cmp_mode << dendl; + result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr); + } + break; + + case CEPH_OSD_CMPXATTR_MODE_U64: + { + uint64_t u64val; + try { + decode(u64val, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val + << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl; + result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr); + } + break; + + default: + dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl; + result = -EINVAL; + } + + if (!result) { + dout(10) << "comparison returned false" << dendl; + result = -ECANCELED; + break; + } + if (result < 0) { + dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl; + break; + } + + dout(10) << "comparison returned true" << dendl; + } + break; + + case CEPH_OSD_OP_ASSERT_VER: + ++ctx->num_read; + { + uint64_t ver = op.assert_ver.ver; + tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver); + if (!ver) + result = -EINVAL; + else if (ver < oi.user_version) + result = -ERANGE; + else if (ver > oi.user_version) + result = -EOVERFLOW; + } + break; + + case CEPH_OSD_OP_LIST_WATCHERS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val); + obj_list_watch_response_t resp; + + map, watch_info_t>::const_iterator oi_iter; + for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end(); + ++oi_iter) { + dout(20) << "key cookie=" << oi_iter->first.first + << " entity=" << oi_iter->first.second << " " + << oi_iter->second << dendl; + ceph_assert(oi_iter->first.first == oi_iter->second.cookie); + ceph_assert(oi_iter->first.second.is_client()); + + watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie, + oi_iter->second.timeout_seconds, oi_iter->second.addr); + resp.entries.push_back(wi); + } + + resp.encode(osd_op.outdata, ctx->get_features()); + result = 0; + + ctx->delta_stats.num_rd++; + break; + } + + case CEPH_OSD_OP_LIST_SNAPS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val); + obj_list_snap_response_t resp; + + if (!ssc) { + ssc = ctx->obc->ssc = get_snapset_context(soid, false); + } + ceph_assert(ssc); + dout(20) << " snapset " << ssc->snapset << dendl; + + int clonecount = ssc->snapset.clones.size(); + clonecount++; // for head + resp.clones.reserve(clonecount); + for (auto clone_iter = ssc->snapset.clones.begin(); + clone_iter != ssc->snapset.clones.end(); ++clone_iter) { + clone_info ci; + ci.cloneid = *clone_iter; + + hobject_t clone_oid = soid; + clone_oid.snap = *clone_iter; + + auto p = ssc->snapset.clone_snaps.find(*clone_iter); + if (p == ssc->snapset.clone_snaps.end()) { + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_snaps found for oid " + << soid << " clone " << *clone_iter + << " snapset " << ssc->snapset; + result = -EINVAL; + break; + } + for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) { + ci.snaps.push_back(*q); + } + + dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl; + + map >::const_iterator coi; + coi = ssc->snapset.clone_overlap.find(ci.cloneid); + if (coi == ssc->snapset.clone_overlap.end()) { + osd->clog->error() << "osd." 
<< osd->whoami + << ": inconsistent clone_overlap found for oid " + << soid << " clone " << *clone_iter; + result = -EINVAL; + break; + } + const interval_set &o = coi->second; + ci.overlap.reserve(o.num_intervals()); + for (interval_set::const_iterator r = o.begin(); + r != o.end(); ++r) { + ci.overlap.push_back(pair(r.get_start(), + r.get_len())); + } + + map::const_iterator si; + si = ssc->snapset.clone_size.find(ci.cloneid); + if (si == ssc->snapset.clone_size.end()) { + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_size found for oid " + << soid << " clone " << *clone_iter; + result = -EINVAL; + break; + } + ci.size = si->second; + + resp.clones.push_back(ci); + } + if (result < 0) { + break; + } + if (!ctx->obc->obs.oi.is_whiteout()) { + ceph_assert(obs.exists); + clone_info ci; + ci.cloneid = CEPH_NOSNAP; + + //Size for HEAD is oi.size + ci.size = oi.size; + + resp.clones.push_back(ci); + } + resp.seq = ssc->snapset.seq; + + resp.encode(osd_op.outdata); + result = 0; + + ctx->delta_stats.num_rd++; + break; + } + + case CEPH_OSD_OP_NOTIFY: + ++ctx->num_read; + { + uint32_t timeout; + bufferlist bl; + + try { + uint32_t ver; // obsolete + decode(ver, bp); + decode(timeout, bp); + decode(bl, bp); + } catch (const ceph::buffer::error &e) { + timeout = 0; + } + tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout); + if (!timeout) + timeout = cct->_conf->osd_default_notify_timeout; + + notify_info_t n; + n.timeout = timeout; + n.notify_id = osd->get_next_id(get_osdmap_epoch()); + n.cookie = op.notify.cookie; + n.bl = bl; + ctx->notifies.push_back(n); + + // return our unique notify id to the client + encode(n.notify_id, osd_op.outdata); + } + break; + + case CEPH_OSD_OP_NOTIFY_ACK: + ++ctx->num_read; + { + try { + uint64_t notify_id = 0; + uint64_t watch_cookie = 0; + decode(notify_id, bp); + decode(watch_cookie, bp); + bufferlist reply_bl; + if (!bp.end()) { + decode(reply_bl, bp); + } + tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y"); + OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl); + ctx->notify_acks.push_back(ack); + } catch (const ceph::buffer::error &e) { + tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N"); + OpContext::NotifyAck ack( + // op.watch.cookie is actually the notify_id for historical reasons + op.watch.cookie + ); + ctx->notify_acks.push_back(ack); + } + } + break; + + case CEPH_OSD_OP_SETALLOCHINT: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size); + maybe_create_new_object(ctx); + oi.expected_object_size = op.alloc_hint.expected_object_size; + oi.expected_write_size = op.alloc_hint.expected_write_size; + oi.alloc_hint_flags = op.alloc_hint.flags; + t->set_alloc_hint(soid, op.alloc_hint.expected_object_size, + op.alloc_hint.expected_write_size, + op.alloc_hint.flags); + } + break; + + + // --- WRITES --- + + // -- object data -- + + case CEPH_OSD_OP_WRITE: + ++ctx->num_write; + result = 0; + { // write + __u32 seq = oi.truncate_seq; + tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + if (op.extent.length != osd_op.indata.length()) { + result = -EINVAL; + break; + } + + if 
(pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED)) + op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + if (pool.info.requires_aligned_append() && + (op.extent.offset % pool.info.required_alignment() != 0)) { + result = -EOPNOTSUPP; + break; + } + + if (!obs.exists) { + if (pool.info.requires_aligned_append() && op.extent.offset) { + result = -EOPNOTSUPP; + break; + } + } else if (op.extent.offset != oi.size && + pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + + if (seq && (seq > op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > oi.size)) { + // old write, arrived after trimtrunc + op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset); + dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq + << ", adjusting write length to " << op.extent.length << dendl; + bufferlist t; + t.substr_of(osd_op.indata, 0, op.extent.length); + osd_op.indata.swap(t); + } + if (op.extent.truncate_seq > seq) { + // write arrives before trimtrunc + if (obs.exists && !oi.is_whiteout()) { + dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq + << ", truncating to " << op.extent.truncate_size << dendl; + t->truncate(soid, op.extent.truncate_size); + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + if (oi.size > op.extent.truncate_size) { + interval_set trim; + trim.insert(op.extent.truncate_size, + oi.size - op.extent.truncate_size); + ctx->modified_ranges.union_of(trim); + ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size); + oi.clear_data_digest(); + } + if (op.extent.truncate_size != oi.size) { + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.truncate_size); + } + } else { + dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq + << ", but object is new" << dendl; + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + } + } + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + maybe_create_new_object(ctx); + + if (op.extent.length == 0) { + if (op.extent.offset > oi.size) { + t->truncate( + soid, op.extent.offset); + truncate_update_size_and_usage(ctx->delta_stats, oi, + op.extent.offset); + } else { + t->nop(soid); + } + } else { + t->write( + soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags); + } + + if (op.extent.offset == 0 && op.extent.length >= oi.size + && !skip_data_digest) { + obs.oi.set_data_digest(osd_op.indata.crc32c(-1)); + } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) { + if (skip_data_digest) { + obs.oi.clear_data_digest(); + } else { + obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest)); + } + } else { + obs.oi.clear_data_digest(); + } + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + op.extent.offset, op.extent.length); + ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length); + dout(10) << "clean_regions modified" << ctx->clean_regions << dendl; + } + break; + + case CEPH_OSD_OP_WRITEFULL: + ++ctx->num_write; + result = 0; + { // write full object + tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length); + + if (op.extent.length != osd_op.indata.length()) { + result = -EINVAL; + break; + } + result = check_offset_and_length( 
+ 0, op.extent.length, + static_cast(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED)) + op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + maybe_create_new_object(ctx); + if (pool.info.is_erasure()) { + t->truncate(soid, 0); + } else if (obs.exists && op.extent.length < oi.size) { + t->truncate(soid, op.extent.length); + } + if (op.extent.length) { + t->write(soid, 0, op.extent.length, osd_op.indata, op.flags); + } + if (!skip_data_digest) { + obs.oi.set_data_digest(osd_op.indata.crc32c(-1)); + } else { + obs.oi.clear_data_digest(); + } + ctx->clean_regions.mark_data_region_dirty(0, + std::max((uint64_t)op.extent.length, oi.size)); + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + 0, op.extent.length, true); + } + break; + + case CEPH_OSD_OP_WRITESAME: + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length); + result = do_writesame(ctx, osd_op); + break; + + case CEPH_OSD_OP_ROLLBACK : + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val); + result = _rollback_to(ctx, op); + break; + + case CEPH_OSD_OP_ZERO: + tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length); + if (pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + { // zero + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + ceph_assert(op.extent.length); + if (obs.exists && !oi.is_whiteout()) { + t->zero(soid, op.extent.offset, op.extent.length); + interval_set ch; + ch.insert(op.extent.offset, op.extent.length); + ctx->modified_ranges.union_of(ch); + ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length); + ctx->delta_stats.num_wr++; + oi.clear_data_digest(); + } else { + // no-op + } + } + break; + case CEPH_OSD_OP_CREATE: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val); + if (obs.exists && !oi.is_whiteout() && + (op.flags & CEPH_OSD_OP_FLAG_EXCL)) { + result = -EEXIST; /* this is an exclusive create */ + } else { + if (osd_op.indata.length()) { + auto p = osd_op.indata.cbegin(); + string category; + try { + decode(category, p); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + // category is no longer implemented. 
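+          // (Illustrative note: the category payload above is still decoded
+          //  only so a malformed buffer from a legacy client fails cleanly
+          //  with -EINVAL; the decoded value itself is discarded.)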
+ } + maybe_create_new_object(ctx); + t->nop(soid); + } + } + break; + + case CEPH_OSD_OP_TRIMTRUNC: + op.extent.offset = op.extent.truncate_size; + // falling through + + case CEPH_OSD_OP_TRUNCATE: + tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + if (pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + // truncate + if (!obs.exists || oi.is_whiteout()) { + dout(10) << " object dne, truncate is a no-op" << dendl; + break; + } + + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + if (op.extent.truncate_seq) { + ceph_assert(op.extent.offset == op.extent.truncate_size); + if (op.extent.truncate_seq <= oi.truncate_seq) { + dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq + << ", no-op" << dendl; + break; // old + } + dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq + << ", truncating" << dendl; + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + } + + maybe_create_new_object(ctx); + t->truncate(soid, op.extent.offset); + if (oi.size > op.extent.offset) { + interval_set trim; + trim.insert(op.extent.offset, oi.size-op.extent.offset); + ctx->modified_ranges.union_of(trim); + ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset); + } else if (oi.size < op.extent.offset) { + ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size); + } + if (op.extent.offset != oi.size) { + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.offset); + } + ctx->delta_stats.num_wr++; + // do no set exists, or we will break above DELETE -> TRUNCATE munging. 
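+      // Ordering sketch (illustrative): TRIMTRUNC/TRUNCATE carries a
+      // truncate_seq so a truncate and a racing write still land in the
+      // intended order regardless of arrival order.  A truncate whose seq
+      // is <= the recorded oi.truncate_seq is stale and takes the "no-op"
+      // branch above, while the WRITE case earlier in this switch shortens
+      // or drops old writes that arrive after a newer trimtrunc.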
+ + oi.clear_data_digest(); + } + break; + + case CEPH_OSD_OP_DELETE: + ++ctx->num_write; + result = 0; + tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val); + { + result = _delete_oid(ctx, false, ctx->ignore_cache); + } + break; + + case CEPH_OSD_OP_WATCH: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val, + op.watch.cookie, op.watch.op); + if (!obs.exists) { + result = -ENOENT; + break; + } + result = 0; + uint64_t cookie = op.watch.cookie; + entity_name_t entity = ctx->reqid.name; + ObjectContextRef obc = ctx->obc; + + dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op) + << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie + << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl; + dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl; + dout(10) << "watch: peer_addr=" + << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl; + + uint32_t timeout = cct->_conf->osd_client_watch_timeout; + if (op.watch.timeout != 0) { + timeout = op.watch.timeout; + } + + watch_info_t w(cookie, timeout, + ctx->op->get_req()->get_connection()->get_peer_addr()); + if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH || + op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) { + if (oi.watchers.count(make_pair(cookie, entity))) { + dout(10) << " found existing watch " << w << " by " << entity << dendl; + } else { + dout(10) << " registered new watch " << w << " by " << entity << dendl; + oi.watchers[make_pair(cookie, entity)] = w; + t->nop(soid); // make sure update the object_info on disk! + } + bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH); + ctx->watch_connects.push_back(make_pair(w, will_ping)); + } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) { + if (!oi.watchers.count(make_pair(cookie, entity))) { + result = -ENOTCONN; + break; + } + dout(10) << " found existing watch " << w << " by " << entity << dendl; + ctx->watch_connects.push_back(make_pair(w, true)); + } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) { + /* Note: WATCH with PING doesn't cause may_write() to return true, + * so if there is nothing else in the transaction, this is going + * to run do_osd_op_effects, but not write out a log entry */ + if (!oi.watchers.count(make_pair(cookie, entity))) { + result = -ENOTCONN; + break; + } + map,WatchRef>::iterator p = + obc->watchers.find(make_pair(cookie, entity)); + if (p == obc->watchers.end() || + !p->second->is_connected()) { + // client needs to reconnect + result = -ETIMEDOUT; + break; + } + dout(10) << " found existing watch " << w << " by " << entity << dendl; + p->second->got_ping(ceph_clock_now()); + result = 0; + } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) { + map, watch_info_t>::iterator oi_iter = + oi.watchers.find(make_pair(cookie, entity)); + if (oi_iter != oi.watchers.end()) { + dout(10) << " removed watch " << oi_iter->second << " by " + << entity << dendl; + oi.watchers.erase(oi_iter); + t->nop(soid); // update oi on disk + ctx->watch_disconnects.push_back( + watch_disconnect_t(cookie, entity, false)); + } else { + dout(10) << " can't remove: no watch by " << entity << dendl; + } + } + } + break; + + case CEPH_OSD_OP_CACHE_PIN: + tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val); + if ((!pool.info.is_tier() || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) { + result = -EINVAL; + dout(10) << " pin object is only allowed on the cache tier " << dendl; + break; + } + 
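+    // Pinning marks the object so it stays resident in the cache tier: the
+    // explicit CACHE_TRY_FLUSH/CACHE_FLUSH/CACHE_EVICT cases earlier in
+    // this switch return -EPERM while FLAG_CACHE_PIN is set, and the tier
+    // agent likewise skips pinned objects.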
++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + + if (!oi.is_cache_pinned()) { + oi.set_flag(object_info_t::FLAG_CACHE_PIN); + ctx->modify = true; + ctx->delta_stats.num_objects_pinned++; + ctx->delta_stats.num_wr++; + } + } + break; + + case CEPH_OSD_OP_CACHE_UNPIN: + tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val); + if ((!pool.info.is_tier() || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) { + result = -EINVAL; + dout(10) << " pin object is only allowed on the cache tier " << dendl; + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + + if (oi.is_cache_pinned()) { + oi.clear_flag(object_info_t::FLAG_CACHE_PIN); + ctx->modify = true; + ctx->delta_stats.num_objects_pinned--; + ctx->delta_stats.num_wr++; + } + } + break; + + case CEPH_OSD_OP_SET_REDIRECT: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + + object_t target_name; + object_locator_t target_oloc; + snapid_t target_snapid = (uint64_t)op.copy_from.snapid; + version_t target_version = op.copy_from.src_version; + try { + decode(target_name, bp); + decode(target_oloc, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + pg_t raw_pg; + result = get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg); + if (result < 0) { + dout(5) << " pool information is invalid: " << result << dendl; + break; + } + hobject_t target(target_name, target_oloc.key, target_snapid, + raw_pg.ps(), raw_pg.pool(), + target_oloc.nspace); + if (target == soid) { + dout(20) << " set-redirect self is invalid" << dendl; + result = -EINVAL; + break; + } + + bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE); + bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + if (has_reference) { + result = -EINVAL; + dout(5) << " the object is already a manifest " << dendl; + break; + } + if (op_finisher == nullptr && need_reference) { + // start + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new SetManifestFinisher(osd_op)); + ManifestOpRef mop = std::make_shared(new RefCountCallback(ctx, osd_op)); + C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, soid); + ceph_tid_t tid = refcount_manifest(soid, target, + refcount_t::INCREMENT_REF, fin, std::nullopt); + mop->objecter_tid = tid; + manifest_ops[soid] = mop; + ctx->obc->start_block(); + result = -EINPROGRESS; + } else { + // finish + if (op_finisher) { + result = op_finisher->execute(); + ceph_assert(result == 0); + } + + if (!oi.has_manifest() && !oi.manifest.is_redirect()) + ctx->delta_stats.num_objects_manifest++; + + oi.set_flag(object_info_t::FLAG_MANIFEST); + oi.manifest.redirect_target = target; + oi.manifest.type = object_manifest_t::TYPE_REDIRECT; + t->truncate(soid, 0); + ctx->clean_regions.mark_data_region_dirty(0, oi.size); + if (oi.is_omap() && pool.info.supports_omap()) { + t->omap_clear(soid); + obs.oi.clear_omap_digest(); + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + ctx->clean_regions.mark_omap_dirty(); + } + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + 0, oi.size, false); + ctx->delta_stats.num_bytes -= oi.size; + oi.size = 0; + oi.new_object(); + oi.user_version = 
target_version; + ctx->user_at_version = target_version; + /* rm_attrs */ + map rmattrs; + result = getattrs_maybe_cache(ctx->obc, &rmattrs); + if (result < 0) { + dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl; + return result; + } + map::iterator iter; + for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) { + const string& name = iter->first; + t->rmattr(soid, name); + } + if (!has_reference && need_reference) { + oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + } + dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl; + if (op_finisher) { + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + } + + break; + + case CEPH_OSD_OP_SET_CHUNK: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + if (oi.manifest.is_redirect()) { + result = -EINVAL; + goto fail; + } + + object_locator_t tgt_oloc; + uint64_t src_offset, src_length, tgt_offset; + object_t tgt_name; + try { + decode(src_offset, bp); + decode(src_length, bp); + decode(tgt_oloc, bp); + decode(tgt_name, bp); + decode(tgt_offset, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + + if (!src_length) { + result = -EINVAL; + goto fail; + } + if (src_offset + src_length > oi.size) { + result = -ERANGE; + goto fail; + } + if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) { + result = -EOPNOTSUPP; + break; + } + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + + for (auto &p : oi.manifest.chunk_map) { + interval_set chunk; + chunk.insert(p.first, p.second.length); + if (chunk.intersects(src_offset, src_length)) { + dout(20) << __func__ << " overlapped !! 
offset: " << src_offset << " length: " << src_length + << " chunk_info: " << p << dendl; + result = -EOPNOTSUPP; + goto fail; + } + } + + pg_t raw_pg; + chunk_info_t chunk_info; + result = get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg); + if (result < 0) { + dout(5) << " pool information is invalid: " << result << dendl; + break; + } + hobject_t target(tgt_name, tgt_oloc.key, snapid_t(), + raw_pg.ps(), raw_pg.pool(), + tgt_oloc.nspace); + bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) && + (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE)); + if (has_reference) { + result = -EINVAL; + dout(5) << " the object is already a manifest " << dendl; + break; + } + chunk_info.oid = target; + chunk_info.offset = tgt_offset; + chunk_info.length = src_length; + if (op_finisher == nullptr) { + // start + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new SetManifestFinisher(osd_op)); + object_manifest_t set_chunk; + bool need_inc_ref = false; + set_chunk.chunk_map[src_offset] = chunk_info; + need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op); + if (need_inc_ref) { + result = -EINPROGRESS; + break; + } + } + if (op_finisher) { + result = op_finisher->execute(); + ceph_assert(result == 0); + } + + oi.manifest.chunk_map[src_offset] = chunk_info; + if (!oi.has_manifest() && !oi.manifest.is_chunked()) + ctx->delta_stats.num_objects_manifest++; + oi.set_flag(object_info_t::FLAG_MANIFEST); + oi.manifest.type = object_manifest_t::TYPE_CHUNKED; + if (!has_reference) { + oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE); + } + ctx->modify = true; + ctx->cache_operation = true; + + dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version + << " chunk_info: " << chunk_info << dendl; + if (op_finisher) { + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + + break; + + case CEPH_OSD_OP_TIER_PROMOTE: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + if (!obs.oi.has_manifest()) { + result = 0; + break; + } + + if (op_finisher == nullptr) { + PromoteManifestCallback *cb; + object_locator_t my_oloc; + hobject_t src_hoid; + + if (obs.oi.manifest.is_chunked()) { + src_hoid = obs.oi.soid; + } else if (obs.oi.manifest.is_redirect()) { + object_locator_t src_oloc(obs.oi.manifest.redirect_target); + my_oloc = src_oloc; + src_hoid = obs.oi.manifest.redirect_target; + } else { + ceph_abort_msg("unrecognized manifest type"); + } + cb = new PromoteManifestCallback(ctx->obc, this, ctx); + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new PromoteFinisher(cb)); + unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | + CEPH_OSD_COPY_FROM_FLAG_RWORDERED; + unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL; + start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags, + obs.oi.soid.snap == CEPH_NOSNAP, + src_fadvise_flags, 0); + + dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl; + result = -EINPROGRESS; + } else { + result = op_finisher->execute(); + ceph_assert(result == 0); + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + + break; + + case CEPH_OSD_OP_TIER_FLUSH: + ++ctx->num_write; + result = 0; 
+ { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + result = -EOPNOTSUPP; + break; + } + if (!obs.oi.has_manifest()) { + result = 0; + break; + } + + if (oi.is_dirty()) { + result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + } + + break; + + case CEPH_OSD_OP_TIER_EVICT: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + result = -EOPNOTSUPP; + break; + } + if (!obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + + // The chunks already has a reference, so it is just enough to invoke truncate if necessary + uint64_t chunk_length = 0; + for (auto p : obs.oi.manifest.chunk_map) { + chunk_length += p.second.length; + } + if (chunk_length == obs.oi.size) { + for (auto &p : obs.oi.manifest.chunk_map) { + p.second.set_flag(chunk_info_t::FLAG_MISSING); + } + // punch hole + t->zero(soid, 0, oi.size); + oi.clear_data_digest(); + ctx->delta_stats.num_wr++; + ctx->cache_operation = true; + } + osd->logger->inc(l_osd_tier_evict); + } + + break; + + case CEPH_OSD_OP_UNSET_MANIFEST: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (!oi.has_manifest()) { + result = -EOPNOTSUPP; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + + dec_all_refcount_manifest(oi, ctx); + + oi.clear_flag(object_info_t::FLAG_MANIFEST); + oi.manifest = object_manifest_t(); + ctx->delta_stats.num_objects_manifest--; + ctx->delta_stats.num_wr++; + ctx->modify = true; + } + + break; + + // -- object attrs -- + + case CEPH_OSD_OP_SETXATTR: + ++ctx->num_write; + result = 0; + { + if (cct->_conf->osd_max_attr_size > 0 && + op.xattr.value_len > cct->_conf->osd_max_attr_size) { + tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???"); + result = -EFBIG; + break; + } + unsigned max_name_len = + std::min(osd->store->get_max_attr_name_length(), + cct->_conf->osd_max_attr_name_len); + if (op.xattr.name_len > max_name_len) { + result = -ENAMETOOLONG; + break; + } + maybe_create_new_object(ctx); + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + bufferlist bl; + bp.copy(op.xattr.value_len, bl); + t->setattr(soid, name, bl); + ctx->delta_stats.num_wr++; + } + break; + + case CEPH_OSD_OP_RMXATTR: + ++ctx->num_write; + result = 0; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + string name = "_" + aname; + t->rmattr(soid, name); + ctx->delta_stats.num_wr++; + } + break; + + + // -- fancy writers -- + case CEPH_OSD_OP_APPEND: + { + tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + // just do it inline; this works because we are happy to execute + // fancy op on replicas as well. 
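+      // e.g. an APPEND of N bytes to an object of size S is rewritten as
+      //   WRITE(offset = S, length = N, truncate_seq = oi.truncate_seq)
+      // and fed back through do_osd_ops(), so all of the WRITE-path
+      // bookkeeping (digests, clean_regions, stats) is reused verbatim.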
+ vector nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITE; + newop.op.extent.offset = oi.size; + newop.op.extent.length = op.extent.length; + newop.op.extent.truncate_seq = oi.truncate_seq; + newop.indata = osd_op.indata; + result = do_osd_ops(ctx, nops); + osd_op.outdata = std::move(newop.outdata); + } + break; + + case CEPH_OSD_OP_STARTSYNC: + result = 0; + t->nop(soid); + break; + + // -- trivial map -- + case CEPH_OSD_OP_TMAPGET: + tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + { + vector nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_SYNC_READ; + newop.op.extent.offset = 0; + newop.op.extent.length = 0; + result = do_osd_ops(ctx, nops); + osd_op.outdata = std::move(newop.outdata); + } + break; + + case CEPH_OSD_OP_TMAPPUT: + tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + { + //_dout_lock.Lock(); + //osd_op.data.hexdump(*_dout); + //_dout_lock.Unlock(); + + // verify sort order + bool unsorted = false; + if (true) { + bufferlist header; + decode(header, bp); + uint32_t n; + decode(n, bp); + string last_key; + while (n--) { + string key; + decode(key, bp); + dout(10) << "tmapput key " << key << dendl; + bufferlist val; + decode(val, bp); + if (key < last_key) { + dout(10) << "TMAPPUT is unordered; resorting" << dendl; + unsorted = true; + break; + } + last_key = key; + } + } + + // write it + vector nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = osd_op.indata.length(); + newop.indata = osd_op.indata; + + if (unsorted) { + bp = osd_op.indata.begin(); + bufferlist header; + map m; + decode(header, bp); + decode(m, bp); + ceph_assert(bp.end()); + bufferlist newbl; + encode(header, newbl); + encode(m, newbl); + newop.indata = newbl; + } + result = do_osd_ops(ctx, nops); + ceph_assert(result == 0); + } + break; + + case CEPH_OSD_OP_TMAPUP: + tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = do_tmapup(ctx, bp, osd_op); + break; + + case CEPH_OSD_OP_TMAP2OMAP: + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val); + result = do_tmap2omap(ctx, op.tmap2omap.flags); + break; + + // OMAP Read ops + case CEPH_OSD_OP_OMAPGETKEYS: + ++ctx->num_read; + { + string start_after; + uint64_t max_return; + try { + decode(start_after, bp); + decode(max_return, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0); + goto fail; + } + if (max_return > cct->_conf->osd_max_omap_entries_per_request) { + max_return = cct->_conf->osd_max_omap_entries_per_request; + } + tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return); + + bufferlist bl; + uint32_t num = 0; + bool truncated = false; + if (oi.is_omap()) { + ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( + ch, ghobject_t(soid) + ); + ceph_assert(iter); + iter->upper_bound(start_after); + for (num = 0; iter->valid(); ++num, iter->next()) { + if (num >= max_return || + bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(iter->key(), bl); + } + } 
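+      // max_return was clamped above to osd_max_omap_entries_per_request,
+      // and the accumulated key buffer is additionally capped at
+      // osd_max_omap_bytes_per_request; hitting either limit sets
+      // "truncated" so the client can resume from the last key returned.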
// else return empty out_set + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(bl); + encode(truncated, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETVALS: + ++ctx->num_read; + { + string start_after; + uint64_t max_return; + string filter_prefix; + try { + decode(start_after, bp); + decode(max_return, bp); + decode(filter_prefix, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???"); + goto fail; + } + if (max_return > cct->_conf->osd_max_omap_entries_per_request) { + max_return = cct->_conf->osd_max_omap_entries_per_request; + } + tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str()); + + uint32_t num = 0; + bool truncated = false; + bufferlist bl; + if (oi.is_omap()) { + ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( + ch, ghobject_t(soid) + ); + if (!iter) { + result = -ENOENT; + goto fail; + } + iter->upper_bound(start_after); + if (filter_prefix > start_after) iter->lower_bound(filter_prefix); + for (num = 0; + iter->valid() && + iter->key().substr(0, filter_prefix.size()) == filter_prefix; + ++num, iter->next()) { + dout(20) << "Found key " << iter->key() << dendl; + if (num >= max_return || + bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(iter->key(), bl); + encode(iter->value(), bl); + } + } // else return empty out_set + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(bl); + encode(truncated, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETHEADER: + tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val); + if (!oi.is_omap()) { + // return empty header + break; + } + ++ctx->num_read; + { + osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + ++ctx->num_read; + { + set keys_to_get; + try { + decode(keys_to_get, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???"); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str()); + map out; + if (oi.is_omap()) { + osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out); + } // else return empty omap entries + encode(out, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAP_CMP: + ++ctx->num_read; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???"); + break; + } + map > assertions; + try { + decode(assertions, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???"); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str()); + + map out; + + if (oi.is_omap()) { 
+ set to_get; + for (map >::iterator i = assertions.begin(); + i != assertions.end(); + ++i) + to_get.insert(i->first); + int r = osd->store->omap_get_values(ch, ghobject_t(soid), + to_get, &out); + if (r < 0) { + result = r; + break; + } + } // else leave out empty + + //Should set num_rd_kb based on encode length of map + ctx->delta_stats.num_rd++; + + int r = 0; + bufferlist empty; + for (map >::iterator i = assertions.begin(); + i != assertions.end(); + ++i) { + auto out_entry = out.find(i->first); + bufferlist &bl = (out_entry != out.end()) ? + out_entry->second : empty; + switch (i->second.second) { + case CEPH_OSD_CMPXATTR_OP_EQ: + if (!(bl == i->second.first)) { + r = -ECANCELED; + } + break; + case CEPH_OSD_CMPXATTR_OP_LT: + if (!(bl < i->second.first)) { + r = -ECANCELED; + } + break; + case CEPH_OSD_CMPXATTR_OP_GT: + if (!(bl > i->second.first)) { + r = -ECANCELED; + } + break; + default: + r = -EINVAL; + break; + } + if (r < 0) + break; + } + if (r < 0) { + result = r; + } + } + break; + + // OMAP Write ops + case CEPH_OSD_OP_OMAPSETVALS: + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + break; + } + ++ctx->num_write; + result = 0; + { + maybe_create_new_object(ctx); + bufferlist to_set_bl; + try { + decode_str_str_map_to_bl(bp, &to_set_bl); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + if (cct->_conf->subsys.should_gather()) { + dout(20) << "setting vals: " << dendl; + map to_set; + bufferlist::const_iterator pt = to_set_bl.begin(); + decode(to_set, pt); + for (map::iterator i = to_set.begin(); + i != to_set.end(); + ++i) { + dout(20) << "\t" << i->first << dendl; + } + } + t->omap_setkeys(soid, to_set_bl); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10); + } + obs.oi.set_flag(object_info_t::FLAG_OMAP); + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPSETHEADER: + tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + maybe_create_new_object(ctx); + t->omap_setheader(soid, osd_op.indata); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + } + obs.oi.set_flag(object_info_t::FLAG_OMAP); + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPCLEAR: + tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + if (oi.is_omap()) { + t->omap_clear(soid); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + obs.oi.clear_omap_digest(); + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + } + break; + + case CEPH_OSD_OP_OMAPRMKEYS: + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + break; + } + bufferlist to_rm_bl; + 
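+      // decode_str_set_to_bl() turns the client's encoded key set from the
+      // request into a bufferlist that t->omap_rmkeys() consumes directly
+      // (seemingly to avoid materialising an intermediate std::set only to
+      // re-encode it).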
try { + decode_str_set_to_bl(bp, &to_rm_bl); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + t->omap_rmkeys(soid, to_rm_bl); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + } + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPRMKEYRANGE: + tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + std::string key_begin, key_end; + try { + decode(key_begin, bp); + decode(key_end, bp); + } catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + t->omap_rmkeyrange(soid, key_begin, key_end); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + } + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_COPY_GET: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(), + soid.snap.val); + if (op_finisher == nullptr) { + result = do_copy_get(ctx, bp, osd_op, ctx->obc); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: + ++ctx->num_write; + result = 0; + { + object_t src_name; + object_locator_t src_oloc; + uint32_t truncate_seq = 0; + uint64_t truncate_size = 0; + bool have_truncate = false; + snapid_t src_snapid = (uint64_t)op.copy_from.snapid; + version_t src_version = op.copy_from.src_version; + + if ((op.op == CEPH_OSD_OP_COPY_FROM2) && + (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) { + dout(20) << "invalid copy-from2 flags 0x" + << std::hex << (int)op.copy_from.flags << std::dec << dendl; + result = -EINVAL; + break; + } + try { + decode(src_name, bp); + decode(src_oloc, bp); + // check if client sent us truncate_seq and truncate_size + if ((op.op == CEPH_OSD_OP_COPY_FROM2) && + (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) { + decode(truncate_seq, bp); + decode(truncate_size, bp); + have_truncate = true; + } + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, + do_osd_op_pre_copy_from, + soid.oid.name.c_str(), + soid.snap.val, + "???", + 0, + "???", + "???", + 0, + src_snapid, + src_version); + goto fail; + } + tracepoint(osd, + do_osd_op_pre_copy_from, + soid.oid.name.c_str(), + soid.snap.val, + src_name.name.c_str(), + src_oloc.pool, + src_oloc.key.c_str(), + src_oloc.nspace.c_str(), + src_oloc.hash, + src_snapid, + src_version); + if (op_finisher == nullptr) { + // start + pg_t raw_pg; + get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg); + hobject_t src(src_name, src_oloc.key, src_snapid, + raw_pg.ps(), raw_pg.pool(), + src_oloc.nspace); + if (src == soid) { + dout(20) << " copy from self is invalid" << dendl; + result = -EINVAL; + break; + } + CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op); + if (have_truncate) + cb->set_truncate(truncate_seq, truncate_size); + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new CopyFromFinisher(cb)); + start_copy(cb, ctx->obc, src, src_oloc, src_version, + op.copy_from.flags, + false, + op.copy_from.src_fadvise_flags, + op.flags); + result = -EINPROGRESS; + } else { + // finish + result = op_finisher->execute(); + ceph_assert(result == 0); + + // COPY_FROM cannot be executed multiple times -- it must restart 
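+        // Two-phase pattern used by several ops in this switch: the first
+        // pass registers an op_finisher and returns -EINPROGRESS; once the
+        // asynchronous copy completes the op is re-run, the finisher's
+        // execute() applies the result, and the finisher is then dropped
+        // so a restarted op begins from scratch.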
+ ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + break; + + default: + tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op)); + dout(1) << "unrecognized osd op " << op.op + << " " << ceph_osd_op_name(op.op) + << dendl; + result = -EOPNOTSUPP; + } + + fail: + osd_op.rval = result; + tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result); + if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) && + result != -EAGAIN && result != -EINPROGRESS) + result = 0; + + if (result < 0) + break; + } + if (result < 0) { + dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl; + } + return result; +} + +int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals) +{ + if (ctx->new_obs.oi.size == 0) { + dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl; + return -ENODATA; + } + vector nops(1); + OSDOp &newop = nops[0]; + newop.op.op = CEPH_OSD_OP_TMAPGET; + do_osd_ops(ctx, nops); + try { + bufferlist::const_iterator i = newop.outdata.begin(); + decode(*header, i); + (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining()); + } catch (...) { + dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid + << dendl; + return -EINVAL; + } + dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid + << dendl; + return 0; +} + +int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid, + const SnapSet& ss) +{ + // verify that all clones have been evicted + dout(20) << __func__ << " verifying clones are absent " + << ss << dendl; + for (vector::const_iterator p = ss.clones.begin(); + p != ss.clones.end(); + ++p) { + hobject_t clone_oid = soid; + clone_oid.snap = *p; + if (is_missing_object(clone_oid)) + return -EBUSY; + ObjectContextRef clone_obc = get_object_context(clone_oid, false); + if (clone_obc && clone_obc->obs.exists) { + dout(10) << __func__ << " cannot evict head before clone " + << clone_oid << dendl; + return -EBUSY; + } + if (copy_ops.count(clone_oid)) { + dout(10) << __func__ << " cannot evict head, pending promote on clone " + << clone_oid << dendl; + return -EBUSY; + } + } + return 0; +} + +inline int PrimaryLogPG::_delete_oid( + OpContext *ctx, + bool no_whiteout, // no whiteouts, no matter what. + bool try_no_whiteout) // try not to whiteout +{ + SnapSet& snapset = ctx->new_snapset; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + PGTransaction* t = ctx->op_t.get(); + + // cache: cache: set whiteout on delete? + bool whiteout = false; + if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE + && !no_whiteout + && !try_no_whiteout) { + whiteout = true; + } + + // in luminous or later, we can't delete the head if there are + // clones. we trust the caller passing no_whiteout has already + // verified they don't exist. 
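+  // A "whiteout" keeps a stub object (FLAG_WHITEOUT, size 0) in place of a
+  // real delete: in a cache tier it records that the object was deleted so
+  // reads don't silently fall through to the base pool, and when clones
+  // exist it keeps the head around as an anchor for the snapset.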
+ if (!snapset.clones.empty() || + (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) { + if (no_whiteout) { + dout(20) << __func__ << " has or will have clones but no_whiteout=1" + << dendl; + } else { + dout(20) << __func__ << " has or will have clones; will whiteout" + << dendl; + whiteout = true; + } + } + dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout + << " no_whiteout=" << (int)no_whiteout + << " try_no_whiteout=" << (int)try_no_whiteout + << dendl; + if (!obs.exists || (obs.oi.is_whiteout() && whiteout)) + return -ENOENT; + + t->remove(soid); + + if (oi.size > 0) { + interval_set ch; + ch.insert(0, oi.size); + ctx->modified_ranges.union_of(ch); + ctx->clean_regions.mark_data_region_dirty(0, oi.size); + } + + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + if (soid.is_snap()) { + ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap)); + ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap); + } else { + ctx->delta_stats.num_bytes -= oi.size; + } + oi.size = 0; + oi.new_object(); + + // disconnect all watchers + for (map, watch_info_t>::iterator p = + oi.watchers.begin(); + p != oi.watchers.end(); + ++p) { + dout(20) << __func__ << " will disconnect watcher " << p->first << dendl; + ctx->watch_disconnects.push_back( + watch_disconnect_t(p->first.first, p->first.second, true)); + } + oi.watchers.clear(); + + if (oi.has_manifest()) { + ctx->delta_stats.num_objects_manifest--; + dec_all_refcount_manifest(oi, ctx); + } + + if (whiteout) { + dout(20) << __func__ << " setting whiteout on " << soid << dendl; + oi.set_flag(object_info_t::FLAG_WHITEOUT); + ctx->delta_stats.num_whiteouts++; + t->create(soid); + osd->logger->inc(l_osd_tier_whiteout); + return 0; + } + + // delete the head + ctx->delta_stats.num_objects--; + if (soid.is_snap()) + ctx->delta_stats.num_object_clones--; + if (oi.is_whiteout()) { + dout(20) << __func__ << " deleting whiteout on " << soid << dendl; + ctx->delta_stats.num_whiteouts--; + oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + if (oi.is_cache_pinned()) { + ctx->delta_stats.num_objects_pinned--; + } + obs.exists = false; + return 0; +} + +int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) +{ + SnapSet& snapset = ctx->new_snapset; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + PGTransaction* t = ctx->op_t.get(); + snapid_t snapid = (uint64_t)op.snap.snapid; + hobject_t missing_oid; + + dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl; + + ObjectContextRef rollback_to; + + int ret = find_object_context( + hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(), + soid.get_namespace()), + &rollback_to, false, false, &missing_oid); + if (ret == -EAGAIN) { + /* clone must be missing */ + ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid)); + dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone " + << missing_oid << " (requested snapid: ) " << snapid << dendl; + block_write_on_degraded_snap(missing_oid, ctx->op); + return ret; + } + { + ObjectContextRef promote_obc; + cache_result_t tier_mode_result; + if (obs.exists && obs.oi.has_manifest()) { + tier_mode_result = + maybe_handle_manifest_detail( + ctx->op, + true, + rollback_to); + } else { + tier_mode_result = + maybe_handle_cache_detail( + ctx->op, + true, + rollback_to, + ret, + missing_oid, + true, + false, + 
&promote_obc); + } + switch (tier_mode_result) { + case cache_result_t::NOOP: + break; + case cache_result_t::BLOCKED_PROMOTE: + ceph_assert(promote_obc); + block_write_on_snap_rollback(soid, promote_obc, ctx->op); + return -EAGAIN; + case cache_result_t::BLOCKED_FULL: + block_write_on_full_cache(soid, ctx->op); + return -EAGAIN; + case cache_result_t::REPLIED_WITH_EAGAIN: + ceph_abort_msg("this can't happen, no rollback on replica"); + default: + ceph_abort_msg("must promote was set, other values are not valid"); + return -EAGAIN; + } + } + + if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) { + // there's no snapshot here, or there's no object. + // if there's no snapshot, we delete the object; otherwise, do nothing. + dout(20) << "_rollback_to deleting head on " << soid.oid + << " because got ENOENT|whiteout on find_object_context" << dendl; + if (ctx->obc->obs.oi.watchers.size()) { + // Cannot delete an object with watchers + ret = -EBUSY; + } else { + _delete_oid(ctx, false, false); + ret = 0; + } + } else if (ret) { + // ummm....huh? It *can't* return anything else at time of writing. + ceph_abort_msg("unexpected error code in _rollback_to"); + } else { //we got our context, let's use it to do the rollback! + hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid; + if (is_degraded_or_backfilling_object(rollback_to_sobject) || + is_degraded_on_async_recovery_target(rollback_to_sobject)) { + dout(20) << "_rollback_to attempted to roll back to a degraded object " + << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl; + block_write_on_degraded_snap(rollback_to_sobject, ctx->op); + ret = -EAGAIN; + } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) { + // rolling back to the head; we just need to clone it. 
+ ctx->modify = true; + } else { + /* 1) Delete current head + * 2) Clone correct snapshot into head + * 3) Calculate clone_overlaps by following overlaps + * forward from rollback snapshot */ + dout(10) << "_rollback_to deleting " << soid.oid + << " and rolling back to old snap" << dendl; + + if (obs.exists) { + t->remove(soid); + } + t->clone(soid, rollback_to_sobject); + t->add_obc(rollback_to); + + map >::iterator iter = + snapset.clone_overlap.lower_bound(snapid); + ceph_assert(iter != snapset.clone_overlap.end()); + interval_set overlaps = iter->second; + for ( ; + iter != snapset.clone_overlap.end(); + ++iter) + overlaps.intersection_of(iter->second); + + if (obs.oi.size > 0) { + interval_set modified; + modified.insert(0, obs.oi.size); + overlaps.intersection_of(modified); + modified.subtract(overlaps); + ctx->modified_ranges.union_of(modified); + } + + // Adjust the cached objectcontext + maybe_create_new_object(ctx, true); + ctx->delta_stats.num_bytes -= obs.oi.size; + ctx->delta_stats.num_bytes += rollback_to->obs.oi.size; + ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size)); + ctx->clean_regions.mark_omap_dirty(); + obs.oi.size = rollback_to->obs.oi.size; + if (rollback_to->obs.oi.is_data_digest()) + obs.oi.set_data_digest(rollback_to->obs.oi.data_digest); + else + obs.oi.clear_data_digest(); + if (rollback_to->obs.oi.is_omap_digest()) + obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest); + else + obs.oi.clear_omap_digest(); + + if (rollback_to->obs.oi.is_omap()) { + dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl; + obs.oi.set_flag(object_info_t::FLAG_OMAP); + } else { + dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + } + } + return ret; +} + +void PrimaryLogPG::_make_clone( + OpContext *ctx, + PGTransaction* t, + ObjectContextRef obc, + const hobject_t& head, const hobject_t& coid, + object_info_t *poi) +{ + bufferlist bv; + encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + t->clone(coid, head); + setattr_maybe_cache(obc, t, OI_ATTR, bv); + rmattr_maybe_cache(obc, t, SS_ATTR); +} + +void PrimaryLogPG::make_writeable(OpContext *ctx) +{ + const hobject_t& soid = ctx->obs->oi.soid; + SnapContext& snapc = ctx->snapc; + + // clone? 
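+ // only head objects reach this path; if the op's SnapContext carries snaps
+ // newer than the SnapSet's seq, the current head is cloned (copy-on-write)
+ // below before the write is applied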
+ ceph_assert(soid.snap == CEPH_NOSNAP); + dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset + << " snapc=" << snapc << dendl; + + bool was_dirty = ctx->obc->obs.oi.is_dirty(); + if (ctx->new_obs.exists) { + // we will mark the object dirty + if (ctx->undirty && was_dirty) { + dout(20) << " clearing DIRTY flag" << dendl; + ceph_assert(ctx->new_obs.oi.is_dirty()); + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + osd->logger->inc(l_osd_tier_clean); + } else if (!was_dirty && !ctx->undirty) { + dout(20) << " setting DIRTY flag" << dendl; + ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY); + ++ctx->delta_stats.num_objects_dirty; + osd->logger->inc(l_osd_tier_dirty); + } + } else { + if (was_dirty) { + dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + } + } + + if ((ctx->new_obs.exists && + ctx->new_obs.oi.is_omap()) && + (!ctx->obc->obs.exists || + !ctx->obc->obs.oi.is_omap())) { + ++ctx->delta_stats.num_objects_omap; + } + if ((!ctx->new_obs.exists || + !ctx->new_obs.oi.is_omap()) && + (ctx->obc->obs.exists && + ctx->obc->obs.oi.is_omap())) { + --ctx->delta_stats.num_objects_omap; + } + + if (ctx->new_snapset.seq > snapc.seq) { + dout(10) << " op snapset is old" << dendl; + } + + if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed) + snapc.snaps.size() && // there are snaps + !ctx->cache_operation && + snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old + // clone + hobject_t coid = soid; + coid.snap = snapc.seq; + + unsigned l; + for (l = 1; + l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; + l++) ; + + vector snaps(l); + for (unsigned i=0; iclone_obc = object_contexts.lookup_or_create(static_snap_oi.soid); + ctx->clone_obc->destructor_callback = + new C_PG_ObjectContext(this, ctx->clone_obc.get()); + ctx->clone_obc->obs.oi = static_snap_oi; + ctx->clone_obc->obs.exists = true; + ctx->clone_obc->ssc = ctx->obc->ssc; + ctx->clone_obc->ssc->ref++; + if (pool.info.is_erasure()) + ctx->clone_obc->attr_cache = ctx->obc->attr_cache; + snap_oi = &ctx->clone_obc->obs.oi; + if (ctx->obc->obs.oi.has_manifest()) { + if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && + ctx->obc->obs.oi.manifest.is_redirect()) { + snap_oi->set_flag(object_info_t::FLAG_MANIFEST); + snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT; + snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target; + } else if (ctx->obc->obs.oi.manifest.is_chunked()) { + snap_oi->set_flag(object_info_t::FLAG_MANIFEST); + snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED; + snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map; + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } + bool got = ctx->lock_manager.get_write_greedy( + coid, + ctx->clone_obc, + ctx->op); + ceph_assert(got); + dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl; + } else { + snap_oi = &static_snap_oi; + } + snap_oi->version = ctx->at_version; + snap_oi->prior_version = ctx->obs->oi.version; + snap_oi->copy_user_bits(ctx->obs->oi); + + _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi); + + ctx->delta_stats.num_objects++; + if (snap_oi->is_dirty()) { + ctx->delta_stats.num_objects_dirty++; + osd->logger->inc(l_osd_tier_dirty); + } + if (snap_oi->is_omap()) + ctx->delta_stats.num_objects_omap++; 
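+ // mirror the new clone's cache-pin and manifest state in the pool stats as well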
+ if (snap_oi->is_cache_pinned()) + ctx->delta_stats.num_objects_pinned++; + if (snap_oi->has_manifest()) + ctx->delta_stats.num_objects_manifest++; + ctx->delta_stats.num_object_clones++; + ctx->new_snapset.clones.push_back(coid.snap); + ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size; + ctx->new_snapset.clone_snaps[coid.snap] = snaps; + + // clone_overlap should contain an entry for each clone + // (an empty interval_set if there is no overlap) + ctx->new_snapset.clone_overlap[coid.snap]; + if (ctx->obs->oi.size) + ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size); + + // log clone + dout(10) << " cloning v " << ctx->obs->oi.version + << " to " << coid << " v " << ctx->at_version + << " snaps=" << snaps + << " snapset=" << ctx->new_snapset << dendl; + ctx->log.push_back(pg_log_entry_t( + pg_log_entry_t::CLONE, coid, ctx->at_version, + ctx->obs->oi.version, + ctx->obs->oi.user_version, + osd_reqid_t(), ctx->new_obs.oi.mtime, 0)); + encode(snaps, ctx->log.back().snaps); + + ctx->at_version.version++; + } + + // update most recent clone_overlap and usage stats + if (ctx->new_snapset.clones.size() > 0) { + // the clone_overlap is difference of range between head and clones. + // we need to check whether the most recent clone exists, if it's + // been evicted, it's not included in the stats, but the clone_overlap + // is still exist in the snapset, so we should update the + // clone_overlap to make it sense. + hobject_t last_clone_oid = soid; + last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first; + interval_set &newest_overlap = + ctx->new_snapset.clone_overlap.rbegin()->second; + ctx->modified_ranges.intersection_of(newest_overlap); + if (is_present_clone(last_clone_oid)) { + // modified_ranges is still in use by the clone + ctx->delta_stats.num_bytes += ctx->modified_ranges.size(); + } + newest_overlap.subtract(ctx->modified_ranges); + } + + if (snapc.seq > ctx->new_snapset.seq) { + // update snapset with latest snap context + ctx->new_snapset.seq = snapc.seq; + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + ctx->new_snapset.snaps = snapc.snaps; + } else { + ctx->new_snapset.snaps.clear(); + } + } + dout(20) << "make_writeable " << soid + << " done, snapset=" << ctx->new_snapset << dendl; +} + + +void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi, + interval_set& modified, uint64_t offset, + uint64_t length, bool write_full) +{ + interval_set ch; + if (write_full) { + if (oi.size) + ch.insert(0, oi.size); + } else if (length) + ch.insert(offset, length); + modified.union_of(ch); + if (write_full || + (offset + length > oi.size && length)) { + uint64_t new_size = offset + length; + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += new_size; + oi.size = new_size; + } + + delta_stats.num_wr++; + delta_stats.num_wr_kb += shift_round_up(length, 10); +} + +void PrimaryLogPG::truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size) +{ + if (oi.size != truncate_size) { + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += truncate_size; + oi.size = truncate_size; + } +} + +void PrimaryLogPG::complete_disconnect_watches( + ObjectContextRef obc, + const list &to_disconnect) +{ + for (list::const_iterator i = + to_disconnect.begin(); + i != to_disconnect.end(); + ++i) { + pair watcher(i->cookie, i->name); + auto watchers_entry = obc->watchers.find(watcher); + if (watchers_entry != obc->watchers.end()) { + 
WatchRef watch = watchers_entry->second; + dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl; + obc->watchers.erase(watcher); + watch->remove(i->send_disconnect); + } else { + dout(10) << "do_osd_op_effects disconnect failed to find watcher " + << watcher << dendl; + } + } +} + +void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn) +{ + entity_name_t entity = ctx->reqid.name; + dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl; + + // disconnects first + complete_disconnect_watches(ctx->obc, ctx->watch_disconnects); + + ceph_assert(conn); + + auto session = conn->get_priv(); + if (!session) + return; + + for (list >::iterator i = ctx->watch_connects.begin(); + i != ctx->watch_connects.end(); + ++i) { + pair watcher(i->first.cookie, entity); + dout(15) << "do_osd_op_effects applying watch connect on session " + << session.get() << " watcher " << watcher << dendl; + WatchRef watch; + if (ctx->obc->watchers.count(watcher)) { + dout(15) << "do_osd_op_effects found existing watch watcher " << watcher + << dendl; + watch = ctx->obc->watchers[watcher]; + } else { + dout(15) << "do_osd_op_effects new watcher " << watcher + << dendl; + watch = Watch::makeWatchRef( + this, osd, ctx->obc, i->first.timeout_seconds, + i->first.cookie, entity, conn->get_peer_addr()); + ctx->obc->watchers.insert( + make_pair( + watcher, + watch)); + } + watch->connect(conn, i->second); + } + + for (list::iterator p = ctx->notifies.begin(); + p != ctx->notifies.end(); + ++p) { + dout(10) << "do_osd_op_effects, notify " << *p << dendl; + ConnectionRef conn(ctx->op->get_req()->get_connection()); + NotifyRef notif( + Notify::makeNotifyRef( + conn, + ctx->reqid.name.num(), + p->bl, + p->timeout, + p->cookie, + p->notify_id, + ctx->obc->obs.oi.user_version, + osd)); + for (map, WatchRef>::iterator i = + ctx->obc->watchers.begin(); + i != ctx->obc->watchers.end(); + ++i) { + dout(10) << "starting notify on watch " << i->first << dendl; + i->second->start_notify(notif); + } + notif->init(); + } + + for (list::iterator p = ctx->notify_acks.begin(); + p != ctx->notify_acks.end(); + ++p) { + if (p->watch_cookie) + dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl; + else + dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl; + for (map, WatchRef>::iterator i = + ctx->obc->watchers.begin(); + i != ctx->obc->watchers.end(); + ++i) { + if (i->first.second != entity) continue; + if (p->watch_cookie && + *(p->watch_cookie) != i->first.first) continue; + dout(10) << "acking notify on watch " << i->first << dendl; + i->second->notify_ack(p->notify_id, p->reply_bl); + } + } +} + +hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target) +{ + ostringstream ss; + ss << "temp_" << info.pgid << "_" << get_role() + << "_" << osd->monc->get_global_id() << "_" << (++temp_seq); + hobject_t hoid = target.make_temp_hobject(ss.str()); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +hobject_t PrimaryLogPG::get_temp_recovery_object( + const hobject_t& target, + eversion_t version) +{ + ostringstream ss; + ss << "temp_recovering_" << info.pgid // (note this includes the shardid) + << "_" << version + << "_" << info.history.same_interval_since + << "_" << target.snap; + // pgid + version + interval + snapid is unique, and short + hobject_t hoid = target.make_temp_hobject(ss.str()); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +int PrimaryLogPG::prepare_transaction(OpContext 
*ctx) +{ + ceph_assert(!ctx->ops->empty()); + + // valid snap context? + if (!ctx->snapc.is_valid()) { + dout(10) << " invalid snapc " << ctx->snapc << dendl; + return -EINVAL; + } + + // prepare the actual mutation + int result = do_osd_ops(ctx, *ctx->ops); + if (result < 0) { + if (ctx->op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + // need to save the error code in the pg log, to detect dup ops, + // but do nothing else + ctx->update_log_only = true; + } + return result; + } + + // read-op? write-op noop? done? + if (ctx->op_t->empty() && !ctx->modify) { + if (ctx->pending_async_reads.empty()) + unstable_stats.add(ctx->delta_stats); + if (ctx->op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + ctx->update_log_only = true; + } + return result; + } + + // check for full + if ((ctx->delta_stats.num_bytes > 0 || + ctx->delta_stats.num_objects > 0) && // FIXME: keys? + pool.info.has_flag(pg_pool_t::FLAG_FULL)) { + auto m = ctx->op->get_req(); + if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now + m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS" + << dendl; + } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. + dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl; + return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC; + } else { + // drop request + dout(20) << __func__ << " full, dropping request (bad client)" << dendl; + return -EAGAIN; + } + } + + const hobject_t& soid = ctx->obs->oi.soid; + // clone, if necessary + if (soid.snap == CEPH_NOSNAP) + make_writeable(ctx); + + finish_ctx(ctx, + ctx->new_obs.exists ? pg_log_entry_t::MODIFY : + pg_log_entry_t::DELETE, + result); + + return result; +} + +void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result) +{ + const hobject_t& soid = ctx->obs->oi.soid; + dout(20) << __func__ << " " << soid << " " << ctx + << " op " << pg_log_entry_t::get_op_name(log_op_type) + << dendl; + utime_t now = ceph_clock_now(); + +#ifdef HAVE_JAEGER + if (ctx->op->osd_parent_span) { + auto finish_ctx_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span); + } +#endif + // Drop the reference if deduped chunk is modified + if (ctx->new_obs.oi.is_dirty() && + (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) && + // If a clone is creating, ignore dropping the reference for manifest object + !ctx->delta_stats.num_object_clones && + ctx->new_obs.oi.size != 0 && // missing, redirect and delete + !ctx->cache_operation && + log_op_type != pg_log_entry_t::PROMOTE) { + dec_refcount_by_dirty(ctx); + } + + // finish and log the op. + if (ctx->user_modify) { + // update the user_version for any modify ops, except for the watch op + ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1; + /* In order for new clients and old clients to interoperate properly + * when exchanging versions, we need to lower bound the user_version + * (which our new clients pay proper attention to) + * by the at_version (which is all the old clients can ever see). 
*/ + if (ctx->at_version.version > ctx->user_at_version) + ctx->user_at_version = ctx->at_version.version; + ctx->new_obs.oi.user_version = ctx->user_at_version; + } + ctx->bytes_written = ctx->op_t->get_bytes_written(); + + if (ctx->new_obs.exists) { + ctx->new_obs.oi.version = ctx->at_version; + ctx->new_obs.oi.prior_version = ctx->obs->oi.version; + ctx->new_obs.oi.last_reqid = ctx->reqid; + if (ctx->mtime != utime_t()) { + ctx->new_obs.oi.mtime = ctx->mtime; + dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl; + ctx->new_obs.oi.local_mtime = now; + } else { + dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl; + } + + // object_info_t + map attrs; + bufferlist bv(sizeof(ctx->new_obs.oi)); + encode(ctx->new_obs.oi, bv, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR] = std::move(bv); + + // snapset + if (soid.snap == CEPH_NOSNAP) { + dout(10) << " final snapset " << ctx->new_snapset + << " in " << soid << dendl; + bufferlist bss; + encode(ctx->new_snapset, bss); + attrs[SS_ATTR] = std::move(bss); + } else { + dout(10) << " no snapset (this is a clone)" << dendl; + } + ctx->op_t->setattrs(soid, attrs); + } else { + // reset cached oi + ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid); + } + + // append to log + ctx->log.push_back( + pg_log_entry_t(log_op_type, soid, ctx->at_version, + ctx->obs->oi.version, + ctx->user_at_version, ctx->reqid, + ctx->mtime, + (ctx->op && ctx->op->allows_returnvec()) ? result : 0)); + if (ctx->op && ctx->op->allows_returnvec()) { + // also the per-op values + ctx->log.back().set_op_returns(*ctx->ops); + dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns + << dendl; + } + + ctx->log.back().clean_regions = ctx->clean_regions; + dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl; + + if (soid.snap < CEPH_NOSNAP) { + switch (log_op_type) { + case pg_log_entry_t::MODIFY: + case pg_log_entry_t::PROMOTE: + case pg_log_entry_t::CLEAN: + dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset + << dendl; + encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps); + break; + default: + break; + } + } + + if (!ctx->extra_reqids.empty()) { + dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " " + << ctx->extra_reqid_return_codes << dendl; + ctx->log.back().extra_reqids.swap(ctx->extra_reqids); + ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes); + } + + // apply new object state. 
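+ // update the cached ObjectContext to match what the transaction built above will persist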
+ ctx->obc->obs = ctx->new_obs; + + if (soid.is_head() && !ctx->obc->obs.exists) { + ctx->obc->ssc->exists = false; + ctx->obc->ssc->snapset = SnapSet(); + } else { + ctx->obc->ssc->exists = true; + ctx->obc->ssc->snapset = ctx->new_snapset; + } +} + +void PrimaryLogPG::apply_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) { + + recovery_state.apply_op_stats(soid, delta_stats); + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + const pg_info_t& pinfo = recovery_state.get_peer_info(bt); + if (soid > pinfo.last_backfill && soid <= last_backfill_started) { + pending_backfill_updates[soid].stats.add(delta_stats); + } + } + + m_scrubber->stats_of_handled_objects(delta_stats, soid); +} + +void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx) +{ + auto m = ctx->op->get_req(); + ceph_assert(ctx->async_reads_complete()); + + for (auto p = ctx->ops->begin(); + p != ctx->ops->end() && result >= 0; ++p) { + if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = p->rval; + break; + } + ctx->bytes_read += p->outdata.length(); + } + ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0); + + MOSDOpReply *reply = ctx->reply; + ctx->reply = nullptr; + + if (result >= 0) { + if (!ctx->ignore_log_op_stats) { + log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read); + + publish_stats_to_osd(); + } + + // on read, return the current object version + if (ctx->obs) { + reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version); + } else { + reply->set_reply_versions(eversion_t(), ctx->user_at_version); + } + } else if (result == -ENOENT) { + // on ENOENT, set a floor for what the next user version will be. + reply->set_enoent_reply_versions(info.last_update, info.last_user_version); + } + + reply->set_result(result); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + osd->send_message_osd_client(reply, m->get_connection()); + close_op_ctx(ctx); +} + +// ======================================================================== +// copyfrom + +struct C_Copyfrom : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive + C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::CopyOpRef& c) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), cop(c) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock l{*pg}; + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->process_copy_chunk(oid, tid, r); + cop.reset(); + } + } +}; + +struct C_CopyFrom_AsyncReadCb : public Context { + OSDOp *osd_op; + object_copy_data_t reply_obj; + uint64_t features; + size_t len; + C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) : + osd_op(osd_op), features(features), len(0) {} + void finish(int r) override { + osd_op->rval = r; + if (r < 0) { + return; + } + + ceph_assert(len > 0); + ceph_assert(len <= reply_obj.data.length()); + bufferlist bl; + bl.substr_of(reply_obj.data, 0, len); + reply_obj.data.swap(bl); + encode(reply_obj, osd_op->outdata, features); + } +}; + +struct C_CopyChunk : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive + uint64_t offset = 0; + C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::CopyOpRef& c) + : 
pg(p), oid(o), last_peering_reset(lpr), + tid(0), cop(c) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock l{*pg}; + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->process_copy_chunk_manifest(oid, tid, r, offset); + cop.reset(); + } + } +}; + +int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp, + OSDOp& osd_op, ObjectContextRef &obc) +{ + object_info_t& oi = obc->obs.oi; + hobject_t& soid = oi.soid; + int result = 0; + object_copy_cursor_t cursor; + uint64_t out_max; + try { + decode(cursor, bp); + decode(out_max, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + return result; + } + + const MOSDOp *op = reinterpret_cast(ctx->op->get_req()); + uint64_t features = op->get_features(); + + bool async_read_started = false; + object_copy_data_t _reply_obj; + C_CopyFrom_AsyncReadCb *cb = nullptr; + if (pool.info.is_erasure()) { + cb = new C_CopyFrom_AsyncReadCb(&osd_op, features); + } + object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj; + // size, mtime + reply_obj.size = oi.size; + reply_obj.mtime = oi.mtime; + ceph_assert(obc->ssc); + if (soid.snap < CEPH_NOSNAP) { + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn? + reply_obj.snaps = p->second; + } else { + reply_obj.snap_seq = obc->ssc->snapset.seq; + } + if (oi.is_data_digest()) { + reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST; + reply_obj.data_digest = oi.data_digest; + } + if (oi.is_omap_digest()) { + reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST; + reply_obj.omap_digest = oi.omap_digest; + } + reply_obj.truncate_seq = oi.truncate_seq; + reply_obj.truncate_size = oi.truncate_size; + + // attrs + map& out_attrs = reply_obj.attrs; + if (!cursor.attr_complete) { + result = getattrs_maybe_cache( + ctx->obc, + &out_attrs); + if (result < 0) { + if (cb) { + delete cb; + } + return result; + } + cursor.attr_complete = true; + dout(20) << " got attrs" << dendl; + } + + int64_t left = out_max - osd_op.outdata.length(); + + // data + bufferlist& bl = reply_obj.data; + if (left > 0 && !cursor.data_complete) { + if (cursor.data_offset < oi.size) { + uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left); + if (cb) { + async_read_started = true; + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags), + make_pair(&bl, cb))); + cb->len = max_read; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + result = -EINPROGRESS; + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + } else { + result = pgbackend->objects_read_sync( + oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl); + if (result < 0) + return result; + } + left -= max_read; + cursor.data_offset += max_read; + } + if (cursor.data_offset == oi.size) { + cursor.data_complete = true; + dout(20) << " got data" << dendl; + } + ceph_assert(cursor.data_offset <= oi.size); + } + + // omap + uint32_t omap_keys = 0; + if (!pool.info.supports_omap() || !oi.is_omap()) { + cursor.omap_complete = true; + } else { + if (left > 0 && !cursor.omap_complete) { + ceph_assert(cursor.data_complete); + if (cursor.omap_offset.empty()) { + osd->store->omap_get_header(ch, ghobject_t(oi.soid), + &reply_obj.omap_header); + } + bufferlist omap_data; + ObjectMap::ObjectMapIterator iter = + osd->store->get_omap_iterator(ch, ghobject_t(oi.soid)); + ceph_assert(iter); + 
iter->upper_bound(cursor.omap_offset); + for (; iter->valid(); iter->next()) { + ++omap_keys; + encode(iter->key(), omap_data); + encode(iter->value(), omap_data); + left -= iter->key().length() + 4 + iter->value().length() + 4; + if (left <= 0) + break; + } + if (omap_keys) { + encode(omap_keys, reply_obj.omap_data); + reply_obj.omap_data.claim_append(omap_data); + } + if (iter->valid()) { + cursor.omap_offset = iter->key(); + } else { + cursor.omap_complete = true; + dout(20) << " got omap" << dendl; + } + } + } + + if (cursor.is_complete()) { + // include reqids only in the final step. this is a bit fragile + // but it works... + recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, + &reply_obj.reqids, + &reply_obj.reqid_return_codes); + dout(20) << " got reqids" << dendl; + } + + dout(20) << " cursor.is_complete=" << cursor.is_complete() + << " " << out_attrs.size() << " attrs" + << " " << bl.length() << " bytes" + << " " << reply_obj.omap_header.length() << " omap header bytes" + << " " << reply_obj.omap_data.length() << " omap data bytes in " + << omap_keys << " keys" + << " " << reply_obj.reqids.size() << " reqids" + << dendl; + reply_obj.cursor = cursor; + if (!async_read_started) { + encode(reply_obj, osd_op.outdata, features); + } + if (cb && !async_read_started) { + delete cb; + } + + if (result > 0) { + result = 0; + } + return result; +} + +void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid, + OSDOp& osd_op) +{ + const MOSDOp *m = static_cast(op->get_req()); + uint64_t features = m->get_features(); + object_copy_data_t reply_obj; + + recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids, + &reply_obj.reqid_return_codes); + dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl; + encode(reply_obj, osd_op.outdata, features); + osd_op.rval = -ENOENT; + MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + reply->set_result(-ENOENT); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + osd->send_message_osd_client(reply, m->get_connection()); +} + +void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc, + hobject_t src, object_locator_t oloc, + version_t version, unsigned flags, + bool mirror_snapset, + unsigned src_obj_fadvise_flags, + unsigned dest_obj_fadvise_flags) +{ + const hobject_t& dest = obc->obs.oi.soid; + dout(10) << __func__ << " " << dest + << " from " << src << " " << oloc << " v" << version + << " flags " << flags + << (mirror_snapset ? " mirror_snapset" : "") + << dendl; + + ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP); + + // cancel a previous in-progress copy? + if (copy_ops.count(dest)) { + // FIXME: if the src etc match, we could avoid restarting from the + // beginning. 
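+ // for now, cancel the in-flight copy and restart below from an initial cursor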
+ CopyOpRef cop = copy_ops[dest]; + vector tids; + cancel_copy(cop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + } + + CopyOpRef cop(std::make_shared(cb, obc, src, oloc, version, flags, + mirror_snapset, src_obj_fadvise_flags, + dest_obj_fadvise_flags)); + copy_ops[dest] = cop; + obc->start_block(); + + if (!obc->obs.oi.has_manifest()) { + _copy_some(obc, cop); + } else { + if (obc->obs.oi.manifest.is_redirect()) { + _copy_some(obc, cop); + } else if (obc->obs.oi.manifest.is_chunked()) { + auto p = obc->obs.oi.manifest.chunk_map.begin(); + _copy_some_manifest(obc, cop, p->first); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } +} + +void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop) +{ + dout(10) << __func__ << " " << *obc << " " << cop << dendl; + + unsigned flags = 0; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH) + flags |= CEPH_OSD_FLAG_FLUSH; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE) + flags |= CEPH_OSD_FLAG_IGNORE_CACHE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY) + flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE) + flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED) + flags |= CEPH_OSD_FLAG_RWORDERED; + + C_GatherBuilder gather(cct); + + if (cop->cursor.is_initial() && cop->mirror_snapset) { + // list snaps too. + ceph_assert(cop->src.snap == CEPH_NOSNAP); + ObjectOperation op; + op.list_snaps(&cop->results.snapset, NULL); + ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, + CEPH_SNAPDIR, NULL, + flags, gather.new_sub(), NULL); + cop->objecter_tid2 = tid; + } + + ObjectOperation op; + if (cop->results.user_version) { + op.assert_version(cop->results.user_version); + } else { + // we should learn the version after the first chunk, if we didn't know + // it already! + ceph_assert(cop->cursor.is_initial()); + } + op.copy_get(&cop->cursor, get_copy_chunk_size(), + &cop->results.object_size, &cop->results.mtime, + &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data, + &cop->results.snaps, &cop->results.snap_seq, + &cop->results.flags, + &cop->results.source_data_digest, + &cop->results.source_omap_digest, + &cop->results.reqids, + &cop->results.reqid_return_codes, + &cop->results.truncate_seq, + &cop->results.truncate_size, + &cop->rval); + op.set_last_op_flags(cop->src_obj_fadvise_flags); + + C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid, + get_last_peering_reset(), cop); + gather.set_finisher(new C_OnFinisher(fin, + osd->get_objecter_finisher(get_pg_shard()))); + + ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, + cop->src.snap, NULL, + flags, + gather.new_sub(), + // discover the object version if we don't know it yet + cop->results.user_version ? 
NULL : &cop->results.user_version); + fin->tid = tid; + cop->objecter_tid = tid; + gather.activate(); +} + +void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset) +{ + dout(10) << __func__ << " " << *obc << " " << cop << dendl; + + unsigned flags = 0; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH) + flags |= CEPH_OSD_FLAG_FLUSH; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE) + flags |= CEPH_OSD_FLAG_IGNORE_CACHE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY) + flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE) + flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED) + flags |= CEPH_OSD_FLAG_RWORDERED; + + int num_chunks = 0; + uint64_t last_offset = 0, chunks_size = 0; + object_manifest_t *manifest = &obc->obs.oi.manifest; + map::iterator iter = manifest->chunk_map.find(start_offset); + for (;iter != manifest->chunk_map.end(); ++iter) { + num_chunks++; + chunks_size += iter->second.length; + last_offset = iter->first; + if (get_copy_chunk_size() < chunks_size) { + break; + } + } + + cop->num_chunk = num_chunks; + cop->start_offset = start_offset; + cop->last_offset = last_offset; + dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks + << " start_offset: " << start_offset << " chunks_size: " << chunks_size + << " last_offset: " << last_offset << dendl; + + iter = manifest->chunk_map.find(start_offset); + for (;iter != manifest->chunk_map.end(); ++iter) { + uint64_t obj_offset = iter->first; + uint64_t length = manifest->chunk_map[iter->first].length; + hobject_t soid = manifest->chunk_map[iter->first].oid; + object_locator_t oloc(soid); + CopyCallback * cb = NULL; + CopyOpRef sub_cop(std::make_shared(cb, ObjectContextRef(), cop->src, oloc, + cop->results.user_version, cop->flags, cop->mirror_snapset, + cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags)); + sub_cop->cursor.data_offset = obj_offset; + cop->chunk_cops[obj_offset] = sub_cop; + + int s = sub_cop->chunk_ops.size(); + sub_cop->chunk_ops.resize(s+1); + sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ; + sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset; + sub_cop->chunk_ops[s].op.extent.length = length; + + ObjectOperation op; + op.dup(sub_cop->chunk_ops); + + if (cop->results.user_version) { + op.assert_version(cop->results.user_version); + } else { + // we should learn the version after the first chunk, if we didn't know + // it already! + ceph_assert(cop->cursor.is_initial()); + } + op.set_last_op_flags(cop->src_obj_fadvise_flags); + + C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid, + get_last_peering_reset(), cop); + fin->offset = obj_offset; + + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, op, + sub_cop->src.snap, NULL, + flags, + new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + // discover the object version if we don't know it yet + sub_cop->results.user_version ? 
NULL : &sub_cop->results.user_version); + fin->tid = tid; + sub_cop->objecter_tid = tid; + + dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: " + << manifest->chunk_map[iter->first].offset + << " length: " << length << " pool id: " << oloc.pool + << " tid: " << tid << dendl; + + if (last_offset < iter->first) { + break; + } + } +} + +void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map::iterator p = copy_ops.find(oid); + if (p == copy_ops.end()) { + dout(10) << __func__ << " no copy_op found" << dendl; + return; + } + CopyOpRef cop = p->second; + if (tid != cop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != cop " << cop + << " tid " << cop->objecter_tid << dendl; + return; + } + + if (cop->omap_data.length() || cop->omap_header.length()) + cop->results.has_omap = true; + + if (r >= 0 && !pool.info.supports_omap() && + (cop->omap_data.length() || cop->omap_header.length())) { + r = -EOPNOTSUPP; + } + cop->objecter_tid = 0; + cop->objecter_tid2 = 0; // assume this ordered before us (if it happened) + ObjectContextRef& cobc = cop->obc; + + if (r < 0) + goto out; + + ceph_assert(cop->rval >= 0); + + if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) { + // verify snap hasn't been deleted + vector::iterator p = cop->results.snaps.begin(); + while (p != cop->results.snaps.end()) { + // make best effort to sanitize snaps/clones. + if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) { + dout(10) << __func__ << " clone snap " << *p << " has been deleted" + << dendl; + for (vector::iterator q = p + 1; + q != cop->results.snaps.end(); + ++q) + *(q - 1) = *q; + cop->results.snaps.resize(cop->results.snaps.size() - 1); + } else { + ++p; + } + } + if (cop->results.snaps.empty()) { + dout(10) << __func__ << " no more snaps for " << oid << dendl; + r = -ENOENT; + goto out; + } + } + + ceph_assert(cop->rval >= 0); + + if (!cop->temp_cursor.data_complete) { + cop->results.data_digest = cop->data.crc32c(cop->results.data_digest); + } + if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) { + if (cop->omap_header.length()) { + cop->results.omap_digest = + cop->omap_header.crc32c(cop->results.omap_digest); + } + if (cop->omap_data.length()) { + bufferlist keys; + keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4); + cop->results.omap_digest = keys.crc32c(cop->results.omap_digest); + } + } + + if (!cop->temp_cursor.attr_complete) { + for (map::iterator p = cop->attrs.begin(); + p != cop->attrs.end(); + ++p) { + cop->results.attrs[string("_") + p->first] = p->second; + } + cop->attrs.clear(); + } + + if (!cop->cursor.is_complete()) { + // write out what we have so far + if (cop->temp_cursor.is_initial()) { + ceph_assert(!cop->results.started_temp_obj); + cop->results.started_temp_obj = true; + cop->results.temp_oid = generate_temp_object(oid); + dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl; + } + ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true); + OpContextUPtr ctx = simple_opc_create(tempobc); + if (cop->temp_cursor.is_initial()) { + ctx->new_temp_oid = cop->results.temp_oid; + } + _write_copy_chunk(cop, ctx->op_t.get()); + simple_opc_submit(std::move(ctx)); + dout(10) << __func__ << " fetching more" << dendl; + _copy_some(cobc, cop); + return; + } + + // verify digests? 
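+ // compare the digests computed over the received data/omap with the
+ // source-reported values; any mismatch fails the copy with -EIO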
+ if (cop->results.is_data_digest() || cop->results.is_omap_digest()) { + dout(20) << __func__ << std::hex + << " got digest: rx data 0x" << cop->results.data_digest + << " omap 0x" << cop->results.omap_digest + << ", source: data 0x" << cop->results.source_data_digest + << " omap 0x" << cop->results.source_omap_digest + << std::dec + << " flags " << cop->results.flags + << dendl; + } + if (cop->results.is_data_digest() && + cop->results.data_digest != cop->results.source_data_digest) { + derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest + << " != source 0x" << cop->results.source_data_digest << std::dec + << dendl; + osd->clog->error() << info.pgid << " copy from " << cop->src + << " to " << cop->obc->obs.oi.soid << std::hex + << " data digest 0x" << cop->results.data_digest + << " != source 0x" << cop->results.source_data_digest + << std::dec; + r = -EIO; + goto out; + } + if (cop->results.is_omap_digest() && + cop->results.omap_digest != cop->results.source_omap_digest) { + derr << __func__ << std::hex + << " omap digest 0x" << cop->results.omap_digest + << " != source 0x" << cop->results.source_omap_digest + << std::dec << dendl; + osd->clog->error() << info.pgid << " copy from " << cop->src + << " to " << cop->obc->obs.oi.soid << std::hex + << " omap digest 0x" << cop->results.omap_digest + << " != source 0x" << cop->results.source_omap_digest + << std::dec; + r = -EIO; + goto out; + } + if (cct->_conf->osd_debug_inject_copyfrom_error) { + derr << __func__ << " injecting copyfrom failure" << dendl; + r = -EIO; + goto out; + } + + cop->results.fill_in_final_tx = std::function( + [this, &cop /* avoid ref cycle */](PGTransaction *t) { + ObjectState& obs = cop->obc->obs; + if (cop->temp_cursor.is_initial()) { + dout(20) << "fill_in_final_tx: writing " + << "directly to final object" << dendl; + // write directly to final object + cop->results.temp_oid = obs.oi.soid; + _write_copy_chunk(cop, t); + } else { + // finish writing to temp object, then move into place + dout(20) << "fill_in_final_tx: writing to temp object" << dendl; + if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) { + /* In redirect manifest case, the object exists in the upper tier. 
+ * So, to avoid a conflict when rename() is called, remove existing + * object first + */ + t->remove(obs.oi.soid); + } + _write_copy_chunk(cop, t); + t->rename(obs.oi.soid, cop->results.temp_oid); + } + t->setattrs(obs.oi.soid, cop->results.attrs); + }); + + dout(20) << __func__ << " success; committing" << dendl; + + out: + dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl; + CopyCallbackResults results(r, &cop->results); + cop->cb->complete(results); + + copy_ops.erase(cobc->obs.oi.soid); + cobc->stop_block(); + + if (r < 0 && cop->results.started_temp_obj) { + dout(10) << __func__ << " deleting partial temp object " + << cop->results.temp_oid << dendl; + ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true); + OpContextUPtr ctx = simple_opc_create(tempobc); + ctx->op_t->remove(cop->results.temp_oid); + ctx->discard_temp_oid = cop->results.temp_oid; + simple_opc_submit(std::move(ctx)); + } + + // cancel and requeue proxy ops on this object + if (!r) { + cancel_and_requeue_proxy_ops(cobc->obs.oi.soid); + } + + kick_object_context_blocked(cobc); +} + +void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map::iterator p = copy_ops.find(oid); + if (p == copy_ops.end()) { + dout(10) << __func__ << " no copy_op found" << dendl; + return; + } + CopyOpRef obj_cop = p->second; + CopyOpRef chunk_cop = obj_cop->chunk_cops[offset]; + + if (tid != chunk_cop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop + << " tid " << chunk_cop->objecter_tid << dendl; + return; + } + + if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) { + r = -EOPNOTSUPP; + } + + chunk_cop->objecter_tid = 0; + chunk_cop->objecter_tid2 = 0; // assume this ordered before us (if it happened) + ObjectContextRef& cobc = obj_cop->obc; + OSDOp &chunk_data = chunk_cop->chunk_ops[0]; + + if (r < 0) { + obj_cop->failed = true; + goto out; + } + + if (obj_cop->failed) { + return; + } + if (!chunk_data.outdata.length()) { + r = -EIO; + obj_cop->failed = true; + goto out; + } + + obj_cop->num_chunk--; + + /* check all of the copyop are completed */ + if (obj_cop->num_chunk) { + dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl; + return; + } + + { + OpContextUPtr ctx = simple_opc_create(obj_cop->obc); + if (!ctx->lock_manager.take_write_lock( + obj_cop->obc->obs.oi.soid, + obj_cop->obc)) { + // recovery op can take read lock. 
+ // so need to wait for recovery completion + r = -EAGAIN; + obj_cop->failed = true; + close_op_ctx(ctx.release()); + goto out; + } + dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl; + + PGTransaction *t = ctx->op_t.get(); + ObjectState& obs = ctx->new_obs; + for (auto p : obj_cop->chunk_cops) { + OSDOp &sub_chunk = p.second->chunk_ops[0]; + t->write(cobc->obs.oi.soid, + p.second->cursor.data_offset, + sub_chunk.outdata.length(), + sub_chunk.outdata, + p.second->dest_obj_fadvise_flags); + dout(20) << __func__ << " offset: " << p.second->cursor.data_offset + << " length: " << sub_chunk.outdata.length() << dendl; + write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges, + p.second->cursor.data_offset, sub_chunk.outdata.length()); + obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING); + ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length()); + sub_chunk.outdata.clear(); + } + obs.oi.clear_data_digest(); + ctx->at_version = get_next_version(); + finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE); + simple_opc_submit(std::move(ctx)); + + auto p = cobc->obs.oi.manifest.chunk_map.rbegin(); + /* check remaining work */ + if (p != cobc->obs.oi.manifest.chunk_map.rend()) { + if (obj_cop->last_offset >= p->first + p->second.length) { + for (auto &en : cobc->obs.oi.manifest.chunk_map) { + if (obj_cop->last_offset < en.first) { + _copy_some_manifest(cobc, obj_cop, en.first); + return; + } + } + } + } + } + + out: + dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl; + CopyCallbackResults results(r, &obj_cop->results); + obj_cop->cb->complete(results); + + copy_ops.erase(cobc->obs.oi.soid); + cobc->stop_block(); + + // cancel and requeue proxy ops on this object + if (!r) { + cancel_and_requeue_proxy_ops(cobc->obs.oi.soid); + } + + kick_object_context_blocked(cobc); +} + +void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) { + vector tids; + for (map::iterator it = proxyread_ops.begin(); + it != proxyread_ops.end();) { + if (it->second->soid == oid) { + cancel_proxy_read((it++)->second, &tids); + } else { + ++it; + } + } + for (map::iterator it = proxywrite_ops.begin(); + it != proxywrite_ops.end();) { + if (it->second->soid == oid) { + cancel_proxy_write((it++)->second, &tids); + } else { + ++it; + } + } + osd->objecter->op_cancel(tids, -ECANCELED); + kick_proxy_ops_blocked(oid); +} + +void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t) +{ + dout(20) << __func__ << " " << cop + << " " << cop->attrs.size() << " attrs" + << " " << cop->data.length() << " bytes" + << " " << cop->omap_header.length() << " omap header bytes" + << " " << cop->omap_data.length() << " omap data bytes" + << dendl; + if (!cop->temp_cursor.attr_complete) { + t->create(cop->results.temp_oid); + } + if (!cop->temp_cursor.data_complete) { + ceph_assert(cop->data.length() + cop->temp_cursor.data_offset == + cop->cursor.data_offset); + if (pool.info.required_alignment() && + !cop->cursor.data_complete) { + /** + * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset + * to pick it up on the next pass. 
+ */ + ceph_assert(cop->temp_cursor.data_offset % + pool.info.required_alignment() == 0); + if (cop->data.length() % pool.info.required_alignment() != 0) { + uint64_t to_trim = + cop->data.length() % pool.info.required_alignment(); + bufferlist bl; + bl.substr_of(cop->data, 0, cop->data.length() - to_trim); + cop->data.swap(bl); + cop->cursor.data_offset -= to_trim; + ceph_assert(cop->data.length() + cop->temp_cursor.data_offset == + cop->cursor.data_offset); + } + } + if (cop->data.length()) { + t->write( + cop->results.temp_oid, + cop->temp_cursor.data_offset, + cop->data.length(), + cop->data, + cop->dest_obj_fadvise_flags); + } + cop->data.clear(); + } + if (pool.info.supports_omap()) { + if (!cop->temp_cursor.omap_complete) { + if (cop->omap_header.length()) { + t->omap_setheader( + cop->results.temp_oid, + cop->omap_header); + cop->omap_header.clear(); + } + if (cop->omap_data.length()) { + map omap; + bufferlist::const_iterator p = cop->omap_data.begin(); + decode(omap, p); + t->omap_setkeys(cop->results.temp_oid, omap); + cop->omap_data.clear(); + } + } + } else { + ceph_assert(cop->omap_header.length() == 0); + ceph_assert(cop->omap_data.length() == 0); + } + cop->temp_cursor = cop->cursor; +} + +void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb) +{ + OpContext *ctx = cb->ctx; + dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl; + + ObjectState& obs = ctx->new_obs; + if (obs.exists) { + dout(20) << __func__ << ": exists, removing" << dendl; + ctx->op_t->remove(obs.oi.soid); + } else { + ctx->delta_stats.num_objects++; + obs.exists = true; + } + if (cb->is_temp_obj_used()) { + ctx->discard_temp_oid = cb->results->temp_oid; + } + cb->results->fill_in_final_tx(ctx->op_t.get()); + + // CopyFromCallback fills this in for us + obs.oi.user_version = ctx->user_at_version; + + if (cb->results->is_data_digest()) { + obs.oi.set_data_digest(cb->results->data_digest); + } else { + obs.oi.clear_data_digest(); + } + if (cb->results->is_omap_digest()) { + obs.oi.set_omap_digest(cb->results->omap_digest); + } else { + obs.oi.clear_omap_digest(); + } + + obs.oi.truncate_seq = cb->truncate_seq; + obs.oi.truncate_size = cb->truncate_size; + + obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime); + ctx->mtime = utime_t(); + + ctx->extra_reqids = cb->results->reqids; + ctx->extra_reqid_return_codes = cb->results->reqid_return_codes; + + // cache: clear whiteout? 
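+ // a successful copy-from populates the object, so drop any whiteout marker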
+ if (obs.oi.is_whiteout()) { + dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + --ctx->delta_stats.num_whiteouts; + } + + if (cb->results->has_omap) { + dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl; + obs.oi.set_flag(object_info_t::FLAG_OMAP); + ctx->clean_regions.mark_omap_dirty(); + } else { + dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + + interval_set ch; + if (obs.oi.size > 0) + ch.insert(0, obs.oi.size); + ctx->modified_ranges.union_of(ch); + ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size())); + + if (cb->get_data_size() != obs.oi.size) { + ctx->delta_stats.num_bytes -= obs.oi.size; + obs.oi.size = cb->get_data_size(); + ctx->delta_stats.num_bytes += obs.oi.size; + } + ctx->delta_stats.num_wr++; + ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10); + + osd->logger->inc(l_osd_copyfrom); +} + +void PrimaryLogPG::finish_promote(int r, CopyResults *results, + ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + dout(10) << __func__ << " " << soid << " r=" << r + << " uv" << results->user_version << dendl; + + if (r == -ECANCELED) { + return; + } + + if (r != -ENOENT && soid.is_snap()) { + if (results->snaps.empty()) { + // we must have read "snap" content from the head object in the + // base pool. use snap_seq to construct what snaps should be + // for this clone (what is was before we evicted the clean clone + // from this pool, and what it will be when we flush and the + // clone eventually happens in the base pool). we want to use + // snaps in (results->snap_seq,soid.snap] + SnapSet& snapset = obc->ssc->snapset; + for (auto p = snapset.clone_snaps.rbegin(); + p != snapset.clone_snaps.rend(); + ++p) { + for (auto snap : p->second) { + if (snap > soid.snap) { + continue; + } + if (snap <= results->snap_seq) { + break; + } + results->snaps.push_back(snap); + } + } + } + + dout(20) << __func__ << " snaps " << results->snaps << dendl; + filter_snapc(results->snaps); + + dout(20) << __func__ << " filtered snaps " << results->snaps << dendl; + if (results->snaps.empty()) { + dout(20) << __func__ + << " snaps are empty, clone is invalid," + << " setting r to ENOENT" << dendl; + r = -ENOENT; + } + } + + if (r < 0 && results->started_temp_obj) { + dout(10) << __func__ << " abort; will clean up partial work" << dendl; + ObjectContextRef tempobc = get_object_context(results->temp_oid, false); + ceph_assert(tempobc); + OpContextUPtr ctx = simple_opc_create(tempobc); + ctx->op_t->remove(results->temp_oid); + simple_opc_submit(std::move(ctx)); + results->started_temp_obj = false; + } + + if (r == -ENOENT && soid.is_snap()) { + dout(10) << __func__ + << ": enoent while trying to promote clone, " << soid + << " must have been trimmed, removing from snapset" + << dendl; + hobject_t head(soid.get_head()); + ObjectContextRef obc = get_object_context(head, false); + ceph_assert(obc); + + OpContextUPtr tctx = simple_opc_create(obc); + tctx->at_version = get_next_version(); + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + filter_snapc(tctx->new_snapset.snaps); + } else { + tctx->new_snapset.snaps.clear(); + } + vector new_clones; + map> new_clone_snaps; + for (vector::iterator i = tctx->new_snapset.clones.begin(); + i != tctx->new_snapset.clones.end(); + ++i) { + if (*i != soid.snap) { + new_clones.push_back(*i); + auto p = 
tctx->new_snapset.clone_snaps.find(*i); + if (p != tctx->new_snapset.clone_snaps.end()) { + new_clone_snaps[*i] = p->second; + } + } + } + tctx->new_snapset.clones.swap(new_clones); + tctx->new_snapset.clone_overlap.erase(soid.snap); + tctx->new_snapset.clone_size.erase(soid.snap); + tctx->new_snapset.clone_snaps.swap(new_clone_snaps); + + // take RWWRITE lock for duration of our local write. ignore starvation. + if (!tctx->lock_manager.take_write_lock( + head, + obc)) { + ceph_abort_msg("problem!"); + } + dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl; + + finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE); + + simple_opc_submit(std::move(tctx)); + return; + } + + bool whiteout = false; + if (r == -ENOENT) { + ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above + dout(10) << __func__ << " whiteout " << soid << dendl; + whiteout = true; + } + + if (r < 0 && !whiteout) { + derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl; + // pass error to everyone blocked on this object + // FIXME: this is pretty sloppy, but at this point we got + // something unexpected and don't have many other options. + map>::iterator blocked_iter = + waiting_for_blocked_object.find(soid); + if (blocked_iter != waiting_for_blocked_object.end()) { + while (!blocked_iter->second.empty()) { + osd->reply_op_error(blocked_iter->second.front(), r); + blocked_iter->second.pop_front(); + } + waiting_for_blocked_object.erase(blocked_iter); + } + return; + } + + osd->promote_finish(results->object_size); + + OpContextUPtr tctx = simple_opc_create(obc); + tctx->at_version = get_next_version(); + + if (!obc->obs.oi.has_manifest()) { + ++tctx->delta_stats.num_objects; + } + if (soid.snap < CEPH_NOSNAP) + ++tctx->delta_stats.num_object_clones; + tctx->new_obs.exists = true; + + tctx->extra_reqids = results->reqids; + tctx->extra_reqid_return_codes = results->reqid_return_codes; + + if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) { + tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE; + tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST); + tctx->new_obs.oi.manifest.redirect_target = hobject_t(); + tctx->delta_stats.num_objects_manifest--; + if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) { + dec_all_refcount_manifest(obc->obs.oi, tctx.get()); + } + } + + if (whiteout) { + // create a whiteout + tctx->op_t->create(soid); + tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT); + ++tctx->delta_stats.num_whiteouts; + dout(20) << __func__ << " creating whiteout on " << soid << dendl; + osd->logger->inc(l_osd_tier_whiteout); + } else { + if (results->has_omap) { + dout(10) << __func__ << " setting omap flag on " << soid << dendl; + tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP); + ++tctx->delta_stats.num_objects_omap; + } + + results->fill_in_final_tx(tctx->op_t.get()); + if (results->started_temp_obj) { + tctx->discard_temp_oid = results->temp_oid; + } + tctx->new_obs.oi.size = results->object_size; + tctx->new_obs.oi.user_version = results->user_version; + tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime); + tctx->mtime = utime_t(); + if (results->is_data_digest()) { + tctx->new_obs.oi.set_data_digest(results->data_digest); + } else { + tctx->new_obs.oi.clear_data_digest(); + } + if (results->object_size) + tctx->clean_regions.mark_data_region_dirty(0, results->object_size); + if (results->is_omap_digest()) { + 
tctx->new_obs.oi.set_omap_digest(results->omap_digest); + } else { + tctx->new_obs.oi.clear_omap_digest(); + } + if (results->has_omap) + tctx->clean_regions.mark_omap_dirty(); + tctx->new_obs.oi.truncate_seq = results->truncate_seq; + tctx->new_obs.oi.truncate_size = results->truncate_size; + + if (soid.snap != CEPH_NOSNAP) { + ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap)); + ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap)); + ceph_assert(obc->ssc->snapset.clone_size[soid.snap] == + results->object_size); + ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap)); + + tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap); + } else { + tctx->delta_stats.num_bytes += results->object_size; + } + } + + if (results->mirror_snapset) { + ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP); + tctx->new_snapset.from_snap_set( + results->snapset, + get_osdmap()->require_osd_release < ceph_release_t::luminous); + } + dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl; + + // take RWWRITE lock for duration of our local write. ignore starvation. + if (!tctx->lock_manager.take_write_lock( + obc->obs.oi.soid, + obc)) { + ceph_abort_msg("problem!"); + } + dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl; + + finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE); + + simple_opc_submit(std::move(tctx)); + + osd->logger->inc(l_osd_tier_promote); + + if (agent_state && + agent_state->is_idle()) + agent_choose_mode(); +} + +void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results, + ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + dout(10) << __func__ << " " << soid << " r=" << r + << " uv" << results->user_version << dendl; + + if (r == -ECANCELED || r == -EAGAIN) { + return; + } + + if (r < 0) { + derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl; + // pass error to everyone blocked on this object + // FIXME: this is pretty sloppy, but at this point we got + // something unexpected and don't have many other options. + map>::iterator blocked_iter = + waiting_for_blocked_object.find(soid); + if (blocked_iter != waiting_for_blocked_object.end()) { + while (!blocked_iter->second.empty()) { + osd->reply_op_error(blocked_iter->second.front(), r); + blocked_iter->second.pop_front(); + } + waiting_for_blocked_object.erase(blocked_iter); + } + return; + } + + osd->promote_finish(results->object_size); + osd->logger->inc(l_osd_tier_promote); + + if (agent_state && + agent_state->is_idle()) + agent_choose_mode(); +} + +void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue, + vector *tids) +{ + dout(10) << __func__ << " " << cop->obc->obs.oi.soid + << " from " << cop->src << " " << cop->oloc + << " v" << cop->results.user_version << dendl; + + // cancel objecter op, if we can + if (cop->objecter_tid) { + tids->push_back(cop->objecter_tid); + cop->objecter_tid = 0; + if (cop->objecter_tid2) { + tids->push_back(cop->objecter_tid2); + cop->objecter_tid2 = 0; + } + } + + copy_ops.erase(cop->obc->obs.oi.soid); + cop->obc->stop_block(); + + kick_object_context_blocked(cop->obc); + cop->results.should_requeue = requeue; + CopyCallbackResults result(-ECANCELED, &cop->results); + cop->cb->complete(result); + + // There may still be an objecter callback referencing this copy op. + // That callback will not need the obc since it's been canceled, and + // we need the obc reference to go away prior to flush. 
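+ // (the CopyOp itself is kept alive by the C_Copyfrom callback's ref until that callback runs)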
+ cop->obc = ObjectContextRef(); +} + +void PrimaryLogPG::cancel_copy_ops(bool requeue, vector *tids) +{ + dout(10) << __func__ << dendl; + map::iterator p = copy_ops.begin(); + while (p != copy_ops.end()) { + // requeue this op? can I queue up all of them? + cancel_copy((p++)->second, requeue, tids); + } +} + + +// ======================================================================== +// flush +// +// Flush a dirty object in the cache tier by writing it back to the +// base tier. The sequence looks like: +// +// * send a copy-from operation to the base tier to copy the current +// version of the object +// * base tier will pull the object via (perhaps multiple) copy-get(s) +// * on completion, we check if the object has been modified. if so, +// just reply with -EAGAIN. +// * try to take a write lock so we can clear the dirty flag. if this +// fails, wait and retry +// * start a repop that clears the bit. +// +// If we have to wait, we will retry by coming back through the +// start_flush method. We check if a flush is already in progress +// and, if so, try to finish it by rechecking the version and trying +// to clear the dirty bit. +// +// In order for the cache-flush (a write op) to not block the copy-get +// from reading the object, the client *must* set the SKIPRWLOCKS +// flag. +// +// NOTE: normally writes are strictly ordered for the client, but +// flushes are special in that they can be reordered with respect to +// other writes. In particular, we can't have a flush request block +// an update to the cache pool object! + +struct C_Flush : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + utime_t start; + C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), start(ceph_clock_now()) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock locker{*pg}; + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_flush(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start); + } + } +}; + +int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc) +{ + const object_info_t& oi = obc->obs.oi; + const hobject_t& soid = oi.soid; + + ceph_assert(obc->is_blocked()); + if (oi.size == 0) { + // evicted + return 0; + } + if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) { + dout(0) << " fingerprint algorithm is not set " << dendl; + return -EINVAL; + } + if (pool.info.get_dedup_tier() <= 0) { + dout(10) << " dedup tier is not set " << dendl; + return -EINVAL; + } + + /* + * The operations to make dedup chunks are tracked by a ManifestOp. + * This op will be finished if all the operations are completed. + */ + ManifestOpRef mop(std::make_shared(nullptr)); + + // cdc + std::map chunks; + int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks); + if (r < 0) { + return r; + } + if (!chunks.size()) { + return 0; + } + + // chunks issued here are different with chunk_map newly generated + // because the same chunks in previous snap will not be issued + // So, we need two data structures; the first is the issued chunk list to track + // issued operations, and the second is the new chunk_map to update chunk_map after + // all operations are finished + object_ref_delta_t refs; + ObjectContextRef obc_l, obc_g; + get_adjacent_clones(obc, obc_l, obc_g); + // skip if the same content exits in prev snap at same offset + mop->new_manifest.calc_refs_to_inc_on_set( + obc_l ? 
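// Standalone sketch of the completion-guard pattern that C_Flush above relies
// on: the callback remembers the peering interval it was created in and
// becomes a no-op if the parent has been reset since. Parent and
// GuardedCompletion are simplified stand-ins, not the Ceph classes.
#include <cerrno>
#include <functional>
#include <mutex>

struct Parent {
  std::mutex lock;
  unsigned last_reset = 0;   // bumped on every peering reset
};

struct GuardedCompletion {
  Parent* parent;
  unsigned created_at_reset;
  std::function<void(int)> on_valid;

  void finish(int r) {
    if (r == -ECANCELED)
      return;                                   // cancelled: nothing to do
    std::scoped_lock l(parent->lock);
    if (created_at_reset == parent->last_reset) // still the same interval?
      on_valid(r);                              // safe to act on the result
  }
};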
&(obc_l->obs.oi.manifest) : nullptr, + obc_g ? &(obc_g->obs.oi.manifest) : nullptr, + refs); + + for (auto p : chunks) { + hobject_t target = mop->new_manifest.chunk_map[p.first].oid; + if (refs.find(target) == refs.end()) { + continue; + } + C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first); + ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF, + fin, move(chunks[p.first])); + mop->chunks[target] = make_pair(p.first, p.second.length()); + mop->num_chunks++; + mop->tids[p.first] = tid; + fin->tid = tid; + dout(10) << __func__ << " oid: " << soid << " tid: " << tid + << " target: " << target << " offset: " << p.first + << " length: " << p.second.length() << dendl; + } + + if (mop->tids.size()) { + manifest_ops[soid] = mop; + manifest_ops[soid]->op = op; + } else { + // size == 0 + return 0; + } + + return -EINPROGRESS; +} + +int PrimaryLogPG::do_cdc(const object_info_t& oi, + std::map& chunk_map, + std::map& chunks) +{ + string chunk_algo = pool.info.get_dedup_chunk_algorithm_name(); + int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size(); + uint64_t total_length = 0; + + std::unique_ptr cdc = CDC::create(chunk_algo, cbits(chunk_size)-1); + if (!cdc) { + dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl; + return -EINVAL; + } + + bufferlist bl; + /** + * We disable EC pool as a base tier of distributed dedup. + * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync(). + * Therefore, we should change the current implementation totally to make EC pool compatible. + * As s result, we leave this as a future work. + */ + int r = pgbackend->objects_read_sync( + oi.soid, 0, oi.size, 0, &bl); + if (r < 0) { + dout(0) << __func__ << " read fail " << oi.soid + << " len: " << oi.size << " r: " << r << dendl; + return r; + } + if (bl.length() != oi.size) { + dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: " + << oi.size << " during chunking " << dendl; + return -EIO; + } + + dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length() + << " oi.size: " << oi.size + << " chunk_size: " << chunk_size << dendl; + + vector> cdc_chunks; + cdc->calc_chunks(bl, &cdc_chunks); + + // get fingerprint + for (auto p : cdc_chunks) { + bufferlist chunk; + chunk.substr_of(bl, p.first, p.second); + auto [ret, target] = get_fpoid_from_chunk(oi.soid, chunk); + if (ret < 0) { + return ret; + } + chunks[p.first] = std::move(chunk); + chunk_map[p.first] = chunk_info_t(0, p.second, target); + total_length += p.second; + } + return total_length; +} + +std::pair PrimaryLogPG::get_fpoid_from_chunk( + const hobject_t soid, bufferlist& chunk) +{ + pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type(); + if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) { + return make_pair(-EINVAL, hobject_t()); + } + object_t fp_oid = [&fp_algo, &chunk]() -> string { + switch (fp_algo) { + case pg_pool_t::TYPE_FINGERPRINT_SHA1: + return ceph::crypto::digest(chunk).to_str(); + case pg_pool_t::TYPE_FINGERPRINT_SHA256: + return ceph::crypto::digest(chunk).to_str(); + case pg_pool_t::TYPE_FINGERPRINT_SHA512: + return ceph::crypto::digest(chunk).to_str(); + default: + assert(0 == "unrecognized fingerprint type"); + return {}; + } + }(); + + pg_t raw_pg; + object_locator_t oloc(soid); + oloc.pool = pool.info.get_dedup_tier(); + // check if dedup_tier isn't set + ceph_assert(oloc.pool > 0); + int ret = get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg); + if (ret 
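// Simplified stand-in for the chunk/fingerprint step performed by do_cdc
// above: fixed-size splitting replaces the real content-defined chunker and
// std::hash replaces the SHA fingerprints, just to show the
// offset -> (length, chunk-id) shape of the resulting map.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <map>
#include <string>

struct ChunkRef { uint64_t length; std::string id; };

std::map<uint64_t, ChunkRef> build_chunk_map(const std::string& data,
                                             uint64_t chunk_size) {
  std::map<uint64_t, ChunkRef> out;
  for (uint64_t off = 0; off < data.size(); off += chunk_size) {
    uint64_t len = std::min<uint64_t>(chunk_size, data.size() - off);
    std::string chunk = data.substr(off, len);
    // the fingerprint names the backing dedup object; equal chunks share a name
    out[off] = {len, std::to_string(std::hash<std::string>{}(chunk))};
  }
  return out;
}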
< 0) { + return make_pair(ret, hobject_t()); + } + hobject_t target(fp_oid, oloc.key, snapid_t(), + raw_pg.ps(), raw_pg.pool(), + oloc.nspace); + return make_pair(0, target); +} + +int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map::iterator p = manifest_ops.find(oid); + if (p == manifest_ops.end()) { + dout(10) << __func__ << " no manifest_op found" << dendl; + return -EINVAL; + } + ManifestOpRef mop = p->second; + mop->results[offset] = r; + if (r < 0) { + // if any failure occurs, put a mark on the results to recognize the failure + mop->results[0] = r; + } + if (mop->num_chunks != mop->results.size()) { + // there are on-going works + return -EINPROGRESS; + } + ObjectContextRef obc = get_object_context(oid, false); + if (!obc) { + if (mop->op) + osd->reply_op_error(mop->op, -EINVAL); + return -EINVAL; + } + ceph_assert(obc->is_blocked()); + obc->stop_block(); + kick_object_context_blocked(obc); + if (mop->results[0] < 0) { + // check if the previous op returns fail + ceph_assert(mop->num_chunks == mop->results.size()); + manifest_ops.erase(oid); + osd->reply_op_error(mop->op, mop->results[0]); + return -EIO; + } + + if (mop->chunks.size()) { + OpContextUPtr ctx = simple_opc_create(obc); + ceph_assert(ctx); + if (ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + oid, + obc, + mop->op)) { + dout(20) << __func__ << " took write lock" << dendl; + } else if (mop->op) { + dout(10) << __func__ << " waiting on write lock " << mop->op << dendl; + close_op_ctx(ctx.release()); + return -EAGAIN; + } + + ctx->at_version = get_next_version(); + ctx->new_obs = obc->obs; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + + /* + * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head. + * head: [0, 2) aaa <-- tier_flush() + * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc + * + * In this case, if the new chunk_map is as follows, + * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc + * we should drop aaa from head by using calc_refs_to_drop_on_removal(). + * So, the precedure is + * 1. calc_refs_to_drop_on_removal() + * 2. register old references to drop after tier_flush() is committed + * 3. update new chunk_map + */ + + ObjectCleanRegions c_regions = ctx->clean_regions; + ObjectContextRef cobc = get_prev_clone_obc(obc); + c_regions.mark_fully_dirty(); + // CDC was done on entire range of manifest object, + // so the first thing we should do here is to drop the reference to old chunks + ObjectContextRef obc_l, obc_g; + get_adjacent_clones(obc, obc_l, obc_g); + // clear all old references + object_ref_delta_t refs; + ctx->obs->oi.manifest.calc_refs_to_drop_on_removal( + obc_l ? &(obc_l->obs.oi.manifest) : nullptr, + obc_g ? 
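// Sketch of the completion accounting used by finish_set_dedup above: one
// result slot per issued chunk, a sticky error recorded in slot 0, and "done"
// only when every chunk has reported. ChunkTracker is a simplified stand-in
// for the ManifestOp bookkeeping.
#include <cstddef>
#include <cstdint>
#include <map>

struct ChunkTracker {
  size_t num_chunks = 0;
  std::map<uint64_t, int> results;   // offset -> return code

  // returns true once every outstanding chunk op has completed
  bool record(uint64_t offset, int r) {
    results[offset] = r;
    if (r < 0)
      results[0] = r;                // remember that something failed
    return results.size() == num_chunks;
  }
  int final_result() const {
    auto it = results.find(0);
    return (it != results.end() && it->second < 0) ? it->second : 0;
  }
};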
&(obc_g->obs.oi.manifest) : nullptr, + refs); + if (!refs.is_empty()) { + ctx->register_on_commit( + [oid, this, refs](){ + dec_refcount(oid, refs); + }); + } + + // set new references + ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map; + + finish_ctx(ctx.get(), pg_log_entry_t::CLEAN); + simple_opc_submit(std::move(ctx)); + } + if (mop->op) + osd->reply_op_error(mop->op, r); + + manifest_ops.erase(oid); + return 0; +} + +int PrimaryLogPG::start_flush( + OpRequestRef op, ObjectContextRef obc, + bool blocking, hobject_t *pmissing, + std::optional> &&on_flush) +{ + const object_info_t& oi = obc->obs.oi; + const hobject_t& soid = oi.soid; + dout(10) << __func__ << " " << soid + << " v" << oi.version + << " uv" << oi.user_version + << " " << (blocking ? "blocking" : "non-blocking/best-effort") + << dendl; + + bool preoctopus_compat = + get_osdmap()->require_osd_release < ceph_release_t::octopus; + SnapSet snapset; + if (preoctopus_compat) { + // for pre-octopus compatibility, filter SnapSet::snaps. not + // certain we need this, but let's be conservative. + snapset = obc->ssc->snapset.get_filtered(pool.info); + } else { + // NOTE: change this to a const ref when we remove this compat code + snapset = obc->ssc->snapset; + } + + if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) { + // current dedup tier only supports blocking operation + if (!blocking) { + return -EOPNOTSUPP; + } + } + + // verify there are no (older) check for dirty clones + { + dout(20) << " snapset " << snapset << dendl; + vector::reverse_iterator p = snapset.clones.rbegin(); + while (p != snapset.clones.rend() && *p >= soid.snap) + ++p; + if (p != snapset.clones.rend()) { + hobject_t next = soid; + next.snap = *p; + ceph_assert(next.snap < soid.snap); + if (recovery_state.get_pg_log().get_missing().is_missing(next)) { + dout(10) << __func__ << " missing clone is " << next << dendl; + if (pmissing) + *pmissing = next; + return -ENOENT; + } + ObjectContextRef older_obc = get_object_context(next, false); + if (older_obc) { + dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi + << dendl; + if (older_obc->obs.oi.is_dirty()) { + dout(10) << __func__ << " next oldest clone is dirty: " + << older_obc->obs.oi << dendl; + return -EBUSY; + } + } else { + dout(20) << __func__ << " next oldest clone " << next + << " is not present; implicitly clean" << dendl; + } + } else { + dout(20) << __func__ << " no older clones" << dendl; + } + } + + if (blocking) + obc->start_block(); + + map::iterator p = flush_ops.find(soid); + if (p != flush_ops.end()) { + FlushOpRef fop = p->second; + if (fop->op == op) { + // we couldn't take the write lock on a cache-try-flush before; + // now we are trying again for the lock. + return try_flush_mark_clean(fop); + } + if (fop->flushed_version == obc->obs.oi.user_version && + (fop->blocking || !blocking)) { + // nonblocking can join anything + // blocking can only join a blocking flush + dout(20) << __func__ << " piggybacking on existing flush " << dendl; + if (op) + fop->dup_ops.push_back(op); + return -EAGAIN; // clean up this ctx; op will retry later + } + + // cancel current flush since it will fail anyway, or because we + // are blocking and the existing flush is nonblocking. 
+ dout(20) << __func__ << " canceling previous flush; it will fail" << dendl; + if (fop->op) + osd->reply_op_error(fop->op, -EBUSY); + while (!fop->dup_ops.empty()) { + osd->reply_op_error(fop->dup_ops.front(), -EBUSY); + fop->dup_ops.pop_front(); + } + vector tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + } + + if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) { + int r = start_dedup(op, obc); + if (r != -EINPROGRESS) { + if (blocking) + obc->stop_block(); + } + return r; + } + + /** + * In general, we need to send a delete and a copyfrom. + * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)] + * where 4 is marked as clean. To flush 10, we have to: + * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4 + * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8 + * + * There is a complicating case. Supposed there had been a clone 7 + * for snaps [7, 6] which has been trimmed since they no longer exist. + * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit + * the delete, the snap will be promoted to 5, and the head will become + * a whiteout. When the copy-from goes through, we'll end up with + * 8:[8,4,3,2]:[4(4,3,2)]+head. + * + * Another complication is the case where there is an interval change + * after doing the delete and the flush but before marking the object + * clean. We'll happily delete head and then recreate it at the same + * sequence number, which works out ok. + */ + + SnapContext snapc, dsnapc; + if (snapset.seq != 0) { + if (soid.snap == CEPH_NOSNAP) { + snapc = snapset.get_ssc_as_of(snapset.seq); + } else { + snapid_t min_included_snap; + auto p = snapset.clone_snaps.find(soid.snap); + ceph_assert(p != snapset.clone_snaps.end()); + min_included_snap = p->second.back(); + snapc = snapset.get_ssc_as_of(min_included_snap - 1); + } + + snapid_t prev_snapc = 0; + for (vector::reverse_iterator citer = snapset.clones.rbegin(); + citer != snapset.clones.rend(); + ++citer) { + if (*citer < soid.snap) { + prev_snapc = *citer; + break; + } + } + + dsnapc = snapset.get_ssc_as_of(prev_snapc); + } + + object_locator_t base_oloc(soid); + base_oloc.pool = pool.info.tier_of; + + if (dsnapc.seq < snapc.seq) { + ObjectOperation o; + o.remove(); + osd->objecter->mutate( + soid.oid, + base_oloc, + o, + dsnapc, + ceph::real_clock::from_ceph_timespec(oi.mtime), + (CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_ENFORCE_SNAPC), + NULL /* no callback, we'll rely on the ordering w.r.t the next op */); + } + + FlushOpRef fop(std::make_shared()); + fop->obc = obc; + fop->flushed_version = oi.user_version; + fop->blocking = blocking; + fop->on_flush = std::move(on_flush); + fop->op = op; + + ObjectOperation o; + if (oi.is_whiteout()) { + fop->removal = true; + o.remove(); + } else { + object_locator_t oloc(soid); + o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version, + CEPH_OSD_COPY_FROM_FLAG_FLUSH | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE, + LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + + //mean the base tier don't cache data after this + if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) + o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED); + } + C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset()); + + ceph_tid_t tid = osd->objecter->mutate( + soid.oid, base_oloc, o, snapc, + ceph::real_clock::from_ceph_timespec(oi.mtime), + 
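// Tiny standalone illustration of the prev_snapc search above: clones are kept
// in ascending order, and the delete needs the newest clone id that is still
// strictly older than the object being flushed (0 when there is none). With
// the clones {4, 10} from the comment's example and soid.snap == 10, this
// yields 4, matching the "delete 4:[4,3,2]" step.
#include <cstdint>
#include <vector>

using snapid = uint64_t;

snapid prev_clone_snap(const std::vector<snapid>& clones_ascending,
                       snapid flushing_snap) {
  for (auto it = clones_ascending.rbegin(); it != clones_ascending.rend(); ++it)
    if (*it < flushing_snap)
      return *it;       // newest clone below the one being flushed
  return 0;             // no older clone; the delete uses an empty snap context
}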
CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC, + new C_OnFinisher(fin, + osd->get_objecter_finisher(get_pg_shard()))); + /* we're under the pg lock and fin->finish() is grabbing that */ + fin->tid = tid; + fop->objecter_tid = tid; + + flush_ops[soid] = fop; + + recovery_state.update_stats( + [&oi](auto &history, auto &stats) { + stats.stats.sum.num_flush++; + stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10); + return false; + }); + return -EINPROGRESS; +} + +void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map::iterator p = flush_ops.find(oid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush_op found" << dendl; + return; + } + FlushOpRef fop = p->second; + if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) { + dout(10) << __func__ << " tid " << tid << " != fop " << fop + << " tid " << fop->objecter_tid << dendl; + return; + } + ObjectContextRef obc = fop->obc; + fop->objecter_tid = 0; + + if (r < 0 && !(r == -ENOENT && fop->removal)) { + if (fop->op) + osd->reply_op_error(fop->op, -EBUSY); + if (fop->blocking) { + obc->stop_block(); + kick_object_context_blocked(obc); + } + + if (!fop->dup_ops.empty()) { + dout(20) << __func__ << " requeueing dups" << dendl; + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + flush_ops.erase(oid); + return; + } + + r = try_flush_mark_clean(fop); + if (r == -EBUSY && fop->op) { + osd->reply_op_error(fop->op, r); + } +} + +int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop) +{ + ObjectContextRef obc = fop->obc; + const hobject_t& oid = obc->obs.oi.soid; + + if (fop->blocking) { + obc->stop_block(); + kick_object_context_blocked(obc); + } + + if (fop->flushed_version != obc->obs.oi.user_version || + !obc->obs.exists) { + if (obc->obs.exists) + dout(10) << __func__ << " flushed_version " << fop->flushed_version + << " != current " << obc->obs.oi.user_version + << dendl; + else + dout(10) << __func__ << " object no longer exists" << dendl; + + if (!fop->dup_ops.empty()) { + dout(20) << __func__ << " requeueing dups" << dendl; + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + flush_ops.erase(oid); + if (fop->blocking) + osd->logger->inc(l_osd_tier_flush_fail); + else + osd->logger->inc(l_osd_tier_try_flush_fail); + return -EBUSY; + } + + if (!fop->blocking && + m_scrubber->write_blocked_by_scrub(oid)) { + if (fop->op) { + dout(10) << __func__ << " blocked by scrub" << dendl; + requeue_op(fop->op); + requeue_ops(fop->dup_ops); + return -EAGAIN; // will retry + } else { + osd->logger->inc(l_osd_tier_try_flush_fail); + vector tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + return -ECANCELED; + } + } + + // successfully flushed, can we evict this object? + if (!obc->obs.oi.has_manifest() && !fop->op && + agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && + agent_maybe_evict(obc, true)) { + osd->logger->inc(l_osd_tier_clean); + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + flush_ops.erase(oid); + return 0; + } + + dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl; + OpContextUPtr ctx = simple_opc_create(fop->obc); + + // successfully flushed; can we clear the dirty bit? 
+ // try to take the lock manually, since we don't + // have a ctx yet. + if (ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + oid, + obc, + fop->op)) { + dout(20) << __func__ << " took write lock" << dendl; + } else if (fop->op) { + dout(10) << __func__ << " waiting on write lock " << fop->op << " " + << fop->dup_ops << dendl; + // fop->op is now waiting on the lock; get fop->dup_ops to wait too. + for (auto op : fop->dup_ops) { + bool locked = ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + oid, + obc, + op); + ceph_assert(!locked); + } + close_op_ctx(ctx.release()); + return -EAGAIN; // will retry + } else { + dout(10) << __func__ << " failed write lock, no op; failing" << dendl; + close_op_ctx(ctx.release()); + osd->logger->inc(l_osd_tier_try_flush_fail); + vector tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + return -ECANCELED; + } + + if (fop->on_flush) { + ctx->register_on_finish(*(fop->on_flush)); + fop->on_flush = std::nullopt; + } + + ctx->at_version = get_next_version(); + + ctx->new_obs = obc->obs; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + if (fop->obc->obs.oi.has_manifest()) { + ceph_assert(obc->obs.oi.manifest.is_chunked()); + PGTransaction* t = ctx->op_t.get(); + uint64_t chunks_size = 0; + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + chunks_size += p.second.length; + } + if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) { + t->omap_clear(oid); + ctx->new_obs.oi.clear_omap_digest(); + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP); + ctx->clean_regions.mark_omap_dirty(); + } + if (obc->obs.oi.size == chunks_size) { + t->truncate(oid, 0); + interval_set trim; + trim.insert(0, ctx->new_obs.oi.size); + ctx->modified_ranges.union_of(trim); + truncate_update_size_and_usage(ctx->delta_stats, + ctx->new_obs.oi, + 0); + ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size); + ctx->new_obs.oi.new_object(); + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + p.second.set_flag(chunk_info_t::FLAG_MISSING); + } + } else { + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + dout(20) << __func__ << " offset: " << p.second.offset + << " length: " << p.second.length << dendl; + p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN + } + } + } + + finish_ctx(ctx.get(), pg_log_entry_t::CLEAN); + + osd->logger->inc(l_osd_tier_clean); + + if (!fop->dup_ops.empty() || fop->op) { + dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl; + list ls; + if (fop->op) + ls.push_back(fop->op); + ls.splice(ls.end(), fop->dup_ops); + requeue_ops(ls); + } + + simple_opc_submit(std::move(ctx)); + + flush_ops.erase(oid); + + if (fop->blocking) + osd->logger->inc(l_osd_tier_flush); + else + osd->logger->inc(l_osd_tier_try_flush); + + return -EINPROGRESS; +} + +void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue, + vector *tids) +{ + dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid " + << fop->objecter_tid << dendl; + if (fop->objecter_tid) { + tids->push_back(fop->objecter_tid); + fop->objecter_tid = 0; + } + if (fop->io_tids.size()) { + for (auto &p : fop->io_tids) { + tids->push_back(p.second); + p.second = 0; + } + } + if (fop->blocking && fop->obc->is_blocked()) { + fop->obc->stop_block(); + kick_object_context_blocked(fop->obc); + } + if (requeue) { + if (fop->op) + requeue_op(fop->op); + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + 
flush_ops.erase(fop->obc->obs.oi.soid); +} + +void PrimaryLogPG::cancel_flush_ops(bool requeue, vector *tids) +{ + dout(10) << __func__ << dendl; + map::iterator p = flush_ops.begin(); + while (p != flush_ops.end()) { + cancel_flush((p++)->second, requeue, tids); + } +} + +bool PrimaryLogPG::is_present_clone(hobject_t coid) +{ + if (!pool.info.allow_incomplete_clones()) + return true; + if (is_missing_object(coid)) + return true; + ObjectContextRef obc = get_object_context(coid, false); + return obc && obc->obs.exists; +} + +// ======================================================================== +// rep op gather + +class C_OSD_RepopCommit : public Context { + PrimaryLogPGRef pg; + boost::intrusive_ptr repop; +public: + C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop) + : pg(pg), repop(repop) {} + void finish(int) override { + pg->repop_all_committed(repop.get()); + } +}; + +void PrimaryLogPG::repop_all_committed(RepGather *repop) +{ + dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed " + << dendl; + repop->all_committed = true; + if (!repop->rep_aborted) { + if (repop->v != eversion_t()) { + recovery_state.complete_write(repop->v, repop->pg_local_last_complete); + } + eval_repop(repop); + } +} + +void PrimaryLogPG::op_applied(const eversion_t &applied_version) +{ + dout(10) << "op_applied version " << applied_version << dendl; + ceph_assert(applied_version != eversion_t()); + ceph_assert(applied_version <= info.last_update); + recovery_state.local_write_applied(applied_version); + + if (is_primary() && m_scrubber) { + // if there's a scrub operation waiting for the selected chunk to be fully updated - + // allow it to continue + m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied()); + } +} + +void PrimaryLogPG::eval_repop(RepGather *repop) +{ + #ifdef HAVE_JAEGER + if (repop->op->osd_parent_span) { + auto eval_span = jaeger_tracing::child_span(__func__, repop->op->osd_parent_span); + } + #endif + dout(10) << "eval_repop " << *repop + << (repop->op && repop->op->get_req() ? "" : " (no op)") << dendl; + + // ondisk? 
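// Standalone sketch of the iteration idiom shared by cancel_copy_ops and
// cancel_flush_ops above: the per-element cancel routine erases its own map
// entry, so the iterator must be advanced *before* the call.
#include <map>
#include <string>

std::map<std::string, int> ops;

void cancel_one(std::string key) {   // takes its key by value; it erases the entry
  ops.erase(key);
}

void cancel_all() {
  auto p = ops.begin();
  while (p != ops.end()) {
    cancel_one((p++)->first);        // post-increment keeps the iterator valid
  }
}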
+ if (repop->all_committed) { + dout(10) << " commit: " << *repop << dendl; + for (auto p = repop->on_committed.begin(); + p != repop->on_committed.end(); + repop->on_committed.erase(p++)) { + (*p)(); + } + // send dup commits, in order + auto it = waiting_for_ondisk.find(repop->v); + if (it != waiting_for_ondisk.end()) { + ceph_assert(waiting_for_ondisk.begin()->first == repop->v); + for (auto& i : it->second) { + int return_code = repop->r; + if (return_code >= 0) { + return_code = std::get<2>(i); + } + osd->reply_op_error(std::get<0>(i), return_code, repop->v, + std::get<1>(i), std::get<3>(i)); + } + waiting_for_ondisk.erase(it); + } + + publish_stats_to_osd(); + + dout(10) << " removing " << *repop << dendl; + ceph_assert(!repop_queue.empty()); + dout(20) << " q front is " << *repop_queue.front() << dendl; + if (repop_queue.front() == repop) { + RepGather *to_remove = nullptr; + while (!repop_queue.empty() && + (to_remove = repop_queue.front())->all_committed) { + repop_queue.pop_front(); + for (auto p = to_remove->on_success.begin(); + p != to_remove->on_success.end(); + to_remove->on_success.erase(p++)) { + (*p)(); + } + remove_repop(to_remove); + } + } + } +} + +void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx) +{ + FUNCTRACE(cct); + const hobject_t& soid = ctx->obs->oi.soid; + dout(7) << "issue_repop rep_tid " << repop->rep_tid + << " o " << soid + << dendl; +#ifdef HAVE_JAEGER + if (ctx->op->osd_parent_span) { + auto issue_repop_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span); + } +#endif + + repop->v = ctx->at_version; + + ctx->op_t->add_obc(ctx->obc); + if (ctx->clone_obc) { + ctx->op_t->add_obc(ctx->clone_obc); + } + if (ctx->head_obc) { + ctx->op_t->add_obc(ctx->head_obc); + } + + Context *on_all_commit = new C_OSD_RepopCommit(this, repop); + if (!(ctx->log.empty())) { + ceph_assert(ctx->at_version >= projected_last_update); + projected_last_update = ctx->at_version; + } + for (auto &&entry: ctx->log) { + projected_log.add(entry); + } + + recovery_state.pre_submit_op( + soid, + ctx->log, + ctx->at_version); + pgbackend->submit_transaction( + soid, + ctx->delta_stats, + ctx->at_version, + std::move(ctx->op_t), + recovery_state.get_pg_trim_to(), + recovery_state.get_min_last_complete_ondisk(), + std::move(ctx->log), + ctx->updated_hset_history, + on_all_commit, + repop->rep_tid, + ctx->reqid, + ctx->op); +} + +PrimaryLogPG::RepGather *PrimaryLogPG::new_repop( + OpContext *ctx, ObjectContextRef obc, + ceph_tid_t rep_tid) +{ + if (ctx->op) + dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl; + else + dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl; + + RepGather *repop = new RepGather( + ctx, rep_tid, info.last_complete); + + repop->start = ceph_clock_now(); + + repop_queue.push_back(&repop->queue_item); + repop->get(); + + osd->logger->inc(l_osd_op_wip); + + dout(10) << __func__ << ": " << *repop << dendl; + return repop; +} + +boost::intrusive_ptr PrimaryLogPG::new_repop( + eversion_t version, + int r, + ObcLockManager &&manager, + OpRequestRef &&op, + std::optional > &&on_complete) +{ + RepGather *repop = new RepGather( + std::move(manager), + std::move(op), + std::move(on_complete), + osd->get_tid(), + info.last_complete, + r); + repop->v = version; + + repop->start = ceph_clock_now(); + + repop_queue.push_back(&repop->queue_item); + + osd->logger->inc(l_osd_op_wip); + + dout(10) << __func__ << ": " << *repop << dendl; + return boost::intrusive_ptr(repop); +} + +void 
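// Sketch of the in-order completion drain performed by eval_repop above:
// individual repops may commit out of order, but the queue is only popped from
// the front, and only while the front entry has fully committed, so
// client-visible completions stay ordered.
#include <deque>

struct Rep { bool all_committed = false; };

std::deque<Rep*> repop_queue;

void drain_committed(void (*finish)(Rep*)) {
  while (!repop_queue.empty() && repop_queue.front()->all_committed) {
    Rep* done = repop_queue.front();
    repop_queue.pop_front();
    finish(done);              // run success callbacks, then release it
  }
}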
PrimaryLogPG::remove_repop(RepGather *repop) +{ + dout(20) << __func__ << " " << *repop << dendl; + + for (auto p = repop->on_finish.begin(); + p != repop->on_finish.end(); + repop->on_finish.erase(p++)) { + (*p)(); + } + + release_object_locks( + repop->lock_manager); + repop->put(); + + osd->logger->dec(l_osd_op_wip); +} + +PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc) +{ + dout(20) << __func__ << " " << obc->obs.oi.soid << dendl; + ceph_tid_t rep_tid = osd->get_tid(); + osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); + OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this)); + ctx->op_t.reset(new PGTransaction()); + ctx->mtime = ceph_clock_now(); + return ctx; +} + +void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx) +{ + RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid); + dout(20) << __func__ << " " << repop << dendl; + issue_repop(repop, ctx.get()); + eval_repop(repop); + recovery_state.update_trim_to(); + repop->put(); +} + + +void PrimaryLogPG::submit_log_entries( + const mempool::osd_pglog::list &entries, + ObcLockManager &&manager, + std::optional > &&_on_complete, + OpRequestRef op, + int r) +{ + dout(10) << __func__ << " " << entries << dendl; + ceph_assert(is_primary()); + + eversion_t version; + if (!entries.empty()) { + ceph_assert(entries.rbegin()->version >= projected_last_update); + version = projected_last_update = entries.rbegin()->version; + } + + boost::intrusive_ptr repop; + std::optional > on_complete; + if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) { + repop = new_repop( + version, + r, + std::move(manager), + std::move(op), + std::move(_on_complete)); + } else { + on_complete = std::move(_on_complete); + } + + pgbackend->call_write_ordered( + [this, entries, repop, on_complete]() { + ObjectStore::Transaction t; + eversion_t old_last_update = info.last_update; + recovery_state.merge_new_log_entries( + entries, t, recovery_state.get_pg_trim_to(), + recovery_state.get_min_last_complete_ondisk()); + + set waiting_on; + for (set::const_iterator i = get_acting_recovery_backfill().begin(); + i != get_acting_recovery_backfill().end(); + ++i) { + pg_shard_t peer(*i); + if (peer == pg_whoami) continue; + ceph_assert(recovery_state.get_peer_missing().count(peer)); + ceph_assert(recovery_state.has_peer_info(peer)); + if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) { + ceph_assert(repop); + MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing( + entries, + spg_t(info.pgid.pgid, i->shard), + pg_whoami.shard, + get_osdmap_epoch(), + get_last_peering_reset(), + repop->rep_tid, + recovery_state.get_pg_trim_to(), + recovery_state.get_min_last_complete_ondisk()); + osd->send_message_osd_cluster( + peer.osd, m, get_osdmap_epoch()); + waiting_on.insert(peer); + } else { + MOSDPGLog *m = new MOSDPGLog( + peer.shard, pg_whoami.shard, + info.last_update.epoch, + info, get_last_peering_reset()); + m->log.log = entries; + m->log.tail = old_last_update; + m->log.head = info.last_update; + osd->send_message_osd_cluster( + peer.osd, m, get_osdmap_epoch()); + } + } + ceph_tid_t rep_tid = repop->rep_tid; + waiting_on.insert(pg_whoami); + log_entry_update_waiting_on.insert( + make_pair( + rep_tid, + LogUpdateCtx{std::move(repop), std::move(waiting_on)} + )); + struct OnComplete : public Context { + PrimaryLogPGRef pg; + ceph_tid_t rep_tid; + epoch_t epoch; + OnComplete( + PrimaryLogPGRef pg, + ceph_tid_t rep_tid, + epoch_t epoch) + : pg(pg), rep_tid(rep_tid), epoch(epoch) 
{} + void finish(int) override { + std::scoped_lock l{*pg}; + if (!pg->pg_has_reset_since(epoch)) { + auto it = pg->log_entry_update_waiting_on.find(rep_tid); + ceph_assert(it != pg->log_entry_update_waiting_on.end()); + auto it2 = it->second.waiting_on.find(pg->pg_whoami); + ceph_assert(it2 != it->second.waiting_on.end()); + it->second.waiting_on.erase(it2); + if (it->second.waiting_on.empty()) { + pg->repop_all_committed(it->second.repop.get()); + pg->log_entry_update_waiting_on.erase(it); + } + } + } + }; + t.register_on_commit( + new OnComplete{this, rep_tid, get_osdmap_epoch()}); + int r = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(r == 0); + op_applied(info.last_update); + }); + + recovery_state.update_trim_to(); +} + +void PrimaryLogPG::cancel_log_updates() +{ + // get rid of all the LogUpdateCtx so their references to repops are + // dropped + log_entry_update_waiting_on.clear(); +} + +// ------------------------------------------------------- + +void PrimaryLogPG::get_watchers(list *ls) +{ + std::scoped_lock l{*this}; + pair i; + while (object_contexts.get_next(i.first, &i)) { + ObjectContextRef obc(i.second); + get_obc_watchers(obc, *ls); + } +} + +void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list &pg_watchers) +{ + for (map, WatchRef>::iterator j = + obc->watchers.begin(); + j != obc->watchers.end(); + ++j) { + obj_watch_item_t owi; + + owi.obj = obc->obs.oi.soid; + owi.wi.addr = j->second->get_peer_addr(); + owi.wi.name = j->second->get_entity(); + owi.wi.cookie = j->second->get_cookie(); + owi.wi.timeout_seconds = j->second->get_timeout(); + + dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr + << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl; + + pg_watchers.push_back(owi); + } +} + +void PrimaryLogPG::check_blocklisted_watchers() +{ + dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl; + pair i; + while (object_contexts.get_next(i.first, &i)) + check_blocklisted_obc_watchers(i.second); +} + +void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc) +{ + dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl; + for (map, WatchRef>::iterator k = + obc->watchers.begin(); + k != obc->watchers.end(); + ) { + //Advance iterator now so handle_watch_timeout() can erase element + map, WatchRef>::iterator j = k++; + dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl; + entity_addr_t ea = j->second->get_peer_addr(); + dout(30) << "watch: Check entity_addr_t " << ea << dendl; + if (get_osdmap()->is_blocklisted(ea)) { + dout(10) << "watch: Found blocklisted watcher for " << ea << dendl; + ceph_assert(j->second->get_pg() == this); + j->second->unregister_cb(); + handle_watch_timeout(j->second); + } + } +} + +void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc) +{ + ceph_assert(is_primary() && is_active()); + auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid); + ceph_assert((recovering.count(obc->obs.oi.soid) || + !is_missing_object(obc->obs.oi.soid)) || + (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... 
see recover_primary() + it_objects->second->op == + pg_log_entry_t::LOST_REVERT && + it_objects->second->reverting_to == + obc->obs.oi.version)); + + dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl; + ceph_assert(obc->watchers.empty()); + // populate unconnected_watchers + for (map, watch_info_t>::iterator p = + obc->obs.oi.watchers.begin(); + p != obc->obs.oi.watchers.end(); + ++p) { + utime_t expire = info.stats.last_became_active; + expire += p->second.timeout_seconds; + dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl; + WatchRef watch( + Watch::makeWatchRef( + this, osd, obc, p->second.timeout_seconds, p->first.first, + p->first.second, p->second.addr)); + watch->disconnect(); + obc->watchers.insert( + make_pair( + make_pair(p->first.first, p->first.second), + watch)); + } + // Look for watchers from blocklisted clients and drop + check_blocklisted_obc_watchers(obc); +} + +void PrimaryLogPG::handle_watch_timeout(WatchRef watch) +{ + ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref + dout(10) << "handle_watch_timeout obc " << obc << dendl; + + if (!is_active()) { + dout(10) << "handle_watch_timeout not active, no-op" << dendl; + return; + } + if (!obc->obs.exists) { + dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl; + return; + } + if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) { + callbacks_for_degraded_object[obc->obs.oi.soid].push_back( + watch->get_delayed_cb() + ); + dout(10) << "handle_watch_timeout waiting for degraded on obj " + << obc->obs.oi.soid + << dendl; + return; + } + + if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) { + dout(10) << "handle_watch_timeout waiting for scrub on obj " + << obc->obs.oi.soid + << dendl; + m_scrubber->add_callback( + watch->get_delayed_cb() // This callback! + ); + return; + } + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->at_version = get_next_version(); + + object_info_t& oi = ctx->new_obs.oi; + oi.watchers.erase(make_pair(watch->get_cookie(), + watch->get_entity())); + + list watch_disconnects = { + watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true) + }; + ctx->register_on_success( + [this, obc, watch_disconnects]() { + complete_disconnect_watches(obc, watch_disconnects); + }); + + + PGTransaction *t = ctx->op_t.get(); + ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid, + ctx->at_version, + oi.version, + 0, + osd_reqid_t(), ctx->mtime, 0)); + + oi.prior_version = obc->obs.oi.version; + oi.version = ctx->at_version; + bufferlist bl; + encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + t->setattr(obc->obs.oi.soid, OI_ATTR, bl); + + // apply new object state. 
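// Minimal sketch of the watcher-expiry arithmetic above: a disconnected
// watcher expires timeout_seconds after the PG last became active, and an
// expired watcher is dropped from the object's watcher table. The types here
// are simplified stand-ins; the real path goes through handle_watch_timeout.
#include <chrono>
#include <cstdint>
#include <map>
#include <utility>

using Clock = std::chrono::steady_clock;
struct WatchInfo { std::chrono::seconds timeout; };

void expire_watchers(std::map<std::pair<uint64_t, uint64_t>, WatchInfo>& watchers,
                     Clock::time_point last_became_active,
                     Clock::time_point now) {
  for (auto it = watchers.begin(); it != watchers.end(); ) {
    auto expire = last_became_active + it->second.timeout;
    if (expire <= now)
      it = watchers.erase(it);   // timed out: drop the (cookie, entity) entry
    else
      ++it;
  }
}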
+ ctx->obc->obs = ctx->new_obs; + + // no ctx->delta_stats + simple_opc_submit(std::move(ctx)); +} + +ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi, + SnapSetContext *ssc) +{ + ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid)); + ceph_assert(obc->destructor_callback == NULL); + obc->destructor_callback = new C_PG_ObjectContext(this, obc.get()); + obc->obs.oi = oi; + obc->obs.exists = false; + obc->ssc = ssc; + if (ssc) + register_snapset_context(ssc); + dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl; + if (is_active()) + populate_obc_watchers(obc); + return obc; +} + +ObjectContextRef PrimaryLogPG::get_object_context( + const hobject_t& soid, + bool can_create, + const map *attrs) +{ + auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid); + ceph_assert( + attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) || + // or this is a revert... see recover_primary() + (it_objects != recovery_state.get_pg_log().get_log().objects.end() && + it_objects->second->op == + pg_log_entry_t::LOST_REVERT)); + ObjectContextRef obc = object_contexts.lookup(soid); + osd->logger->inc(l_osd_object_ctx_cache_total); + if (obc) { + osd->logger->inc(l_osd_object_ctx_cache_hit); + dout(10) << __func__ << ": found obc in cache: " << obc + << dendl; + } else { + dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl; + // check disk + bufferlist bv; + if (attrs) { + auto it_oi = attrs->find(OI_ATTR); + ceph_assert(it_oi != attrs->end()); + bv = it_oi->second; + } else { + int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv); + if (r < 0) { + if (!can_create) { + dout(10) << __func__ << ": no obc for soid " + << soid << " and !can_create" + << dendl; + return ObjectContextRef(); // -ENOENT! + } + + dout(10) << __func__ << ": no obc for soid " + << soid << " but can_create" + << dendl; + // new object. + object_info_t oi(soid); + SnapSetContext *ssc = get_snapset_context( + soid, true, 0, false); + ceph_assert(ssc); + obc = create_object_context(oi, ssc); + dout(10) << __func__ << ": " << obc << " " << soid + << " " << obc->rwstate + << " oi: " << obc->obs.oi + << " ssc: " << obc->ssc + << " snapset: " << obc->ssc->snapset << dendl; + return obc; + } + } + + object_info_t oi; + try { + bufferlist::const_iterator bliter = bv.begin(); + decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": obc corrupt: " << soid << dendl; + return ObjectContextRef(); // -ENOENT! + } + + ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool()); + + obc = object_contexts.lookup_or_create(oi.soid); + obc->destructor_callback = new C_PG_ObjectContext(this, obc.get()); + obc->obs.oi = oi; + obc->obs.exists = true; + + obc->ssc = get_snapset_context( + soid, true, + soid.has_snapset() ? attrs : 0); + + if (is_primary() && is_active()) + populate_obc_watchers(obc); + + if (pool.info.is_erasure()) { + if (attrs) { + obc->attr_cache = *attrs; + } else { + int r = pgbackend->objects_get_attrs( + soid, + &obc->attr_cache); + ceph_assert(r == 0); + } + } + + dout(10) << __func__ << ": creating obc from disk: " << obc + << dendl; + } + + // XXX: Caller doesn't expect this + if (obc->ssc == NULL) { + derr << __func__ << ": obc->ssc not available, not returning context" << dendl; + return ObjectContextRef(); // -ENOENT! 
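// Condensed sketch of the lookup -> load-from-disk -> create-blank fallbacks
// in get_object_context above; the cache, load_attr() and ObjectInfo are
// simplified stand-ins for the real object-context machinery.
#include <map>
#include <memory>
#include <optional>
#include <string>

struct ObjectInfo { std::string name; bool exists = false; };
using ObcRef = std::shared_ptr<ObjectInfo>;

std::map<std::string, ObcRef> cache;
std::optional<ObjectInfo> load_attr(const std::string&) { return std::nullopt; }

ObcRef get_context(const std::string& oid, bool can_create) {
  if (auto it = cache.find(oid); it != cache.end())
    return it->second;                        // cache hit
  if (auto oi = load_attr(oid)) {             // miss: consult the store
    auto obc = std::make_shared<ObjectInfo>(*oi);
    obc->exists = true;
    return cache[oid] = obc;
  }
  if (!can_create)
    return nullptr;                           // behaves like -ENOENT
  auto obc = std::make_shared<ObjectInfo>();  // brand-new, not yet on disk
  obc->name = oid;
  return cache[oid] = obc;
}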
+ } + + dout(10) << __func__ << ": " << obc << " " << soid + << " " << obc->rwstate + << " oi: " << obc->obs.oi + << " exists: " << (int)obc->obs.exists + << " ssc: " << obc->ssc + << " snapset: " << obc->ssc->snapset << dendl; + return obc; +} + +void PrimaryLogPG::context_registry_on_change() +{ + pair i; + while (object_contexts.get_next(i.first, &i)) { + ObjectContextRef obc(i.second); + if (obc) { + for (map, WatchRef>::iterator j = + obc->watchers.begin(); + j != obc->watchers.end(); + obc->watchers.erase(j++)) { + j->second->discard(); + } + } + } +} + + +/* + * If we return an error, and set *pmissing, then promoting that + * object may help. + * + * If we return -EAGAIN, we will always set *pmissing to the missing + * object to wait for. + * + * If we return an error but do not set *pmissing, then we know the + * object does not exist. + */ +int PrimaryLogPG::find_object_context(const hobject_t& oid, + ObjectContextRef *pobc, + bool can_create, + bool map_snapid_to_clone, + hobject_t *pmissing) +{ + FUNCTRACE(cct); + ceph_assert(oid.pool == static_cast(info.pgid.pool())); + // want the head? + if (oid.snap == CEPH_NOSNAP) { + ObjectContextRef obc = get_object_context(oid, can_create); + if (!obc) { + if (pmissing) + *pmissing = oid; + return -ENOENT; + } + dout(10) << __func__ << " " << oid + << " @" << oid.snap + << " oi=" << obc->obs.oi + << dendl; + *pobc = obc; + + return 0; + } + + // we want a snap + + hobject_t head = oid.get_head(); + SnapSetContext *ssc = get_snapset_context(oid, can_create); + if (!ssc || !(ssc->exists || can_create)) { + dout(20) << __func__ << " " << oid << " no snapset" << dendl; + if (pmissing) + *pmissing = head; // start by getting the head + if (ssc) + put_snapset_context(ssc); + return -ENOENT; + } + + if (map_snapid_to_clone) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " map_snapid_to_clone=true" << dendl; + if (oid.snap > ssc->snapset.seq) { + // already must be readable + ObjectContextRef obc = get_object_context(head, false); + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to head" << dendl; + *pobc = obc; + put_snapset_context(ssc); + return (obc && obc->obs.exists) ? 
0 : -ENOENT; + } else { + vector::const_iterator citer = std::find( + ssc->snapset.clones.begin(), + ssc->snapset.clones.end(), + oid.snap); + if (citer == ssc->snapset.clones.end()) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to nothing" << dendl; + put_snapset_context(ssc); + return -ENOENT; + } + + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to " << oid << dendl; + + if (recovery_state.get_pg_log().get_missing().is_missing(oid)) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " is missing" << dendl; + if (pmissing) + *pmissing = oid; + put_snapset_context(ssc); + return -EAGAIN; + } + + ObjectContextRef obc = get_object_context(oid, false); + if (!obc || !obc->obs.exists) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " is not present" << dendl; + if (pmissing) + *pmissing = oid; + put_snapset_context(ssc); + return -ENOENT; + } + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " HIT" << dendl; + *pobc = obc; + put_snapset_context(ssc); + return 0; + } + ceph_abort(); //unreachable + } + + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset << dendl; + + // head? + if (oid.snap > ssc->snapset.seq) { + ObjectContextRef obc = get_object_context(head, false); + dout(10) << __func__ << " " << head + << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq + << " -- HIT " << obc->obs + << dendl; + if (!obc->ssc) + obc->ssc = ssc; + else { + ceph_assert(ssc == obc->ssc); + put_snapset_context(ssc); + } + *pobc = obc; + return 0; + } + + // which clone would it be? 
+ unsigned k = 0; + while (k < ssc->snapset.clones.size() && + ssc->snapset.clones[k] < oid.snap) + k++; + if (k == ssc->snapset.clones.size()) { + dout(10) << __func__ << " no clones with last >= oid.snap " + << oid.snap << " -- DNE" << dendl; + put_snapset_context(ssc); + return -ENOENT; + } + hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(), + info.pgid.pool(), oid.get_namespace()); + + if (recovery_state.get_pg_log().get_missing().is_missing(soid)) { + dout(20) << __func__ << " " << soid << " missing, try again later" + << dendl; + if (pmissing) + *pmissing = soid; + put_snapset_context(ssc); + return -EAGAIN; + } + + ObjectContextRef obc = get_object_context(soid, false); + if (!obc || !obc->obs.exists) { + if (pmissing) + *pmissing = soid; + put_snapset_context(ssc); + if (is_primary()) { + if (is_degraded_or_backfilling_object(soid)) { + dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl; + return -EAGAIN; + } else if (is_degraded_on_async_recovery_target(soid)) { + dout(20) << __func__ << " clone is recovering " << soid << dendl; + return -EAGAIN; + } else { + dout(20) << __func__ << " missing clone " << soid << dendl; + return -ENOENT; + } + } else { + dout(20) << __func__ << " replica missing clone" << soid << dendl; + return -ENOENT; + } + } + + if (!obc->ssc) { + obc->ssc = ssc; + } else { + ceph_assert(obc->ssc == ssc); + put_snapset_context(ssc); + } + ssc = 0; + + // clone + dout(20) << __func__ << " " << soid + << " snapset " << obc->ssc->snapset + << dendl; + snapid_t first, last; + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); + if (p->second.empty()) { + dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl; + ceph_assert(!cct->_conf->osd_debug_verify_snaps); + return -ENOENT; + } + if (std::find(p->second.begin(), p->second.end(), oid.snap) == + p->second.end()) { + dout(20) << __func__ << " " << soid << " clone_snaps " << p->second + << " does not contain " << oid.snap << " -- DNE" << dendl; + return -ENOENT; + } + if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) { + dout(20) << __func__ << " " << soid << " snap " << oid.snap + << " in removed_snaps_queue" << " -- DNE" << dendl; + return -ENOENT; + } + dout(20) << __func__ << " " << soid << " clone_snaps " << p->second + << " contains " << oid.snap << " -- HIT " << obc->obs << dendl; + *pobc = obc; + return 0; +} + +void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc) +{ + if (obc->ssc) + put_snapset_context(obc->ssc); +} + +void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat) +{ + object_info_t& oi = obc->obs.oi; + + dout(10) << __func__ << " " << oi.soid << dendl; + ceph_assert(!oi.soid.is_snapdir()); + + object_stat_sum_t stat; + stat.num_objects++; + if (oi.is_dirty()) + stat.num_objects_dirty++; + if (oi.is_whiteout()) + stat.num_whiteouts++; + if (oi.is_omap()) + stat.num_objects_omap++; + if (oi.is_cache_pinned()) + stat.num_objects_pinned++; + if (oi.has_manifest()) + stat.num_objects_manifest++; + + if (oi.soid.is_snap()) { + stat.num_object_clones++; + + if (!obc->ssc) + obc->ssc = get_snapset_context(oi.soid, false); + ceph_assert(obc->ssc); + stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap); + } else { + stat.num_bytes += oi.size; + } + + // add it in + pgstat->stats.sum.add(stat); +} + +void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc) +{ + const 
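// Standalone sketch of the clone selection above: clones are sorted ascending,
// a wanted snap maps to the first clone whose id is >= that snap, and the hit
// only counts if the clone's recorded snap list actually contains the snap.
#include <algorithm>
#include <cstdint>
#include <map>
#include <optional>
#include <vector>

using snapid = uint64_t;

std::optional<snapid> clone_for_snap(
    const std::vector<snapid>& clones,                    // ascending order
    const std::map<snapid, std::vector<snapid>>& clone_snaps,
    snapid snap) {
  auto k = std::lower_bound(clones.begin(), clones.end(), snap);
  if (k == clones.end())
    return std::nullopt;                                   // no clone covers it
  auto p = clone_snaps.find(*k);
  if (p == clone_snaps.end() ||
      std::find(p->second.begin(), p->second.end(), snap) == p->second.end())
    return std::nullopt;                                   // clone exists, snap doesn't
  return *k;                                               // this clone holds the snap
}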
hobject_t& soid = obc->obs.oi.soid; + if (obc->is_blocked()) { + dout(10) << __func__ << " " << soid << " still blocked" << dendl; + return; + } + + map>::iterator p = waiting_for_blocked_object.find(soid); + if (p != waiting_for_blocked_object.end()) { + list& ls = p->second; + dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl; + requeue_ops(ls); + waiting_for_blocked_object.erase(p); + } + + map::iterator i = + objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head()); + if (i != objects_blocked_on_snap_promotion.end()) { + ceph_assert(i->second == obc); + objects_blocked_on_snap_promotion.erase(i); + } + + if (obc->requeue_scrub_on_unblock) { + + obc->requeue_scrub_on_unblock = false; + + dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl; + + // only requeue if we are still active: we may be unblocking + // because we are resetting for a new peering interval + if (is_active()) { + osd->queue_scrub_unblocking(this, is_scrub_blocking_ops()); + } + } +} + +SnapSetContext *PrimaryLogPG::get_snapset_context( + const hobject_t& oid, + bool can_create, + const map *attrs, + bool oid_existed) +{ + std::lock_guard l(snapset_contexts_lock); + SnapSetContext *ssc; + map::iterator p = snapset_contexts.find( + oid.get_snapdir()); + if (p != snapset_contexts.end()) { + if (can_create || p->second->exists) { + ssc = p->second; + } else { + return NULL; + } + } else { + bufferlist bv; + if (!attrs) { + int r = -ENOENT; + if (!(oid.is_head() && !oid_existed)) { + r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv); + } + if (r < 0 && !can_create) + return NULL; + } else { + auto it_ss = attrs->find(SS_ATTR); + ceph_assert(it_ss != attrs->end()); + bv = it_ss->second; + } + ssc = new SnapSetContext(oid.get_snapdir()); + _register_snapset_context(ssc); + if (bv.length()) { + bufferlist::const_iterator bvp = bv.begin(); + try { + ssc->snapset.decode(bvp); + } catch (const ceph::buffer::error& e) { + dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl; + return NULL; + } + ssc->exists = true; + } else { + ssc->exists = false; + } + } + ceph_assert(ssc); + ssc->ref++; + return ssc; +} + +void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc) +{ + std::lock_guard l(snapset_contexts_lock); + --ssc->ref; + if (ssc->ref == 0) { + if (ssc->registered) + snapset_contexts.erase(ssc->oid); + delete ssc; + } +} + +/* + * Return values: + * NONE - didn't pull anything + * YES - pulled what the caller wanted + * HEAD - needed to pull head first + */ +enum { PULL_NONE, PULL_HEAD, PULL_YES }; + +int PrimaryLogPG::recover_missing( + const hobject_t &soid, eversion_t v, + int priority, + PGBackend::RecoveryHandle *h) +{ + if (recovery_state.get_missing_loc().is_unfound(soid)) { + dout(7) << __func__ << " " << soid + << " v " << v + << " but it is unfound" << dendl; + return PULL_NONE; + } + + if (recovery_state.get_missing_loc().is_deleted(soid)) { + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, ObjectContextRef())); + epoch_t cur_epoch = get_osdmap_epoch(); + remove_missing_object(soid, v, new LambdaContext( + [=](int) { + std::scoped_lock locker{*this}; + if (!pg_has_reset_since(cur_epoch)) { + bool object_missing = false; + for (const auto& shard : get_acting_recovery_backfill()) { + if (shard == pg_whoami) + continue; + if (recovery_state.get_peer_missing(shard).is_missing(soid)) { + dout(20) << __func__ << ": soid " << soid << " needs to 
be deleted from replica " << shard << dendl; + object_missing = true; + break; + } + } + if (!object_missing) { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + if (scrub_after_recovery) + stat_diff.num_objects_repaired = 1; + on_global_recover(soid, stat_diff, true); + } else { + auto recovery_handle = pgbackend->open_recovery_op(); + pgbackend->recover_delete_object(soid, v, recovery_handle); + pgbackend->run_recovery_op(recovery_handle, priority); + } + } + })); + return PULL_YES; + } + + // is this a snapped object? if so, consult the snapset.. we may not need the entire object! + ObjectContextRef obc; + ObjectContextRef head_obc; + if (soid.snap && soid.snap < CEPH_NOSNAP) { + // do we have the head? + hobject_t head = soid.get_head(); + if (recovery_state.get_pg_log().get_missing().is_missing(head)) { + if (recovering.count(head)) { + dout(10) << " missing but already recovering head " << head << dendl; + return PULL_NONE; + } else { + int r = recover_missing( + head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority, + h); + if (r != PULL_NONE) + return PULL_HEAD; + return PULL_NONE; + } + } + head_obc = get_object_context( + head, + false, + 0); + ceph_assert(head_obc); + } + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, obc)); + int r = pgbackend->recover_object( + soid, + v, + head_obc, + obc, + h); + // This is only a pull which shouldn't return an error + ceph_assert(r >= 0); + return PULL_YES; +} + +void PrimaryLogPG::remove_missing_object(const hobject_t &soid, + eversion_t v, Context *on_complete) +{ + dout(20) << __func__ << " " << soid << " " << v << dendl; + ceph_assert(on_complete != nullptr); + // delete locally + ObjectStore::Transaction t; + remove_snap_mapped_object(t, soid); + + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = v; + + epoch_t cur_epoch = get_osdmap_epoch(); + t.register_on_complete(new LambdaContext( + [=](int) { + std::unique_lock locker{*this}; + if (!pg_has_reset_since(cur_epoch)) { + ObjectStore::Transaction t2; + on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2); + t2.register_on_complete(on_complete); + int r = osd->store->queue_transaction(ch, std::move(t2), nullptr); + ceph_assert(r == 0); + locker.unlock(); + } else { + locker.unlock(); + on_complete->complete(-EAGAIN); + } + })); + int r = osd->store->queue_transaction(ch, std::move(t), nullptr); + ceph_assert(r == 0); +} + +void PrimaryLogPG::finish_degraded_object(const hobject_t oid) +{ + dout(10) << __func__ << " " << oid << dendl; + if (callbacks_for_degraded_object.count(oid)) { + list contexts; + contexts.swap(callbacks_for_degraded_object[oid]); + callbacks_for_degraded_object.erase(oid); + for (list::iterator i = contexts.begin(); + i != contexts.end(); + ++i) { + (*i)->complete(0); + } + } + map::iterator i = objects_blocked_on_degraded_snap.find( + oid.get_head()); + if (i != objects_blocked_on_degraded_snap.end() && + i->second == oid.snap) + objects_blocked_on_degraded_snap.erase(i); +} + +void PrimaryLogPG::_committed_pushed_object( + epoch_t epoch, eversion_t last_complete) +{ + std::scoped_lock locker{*this}; + if (!pg_has_reset_since(epoch)) { + recovery_state.recovery_committed_to(last_complete); + } else { + dout(10) << __func__ + << " pg has changed, not touching last_complete_ondisk" << dendl; + } +} + +void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc) +{ + dout(20) << __func__ << 
dendl; + if (obc) { + dout(20) << "obc = " << *obc << dendl; + } + ceph_assert(active_pushes >= 1); + --active_pushes; + + // requeue an active chunky scrub waiting on recovery ops + if (!recovery_state.is_deleting() && active_pushes == 0 && + m_scrubber->is_scrub_active()) { + + osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops()); + } +} + +void PrimaryLogPG::_applied_recovered_object_replica() +{ + dout(20) << __func__ << dendl; + ceph_assert(active_pushes >= 1); + --active_pushes; + + // requeue an active scrub waiting on recovery ops + if (!recovery_state.is_deleting() && active_pushes == 0 && + m_scrubber->is_scrub_active()) { + + osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority()); + } +} + +void PrimaryLogPG::on_failed_pull( + const set &from, + const hobject_t &soid, + const eversion_t &v) +{ + dout(20) << __func__ << ": " << soid << dendl; + ceph_assert(recovering.count(soid)); + auto obc = recovering[soid]; + if (obc) { + list blocked_ops; + obc->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + recovering.erase(soid); + for (auto&& i : from) { + if (i != pg_whoami) { // we'll get it below in primary_error + recovery_state.force_object_missing(i, soid, v); + } + } + + dout(0) << __func__ << " " << soid << " from shard " << from + << ", reps on " << recovery_state.get_missing_loc().get_locations(soid) + << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid) + << dendl; + finish_recovery_op(soid); // close out this attempt, + finish_degraded_object(soid); + + if (from.count(pg_whoami)) { + dout(0) << " primary missing oid " << soid << " version " << v << dendl; + primary_error(soid, v); + backfills_in_flight.erase(soid); + } +} + +eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid) +{ + eversion_t v; + pg_missing_item pmi; + bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi); + ceph_assert(is_missing); + v = pmi.have; + dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl; + + ceph_assert(!get_acting_recovery_backfill().empty()); + for (set::iterator i = get_acting_recovery_backfill().begin(); + i != get_acting_recovery_backfill().end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + if (!recovery_state.get_peer_missing(peer).is_missing(oid)) { + continue; + } + eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have; + dout(10) << "pick_newest_available " << oid << " " << h << " on osd." 
<< peer << dendl; + if (h > v) + v = h; + } + + dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl; + return v; +} + +void PrimaryLogPG::do_update_log_missing(OpRequestRef &op) +{ + const MOSDPGUpdateLogMissing *m = static_cast( + op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING); + ObjectStore::Transaction t; + std::optional op_trim_to, op_roll_forward_to; + if (m->pg_trim_to != eversion_t()) + op_trim_to = m->pg_trim_to; + if (m->pg_roll_forward_to != eversion_t()) + op_roll_forward_to = m->pg_roll_forward_to; + + dout(20) << __func__ + << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl; + + recovery_state.append_log_entries_update_missing( + m->entries, t, op_trim_to, op_roll_forward_to); + eversion_t new_lcod = info.last_complete; + + Context *complete = new LambdaContext( + [=](int) { + const MOSDPGUpdateLogMissing *msg = static_cast( + op->get_req()); + std::scoped_lock locker{*this}; + if (!pg_has_reset_since(msg->get_epoch())) { + update_last_complete_ondisk(new_lcod); + MOSDPGUpdateLogMissingReply *reply = + new MOSDPGUpdateLogMissingReply( + spg_t(info.pgid.pgid, primary_shard().shard), + pg_whoami.shard, + msg->get_epoch(), + msg->min_epoch, + msg->get_tid(), + new_lcod); + reply->set_priority(CEPH_MSG_PRIO_HIGH); + msg->get_connection()->send_message(reply); + } + }); + + if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + t.register_on_commit(complete); + } else { + /* Hack to work around the fact that ReplicatedBackend sends + * ack+commit if commit happens first + * + * This behavior is no longer necessary, but we preserve it so old + * primaries can keep their repops in order */ + if (pool.info.is_erasure()) { + t.register_on_complete(complete); + } else { + t.register_on_commit(complete); + } + } + int tr = osd->store->queue_transaction( + ch, + std::move(t), + nullptr); + ceph_assert(tr == 0); + op_applied(info.last_update); +} + +void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op) +{ + const MOSDPGUpdateLogMissingReply *m = + static_cast( + op->get_req()); + dout(20) << __func__ << " got reply from " + << m->get_from() << dendl; + + auto it = log_entry_update_waiting_on.find(m->get_tid()); + if (it != log_entry_update_waiting_on.end()) { + if (it->second.waiting_on.count(m->get_from())) { + it->second.waiting_on.erase(m->get_from()); + if (m->last_complete_ondisk != eversion_t()) { + update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk); + } + } else { + osd->clog->error() + << info.pgid << " got reply " + << *m << " from shard we are not waiting for " + << m->get_from(); + } + + if (it->second.waiting_on.empty()) { + repop_all_committed(it->second.repop.get()); + log_entry_update_waiting_on.erase(it); + } + } else { + osd->clog->error() + << info.pgid << " got reply " + << *m << " on unknown tid " << m->get_tid(); + } +} + +/* Mark all unfound objects as lost. 
+ */ +void PrimaryLogPG::mark_all_unfound_lost( + int what, + std::function on_finish) +{ + dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl; + list oids; + + dout(30) << __func__ << ": log before:\n"; + recovery_state.get_pg_log().get_log().print(*_dout); + *_dout << dendl; + + mempool::osd_pglog::list log_entries; + + utime_t mtime = ceph_clock_now(); + map::const_iterator m = + recovery_state.get_missing_loc().get_needs_recovery().begin(); + map::const_iterator mend = + recovery_state.get_missing_loc().get_needs_recovery().end(); + + ObcLockManager manager; + eversion_t v = get_next_version(); + v.epoch = get_osdmap_epoch(); + uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound(); + while (m != mend) { + const hobject_t &oid(m->first); + if (!recovery_state.get_missing_loc().is_unfound(oid)) { + // We only care about unfound objects + ++m; + continue; + } + + ObjectContextRef obc; + eversion_t prev; + + switch (what) { + case pg_log_entry_t::LOST_MARK: + ceph_abort_msg("actually, not implemented yet!"); + break; + + case pg_log_entry_t::LOST_REVERT: + prev = pick_newest_available(oid); + if (prev > eversion_t()) { + // log it + pg_log_entry_t e( + pg_log_entry_t::LOST_REVERT, oid, v, + m->second.need, 0, osd_reqid_t(), mtime, 0); + e.reverting_to = prev; + e.mark_unrollbackable(); + log_entries.push_back(e); + dout(10) << e << dendl; + + // we are now missing the new version; recovery code will sort it out. + ++v.version; + ++m; + break; + } + + case pg_log_entry_t::LOST_DELETE: + { + pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need, + 0, osd_reqid_t(), mtime, 0); + if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) { + if (pool.info.require_rollback()) { + e.mod_desc.try_rmobject(v.version); + } else { + e.mark_unrollbackable(); + } + } // otherwise, just do what we used to do + dout(10) << e << dendl; + log_entries.push_back(e); + oids.push_back(oid); + + // If context found mark object as deleted in case + // of racing with new creation. This can happen if + // object lost and EIO at primary. 
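+ // note: object_contexts.lookup() only returns a context that is already
+ // cached; nothing is created here, the cached copy (if any) simply has
+ // obs.exists cleared to match the LOST_DELETE entry logged above.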
+ obc = object_contexts.lookup(oid); + if (obc) + obc->obs.exists = false; + + ++v.version; + ++m; + } + break; + + default: + ceph_abort(); + } + } + + recovery_state.update_stats( + [](auto &history, auto &stats) { + stats.stats_invalid = true; + return false; + }); + + submit_log_entries( + log_entries, + std::move(manager), + std::optional >( + [this, oids, num_unfound, on_finish]() { + if (recovery_state.perform_deletes_during_peering()) { + for (auto oid : oids) { + // clear old locations - merge_new_log_entries will have + // handled rebuilding missing_loc for each of these + // objects if we have the RECOVERY_DELETES flag + recovery_state.object_recovered(oid, object_stat_sum_t()); + } + } + + if (is_recovery_unfound()) { + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery()))); + } else if (is_backfill_unfound()) { + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } else { + queue_recovery(); + } + + stringstream ss; + ss << "pg has " << num_unfound + << " objects unfound and apparently lost marking"; + string rs = ss.str(); + dout(0) << "do_command r=" << 0 << " " << rs << dendl; + osd->clog->info() << rs; + bufferlist empty; + on_finish(0, rs, empty); + }), + OpRequestRef()); +} + +void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits) +{ + ceph_assert(repop_queue.empty()); +} + +/* + * pg status change notification + */ + +void PrimaryLogPG::apply_and_flush_repops(bool requeue) +{ + list rq; + + // apply all repops + while (!repop_queue.empty()) { + RepGather *repop = repop_queue.front(); + repop_queue.pop_front(); + dout(10) << " canceling repop tid " << repop->rep_tid << dendl; + repop->rep_aborted = true; + repop->on_committed.clear(); + repop->on_success.clear(); + + if (requeue) { + if (repop->op) { + dout(10) << " requeuing " << *repop->op->get_req() << dendl; + rq.push_back(repop->op); + repop->op = OpRequestRef(); + } + + // also requeue any dups, interleaved into position + auto p = waiting_for_ondisk.find(repop->v); + if (p != waiting_for_ondisk.end()) { + dout(10) << " also requeuing ondisk waiters " << p->second << dendl; + for (auto& i : p->second) { + rq.push_back(std::get<0>(i)); + } + waiting_for_ondisk.erase(p); + } + } + + remove_repop(repop); + } + + ceph_assert(repop_queue.empty()); + + if (requeue) { + requeue_ops(rq); + if (!waiting_for_ondisk.empty()) { + for (auto& i : waiting_for_ondisk) { + for (auto& j : i.second) { + derr << __func__ << ": op " << *(std::get<0>(j)->get_req()) + << " waiting on " << i.first << dendl; + } + } + ceph_assert(waiting_for_ondisk.empty()); + } + } + + waiting_for_ondisk.clear(); +} + +void PrimaryLogPG::on_flushed() +{ + requeue_ops(waiting_for_flush); + if (!is_peered() || !is_primary()) { + pair i; + while (object_contexts.get_next(i.first, &i)) { + derr << __func__ << ": object " << i.first << " obc still alive" << dendl; + } + ceph_assert(object_contexts.empty()); + } +} + +void PrimaryLogPG::on_removal(ObjectStore::Transaction &t) +{ + dout(10) << __func__ << dendl; + + on_shutdown(); + + t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch())); +} + +void PrimaryLogPG::clear_async_reads() +{ + dout(10) << __func__ << dendl; + for(auto& i : in_progress_async_reads) { + dout(10) << "clear ctx: " + << "OpRequestRef " << i.first + << " OpContext " << i.second + << dendl; + close_op_ctx(i.second); + } +} + +void 
PrimaryLogPG::clear_cache() +{ + object_contexts.clear(); +} + +void PrimaryLogPG::on_shutdown() +{ + dout(10) << __func__ << dendl; + + if (recovery_queued) { + recovery_queued = false; + osd->clear_queued_recovery(this); + } + + m_scrubber->scrub_clear_state(); + + m_scrubber->unreg_next_scrub(); + + vector tids; + cancel_copy_ops(false, &tids); + cancel_flush_ops(false, &tids); + cancel_proxy_ops(false, &tids); + cancel_manifest_ops(false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + + apply_and_flush_repops(false); + cancel_log_updates(); + // we must remove PGRefs, so do this this prior to release_backoffs() callers + clear_backoffs(); + // clean up snap trim references + snap_trimmer_machine.process_event(Reset()); + + pgbackend->on_change(); + + context_registry_on_change(); + object_contexts.clear(); + + clear_async_reads(); + + osd->remote_reserver.cancel_reservation(info.pgid); + osd->local_reserver.cancel_reservation(info.pgid); + + clear_primary_state(); + cancel_recovery(); + + if (is_primary()) { + osd->clear_ready_to_merge(this); + } +} + +void PrimaryLogPG::on_activate_complete() +{ + check_local(); + // waiters + if (!recovery_state.needs_flush()) { + requeue_ops(waiting_for_peered); + } else if (!waiting_for_peered.empty()) { + dout(10) << __func__ << " flushes in progress, moving " + << waiting_for_peered.size() + << " items to waiting_for_flush" + << dendl; + ceph_assert(waiting_for_flush.empty()); + waiting_for_flush.swap(waiting_for_peered); + } + + + // all clean? + if (needs_recovery()) { + dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery()))); + } else if (needs_backfill()) { + dout(10) << "activate queueing backfill" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } else { + dout(10) << "activate all replicas clean, no recovery" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered()))); + } + + publish_stats_to_osd(); + + if (get_backfill_targets().size()) { + last_backfill_started = recovery_state.earliest_backfill(); + new_backfill = true; + ceph_assert(!last_backfill_started.is_max()); + dout(5) << __func__ << ": bft=" << get_backfill_targets() + << " from " << last_backfill_started << dendl; + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + dout(5) << "target shard " << *i + << " from " << recovery_state.get_peer_info(*i).last_backfill + << dendl; + } + } + + hit_set_setup(); + agent_setup(); +} + +void PrimaryLogPG::on_change(ObjectStore::Transaction &t) +{ + dout(10) << __func__ << dendl; + + if (hit_set && hit_set->insert_count() == 0) { + dout(20) << " discarding empty hit_set" << dendl; + hit_set_clear(); + } + + if (recovery_queued) { + recovery_queued = false; + osd->clear_queued_recovery(this); + } + + // requeue everything in the reverse order they should be + // reexamined. 
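+ // note: none of the waiting_for_* ops below are serviced with state from
+ // the old interval; they are all pushed back through the queue and
+ // re-evaluated against the new one.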
+ requeue_ops(waiting_for_peered); + requeue_ops(waiting_for_flush); + requeue_ops(waiting_for_active); + requeue_ops(waiting_for_readable); + + vector tids; + cancel_copy_ops(is_primary(), &tids); + cancel_flush_ops(is_primary(), &tids); + cancel_proxy_ops(is_primary(), &tids); + cancel_manifest_ops(is_primary(), &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + + // requeue object waiters + for (auto& p : waiting_for_unreadable_object) { + release_backoffs(p.first); + } + if (is_primary()) { + requeue_object_waiters(waiting_for_unreadable_object); + } else { + waiting_for_unreadable_object.clear(); + } + for (map>::iterator p = waiting_for_degraded_object.begin(); + p != waiting_for_degraded_object.end(); + waiting_for_degraded_object.erase(p++)) { + release_backoffs(p->first); + if (is_primary()) + requeue_ops(p->second); + else + p->second.clear(); + finish_degraded_object(p->first); + } + + // requeues waiting_for_scrub + m_scrubber->scrub_clear_state(); + + for (auto p = waiting_for_blocked_object.begin(); + p != waiting_for_blocked_object.end(); + waiting_for_blocked_object.erase(p++)) { + if (is_primary()) + requeue_ops(p->second); + else + p->second.clear(); + } + for (auto i = callbacks_for_degraded_object.begin(); + i != callbacks_for_degraded_object.end(); + ) { + finish_degraded_object((i++)->first); + } + ceph_assert(callbacks_for_degraded_object.empty()); + + if (is_primary()) { + requeue_ops(waiting_for_cache_not_full); + } else { + waiting_for_cache_not_full.clear(); + } + objects_blocked_on_cache_full.clear(); + + for (list >::iterator i = + in_progress_async_reads.begin(); + i != in_progress_async_reads.end(); + in_progress_async_reads.erase(i++)) { + close_op_ctx(i->second); + if (is_primary()) + requeue_op(i->first); + } + + // this will requeue ops we were working on but didn't finish, and + // any dups + apply_and_flush_repops(is_primary()); + cancel_log_updates(); + + // do this *after* apply_and_flush_repops so that we catch any newly + // registered watches. + context_registry_on_change(); + + pgbackend->on_change_cleanup(&t); + m_scrubber->cleanup_store(&t); + pgbackend->on_change(); + + // clear snap_trimmer state + snap_trimmer_machine.process_event(Reset()); + + debug_op_order.clear(); + unstable_stats.clear(); + + // we don't want to cache object_contexts through the interval change + // NOTE: we actually assert that all currently live references are dead + // by the time the flush for the next interval completes. + object_contexts.clear(); + + // should have been cleared above by finishing all of the degraded objects + ceph_assert(objects_blocked_on_degraded_snap.empty()); +} + +void PrimaryLogPG::plpg_on_role_change() +{ + dout(10) << __func__ << dendl; + if (get_role() != 0 && hit_set) { + dout(10) << " clearing hit set" << dendl; + hit_set_clear(); + } +} + +void PrimaryLogPG::plpg_on_pool_change() +{ + dout(10) << __func__ << dendl; + // requeue cache full waiters just in case the cache_mode is + // changing away from writeback mode. note that if we are not + // active the normal requeuing machinery is sufficient (and properly + // ordered). + if (is_active() && + pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK && + !waiting_for_cache_not_full.empty()) { + dout(10) << __func__ << " requeuing full waiters (not in writeback) " + << dendl; + requeue_ops(waiting_for_cache_not_full); + objects_blocked_on_cache_full.clear(); + } + hit_set_setup(); + agent_setup(); +} + +// clear state. called on recovery completion AND cancellation. 
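+ // it drops last_backfill_started, backfills_in_flight,
+ // pending_backfill_updates and the recovering map (requeuing any ops that
+ // were blocked behind a recovery read lock), then has the backend drop its
+ // own per-op state via clear_recovery_state().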
+void PrimaryLogPG::_clear_recovery_state() +{ +#ifdef DEBUG_RECOVERY_OIDS + recovering_oids.clear(); +#endif + dout(15) << __func__ << " flags: " << m_planned_scrub << dendl; + + last_backfill_started = hobject_t(); + set::iterator i = backfills_in_flight.begin(); + while (i != backfills_in_flight.end()) { + backfills_in_flight.erase(i++); + } + + list blocked_ops; + for (map::iterator i = recovering.begin(); + i != recovering.end(); + recovering.erase(i++)) { + if (i->second) { + i->second->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + } + ceph_assert(backfills_in_flight.empty()); + pending_backfill_updates.clear(); + ceph_assert(recovering.empty()); + pgbackend->clear_recovery_state(); +} + +void PrimaryLogPG::cancel_pull(const hobject_t &soid) +{ + dout(20) << __func__ << ": " << soid << dendl; + ceph_assert(recovering.count(soid)); + ObjectContextRef obc = recovering[soid]; + if (obc) { + list blocked_ops; + obc->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + recovering.erase(soid); + finish_recovery_op(soid); + release_backoffs(soid); + if (waiting_for_degraded_object.count(soid)) { + dout(20) << " kicking degraded waiters on " << soid << dendl; + requeue_ops(waiting_for_degraded_object[soid]); + waiting_for_degraded_object.erase(soid); + } + if (waiting_for_unreadable_object.count(soid)) { + dout(20) << " kicking unreadable waiters on " << soid << dendl; + requeue_ops(waiting_for_unreadable_object[soid]); + waiting_for_unreadable_object.erase(soid); + } + if (is_missing_object(soid)) + recovery_state.set_last_requested(0); + finish_degraded_object(soid); +} + +void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap) +{ + pgbackend->check_recovery_sources(osdmap); +} + +bool PrimaryLogPG::start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, + uint64_t *ops_started) +{ + uint64_t& started = *ops_started; + started = 0; + bool work_in_progress = false; + bool recovery_started = false; + ceph_assert(is_primary()); + ceph_assert(is_peered()); + ceph_assert(!recovery_state.is_deleting()); + + ceph_assert(recovery_queued); + recovery_queued = false; + + if (!state_test(PG_STATE_RECOVERING) && + !state_test(PG_STATE_BACKFILLING)) { + /* TODO: I think this case is broken and will make do_recovery() + * unhappy since we're returning false */ + dout(10) << "recovery raced and were queued twice, ignoring!" << dendl; + return have_unfound(); + } + + const auto &missing = recovery_state.get_pg_log().get_missing(); + + uint64_t num_unfound = get_num_unfound(); + + if (!recovery_state.have_missing()) { + recovery_state.local_recovery_complete(); + } + + if (!missing.have_missing() || // Primary does not have missing + // or all of the missing objects are unfound. + recovery_state.all_missing_unfound()) { + // Recover the replicas. + started = recover_replicas(max, handle, &recovery_started); + } + if (!started) { + // We still have missing objects that we should grab from replicas. 
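+ // note: recover_primary() walks the primary's own missing set in version
+ // order and queues pulls; it only runs when the replica pass above queued
+ // nothing (started == 0).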
+ started += recover_primary(max, handle); + } + if (!started && num_unfound != get_num_unfound()) { + // second chance to recovery replicas + started = recover_replicas(max, handle, &recovery_started); + } + + if (started || recovery_started) + work_in_progress = true; + + bool deferred_backfill = false; + if (recovering.empty() && + state_test(PG_STATE_BACKFILLING) && + !get_backfill_targets().empty() && started < max && + missing.num_missing() == 0 && + waiting_on_backfill.empty()) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) { + dout(10) << "deferring backfill due to NOBACKFILL" << dendl; + deferred_backfill = true; + } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) && + !is_degraded()) { + dout(10) << "deferring backfill due to NOREBALANCE" << dendl; + deferred_backfill = true; + } else if (!recovery_state.is_backfill_reserved()) { + /* DNMNOTE I think this branch is dead */ + dout(10) << "deferring backfill due to !backfill_reserved" << dendl; + if (!backfill_reserving) { + dout(10) << "queueing RequestBackfill" << dendl; + backfill_reserving = true; + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } + deferred_backfill = true; + } else { + started += recover_backfill(max - started, handle, &work_in_progress); + } + } + + dout(10) << " started " << started << dendl; + osd->logger->inc(l_osd_rop, started); + + if (!recovering.empty() || + work_in_progress || recovery_ops_active > 0 || deferred_backfill) + return !work_in_progress && have_unfound(); + + ceph_assert(recovering.empty()); + ceph_assert(recovery_ops_active == 0); + + dout(10) << __func__ << " needs_recovery: " + << recovery_state.get_missing_loc().get_needs_recovery() + << dendl; + dout(10) << __func__ << " missing_loc: " + << recovery_state.get_missing_loc().get_missing_locs() + << dendl; + int unfound = get_num_unfound(); + if (unfound) { + dout(10) << " still have " << unfound << " unfound" << dendl; + return true; + } + + if (missing.num_missing() > 0) { + // this shouldn't happen! + osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with " + << missing.num_missing() << ": " << missing.get_items(); + return false; + } + + if (needs_recovery()) { + // this shouldn't happen! + // We already checked num_missing() so we must have missing replicas + osd->clog->error() << info.pgid + << " Unexpected Error: recovery ending with missing replicas"; + return false; + } + + if (state_test(PG_STATE_RECOVERING)) { + state_clear(PG_STATE_RECOVERING); + state_clear(PG_STATE_FORCED_RECOVERY); + if (needs_backfill()) { + dout(10) << "recovery done, queuing backfill" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } else { + dout(10) << "recovery done, no backfill" << dendl; + state_clear(PG_STATE_FORCED_BACKFILL); + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered()))); + } + } else { // backfilling + state_clear(PG_STATE_BACKFILLING); + state_clear(PG_STATE_FORCED_BACKFILL); + state_clear(PG_STATE_FORCED_RECOVERY); + dout(10) << "recovery done, backfill done" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::Backfilled()))); + } + + return false; +} + +/** + * do one recovery op. 
+ * return true if done, false if nothing left to do. + */ +uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle) +{ + ceph_assert(is_primary()); + + const auto &missing = recovery_state.get_pg_log().get_missing(); + + dout(10) << __func__ << " recovering " << recovering.size() + << " in pg," + << " missing " << missing << dendl; + + dout(25) << __func__ << " " << missing.get_items() << dendl; + + // look at log! + pg_log_entry_t *latest = 0; + unsigned started = 0; + int skipped = 0; + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + map::const_iterator p = + missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested); + while (p != missing.get_rmissing().end()) { + handle.reset_tp_timeout(); + hobject_t soid; + version_t v = p->first; + + auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second); + if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) { + latest = it_objects->second; + ceph_assert(latest->is_update() || latest->is_delete()); + soid = latest->soid; + } else { + latest = 0; + soid = p->second; + } + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + + hobject_t head = soid.get_head(); + + eversion_t need = item.need; + + dout(10) << __func__ << " " + << soid << " " << item.need + << (missing.is_missing(soid) ? " (missing)":"") + << (missing.is_missing(head) ? " (missing head)":"") + << (recovering.count(soid) ? " (recovering)":"") + << (recovering.count(head) ? " (recovering head)":"") + << dendl; + + if (latest) { + switch (latest->op) { + case pg_log_entry_t::CLONE: + /* + * Handling for this special case removed for now, until we + * can correctly construct an accurate SnapSet from the old + * one. + */ + break; + + case pg_log_entry_t::LOST_REVERT: + { + if (item.have == latest->reverting_to) { + ObjectContextRef obc = get_object_context(soid, true); + + if (obc->obs.oi.version == latest->version) { + // I'm already reverting + dout(10) << " already reverting " << soid << dendl; + } else { + dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl; + obc->obs.oi.version = latest->version; + + ObjectStore::Transaction t; + bufferlist b2; + obc->obs.oi.encode( + b2, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + ceph_assert(!pool.info.require_rollback()); + t.setattr(coll, ghobject_t(soid), OI_ATTR, b2); + + recovery_state.recover_got( + soid, + latest->version, + false, + t); + + ++active_pushes; + + t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc)); + t.register_on_commit(new C_OSD_CommittedPushedObject( + this, + get_osdmap_epoch(), + info.last_complete)); + osd->store->queue_transaction(ch, std::move(t)); + continue; + } + } else { + /* + * Pull the old version of the object. Update missing_loc here to have the location + * of the version we want. + * + * This doesn't use the usual missing_loc paths, but that's okay: + * - if we have it locally, we hit the case above, and go from there. + * - if we don't, we always pass through this case during recovery and set up the location + * properly. + * - this way we don't need to mangle the missing code to be general about needing an old + * version... 
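+ * (set_revert_with_targets() below records in missing_loc only the peers
+ * whose 'have' matches reverting_to, so the pull is restricted to copies
+ * already at the version we are reverting to.)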
+ */ + eversion_t alternate_need = latest->reverting_to; + dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl; + + set good_peers; + for (auto p = recovery_state.get_peer_missing().begin(); + p != recovery_state.get_peer_missing().end(); + ++p) { + if (p->second.is_missing(soid, need) && + p->second.get_items().at(soid).have == alternate_need) { + good_peers.insert(p->first); + } + } + recovery_state.set_revert_with_targets( + soid, + good_peers); + dout(10) << " will pull " << alternate_need << " or " << need + << " from one of " + << recovery_state.get_missing_loc().get_locations(soid) + << dendl; + } + } + break; + } + } + + if (!recovering.count(soid)) { + if (recovering.count(head)) { + ++skipped; + } else { + int r = recover_missing( + soid, need, get_recovery_op_priority(), h); + switch (r) { + case PULL_YES: + ++started; + break; + case PULL_HEAD: + ++started; + case PULL_NONE: + ++skipped; + break; + default: + ceph_abort(); + } + if (started >= max) + break; + } + } + + // only advance last_requested if we haven't skipped anything + if (!skipped) + recovery_state.set_last_requested(v); + } + + pgbackend->run_recovery_op(h, get_recovery_op_priority()); + return started; +} + +bool PrimaryLogPG::primary_error( + const hobject_t& soid, eversion_t v) +{ + recovery_state.force_object_missing(pg_whoami, soid, v); + bool uhoh = recovery_state.get_missing_loc().is_unfound(soid); + if (uhoh) + osd->clog->error() << info.pgid << " missing primary copy of " + << soid << ", unfound"; + else + osd->clog->error() << info.pgid << " missing primary copy of " + << soid + << ", will try copies on " + << recovery_state.get_missing_loc().get_locations(soid); + return uhoh; +} + +int PrimaryLogPG::prep_object_replica_deletes( + const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started) +{ + ceph_assert(is_primary()); + dout(10) << __func__ << ": on " << soid << dendl; + + ObjectContextRef obc = get_object_context(soid, false); + if (obc) { + if (!obc->get_recovery_read()) { + dout(20) << "replica delete delayed on " << soid + << "; could not get rw_manager lock" << dendl; + *work_started = true; + return 0; + } else { + dout(20) << "replica delete got recovery read lock on " << soid + << dendl; + } + } + + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + if (!obc) + recovering.insert(make_pair(soid, ObjectContextRef())); + else + recovering.insert(make_pair(soid, obc)); + + pgbackend->recover_delete_object(soid, v, h); + return 1; +} + +int PrimaryLogPG::prep_object_replica_pushes( + const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started) +{ + ceph_assert(is_primary()); + dout(10) << __func__ << ": on " << soid << dendl; + + if (soid.snap && soid.snap < CEPH_NOSNAP) { + // do we have the head and/or snapdir? + hobject_t head = soid.get_head(); + if (recovery_state.get_pg_log().get_missing().is_missing(head)) { + if (recovering.count(head)) { + dout(10) << " missing but already recovering head " << head << dendl; + return 0; + } else { + int r = recover_missing( + head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, + get_recovery_op_priority(), h); + if (r != PULL_NONE) + return 1; + return 0; + } + } + } + + // NOTE: we know we will get a valid oloc off of disk here. 
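+ // note: the primary is not missing this object, so get_object_context(soid,
+ // false) below is expected to succeed; if it does not, the primary copy is
+ // treated as missing via primary_error() instead of being pushed.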
+ ObjectContextRef obc = get_object_context(soid, false); + if (!obc) { + primary_error(soid, v); + return 0; + } + + if (!obc->get_recovery_read()) { + dout(20) << "recovery delayed on " << soid + << "; could not get rw_manager lock" << dendl; + *work_started = true; + return 0; + } else { + dout(20) << "recovery got recovery read lock on " << soid + << dendl; + } + + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, obc)); + + int r = pgbackend->recover_object( + soid, + v, + ObjectContextRef(), + obc, // has snapset context + h); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl; + on_failed_pull({ pg_whoami }, soid, v); + return 0; + } + return 1; +} + +uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle, + bool *work_started) +{ + dout(10) << __func__ << "(" << max << ")" << dendl; + uint64_t started = 0; + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + + // this is FAR from an optimal recovery order. pretty lame, really. + ceph_assert(!get_acting_recovery_backfill().empty()); + // choose replicas to recover, replica has the shortest missing list first + // so we can bring it back to normal ASAP + std::vector> replicas_by_num_missing, + async_by_num_missing; + replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1); + for (auto &p: get_acting_recovery_backfill()) { + if (p == get_primary()) { + continue; + } + auto pm = recovery_state.get_peer_missing().find(p); + ceph_assert(pm != recovery_state.get_peer_missing().end()); + auto nm = pm->second.num_missing(); + if (nm != 0) { + if (is_async_recovery_target(p)) { + async_by_num_missing.push_back(make_pair(nm, p)); + } else { + replicas_by_num_missing.push_back(make_pair(nm, p)); + } + } + } + // sort by number of missing objects, in ascending order. + auto func = [](const std::pair &lhs, + const std::pair &rhs) { + return lhs.first < rhs.first; + }; + // acting goes first + std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func); + // then async_recovery_targets + std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func); + replicas_by_num_missing.insert(replicas_by_num_missing.end(), + async_by_num_missing.begin(), async_by_num_missing.end()); + for (auto &replica: replicas_by_num_missing) { + pg_shard_t &peer = replica.second; + ceph_assert(peer != get_primary()); + auto pm = recovery_state.get_peer_missing().find(peer); + ceph_assert(pm != recovery_state.get_peer_missing().end()); + size_t m_sz = pm->second.num_missing(); + + dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl; + dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl; + + // oldest first! + const pg_missing_t &m(pm->second); + for (map::const_iterator p = m.get_rmissing().begin(); + p != m.get_rmissing().end() && started < max; + ++p) { + handle.reset_tp_timeout(); + const hobject_t soid(p->second); + + if (recovery_state.get_missing_loc().is_unfound(soid)) { + dout(10) << __func__ << ": " << soid << " still unfound" << dendl; + continue; + } + + const pg_info_t &pi = recovery_state.get_peer_info(peer); + if (soid > pi.last_backfill) { + if (!recovering.count(soid)) { + derr << __func__ << ": object " << soid << " last_backfill " + << pi.last_backfill << dendl; + derr << __func__ << ": object added to missing set for backfill, but " + << "is not in recovering, error!" 
<< dendl; + ceph_abort(); + } + continue; + } + + if (recovering.count(soid)) { + dout(10) << __func__ << ": already recovering " << soid << dendl; + continue; + } + + if (recovery_state.get_missing_loc().is_deleted(soid)) { + dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl; + map::const_iterator r = m.get_items().find(soid); + started += prep_object_replica_deletes(soid, r->second.need, h, work_started); + continue; + } + + if (soid.is_snap() && + recovery_state.get_pg_log().get_missing().is_missing( + soid.get_head())) { + dout(10) << __func__ << ": " << soid.get_head() + << " still missing on primary" << dendl; + continue; + } + + if (recovery_state.get_pg_log().get_missing().is_missing(soid)) { + dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl; + continue; + } + + dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl; + map::const_iterator r = m.get_items().find(soid); + started += prep_object_replica_pushes(soid, r->second.need, h, work_started); + } + } + + pgbackend->run_recovery_op(h, get_recovery_op_priority()); + return started; +} + +hobject_t PrimaryLogPG::earliest_peer_backfill() const +{ + hobject_t e = hobject_t::get_max(); + for (const pg_shard_t& peer : get_backfill_targets()) { + const auto iter = peer_backfill_info.find(peer); + ceph_assert(iter != peer_backfill_info.end()); + e = std::min(e, iter->second.begin); + } + return e; +} + +bool PrimaryLogPG::all_peer_done() const +{ + // Primary hasn't got any more objects + ceph_assert(backfill_info.empty()); + + for (const pg_shard_t& bt : get_backfill_targets()) { + const auto piter = peer_backfill_info.find(bt); + ceph_assert(piter != peer_backfill_info.end()); + const BackfillInterval& pbi = piter->second; + // See if peer has more to process + if (!pbi.extends_to_end() || !pbi.empty()) + return false; + } + return true; +} + +/** + * recover_backfill + * + * Invariants: + * + * backfilled: fully pushed to replica or present in replica's missing set (both + * our copy and theirs). + * + * All objects on a backfill_target in + * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed + * objects have been actually deleted and all logically-valid objects are replicated. + * There may be PG objects in this interval yet to be backfilled. + * + * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all + * backfill_targets. There may be objects on backfill_target(s) yet to be deleted. + * + * For a backfill target, all objects < std::min(peer_backfill_info[target].begin, + * backfill_info.begin) in PG are backfilled. No deleted objects in this + * interval remain on the backfill target. + * + * For a backfill target, all objects <= peer_info[target].last_backfill + * have been backfilled to target + * + * There *MAY* be missing/outdated objects between last_backfill_started and + * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client + * io created objects since the last scan. For this reason, we call + * update_range() again before continuing backfill. + */ +uint64_t PrimaryLogPG::recover_backfill( + uint64_t max, + ThreadPool::TPHandle &handle, bool *work_started) +{ + dout(10) << __func__ << " (" << max << ")" + << " bft=" << get_backfill_targets() + << " last_backfill_started " << last_backfill_started + << (new_backfill ? 
" new_backfill":"") + << dendl; + ceph_assert(!get_backfill_targets().empty()); + + // Initialize from prior backfill state + if (new_backfill) { + // on_activate() was called prior to getting here + ceph_assert(last_backfill_started == recovery_state.earliest_backfill()); + new_backfill = false; + + // initialize BackfillIntervals + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + peer_backfill_info[*i].reset( + recovery_state.get_peer_info(*i).last_backfill); + } + backfill_info.reset(last_backfill_started); + + backfills_in_flight.clear(); + pending_backfill_updates.clear(); + } + + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + dout(10) << "peer osd." << *i + << " info " << recovery_state.get_peer_info(*i) + << " interval " << peer_backfill_info[*i].begin + << "-" << peer_backfill_info[*i].end + << " " << peer_backfill_info[*i].objects.size() << " objects" + << dendl; + } + + // update our local interval to cope with recent changes + backfill_info.begin = last_backfill_started; + update_range(&backfill_info, handle); + + unsigned ops = 0; + vector > to_remove; + set add_to_stat; + + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + peer_backfill_info[*i].trim_to( + std::max( + recovery_state.get_peer_info(*i).last_backfill, + last_backfill_started)); + } + backfill_info.trim_to(last_backfill_started); + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + while (ops < max) { + if (backfill_info.begin <= earliest_peer_backfill() && + !backfill_info.extends_to_end() && backfill_info.empty()) { + hobject_t next = backfill_info.end; + backfill_info.reset(next); + backfill_info.end = hobject_t::get_max(); + update_range(&backfill_info, handle); + backfill_info.trim(); + } + + dout(20) << " my backfill interval " << backfill_info << dendl; + + bool sent_scan = false; + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + + dout(20) << " peer shard " << bt << " backfill " << pbi << dendl; + if (pbi.begin <= backfill_info.begin && + !pbi.extends_to_end() && pbi.empty()) { + dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl; + epoch_t e = get_osdmap_epoch(); + MOSDPGScan *m = new MOSDPGScan( + MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(), + spg_t(info.pgid.pgid, bt.shard), + pbi.end, hobject_t()); + osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch()); + ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end()); + waiting_on_backfill.insert(bt); + sent_scan = true; + } + } + + // Count simultaneous scans as a single op and let those complete + if (sent_scan) { + ops++; + start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end + break; + } + + if (backfill_info.empty() && all_peer_done()) { + dout(10) << " reached end for both local and all peers" << dendl; + break; + } + + // Get object within set of peers to operate on and + // the set of targets for which that object applies. 
+ hobject_t check = earliest_peer_backfill(); + + if (check < backfill_info.begin) { + + set check_targets; + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + if (pbi.begin == check) + check_targets.insert(bt); + } + ceph_assert(!check_targets.empty()); + + dout(20) << " BACKFILL removing " << check + << " from peers " << check_targets << dendl; + for (set::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + ceph_assert(pbi.begin == check); + + to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt)); + pbi.pop_front(); + } + + last_backfill_started = check; + + // Don't increment ops here because deletions + // are cheap and not replied to unlike real recovery_ops, + // and we can't increment ops without requeueing ourself + // for recovery. + } else { + eversion_t& obj_v = backfill_info.objects.begin()->second; + + vector need_ver_targs, missing_targs, keep_ver_targs, skip_targs; + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + // Find all check peers that have the wrong version + if (check == backfill_info.begin && check == pbi.begin) { + if (pbi.objects.begin()->second != obj_v) { + need_ver_targs.push_back(bt); + } else { + keep_ver_targs.push_back(bt); + } + } else { + const pg_info_t& pinfo = recovery_state.get_peer_info(bt); + + // Only include peers that we've caught up to their backfill line + // otherwise, they only appear to be missing this object + // because their pbi.begin > backfill_info.begin. 
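+ // note: if this object lies beyond the peer's last_backfill the peer
+ // genuinely does not have it yet (missing_targs, it will be pushed);
+ // otherwise the peer's last_backfill already covers it and it is merely
+ // outside that peer's current scan window, so it is skipped this round.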
+ if (backfill_info.begin > pinfo.last_backfill) + missing_targs.push_back(bt); + else + skip_targs.push_back(bt); + } + } + + if (!keep_ver_targs.empty()) { + // These peers have version obj_v + dout(20) << " BACKFILL keeping " << check + << " with ver " << obj_v + << " on peers " << keep_ver_targs << dendl; + //assert(!waiting_for_degraded_object.count(check)); + } + if (!need_ver_targs.empty() || !missing_targs.empty()) { + ObjectContextRef obc = get_object_context(backfill_info.begin, false); + ceph_assert(obc); + if (obc->get_recovery_read()) { + if (!need_ver_targs.empty()) { + dout(20) << " BACKFILL replacing " << check + << " with ver " << obj_v + << " to peers " << need_ver_targs << dendl; + } + if (!missing_targs.empty()) { + dout(20) << " BACKFILL pushing " << backfill_info.begin + << " with ver " << obj_v + << " to peers " << missing_targs << dendl; + } + vector all_push = need_ver_targs; + all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end()); + + handle.reset_tp_timeout(); + int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h); + if (r < 0) { + *work_started = true; + dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl; + break; + } + ops++; + } else { + *work_started = true; + dout(20) << "backfill blocking on " << backfill_info.begin + << "; could not get rw_manager lock" << dendl; + break; + } + } + dout(20) << "need_ver_targs=" << need_ver_targs + << " keep_ver_targs=" << keep_ver_targs << dendl; + dout(20) << "backfill_targets=" << get_backfill_targets() + << " missing_targs=" << missing_targs + << " skip_targs=" << skip_targs << dendl; + + last_backfill_started = backfill_info.begin; + add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes? + backfill_info.pop_front(); + vector check_targets = need_ver_targs; + check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end()); + for (vector::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + pbi.pop_front(); + } + } + } + + for (set::iterator i = add_to_stat.begin(); + i != add_to_stat.end(); + ++i) { + ObjectContextRef obc = get_object_context(*i, false); + ceph_assert(obc); + pg_stat_t stat; + add_object_context_to_pg_stat(obc, &stat); + pending_backfill_updates[*i] = stat; + } + map reqs; + for (unsigned i = 0; i < to_remove.size(); ++i) { + handle.reset_tp_timeout(); + const hobject_t& oid = to_remove[i].get<0>(); + eversion_t v = to_remove[i].get<1>(); + pg_shard_t peer = to_remove[i].get<2>(); + MOSDPGBackfillRemove *m; + auto it = reqs.find(peer); + if (it != reqs.end()) { + m = it->second; + } else { + m = reqs[peer] = new MOSDPGBackfillRemove( + spg_t(info.pgid.pgid, peer.shard), + get_osdmap_epoch()); + } + m->ls.push_back(make_pair(oid, v)); + + if (oid <= last_backfill_started) + pending_backfill_updates[oid]; // add empty stat! + } + for (auto p : reqs) { + osd->send_message_osd_cluster(p.first.osd, p.second, + get_osdmap_epoch()); + } + + pgbackend->run_recovery_op(h, get_recovery_op_priority()); + + hobject_t backfill_pos = + std::min(backfill_info.begin, earliest_peer_backfill()); + dout(5) << "backfill_pos is " << backfill_pos << dendl; + for (set::iterator i = backfills_in_flight.begin(); + i != backfills_in_flight.end(); + ++i) { + dout(20) << *i << " is still in flight" << dendl; + } + + hobject_t next_backfill_to_complete = backfills_in_flight.empty() ? 
+ backfill_pos : *(backfills_in_flight.begin()); + hobject_t new_last_backfill = recovery_state.earliest_backfill(); + dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl; + for (map::iterator i = + pending_backfill_updates.begin(); + i != pending_backfill_updates.end() && + i->first < next_backfill_to_complete; + pending_backfill_updates.erase(i++)) { + dout(20) << " pending_backfill_update " << i->first << dendl; + ceph_assert(i->first > new_last_backfill); + // carried from a previous round – if we are here, then we had to + // be requeued (by e.g. on_global_recover()) and those operations + // are done. + recovery_state.update_complete_backfill_object_stats( + i->first, + i->second); + new_last_backfill = i->first; + } + dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl; + + ceph_assert(!pending_backfill_updates.empty() || + new_last_backfill == last_backfill_started); + if (pending_backfill_updates.empty() && + backfill_pos.is_max()) { + ceph_assert(backfills_in_flight.empty()); + new_last_backfill = backfill_pos; + last_backfill_started = backfill_pos; + } + dout(10) << "final new_last_backfill at " << new_last_backfill << dendl; + + // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to + // all the backfill targets. Otherwise, we will move last_backfill up on + // those targets need it and send OP_BACKFILL_PROGRESS to them. + for (set::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + const pg_info_t& pinfo = recovery_state.get_peer_info(bt); + + if (new_last_backfill > pinfo.last_backfill) { + recovery_state.update_peer_last_backfill(bt, new_last_backfill); + epoch_t e = get_osdmap_epoch(); + MOSDPGBackfill *m = NULL; + if (pinfo.last_backfill.is_max()) { + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH, + e, + get_last_peering_reset(), + spg_t(info.pgid.pgid, bt.shard)); + // Use default priority here, must match sub_op priority + start_recovery_op(hobject_t::get_max()); + } else { + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_PROGRESS, + e, + get_last_peering_reset(), + spg_t(info.pgid.pgid, bt.shard)); + // Use default priority here, must match sub_op priority + } + m->last_backfill = pinfo.last_backfill; + m->stats = pinfo.stats; + osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch()); + dout(10) << " peer " << bt + << " num_objects now " << pinfo.stats.stats.sum.num_objects + << " / " << info.stats.stats.sum.num_objects << dendl; + } + } + + if (ops) + *work_started = true; + return ops; +} + +int PrimaryLogPG::prep_backfill_object_push( + hobject_t oid, eversion_t v, + ObjectContextRef obc, + vector peers, + PGBackend::RecoveryHandle *h) +{ + dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl; + ceph_assert(!peers.empty()); + + backfills_in_flight.insert(oid); + recovery_state.prepare_backfill_for_missing(oid, v, peers); + + ceph_assert(!recovering.count(oid)); + + start_recovery_op(oid); + recovering.insert(make_pair(oid, obc)); + + int r = pgbackend->recover_object( + oid, + v, + ObjectContextRef(), + obc, + h); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl; + on_failed_pull({ pg_whoami }, oid, v); + } + return r; +} + +void PrimaryLogPG::update_range( + BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; + + if (bi->version < 
info.log_tail) { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + bi->version = info.last_update; + scan_range(local_min, local_max, bi, handle); + } + + if (bi->version >= projected_last_update) { + dout(10) << __func__<< ": bi is current " << dendl; + ceph_assert(bi->version == projected_last_update); + } else if (bi->version >= info.log_tail) { + if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) { + /* Because we don't move log_tail on split, the log might be + * empty even if log_tail != last_update. However, the only + * way to get here with an empty log is if log_tail is actually + * eversion_t(), because otherwise the entry which changed + * last_update since the last scan would have to be present. + */ + ceph_assert(bi->version == eversion_t()); + return; + } + + dout(10) << __func__<< ": bi is old, (" << bi->version + << ") can be updated with log to projected_last_update " + << projected_last_update << dendl; + + auto func = [&](const pg_log_entry_t &e) { + dout(10) << __func__ << ": updating from version " << e.version + << dendl; + const hobject_t &soid = e.soid; + if (soid >= bi->begin && + soid < bi->end) { + if (e.is_update()) { + dout(10) << __func__ << ": " << e.soid << " updated to version " + << e.version << dendl; + bi->objects.erase(e.soid); + bi->objects.insert( + make_pair( + e.soid, + e.version)); + } else if (e.is_delete()) { + dout(10) << __func__ << ": " << e.soid << " removed" << dendl; + bi->objects.erase(e.soid); + } + } + }; + dout(10) << "scanning pg log first" << dendl; + recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func); + dout(10) << "scanning projected log" << dendl; + projected_log.scan_log_after(bi->version, func); + bi->version = projected_last_update; + } else { + ceph_abort_msg("scan_range should have raised bi->version past log_tail"); + } +} + +void PrimaryLogPG::scan_range( + int min, int max, BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + ceph_assert(is_locked()); + dout(10) << "scan_range from " << bi->begin << dendl; + bi->clear_objects(); + + vector ls; + ls.reserve(max); + int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end); + ceph_assert(r >= 0); + dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; + dout(20) << ls << dendl; + + for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { + handle.reset_tp_timeout(); + ObjectContextRef obc; + if (is_primary()) + obc = object_contexts.lookup(*p); + if (obc) { + if (!obc->obs.exists) { + /* If the object does not exist here, it must have been removed + * between the collection_list_partial and here. This can happen + * for the first item in the range, which is usually last_backfill. + */ + continue; + } + bi->objects[*p] = obc->obs.oi.version; + dout(20) << " " << *p << " " << obc->obs.oi.version << dendl; + } else { + bufferlist bl; + int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); + /* If the object does not exist here, it must have been removed + * between the collection_list_partial and here. This can happen + * for the first item in the range, which is usually last_backfill. 
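+ * (the cached-obc branch above handles the same race by skipping contexts
+ * with !obs.exists instead of relying on -ENOENT.)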
+ */ + if (r == -ENOENT) + continue; + + ceph_assert(r >= 0); + object_info_t oi(bl); + bi->objects[*p] = oi.version; + dout(20) << " " << *p << " " << oi.version << dendl; + } + } +} + + +/** check_local + * + * verifies that stray objects have been deleted + */ +void PrimaryLogPG::check_local() +{ + dout(10) << __func__ << dendl; + + ceph_assert( + info.last_update >= + recovery_state.get_pg_log().get_tail()); // otherwise we need some help! + + if (!cct->_conf->osd_debug_verify_stray_on_activate) + return; + + // just scan the log. + set did; + for (list::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin(); + p != recovery_state.get_pg_log().get_log().log.rend(); + ++p) { + if (did.count(p->soid)) + continue; + did.insert(p->soid); + + if (p->is_delete() && !is_missing_object(p->soid)) { + dout(10) << " checking " << p->soid + << " at " << p->version << dendl; + struct stat st; + int r = osd->store->stat( + ch, + ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard), + &st); + if (r != -ENOENT) { + derr << __func__ << " " << p->soid << " exists, but should have been " + << "deleted" << dendl; + ceph_abort_msg("erroneously present object"); + } + } else { + // ignore old(+missing) objects + } + } +} + + + +// =========================== +// hit sets + +hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp) +{ + ostringstream ss; + ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp; + hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", + info.pgid.ps(), info.pgid.pool(), + cct->_conf->osd_hit_set_namespace); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt) +{ + ostringstream ss; + ss << "hit_set_" << info.pgid.pgid << "_archive_"; + if (using_gmt) { + start.gmtime(ss, true /* legacy pre-octopus form */) << "_"; + end.gmtime(ss, true /* legacy pre-octopus form */); + } else { + start.localtime(ss, true /* legacy pre-octopus form */) << "_"; + end.localtime(ss, true /* legacy pre-octopus form */); + } + hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", + info.pgid.ps(), info.pgid.pool(), + cct->_conf->osd_hit_set_namespace); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +void PrimaryLogPG::hit_set_clear() +{ + dout(20) << __func__ << dendl; + hit_set.reset(); + hit_set_start_stamp = utime_t(); +} + +void PrimaryLogPG::hit_set_setup() +{ + if (!is_active() || + !is_primary()) { + hit_set_clear(); + return; + } + + if (is_active() && is_primary() && + (!pool.info.hit_set_count || + !pool.info.hit_set_period || + pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) { + hit_set_clear(); + + // only primary is allowed to remove all the hit set objects + hit_set_remove_all(); + return; + } + + // FIXME: discard any previous data for now + hit_set_create(); + + // include any writes we know about from the pg log. this doesn't + // capture reads, but it is better than nothing! 
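+ // note: hit_set_apply_log() replays log entries newer than
+ // info.hit_set.current_last_update into the freshly created hit set, so
+ // writes from the interval we just went through are not lost.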
+ hit_set_apply_log(); +} + +void PrimaryLogPG::hit_set_remove_all() +{ + // If any archives are degraded we skip this + for (auto p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + // Once we hit a degraded object just skip + if (is_degraded_or_backfilling_object(aoid)) + return; + if (m_scrubber->write_blocked_by_scrub(aoid)) + return; + } + + if (!info.hit_set.history.empty()) { + auto p = info.hit_set.history.rbegin(); + ceph_assert(p != info.hit_set.history.rend()); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + ceph_assert(!is_degraded_or_backfilling_object(oid)); + ObjectContextRef obc = get_object_context(oid, false); + ceph_assert(obc); + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->at_version = get_next_version(); + ctx->updated_hset_history = info.hit_set; + utime_t now = ceph_clock_now(); + ctx->mtime = now; + hit_set_trim(ctx, 0); + simple_opc_submit(std::move(ctx)); + } + + recovery_state.update_hset(pg_hit_set_history_t()); + if (agent_state) { + agent_state->discard_hit_sets(); + } +} + +void PrimaryLogPG::hit_set_create() +{ + utime_t now = ceph_clock_now(); + // make a copy of the params to modify + HitSet::Params params(pool.info.hit_set_params); + + dout(20) << __func__ << " " << params << dendl; + if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) { + BloomHitSet::Params *p = + static_cast(params.impl.get()); + + // convert false positive rate so it holds up across the full period + p->set_fpp(p->get_fpp() / pool.info.hit_set_count); + if (p->get_fpp() <= 0.0) + p->set_fpp(.01); // fpp cannot be zero! + + // if we don't have specified size, estimate target size based on the + // previous bin! + if (p->target_size == 0 && hit_set) { + utime_t dur = now - hit_set_start_stamp; + unsigned unique = hit_set->approx_unique_insert_count(); + dout(20) << __func__ << " previous set had approx " << unique + << " unique items over " << dur << " seconds" << dendl; + p->target_size = (double)unique * (double)pool.info.hit_set_period + / (double)dur; + } + if (p->target_size < + static_cast(cct->_conf->osd_hit_set_min_size)) + p->target_size = cct->_conf->osd_hit_set_min_size; + + if (p->target_size + > static_cast(cct->_conf->osd_hit_set_max_size)) + p->target_size = cct->_conf->osd_hit_set_max_size; + + p->seed = now.sec(); + + dout(10) << __func__ << " target_size " << p->target_size + << " fpp " << p->get_fpp() << dendl; + } + hit_set.reset(new HitSet(params)); + hit_set_start_stamp = now; +} + +/** + * apply log entries to set + * + * this would only happen after peering, to at least capture writes + * during an interval that was potentially lost. + */ +bool PrimaryLogPG::hit_set_apply_log() +{ + if (!hit_set) + return false; + + eversion_t to = info.last_update; + eversion_t from = info.hit_set.current_last_update; + if (to <= from) { + dout(20) << __func__ << " no update" << dendl; + return false; + } + + dout(20) << __func__ << " " << to << " .. 
" << info.last_update << dendl; + list::const_reverse_iterator p = + recovery_state.get_pg_log().get_log().log.rbegin(); + while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to) + ++p; + while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) { + hit_set->insert(p->soid); + ++p; + } + + return true; +} + +void PrimaryLogPG::hit_set_persist() +{ + dout(10) << __func__ << dendl; + bufferlist bl; + unsigned max = pool.info.hit_set_count; + + utime_t now = ceph_clock_now(); + hobject_t oid; + + // If any archives are degraded we skip this persist request + // account for the additional entry being added below + for (auto p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + // Once we hit a degraded object just skip further trim + if (is_degraded_or_backfilling_object(aoid)) + return; + if (m_scrubber->write_blocked_by_scrub(aoid)) + return; + } + + // If backfill is in progress and we could possibly overlap with the + // hit_set_* objects, back off. Since these all have + // hobject_t::hash set to pgid.ps(), and those sort first, we can + // look just at that. This is necessary because our transactions + // may include a modify of the new hit_set *and* a delete of the + // old one, and this may span the backfill boundary. + for (set::const_iterator p = get_backfill_targets().begin(); + p != get_backfill_targets().end(); + ++p) { + const pg_info_t& pi = recovery_state.get_peer_info(*p); + if (pi.last_backfill == hobject_t() || + pi.last_backfill.get_hash() == info.pgid.ps()) { + dout(10) << __func__ << " backfill target osd." << *p + << " last_backfill has not progressed past pgid ps" + << dendl; + return; + } + } + + + pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset); + new_hset.begin = hit_set_start_stamp; + new_hset.end = now; + oid = get_hit_set_archive_object( + new_hset.begin, + new_hset.end, + new_hset.using_gmt); + + // If the current object is degraded we skip this persist request + if (m_scrubber->write_blocked_by_scrub(oid)) + return; + + hit_set->seal(); + encode(*hit_set, bl); + dout(20) << __func__ << " archive " << oid << dendl; + + if (agent_state) { + agent_state->add_hit_set(new_hset.begin, hit_set); + uint32_t size = agent_state->hit_set_map.size(); + if (size >= pool.info.hit_set_count) { + size = pool.info.hit_set_count > 0 ? 
pool.info.hit_set_count - 1: 0; + } + hit_set_in_memory_trim(size); + } + + ObjectContextRef obc = get_object_context(oid, true); + OpContextUPtr ctx = simple_opc_create(obc); + + ctx->at_version = get_next_version(); + ctx->updated_hset_history = info.hit_set; + pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history); + + updated_hit_set_hist.current_last_update = info.last_update; + new_hset.version = ctx->at_version; + + updated_hit_set_hist.history.push_back(new_hset); + hit_set_create(); + + // fabricate an object_info_t and SnapSet + obc->obs.oi.version = ctx->at_version; + obc->obs.oi.mtime = now; + obc->obs.oi.size = bl.length(); + obc->obs.exists = true; + obc->obs.oi.set_data_digest(bl.crc32c(-1)); + + ctx->new_obs = obc->obs; + + ctx->new_snapset = obc->ssc->snapset; + + ctx->delta_stats.num_objects++; + ctx->delta_stats.num_objects_hit_set_archive++; + + ctx->delta_stats.num_bytes += bl.length(); + ctx->delta_stats.num_bytes_hit_set_archive += bl.length(); + + bufferlist bss; + encode(ctx->new_snapset, bss); + bufferlist boi(sizeof(ctx->new_obs.oi)); + encode(ctx->new_obs.oi, boi, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + ctx->op_t->create(oid); + if (bl.length()) { + ctx->op_t->write(oid, 0, bl.length(), bl, 0); + write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges, + 0, bl.length()); + ctx->clean_regions.mark_data_region_dirty(0, bl.length()); + } + map attrs; + attrs[OI_ATTR] = std::move(boi); + attrs[SS_ATTR] = std::move(bss); + setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs); + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + oid, + ctx->at_version, + eversion_t(), + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + ctx->log.back().clean_regions = ctx->clean_regions; + + hit_set_trim(ctx, max); + + simple_opc_submit(std::move(ctx)); +} + +void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max) +{ + ceph_assert(ctx->updated_hset_history); + pg_hit_set_history_t &updated_hit_set_hist = + *(ctx->updated_hset_history); + for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) { + list::iterator p = updated_hit_set_hist.history.begin(); + ceph_assert(p != updated_hit_set_hist.history.end()); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + ceph_assert(!is_degraded_or_backfilling_object(oid)); + + dout(20) << __func__ << " removing " << oid << dendl; + ++ctx->at_version.version; + ctx->log.push_back( + pg_log_entry_t(pg_log_entry_t::DELETE, + oid, + ctx->at_version, + p->version, + 0, + osd_reqid_t(), + ctx->mtime, + 0)); + + ctx->op_t->remove(oid); + updated_hit_set_hist.history.pop_front(); + + ObjectContextRef obc = get_object_context(oid, false); + ceph_assert(obc); + --ctx->delta_stats.num_objects; + --ctx->delta_stats.num_objects_hit_set_archive; + ctx->delta_stats.num_bytes -= obc->obs.oi.size; + ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size; + } +} + +void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory) +{ + while (agent_state->hit_set_map.size() > max_in_memory) { + agent_state->remove_oldest_hit_set(); + } +} + + +// ======================================= +// cache agent + +void PrimaryLogPG::agent_setup() +{ + ceph_assert(is_locked()); + if (!is_active() || + !is_primary() || + state_test(PG_STATE_PREMERGE) || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || + pool.info.tier_of < 0 || + !get_osdmap()->have_pg_pool(pool.info.tier_of)) { + agent_clear(); + return; + } + if 
(!agent_state) { + agent_state.reset(new TierAgentState); + + // choose random starting position + agent_state->position = hobject_t(); + agent_state->position.pool = info.pgid.pool(); + agent_state->position.set_hash(pool.info.get_random_pg_position( + info.pgid.pgid, + rand())); + agent_state->start = agent_state->position; + + dout(10) << __func__ << " allocated new state, position " + << agent_state->position << dendl; + } else { + dout(10) << __func__ << " keeping existing state" << dendl; + } + + if (info.stats.stats_invalid) { + osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate"; + } + + agent_choose_mode(); +} + +void PrimaryLogPG::agent_clear() +{ + agent_stop(); + agent_state.reset(NULL); +} + +// Return false if no objects operated on since start of object hash space +bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota) +{ + std::scoped_lock locker{*this}; + if (!agent_state) { + dout(10) << __func__ << " no agent state, stopping" << dendl; + return true; + } + + ceph_assert(!recovery_state.is_deleting()); + + if (agent_state->is_idle()) { + dout(10) << __func__ << " idle, stopping" << dendl; + return true; + } + + osd->logger->inc(l_osd_agent_wake); + + dout(10) << __func__ + << " max " << start_max + << ", flush " << agent_state->get_flush_mode_name() + << ", evict " << agent_state->get_evict_mode_name() + << ", pos " << agent_state->position + << dendl; + ceph_assert(is_primary()); + ceph_assert(is_active()); + + agent_load_hit_sets(); + + const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of); + ceph_assert(base_pool); + + int ls_min = 1; + int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size; + + // list some objects. this conveniently lists clones (oldest to + // newest) before heads... the same order we want to flush in. + // + // NOTE: do not flush the Sequencer. we will assume that the + // listing we get back is imprecise. + vector ls; + hobject_t next; + int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max, + &ls, &next); + ceph_assert(r >= 0); + dout(20) << __func__ << " got " << ls.size() << " objects" << dendl; + int started = 0; + for (vector::iterator p = ls.begin(); + p != ls.end(); + ++p) { + if (p->nspace == cct->_conf->osd_hit_set_namespace) { + dout(20) << __func__ << " skip (hit set) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (is_degraded_or_backfilling_object(*p)) { + dout(20) << __func__ << " skip (degraded) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (is_missing_object(p->get_head())) { + dout(20) << __func__ << " skip (missing head) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + ObjectContextRef obc = get_object_context(*p, false, NULL); + if (!obc) { + // we didn't flush; we may miss something here. 
+ dout(20) << __func__ << " skip (no obc) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (!obc->obs.exists) { + dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid, + obc->obs.oi.soid.get_head())) { + dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (obc->is_blocked()) { + dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (obc->is_request_pending()) { + dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + + // be careful flushing omap to an EC pool. + if (!base_pool->supports_omap() && + obc->obs.oi.is_omap()) { + dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + + if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && + agent_maybe_evict(obc, false)) + ++started; + else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE && + agent_flush_quota > 0 && agent_maybe_flush(obc)) { + ++started; + --agent_flush_quota; + } + if (started >= start_max) { + // If finishing early, set "next" to the next object + if (++p != ls.end()) + next = *p; + break; + } + } + + if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) { + dout(20) << __func__ << " resetting atime and temp histograms" << dendl; + agent_state->hist_age = 0; + agent_state->temp_hist.decay(); + } + + // Total objects operated on so far + int total_started = agent_state->started + started; + bool need_delay = false; + + dout(20) << __func__ << " start pos " << agent_state->position + << " next start pos " << next + << " started " << total_started << dendl; + + // See if we've made a full pass over the object hash space + // This might check at most ls_max objects a second time to notice that + // we've checked every objects at least once. 
+ if (agent_state->position < agent_state->start && + next >= agent_state->start) { + dout(20) << __func__ << " wrap around " << agent_state->start << dendl; + if (total_started == 0) + need_delay = true; + else + total_started = 0; + agent_state->start = next; + } + agent_state->started = total_started; + + // See if we are starting from beginning + if (next.is_max()) + agent_state->position = hobject_t(); + else + agent_state->position = next; + + // Discard old in memory HitSets + hit_set_in_memory_trim(pool.info.hit_set_count); + + if (need_delay) { + ceph_assert(agent_state->delaying == false); + agent_delay(); + return false; + } + agent_choose_mode(); + return true; +} + +void PrimaryLogPG::agent_load_hit_sets() +{ + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) { + return; + } + + if (agent_state->hit_set_map.size() < info.hit_set.history.size()) { + dout(10) << __func__ << dendl; + for (auto p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); ++p) { + if (agent_state->hit_set_map.count(p->begin.sec()) == 0) { + dout(10) << __func__ << " loading " << p->begin << "-" + << p->end << dendl; + if (!pool.info.is_replicated()) { + // FIXME: EC not supported here yet + derr << __func__ << " on non-replicated pool" << dendl; + break; + } + + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + if (is_unreadable_object(oid)) { + dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl; + break; + } + + ObjectContextRef obc = get_object_context(oid, false); + if (!obc) { + derr << __func__ << ": could not load hitset " << oid << dendl; + break; + } + + bufferlist bl; + { + int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl); + ceph_assert(r >= 0); + } + HitSetRef hs(new HitSet); + bufferlist::const_iterator pbl = bl.begin(); + decode(*hs, pbl); + agent_state->add_hit_set(p->begin.sec(), hs); + } + } + } +} + +bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc) +{ + if (!obc->obs.oi.is_dirty()) { + dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + if (obc->obs.oi.is_cache_pinned()) { + dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + utime_t now = ceph_clock_now(); + utime_t ob_local_mtime; + if (obc->obs.oi.local_mtime != utime_t()) { + ob_local_mtime = obc->obs.oi.local_mtime; + } else { + ob_local_mtime = obc->obs.oi.mtime; + } + bool evict_mode_full = + (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL); + if (!evict_mode_full && + obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay + (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) { + dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + if (osd->agent_is_active_oid(obc->obs.oi.soid)) { + dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + dout(10) << __func__ << " flushing " << obc->obs.oi << dendl; + + // FIXME: flush anything dirty, regardless of what distribution of + // ages we expect. 
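+  // The flush below follows the usual mark-in-flight / async-completion
+  // pattern: agent_start_op(oid) registers the object as being flushed,
+  // start_flush() returns -EINPROGRESS once work is actually queued, and the
+  // on_flush closure unregisters the object when that work (or the failed
+  // attempt) finishes.  Roughly, with hypothetical helper names:
+  //
+  //   mark_busy(oid);
+  //   auto done = [this, oid] { clear_busy(oid); };
+  //   if (start_async_work(oid, done) != -EINPROGRESS)
+  //     done();   // nothing was queued, undo the registration right away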
+ + hobject_t oid = obc->obs.oi.soid; + osd->agent_start_op(oid); + // no need to capture a pg ref, can't outlive fop or ctx + std::function on_flush = [this, oid]() { + osd->agent_finish_op(oid); + }; + + int result = start_flush( + OpRequestRef(), obc, false, NULL, + on_flush); + if (result != -EINPROGRESS) { + on_flush(); + dout(10) << __func__ << " start_flush() failed " << obc->obs.oi + << " with " << result << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + osd->logger->inc(l_osd_agent_flush); + return true; +} + +bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush) +{ + const hobject_t& soid = obc->obs.oi.soid; + if (!after_flush && obc->obs.oi.is_dirty()) { + dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl; + return false; + } + // This is already checked by agent_work() which passes after_flush = false + if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) { + dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl; + return false; + } + if (!obc->obs.oi.watchers.empty()) { + dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl; + return false; + } + if (obc->is_blocked()) { + dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl; + return false; + } + if (obc->obs.oi.is_cache_pinned()) { + dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl; + return false; + } + + if (soid.snap == CEPH_NOSNAP) { + int result = _verify_no_head_clones(soid, obc->ssc->snapset); + if (result < 0) { + dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl; + return false; + } + } + + if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) { + // is this object old than cache_min_evict_age? + utime_t now = ceph_clock_now(); + utime_t ob_local_mtime; + if (obc->obs.oi.local_mtime != utime_t()) { + ob_local_mtime = obc->obs.oi.local_mtime; + } else { + ob_local_mtime = obc->obs.oi.mtime; + } + if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) { + dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + // is this object old and/or cold enough? 
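+    // The "cold enough" decision below is made against the temperature
+    // histogram: get_position_micro() reports, in millionths, where this
+    // object's temperature ranks among recently examined objects, and the
+    // object is only evicted when 1000000 - temp_upper drops below the
+    // current evict_effort.  Purely as an arithmetic illustration: with
+    // evict_effort = 250000, temp_upper = 800000 gives 200000 < 250000 and
+    // the check passes, while temp_upper = 700000 gives 300000 and the
+    // object is skipped.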
+ int temp = 0; + uint64_t temp_upper = 0, temp_lower = 0; + if (hit_set) + agent_estimate_temp(soid, &temp); + agent_state->temp_hist.add(temp); + agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper); + + dout(20) << __func__ + << " temp " << temp + << " pos " << temp_lower << "-" << temp_upper + << ", evict_effort " << agent_state->evict_effort + << dendl; + dout(30) << "agent_state:\n"; + Formatter *f = Formatter::create(""); + f->open_object_section("agent_state"); + agent_state->dump(f); + f->close_section(); + f->flush(*_dout); + delete f; + *_dout << dendl; + + if (1000000 - temp_upper >= agent_state->evict_effort) + return false; + } + + dout(10) << __func__ << " evicting " << obc->obs.oi << dendl; + OpContextUPtr ctx = simple_opc_create(obc); + + auto null_op_req = OpRequestRef(); + if (!ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + obc->obs.oi.soid, + obc, + null_op_req)) { + close_op_ctx(ctx.release()); + dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl; + return false; + } + + osd->agent_start_evict_op(); + ctx->register_on_finish( + [this]() { + osd->agent_finish_evict_op(); + }); + + ctx->at_version = get_next_version(); + ceph_assert(ctx->new_obs.exists); + int r = _delete_oid(ctx.get(), true, false); + if (obc->obs.oi.is_omap()) + ctx->delta_stats.num_objects_omap--; + ctx->delta_stats.num_evict++; + ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10); + if (obc->obs.oi.is_dirty()) + --ctx->delta_stats.num_objects_dirty; + ceph_assert(r == 0); + finish_ctx(ctx.get(), pg_log_entry_t::DELETE); + simple_opc_submit(std::move(ctx)); + osd->logger->inc(l_osd_tier_evict); + osd->logger->inc(l_osd_agent_evict); + return true; +} + +void PrimaryLogPG::agent_stop() +{ + dout(20) << __func__ << dendl; + if (agent_state && !agent_state->is_idle()) { + agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE; + agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE; + osd->agent_disable_pg(this, agent_state->evict_effort); + } +} + +void PrimaryLogPG::agent_delay() +{ + dout(20) << __func__ << dendl; + if (agent_state && !agent_state->is_idle()) { + ceph_assert(agent_state->delaying == false); + agent_state->delaying = true; + osd->agent_disable_pg(this, agent_state->evict_effort); + } +} + +void PrimaryLogPG::agent_choose_mode_restart() +{ + dout(20) << __func__ << dendl; + std::scoped_lock locker{*this}; + if (agent_state && agent_state->delaying) { + agent_state->delaying = false; + agent_choose_mode(true); + } +} + +bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op) +{ + bool requeued = false; + // Let delay play out + if (agent_state->delaying) { + dout(20) << __func__ << " " << this << " delaying, ignored" << dendl; + return requeued; + } + + TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE; + TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE; + unsigned evict_effort = 0; + + if (info.stats.stats_invalid) { + // idle; stats can't be trusted until we scrub. + dout(20) << __func__ << " stats invalid (post-split), idle" << dendl; + goto skip_calc; + } + + { + uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid); + ceph_assert(divisor > 0); + + // adjust (effective) user objects down based on the number + // of HitSet objects, which should not count toward our total since + // they cannot be flushed. 
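+    // The adjusted counts computed below feed the dirty/full ratios further
+    // down, all expressed in millionths of this PG's share of the pool
+    // target.  A worked example with made-up numbers: target_max_bytes of
+    // 100 GiB spread over a divisor of 100 PGs gives a 1 GiB share; if the
+    // flushable user data in this PG amounts to 512 MiB, full_micro comes
+    // out at roughly 500000, i.e. half of the allowed capacity.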
+ uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive; + + // also exclude omap objects if ec backing pool + const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of); + ceph_assert(base_pool); + if (!base_pool->supports_omap()) + unflushable += info.stats.stats.sum.num_objects_omap; + + uint64_t num_user_objects = info.stats.stats.sum.num_objects; + if (num_user_objects > unflushable) + num_user_objects -= unflushable; + else + num_user_objects = 0; + + uint64_t num_user_bytes = info.stats.stats.sum.num_bytes; + uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive; + num_user_bytes -= unflushable_bytes; + uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects); + num_user_bytes += num_overhead_bytes; + + // also reduce the num_dirty by num_objects_omap + int64_t num_dirty = info.stats.stats.sum.num_objects_dirty; + if (!base_pool->supports_omap()) { + if (num_dirty > info.stats.stats.sum.num_objects_omap) + num_dirty -= info.stats.stats.sum.num_objects_omap; + else + num_dirty = 0; + } + + dout(10) << __func__ + << " flush_mode: " + << TierAgentState::get_flush_mode_name(agent_state->flush_mode) + << " evict_mode: " + << TierAgentState::get_evict_mode_name(agent_state->evict_mode) + << " num_objects: " << info.stats.stats.sum.num_objects + << " num_bytes: " << info.stats.stats.sum.num_bytes + << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty + << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap + << " num_dirty: " << num_dirty + << " num_user_objects: " << num_user_objects + << " num_user_bytes: " << num_user_bytes + << " num_overhead_bytes: " << num_overhead_bytes + << " pool.info.target_max_bytes: " << pool.info.target_max_bytes + << " pool.info.target_max_objects: " << pool.info.target_max_objects + << dendl; + + // get dirty, full ratios + uint64_t dirty_micro = 0; + uint64_t full_micro = 0; + if (pool.info.target_max_bytes && num_user_objects > 0) { + uint64_t avg_size = num_user_bytes / num_user_objects; + dirty_micro = + num_dirty * avg_size * 1000000 / + std::max(pool.info.target_max_bytes / divisor, 1); + full_micro = + num_user_objects * avg_size * 1000000 / + std::max(pool.info.target_max_bytes / divisor, 1); + } + if (pool.info.target_max_objects > 0) { + uint64_t dirty_objects_micro = + num_dirty * 1000000 / + std::max(pool.info.target_max_objects / divisor, 1); + if (dirty_objects_micro > dirty_micro) + dirty_micro = dirty_objects_micro; + uint64_t full_objects_micro = + num_user_objects * 1000000 / + std::max(pool.info.target_max_objects / divisor, 1); + if (full_objects_micro > full_micro) + full_micro = full_objects_micro; + } + dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0) + << " full " << ((float)full_micro / 1000000.0) + << dendl; + + // flush mode + uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro; + uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro; + uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop; + if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) { + flush_target += flush_slop; + flush_high_target += flush_slop; + } else { + flush_target -= std::min(flush_target, flush_slop); + flush_high_target -= std::min(flush_high_target, flush_slop); + } + + if (dirty_micro > flush_high_target) { + flush_mode = TierAgentState::FLUSH_MODE_HIGH; + } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) { + flush_mode = 
TierAgentState::FLUSH_MODE_LOW; + } + + // evict mode + uint64_t evict_target = pool.info.cache_target_full_ratio_micro; + uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop; + if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) + evict_target += evict_slop; + else + evict_target -= std::min(evict_target, evict_slop); + + if (full_micro > 1000000) { + // evict anything clean + evict_mode = TierAgentState::EVICT_MODE_FULL; + evict_effort = 1000000; + } else if (full_micro > evict_target) { + // set effort in [0..1] range based on where we are between + evict_mode = TierAgentState::EVICT_MODE_SOME; + uint64_t over = full_micro - evict_target; + uint64_t span = 1000000 - evict_target; + evict_effort = std::max(over * 1000000 / span, + uint64_t(1000000.0 * + cct->_conf->osd_agent_min_evict_effort)); + + // quantize effort to avoid too much reordering in the agent_queue. + uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000; + ceph_assert(inc > 0); + uint64_t was = evict_effort; + evict_effort -= evict_effort % inc; + if (evict_effort < inc) + evict_effort = inc; + ceph_assert(evict_effort >= inc && evict_effort <= 1000000); + dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl; + } + } + + skip_calc: + bool old_idle = agent_state->is_idle(); + if (flush_mode != agent_state->flush_mode) { + dout(5) << __func__ << " flush_mode " + << TierAgentState::get_flush_mode_name(agent_state->flush_mode) + << " -> " + << TierAgentState::get_flush_mode_name(flush_mode) + << dendl; + recovery_state.update_stats( + [=](auto &history, auto &stats) { + if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) { + osd->agent_inc_high_count(); + stats.stats.sum.num_flush_mode_high = 1; + } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) { + stats.stats.sum.num_flush_mode_low = 1; + } + if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) { + osd->agent_dec_high_count(); + stats.stats.sum.num_flush_mode_high = 0; + } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) { + stats.stats.sum.num_flush_mode_low = 0; + } + return false; + }); + agent_state->flush_mode = flush_mode; + } + if (evict_mode != agent_state->evict_mode) { + dout(5) << __func__ << " evict_mode " + << TierAgentState::get_evict_mode_name(agent_state->evict_mode) + << " -> " + << TierAgentState::get_evict_mode_name(evict_mode) + << dendl; + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL && + is_active()) { + if (op) + requeue_op(op); + requeue_ops(waiting_for_flush); + requeue_ops(waiting_for_active); + requeue_ops(waiting_for_readable); + requeue_ops(waiting_for_scrub); + requeue_ops(waiting_for_cache_not_full); + objects_blocked_on_cache_full.clear(); + requeued = true; + } + recovery_state.update_stats( + [=](auto &history, auto &stats) { + if (evict_mode == TierAgentState::EVICT_MODE_SOME) { + stats.stats.sum.num_evict_mode_some = 1; + } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) { + stats.stats.sum.num_evict_mode_full = 1; + } + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) { + stats.stats.sum.num_evict_mode_some = 0; + } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + stats.stats.sum.num_evict_mode_full = 0; + } + return false; + }); + agent_state->evict_mode = evict_mode; + } + uint64_t old_effort = agent_state->evict_effort; + if (evict_effort != agent_state->evict_effort) { + dout(5) << __func__ << " evict_effort " + << 
((float)agent_state->evict_effort / 1000000.0) + << " -> " + << ((float)evict_effort / 1000000.0) + << dendl; + agent_state->evict_effort = evict_effort; + } + + // NOTE: we are using evict_effort as a proxy for *all* agent effort + // (including flush). This is probably fine (they should be + // correlated) but it is not precisely correct. + if (agent_state->is_idle()) { + if (!restart && !old_idle) { + osd->agent_disable_pg(this, old_effort); + } + } else { + if (restart || old_idle) { + osd->agent_enable_pg(this, agent_state->evict_effort); + } else if (old_effort != agent_state->evict_effort) { + osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort); + } + } + return requeued; +} + +void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp) +{ + ceph_assert(hit_set); + ceph_assert(temp); + *temp = 0; + if (hit_set->contains(oid)) + *temp = 1000000; + unsigned i = 0; + int last_n = pool.info.hit_set_search_last_n; + for (map::reverse_iterator p = + agent_state->hit_set_map.rbegin(); last_n > 0 && + p != agent_state->hit_set_map.rend(); ++p, ++i) { + if (p->second->contains(oid)) { + *temp += pool.info.get_grade(i); + --last_n; + } + } +} + +// Dup op detection + +bool PrimaryLogPG::already_complete(eversion_t v) +{ + dout(20) << __func__ << ": " << v << dendl; + for (xlist::iterator i = repop_queue.begin(); + !i.end(); + ++i) { + dout(20) << __func__ << ": " << **i << dendl; + // skip copy from temp object ops + if ((*i)->v == eversion_t()) { + dout(20) << __func__ << ": " << **i + << " version is empty" << dendl; + continue; + } + if ((*i)->v > v) { + dout(20) << __func__ << ": " << **i + << " (*i)->v past v" << dendl; + break; + } + if (!(*i)->all_committed) { + dout(20) << __func__ << ": " << **i + << " not committed, returning false" + << dendl; + return false; + } + } + dout(20) << __func__ << ": returning true" << dendl; + return true; +} + + +// ========================================================================================== +// SCRUB + +void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op) +{ + dout(15) << __func__ << " is scrub active? 
" << m_scrubber->is_scrub_active() << dendl; + op->mark_started(); + + if (!m_scrubber->is_scrub_active()) { + dout(10) << __func__ << " scrub isn't active" << dendl; + return; + } + m_scrubber->map_from_replica(op); +} + +bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin, + const hobject_t& end) +{ + pair next; + next.second = object_contexts.lookup(begin); + next.first = begin; + bool more = true; + while (more && next.first < end) { + if (next.second && next.second->is_blocked()) { + next.second->requeue_scrub_on_unblock = true; + dout(10) << __func__ << ": scrub delayed, " + << next.first << " is blocked" + << dendl; + return false; + } + more = object_contexts.get_next(next.first, &next); + } + return true; +} + + +int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx) +{ + OpRequestRef op = ctx->op; + // Only supports replicated pools + ceph_assert(!pool.info.is_erasure()); + ceph_assert(is_primary()); + + dout(10) << __func__ << " " << soid + << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl; + + if (!is_clean()) { + block_for_clean(soid, op); + return -EAGAIN; + } + + ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid)); + auto& oi = ctx->new_obs.oi; + eversion_t v = oi.version; + + if (primary_error(soid, v)) { + dout(0) << __func__ << " No other replicas available for " << soid << dendl; + // XXX: If we knew that there is no down osd which could include this + // object, it would be nice if we could return EIO here. + // If a "never fail" flag was available, that could be used + // for rbd to NOT return EIO until object marked lost. + + // Drop through to save this op in case an osd comes up with the object. + } + + // Restart the op after object becomes readable again + waiting_for_unreadable_object[soid].push_back(op); + op->mark_delayed("waiting for missing object"); + + ceph_assert(is_clean()); + state_set(PG_STATE_REPAIR); + state_clear(PG_STATE_CLEAN); + queue_peering_event( + PGPeeringEventRef( + std::make_shared( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery()))); + + return -EAGAIN; +} + +/*---SnapTrimmer Logging---*/ +#undef dout_prefix +#define dout_prefix pg->gen_prefix(*_dout) + +void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name) +{ + ldout(pg->cct, 20) << "enter " << state_name << dendl; +} + +void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time) +{ + ldout(pg->cct, 20) << "exit " << state_name << dendl; +} + +bool PrimaryLogPG::SnapTrimmer::permit_trim() { + return + pg->is_clean() && + !pg->is_scrub_queued_or_active() && + !pg->snap_trimq.empty(); +} + +/*---SnapTrimmer states---*/ +#undef dout_prefix +#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \ + << "SnapTrimmer state<" << get_state_name() << ">: ") + +/* NotTrimming */ +PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "NotTrimming") +{ + context< SnapTrimmer >().log_enter(state_name); +} + +void PrimaryLogPG::NotTrimming::exit() +{ + context< SnapTrimmer >().log_exit(state_name, enter_time); +} + +boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&) +{ + PrimaryLogPG *pg = context< SnapTrimmer >().pg; + ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl; + + if (!(pg->is_primary() && pg->is_active())) { + ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl; + return discard_event(); + } + if (!pg->is_clean() || + pg->snap_trimq.empty()) { + 
ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl; + return discard_event(); + } + if (pg->is_scrub_queued_or_active()) { + ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl; + return transit< WaitScrub >(); + } else { + return transit< Trimming >(); + } +} + +boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&) +{ + PrimaryLogPG *pg = context< SnapTrimmer >().pg; + ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl; + + pending = nullptr; + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } + + context().snap_to_trim = pg->snap_trimq.range_start(); + ldout(pg->cct, 10) << "NotTrimming: trimming " + << pg->snap_trimq.range_start() + << dendl; + return transit< AwaitAsyncWork >(); +} + +/* AwaitAsyncWork */ +PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming/AwaitAsyncWork") +{ + auto *pg = context< SnapTrimmer >().pg; + context< SnapTrimmer >().log_enter(state_name); + context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg); + pg->state_set(PG_STATE_SNAPTRIM); + pg->state_clear(PG_STATE_SNAPTRIM_ERROR); + pg->publish_stats_to_osd(); +} + +boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&) +{ + PrimaryLogPGRef pg = context< SnapTrimmer >().pg; + snapid_t snap_to_trim = context().snap_to_trim; + auto &in_flight = context().in_flight; + ceph_assert(in_flight.empty()); + + ceph_assert(pg->is_primary() && pg->is_active()); + if (!context< SnapTrimmer >().can_trim()) { + ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl; + post_event(KickTrim()); + return transit< NotTrimming >(); + } + + ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl; + + vector to_trim; + unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims; + // we need to look for at least 1 snaptrim, otherwise we'll misinterpret + // the ENOENT below and erase snap_to_trim. + ceph_assert(max > 0); + to_trim.reserve(max); + int r = pg->snap_mapper.get_next_objects_to_trim( + snap_to_trim, + max, + &to_trim); + if (r != 0 && r != -ENOENT) { + lderr(pg->cct) << "get_next_objects_to_trim returned " + << cpp_strerror(r) << dendl; + ceph_abort_msg("get_next_objects_to_trim returned an invalid code"); + } else if (r == -ENOENT) { + // Done! 
+ ldout(pg->cct, 10) << "got ENOENT" << dendl; + + pg->snap_trimq.erase(snap_to_trim); + + if (pg->snap_trimq_repeat.count(snap_to_trim)) { + ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl; + pg->snap_trimq_repeat.erase(snap_to_trim); + } else { + ldout(pg->cct, 10) << "adding snap " << snap_to_trim + << " to purged_snaps" + << dendl; + ObjectStore::Transaction t; + pg->recovery_state.adjust_purged_snaps( + [snap_to_trim](auto &purged_snaps) { + purged_snaps.insert(snap_to_trim); + }); + pg->write_if_dirty(t); + + ldout(pg->cct, 10) << "purged_snaps now " + << pg->info.purged_snaps << ", snap_trimq now " + << pg->snap_trimq << dendl; + + int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL); + ceph_assert(tr == 0); + + pg->recovery_state.share_pg_info(); + } + post_event(KickTrim()); + return transit< NotTrimming >(); + } + ceph_assert(!to_trim.empty()); + + for (auto &&object: to_trim) { + // Get next + ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl; + OpContextUPtr ctx; + int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx); + if (error) { + if (error == -ENOLCK) { + ldout(pg->cct, 10) << "could not get write lock on obj " + << object << dendl; + } else { + pg->state_set(PG_STATE_SNAPTRIM_ERROR); + ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl; + } + if (!in_flight.empty()) { + ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl; + return transit< WaitRepops >(); + } + if (error == -ENOLCK) { + ldout(pg->cct, 10) << "waiting for it to clear" + << dendl; + return transit< WaitRWLock >(); + } else { + return transit< NotTrimming >(); + } + } + + in_flight.insert(object); + ctx->register_on_success( + [pg, object, &in_flight]() { + ceph_assert(in_flight.find(object) != in_flight.end()); + in_flight.erase(object); + if (in_flight.empty()) { + if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) { + pg->snap_trimmer_machine.process_event(Reset()); + } else { + pg->snap_trimmer_machine.process_event(RepopsComplete()); + } + } + }); + + pg->simple_opc_submit(std::move(ctx)); + } + + return transit< WaitRepops >(); +} + +void PrimaryLogPG::setattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key, + bufferlist &val) +{ + t->setattr(obc->obs.oi.soid, key, val); +} + +void PrimaryLogPG::setattrs_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + map &attrs) +{ + t->setattrs(obc->obs.oi.soid, attrs); +} + +void PrimaryLogPG::rmattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key) +{ + t->rmattr(obc->obs.oi.soid, key); +} + +int PrimaryLogPG::getattr_maybe_cache( + ObjectContextRef obc, + const string &key, + bufferlist *val) +{ + if (pool.info.is_erasure()) { + map::iterator i = obc->attr_cache.find(key); + if (i != obc->attr_cache.end()) { + if (val) + *val = i->second; + return 0; + } else { + return -ENODATA; + } + } + return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val); +} + +int PrimaryLogPG::getattrs_maybe_cache( + ObjectContextRef obc, + map *out) +{ + int r = 0; + ceph_assert(out); + if (pool.info.is_erasure()) { + *out = obc->attr_cache; + } else { + r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out); + } + map tmp; + for (map::iterator i = out->begin(); + i != out->end(); + ++i) { + if (i->first.size() > 1 && i->first[0] == '_') + tmp[i->first.substr(1, i->first.size())] = std::move(i->second); + } + tmp.swap(*out); + return r; +} + +bool PrimaryLogPG::check_failsafe_full() { + return 
osd->check_failsafe_full(get_dpp()); +} + +bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid) +{ + return m_scrubber->write_blocked_by_scrub(oid); +} + +void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); } +void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); } + +#ifdef PG_DEBUG_REFS +uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); } +void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); } +#endif + +void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); } +void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); } diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h new file mode 100644 index 000000000..68cdec24e --- /dev/null +++ b/src/osd/PrimaryLogPG.h @@ -0,0 +1,1969 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_REPLICATEDPG_H +#define CEPH_REPLICATEDPG_H + +#include +#include "include/ceph_assert.h" +#include "DynamicPerfStats.h" +#include "OSD.h" +#include "PG.h" +#include "Watch.h" +#include "TierAgentState.h" +#include "messages/MOSDOpReply.h" +#include "common/Checksummer.h" +#include "common/sharedptr_registry.hpp" +#include "common/shared_cache.hpp" +#include "ReplicatedBackend.h" +#include "PGTransaction.h" +#include "cls/cas/cls_cas_ops.h" + +class CopyFromCallback; +class PromoteCallback; +struct RefCountCallback; + +class PrimaryLogPG; +class PGLSFilter; +class HitSet; +struct TierAgentState; +class OSDService; + +void intrusive_ptr_add_ref(PrimaryLogPG *pg); +void intrusive_ptr_release(PrimaryLogPG *pg); +uint64_t get_with_id(PrimaryLogPG *pg); +void put_with_id(PrimaryLogPG *pg, uint64_t id); + +#ifdef PG_DEBUG_REFS + typedef TrackedIntPtr PrimaryLogPGRef; +#else + typedef boost::intrusive_ptr PrimaryLogPGRef; +#endif + +struct inconsistent_snapset_wrapper; + +class PrimaryLogPG : public PG, public PGBackend::Listener { + friend class OSD; + friend class Watch; + friend class PrimaryLogScrub; + +public: + MEMPOOL_CLASS_HELPERS(); + + /* + * state associated with a copy operation + */ + struct OpContext; + class CopyCallback; + + /** + * CopyResults stores the object metadata of interest to a copy initiator. + */ + struct CopyResults { + ceph::real_time mtime; ///< the copy source's mtime + uint64_t object_size; ///< the copied object's size + bool started_temp_obj; ///< true if the callback needs to delete temp object + hobject_t temp_oid; ///< temp object (if any) + + /** + * Function to fill in transaction; if non-empty the callback + * must execute it before any other accesses to the object + * (in order to complete the copy). 
+ */ + std::function fill_in_final_tx; + + version_t user_version; ///< The copy source's user version + bool should_requeue; ///< op should be requeued on cancel + std::vector snaps; ///< src's snaps (if clone) + snapid_t snap_seq; ///< src's snap_seq (if head) + librados::snap_set_t snapset; ///< src snapset (if head) + bool mirror_snapset; + bool has_omap; + uint32_t flags; // object_copy_data_t::FLAG_* + uint32_t source_data_digest, source_omap_digest; + uint32_t data_digest, omap_digest; + mempool::osd_pglog::vector > reqids; // [(reqid, user_version)] + mempool::osd_pglog::map reqid_return_codes; // std::map reqids by index to error code + std::map attrs; // xattrs + uint64_t truncate_seq; + uint64_t truncate_size; + bool is_data_digest() { + return flags & object_copy_data_t::FLAG_DATA_DIGEST; + } + bool is_omap_digest() { + return flags & object_copy_data_t::FLAG_OMAP_DIGEST; + } + CopyResults() + : object_size(0), started_temp_obj(false), + user_version(0), + should_requeue(false), mirror_snapset(false), + has_omap(false), + flags(0), + source_data_digest(-1), source_omap_digest(-1), + data_digest(-1), omap_digest(-1), + truncate_seq(0), truncate_size(0) + {} + }; + + struct CopyOp; + typedef std::shared_ptr CopyOpRef; + + struct CopyOp { + CopyCallback *cb; + ObjectContextRef obc; + hobject_t src; + object_locator_t oloc; + unsigned flags; + bool mirror_snapset; + + CopyResults results; + + ceph_tid_t objecter_tid; + ceph_tid_t objecter_tid2; + + object_copy_cursor_t cursor; + std::map attrs; + ceph::buffer::list data; + ceph::buffer::list omap_header; + ceph::buffer::list omap_data; + int rval; + + object_copy_cursor_t temp_cursor; + + /* + * For CopyOp the process is: + * step1: read the data(attr/omap/data) from the source object + * step2: handle those data(w/ those data create a new object) + * src_obj_fadvise_flags used in step1; + * dest_obj_fadvise_flags used in step2 + */ + unsigned src_obj_fadvise_flags; + unsigned dest_obj_fadvise_flags; + + std::map chunk_cops; + int num_chunk; + bool failed; + uint64_t start_offset = 0; + uint64_t last_offset = 0; + std::vector chunk_ops; + + CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, + object_locator_t l, + version_t v, + unsigned f, + bool ms, + unsigned src_obj_fadvise_flags, + unsigned dest_obj_fadvise_flags) + : cb(cb_), obc(_obc), src(s), oloc(l), flags(f), + mirror_snapset(ms), + objecter_tid(0), + objecter_tid2(0), + rval(-1), + src_obj_fadvise_flags(src_obj_fadvise_flags), + dest_obj_fadvise_flags(dest_obj_fadvise_flags), + num_chunk(0), + failed(false) + { + results.user_version = v; + results.mirror_snapset = mirror_snapset; + } + }; + + /** + * The CopyCallback class defines an interface for completions to the + * copy_start code. Users of the copy infrastructure must implement + * one and give an instance of the class to start_copy. + * + * The implementer is responsible for making sure that the CopyCallback + * can associate itself with the correct copy operation. 
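+ *
+ * A rough shape of an implementation (names illustrative only; the exact
+ * result type is the CopyCallbackResults tuple defined just below):
+ *
+ *   struct MyCopyCallback : public CopyCallback {
+ *     void finish(CopyCallbackResults results) override {
+ *       int r = results.get<0>();                      // copy return code
+ *       CopyResults *copy_results = results.get<1>();  // copied metadata
+ *       // hand both to whatever started the copy
+ *     }
+ *   };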
+ */ + typedef boost::tuple CopyCallbackResults; + + friend class CopyFromCallback; + friend struct CopyFromFinisher; + friend class PromoteCallback; + friend struct PromoteFinisher; + + struct ProxyReadOp { + OpRequestRef op; + hobject_t soid; + ceph_tid_t objecter_tid; + std::vector &ops; + version_t user_version; + int data_offset; + bool canceled; ///< true if canceled + + ProxyReadOp(OpRequestRef _op, hobject_t oid, std::vector& _ops) + : op(_op), soid(oid), + objecter_tid(0), ops(_ops), + user_version(0), data_offset(0), + canceled(false) { } + }; + typedef std::shared_ptr ProxyReadOpRef; + + struct ProxyWriteOp { + OpContext *ctx; + OpRequestRef op; + hobject_t soid; + ceph_tid_t objecter_tid; + std::vector &ops; + version_t user_version; + bool sent_reply; + utime_t mtime; + bool canceled; + osd_reqid_t reqid; + + ProxyWriteOp(OpRequestRef _op, hobject_t oid, std::vector& _ops, osd_reqid_t _reqid) + : ctx(NULL), op(_op), soid(oid), + objecter_tid(0), ops(_ops), + user_version(0), sent_reply(false), + canceled(false), + reqid(_reqid) { } + }; + typedef std::shared_ptr ProxyWriteOpRef; + + struct FlushOp { + ObjectContextRef obc; ///< obc we are flushing + OpRequestRef op; ///< initiating op + std::list dup_ops; ///< bandwagon jumpers + version_t flushed_version; ///< user version we are flushing + ceph_tid_t objecter_tid; ///< copy-from request tid + int rval; ///< copy-from result + bool blocking; ///< whether we are blocking updates + bool removal; ///< we are removing the backend object + std::optional> on_flush; ///< callback, may be null + // for chunked object + std::map io_results; + std::map io_tids; + uint64_t chunks; + + FlushOp() + : flushed_version(0), objecter_tid(0), rval(0), + blocking(false), removal(false), chunks(0) {} + ~FlushOp() { ceph_assert(!on_flush); } + }; + typedef std::shared_ptr FlushOpRef; + + friend struct RefCountCallback; + struct ManifestOp { + RefCountCallback *cb; + ceph_tid_t objecter_tid; + OpRequestRef op; + std::map results; + std::map tids; + std::map> chunks; + uint64_t num_chunks = 0; + object_manifest_t new_manifest; + + + ManifestOp(RefCountCallback* cb) + : cb(cb), objecter_tid(0) {} + }; + typedef std::shared_ptr ManifestOpRef; + std::map manifest_ops; + + boost::scoped_ptr pgbackend; + PGBackend *get_pgbackend() override { + return pgbackend.get(); + } + + const PGBackend *get_pgbackend() const override { + return pgbackend.get(); + } + + /// Listener methods + DoutPrefixProvider *get_dpp() override { + return this; + } + + void on_local_recover( + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info, + ObjectContextRef obc, + bool is_delete, + ObjectStore::Transaction *t + ) override; + void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info + ) override { + recovery_state.on_peer_recover(peer, oid, recovery_info.version); + } + void begin_peer_recover( + pg_shard_t peer, + const hobject_t oid) override { + recovery_state.begin_peer_recover(peer, oid); + } + void on_global_recover( + const hobject_t &oid, + const object_stat_sum_t &stat_diff, + bool is_delete) override; + void on_failed_pull( + const std::set &from, + const hobject_t &soid, + const eversion_t &version) override; + void cancel_pull(const hobject_t &soid) override; + void apply_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) override; + + bool primary_error(const hobject_t& soid, eversion_t v); + + void remove_missing_object(const hobject_t &oid, + eversion_t v, + Context *on_complete) 
override; + + template class BlessedGenContext; + template class UnlockedBlessedGenContext; + class BlessedContext; + Context *bless_context(Context *c) override; + + GenContext *bless_gencontext( + GenContext *c) override; + GenContext *bless_unlocked_gencontext( + GenContext *c) override; + + void send_message(int to_osd, Message *m) override { + osd->send_message_osd_cluster(to_osd, m, get_osdmap_epoch()); + } + void queue_transaction(ObjectStore::Transaction&& t, + OpRequestRef op) override { + osd->store->queue_transaction(ch, std::move(t), op); + } + void queue_transactions(std::vector& tls, + OpRequestRef op) override { + osd->store->queue_transactions(ch, tls, op, NULL); + } + epoch_t get_interval_start_epoch() const override { + return info.history.same_interval_since; + } + epoch_t get_last_peering_reset_epoch() const override { + return get_last_peering_reset(); + } + const std::set &get_acting_recovery_backfill_shards() const override { + return get_acting_recovery_backfill(); + } + const std::set &get_acting_shards() const override { + return recovery_state.get_actingset(); + } + const std::set &get_backfill_shards() const override { + return get_backfill_targets(); + } + + std::ostream& gen_dbg_prefix(std::ostream& out) const override { + return gen_prefix(out); + } + + const HobjToShardSetMapping& get_missing_loc_shards() const override + { + return recovery_state.get_missing_loc().get_missing_locs(); + } + const std::map &get_shard_missing() const override { + return recovery_state.get_peer_missing(); + } + using PGBackend::Listener::get_shard_missing; + const std::map &get_shard_info() const override { + return recovery_state.get_peer_info(); + } + using PGBackend::Listener::get_shard_info; + const pg_missing_tracker_t &get_local_missing() const override { + return recovery_state.get_pg_log().get_missing(); + } + const PGLog &get_log() const override { + return recovery_state.get_pg_log(); + } + void add_local_next_event(const pg_log_entry_t& e) override { + recovery_state.add_local_next_event(e); + } + bool pgb_is_primary() const override { + return is_primary(); + } + const OSDMapRef& pgb_get_osdmap() const override final { + return get_osdmap(); + } + epoch_t pgb_get_osdmap_epoch() const override final { + return get_osdmap_epoch(); + } + const pg_info_t &get_info() const override { + return info; + } + const pg_pool_t &get_pool() const override { + return pool.info; + } + + ObjectContextRef get_obc( + const hobject_t &hoid, + const std::map &attrs) override { + return get_object_context(hoid, true, &attrs); + } + + bool try_lock_for_read( + const hobject_t &hoid, + ObcLockManager &manager) override { + if (is_missing_object(hoid)) + return false; + auto obc = get_object_context(hoid, false, nullptr); + if (!obc) + return false; + return manager.try_get_read_lock(hoid, obc); + } + + void release_locks(ObcLockManager &manager) override { + release_object_locks(manager); + } + + bool pg_is_repair() override { + return is_repair(); + } + void inc_osd_stat_repaired() override { + osd->inc_osd_stat_repaired(); + } + bool pg_is_remote_backfilling() override { + return is_remote_backfilling(); + } + void pg_add_local_num_bytes(int64_t num_bytes) override { + add_local_num_bytes(num_bytes); + } + void pg_sub_local_num_bytes(int64_t num_bytes) override { + sub_local_num_bytes(num_bytes); + } + void pg_add_num_bytes(int64_t num_bytes) override { + add_num_bytes(num_bytes); + } + void pg_sub_num_bytes(int64_t num_bytes) override { + sub_num_bytes(num_bytes); + } + + void 
pgb_set_object_snap_mapping( + const hobject_t &soid, + const std::set &snaps, + ObjectStore::Transaction *t) override { + return update_object_snap_mapping(t, soid, snaps); + } + void pgb_clear_object_snap_mapping( + const hobject_t &soid, + ObjectStore::Transaction *t) override { + return clear_object_snap_mapping(t, soid); + } + + void log_operation( + std::vector&& logv, + const std::optional &hset_history, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const eversion_t &min_last_complete_ondisk, + bool transaction_applied, + ObjectStore::Transaction &t, + bool async = false) override { + if (is_primary()) { + ceph_assert(trim_to <= recovery_state.get_last_update_ondisk()); + } + if (hset_history) { + recovery_state.update_hset(*hset_history); + } + if (transaction_applied) { + update_snap_map(logv, t); + } + auto last = logv.rbegin(); + if (is_primary() && last != logv.rend()) { + projected_log.skip_can_rollback_to_to_head(); + projected_log.trim(cct, last->version, nullptr, nullptr, nullptr); + } + if (!is_primary() && !is_ec_pg()) { + replica_clear_repop_obc(logv, t); + } + recovery_state.append_log( + std::move(logv), trim_to, roll_forward_to, min_last_complete_ondisk, + t, transaction_applied, async); + } + + void replica_clear_repop_obc( + const std::vector &logv, + ObjectStore::Transaction &t); + + void op_applied(const eversion_t &applied_version) override; + + bool should_send_op( + pg_shard_t peer, + const hobject_t &hoid) override; + + bool pg_is_undersized() const override { + return is_undersized(); + } + + bool pg_is_repair() const override { + return is_repair(); + } + + void update_peer_last_complete_ondisk( + pg_shard_t fromosd, + eversion_t lcod) override { + recovery_state.update_peer_last_complete_ondisk(fromosd, lcod); + } + + void update_last_complete_ondisk( + eversion_t lcod) override { + recovery_state.update_last_complete_ondisk(lcod); + } + + void update_stats( + const pg_stat_t &stat) override { + recovery_state.update_stats( + [&stat](auto &history, auto &stats) { + stats = stat; + return false; + }); + } + + void schedule_recovery_work( + GenContext *c) override; + + pg_shard_t whoami_shard() const override { + return pg_whoami; + } + spg_t primary_spg_t() const override { + return spg_t(info.pgid.pgid, get_primary().shard); + } + pg_shard_t primary_shard() const override { + return get_primary(); + } + uint64_t min_peer_features() const override { + return recovery_state.get_min_peer_features(); + } + uint64_t min_upacting_features() const override { + return recovery_state.get_min_upacting_features(); + } + void send_message_osd_cluster( + int peer, Message *m, epoch_t from_epoch) override { + osd->send_message_osd_cluster(peer, m, from_epoch); + } + void send_message_osd_cluster( + std::vector>& messages, epoch_t from_epoch) override { + osd->send_message_osd_cluster(messages, from_epoch); + } + void send_message_osd_cluster( + MessageRef m, Connection *con) override { + osd->send_message_osd_cluster(std::move(m), con); + } + void send_message_osd_cluster( + Message *m, const ConnectionRef& con) override { + osd->send_message_osd_cluster(m, con); + } + ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override; + entity_name_t get_cluster_msgr_name() override { + return osd->get_cluster_msgr_name(); + } + + PerfCounters *get_logger() override; + + ceph_tid_t get_tid() override { return osd->get_tid(); } + + OstreamTemp clog_error() override { return osd->clog->error(); } + OstreamTemp clog_warn() override { return 
osd->clog->warn(); } + + /** + * a scrub-map arrived from a replica + */ + void do_replica_scrub_map(OpRequestRef op); + + struct watch_disconnect_t { + uint64_t cookie; + entity_name_t name; + bool send_disconnect; + watch_disconnect_t(uint64_t c, entity_name_t n, bool sd) + : cookie(c), name(n), send_disconnect(sd) {} + }; + void complete_disconnect_watches( + ObjectContextRef obc, + const std::list &to_disconnect); + + struct OpFinisher { + virtual ~OpFinisher() { + } + + virtual int execute() = 0; + }; + + /* + * Capture all object state associated with an in-progress read or write. + */ + struct OpContext { + OpRequestRef op; + osd_reqid_t reqid; + std::vector *ops; + + const ObjectState *obs; // Old objectstate + const SnapSet *snapset; // Old snapset + + ObjectState new_obs; // resulting ObjectState + SnapSet new_snapset; // resulting SnapSet (in case of a write) + //pg_stat_t new_stats; // resulting Stats + object_stat_sum_t delta_stats; + + bool modify; // (force) modification (even if op_t is empty) + bool user_modify; // user-visible modification + bool undirty; // user explicitly un-dirtying this object + bool cache_operation; ///< true if this is a cache eviction + bool ignore_cache; ///< true if IGNORE_CACHE flag is std::set + bool ignore_log_op_stats; // don't log op stats + bool update_log_only; ///< this is a write that returned an error - just record in pg log for dup detection + ObjectCleanRegions clean_regions; + + // side effects + std::list > watch_connects; ///< new watch + will_ping flag + std::list watch_disconnects; ///< old watch + send_discon + std::list notifies; + struct NotifyAck { + std::optional watch_cookie; + uint64_t notify_id; + ceph::buffer::list reply_bl; + explicit NotifyAck(uint64_t notify_id) : notify_id(notify_id) {} + NotifyAck(uint64_t notify_id, uint64_t cookie, ceph::buffer::list& rbl) + : watch_cookie(cookie), notify_id(notify_id) { + reply_bl = std::move(rbl); + } + }; + std::list notify_acks; + + uint64_t bytes_written, bytes_read; + + utime_t mtime; + SnapContext snapc; // writer snap context + eversion_t at_version; // pg's current version pointer + version_t user_at_version; // pg's current user version pointer + + /// index of the current subop - only valid inside of do_osd_ops() + int current_osd_subop_num; + /// total number of subops processed in this context for cls_cxx_subop_version() + int processed_subop_count = 0; + + PGTransactionUPtr op_t; + std::vector log; + std::optional updated_hset_history; + + interval_set modified_ranges; + ObjectContextRef obc; + ObjectContextRef clone_obc; // if we created a clone + ObjectContextRef head_obc; // if we also update snapset (see trim_object) + + // FIXME: we may want to kill this msgr hint off at some point! 
+ std::optional data_off = std::nullopt; + + MOSDOpReply *reply; + + PrimaryLogPG *pg; + + int num_read; ///< count read ops + int num_write; ///< count update ops + + mempool::osd_pglog::vector > extra_reqids; + mempool::osd_pglog::map extra_reqid_return_codes; + + hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking + + std::list> on_applied; + std::list> on_committed; + std::list> on_finish; + std::list> on_success; + template + void register_on_finish(F &&f) { + on_finish.emplace_back(std::forward(f)); + } + template + void register_on_success(F &&f) { + on_success.emplace_back(std::forward(f)); + } + template + void register_on_applied(F &&f) { + on_applied.emplace_back(std::forward(f)); + } + template + void register_on_commit(F &&f) { + on_committed.emplace_back(std::forward(f)); + } + + bool sent_reply = false; + + // pending async reads -> + std::list, + std::pair > > pending_async_reads; + int inflightreads; + friend struct OnReadComplete; + void start_async_reads(PrimaryLogPG *pg); + void finish_read(PrimaryLogPG *pg); + bool async_reads_complete() { + return inflightreads == 0; + } + + RWState::State lock_type; + ObcLockManager lock_manager; + + std::map> op_finishers; + + OpContext(const OpContext& other); + const OpContext& operator=(const OpContext& other); + + OpContext(OpRequestRef _op, osd_reqid_t _reqid, std::vector* _ops, + ObjectContextRef& obc, + PrimaryLogPG *_pg) : + op(_op), reqid(_reqid), ops(_ops), + obs(&obc->obs), + snapset(0), + new_obs(obs->oi, obs->exists), + modify(false), user_modify(false), undirty(false), cache_operation(false), + ignore_cache(false), ignore_log_op_stats(false), update_log_only(false), + bytes_written(0), bytes_read(0), user_at_version(0), + current_osd_subop_num(0), + obc(obc), + reply(NULL), pg(_pg), + num_read(0), + num_write(0), + sent_reply(false), + inflightreads(0), + lock_type(RWState::RWNONE) { + if (obc->ssc) { + new_snapset = obc->ssc->snapset; + snapset = &obc->ssc->snapset; + } + } + OpContext(OpRequestRef _op, osd_reqid_t _reqid, + std::vector* _ops, PrimaryLogPG *_pg) : + op(_op), reqid(_reqid), ops(_ops), obs(NULL), snapset(0), + modify(false), user_modify(false), undirty(false), cache_operation(false), + ignore_cache(false), ignore_log_op_stats(false), update_log_only(false), + bytes_written(0), bytes_read(0), user_at_version(0), + current_osd_subop_num(0), + reply(NULL), pg(_pg), + num_read(0), + num_write(0), + inflightreads(0), + lock_type(RWState::RWNONE) {} + void reset_obs(ObjectContextRef obc) { + new_obs = ObjectState(obc->obs.oi, obc->obs.exists); + if (obc->ssc) { + new_snapset = obc->ssc->snapset; + snapset = &obc->ssc->snapset; + } + } + ~OpContext() { + ceph_assert(!op_t); + if (reply) + reply->put(); + for (std::list, + std::pair > >::iterator i = + pending_async_reads.begin(); + i != pending_async_reads.end(); + pending_async_reads.erase(i++)) { + delete i->second.second; + } + } + uint64_t get_features() { + if (op && op->get_req()) { + return op->get_req()->get_connection()->get_features(); + } + return -1ull; + } + }; + using OpContextUPtr = std::unique_ptr; + friend struct OpContext; + + /* + * State on the PG primary associated with the replicated mutation + */ + class RepGather { + public: + hobject_t hoid; + OpRequestRef op; + xlist::item queue_item; + int nref; + + eversion_t v; + int r = 0; + + ceph_tid_t rep_tid; + + bool rep_aborted; + bool all_committed; + + utime_t start; + + eversion_t pg_local_last_complete; + + ObcLockManager lock_manager; + + std::list> 
on_committed; + std::list> on_success; + std::list> on_finish; + + RepGather( + OpContext *c, ceph_tid_t rt, + eversion_t lc) : + hoid(c->obc->obs.oi.soid), + op(c->op), + queue_item(this), + nref(1), + rep_tid(rt), + rep_aborted(false), + all_committed(false), + pg_local_last_complete(lc), + lock_manager(std::move(c->lock_manager)), + on_committed(std::move(c->on_committed)), + on_success(std::move(c->on_success)), + on_finish(std::move(c->on_finish)) {} + + RepGather( + ObcLockManager &&manager, + OpRequestRef &&o, + std::optional > &&on_complete, + ceph_tid_t rt, + eversion_t lc, + int r) : + op(o), + queue_item(this), + nref(1), + r(r), + rep_tid(rt), + rep_aborted(false), + all_committed(false), + pg_local_last_complete(lc), + lock_manager(std::move(manager)) { + if (on_complete) { + on_success.push_back(std::move(*on_complete)); + } + } + + RepGather *get() { + nref++; + return this; + } + void put() { + ceph_assert(nref > 0); + if (--nref == 0) { + delete this; + //generic_dout(0) << "deleting " << this << dendl; + } + } + }; + + +protected: + + /** + * Grabs locks for OpContext, should be cleaned up in close_op_ctx + * + * @param ctx [in,out] ctx to get locks for + * @return true on success, false if we are queued + */ + bool get_rw_locks(bool write_ordered, OpContext *ctx) { + /* If head_obc, !obc->obs->exists and we will always take the + * snapdir lock *before* the head lock. Since all callers will do + * this (read or write) if we get the first we will be guaranteed + * to get the second. + */ + if (write_ordered && ctx->op->may_read()) { + ctx->lock_type = RWState::RWEXCL; + } else if (write_ordered) { + ctx->lock_type = RWState::RWWRITE; + } else { + ceph_assert(ctx->op->may_read()); + ctx->lock_type = RWState::RWREAD; + } + + if (ctx->head_obc) { + ceph_assert(!ctx->obc->obs.exists); + if (!ctx->lock_manager.get_lock_type( + ctx->lock_type, + ctx->head_obc->obs.oi.soid, + ctx->head_obc, + ctx->op)) { + ctx->lock_type = RWState::RWNONE; + return false; + } + } + if (ctx->lock_manager.get_lock_type( + ctx->lock_type, + ctx->obc->obs.oi.soid, + ctx->obc, + ctx->op)) { + return true; + } else { + ceph_assert(!ctx->head_obc); + ctx->lock_type = RWState::RWNONE; + return false; + } + } + + /** + * Cleans up OpContext + * + * @param ctx [in] ctx to clean up + */ + void close_op_ctx(OpContext *ctx); + + /** + * Releases locks + * + * @param manager [in] manager with locks to release + * + * (moved to .cc due to scrubber access) + */ + void release_object_locks(ObcLockManager &lock_manager); + + // replica ops + // [primary|tail] + xlist repop_queue; + + friend class C_OSD_RepopCommit; + void repop_all_committed(RepGather *repop); + void eval_repop(RepGather*); + void issue_repop(RepGather *repop, OpContext *ctx); + RepGather *new_repop( + OpContext *ctx, + ObjectContextRef obc, + ceph_tid_t rep_tid); + boost::intrusive_ptr new_repop( + eversion_t version, + int r, + ObcLockManager &&manager, + OpRequestRef &&op, + std::optional > &&on_complete); + void remove_repop(RepGather *repop); + + OpContextUPtr simple_opc_create(ObjectContextRef obc); + void simple_opc_submit(OpContextUPtr ctx); + + /** + * Merge entries atomically into all acting_recovery_backfill osds + * adjusting missing and recovery state as necessary. + * + * Also used to store error log entries for dup detection. 
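+   *
+   * Hedged usage sketch (the call-site details below are illustrative, not
+   * part of this declaration): a caller such as mark_all_unfound_lost(),
+   * declared further down in this class, builds the pg-log entry list named
+   * in the signature below and hands it over together with the object locks
+   * it already holds and an optional completion, roughly:
+   *
+   *   // 'entries' uses the mempool pg-log list type from the signature below
+   *   submit_log_entries(entries, std::move(lock_manager),
+   *                      std::move(on_complete), op, r);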
+ */ + void submit_log_entries( + const mempool::osd_pglog::list &entries, + ObcLockManager &&manager, + std::optional > &&on_complete, + OpRequestRef op = OpRequestRef(), + int r = 0); + struct LogUpdateCtx { + boost::intrusive_ptr repop; + std::set waiting_on; + }; + void cancel_log_updates(); + std::map log_entry_update_waiting_on; + + + // hot/cold tracking + HitSetRef hit_set; ///< currently accumulating HitSet + utime_t hit_set_start_stamp; ///< time the current HitSet started recording + + + void hit_set_clear(); ///< discard any HitSet state + void hit_set_setup(); ///< initialize HitSet state + void hit_set_create(); ///< create a new HitSet + void hit_set_persist(); ///< persist hit info + bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet + void hit_set_trim(OpContextUPtr &ctx, unsigned max); ///< discard old HitSets + void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets + void hit_set_remove_all(); + + hobject_t get_hit_set_current_object(utime_t stamp); + hobject_t get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt); + + // agent + boost::scoped_ptr agent_state; + + void agent_setup(); ///< initialize agent state + bool agent_work(int max) override ///< entry point to do some agent work + { + return agent_work(max, max); + } + bool agent_work(int max, int agent_flush_quota) override; + bool agent_maybe_flush(ObjectContextRef& obc); ///< maybe flush + bool agent_maybe_evict(ObjectContextRef& obc, bool after_flush); ///< maybe evict + + void agent_load_hit_sets(); ///< load HitSets, if needed + + /// estimate object atime and temperature + /// + /// @param oid [in] object name + /// @param temperature [out] relative temperature (# consider both access time and frequency) + void agent_estimate_temp(const hobject_t& oid, int *temperature); + + /// stop the agent + void agent_stop() override; + void agent_delay() override; + + /// clear agent state + void agent_clear() override; + + /// choose (new) agent mode(s), returns true if op is requeued + bool agent_choose_mode(bool restart = false, OpRequestRef op = OpRequestRef()); + void agent_choose_mode_restart() override; + + /// true if we can send an ondisk/commit for v + bool already_complete(eversion_t v); + + // projected object info + SharedLRU object_contexts; + // std::map from oid.snapdir() to SnapSetContext * + std::map snapset_contexts; + ceph::mutex snapset_contexts_lock = + ceph::make_mutex("PrimaryLogPG::snapset_contexts_lock"); + + // debug order that client ops are applied + std::map> debug_op_order; + + void populate_obc_watchers(ObjectContextRef obc); + void check_blocklisted_obc_watchers(ObjectContextRef obc); + void check_blocklisted_watchers() override; + void get_watchers(std::list *ls) override; + void get_obc_watchers(ObjectContextRef obc, std::list &pg_watchers); +public: + void handle_watch_timeout(WatchRef watch); +protected: + + ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc); + ObjectContextRef get_object_context( + const hobject_t& soid, + bool can_create, + const std::map *attrs = 0 + ); + + void context_registry_on_change(); + void object_context_destructor_callback(ObjectContext *obc); + class C_PG_ObjectContext; + + int find_object_context(const hobject_t& oid, + ObjectContextRef *pobc, + bool can_create, + bool map_snapid_to_clone=false, + hobject_t *missing_oid=NULL); + + void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat); + + void get_src_oloc(const 
object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc); + + SnapSetContext *get_snapset_context( + const hobject_t& oid, + bool can_create, + const std::map *attrs = 0, + bool oid_existed = true //indicate this oid whether exsited in backend + ); + void register_snapset_context(SnapSetContext *ssc) { + std::lock_guard l(snapset_contexts_lock); + _register_snapset_context(ssc); + } + void _register_snapset_context(SnapSetContext *ssc) { + ceph_assert(ceph_mutex_is_locked(snapset_contexts_lock)); + if (!ssc->registered) { + ceph_assert(snapset_contexts.count(ssc->oid) == 0); + ssc->registered = true; + snapset_contexts[ssc->oid] = ssc; + } + } + void put_snapset_context(SnapSetContext *ssc); + + std::map recovering; + + /* + * Backfill + * + * peer_info[backfill_target].last_backfill == info.last_backfill on the peer. + * + * objects prior to peer_info[backfill_target].last_backfill + * - are on the peer + * - are included in the peer stats + * + * objects \in (last_backfill, last_backfill_started] + * - are on the peer or are in backfills_in_flight + * - are not included in pg stats (yet) + * - have their stats in pending_backfill_updates on the primary + */ + std::set backfills_in_flight; + std::map pending_backfill_updates; + + void dump_recovery_info(ceph::Formatter *f) const override { + f->open_array_section("waiting_on_backfill"); + for (std::set::const_iterator p = waiting_on_backfill.begin(); + p != waiting_on_backfill.end(); ++p) + f->dump_stream("osd") << *p; + f->close_section(); + f->dump_stream("last_backfill_started") << last_backfill_started; + { + f->open_object_section("backfill_info"); + backfill_info.dump(f); + f->close_section(); + } + { + f->open_array_section("peer_backfill_info"); + for (std::map::const_iterator pbi = + peer_backfill_info.begin(); + pbi != peer_backfill_info.end(); ++pbi) { + f->dump_stream("osd") << pbi->first; + f->open_object_section("BackfillInterval"); + pbi->second.dump(f); + f->close_section(); + } + f->close_section(); + } + { + f->open_array_section("backfills_in_flight"); + for (std::set::const_iterator i = backfills_in_flight.begin(); + i != backfills_in_flight.end(); + ++i) { + f->dump_stream("object") << *i; + } + f->close_section(); + } + { + f->open_array_section("recovering"); + for (std::map::const_iterator i = recovering.begin(); + i != recovering.end(); + ++i) { + f->dump_stream("object") << i->first; + } + f->close_section(); + } + { + f->open_object_section("pg_backend"); + pgbackend->dump_recovery_info(f); + f->close_section(); + } + } + + /// last backfill operation started + hobject_t last_backfill_started; + bool new_backfill; + + int prep_object_replica_pushes(const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started); + int prep_object_replica_deletes(const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started); + + void finish_degraded_object(const hobject_t oid); + + // Cancels/resets pulls from peer + void check_recovery_sources(const OSDMapRef& map) override ; + + int recover_missing( + const hobject_t& oid, + eversion_t v, + int priority, + PGBackend::RecoveryHandle *h); + + // low level ops + + void _make_clone( + OpContext *ctx, + PGTransaction* t, + ObjectContextRef obc, + const hobject_t& head, const hobject_t& coid, + object_info_t *poi); + void execute_ctx(OpContext *ctx); + void finish_ctx(OpContext *ctx, int log_op_type, int result=0); + void reply_ctx(OpContext *ctx, int err); + void make_writeable(OpContext *ctx); + void 
log_op_stats(const OpRequest& op, uint64_t inb, uint64_t outb); + + void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi, + interval_set& modified, uint64_t offset, + uint64_t length, bool write_full=false); + inline void truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size); + + enum class cache_result_t { + NOOP, + BLOCKED_FULL, + BLOCKED_PROMOTE, + HANDLED_PROXY, + HANDLED_REDIRECT, + REPLIED_WITH_EAGAIN, + BLOCKED_RECOVERY, + }; + cache_result_t maybe_handle_cache_detail(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc, int r, + hobject_t missing_oid, + bool must_promote, + bool in_hit_set, + ObjectContextRef *promote_obc); + cache_result_t maybe_handle_manifest_detail(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc); + bool maybe_handle_manifest(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc) { + return cache_result_t::NOOP != maybe_handle_manifest_detail( + op, + write_ordered, + obc); + } + + /** + * This helper function is called from do_op if the ObjectContext lookup fails. + * @returns true if the caching code is handling the Op, false otherwise. + */ + bool maybe_handle_cache(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc, int r, + const hobject_t& missing_oid, + bool must_promote, + bool in_hit_set = false) { + return cache_result_t::NOOP != maybe_handle_cache_detail( + op, + write_ordered, + obc, + r, + missing_oid, + must_promote, + in_hit_set, + nullptr); + } + + /** + * This helper function checks if a promotion is needed. + */ + bool maybe_promote(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + bool in_hit_set, + uint32_t recency, + OpRequestRef promote_op, + ObjectContextRef *promote_obc = nullptr); + /** + * This helper function tells the client to redirect their request elsewhere. + */ + void do_cache_redirect(OpRequestRef op); + /** + * This function attempts to start a promote. Either it succeeds, + * or places op on a wait std::list. If op is null, failure means that + * this is a noop. If a future user wants to be able to distinguish + * these cases, a return value should be added. 
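+   *
+   * Hedged example (the call site shown is illustrative): maybe_promote()
+   * above funnels into this roughly as
+   *
+   *   promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
+   *
+   * with a null op meaning nothing is waiting on the outcome, so a failed
+   * start is simply a no-op, as noted above.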
+ */ + void promote_object( + ObjectContextRef obc, ///< [optional] obc + const hobject_t& missing_object, ///< oid (if !obc) + const object_locator_t& oloc, ///< locator for obc|oid + OpRequestRef op, ///< [optional] client op + ObjectContextRef *promote_obc = nullptr ///< [optional] new obc for object + ); + + int prepare_transaction(OpContext *ctx); + std::list > in_progress_async_reads; + void complete_read_ctx(int result, OpContext *ctx); + + // pg on-disk content + void check_local() override; + + void _clear_recovery_state() override; + + bool start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, uint64_t *started) override; + + uint64_t recover_primary(uint64_t max, ThreadPool::TPHandle &handle); + uint64_t recover_replicas(uint64_t max, ThreadPool::TPHandle &handle, + bool *recovery_started); + hobject_t earliest_peer_backfill() const; + bool all_peer_done() const; + /** + * @param work_started will be std::set to true if recover_backfill got anywhere + * @returns the number of operations started + */ + uint64_t recover_backfill(uint64_t max, ThreadPool::TPHandle &handle, + bool *work_started); + + /** + * scan a (hash) range of objects in the current pg + * + * @min return at least this many items, unless we are done + * @max return no more than this many items + * @bi.begin first item should be >= this value + * @bi [out] resulting std::map of objects to eversion_t's + */ + void scan_range( + int min, int max, BackfillInterval *bi, + ThreadPool::TPHandle &handle + ); + + /// Update a hash range to reflect changes since the last scan + void update_range( + BackfillInterval *bi, ///< [in,out] interval to update + ThreadPool::TPHandle &handle ///< [in] tp handle + ); + + int prep_backfill_object_push( + hobject_t oid, eversion_t v, ObjectContextRef obc, + std::vector peers, + PGBackend::RecoveryHandle *h); + void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer); + + + class C_OSD_AppliedRecoveredObject; + class C_OSD_CommittedPushedObject; + class C_OSD_AppliedRecoveredObjectReplica; + + void _applied_recovered_object(ObjectContextRef obc); + void _applied_recovered_object_replica(); + void _committed_pushed_object(epoch_t epoch, eversion_t lc); + void recover_got(hobject_t oid, eversion_t v); + + // -- copyfrom -- + std::map copy_ops; + + int do_copy_get(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& op, + ObjectContextRef& obc); + int finish_copy_get(); + + void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid, + OSDOp& osd_op); + + /** + * To copy an object, call start_copy. 
+ * + * @param cb: The CopyCallback to be activated when the copy is complete + * @param obc: The ObjectContext we are copying into + * @param src: The source object + * @param oloc: the source object locator + * @param version: the version of the source object to copy (0 for any) + */ + void start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src, + object_locator_t oloc, version_t version, unsigned flags, + bool mirror_snapset, unsigned src_obj_fadvise_flags, + unsigned dest_obj_fadvise_flags); + void process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r); + void _write_copy_chunk(CopyOpRef cop, PGTransaction *t); + uint64_t get_copy_chunk_size() const { + uint64_t size = cct->_conf->osd_copyfrom_max_chunk; + if (pool.info.required_alignment()) { + uint64_t alignment = pool.info.required_alignment(); + if (size % alignment) { + size += alignment - (size % alignment); + } + } + return size; + } + void _copy_some(ObjectContextRef obc, CopyOpRef cop); + void finish_copyfrom(CopyFromCallback *cb); + void finish_promote(int r, CopyResults *results, ObjectContextRef obc); + void cancel_copy(CopyOpRef cop, bool requeue, std::vector *tids); + void cancel_copy_ops(bool requeue, std::vector *tids); + + friend struct C_Copyfrom; + + // -- flush -- + std::map flush_ops; + + /// start_flush takes ownership of on_flush iff ret == -EINPROGRESS + int start_flush( + OpRequestRef op, ObjectContextRef obc, + bool blocking, hobject_t *pmissing, + std::optional> &&on_flush); + void finish_flush(hobject_t oid, ceph_tid_t tid, int r); + int try_flush_mark_clean(FlushOpRef fop); + void cancel_flush(FlushOpRef fop, bool requeue, std::vector *tids); + void cancel_flush_ops(bool requeue, std::vector *tids); + + /// @return false if clone is has been evicted + bool is_present_clone(hobject_t coid); + + friend struct C_Flush; + + // -- scrub -- + bool _range_available_for_scrub( + const hobject_t &begin, const hobject_t &end) override; + + void _split_into(pg_t child_pgid, PG *child, + unsigned split_bits) override; + void apply_and_flush_repops(bool requeue); + + int do_xattr_cmp_u64(int op, __u64 v1, ceph::buffer::list& xattr); + int do_xattr_cmp_str(int op, std::string& v1s, ceph::buffer::list& xattr); + + // -- checksum -- + int do_checksum(OpContext *ctx, OSDOp& osd_op, ceph::buffer::list::const_iterator *bl_it); + int finish_checksum(OSDOp& osd_op, Checksummer::CSumType csum_type, + ceph::buffer::list::const_iterator *init_value_bl_it, + const ceph::buffer::list &read_bl); + + friend struct C_ChecksumRead; + + int do_extent_cmp(OpContext *ctx, OSDOp& osd_op); + int finish_extent_cmp(OSDOp& osd_op, const ceph::buffer::list &read_bl); + + friend struct C_ExtentCmpRead; + + int do_read(OpContext *ctx, OSDOp& osd_op); + int do_sparse_read(OpContext *ctx, OSDOp& osd_op); + int do_writesame(OpContext *ctx, OSDOp& osd_op); + + bool pgls_filter(const PGLSFilter& filter, const hobject_t& sobj); + + std::pair> get_pgls_filter( + ceph::buffer::list::const_iterator& iter); + + std::map> in_progress_proxy_ops; + void kick_proxy_ops_blocked(hobject_t& soid); + void cancel_proxy_ops(bool requeue, std::vector *tids); + + // -- proxyread -- + std::map proxyread_ops; + + void do_proxy_read(OpRequestRef op, ObjectContextRef obc = NULL); + void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r); + void cancel_proxy_read(ProxyReadOpRef prdop, std::vector *tids); + + friend struct C_ProxyRead; + + // -- proxywrite -- + std::map proxywrite_ops; + + void do_proxy_write(OpRequestRef op, ObjectContextRef obc = NULL); + 
void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r); + void cancel_proxy_write(ProxyWriteOpRef pwop, std::vector *tids); + + friend struct C_ProxyWrite_Commit; + + // -- chunkop -- + enum class refcount_t { + INCREMENT_REF, + DECREMENT_REF, + CREATE_OR_GET_REF, + }; + void do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid, + ObjectContextRef obc, bool write_ordered); + void do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index, + uint64_t chunk_index, uint64_t req_offset, uint64_t req_length, + uint64_t req_total_len, bool write_ordered); + bool can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc); + void _copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset); + void process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset); + void finish_promote_manifest(int r, CopyResults *results, ObjectContextRef obc); + void cancel_and_requeue_proxy_ops(hobject_t oid); + void cancel_manifest_ops(bool requeue, vector *tids); + ceph_tid_t refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type, + Context *cb, std::optional chunk); + void dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx); + void dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs); + void dec_refcount_by_dirty(OpContext* ctx); + ObjectContextRef get_prev_clone_obc(ObjectContextRef obc); + bool recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op); + void get_adjacent_clones(ObjectContextRef src_obc, + ObjectContextRef& _l, ObjectContextRef& _g); + bool inc_refcount_by_set(OpContext* ctx, object_manifest_t& tgt, + OSDOp& osd_op); + int do_cdc(const object_info_t& oi, std::map& chunk_map, + std::map& chunks); + int start_dedup(OpRequestRef op, ObjectContextRef obc); + std::pair get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk); + int finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset); + + friend struct C_ProxyChunkRead; + friend class PromoteManifestCallback; + friend struct C_CopyChunk; + friend struct RefCountCallback; + friend struct C_SetDedupChunks; + +public: + PrimaryLogPG(OSDService *o, OSDMapRef curmap, + const PGPool &_pool, + const std::map& ec_profile, + spg_t p); + ~PrimaryLogPG() override; + + void do_command( + const std::string_view& prefix, + const cmdmap_t& cmdmap, + const ceph::buffer::list& idata, + std::function on_finish) override; + + void clear_cache() override; + int get_cache_obj_count() override { + return object_contexts.get_count(); + } + unsigned get_pg_shard() const { + return info.pgid.hash_to_shard(osd->get_num_shards()); + } + void do_request( + OpRequestRef& op, + ThreadPool::TPHandle &handle) override; + void do_op(OpRequestRef& op); + void record_write_error(OpRequestRef op, const hobject_t &soid, + MOSDOpReply *orig_reply, int r, + OpContext *ctx_for_op_returns=nullptr); + void do_pg_op(OpRequestRef op); + void do_scan( + OpRequestRef op, + ThreadPool::TPHandle &handle); + void do_backfill(OpRequestRef op); + void do_backfill_remove(OpRequestRef op); + + void handle_backoff(OpRequestRef& op); + + int trim_object(bool first, const hobject_t &coid, snapid_t snap_to_trim, + OpContextUPtr *ctxp); + void snap_trimmer(epoch_t e) override; + void kick_snap_trim() override; + void snap_trimmer_scrub_complete() override; + int do_osd_ops(OpContext *ctx, std::vector& ops); + + int _get_tmap(OpContext *ctx, ceph::buffer::list *header, ceph::buffer::list *vals); + int do_tmap2omap(OpContext *ctx, unsigned flags); + int 
do_tmapup(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& osd_op); + int do_tmapup_slow(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& osd_op, ceph::buffer::list& bl); + + void do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn); +private: + int do_scrub_ls(const MOSDOp *op, OSDOp *osd_op); + bool check_src_targ(const hobject_t& soid, const hobject_t& toid) const; + + uint64_t temp_seq; ///< last id for naming temp objects + /// generate a new temp object name + hobject_t generate_temp_object(const hobject_t& target); + /// generate a new temp object name (for recovery) + hobject_t get_temp_recovery_object(const hobject_t& target, + eversion_t version) override; + int get_recovery_op_priority() const { + int64_t pri = 0; + pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); + return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority; + } + +public: + coll_t get_coll() { + return coll; + } + void split_colls( + spg_t child, + int split_bits, + int seed, + const pg_pool_t *pool, + ObjectStore::Transaction &t) override { + coll_t target = coll_t(child); + create_pg_collection(t, child, split_bits); + t.split_collection( + coll, + split_bits, + seed, + target); + init_pg_ondisk(t, child, pool); + } +private: + + struct DoSnapWork : boost::statechart::event< DoSnapWork > { + DoSnapWork() : boost::statechart::event < DoSnapWork >() {} + }; + struct KickTrim : boost::statechart::event< KickTrim > { + KickTrim() : boost::statechart::event < KickTrim >() {} + }; + struct RepopsComplete : boost::statechart::event< RepopsComplete > { + RepopsComplete() : boost::statechart::event < RepopsComplete >() {} + }; + struct ScrubComplete : boost::statechart::event< ScrubComplete > { + ScrubComplete() : boost::statechart::event < ScrubComplete >() {} + }; + struct TrimWriteUnblocked : boost::statechart::event< TrimWriteUnblocked > { + TrimWriteUnblocked() : boost::statechart::event < TrimWriteUnblocked >() {} + }; + struct Reset : boost::statechart::event< Reset > { + Reset() : boost::statechart::event< Reset >() {} + }; + struct SnapTrimReserved : boost::statechart::event< SnapTrimReserved > { + SnapTrimReserved() : boost::statechart::event< SnapTrimReserved >() {} + }; + struct SnapTrimTimerReady : boost::statechart::event< SnapTrimTimerReady > { + SnapTrimTimerReady() : boost::statechart::event< SnapTrimTimerReady >() {} + }; + + struct NotTrimming; + struct SnapTrimmer : public boost::statechart::state_machine< SnapTrimmer, NotTrimming > { + PrimaryLogPG *pg; + explicit SnapTrimmer(PrimaryLogPG *pg) : pg(pg) {} + void log_enter(const char *state_name); + void log_exit(const char *state_name, utime_t duration); + bool permit_trim(); + bool can_trim() { + return + permit_trim() && + !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM); + } + } snap_trimmer_machine; + + struct WaitReservation; + struct Trimming : boost::statechart::state< Trimming, SnapTrimmer, WaitReservation >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< KickTrim >, + boost::statechart::transition< Reset, NotTrimming > + > reactions; + + std::set in_flight; + snapid_t snap_to_trim; + + explicit Trimming(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context< SnapTrimmer >().permit_trim()); + ceph_assert(in_flight.empty()); + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + auto *pg = context< SnapTrimmer >().pg; + 
pg->osd->snap_reserver.cancel_reservation(pg->get_pgid()); + pg->state_clear(PG_STATE_SNAPTRIM); + pg->publish_stats_to_osd(); + } + boost::statechart::result react(const KickTrim&) { + return discard_event(); + } + }; + + /* SnapTrimmerStates */ + struct WaitTrimTimer : boost::statechart::state< WaitTrimTimer, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< SnapTrimTimerReady > + > reactions; + Context *wakeup = nullptr; + explicit WaitTrimTimer(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming/WaitTrimTimer") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context().in_flight.empty()); + struct OnTimer : Context { + PrimaryLogPGRef pg; + epoch_t epoch; + OnTimer(PrimaryLogPGRef pg, epoch_t epoch) : pg(pg), epoch(epoch) {} + void finish(int) override { + pg->lock(); + if (!pg->pg_has_reset_since(epoch)) + pg->snap_trimmer_machine.process_event(SnapTrimTimerReady()); + pg->unlock(); + } + }; + auto *pg = context< SnapTrimmer >().pg; + float osd_snap_trim_sleep = pg->osd->osd->get_osd_snap_trim_sleep(); + if (osd_snap_trim_sleep > 0) { + std::lock_guard l(pg->osd->sleep_lock); + wakeup = pg->osd->sleep_timer.add_event_after( + osd_snap_trim_sleep, + new OnTimer{pg, pg->get_osdmap_epoch()}); + } else { + post_event(SnapTrimTimerReady()); + } + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + auto *pg = context< SnapTrimmer >().pg; + if (wakeup) { + std::lock_guard l(pg->osd->sleep_lock); + pg->osd->sleep_timer.cancel_event(wakeup); + wakeup = nullptr; + } + } + boost::statechart::result react(const SnapTrimTimerReady &) { + wakeup = nullptr; + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } else { + return transit< AwaitAsyncWork >(); + } + } + }; + + struct WaitRWLock : boost::statechart::state< WaitRWLock, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< TrimWriteUnblocked > + > reactions; + explicit WaitRWLock(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming/WaitRWLock") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context().in_flight.empty()); + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + } + boost::statechart::result react(const TrimWriteUnblocked&) { + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } else { + return transit< AwaitAsyncWork >(); + } + } + }; + + struct WaitRepops : boost::statechart::state< WaitRepops, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< RepopsComplete > + > reactions; + explicit WaitRepops(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming/WaitRepops") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(!context().in_flight.empty()); + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + } + boost::statechart::result react(const RepopsComplete&) { + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } else { + return transit< WaitTrimTimer >(); + } + } + }; + + struct AwaitAsyncWork : boost::statechart::state< AwaitAsyncWork, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< DoSnapWork > + > reactions; + explicit AwaitAsyncWork(my_context ctx); + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); 
+ } + boost::statechart::result react(const DoSnapWork&); + }; + + struct WaitReservation : boost::statechart::state< WaitReservation, Trimming >, NamedState { + /* WaitReservation is a sub-state of trimming simply so that exiting Trimming + * always cancels the reservation */ + typedef boost::mpl::list < + boost::statechart::custom_reaction< SnapTrimReserved > + > reactions; + struct ReservationCB : public Context { + PrimaryLogPGRef pg; + bool canceled; + explicit ReservationCB(PrimaryLogPG *pg) : pg(pg), canceled(false) {} + void finish(int) override { + pg->lock(); + if (!canceled) + pg->snap_trimmer_machine.process_event(SnapTrimReserved()); + pg->unlock(); + } + void cancel() { + ceph_assert(pg->is_locked()); + ceph_assert(!canceled); + canceled = true; + } + }; + ReservationCB *pending = nullptr; + + explicit WaitReservation(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming/WaitReservation") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context().in_flight.empty()); + auto *pg = context< SnapTrimmer >().pg; + pending = new ReservationCB(pg); + pg->osd->snap_reserver.request_reservation( + pg->get_pgid(), + pending, + 0); + pg->state_set(PG_STATE_SNAPTRIM_WAIT); + pg->publish_stats_to_osd(); + } + boost::statechart::result react(const SnapTrimReserved&); + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + if (pending) + pending->cancel(); + pending = nullptr; + auto *pg = context< SnapTrimmer >().pg; + pg->state_clear(PG_STATE_SNAPTRIM_WAIT); + pg->state_clear(PG_STATE_SNAPTRIM_ERROR); + pg->publish_stats_to_osd(); + } + }; + + struct WaitScrub : boost::statechart::state< WaitScrub, SnapTrimmer >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< ScrubComplete >, + boost::statechart::custom_reaction< KickTrim >, + boost::statechart::transition< Reset, NotTrimming > + > reactions; + explicit WaitScrub(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming/WaitScrub") { + context< SnapTrimmer >().log_enter(state_name); + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + } + boost::statechart::result react(const ScrubComplete&) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } + boost::statechart::result react(const KickTrim&) { + return discard_event(); + } + }; + + struct NotTrimming : boost::statechart::state< NotTrimming, SnapTrimmer >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< KickTrim >, + boost::statechart::transition< Reset, NotTrimming > + > reactions; + explicit NotTrimming(my_context ctx); + void exit(); + boost::statechart::result react(const KickTrim&); + }; + + int _verify_no_head_clones(const hobject_t& soid, + const SnapSet& ss); + // return true if we're creating a local object, false for a + // whiteout or no change. 
+ void maybe_create_new_object(OpContext *ctx, bool ignore_transaction=false); + int _delete_oid(OpContext *ctx, bool no_whiteout, bool try_no_whiteout); + int _rollback_to(OpContext *ctx, ceph_osd_op& op); +public: + bool is_missing_object(const hobject_t& oid) const; + bool is_unreadable_object(const hobject_t &oid) const { + return is_missing_object(oid) || + !recovery_state.get_missing_loc().readable_with_acting( + oid, get_actingset()); + } + void maybe_kick_recovery(const hobject_t &soid); + void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op); + + int get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op); + + bool check_laggy(OpRequestRef& op); + bool check_laggy_requeue(OpRequestRef& op); + void recheck_readable() override; + + bool is_backfill_target(pg_shard_t osd) const { + return recovery_state.is_backfill_target(osd); + } + const std::set &get_backfill_targets() const { + return recovery_state.get_backfill_targets(); + } + bool is_async_recovery_target(pg_shard_t peer) const { + return recovery_state.is_async_recovery_target(peer); + } + const std::set &get_async_recovery_targets() const { + return recovery_state.get_async_recovery_targets(); + } + bool is_degraded_or_backfilling_object(const hobject_t& oid); + bool is_degraded_on_async_recovery_target(const hobject_t& soid); + void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op); + + void block_write_on_full_cache( + const hobject_t& oid, OpRequestRef op); + void block_for_clean( + const hobject_t& oid, OpRequestRef op); + void block_write_on_snap_rollback( + const hobject_t& oid, ObjectContextRef obc, OpRequestRef op); + void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op); + + bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op); + void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op); + void kick_object_context_blocked(ObjectContextRef obc); + + void maybe_force_recovery(); + + void mark_all_unfound_lost( + int what, + std::function on_finish); + eversion_t pick_newest_available(const hobject_t& oid); + + void do_update_log_missing( + OpRequestRef &op); + + void do_update_log_missing_reply( + OpRequestRef &op); + + void plpg_on_role_change() override; + void plpg_on_pool_change() override; + void clear_async_reads(); + void on_change(ObjectStore::Transaction &t) override; + void on_activate_complete() override; + void on_flushed() override; + void on_removal(ObjectStore::Transaction &t) override; + void on_shutdown() override; + bool check_failsafe_full() override; + bool maybe_preempt_replica_scrub(const hobject_t& oid) override; + int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx); + + // attr cache handling + void setattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const std::string &key, + ceph::buffer::list &val); + void setattrs_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + std::map &attrs); + void rmattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const std::string &key); + int getattr_maybe_cache( + ObjectContextRef obc, + const std::string &key, + ceph::buffer::list *val); + int getattrs_maybe_cache( + ObjectContextRef obc, + std::map *out); + +public: + void set_dynamic_perf_stats_queries( + const std::list &queries) override; + void get_dynamic_perf_stats(DynamicPerfStats *stats) override; + +private: + DynamicPerfStats m_dynamic_perf_stats; +}; + +inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop) +{ + out << 
"repgather(" << &repop + << " " << repop.v + << " rep_tid=" << repop.rep_tid + << " committed?=" << repop.all_committed + << " r=" << repop.r + << ")"; + return out; +} + +inline ostream& operator<<(ostream& out, + const PrimaryLogPG::ProxyWriteOpRef& pwop) +{ + out << "proxywrite(" << &pwop + << " " << pwop->user_version + << " pwop_tid=" << pwop->objecter_tid; + if (pwop->ctx->op) + out << " op=" << *(pwop->ctx->op->get_req()); + out << ")"; + return out; +} + +void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop); +void intrusive_ptr_release(PrimaryLogPG::RepGather *repop); + + +#endif diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc new file mode 100644 index 000000000..8cf76dd1d --- /dev/null +++ b/src/osd/PrimaryLogScrub.cc @@ -0,0 +1,589 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PrimaryLogScrub.h" + +#include "common/scrub_types.h" + +#include "PeeringState.h" +#include "PrimaryLogPG.h" +#include "scrub_machine.h" + +#define dout_context (m_osds->cct) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +template +static ostream& _prefix(std::ostream* _dout, T* t) +{ + return t->gen_prefix(*_dout); +} + +using namespace Scrub; +using Scrub::ScrubMachine; + +bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const +{ + if (!m_store) { + return false; + } + + if (arg.get_snapsets) { + res_inout.vals = + m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return); + } else { + res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after, + arg.max_return); + } + return true; +} + +void PrimaryLogScrub::_scrub_finish() +{ + auto& info = m_pg->info; ///< a temporary alias + + dout(10) << __func__ + << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid") + << dendl; + + if (info.stats.stats_invalid) { + m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) { + stats.stats = m_scrub_cstat; + stats.stats_invalid = false; + return false; + }); + + if (m_pl_pg->agent_state) + m_pl_pg->agent_choose_mode(); + } + + dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/" + << info.stats.stats.sum.num_objects << " objects, " + << m_scrub_cstat.sum.num_object_clones << "/" + << info.stats.stats.sum.num_object_clones << " clones, " + << m_scrub_cstat.sum.num_objects_dirty << "/" + << info.stats.stats.sum.num_objects_dirty << " dirty, " + << m_scrub_cstat.sum.num_objects_omap << "/" + << info.stats.stats.sum.num_objects_omap << " omap, " + << m_scrub_cstat.sum.num_objects_pinned << "/" + << info.stats.stats.sum.num_objects_pinned << " pinned, " + << m_scrub_cstat.sum.num_objects_hit_set_archive << "/" + << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, " + << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes + << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/" + << info.stats.stats.sum.num_objects_manifest << " manifest objects, " + << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/" + << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes." 
+ << dendl; + + if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects || + m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones || + (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty && + !info.stats.dirty_stats_invalid) || + (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap && + !info.stats.omap_stats_invalid) || + (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned && + !info.stats.pin_stats_invalid) || + (m_scrub_cstat.sum.num_objects_hit_set_archive != + info.stats.stats.sum.num_objects_hit_set_archive && + !info.stats.hitset_stats_invalid) || + (m_scrub_cstat.sum.num_bytes_hit_set_archive != + info.stats.stats.sum.num_bytes_hit_set_archive && + !info.stats.hitset_bytes_stats_invalid) || + (m_scrub_cstat.sum.num_objects_manifest != + info.stats.stats.sum.num_objects_manifest && + !info.stats.manifest_stats_invalid) || + m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts || + m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) { + m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got " + << m_scrub_cstat.sum.num_objects << "/" + << info.stats.stats.sum.num_objects << " objects, " + << m_scrub_cstat.sum.num_object_clones << "/" + << info.stats.stats.sum.num_object_clones << " clones, " + << m_scrub_cstat.sum.num_objects_dirty << "/" + << info.stats.stats.sum.num_objects_dirty << " dirty, " + << m_scrub_cstat.sum.num_objects_omap << "/" + << info.stats.stats.sum.num_objects_omap << " omap, " + << m_scrub_cstat.sum.num_objects_pinned << "/" + << info.stats.stats.sum.num_objects_pinned << " pinned, " + << m_scrub_cstat.sum.num_objects_hit_set_archive << "/" + << info.stats.stats.sum.num_objects_hit_set_archive + << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts + << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, " + << m_scrub_cstat.sum.num_bytes << "/" + << info.stats.stats.sum.num_bytes << " bytes, " + << m_scrub_cstat.sum.num_objects_manifest << "/" + << info.stats.stats.sum.num_objects_manifest + << " manifest objects, " + << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/" + << info.stats.stats.sum.num_bytes_hit_set_archive + << " hit_set_archive bytes."; + ++m_shallow_errors; + + if (m_is_repair) { + ++m_fixed_count; + m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) { + stats.stats = m_scrub_cstat; + stats.dirty_stats_invalid = false; + stats.omap_stats_invalid = false; + stats.hitset_stats_invalid = false; + stats.hitset_bytes_stats_invalid = false; + stats.pin_stats_invalid = false; + stats.manifest_stats_invalid = false; + return false; + }); + m_pl_pg->publish_stats_to_osd(); + m_pl_pg->recovery_state.share_pg_info(); + } + } + // Clear object context cache to get repair information + if (m_is_repair) + m_pl_pg->object_contexts.clear(); +} + +static bool doing_clones(const std::optional& snapset, + const vector::reverse_iterator& curclone) +{ + return snapset && curclone != snapset->clones.rend(); +} + +void PrimaryLogScrub::log_missing(int missing, + const std::optional& head, + LogChannelRef clog, + const spg_t& pgid, + const char* func, + bool allow_incomplete_clones) +{ + ceph_assert(head); + if (allow_incomplete_clones) { + dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped " + << missing << " clone(s) in cache tier" << dendl; + } else { + clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing 
+ << " missing clone(s)"; + } +} + +int PrimaryLogScrub::process_clones_to(const std::optional& head, + const std::optional& snapset, + LogChannelRef clog, + const spg_t& pgid, + bool allow_incomplete_clones, + std::optional target, + vector::reverse_iterator* curclone, + inconsistent_snapset_wrapper& e) +{ + ceph_assert(head); + ceph_assert(snapset); + int missing_count = 0; + + // NOTE: clones are in descending order, thus **curclone > target test here + hobject_t next_clone(*head); + while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) { + + ++missing_count; + // it is okay to be missing one or more clones in a cache tier. + // skip higher-numbered clones in the list. + if (!allow_incomplete_clones) { + next_clone.snap = **curclone; + clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone " + << next_clone << " " << m_missing << " missing"; + ++m_shallow_errors; + e.set_clone_missing(next_clone.snap); + } + // Clones are descending + ++(*curclone); + } + return missing_count; +} + +/* + * Validate consistency of the object info and snap sets. + * + * We are sort of comparing 2 lists. The main loop is on objmap.objects. But + * the comparison of the objects is against multiple snapset.clones. There are + * multiple clone lists and in between lists we expect head. + * + * Example + * + * objects expected + * ======= ======= + * obj1 snap 1 head, unexpected obj1 snap 1 + * obj2 head head, match + * [SnapSet clones 6 4 2 1] + * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7 + * obj2 snap 6 obj2 snap 6, match + * obj2 snap 4 obj2 snap 4, match + * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match + * [Snapset clones 3 1] + * obj3 snap 3 obj3 snap 3 match + * obj3 snap 1 obj3 snap 1 match + * obj4 head head, match + * [Snapset clones 4] + * EOL obj4 snap 4, (expected) + */ +void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap, + const missing_map_t& missing_digest) +{ + dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects + << dendl; + + auto& info = m_pl_pg->info; + const PGPool& pool = m_pl_pg->pool; + bool allow_incomplete_clones = pool.info.allow_incomplete_clones(); + + std::optional all_clones; // Unspecified snapid_t or std::nullopt + + // traverse in reverse order. + std::optional head; + std::optional snapset; // If initialized so will head (above) + vector::reverse_iterator curclone; // Defined only if snapset initialized + int missing = 0; + inconsistent_snapset_wrapper soid_error, head_error; + int soid_error_count = 0; + + for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) { + + const hobject_t& soid = p->first; + ceph_assert(!soid.is_snapdir()); + soid_error = inconsistent_snapset_wrapper{soid}; + object_stat_sum_t stat; + std::optional oi; + + stat.num_objects++; + + if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace) + stat.num_objects_hit_set_archive++; + + if (soid.is_snap()) { + // it's a clone + stat.num_object_clones++; + } + + // basic checks. 
+ if (p->second.attrs.count(OI_ATTR) == 0) { + oi = std::nullopt; + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '" + << OI_ATTR << "' attr"; + ++m_shallow_errors; + soid_error.set_info_missing(); + } else { + bufferlist bv; + bv.push_back(p->second.attrs[OI_ATTR]); + try { + oi = object_info_t(); // Initialize optional<> before decode into it + oi->decode(bv); + } catch (ceph::buffer::error& e) { + oi = std::nullopt; + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : can't decode '" << OI_ATTR << "' attr " << e.what(); + ++m_shallow_errors; + soid_error.set_info_corrupted(); + soid_error.set_info_missing(); // Not available too + } + } + + if (oi) { + if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : on disk size (" << p->second.size + << ") does not match object info size (" << oi->size + << ") adjusted for ondisk to (" + << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")"; + soid_error.set_size_mismatch(); + ++m_shallow_errors; + } + + dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl; + + // A clone num_bytes will be added later when we have snapset + if (!soid.is_snap()) { + stat.num_bytes += oi->size; + } + if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace) + stat.num_bytes_hit_set_archive += oi->size; + + if (oi->is_dirty()) + ++stat.num_objects_dirty; + if (oi->is_whiteout()) + ++stat.num_whiteouts; + if (oi->is_omap()) + ++stat.num_objects_omap; + if (oi->is_cache_pinned()) + ++stat.num_objects_pinned; + if (oi->has_manifest()) + ++stat.num_objects_manifest; + } + + // Check for any problems while processing clones + if (doing_clones(snapset, curclone)) { + std::optional target; + // Expecting an object with snap for current head + if (soid.has_snapset() || soid.get_head() != head->get_head()) { + + dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid + << " while processing " << *head << dendl; + + target = all_clones; + } else { + ceph_assert(soid.is_snap()); + target = soid.snap; + } + + // Log any clones we were expecting to be there up to target + // This will set missing, but will be a no-op if snap.soid == *curclone. + missing += + process_clones_to(head, snapset, m_osds->clog, info.pgid, + allow_incomplete_clones, target, &curclone, head_error); + } + + bool expected; + // Check doing_clones() again in case we ran process_clones_to() + if (doing_clones(snapset, curclone)) { + // A head would have processed all clones above + // or all greater than *curclone. + ceph_assert(soid.is_snap() && *curclone <= soid.snap); + + // After processing above clone snap should match the expected curclone + expected = (*curclone == soid.snap); + } else { + // If we aren't doing clones any longer, then expecting head + expected = soid.has_snapset(); + } + if (!expected) { + // If we couldn't read the head's snapset, just ignore clones + if (head && !snapset) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : clone ignored due to missing snapset"; + } else { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : is an unexpected clone"; + } + ++m_shallow_errors; + soid_error.set_headless(); + m_store->add_snap_error(pool.id, soid_error); + ++soid_error_count; + if (head && soid.get_head() == head->get_head()) + head_error.set_clone(soid.snap); + continue; + } + + // new snapset? 
+ if (soid.has_snapset()) { + + if (missing) { + log_missing(missing, head, m_osds->clog, info.pgid, __func__, + pool.info.allow_incomplete_clones()); + } + + // Save previous head error information + if (head && (head_error.errors || soid_error_count)) + m_store->add_snap_error(pool.id, head_error); + // Set this as a new head object + head = soid; + missing = 0; + head_error = soid_error; + soid_error_count = 0; + + dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl; + + if (p->second.attrs.count(SS_ATTR) == 0) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '" + << SS_ATTR << "' attr"; + ++m_shallow_errors; + snapset = std::nullopt; + head_error.set_snapset_missing(); + } else { + bufferlist bl; + bl.push_back(p->second.attrs[SS_ATTR]); + auto blp = bl.cbegin(); + try { + snapset = SnapSet(); // Initialize optional<> before decoding into it + decode(*snapset, blp); + head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]); + } catch (ceph::buffer::error& e) { + snapset = std::nullopt; + m_osds->clog->error() + << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR + << "' attr " << e.what(); + ++m_shallow_errors; + head_error.set_snapset_corrupted(); + } + } + + if (snapset) { + // what will be next? + curclone = snapset->clones.rbegin(); + + if (!snapset->clones.empty()) { + dout(20) << " snapset " << *snapset << dendl; + if (snapset->seq == 0) { + m_osds->clog->error() + << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set"; + ++m_shallow_errors; + head_error.set_snapset_error(); + } + } + } + } else { + ceph_assert(soid.is_snap()); + ceph_assert(head); + ceph_assert(snapset); + ceph_assert(soid.snap == *curclone); + + dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl; + + if (snapset->clone_size.count(soid.snap) == 0) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : is missing in clone_size"; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } else { + if (oi && oi->size != snapset->clone_size[soid.snap]) { + m_osds->clog->error() + << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size + << " != clone_size " << snapset->clone_size[*curclone]; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } + + if (snapset->clone_overlap.count(soid.snap) == 0) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : is missing in clone_overlap"; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } else { + // This checking is based on get_clone_bytes(). The first 2 asserts + // can't happen because we know we have a clone_size and + // a clone_overlap. Now we check that the interval_set won't + // cause the last assert. + uint64_t size = snapset->clone_size.find(soid.snap)->second; + const interval_set& overlap = + snapset->clone_overlap.find(soid.snap)->second; + bool bad_interval_set = false; + for (interval_set::const_iterator i = overlap.begin(); + i != overlap.end(); ++i) { + if (size < i.get_len()) { + bad_interval_set = true; + break; + } + size -= i.get_len(); + } + + if (bad_interval_set) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : bad interval_set in clone_overlap"; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } else { + stat.num_bytes += snapset->get_clone_bytes(soid.snap); + } + } + } + + // what's next? 
+ ++curclone; + if (soid_error.errors) { + m_store->add_snap_error(pool.id, soid_error); + ++soid_error_count; + } + } + m_scrub_cstat.add(stat); + } + + if (doing_clones(snapset, curclone)) { + dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid + << " No more objects while processing " << *head << dendl; + + missing += + process_clones_to(head, snapset, m_osds->clog, info.pgid, + allow_incomplete_clones, all_clones, &curclone, head_error); + } + + // There could be missing found by the test above or even + // before dropping out of the loop for the last head. + if (missing) { + log_missing(missing, head, m_osds->clog, info.pgid, __func__, + allow_incomplete_clones); + } + if (head && (head_error.errors || soid_error_count)) + m_store->add_snap_error(pool.id, head_error); + + dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing" + << dendl; + for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) { + + ceph_assert(!p->first.is_snapdir()); + dout(10) << __func__ << " recording digests for " << p->first << dendl; + + ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false); + if (!obc) { + m_osds->clog->error() << info.pgid << " " << m_mode_desc + << " cannot get object context for object " << p->first; + continue; + } + if (obc->obs.oi.soid != p->first) { + m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first + << " : object has a valid oi attr with a mismatched name, " + << " obc->obs.oi.soid: " << obc->obs.oi.soid; + continue; + } + PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc); + ctx->at_version = m_pl_pg->get_next_version(); + ctx->mtime = utime_t(); // do not update mtime + if (p->second.first) { + ctx->new_obs.oi.set_data_digest(*p->second.first); + } else { + ctx->new_obs.oi.clear_data_digest(); + } + if (p->second.second) { + ctx->new_obs.oi.set_omap_digest(*p->second.second); + } else { + ctx->new_obs.oi.clear_omap_digest(); + } + m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY); + + ++num_digest_updates_pending; + ctx->register_on_success([this]() { + if ((num_digest_updates_pending >= 1) && + (--num_digest_updates_pending == 0)) { + m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops()); + } + }); + + m_pl_pg->simple_opc_submit(std::move(ctx)); + } + + dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl; +} + +PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {} + +void PrimaryLogScrub::_scrub_clear_state() +{ + dout(15) << __func__ << dendl; + m_scrub_cstat = object_stat_collection_t(); +} + +void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) +{ + // We scrub objects in hobject_t order, so objects before m_start have already been + // scrubbed and their stats have already been added to the scrubber. Objects after that + // point haven't been included in the scrubber's stats accounting yet, so they will be + // included when the scrubber gets to that object. + dout(15) << __func__ << " soid: " << soid << " scrub is active? 
" << is_scrub_active() + << dendl; + if (is_primary() && is_scrub_active()) { + if (soid < m_start) { + dout(20) << __func__ << " " << soid << " < [" << m_start << "," << m_end << ")" + << dendl; + m_scrub_cstat.add(delta_stats); + } else { + dout(20) << __func__ << " " << soid << " >= [" << m_start << "," << m_end << ")" + << dendl; + } + } +} diff --git a/src/osd/PrimaryLogScrub.h b/src/osd/PrimaryLogScrub.h new file mode 100644 index 000000000..78353d6db --- /dev/null +++ b/src/osd/PrimaryLogScrub.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +// the './' includes are marked this way to affect clang-format +#include "./pg_scrubber.h" + +#include +#include +#include + +#include "debug.h" + +#include "common/errno.h" +#include "common/scrub_types.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDRepScrubMap.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrubReserve.h" + +#include "OSD.h" +#include "scrub_machine.h" + +class PrimaryLogPG; + +/** + * The derivative of PgScrubber that is used by PrimaryLogPG. + */ +class PrimaryLogScrub : public PgScrubber { + public: + explicit PrimaryLogScrub(PrimaryLogPG* pg); + + void _scrub_finish() final; + + bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const final; + + void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) final; + + private: + // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object: + PrimaryLogPG* const m_pl_pg; + + /** + * Validate consistency of the object info and snap sets. + */ + void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final; + + void log_missing(int missing, + const std::optional& head, + LogChannelRef clog, + const spg_t& pgid, + const char* func, + bool allow_incomplete_clones); + + int process_clones_to(const std::optional& head, + const std::optional& snapset, + LogChannelRef clog, + const spg_t& pgid, + bool allow_incomplete_clones, + std::optional target, + std::vector::reverse_iterator* curclone, + inconsistent_snapset_wrapper& snap_error); + + + // handle our part in stats collection + object_stat_collection_t m_scrub_cstat; + void _scrub_clear_state() final; // which just clears the stats +}; diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc new file mode 100644 index 000000000..1468764c3 --- /dev/null +++ b/src/osd/ReplicatedBackend.cc @@ -0,0 +1,2425 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include "common/errno.h" +#include "ReplicatedBackend.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPushReply.h" +#include "common/EventTrace.h" +#include "include/random.h" +#include "include/util.h" +#include "OSD.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) { + return pgb->get_parent()->gen_dbg_prefix(*_dout); +} + +using std::list; +using std::make_pair; +using std::map; +using std::ostringstream; +using std::set; +using std::pair; +using std::string; +using std::unique_ptr; +using std::vector; + +using ceph::bufferhash; +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; + +namespace { +class PG_SendMessageOnConn: public Context { + PGBackend::Listener *pg; + Message *reply; + ConnectionRef conn; + public: + PG_SendMessageOnConn( + PGBackend::Listener *pg, + Message *reply, + ConnectionRef conn) : pg(pg), reply(reply), conn(conn) {} + void finish(int) override { + pg->send_message_osd_cluster(MessageRef(reply, false), conn.get()); + } +}; + +class PG_RecoveryQueueAsync : public Context { + PGBackend::Listener *pg; + unique_ptr> c; + public: + PG_RecoveryQueueAsync( + PGBackend::Listener *pg, + GenContext *c) : pg(pg), c(c) {} + void finish(int) override { + pg->schedule_recovery_work(c.release()); + } +}; +} + +struct ReplicatedBackend::C_OSD_RepModifyCommit : public Context { + ReplicatedBackend *pg; + RepModifyRef rm; + C_OSD_RepModifyCommit(ReplicatedBackend *pg, RepModifyRef r) + : pg(pg), rm(r) {} + void finish(int r) override { + pg->repop_commit(rm); + } +}; + +static void log_subop_stats( + PerfCounters *logger, + OpRequestRef op, int subop) +{ + utime_t latency = ceph_clock_now(); + latency -= op->get_req()->get_recv_stamp(); + + + logger->inc(l_osd_sop); + logger->tinc(l_osd_sop_lat, latency); + logger->inc(subop); + + if (subop != l_osd_sop_pull) { + uint64_t inb = op->get_req()->get_data().length(); + logger->inc(l_osd_sop_inb, inb); + if (subop == l_osd_sop_w) { + logger->inc(l_osd_sop_w_inb, inb); + logger->tinc(l_osd_sop_w_lat, latency); + } else if (subop == l_osd_sop_push) { + logger->inc(l_osd_sop_push_inb, inb); + logger->tinc(l_osd_sop_push_lat, latency); + } else + ceph_abort_msg("no support subop"); + } else { + logger->tinc(l_osd_sop_pull_lat, latency); + } +} + +ReplicatedBackend::ReplicatedBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &c, + ObjectStore *store, + CephContext *cct) : + PGBackend(cct, pg, store, coll, c) {} + +void ReplicatedBackend::run_recovery_op( + PGBackend::RecoveryHandle *_h, + int priority) +{ + RPGHandle *h = static_cast(_h); + send_pushes(priority, h->pushes); + send_pulls(priority, h->pulls); + send_recovery_deletes(priority, h->deletes); + delete h; +} + +int ReplicatedBackend::recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *_h + ) +{ + dout(10) << __func__ << ": " << hoid << dendl; + RPGHandle *h = static_cast(_h); + if (get_parent()->get_local_missing().is_missing(hoid)) { + ceph_assert(!obc); + // pull + prepare_pull( + v, + hoid, + head, + h); + } else { + ceph_assert(obc); + int started = start_pushes( + hoid, + obc, + h); + if (started < 0) { + 
pushing[hoid].clear(); + return started; + } + } + return 0; +} + +void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap) +{ + for(map >::iterator i = pull_from_peer.begin(); + i != pull_from_peer.end(); + ) { + if (osdmap->is_down(i->first.osd)) { + dout(10) << "check_recovery_sources resetting pulls from osd." << i->first + << ", osdmap has it marked down" << dendl; + for (set::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + get_parent()->cancel_pull(*j); + clear_pull(pulling.find(*j), false); + } + pull_from_peer.erase(i++); + } else { + ++i; + } + } +} + +bool ReplicatedBackend::can_handle_while_inactive(OpRequestRef op) +{ + dout(10) << __func__ << ": " << op << dendl; + switch (op->get_req()->get_type()) { + case MSG_OSD_PG_PULL: + return true; + default: + return false; + } +} + +bool ReplicatedBackend::_handle_message( + OpRequestRef op + ) +{ + dout(10) << __func__ << ": " << op << dendl; + switch (op->get_req()->get_type()) { + case MSG_OSD_PG_PUSH: + do_push(op); + return true; + + case MSG_OSD_PG_PULL: + do_pull(op); + return true; + + case MSG_OSD_PG_PUSH_REPLY: + do_push_reply(op); + return true; + + case MSG_OSD_REPOP: { + do_repop(op); + return true; + } + + case MSG_OSD_REPOPREPLY: { + do_repop_reply(op); + return true; + } + + default: + break; + } + return false; +} + +void ReplicatedBackend::clear_recovery_state() +{ + // clear pushing/pulling maps + for (auto &&i: pushing) { + for (auto &&j: i.second) { + get_parent()->release_locks(j.second.lock_manager); + } + } + pushing.clear(); + + for (auto &&i: pulling) { + get_parent()->release_locks(i.second.lock_manager); + } + pulling.clear(); + pull_from_peer.clear(); +} + +void ReplicatedBackend::on_change() +{ + dout(10) << __func__ << dendl; + for (auto& op : in_progress_ops) { + delete op.second->on_commit; + op.second->on_commit = nullptr; + } + in_progress_ops.clear(); + clear_recovery_state(); +} + +int ReplicatedBackend::objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) +{ + return store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags); +} + +int ReplicatedBackend::objects_readv_sync( + const hobject_t &hoid, + map&& m, + uint32_t op_flags, + bufferlist *bl) +{ + interval_set im(std::move(m)); + auto r = store->readv(ch, ghobject_t(hoid), im, *bl, op_flags); + if (r >= 0) { + m = std::move(im).detach(); + } + return r; +} + +void ReplicatedBackend::objects_read_async( + const hobject_t &hoid, + const list, + pair > > &to_read, + Context *on_complete, + bool fast_read) +{ + ceph_abort_msg("async read is not used by replica pool"); +} + +class C_OSD_OnOpCommit : public Context { + ReplicatedBackend *pg; + ceph::ref_t op; +public: + C_OSD_OnOpCommit(ReplicatedBackend *pg, ceph::ref_t op) + : pg(pg), op(std::move(op)) {} + void finish(int) override { + pg->op_commit(op); + } +}; + +void generate_transaction( + PGTransactionUPtr &pgt, + const coll_t &coll, + vector &log_entries, + ObjectStore::Transaction *t, + set *added, + set *removed, + const ceph_release_t require_osd_release = ceph_release_t::unknown ) +{ + ceph_assert(t); + ceph_assert(added); + ceph_assert(removed); + + for (auto &&le: log_entries) { + le.mark_unrollbackable(); + auto oiter = pgt->op_map.find(le.soid); + if (oiter != pgt->op_map.end() && oiter->second.updated_snaps) { + bufferlist bl(oiter->second.updated_snaps->second.size() * 8 + 8); + encode(oiter->second.updated_snaps->second, bl); + le.snaps.swap(bl); + 
le.snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + } + + pgt->safe_create_traverse( + [&](pair &obj_op) { + const hobject_t &oid = obj_op.first; + const ghobject_t goid = + ghobject_t(oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD); + const PGTransaction::ObjectOperation &op = obj_op.second; + + if (oid.is_temp()) { + if (op.is_fresh_object()) { + added->insert(oid); + } else if (op.is_delete()) { + removed->insert(oid); + } + } + + if (op.delete_first) { + t->remove(coll, goid); + } + + match( + op.init_type, + [&](const PGTransaction::ObjectOperation::Init::None &) { + }, + [&](const PGTransaction::ObjectOperation::Init::Create &op) { + if (require_osd_release >= ceph_release_t::octopus) { + t->create(coll, goid); + } else { + t->touch(coll, goid); + } + }, + [&](const PGTransaction::ObjectOperation::Init::Clone &op) { + t->clone( + coll, + ghobject_t( + op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + goid); + }, + [&](const PGTransaction::ObjectOperation::Init::Rename &op) { + ceph_assert(op.source.is_temp()); + t->collection_move_rename( + coll, + ghobject_t( + op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + coll, + goid); + }); + + if (op.truncate) { + t->truncate(coll, goid, op.truncate->first); + if (op.truncate->first != op.truncate->second) + t->truncate(coll, goid, op.truncate->second); + } + + if (!op.attr_updates.empty()) { + map attrs; + for (auto &&p: op.attr_updates) { + if (p.second) + attrs[p.first] = *(p.second); + else + t->rmattr(coll, goid, p.first); + } + t->setattrs(coll, goid, attrs); + } + + if (op.clear_omap) + t->omap_clear(coll, goid); + if (op.omap_header) + t->omap_setheader(coll, goid, *(op.omap_header)); + + for (auto &&up: op.omap_updates) { + using UpdateType = PGTransaction::ObjectOperation::OmapUpdateType; + switch (up.first) { + case UpdateType::Remove: + t->omap_rmkeys(coll, goid, up.second); + break; + case UpdateType::Insert: + t->omap_setkeys(coll, goid, up.second); + break; + case UpdateType::RemoveRange: + t->omap_rmkeyrange(coll, goid, up.second); + break; + } + } + + // updated_snaps doesn't matter since we marked unrollbackable + + if (op.alloc_hint) { + auto &hint = *(op.alloc_hint); + t->set_alloc_hint( + coll, + goid, + hint.expected_object_size, + hint.expected_write_size, + hint.flags); + } + + for (auto &&extent: op.buffer_updates) { + using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; + match( + extent.get_val(), + [&](const BufferUpdate::Write &op) { + t->write( + coll, + goid, + extent.get_off(), + extent.get_len(), + op.buffer, + op.fadvise_flags); + }, + [&](const BufferUpdate::Zero &op) { + t->zero( + coll, + goid, + extent.get_off(), + extent.get_len()); + }, + [&](const BufferUpdate::CloneRange &op) { + ceph_assert(op.len == extent.get_len()); + t->clone_range( + coll, + ghobject_t(op.from, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + goid, + op.offset, + extent.get_len(), + extent.get_off()); + }); + } + }); +} + +void ReplicatedBackend::submit_transaction( + const hobject_t &soid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&_t, + const eversion_t &trim_to, + const eversion_t &min_last_complete_ondisk, + vector&& _log_entries, + std::optional &hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef orig_op) +{ + parent->apply_stats( + soid, + delta_stats); + + vector log_entries(_log_entries); + ObjectStore::Transaction op_t; + PGTransactionUPtr t(std::move(_t)); + set added, removed; + 
generate_transaction( + t, + coll, + log_entries, + &op_t, + &added, + &removed, + get_osdmap()->require_osd_release); + ceph_assert(added.size() <= 1); + ceph_assert(removed.size() <= 1); + + auto insert_res = in_progress_ops.insert( + make_pair( + tid, + ceph::make_ref( + tid, on_all_commit, + orig_op, at_version) + ) + ); + ceph_assert(insert_res.second); + InProgressOp &op = *insert_res.first->second; + +#ifdef HAVE_JAEGER + auto rep_sub_trans = jaeger_tracing::child_span("ReplicatedBackend::submit_transaction", orig_op->osd_parent_span); +#endif + op.waiting_for_commit.insert( + parent->get_acting_recovery_backfill_shards().begin(), + parent->get_acting_recovery_backfill_shards().end()); + + issue_op( + soid, + at_version, + tid, + reqid, + trim_to, + min_last_complete_ondisk, + added.size() ? *(added.begin()) : hobject_t(), + removed.size() ? *(removed.begin()) : hobject_t(), + log_entries, + hset_history, + &op, + op_t); + + add_temp_objs(added); + clear_temp_objs(removed); + + parent->log_operation( + std::move(log_entries), + hset_history, + trim_to, + at_version, + min_last_complete_ondisk, + true, + op_t); + + op_t.register_on_commit( + parent->bless_context( + new C_OSD_OnOpCommit(this, &op))); + + vector tls; + tls.push_back(std::move(op_t)); + + parent->queue_transactions(tls, op.op); + if (at_version != eversion_t()) { + parent->op_applied(at_version); + } +} + +void ReplicatedBackend::op_commit(const ceph::ref_t& op) +{ + if (op->on_commit == nullptr) { + // aborted + return; + } + + FUNCTRACE(cct); + OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_COMMIT_BEGIN", true); + dout(10) << __func__ << ": " << op->tid << dendl; + if (op->op) { + op->op->mark_event("op_commit"); + op->op->pg_trace.event("op commit"); + } + + op->waiting_for_commit.erase(get_parent()->whoami_shard()); + + if (op->waiting_for_commit.empty()) { + op->on_commit->complete(0); + op->on_commit = 0; + in_progress_ops.erase(op->tid); + } +} + +void ReplicatedBackend::do_repop_reply(OpRequestRef op) +{ + static_cast(op->get_nonconst_req())->finish_decode(); + auto r = op->get_req(); + ceph_assert(r->get_header().type == MSG_OSD_REPOPREPLY); + + op->mark_started(); + + // must be replication. + ceph_tid_t rep_tid = r->get_tid(); + pg_shard_t from = r->from; + + auto iter = in_progress_ops.find(rep_tid); + if (iter != in_progress_ops.end()) { + InProgressOp &ip_op = *iter->second; + const MOSDOp *m = nullptr; + if (ip_op.op) + m = ip_op.op->get_req(); + + if (m) + dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m + << " ack_type " << (int)r->ack_type + << " from " << from + << dendl; + else + dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) " + << " ack_type " << (int)r->ack_type + << " from " << from + << dendl; + + // oh, good. 
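+    // an ONDISK ack clears this shard from waiting_for_commit; once every
+    // shard has acked, the stored on_commit callback fires and the op is
+    // removed from in_progress_ops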
+ + if (r->ack_type & CEPH_OSD_FLAG_ONDISK) { + ceph_assert(ip_op.waiting_for_commit.count(from)); + ip_op.waiting_for_commit.erase(from); + if (ip_op.op) { + ip_op.op->mark_event("sub_op_commit_rec"); + ip_op.op->pg_trace.event("sub_op_commit_rec"); + } + } else { + // legacy peer; ignore + } + + parent->update_peer_last_complete_ondisk( + from, + r->get_last_complete_ondisk()); + + if (ip_op.waiting_for_commit.empty() && + ip_op.on_commit) { + ip_op.on_commit->complete(0); + ip_op.on_commit = 0; + in_progress_ops.erase(iter); + } + } +} + +int ReplicatedBackend::be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) +{ + dout(10) << __func__ << " " << poid << " pos " << pos << dendl; + int r; + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE; + + utime_t sleeptime; + sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep); + if (sleeptime != utime_t()) { + lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl; + sleeptime.sleep(); + } + + ceph_assert(poid == pos.ls[pos.pos]); + if (!pos.data_done()) { + if (pos.data_pos == 0) { + pos.data_hash = bufferhash(-1); + } + + bufferlist bl; + r = store->read( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + pos.data_pos, + cct->_conf->osd_deep_scrub_stride, bl, + fadvise_flags); + if (r < 0) { + dout(20) << __func__ << " " << poid << " got " + << r << " on read, read_error" << dendl; + o.read_error = true; + return 0; + } + if (r > 0) { + pos.data_hash << bl; + } + pos.data_pos += r; + if (r == cct->_conf->osd_deep_scrub_stride) { + dout(20) << __func__ << " " << poid << " more data, digest so far 0x" + << std::hex << pos.data_hash.digest() << std::dec << dendl; + return -EINPROGRESS; + } + // done with bytes + pos.data_pos = -1; + o.digest = pos.data_hash.digest(); + o.digest_present = true; + dout(20) << __func__ << " " << poid << " done with data, digest 0x" + << std::hex << o.digest << std::dec << dendl; + } + + // omap header + if (pos.omap_pos.empty()) { + pos.omap_hash = bufferhash(-1); + + bufferlist hdrbl; + r = store->omap_get_header( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + &hdrbl, true); + if (r == -EIO) { + dout(20) << __func__ << " " << poid << " got " + << r << " on omap header read, read_error" << dendl; + o.read_error = true; + return 0; + } + if (r == 0 && hdrbl.length()) { + bool encoded = false; + dout(25) << "CRC header " << cleanbin(hdrbl, encoded, true) << dendl; + pos.omap_hash << hdrbl; + } + } + + // omap + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + ceph_assert(iter); + if (pos.omap_pos.length()) { + iter->lower_bound(pos.omap_pos); + } else { + iter->seek_to_first(); + } + int max = g_conf()->osd_deep_scrub_keys; + while (iter->status() == 0 && iter->valid()) { + pos.omap_bytes += iter->value().length(); + ++pos.omap_keys; + --max; + // fixme: we can do this more efficiently. 
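+    // each key/value pair is re-encoded and folded into the running omap
+    // digest; after osd_deep_scrub_keys entries the current key is saved in
+    // pos.omap_pos and -EINPROGRESS tells the caller to resume later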
+ bufferlist bl; + encode(iter->key(), bl); + encode(iter->value(), bl); + pos.omap_hash << bl; + + iter->next(); + + if (iter->valid() && max == 0) { + pos.omap_pos = iter->key(); + return -EINPROGRESS; + } + if (iter->status() < 0) { + dout(25) << __func__ << " " << poid + << " on omap scan, db status error" << dendl; + o.read_error = true; + return 0; + } + } + + if (pos.omap_keys > cct->_conf-> + osd_deep_scrub_large_omap_object_key_threshold || + pos.omap_bytes > cct->_conf-> + osd_deep_scrub_large_omap_object_value_sum_threshold) { + dout(25) << __func__ << " " << poid + << " large omap object detected. Object has " << pos.omap_keys + << " keys and size " << pos.omap_bytes << " bytes" << dendl; + o.large_omap_object_found = true; + o.large_omap_object_key_count = pos.omap_keys; + o.large_omap_object_value_size = pos.omap_bytes; + map.has_large_omap_object_errors = true; + } + + o.omap_digest = pos.omap_hash.digest(); + o.omap_digest_present = true; + dout(20) << __func__ << " done with " << poid << " omap_digest " + << std::hex << o.omap_digest << std::dec << dendl; + + // Sum up omap usage + if (pos.omap_keys > 0 || pos.omap_bytes > 0) { + dout(25) << __func__ << " adding " << pos.omap_keys << " keys and " + << pos.omap_bytes << " bytes to pg_stats sums" << dendl; + map.has_omap_keys = true; + o.object_omap_bytes = pos.omap_bytes; + o.object_omap_keys = pos.omap_keys; + } + + // done! + return 0; +} + +void ReplicatedBackend::_do_push(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_PUSH); + pg_shard_t from = m->from; + + op->mark_started(); + + vector replies; + ObjectStore::Transaction t; + if (get_parent()->check_failsafe_full()) { + dout(10) << __func__ << " Out of space (failsafe) processing push request." 
<< dendl; + ceph_abort(); + } + for (vector::const_iterator i = m->pushes.begin(); + i != m->pushes.end(); + ++i) { + replies.push_back(PushReplyOp()); + handle_push(from, *i, &(replies.back()), &t, m->is_repair); + } + + MOSDPGPushReply *reply = new MOSDPGPushReply; + reply->from = get_parent()->whoami_shard(); + reply->set_priority(m->get_priority()); + reply->pgid = get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->replies.swap(replies); + reply->compute_cost(cct); + + t.register_on_complete( + new PG_SendMessageOnConn( + get_parent(), reply, m->get_connection())); + + get_parent()->queue_transaction(std::move(t)); +} + +struct C_ReplicatedBackend_OnPullComplete : GenContext { + ReplicatedBackend *bc; + list to_continue; + int priority; + C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority) + : bc(bc), priority(priority) {} + + void finish(ThreadPool::TPHandle &handle) override { + ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op(); + for (auto &&i: to_continue) { + auto j = bc->pulling.find(i.hoid); + ceph_assert(j != bc->pulling.end()); + ObjectContextRef obc = j->second.obc; + bc->clear_pull(j, false /* already did it */); + int started = bc->start_pushes(i.hoid, obc, h); + if (started < 0) { + bc->pushing[i.hoid].clear(); + bc->get_parent()->on_failed_pull( + { bc->get_parent()->whoami_shard() }, + i.hoid, obc->obs.oi.version); + } else if (!started) { + bc->get_parent()->on_global_recover( + i.hoid, i.stat, false); + } + handle.reset_tp_timeout(); + } + bc->run_recovery_op(h, priority); + } +}; + +void ReplicatedBackend::_do_pull_response(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_PUSH); + pg_shard_t from = m->from; + + op->mark_started(); + + vector replies(1); + if (get_parent()->check_failsafe_full()) { + dout(10) << __func__ << " Out of space (failsafe) processing pull response (push)." 
<< dendl; + ceph_abort(); + } + + ObjectStore::Transaction t; + list to_continue; + for (vector::const_iterator i = m->pushes.begin(); + i != m->pushes.end(); + ++i) { + bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, &t); + if (more) + replies.push_back(PullOp()); + } + if (!to_continue.empty()) { + C_ReplicatedBackend_OnPullComplete *c = + new C_ReplicatedBackend_OnPullComplete( + this, + m->get_priority()); + c->to_continue.swap(to_continue); + t.register_on_complete( + new PG_RecoveryQueueAsync( + get_parent(), + get_parent()->bless_unlocked_gencontext(c))); + } + replies.erase(replies.end() - 1); + + if (replies.size()) { + MOSDPGPull *reply = new MOSDPGPull; + reply->from = parent->whoami_shard(); + reply->set_priority(m->get_priority()); + reply->pgid = get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->set_pulls(std::move(replies)); + reply->compute_cost(cct); + + t.register_on_complete( + new PG_SendMessageOnConn( + get_parent(), reply, m->get_connection())); + } + + get_parent()->queue_transaction(std::move(t)); +} + +void ReplicatedBackend::do_pull(OpRequestRef op) +{ + MOSDPGPull *m = static_cast(op->get_nonconst_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_PULL); + pg_shard_t from = m->from; + + map > replies; + for (auto& i : m->take_pulls()) { + replies[from].push_back(PushOp()); + handle_pull(from, i, &(replies[from].back())); + } + send_pushes(m->get_priority(), replies); +} + +void ReplicatedBackend::do_push_reply(OpRequestRef op) +{ + auto m = op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_PG_PUSH_REPLY); + pg_shard_t from = m->from; + + vector replies(1); + for (vector::const_iterator i = m->replies.begin(); + i != m->replies.end(); + ++i) { + bool more = handle_push_reply(from, *i, &(replies.back())); + if (more) + replies.push_back(PushOp()); + } + replies.erase(replies.end() - 1); + + map > _replies; + _replies[from].swap(replies); + send_pushes(m->get_priority(), _replies); +} + +Message * ReplicatedBackend::generate_subop( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t min_last_complete_ondisk, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const bufferlist &log_entries, + std::optional &hset_hist, + ObjectStore::Transaction &op_t, + pg_shard_t peer, + const pg_info_t &pinfo) +{ + int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + // forward the write/update/whatever + MOSDRepOp *wr = new MOSDRepOp( + reqid, parent->whoami_shard(), + spg_t(get_info().pgid.pgid, peer.shard), + soid, acks_wanted, + get_osdmap_epoch(), + parent->get_last_peering_reset_epoch(), + tid, at_version); + + // ship resulting transaction, log entries, and pg_stats + if (!parent->should_send_op(peer, soid)) { + ObjectStore::Transaction t; + encode(t, wr->get_data()); + } else { + encode(op_t, wr->get_data()); + wr->get_header().data_off = op_t.get_data_alignment(); + } + + wr->logbl = log_entries; + + if (pinfo.is_incomplete()) + wr->pg_stats = pinfo.stats; // reflects backfill progress + else + wr->pg_stats = get_info().stats; + + wr->pg_trim_to = pg_trim_to; + + if (HAVE_FEATURE(parent->min_peer_features(), OSD_REPOP_MLCOD)) { + wr->min_last_complete_ondisk = min_last_complete_ondisk; + } else { + /* Some replicas need this field to be at_version. 
New replicas + * will ignore it */ + wr->set_rollback_to(at_version); + } + + wr->new_temp_oid = new_temp_oid; + wr->discard_temp_oid = discard_temp_oid; + wr->updated_hit_set_history = hset_hist; + return wr; +} + +void ReplicatedBackend::issue_op( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t min_last_complete_ondisk, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const vector &log_entries, + std::optional &hset_hist, + InProgressOp *op, + ObjectStore::Transaction &op_t) +{ + if (parent->get_acting_recovery_backfill_shards().size() > 1) { + if (op->op) { + op->op->pg_trace.event("issue replication ops"); + ostringstream ss; + set replicas = parent->get_acting_recovery_backfill_shards(); + replicas.erase(parent->whoami_shard()); + ss << "waiting for subops from " << replicas; + op->op->mark_sub_op_sent(ss.str()); + } + + // avoid doing the same work in generate_subop + bufferlist logs; + encode(log_entries, logs); + + for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { + if (shard == parent->whoami_shard()) continue; + const pg_info_t &pinfo = parent->get_shard_info().find(shard)->second; + + Message *wr; + wr = generate_subop( + soid, + at_version, + tid, + reqid, + pg_trim_to, + min_last_complete_ondisk, + new_temp_oid, + discard_temp_oid, + logs, + hset_hist, + op_t, + shard, + pinfo); + if (op->op && op->op->pg_trace) + wr->trace.init("replicated op", nullptr, &op->op->pg_trace); + get_parent()->send_message_osd_cluster( + shard.osd, wr, get_osdmap_epoch()); + } + } +} + +// sub op modify +void ReplicatedBackend::do_repop(OpRequestRef op) +{ + static_cast(op->get_nonconst_req())->finish_decode(); + auto m = op->get_req(); + int msg_type = m->get_type(); + ceph_assert(MSG_OSD_REPOP == msg_type); + + const hobject_t& soid = m->poid; + + dout(10) << __func__ << " " << soid + << " v " << m->version + << (m->logbl.length() ? 
" (transaction)" : " (parallel exec") + << " " << m->logbl.length() + << dendl; + +#ifdef HAVE_JAEGER + auto do_repop_span = jaeger_tracing::child_span(__func__, op->osd_parent_span); +#endif + + // sanity checks + ceph_assert(m->map_epoch >= get_info().history.same_interval_since); + + dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl; + parent->maybe_preempt_replica_scrub(soid); + + int ackerosd = m->get_source().num(); + + op->mark_started(); + + RepModifyRef rm(std::make_shared()); + rm->op = op; + rm->ackerosd = ackerosd; + rm->last_complete = get_info().last_complete; + rm->epoch_started = get_osdmap_epoch(); + + ceph_assert(m->logbl.length()); + // shipped transaction and log entries + vector log; + + auto p = const_cast(m->get_data()).cbegin(); + decode(rm->opt, p); + + if (m->new_temp_oid != hobject_t()) { + dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl; + add_temp_obj(m->new_temp_oid); + } + if (m->discard_temp_oid != hobject_t()) { + dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl; + if (rm->opt.empty()) { + dout(10) << __func__ << ": removing object " << m->discard_temp_oid + << " since we won't get the transaction" << dendl; + rm->localt.remove(coll, ghobject_t(m->discard_temp_oid)); + } + clear_temp_obj(m->discard_temp_oid); + } + + p = const_cast(m->logbl).begin(); + decode(log, p); + rm->opt.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + + bool update_snaps = false; + if (!rm->opt.empty()) { + // If the opt is non-empty, we infer we are before + // last_backfill (according to the primary, not our + // not-quite-accurate value), and should update the + // collections now. Otherwise, we do it later on push. + update_snaps = true; + } + + // flag set to true during async recovery + bool async = false; + pg_missing_tracker_t pmissing = get_parent()->get_local_missing(); + if (pmissing.is_missing(soid)) { + async = true; + dout(30) << __func__ << " is_missing " << pmissing.is_missing(soid) << dendl; + for (auto &&e: log) { + dout(30) << " add_next_event entry " << e << dendl; + get_parent()->add_local_next_event(e); + dout(30) << " entry is_delete " << e.is_delete() << dendl; + } + } + + parent->update_stats(m->pg_stats); + parent->log_operation( + std::move(log), + m->updated_hit_set_history, + m->pg_trim_to, + m->version, /* Replicated PGs don't have rollback info */ + m->min_last_complete_ondisk, + update_snaps, + rm->localt, + async); + + rm->opt.register_on_commit( + parent->bless_context( + new C_OSD_RepModifyCommit(this, rm))); + vector tls; + tls.reserve(2); + tls.push_back(std::move(rm->localt)); + tls.push_back(std::move(rm->opt)); + parent->queue_transactions(tls, op); + // op is cleaned up by oncommit/onapply when both are executed + dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl; +} + +void ReplicatedBackend::repop_commit(RepModifyRef rm) +{ + rm->op->mark_commit_sent(); + rm->op->pg_trace.event("sup_op_commit"); + rm->committed = true; + + // send commit. + auto m = rm->op->get_req(); + ceph_assert(m->get_type() == MSG_OSD_REPOP); + dout(10) << __func__ << " on op " << *m + << ", sending commit to osd." 
<< rm->ackerosd + << dendl; + ceph_assert(get_osdmap()->is_up(rm->ackerosd)); + + get_parent()->update_last_complete_ondisk(rm->last_complete); + + MOSDRepOpReply *reply = new MOSDRepOpReply( + m, + get_parent()->whoami_shard(), + 0, get_osdmap_epoch(), m->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); + reply->set_last_complete_ondisk(rm->last_complete); + reply->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority! + reply->trace = rm->op->pg_trace; + get_parent()->send_message_osd_cluster( + rm->ackerosd, reply, get_osdmap_epoch()); + + log_subop_stats(get_parent()->get_logger(), rm->op, l_osd_sop_w); +} + + +// =========================================================== + +void ReplicatedBackend::calc_head_subsets( + ObjectContextRef obc, SnapSet& snapset, const hobject_t& head, + const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set& data_subset, + map>& clone_subsets, + ObcLockManager &manager) +{ + dout(10) << "calc_head_subsets " << head + << " clone_overlap " << snapset.clone_overlap << dendl; + + uint64_t size = obc->obs.oi.size; + if (size) + data_subset.insert(0, size); + + if (HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS)) { + const auto it = missing.get_items().find(head); + assert(it != missing.get_items().end()); + data_subset.intersection_of(it->second.clean_regions.get_dirty_regions()); + dout(10) << "calc_head_subsets " << head + << " data_subset " << data_subset << dendl; + } + + if (get_parent()->get_pool().allow_incomplete_clones()) { + dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl; + return; + } + + if (!cct->_conf->osd_recover_clone_overlap) { + dout(10) << "calc_head_subsets " << head << " -- osd_recover_clone_overlap disabled" << dendl; + return; + } + + + interval_set cloning; + interval_set prev; + hobject_t c = head; + if (size) + prev.insert(0, size); + + for (int j=snapset.clones.size()-1; j>=0; j--) { + c.snap = snapset.clones[j]; + prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); + if (!missing.is_missing(c) && + c < last_backfill && + get_parent()->try_lock_for_read(c, manager)) { + dout(10) << "calc_head_subsets " << head << " has prev " << c + << " overlap " << prev << dendl; + cloning = prev; + break; + } + dout(10) << "calc_head_subsets " << head << " does not have prev " << c + << " overlap " << prev << dendl; + } + + cloning.intersection_of(data_subset); + if (cloning.empty()) { + dout(10) << "skipping clone, nothing needs to clone" << dendl; + return; + } + + if (cloning.num_intervals() > g_conf().get_val("osd_recover_clone_overlap_limit")) { + dout(10) << "skipping clone, too many holes" << dendl; + get_parent()->release_locks(manager); + clone_subsets.clear(); + cloning.clear(); + return; + } + + // what's left for us to push? 
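+  // the overlap found above can be cloned from clone c on the replica;
+  // drop it from data_subset so only the remaining extents are pushed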
+ clone_subsets[c] = cloning; + data_subset.subtract(cloning); + + dout(10) << "calc_head_subsets " << head + << " data_subset " << data_subset + << " clone_subsets " << clone_subsets << dendl; +} + +void ReplicatedBackend::calc_clone_subsets( + SnapSet& snapset, const hobject_t& soid, + const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set& data_subset, + map>& clone_subsets, + ObcLockManager &manager) +{ + dout(10) << "calc_clone_subsets " << soid + << " clone_overlap " << snapset.clone_overlap << dendl; + + uint64_t size = snapset.clone_size[soid.snap]; + if (size) + data_subset.insert(0, size); + + if (get_parent()->get_pool().allow_incomplete_clones()) { + dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl; + return; + } + + if (!cct->_conf->osd_recover_clone_overlap) { + dout(10) << "calc_clone_subsets " << soid << " -- osd_recover_clone_overlap disabled" << dendl; + return; + } + + unsigned i; + for (i=0; i < snapset.clones.size(); i++) + if (snapset.clones[i] == soid.snap) + break; + + // any overlap with next older clone? + interval_set cloning; + interval_set prev; + if (size) + prev.insert(0, size); + for (int j=i-1; j>=0; j--) { + hobject_t c = soid; + c.snap = snapset.clones[j]; + prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); + if (!missing.is_missing(c) && + c < last_backfill && + get_parent()->try_lock_for_read(c, manager)) { + dout(10) << "calc_clone_subsets " << soid << " has prev " << c + << " overlap " << prev << dendl; + clone_subsets[c] = prev; + cloning.union_of(prev); + break; + } + dout(10) << "calc_clone_subsets " << soid << " does not have prev " << c + << " overlap " << prev << dendl; + } + + // overlap with next newest? + interval_set next; + if (size) + next.insert(0, size); + for (unsigned j=i+1; jtry_lock_for_read(c, manager)) { + dout(10) << "calc_clone_subsets " << soid << " has next " << c + << " overlap " << next << dendl; + clone_subsets[c] = next; + cloning.union_of(next); + break; + } + dout(10) << "calc_clone_subsets " << soid << " does not have next " << c + << " overlap " << next << dendl; + } + + if (cloning.num_intervals() > g_conf().get_val("osd_recover_clone_overlap_limit")) { + dout(10) << "skipping clone, too many holes" << dendl; + get_parent()->release_locks(manager); + clone_subsets.clear(); + cloning.clear(); + } + + + // what's left for us to push? 
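+  // extents already attributed to the neighbouring clones above can be
+  // cloned locally on the replica, so only the remainder needs to travel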
+ data_subset.subtract(cloning); + + dout(10) << "calc_clone_subsets " << soid + << " data_subset " << data_subset + << " clone_subsets " << clone_subsets << dendl; +} + +void ReplicatedBackend::prepare_pull( + eversion_t v, + const hobject_t& soid, + ObjectContextRef headctx, + RPGHandle *h) +{ + const auto missing_iter = get_parent()->get_local_missing().get_items().find(soid); + ceph_assert(missing_iter != get_parent()->get_local_missing().get_items().end()); + eversion_t _v = missing_iter->second.need; + ceph_assert(_v == v); + const map> &missing_loc( + get_parent()->get_missing_loc_shards()); + const map &peer_missing( + get_parent()->get_shard_missing()); + map>::const_iterator q = missing_loc.find(soid); + ceph_assert(q != missing_loc.end()); + ceph_assert(!q->second.empty()); + + // pick a pullee + auto p = q->second.end(); + if (cct->_conf->osd_debug_feed_pullee >= 0) { + for (auto it = q->second.begin(); it != q->second.end(); it++) { + if (it->osd == cct->_conf->osd_debug_feed_pullee) { + p = it; + break; + } + } + } + if (p == q->second.end()) { + // probably because user feed a wrong pullee + p = q->second.begin(); + std::advance(p, + ceph::util::generate_random_number(0, + q->second.size() - 1)); + } + ceph_assert(get_osdmap()->is_up(p->osd)); + pg_shard_t fromshard = *p; + + dout(7) << "pull " << soid + << " v " << v + << " on osds " << q->second + << " from osd." << fromshard + << dendl; + + ceph_assert(peer_missing.count(fromshard)); + const pg_missing_t &pmissing = peer_missing.find(fromshard)->second; + if (pmissing.is_missing(soid, v)) { + ceph_assert(pmissing.get_items().find(soid)->second.have != v); + dout(10) << "pulling soid " << soid << " from osd " << fromshard + << " at version " << pmissing.get_items().find(soid)->second.have + << " rather than at version " << v << dendl; + v = pmissing.get_items().find(soid)->second.have; + ceph_assert(get_parent()->get_log().get_log().objects.count(soid) && + (get_parent()->get_log().get_log().objects.find(soid)->second->op == + pg_log_entry_t::LOST_REVERT) && + (get_parent()->get_log().get_log().objects.find( + soid)->second->reverting_to == + v)); + } + + ObjectRecoveryInfo recovery_info; + ObcLockManager lock_manager; + + if (soid.is_snap()) { + ceph_assert(!get_parent()->get_local_missing().is_missing(soid.get_head())); + ceph_assert(headctx); + // check snapset + SnapSetContext *ssc = headctx->ssc; + ceph_assert(ssc); + dout(10) << " snapset " << ssc->snapset << dendl; + recovery_info.ss = ssc->snapset; + calc_clone_subsets( + ssc->snapset, soid, get_parent()->get_local_missing(), + get_info().last_backfill, + recovery_info.copy_subset, + recovery_info.clone_subset, + lock_manager); + // FIXME: this may overestimate if we are pulling multiple clones in parallel... + dout(10) << " pulling " << recovery_info << dendl; + + ceph_assert(ssc->snapset.clone_size.count(soid.snap)); + recovery_info.size = ssc->snapset.clone_size[soid.snap]; + recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist(); + } else { + // pulling head or unversioned object. + // always pull the whole thing. 
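+    // start from the full (0, UINT64_MAX) range; when all peers have the
+    // octopus feature it is narrowed to the dirty regions recorded in the
+    // missing entry, so only modified extents are pulled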
+ recovery_info.copy_subset.insert(0, (uint64_t)-1); + if (HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS)) + recovery_info.copy_subset.intersection_of(missing_iter->second.clean_regions.get_dirty_regions()); + recovery_info.size = ((uint64_t)-1); + recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist(); + } + + h->pulls[fromshard].push_back(PullOp()); + PullOp &op = h->pulls[fromshard].back(); + op.soid = soid; + + op.recovery_info = recovery_info; + op.recovery_info.soid = soid; + op.recovery_info.version = v; + op.recovery_progress.data_complete = false; + op.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty() + && HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS); + op.recovery_progress.data_recovered_to = 0; + op.recovery_progress.first = true; + + ceph_assert(!pulling.count(soid)); + pull_from_peer[fromshard].insert(soid); + PullInfo &pi = pulling[soid]; + pi.from = fromshard; + pi.soid = soid; + pi.head_ctx = headctx; + pi.recovery_info = op.recovery_info; + pi.recovery_progress = op.recovery_progress; + pi.cache_dont_need = h->cache_dont_need; + pi.lock_manager = std::move(lock_manager); +} + +/* + * intelligently push an object to a replica. make use of existing + * clones/heads and dup data ranges where possible. + */ +int ReplicatedBackend::prep_push_to_replica( + ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, + PushOp *pop, bool cache_dont_need) +{ + const object_info_t& oi = obc->obs.oi; + uint64_t size = obc->obs.oi.size; + + dout(10) << __func__ << ": " << soid << " v" << oi.version + << " size " << size << " to osd." << peer << dendl; + + map> clone_subsets; + interval_set data_subset; + + ObcLockManager lock_manager; + // are we doing a clone on the replica? + if (soid.snap && soid.snap < CEPH_NOSNAP) { + hobject_t head = soid; + head.snap = CEPH_NOSNAP; + + // try to base push off of clones that succeed/preceed poid + // we need the head (and current SnapSet) locally to do that. + if (get_parent()->get_local_missing().is_missing(head)) { + dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl; + return prep_push(obc, soid, peer, pop, cache_dont_need); + } + + SnapSetContext *ssc = obc->ssc; + ceph_assert(ssc); + dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; + pop->recovery_info.ss = ssc->snapset; + map::const_iterator pm = + get_parent()->get_shard_missing().find(peer); + ceph_assert(pm != get_parent()->get_shard_missing().end()); + map::const_iterator pi = + get_parent()->get_shard_info().find(peer); + ceph_assert(pi != get_parent()->get_shard_info().end()); + calc_clone_subsets( + ssc->snapset, soid, + pm->second, + pi->second.last_backfill, + data_subset, clone_subsets, + lock_manager); + } else if (soid.snap == CEPH_NOSNAP) { + // pushing head or unversioned object. + // base this on partially on replica's clones? 
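+    // calc_head_subsets consults the peer's missing set and last_backfill to
+    // find clone data already present on the replica that can be cloned there
+    // instead of being pushed over the wire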
+ SnapSetContext *ssc = obc->ssc; + ceph_assert(ssc); + dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; + calc_head_subsets( + obc, + ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second, + get_parent()->get_shard_info().find(peer)->second.last_backfill, + data_subset, clone_subsets, + lock_manager); + } + + return prep_push( + obc, + soid, + peer, + oi.version, + data_subset, + clone_subsets, + pop, + cache_dont_need, + std::move(lock_manager)); +} + +int ReplicatedBackend::prep_push(ObjectContextRef obc, + const hobject_t& soid, pg_shard_t peer, + PushOp *pop, bool cache_dont_need) +{ + interval_set data_subset; + if (obc->obs.oi.size) + data_subset.insert(0, obc->obs.oi.size); + map> clone_subsets; + + return prep_push(obc, soid, peer, + obc->obs.oi.version, data_subset, clone_subsets, + pop, cache_dont_need, ObcLockManager()); +} + +int ReplicatedBackend::prep_push( + ObjectContextRef obc, + const hobject_t& soid, pg_shard_t peer, + eversion_t version, + interval_set &data_subset, + map>& clone_subsets, + PushOp *pop, + bool cache_dont_need, + ObcLockManager &&lock_manager) +{ + get_parent()->begin_peer_recover(peer, soid); + const auto pmissing_iter = get_parent()->get_shard_missing().find(peer); + const auto missing_iter = pmissing_iter->second.get_items().find(soid); + assert(missing_iter != pmissing_iter->second.get_items().end()); + // take note. + PushInfo &pi = pushing[soid][peer]; + pi.obc = obc; + pi.recovery_info.size = obc->obs.oi.size; + pi.recovery_info.copy_subset = data_subset; + pi.recovery_info.clone_subset = clone_subsets; + pi.recovery_info.soid = soid; + pi.recovery_info.oi = obc->obs.oi; + pi.recovery_info.ss = pop->recovery_info.ss; + pi.recovery_info.version = version; + pi.recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist(); + pi.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty() && + HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS); + pi.lock_manager = std::move(lock_manager); + + ObjectRecoveryProgress new_progress; + int r = build_push_op(pi.recovery_info, + pi.recovery_progress, + &new_progress, + pop, + &(pi.stat), cache_dont_need); + if (r < 0) + return r; + pi.recovery_progress = new_progress; + return 0; +} + +void ReplicatedBackend::submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + bool cache_dont_need, + interval_set &data_zeros, + const interval_set &intervals_included, + bufferlist data_included, + bufferlist omap_header, + const map &attrs, + const map &omap_entries, + ObjectStore::Transaction *t) +{ + hobject_t target_oid; + if (first && complete) { + target_oid = recovery_info.soid; + } else { + target_oid = get_parent()->get_temp_recovery_object(recovery_info.soid, + recovery_info.version); + if (first) { + dout(10) << __func__ << ": Adding oid " + << target_oid << " in the temp collection" << dendl; + add_temp_obj(target_oid); + } + } + + if (first) { + if (!complete) { + t->remove(coll, ghobject_t(target_oid)); + t->touch(coll, ghobject_t(target_oid)); + bufferlist bv = attrs.at(OI_ATTR); + object_info_t oi(bv); + t->set_alloc_hint(coll, ghobject_t(target_oid), + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } else { + if (!recovery_info.object_exist) { + t->remove(coll, ghobject_t(target_oid)); + t->touch(coll, ghobject_t(target_oid)); + bufferlist bv = attrs.at(OI_ATTR); + object_info_t oi(bv); + t->set_alloc_hint(coll, 
ghobject_t(target_oid), + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } + //remove xattr and update later if overwrite on original object + t->rmattrs(coll, ghobject_t(target_oid)); + //if need update omap, clear the previous content first + if (clear_omap) + t->omap_clear(coll, ghobject_t(target_oid)); + } + + t->truncate(coll, ghobject_t(target_oid), recovery_info.size); + if (omap_header.length()) + t->omap_setheader(coll, ghobject_t(target_oid), omap_header); + + struct stat st; + int r = store->stat(ch, ghobject_t(recovery_info.soid), &st); + if (get_parent()->pg_is_remote_backfilling()) { + uint64_t size = 0; + if (r == 0) + size = st.st_size; + // Don't need to do anything if object is still the same size + if (size != recovery_info.oi.size) { + get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size); + get_parent()->pg_add_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size); + dout(10) << __func__ << " " << recovery_info.soid + << " backfill size " << recovery_info.oi.size + << " previous size " << size + << " net size " << recovery_info.oi.size - size + << dendl; + } + } + if (!complete) { + //clone overlap content in local object + if (recovery_info.object_exist) { + assert(r == 0); + uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size); + interval_set local_intervals_included, local_intervals_excluded; + if (local_size) { + local_intervals_included.insert(0, local_size); + local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset); + local_intervals_included.subtract(local_intervals_excluded); + } + for (interval_set::const_iterator q = local_intervals_included.begin(); + q != local_intervals_included.end(); + ++q) { + dout(15) << " clone_range " << recovery_info.soid << " " + << q.get_start() << "~" << q.get_len() << dendl; + t->clone_range(coll, ghobject_t(recovery_info.soid), ghobject_t(target_oid), + q.get_start(), q.get_len(), q.get_start()); + } + } + } + } + uint64_t off = 0; + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL; + if (cache_dont_need) + fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + // Punch zeros for data, if fiemap indicates nothing but it is marked dirty + if (data_zeros.size() > 0) { + data_zeros.intersection_of(recovery_info.copy_subset); + assert(intervals_included.subset_of(data_zeros)); + data_zeros.subtract(intervals_included); + + dout(20) << __func__ <<" recovering object " << recovery_info.soid + << " copy_subset: " << recovery_info.copy_subset + << " intervals_included: " << intervals_included + << " data_zeros: " << data_zeros << dendl; + + for (auto p = data_zeros.begin(); p != data_zeros.end(); ++p) + t->zero(coll, ghobject_t(target_oid), p.get_start(), p.get_len()); + } + for (interval_set::const_iterator p = intervals_included.begin(); + p != intervals_included.end(); + ++p) { + bufferlist bit; + bit.substr_of(data_included, off, p.get_len()); + t->write(coll, ghobject_t(target_oid), + p.get_start(), p.get_len(), bit, fadvise_flags); + off += p.get_len(); + } + + if (!omap_entries.empty()) + t->omap_setkeys(coll, ghobject_t(target_oid), omap_entries); + if (!attrs.empty()) + t->setattrs(coll, ghobject_t(target_oid), attrs); + + if (complete) { + if (!first) { + dout(10) << __func__ << ": Removing oid " + << target_oid << " from the temp collection" << dendl; + clear_temp_obj(target_oid); + t->remove(coll, ghobject_t(recovery_info.soid)); + t->collection_move_rename(coll, ghobject_t(target_oid), + 
coll, ghobject_t(recovery_info.soid)); + } + + submit_push_complete(recovery_info, t); + + } +} + +void ReplicatedBackend::submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t) +{ + for (map>::const_iterator p = + recovery_info.clone_subset.begin(); + p != recovery_info.clone_subset.end(); + ++p) { + for (interval_set::const_iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + dout(15) << " clone_range " << p->first << " " + << q.get_start() << "~" << q.get_len() << dendl; + t->clone_range(coll, ghobject_t(p->first), ghobject_t(recovery_info.soid), + q.get_start(), q.get_len(), q.get_start()); + } + } +} + +ObjectRecoveryInfo ReplicatedBackend::recalc_subsets( + const ObjectRecoveryInfo& recovery_info, + SnapSetContext *ssc, + ObcLockManager &manager) +{ + if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP) + return recovery_info; + ObjectRecoveryInfo new_info = recovery_info; + new_info.copy_subset.clear(); + new_info.clone_subset.clear(); + ceph_assert(ssc); + get_parent()->release_locks(manager); // might already have locks + calc_clone_subsets( + ssc->snapset, new_info.soid, get_parent()->get_local_missing(), + get_info().last_backfill, + new_info.copy_subset, new_info.clone_subset, + manager); + return new_info; +} + +bool ReplicatedBackend::handle_pull_response( + pg_shard_t from, const PushOp &pop, PullOp *response, + list *to_continue, + ObjectStore::Transaction *t) +{ + interval_set data_included = pop.data_included; + bufferlist data; + data = pop.data; + dout(10) << "handle_pull_response " + << pop.recovery_info + << pop.after_progress + << " data.size() is " << data.length() + << " data_included: " << data_included + << dendl; + if (pop.version == eversion_t()) { + // replica doesn't have it! + _failed_pull(from, pop.soid); + return false; + } + + const hobject_t &hoid = pop.soid; + ceph_assert((data_included.empty() && data.length() == 0) || + (!data_included.empty() && data.length() > 0)); + + auto piter = pulling.find(hoid); + if (piter == pulling.end()) { + return false; + } + + PullInfo &pi = piter->second; + if (pi.recovery_info.size == (uint64_t(-1))) { + pi.recovery_info.size = pop.recovery_info.size; + pi.recovery_info.copy_subset.intersection_of( + pop.recovery_info.copy_subset); + } + // If primary doesn't have object info and didn't know version + if (pi.recovery_info.version == eversion_t()) { + pi.recovery_info.version = pop.version; + } + + bool first = pi.recovery_progress.first; + if (first) { + // attrs only reference the origin bufferlist (decode from + // MOSDPGPush message) whose size is much greater than attrs in + // recovery. If obc cache it (get_obc maybe cache the attr), this + // causes the whole origin bufferlist would not be free until obc + // is evicted from obc cache. So rebuild the bufferlists before + // cache it. 
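+    // the rebuilt attrs seed the object context; if a snapset attr is present
+    // its seq is sanity-checked against the cached SnapSetContext before the
+    // copy/clone subsets are recalculated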
+ auto attrset = pop.attrset; + for (auto& a : attrset) { + a.second.rebuild(); + } + pi.obc = get_parent()->get_obc(pi.recovery_info.soid, attrset); + if (attrset.find(SS_ATTR) != attrset.end()) { + bufferlist ssbv = attrset.at(SS_ATTR); + SnapSet ss(ssbv); + assert(!pi.obc->ssc->exists || ss.seq == pi.obc->ssc->snapset.seq); + } + pi.recovery_info.oi = pi.obc->obs.oi; + pi.recovery_info = recalc_subsets( + pi.recovery_info, + pi.obc->ssc, + pi.lock_manager); + } + + + interval_set usable_intervals; + bufferlist usable_data; + trim_pushed_data(pi.recovery_info.copy_subset, + data_included, + data, + &usable_intervals, + &usable_data); + data_included = usable_intervals; + data = std::move(usable_data); + + + pi.recovery_progress = pop.after_progress; + + dout(10) << "new recovery_info " << pi.recovery_info + << ", new progress " << pi.recovery_progress + << dendl; + interval_set data_zeros; + uint64_t z_offset = pop.before_progress.data_recovered_to; + uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to; + if (z_length) + data_zeros.insert(z_offset, z_length); + bool complete = pi.is_complete(); + bool clear_omap = !pop.before_progress.omap_complete; + + submit_push_data(pi.recovery_info, + first, + complete, + clear_omap, + pi.cache_dont_need, + data_zeros, + data_included, + data, + pop.omap_header, + pop.attrset, + pop.omap_entries, + t); + + pi.stat.num_keys_recovered += pop.omap_entries.size(); + pi.stat.num_bytes_recovered += data.length(); + get_parent()->get_logger()->inc(l_osd_rbytes, pop.omap_entries.size() + data.length()); + + if (complete) { + pi.stat.num_objects_recovered++; + // XXX: This could overcount if regular recovery is needed right after a repair + if (get_parent()->pg_is_repair()) { + pi.stat.num_objects_repaired++; + get_parent()->inc_osd_stat_repaired(); + } + clear_pull_from(piter); + to_continue->push_back({hoid, pi.stat}); + get_parent()->on_local_recover( + hoid, pi.recovery_info, pi.obc, false, t); + return false; + } else { + response->soid = pop.soid; + response->recovery_info = pi.recovery_info; + response->recovery_progress = pi.recovery_progress; + return true; + } +} + +void ReplicatedBackend::handle_push( + pg_shard_t from, const PushOp &pop, PushReplyOp *response, + ObjectStore::Transaction *t, bool is_repair) +{ + dout(10) << "handle_push " + << pop.recovery_info + << pop.after_progress + << dendl; + bufferlist data; + data = pop.data; + bool first = pop.before_progress.first; + bool complete = pop.after_progress.data_complete && + pop.after_progress.omap_complete; + bool clear_omap = !pop.before_progress.omap_complete; + interval_set data_zeros; + uint64_t z_offset = pop.before_progress.data_recovered_to; + uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to; + if (z_length) + data_zeros.insert(z_offset, z_length); + response->soid = pop.recovery_info.soid; + + submit_push_data(pop.recovery_info, + first, + complete, + clear_omap, + true, // must be replicate + data_zeros, + pop.data_included, + data, + pop.omap_header, + pop.attrset, + pop.omap_entries, + t); + + if (complete) { + if (is_repair) { + get_parent()->inc_osd_stat_repaired(); + dout(20) << __func__ << " repair complete" << dendl; + } + get_parent()->on_local_recover( + pop.recovery_info.soid, + pop.recovery_info, + ObjectContextRef(), // ok, is replica + false, + t); + } +} + +void ReplicatedBackend::send_pushes(int prio, map > &pushes) +{ + for (map >::iterator i = pushes.begin(); + i != 
pushes.end(); + ++i) { + ConnectionRef con = get_parent()->get_con_osd_cluster( + i->first.osd, + get_osdmap_epoch()); + if (!con) + continue; + vector::iterator j = i->second.begin(); + while (j != i->second.end()) { + uint64_t cost = 0; + uint64_t pushes = 0; + MOSDPGPush *msg = new MOSDPGPush(); + msg->from = get_parent()->whoami_shard(); + msg->pgid = get_parent()->primary_spg_t(); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->set_priority(prio); + msg->is_repair = get_parent()->pg_is_repair(); + for (; + (j != i->second.end() && + cost < cct->_conf->osd_max_push_cost && + pushes < cct->_conf->osd_max_push_objects) ; + ++j) { + dout(20) << __func__ << ": sending push " << *j + << " to osd." << i->first << dendl; + cost += j->cost(cct); + pushes += 1; + msg->pushes.push_back(*j); + } + msg->set_cost(cost); + get_parent()->send_message_osd_cluster(msg, con); + } + } +} + +void ReplicatedBackend::send_pulls(int prio, map > &pulls) +{ + for (map >::iterator i = pulls.begin(); + i != pulls.end(); + ++i) { + ConnectionRef con = get_parent()->get_con_osd_cluster( + i->first.osd, + get_osdmap_epoch()); + if (!con) + continue; + dout(20) << __func__ << ": sending pulls " << i->second + << " to osd." << i->first << dendl; + MOSDPGPull *msg = new MOSDPGPull(); + msg->from = parent->whoami_shard(); + msg->set_priority(prio); + msg->pgid = get_parent()->primary_spg_t(); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->set_pulls(std::move(i->second)); + msg->compute_cost(cct); + get_parent()->send_message_osd_cluster(msg, con); + } +} + +int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info, + const ObjectRecoveryProgress &progress, + ObjectRecoveryProgress *out_progress, + PushOp *out_op, + object_stat_sum_t *stat, + bool cache_dont_need) +{ + ObjectRecoveryProgress _new_progress; + if (!out_progress) + out_progress = &_new_progress; + ObjectRecoveryProgress &new_progress = *out_progress; + new_progress = progress; + + dout(7) << __func__ << " " << recovery_info.soid + << " v " << recovery_info.version + << " size " << recovery_info.size + << " recovery_info: " << recovery_info + << dendl; + + eversion_t v = recovery_info.version; + object_info_t oi; + if (progress.first) { + int r = store->omap_get_header(ch, ghobject_t(recovery_info.soid), &out_op->omap_header); + if (r < 0) { + dout(1) << __func__ << " get omap header failed: " << cpp_strerror(-r) << dendl; + return r; + } + r = store->getattrs(ch, ghobject_t(recovery_info.soid), out_op->attrset); + if (r < 0) { + dout(1) << __func__ << " getattrs failed: " << cpp_strerror(-r) << dendl; + return r; + } + + // Debug + bufferlist bv = out_op->attrset[OI_ATTR]; + try { + auto bliter = bv.cbegin(); + decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": bad object_info_t: " << recovery_info.soid << dendl; + return -EINVAL; + } + + // If requestor didn't know the version, use ours + if (v == eversion_t()) { + v = oi.version; + } else if (oi.version != v) { + get_parent()->clog_error() << get_info().pgid << " push " + << recovery_info.soid << " v " + << recovery_info.version + << " failed because local copy is " + << oi.version; + return -EINVAL; + } + + new_progress.first = false; + } + // Once we provide the version subsequent requests will have it, so + // at this point it must be known. 
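+  // a single push op is limited to osd_recovery_max_chunk bytes: omap entries
+  // are packed first (also bounded by osd_recovery_max_omap_entries_per_chunk),
+  // and whatever budget remains goes to data extents chosen from copy_subset
+  // via fiemap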
+ ceph_assert(v != eversion_t()); + + uint64_t available = cct->_conf->osd_recovery_max_chunk; + if (!progress.omap_complete) { + ObjectMap::ObjectMapIterator iter = + store->get_omap_iterator(ch, + ghobject_t(recovery_info.soid)); + ceph_assert(iter); + for (iter->lower_bound(progress.omap_recovered_to); + iter->valid(); + iter->next()) { + if (!out_op->omap_entries.empty() && + ((cct->_conf->osd_recovery_max_omap_entries_per_chunk > 0 && + out_op->omap_entries.size() >= cct->_conf->osd_recovery_max_omap_entries_per_chunk) || + available <= iter->key().size() + iter->value().length())) + break; + out_op->omap_entries.insert(make_pair(iter->key(), iter->value())); + + if ((iter->key().size() + iter->value().length()) <= available) + available -= (iter->key().size() + iter->value().length()); + else + available = 0; + } + if (!iter->valid()) + new_progress.omap_complete = true; + else + new_progress.omap_recovered_to = iter->key(); + } + + if (available > 0) { + if (!recovery_info.copy_subset.empty()) { + interval_set copy_subset = recovery_info.copy_subset; + map m; + int r = store->fiemap(ch, ghobject_t(recovery_info.soid), 0, + copy_subset.range_end(), m); + if (r >= 0) { + interval_set fiemap_included(std::move(m)); + copy_subset.intersection_of(fiemap_included); + } else { + // intersection of copy_subset and empty interval_set would be empty anyway + copy_subset.clear(); + } + + out_op->data_included.span_of(copy_subset, progress.data_recovered_to, + available); + // zero filled section, skip to end! + if (out_op->data_included.empty() || + out_op->data_included.range_end() == copy_subset.range_end()) + new_progress.data_recovered_to = recovery_info.copy_subset.range_end(); + else + new_progress.data_recovered_to = out_op->data_included.range_end(); + } + } else { + out_op->data_included.clear(); + } + + auto origin_size = out_op->data_included.size(); + bufferlist bit; + int r = store->readv(ch, ghobject_t(recovery_info.soid), + out_op->data_included, bit, + cache_dont_need ? 
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0); + if (cct->_conf->osd_debug_random_push_read_error && + (rand() % (int)(cct->_conf->osd_debug_random_push_read_error * 100.0)) == 0) { + dout(0) << __func__ << ": inject EIO " << recovery_info.soid << dendl; + r = -EIO; + } + if (r < 0) { + return r; + } + if (out_op->data_included.size() != origin_size) { + dout(10) << __func__ << " some extents get pruned " + << out_op->data_included.size() << "/" << origin_size + << dendl; + new_progress.data_complete = true; + } + out_op->data.claim_append(bit); + if (progress.first && !out_op->data_included.empty() && + out_op->data_included.begin().get_start() == 0 && + out_op->data.length() == oi.size && oi.is_data_digest()) { + uint32_t crc = out_op->data.crc32c(-1); + if (oi.data_digest != crc) { + dout(0) << __func__ << " " << coll << std::hex + << " full-object read crc 0x" << crc + << " != expected 0x" << oi.data_digest + << std::dec << " on " << recovery_info.soid << dendl; + return -EIO; + } + } + + if (new_progress.is_complete(recovery_info)) { + new_progress.data_complete = true; + if (stat) { + stat->num_objects_recovered++; + if (get_parent()->pg_is_repair()) + stat->num_objects_repaired++; + } + } else if (progress.first && progress.omap_complete) { + // If omap is not changed, we need recovery omap when recovery cannot be completed once + new_progress.omap_complete = false; + } + + if (stat) { + stat->num_keys_recovered += out_op->omap_entries.size(); + stat->num_bytes_recovered += out_op->data.length(); + get_parent()->get_logger()->inc(l_osd_rbytes, out_op->omap_entries.size() + out_op->data.length()); + } + + get_parent()->get_logger()->inc(l_osd_push); + get_parent()->get_logger()->inc(l_osd_push_outb, out_op->data.length()); + + // send + out_op->version = v; + out_op->soid = recovery_info.soid; + out_op->recovery_info = recovery_info; + out_op->after_progress = new_progress; + out_op->before_progress = progress; + return 0; +} + +void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op) +{ + op->recovery_info.version = eversion_t(); + op->version = eversion_t(); + op->soid = soid; +} + +bool ReplicatedBackend::handle_push_reply( + pg_shard_t peer, const PushReplyOp &op, PushOp *reply) +{ + const hobject_t &soid = op.soid; + if (pushing.count(soid) == 0) { + dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer + << ", or anybody else" + << dendl; + return false; + } else if (pushing[soid].count(peer) == 0) { + dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer + << dendl; + return false; + } else { + PushInfo *pi = &pushing[soid][peer]; + bool error = pushing[soid].begin()->second.recovery_progress.error; + + if (!pi->recovery_progress.data_complete && !error) { + dout(10) << " pushing more from, " + << pi->recovery_progress.data_recovered_to + << " of " << pi->recovery_info.copy_subset << dendl; + ObjectRecoveryProgress new_progress; + int r = build_push_op( + pi->recovery_info, + pi->recovery_progress, &new_progress, reply, + &(pi->stat)); + // Handle the case of a read error right after we wrote, which is + // hopefully extremely rare. + if (r < 0) { + dout(5) << __func__ << ": oid " << soid << " error " << r << dendl; + + error = true; + goto done; + } + pi->recovery_progress = new_progress; + return true; + } else { + // done! 
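+      // The label below is shared by the normal completion path and the
+      // read-error path above: release the push locks, drop this peer from
+      // pushing[soid], and only report global recovery (or a failed pull on
+      // error) once the last outstanding peer has acked.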
+done: + if (!error) + get_parent()->on_peer_recover( peer, soid, pi->recovery_info); + + get_parent()->release_locks(pi->lock_manager); + object_stat_sum_t stat = pi->stat; + eversion_t v = pi->recovery_info.version; + pushing[soid].erase(peer); + pi = NULL; + + if (pushing[soid].empty()) { + if (!error) + get_parent()->on_global_recover(soid, stat, false); + else + get_parent()->on_failed_pull( + std::set{ get_parent()->whoami_shard() }, + soid, + v); + pushing.erase(soid); + } else { + // This looks weird, but we erased the current peer and need to remember + // the error on any other one, while getting more acks. + if (error) + pushing[soid].begin()->second.recovery_progress.error = true; + dout(10) << "pushed " << soid << ", still waiting for push ack from " + << pushing[soid].size() << " others" << dendl; + } + return false; + } + } +} + +void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply) +{ + const hobject_t &soid = op.soid; + struct stat st; + int r = store->stat(ch, ghobject_t(soid), &st); + if (r != 0) { + get_parent()->clog_error() << get_info().pgid << " " + << peer << " tried to pull " << soid + << " but got " << cpp_strerror(-r); + prep_push_op_blank(soid, reply); + } else { + ObjectRecoveryInfo &recovery_info = op.recovery_info; + ObjectRecoveryProgress &progress = op.recovery_progress; + if (progress.first && recovery_info.size == ((uint64_t)-1)) { + // Adjust size and copy_subset + recovery_info.size = st.st_size; + if (st.st_size) { + interval_set object_range; + object_range.insert(0, st.st_size); + recovery_info.copy_subset.intersection_of(object_range); + } else { + recovery_info.copy_subset.clear(); + } + assert(recovery_info.clone_subset.empty()); + } + + r = build_push_op(recovery_info, progress, 0, reply); + if (r < 0) + prep_push_op_blank(soid, reply); + } +} + +/** + * trim received data to remove what we don't want + * + * @param copy_subset intervals we want + * @param data_included intervals we got + * @param data_recieved data we got + * @param intervals_usable intervals we want to keep + * @param data_usable matching data we want to keep + */ +void ReplicatedBackend::trim_pushed_data( + const interval_set ©_subset, + const interval_set &intervals_received, + bufferlist data_received, + interval_set *intervals_usable, + bufferlist *data_usable) +{ + if (intervals_received.subset_of(copy_subset)) { + *intervals_usable = intervals_received; + *data_usable = data_received; + return; + } + + intervals_usable->intersection_of(copy_subset, + intervals_received); + + uint64_t off = 0; + for (interval_set::const_iterator p = intervals_received.begin(); + p != intervals_received.end(); + ++p) { + interval_set x; + x.insert(p.get_start(), p.get_len()); + x.intersection_of(copy_subset); + for (interval_set::const_iterator q = x.begin(); + q != x.end(); + ++q) { + bufferlist sub; + uint64_t data_off = off + (q.get_start() - p.get_start()); + sub.substr_of(data_received, data_off, q.get_len()); + data_usable->claim_append(sub); + } + off += p.get_len(); + } +} + +void ReplicatedBackend::_failed_pull(pg_shard_t from, const hobject_t &soid) +{ + dout(20) << __func__ << ": " << soid << " from " << from << dendl; + auto it = pulling.find(soid); + assert(it != pulling.end()); + get_parent()->on_failed_pull( + { from }, + soid, + it->second.recovery_info.version); + + clear_pull(it); +} + +void ReplicatedBackend::clear_pull_from( + map::iterator piter) +{ + auto from = piter->second.from; + pull_from_peer[from].erase(piter->second.soid); + if 
(pull_from_peer[from].empty()) + pull_from_peer.erase(from); +} + +void ReplicatedBackend::clear_pull( + map::iterator piter, + bool clear_pull_from_peer) +{ + if (clear_pull_from_peer) { + clear_pull_from(piter); + } + get_parent()->release_locks(piter->second.lock_manager); + pulling.erase(piter); +} + +int ReplicatedBackend::start_pushes( + const hobject_t &soid, + ObjectContextRef obc, + RPGHandle *h) +{ + list< map::const_iterator > shards; + + dout(20) << __func__ << " soid " << soid << dendl; + // who needs it? + ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0); + for (set::iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + if (*i == get_parent()->whoami_shard()) continue; + pg_shard_t peer = *i; + map::const_iterator j = + get_parent()->get_shard_missing().find(peer); + ceph_assert(j != get_parent()->get_shard_missing().end()); + if (j->second.is_missing(soid)) { + shards.push_back(j); + } + } + + // If more than 1 read will occur ignore possible request to not cache + bool cache = shards.size() == 1 ? h->cache_dont_need : false; + + for (auto j : shards) { + pg_shard_t peer = j->first; + h->pushes[peer].push_back(PushOp()); + int r = prep_push_to_replica(obc, soid, peer, + &(h->pushes[peer].back()), cache); + if (r < 0) { + // Back out all failed reads + for (auto k : shards) { + pg_shard_t p = k->first; + dout(10) << __func__ << " clean up peer " << p << dendl; + h->pushes[p].pop_back(); + if (p == peer) break; + } + return r; + } + } + return shards.size(); +} diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h new file mode 100644 index 000000000..f4b506357 --- /dev/null +++ b/src/osd/ReplicatedBackend.h @@ -0,0 +1,437 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef REPBACKEND_H +#define REPBACKEND_H + +#include "PGBackend.h" + +struct C_ReplicatedBackend_OnPullComplete; +class ReplicatedBackend : public PGBackend { + struct RPGHandle : public PGBackend::RecoveryHandle { + std::map > pushes; + std::map > pulls; + }; + friend struct C_ReplicatedBackend_OnPullComplete; +public: + ReplicatedBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct); + + /// @see PGBackend::open_recovery_op + RPGHandle *_open_recovery_op() { + return new RPGHandle(); + } + PGBackend::RecoveryHandle *open_recovery_op() override { + return _open_recovery_op(); + } + + /// @see PGBackend::run_recovery_op + void run_recovery_op( + PGBackend::RecoveryHandle *h, + int priority) override; + + /// @see PGBackend::recover_object + int recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *h + ) override; + + void check_recovery_sources(const OSDMapRef& osdmap) override; + + bool can_handle_while_inactive(OpRequestRef op) override; + + /// @see PGBackend::handle_message + bool _handle_message( + OpRequestRef op + ) override; + + void on_change() override; + void clear_recovery_state() override; + + class RPCRecPred : public IsPGRecoverablePredicate { + public: + bool operator()(const std::set &have) const override { + return !have.empty(); + } + }; + IsPGRecoverablePredicate *get_is_recoverable_predicate() const override { + return new RPCRecPred; + } + + class RPCReadPred : public IsPGReadablePredicate { + pg_shard_t whoami; + public: + explicit RPCReadPred(pg_shard_t whoami) : whoami(whoami) {} + bool operator()(const std::set &have) const override { + return have.count(whoami); + } + }; + IsPGReadablePredicate *get_is_readable_predicate() const override { + return new RPCReadPred(get_parent()->whoami_shard()); + } + + void dump_recovery_info(ceph::Formatter *f) const override { + { + f->open_array_section("pull_from_peer"); + for (std::map >::const_iterator i = pull_from_peer.begin(); + i != pull_from_peer.end(); + ++i) { + f->open_object_section("pulling_from"); + f->dump_stream("pull_from") << i->first; + { + f->open_array_section("pulls"); + for (std::set::const_iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + f->open_object_section("pull_info"); + ceph_assert(pulling.count(*j)); + pulling.find(*j)->second.dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + { + f->open_array_section("pushing"); + for (std::map>::const_iterator i = + pushing.begin(); + i != pushing.end(); + ++i) { + f->open_object_section("object"); + f->dump_stream("pushing") << i->first; + { + f->open_array_section("pushing_to"); + for (std::map::const_iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + f->open_object_section("push_progress"); + f->dump_stream("pushing_to") << j->first; + { + f->open_object_section("push_info"); + j->second.dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + } + + int objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + ceph::buffer::list *bl) override; + + int objects_readv_sync( + const hobject_t &hoid, + std::map&& m, + uint32_t op_flags, + ceph::buffer::list *bl) override; + + void objects_read_async( + const hobject_t &hoid, + const std::list, + std::pair > > &to_read, + Context 
*on_complete, + bool fast_read = false) override; + +private: + // push + struct PushInfo { + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + ObjectContextRef obc; + object_stat_sum_t stat; + ObcLockManager lock_manager; + + void dump(ceph::Formatter *f) const { + { + f->open_object_section("recovery_progress"); + recovery_progress.dump(f); + f->close_section(); + } + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + } + }; + std::map> pushing; + + // pull + struct PullInfo { + pg_shard_t from; + hobject_t soid; + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + ObjectContextRef head_ctx; + ObjectContextRef obc; + object_stat_sum_t stat; + bool cache_dont_need; + ObcLockManager lock_manager; + + void dump(ceph::Formatter *f) const { + { + f->open_object_section("recovery_progress"); + recovery_progress.dump(f); + f->close_section(); + } + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + } + + bool is_complete() const { + return recovery_progress.is_complete(recovery_info); + } + }; + + std::map pulling; + + // Reverse mapping from osd peer to objects being pulled from that peer + std::map > pull_from_peer; + void clear_pull( + std::map::iterator piter, + bool clear_pull_from_peer = true); + void clear_pull_from( + std::map::iterator piter); + + void _do_push(OpRequestRef op); + void _do_pull_response(OpRequestRef op); + void do_push(OpRequestRef op) { + if (is_primary()) { + _do_pull_response(op); + } else { + _do_push(op); + } + } + void do_pull(OpRequestRef op); + void do_push_reply(OpRequestRef op); + + bool handle_push_reply(pg_shard_t peer, const PushReplyOp &op, PushOp *reply); + void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply); + + struct pull_complete_info { + hobject_t hoid; + object_stat_sum_t stat; + }; + bool handle_pull_response( + pg_shard_t from, const PushOp &op, PullOp *response, + std::list *to_continue, + ObjectStore::Transaction *t); + void handle_push(pg_shard_t from, const PushOp &op, PushReplyOp *response, + ObjectStore::Transaction *t, bool is_repair); + + static void trim_pushed_data(const interval_set ©_subset, + const interval_set &intervals_received, + ceph::buffer::list data_received, + interval_set *intervals_usable, + ceph::buffer::list *data_usable); + void _failed_pull(pg_shard_t from, const hobject_t &soid); + + void send_pushes(int prio, std::map > &pushes); + void prep_push_op_blank(const hobject_t& soid, PushOp *op); + void send_pulls( + int priority, + std::map > &pulls); + + int build_push_op(const ObjectRecoveryInfo &recovery_info, + const ObjectRecoveryProgress &progress, + ObjectRecoveryProgress *out_progress, + PushOp *out_op, + object_stat_sum_t *stat = 0, + bool cache_dont_need = true); + void submit_push_data(const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + bool cache_dont_need, + interval_set &data_zeros, + const interval_set &intervals_included, + ceph::buffer::list data_included, + ceph::buffer::list omap_header, + const std::map &attrs, + const std::map &omap_entries, + ObjectStore::Transaction *t); + void submit_push_complete(const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t); + + void calc_clone_subsets( + SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set& data_subset, + std::map>& clone_subsets, + ObcLockManager &lock_manager); + void 
prepare_pull( + eversion_t v, + const hobject_t& soid, + ObjectContextRef headctx, + RPGHandle *h); + int start_pushes( + const hobject_t &soid, + ObjectContextRef obj, + RPGHandle *h); + int prep_push_to_replica( + ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, + PushOp *pop, bool cache_dont_need = true); + int prep_push( + ObjectContextRef obc, + const hobject_t& oid, pg_shard_t dest, + PushOp *op, + bool cache_dont_need); + int prep_push( + ObjectContextRef obc, + const hobject_t& soid, pg_shard_t peer, + eversion_t version, + interval_set &data_subset, + std::map>& clone_subsets, + PushOp *op, + bool cache, + ObcLockManager &&lock_manager); + void calc_head_subsets( + ObjectContextRef obc, SnapSet& snapset, const hobject_t& head, + const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set& data_subset, + std::map>& clone_subsets, + ObcLockManager &lock_manager); + ObjectRecoveryInfo recalc_subsets( + const ObjectRecoveryInfo& recovery_info, + SnapSetContext *ssc, + ObcLockManager &lock_manager); + + /** + * Client IO + */ + struct InProgressOp : public RefCountedObject { + ceph_tid_t tid; + std::set waiting_for_commit; + Context *on_commit; + OpRequestRef op; + eversion_t v; + bool done() const { + return waiting_for_commit.empty(); + } + private: + FRIEND_MAKE_REF(InProgressOp); + InProgressOp(ceph_tid_t tid, Context *on_commit, OpRequestRef op, eversion_t v) + : + tid(tid), on_commit(on_commit), + op(op), v(v) {} + }; + std::map> in_progress_ops; +public: + friend class C_OSD_OnOpCommit; + + void call_write_ordered(std::function &&cb) override { + // ReplicatedBackend submits writes inline in submit_transaction, so + // we can just call the callback. + cb(); + } + + void submit_transaction( + const hobject_t &hoid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&t, + const eversion_t &trim_to, + const eversion_t &min_last_complete_ondisk, + std::vector&& log_entries, + std::optional &hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef op + ) override; + +private: + Message * generate_subop( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t min_last_complete_ondisk, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const ceph::buffer::list &log_entries, + std::optional &hset_history, + ObjectStore::Transaction &op_t, + pg_shard_t peer, + const pg_info_t &pinfo); + void issue_op( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t min_last_complete_ondisk, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const std::vector &log_entries, + std::optional &hset_history, + InProgressOp *op, + ObjectStore::Transaction &op_t); + void op_commit(const ceph::ref_t& op); + void do_repop_reply(OpRequestRef op); + void do_repop(OpRequestRef op); + + struct RepModify { + OpRequestRef op; + bool committed; + int ackerosd; + eversion_t last_complete; + epoch_t epoch_started; + + ObjectStore::Transaction opt, localt; + + RepModify() : committed(false), ackerosd(-1), + epoch_started(0) {} + }; + typedef std::shared_ptr RepModifyRef; + + struct C_OSD_RepModifyCommit; + + void repop_commit(RepModifyRef rm); + bool auto_repair_supported() const override { return store->has_builtin_csum(); } + + + int be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) 
override; + uint64_t be_get_ondisk_size(uint64_t logical_size) override { return logical_size; } +}; + +#endif diff --git a/src/osd/ScrubStore.cc b/src/osd/ScrubStore.cc new file mode 100644 index 000000000..a692a4435 --- /dev/null +++ b/src/osd/ScrubStore.cc @@ -0,0 +1,198 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ScrubStore.h" +#include "osd_types.h" +#include "common/scrub_types.h" +#include "include/rados/rados_types.hpp" + +using std::ostringstream; +using std::string; +using std::vector; + +using ceph::bufferlist; + +namespace { +ghobject_t make_scrub_object(const spg_t& pgid) +{ + ostringstream ss; + ss << "scrub_" << pgid; + return pgid.make_temp_ghobject(ss.str()); +} + +string first_object_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0x00000000, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +// the object_key should be unique across pools +string to_object_key(int64_t pool, const librados::object_id_t& oid) +{ + auto hoid = hobject_t(object_t(oid.name), + oid.locator, // key + oid.snap, + 0, // hash + pool, + oid.nspace); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +string last_object_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0xffffffff, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +string first_snap_key(int64_t pool) +{ + // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for + // the representing the minimal and maximum keys. and this relies on how + // hobject_t::to_str() works: hex(pool).hex(revhash). + auto hoid = hobject_t(object_t(), + "", + 0, + 0x00000000, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} + +string to_snap_key(int64_t pool, const librados::object_id_t& oid) +{ + auto hoid = hobject_t(object_t(oid.name), + oid.locator, // key + oid.snap, + 0x77777777, // hash + pool, + oid.nspace); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} + +string last_snap_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0xffffffff, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} +} + +namespace Scrub { + +Store* +Store::create(ObjectStore* store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll) +{ + ceph_assert(store); + ceph_assert(t); + ghobject_t oid = make_scrub_object(pgid); + t->touch(coll, oid); + return new Store{coll, oid, store}; +} + +Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) + : coll(coll), + hoid(oid), + driver(store, coll, hoid), + backend(&driver) +{} + +Store::~Store() +{ + ceph_assert(results.empty()); +} + +void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) +{ + bufferlist bl; + e.encode(bl); + results[to_object_key(pool, e.object)] = bl; +} + +void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) +{ + bufferlist bl; + e.encode(bl); + results[to_snap_key(pool, e.object)] = bl; +} + +bool Store::empty() const +{ + return results.empty(); +} + +void Store::flush(ObjectStore::Transaction* t) +{ + if (t) { + OSDriver::OSTransaction txn = driver.get_transaction(t); + backend.set_keys(results, &txn); + } + results.clear(); +} + +void Store::cleanup(ObjectStore::Transaction* t) +{ + t->remove(coll, hoid); +} + +std::vector +Store::get_snap_errors(int64_t pool, + const librados::object_id_t& start, + 
uint64_t max_return) const +{ + const string begin = (start.name.empty() ? + first_snap_key(pool) : to_snap_key(pool, start)); + const string end = last_snap_key(pool); + return get_errors(begin, end, max_return); +} + +std::vector +Store::get_object_errors(int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const +{ + const string begin = (start.name.empty() ? + first_object_key(pool) : to_object_key(pool, start)); + const string end = last_object_key(pool); + return get_errors(begin, end, max_return); +} + +std::vector +Store::get_errors(const string& begin, + const string& end, + uint64_t max_return) const +{ + vector errors; + auto next = std::make_pair(begin, bufferlist{}); + while (max_return && !backend.get_next(next.first, &next)) { + if (next.first >= end) + break; + errors.push_back(next.second); + max_return--; + } + return errors; +} + +} // namespace Scrub diff --git a/src/osd/ScrubStore.h b/src/osd/ScrubStore.h new file mode 100644 index 000000000..721aae092 --- /dev/null +++ b/src/osd/ScrubStore.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_SCRUB_RESULT_H +#define CEPH_SCRUB_RESULT_H + +#include "SnapMapper.h" // for OSDriver +#include "common/map_cacher.hpp" + +namespace librados { + struct object_id_t; +} + +struct inconsistent_obj_wrapper; +struct inconsistent_snapset_wrapper; + +namespace Scrub { + +class Store { +public: + ~Store(); + static Store* create(ObjectStore* store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); + void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); + bool empty() const; + void flush(ObjectStore::Transaction *); + void cleanup(ObjectStore::Transaction *); + std::vector get_snap_errors(int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const; + std::vector get_object_errors(int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const; +private: + Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); + std::vector get_errors(const std::string& start, const std::string& end, + uint64_t max_return) const; +private: + const coll_t coll; + const ghobject_t hoid; + // a temp object holding mappings from seq-id to inconsistencies found in + // scrubbing + OSDriver driver; + mutable MapCacher::MapCacher backend; + std::map results; +}; +} + +#endif // CEPH_SCRUB_RESULT_H diff --git a/src/osd/Session.cc b/src/osd/Session.cc new file mode 100644 index 000000000..454e1b857 --- /dev/null +++ b/src/osd/Session.cc @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PG.h" +#include "Session.h" + +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd + +using std::map; +using std::set; + +void Session::clear_backoffs() +{ + map>>> ls; + { + std::lock_guard l(backoff_lock); + ls.swap(backoffs); + backoff_count = 0; + } + for (auto& i : ls) { + for (auto& p : i.second) { + for (auto& b : p.second) { + std::lock_guard l(b->lock); + if (b->pg) { + ceph_assert(b->session == this); + ceph_assert(b->is_new() || b->is_acked()); + b->pg->rm_backoff(b); + b->pg.reset(); + b->session.reset(); + } else if (b->session) { + ceph_assert(b->session == this); + ceph_assert(b->is_deleting()); + b->session.reset(); + } + } + } + } +} + +void Session::ack_backoff( + CephContext 
*cct, + spg_t pgid, + uint64_t id, + const hobject_t& begin, + const hobject_t& end) +{ + std::lock_guard l(backoff_lock); + auto p = backoffs.find(pgid); + if (p == backoffs.end()) { + dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << "," + << end << ") pg not found" << dendl; + return; + } + auto q = p->second.find(begin); + if (q == p->second.end()) { + dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << "," + << end << ") begin not found" << dendl; + return; + } + for (auto i = q->second.begin(); i != q->second.end(); ++i) { + Backoff *b = (*i).get(); + if (b->id == id) { + if (b->is_new()) { + b->state = Backoff::STATE_ACKED; + dout(20) << __func__ << " now " << *b << dendl; + } else if (b->is_deleting()) { + dout(20) << __func__ << " deleting " << *b << dendl; + q->second.erase(i); + --backoff_count; + } + break; + } + } + if (q->second.empty()) { + dout(20) << __func__ << " clearing begin bin " << q->first << dendl; + p->second.erase(q); + if (p->second.empty()) { + dout(20) << __func__ << " clearing pg bin " << p->first << dendl; + backoffs.erase(p); + } + } + ceph_assert(!backoff_count == backoffs.empty()); +} + +bool Session::check_backoff( + CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m) +{ + auto b = have_backoff(pgid, oid); + if (b) { + dout(10) << __func__ << " session " << this << " has backoff " << *b + << " for " << *m << dendl; + ceph_assert(!b->is_acked() || !g_conf()->osd_debug_crash_on_ignored_backoff); + return true; + } + // we may race with ms_handle_reset. it clears session->con before removing + // backoffs, so if we see con is cleared here we have to abort this + // request. + if (!con) { + dout(10) << __func__ << " session " << this << " disconnected" << dendl; + return true; + } + return false; +} diff --git a/src/osd/Session.h b/src/osd/Session.h new file mode 100644 index 000000000..a42d37bfe --- /dev/null +++ b/src/osd/Session.h @@ -0,0 +1,240 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_SESSION_H +#define CEPH_OSD_SESSION_H + +#include "common/RefCountedObj.h" +#include "common/ceph_mutex.h" +#include "global/global_context.h" +#include "include/spinlock.h" +#include "OSDCap.h" +#include "Watch.h" +#include "OSDMap.h" +#include "PeeringState.h" + +//#define PG_DEBUG_REFS + +class PG; +#ifdef PG_DEBUG_REFS +#include "common/tracked_int_ptr.hpp" +typedef TrackedIntPtr PGRef; +#else +typedef boost::intrusive_ptr PGRef; +#endif + +/* + * A Backoff represents one instance of either a PG or an OID + * being plugged at the client. It's refcounted and linked from + * the PG {pg_oid}_backoffs map and from the client Session + * object. + * + * The Backoff has a lock that protects it's internal fields. + * + * The PG has a backoff_lock that protects it's maps to Backoffs. + * This lock is *inside* of Backoff::lock. + * + * The Session has a backoff_lock that protects it's map of pg and + * oid backoffs. This lock is *inside* the Backoff::lock *and* + * PG::backoff_lock. 
+ * + * That's + * + * Backoff::lock + * PG::backoff_lock + * Session::backoff_lock + * + * When the Session goes away, we move our backoff lists aside, + * then we lock each of the Backoffs we + * previously referenced and clear the Session* pointer. If the PG + * is still linked, we unlink it, too. + * + * When the PG clears the backoff, it will send an unblock message + * if the Session* is still non-null, and unlink the session. + * + */ + +struct Backoff : public RefCountedObject { + enum { + STATE_NEW = 1, ///< backoff in flight to client + STATE_ACKED = 2, ///< backoff acked + STATE_DELETING = 3 ///< backoff deleted, but un-acked + }; + std::atomic state = {STATE_NEW}; + spg_t pgid; ///< owning pgid + uint64_t id = 0; ///< unique id (within the Session) + + bool is_new() const { + return state.load() == STATE_NEW; + } + bool is_acked() const { + return state.load() == STATE_ACKED; + } + bool is_deleting() const { + return state.load() == STATE_DELETING; + } + const char *get_state_name() const { + switch (state.load()) { + case STATE_NEW: return "new"; + case STATE_ACKED: return "acked"; + case STATE_DELETING: return "deleting"; + default: return "???"; + } + } + + ceph::mutex lock = ceph::make_mutex("Backoff::lock"); + // NOTE: the owning PG and session are either + // - *both* set, or + // - both null (teardown), or + // - only session is set (and state == DELETING) + PGRef pg; ///< owning pg + ceph::ref_t session; ///< owning session + hobject_t begin, end; ///< [) range to block, unless ==, then single obj + + friend ostream& operator<<(ostream& out, const Backoff& b) { + return out << "Backoff(" << &b << " " << b.pgid << " " << b.id + << " " << b.get_state_name() + << " [" << b.begin << "," << b.end << ") " + << " session " << b.session + << " pg " << b.pg << ")"; + } + +private: + FRIEND_MAKE_REF(Backoff); + Backoff(spg_t pgid, PGRef pg, ceph::ref_t s, + uint64_t i, + const hobject_t& b, const hobject_t& e) + : RefCountedObject(g_ceph_context), + pgid(pgid), + id(i), + pg(pg), + session(std::move(s)), + begin(b), + end(e) {} +}; + + + +struct Session : public RefCountedObject { + EntityName entity_name; + OSDCap caps; + ConnectionRef con; + entity_addr_t socket_addr; + WatchConState wstate; + + ceph::mutex session_dispatch_lock = + ceph::make_mutex("Session::session_dispatch_lock"); + boost::intrusive::list waiting_on_map; + + ceph::spinlock sent_epoch_lock; + epoch_t last_sent_epoch = 0; + + /// protects backoffs; orders inside Backoff::lock *and* PG::backoff_lock + ceph::mutex backoff_lock = ceph::make_mutex("Session::backoff_lock"); + std::atomic backoff_count= {0}; ///< simple count of backoffs + std::map>>> backoffs; + + std::atomic backoff_seq = {0}; + + // for heartbeat connections only + int peer = -1; + HeartbeatStampsRef stamps; + + entity_addr_t& get_peer_socket_addr() { + return socket_addr; + } + + void ack_backoff( + CephContext *cct, + spg_t pgid, + uint64_t id, + const hobject_t& start, + const hobject_t& end); + + ceph::ref_t have_backoff(spg_t pgid, const hobject_t& oid) { + if (!backoff_count.load()) { + return nullptr; + } + std::lock_guard l(backoff_lock); + ceph_assert(!backoff_count == backoffs.empty()); + auto i = backoffs.find(pgid); + if (i == backoffs.end()) { + return nullptr; + } + auto p = i->second.lower_bound(oid); + if (p != i->second.begin() && + (p == i->second.end() || p->first > oid)) { + --p; + } + if (p != i->second.end()) { + int r = cmp(oid, p->first); + if (r == 0 || r > 0) { + for (auto& q : p->second) { + if (r == 0 || oid < q->end) 
{ + return &(*q); + } + } + } + } + return nullptr; + } + + bool check_backoff( + CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m); + + void add_backoff(ceph::ref_t b) { + std::lock_guard l(backoff_lock); + ceph_assert(!backoff_count == backoffs.empty()); + backoffs[b->pgid][b->begin].insert(std::move(b)); + ++backoff_count; + } + + // called by PG::release_*_backoffs and PG::clear_backoffs() + void rm_backoff(const ceph::ref_t& b) { + std::lock_guard l(backoff_lock); + ceph_assert(ceph_mutex_is_locked_by_me(b->lock)); + ceph_assert(b->session == this); + auto i = backoffs.find(b->pgid); + if (i != backoffs.end()) { + // may race with clear_backoffs() + auto p = i->second.find(b->begin); + if (p != i->second.end()) { + auto q = p->second.find(b); + if (q != p->second.end()) { + p->second.erase(q); + --backoff_count; + if (p->second.empty()) { + i->second.erase(p); + if (i->second.empty()) { + backoffs.erase(i); + } + } + } + } + } + ceph_assert(!backoff_count == backoffs.empty()); + } + void clear_backoffs(); + +private: + FRIEND_MAKE_REF(Session); + explicit Session(CephContext *cct, Connection *con_) : + RefCountedObject(cct), + con(con_), + socket_addr(con_->get_peer_socket_addr()), + wstate(cct) + {} +}; + +#endif diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc new file mode 100644 index 000000000..804213b1f --- /dev/null +++ b/src/osd/SnapMapper.cc @@ -0,0 +1,752 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "SnapMapper.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout << "snap_mapper." + +using std::make_pair; +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::timespan_str; + +const string SnapMapper::LEGACY_MAPPING_PREFIX = "MAP_"; +const string SnapMapper::MAPPING_PREFIX = "SNA_"; +const string SnapMapper::OBJECT_PREFIX = "OBJ_"; + +const char *SnapMapper::PURGED_SNAP_PREFIX = "PSN_"; + +/* + + We have a bidirectional mapping, (1) from each snap+obj to object, + sorted by snapshot, such that we can enumerate to identify all clones + mapped to a particular snapshot, and (2) from object to snaps, so we + can identify which reverse mappings exist for any given object (and, + e.g., clean up on deletion). + + "MAP_" + + ("%016x" % snapid) + + "_" + + (".%x" % shard_id) + + "_" + + hobject_t::to_str() ("%llx.%8x.%lx.name...." % pool, hash, snap) + -> SnapMapping::Mapping { snap, hoid } + + "SNA_" + + ("%lld" % poolid) + + "_" + + ("%016x" % snapid) + + "_" + + (".%x" % shard_id) + + "_" + + hobject_t::to_str() ("%llx.%8x.%lx.name...." 
% pool, hash, snap) + -> SnapMapping::Mapping { snap, hoid } + + "OBJ_" + + + (".%x" % shard_id) + + hobject_t::to_str() + -> SnapMapper::object_snaps { oid, set } + + */ + +int OSDriver::get_keys( + const std::set &keys, + std::map *out) +{ + return os->omap_get_values(ch, hoid, keys, out); +} + +int OSDriver::get_next( + const std::string &key, + pair *next) +{ + ObjectMap::ObjectMapIterator iter = + os->get_omap_iterator(ch, hoid); + if (!iter) { + ceph_abort(); + return -EINVAL; + } + iter->upper_bound(key); + if (iter->valid()) { + if (next) + *next = make_pair(iter->key(), iter->value()); + return 0; + } else { + return -ENOENT; + } +} + +string SnapMapper::get_prefix(int64_t pool, snapid_t snap) +{ + char buf[100]; + int len = snprintf( + buf, sizeof(buf), + "%lld_%.*X_", + (long long)pool, + (int)(sizeof(snap)*2), static_cast(snap)); + return MAPPING_PREFIX + string(buf, len); +} + +string SnapMapper::to_raw_key( + const pair &in) +{ + return get_prefix(in.second.pool, in.first) + shard_prefix + in.second.to_str(); +} + +pair SnapMapper::to_raw( + const pair &in) +{ + bufferlist bl; + encode(Mapping(in), bl); + return make_pair( + to_raw_key(in), + bl); +} + +pair SnapMapper::from_raw( + const pair &image) +{ + using ceph::decode; + Mapping map; + bufferlist bl(image.second); + auto bp = bl.cbegin(); + decode(map, bp); + return make_pair(map.snap, map.hoid); +} + +bool SnapMapper::is_mapping(const string &to_test) +{ + return to_test.substr(0, MAPPING_PREFIX.size()) == MAPPING_PREFIX; +} + +string SnapMapper::to_object_key(const hobject_t &hoid) +{ + return OBJECT_PREFIX + shard_prefix + hoid.to_str(); +} + +void SnapMapper::object_snaps::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(oid, bl); + encode(snaps, bl); + ENCODE_FINISH(bl); +} + +void SnapMapper::object_snaps::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(oid, bl); + decode(snaps, bl); + DECODE_FINISH(bl); +} + +bool SnapMapper::check(const hobject_t &hoid) const +{ + if (hoid.match(mask_bits, match)) { + return true; + } + derr << __func__ << " " << hoid << " mask_bits " << mask_bits + << " match 0x" << std::hex << match << std::dec << " is false" + << dendl; + return false; +} + +int SnapMapper::get_snaps( + const hobject_t &oid, + object_snaps *out) +{ + ceph_assert(check(oid)); + set keys; + map got; + keys.insert(to_object_key(oid)); + int r = backend.get_keys(keys, &got); + if (r < 0) { + dout(20) << __func__ << " " << oid << " got err " << r << dendl; + return r; + } + if (got.empty()) { + dout(20) << __func__ << " " << oid << " got.empty()" << dendl; + return -ENOENT; + } + if (out) { + auto bp = got.begin()->second.cbegin(); + decode(*out, bp); + dout(20) << __func__ << " " << oid << " " << out->snaps << dendl; + if (out->snaps.empty()) { + dout(1) << __func__ << " " << oid << " empty snapset" << dendl; + ceph_assert(!cct->_conf->osd_debug_verify_snaps); + } + } else { + dout(20) << __func__ << " " << oid << " (out == NULL)" << dendl; + } + return 0; +} + +void SnapMapper::clear_snaps( + const hobject_t &oid, + MapCacher::Transaction *t) +{ + dout(20) << __func__ << " " << oid << dendl; + ceph_assert(check(oid)); + set to_remove; + to_remove.insert(to_object_key(oid)); + if (g_conf()->subsys.should_gather()) { + for (auto& i : to_remove) { + dout(20) << __func__ << " rm " << i << dendl; + } + } + backend.remove_keys(to_remove, t); +} + +void SnapMapper::set_snaps( + const hobject_t &oid, + const object_snaps &in, + MapCacher::Transaction *t) +{ + 
ceph_assert(check(oid)); + map to_set; + bufferlist bl; + encode(in, bl); + to_set[to_object_key(oid)] = bl; + dout(20) << __func__ << " " << oid << " " << in.snaps << dendl; + if (g_conf()->subsys.should_gather()) { + for (auto& i : to_set) { + dout(20) << __func__ << " set " << i.first << dendl; + } + } + backend.set_keys(to_set, t); +} + +int SnapMapper::update_snaps( + const hobject_t &oid, + const set &new_snaps, + const set *old_snaps_check, + MapCacher::Transaction *t) +{ + dout(20) << __func__ << " " << oid << " " << new_snaps + << " was " << (old_snaps_check ? *old_snaps_check : set()) + << dendl; + ceph_assert(check(oid)); + if (new_snaps.empty()) + return remove_oid(oid, t); + + object_snaps out; + int r = get_snaps(oid, &out); + // Tolerate missing keys but not disk errors + if (r < 0 && r != -ENOENT) + return r; + if (old_snaps_check) + ceph_assert(out.snaps == *old_snaps_check); + + object_snaps in(oid, new_snaps); + set_snaps(oid, in, t); + + set to_remove; + for (set::iterator i = out.snaps.begin(); + i != out.snaps.end(); + ++i) { + if (!new_snaps.count(*i)) { + to_remove.insert(to_raw_key(make_pair(*i, oid))); + } + } + if (g_conf()->subsys.should_gather()) { + for (auto& i : to_remove) { + dout(20) << __func__ << " rm " << i << dendl; + } + } + backend.remove_keys(to_remove, t); + return 0; +} + +void SnapMapper::add_oid( + const hobject_t &oid, + const set& snaps, + MapCacher::Transaction *t) +{ + dout(20) << __func__ << " " << oid << " " << snaps << dendl; + ceph_assert(!snaps.empty()); + ceph_assert(check(oid)); + { + object_snaps out; + int r = get_snaps(oid, &out); + if (r != -ENOENT) { + derr << __func__ << " found existing snaps mapped on " << oid + << ", removing" << dendl; + ceph_assert(!cct->_conf->osd_debug_verify_snaps); + remove_oid(oid, t); + } + } + + object_snaps _snaps(oid, snaps); + set_snaps(oid, _snaps, t); + + map to_add; + for (set::iterator i = snaps.begin(); + i != snaps.end(); + ++i) { + to_add.insert(to_raw(make_pair(*i, oid))); + } + if (g_conf()->subsys.should_gather()) { + for (auto& i : to_add) { + dout(20) << __func__ << " set " << i.first << dendl; + } + } + backend.set_keys(to_add, t); +} + +int SnapMapper::get_next_objects_to_trim( + snapid_t snap, + unsigned max, + vector *out) +{ + ceph_assert(out); + ceph_assert(out->empty()); + + // if max would be 0, we return ENOENT and the caller would mistakenly + // trim the snaptrim queue + ceph_assert(max > 0); + int r = 0; + for (set::iterator i = prefixes.begin(); + i != prefixes.end() && out->size() < max && r == 0; + ++i) { + string prefix(get_prefix(pool, snap) + *i); + string pos = prefix; + while (out->size() < max) { + pair next; + r = backend.get_next(pos, &next); + dout(20) << __func__ << " get_next(" << pos << ") returns " << r + << " " << next << dendl; + if (r != 0) { + break; // Done + } + + if (next.first.substr(0, prefix.size()) != + prefix) { + break; // Done with this prefix + } + + ceph_assert(is_mapping(next.first)); + + dout(20) << __func__ << " " << next.first << dendl; + pair next_decoded(from_raw(next)); + ceph_assert(next_decoded.first == snap); + ceph_assert(check(next_decoded.second)); + + out->push_back(next_decoded.second); + pos = next.first; + } + } + if (out->size() == 0) { + return -ENOENT; + } else { + return 0; + } +} + + +int SnapMapper::remove_oid( + const hobject_t &oid, + MapCacher::Transaction *t) +{ + dout(20) << __func__ << " " << oid << dendl; + ceph_assert(check(oid)); + return _remove_oid(oid, t); +} + +int SnapMapper::_remove_oid( + const hobject_t 
&oid, + MapCacher::Transaction *t) +{ + dout(20) << __func__ << " " << oid << dendl; + object_snaps out; + int r = get_snaps(oid, &out); + if (r < 0) + return r; + + clear_snaps(oid, t); + + set to_remove; + for (set::iterator i = out.snaps.begin(); + i != out.snaps.end(); + ++i) { + to_remove.insert(to_raw_key(make_pair(*i, oid))); + } + if (g_conf()->subsys.should_gather()) { + for (auto& i : to_remove) { + dout(20) << __func__ << " rm " << i << dendl; + } + } + backend.remove_keys(to_remove, t); + return 0; +} + +int SnapMapper::get_snaps( + const hobject_t &oid, + std::set *snaps) +{ + ceph_assert(check(oid)); + object_snaps out; + int r = get_snaps(oid, &out); + if (r < 0) + return r; + if (snaps) + snaps->swap(out.snaps); + return 0; +} + + +// -- purged snaps -- + +string SnapMapper::make_purged_snap_key(int64_t pool, snapid_t last) +{ + char k[80]; + snprintf(k, sizeof(k), "%s_%llu_%016llx", PURGED_SNAP_PREFIX, + (unsigned long long)pool, (unsigned long long)last); + return k; +} + +void SnapMapper::make_purged_snap_key_value( + int64_t pool, snapid_t begin, snapid_t end, map *m) +{ + string k = make_purged_snap_key(pool, end - 1); + auto& v = (*m)[k]; + ceph::encode(pool, v); + ceph::encode(begin, v); + ceph::encode(end, v); +} + +int SnapMapper::_lookup_purged_snap( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + const ghobject_t& hoid, + int64_t pool, snapid_t snap, + snapid_t *begin, snapid_t *end) +{ + string k = make_purged_snap_key(pool, snap); + auto it = store->get_omap_iterator(ch, hoid); + it->lower_bound(k); + if (!it->valid()) { + dout(20) << __func__ << " pool " << pool << " snap " << snap + << " key '" << k << "' lower_bound not found" << dendl; + return -ENOENT; + } + if (it->key().find(PURGED_SNAP_PREFIX) != 0) { + dout(20) << __func__ << " pool " << pool << " snap " << snap + << " key '" << k << "' lower_bound got mismatched prefix '" + << it->key() << "'" << dendl; + return -ENOENT; + } + bufferlist v = it->value(); + auto p = v.cbegin(); + int64_t gotpool; + decode(gotpool, p); + decode(*begin, p); + decode(*end, p); + if (snap < *begin || snap >= *end) { + dout(20) << __func__ << " pool " << pool << " snap " << snap + << " found [" << *begin << "," << *end << "), no overlap" << dendl; + return -ENOENT; + } + return 0; +} + +void SnapMapper::record_purged_snaps( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t hoid, + ObjectStore::Transaction *t, + map> purged_snaps) +{ + dout(10) << __func__ << " purged_snaps " << purged_snaps << dendl; + map m; + set rm; + for (auto& [epoch, bypool] : purged_snaps) { + // index by (pool, snap) + for (auto& [pool, snaps] : bypool) { + for (auto i = snaps.begin(); + i != snaps.end(); + ++i) { + snapid_t begin = i.get_start(); + snapid_t end = i.get_end(); + snapid_t before_begin, before_end; + snapid_t after_begin, after_end; + int b = _lookup_purged_snap(cct, store, ch, hoid, + pool, begin - 1, &before_begin, &before_end); + int a = _lookup_purged_snap(cct, store, ch, hoid, + pool, end, &after_begin, &after_end); + if (!b && !a) { + dout(10) << __func__ + << " [" << begin << "," << end << ") - joins [" + << before_begin << "," << before_end << ") and [" + << after_begin << "," << after_end << ")" << dendl; + // erase only the begin record; we'll overwrite the end one + rm.insert(make_purged_snap_key(pool, before_end - 1)); + make_purged_snap_key_value(pool, before_begin, after_end, &m); + } else if (!b) { + dout(10) << __func__ + << " [" << begin << "," 
<< end << ") - join with earlier [" + << before_begin << "," << before_end << ")" << dendl; + rm.insert(make_purged_snap_key(pool, before_end - 1)); + make_purged_snap_key_value(pool, before_begin, end, &m); + } else if (!a) { + dout(10) << __func__ + << " [" << begin << "," << end << ") - join with later [" + << after_begin << "," << after_end << ")" << dendl; + // overwrite after record + make_purged_snap_key_value(pool, begin, after_end, &m); + } else { + make_purged_snap_key_value(pool, begin, end, &m); + } + } + } + } + t->omap_rmkeys(ch->cid, hoid, rm); + t->omap_setkeys(ch->cid, hoid, m); + dout(10) << __func__ << " rm " << rm.size() << " keys, set " << m.size() + << " keys" << dendl; +} + + +bool SnapMapper::Scrubber::_parse_p() +{ + if (!psit->valid()) { + pool = -1; + return false; + } + if (psit->key().find(PURGED_SNAP_PREFIX) != 0) { + pool = -1; + return false; + } + bufferlist v = psit->value(); + auto p = v.cbegin(); + ceph::decode(pool, p); + ceph::decode(begin, p); + ceph::decode(end, p); + dout(20) << __func__ << " purged_snaps pool " << pool + << " [" << begin << "," << end << ")" << dendl; + psit->next(); + return true; +} + +bool SnapMapper::Scrubber::_parse_m() +{ + if (!mapit->valid()) { + return false; + } + if (mapit->key().find(MAPPING_PREFIX) != 0) { + return false; + } + auto v = mapit->value(); + auto p = v.cbegin(); + mapping.decode(p); + + { + unsigned long long p, s; + long sh; + string k = mapit->key(); + int r = sscanf(k.c_str(), "SNA_%lld_%llx.%lx", &p, &s, &sh); + if (r != 1) { + shard = shard_id_t::NO_SHARD; + } else { + shard = shard_id_t(sh); + } + } + dout(20) << __func__ << " mapping pool " << mapping.hoid.pool + << " snap " << mapping.snap + << " shard " << shard + << " " << mapping.hoid << dendl; + mapit->next(); + return true; +} + +void SnapMapper::Scrubber::run() +{ + dout(10) << __func__ << dendl; + + psit = store->get_omap_iterator(ch, purged_snaps_hoid); + psit->upper_bound(PURGED_SNAP_PREFIX); + _parse_p(); + + mapit = store->get_omap_iterator(ch, mapping_hoid); + mapit->upper_bound(MAPPING_PREFIX); + + while (_parse_m()) { + // advance to next purged_snaps range? 
+ while (pool >= 0 && + (mapping.hoid.pool > pool || + (mapping.hoid.pool == pool && mapping.snap >= end))) { + _parse_p(); + } + if (pool < 0) { + dout(10) << __func__ << " passed final purged_snaps interval, rest ok" + << dendl; + break; + } + if (mapping.hoid.pool < pool || + mapping.snap < begin) { + // ok + dout(20) << __func__ << " ok " << mapping.hoid + << " snap " << mapping.snap + << " precedes pool " << pool + << " purged_snaps [" << begin << "," << end << ")" << dendl; + } else { + assert(mapping.snap >= begin && + mapping.snap < end && + mapping.hoid.pool == pool); + // invalid + dout(10) << __func__ << " stray " << mapping.hoid + << " snap " << mapping.snap + << " in pool " << pool + << " shard " << shard + << " purged_snaps [" << begin << "," << end << ")" << dendl; + stray.emplace_back(std::tuple( + pool, mapping.snap, mapping.hoid.get_hash(), + shard + )); + } + } + + dout(10) << __func__ << " end, found " << stray.size() << " stray" << dendl; + psit = ObjectMap::ObjectMapIterator(); + mapit = ObjectMap::ObjectMapIterator(); +} + + +// ------------------------------------- +// legacy conversion/support + +string SnapMapper::get_legacy_prefix(snapid_t snap) +{ + char buf[100]; + int len = snprintf( + buf, sizeof(buf), + "%.*X_", + (int)(sizeof(snap)*2), static_cast(snap)); + return LEGACY_MAPPING_PREFIX + string(buf, len); +} + +string SnapMapper::to_legacy_raw_key( + const pair &in) +{ + return get_legacy_prefix(in.first) + shard_prefix + in.second.to_str(); +} + +bool SnapMapper::is_legacy_mapping(const string &to_test) +{ + return to_test.substr(0, LEGACY_MAPPING_PREFIX.size()) == + LEGACY_MAPPING_PREFIX; +} + +/* Octopus modified the SnapMapper key format from + * + * __ + * + * to + * + * ___ + * + * We can't reconstruct the new key format just from the value since the + * Mapping object contains an hobject rather than a ghobject. Instead, + * we exploit the fact that the new format is identical starting at . + * + * Note that the original version of this conversion introduced in 94ebe0ea + * had a crucial bug which essentially destroyed legacy keys by mapping + * them to + * + * __ + * + * without the object-unique suffix. 
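+ *
+ * (For reference, the key shapes involved -- as built by to_legacy_raw_key()
+ * and to_raw_key() above -- are
+ *     legacy:  MAP_<16-hex snapid>_<shard prefix><hobject_t::to_str()>
+ *     current: SNA_<poolid>_<16-hex snapid>_<shard prefix><hobject_t::to_str()>
+ * so convert_legacy_key() below keeps everything after "MAP_" and prepends
+ * the new prefix plus the pool id decoded from the value.)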
+ * See https://tracker.ceph.com/issues/56147 + */ +std::string SnapMapper::convert_legacy_key( + const std::string& old_key, + const bufferlist& value) +{ + auto old = from_raw(make_pair(old_key, value)); + std::string object_suffix = old_key.substr( + SnapMapper::LEGACY_MAPPING_PREFIX.length()); + return SnapMapper::MAPPING_PREFIX + std::to_string(old.second.pool) + + "_" + object_suffix; +} + +int SnapMapper::convert_legacy( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t hoid, + unsigned max) +{ + uint64_t n = 0; + + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, hoid); + if (!iter) { + return -EIO; + } + + auto start = ceph::mono_clock::now(); + + iter->upper_bound(SnapMapper::LEGACY_MAPPING_PREFIX); + map to_set; + while (iter->valid()) { + bool valid = SnapMapper::is_legacy_mapping(iter->key()); + if (valid) { + to_set.emplace( + convert_legacy_key(iter->key(), iter->value()), + iter->value()); + ++n; + iter->next(); + } + if (!valid || !iter->valid() || to_set.size() >= max) { + ObjectStore::Transaction t; + t.omap_setkeys(ch->cid, hoid, to_set); + int r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + to_set.clear(); + if (!valid) { + break; + } + dout(10) << __func__ << " converted " << n << " keys" << dendl; + } + } + + auto end = ceph::mono_clock::now(); + + dout(1) << __func__ << " converted " << n << " keys in " + << timespan_str(end - start) << dendl; + + // remove the old keys + { + ObjectStore::Transaction t; + string end = SnapMapper::LEGACY_MAPPING_PREFIX; + ++end[end.size()-1]; // turn _ to whatever comes after _ + t.omap_rmkeyrange(ch->cid, hoid, + SnapMapper::LEGACY_MAPPING_PREFIX, + end); + int r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + return 0; +} diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h new file mode 100644 index 000000000..90b0c7c8d --- /dev/null +++ b/src/osd/SnapMapper.h @@ -0,0 +1,338 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef SNAPMAPPER_H +#define SNAPMAPPER_H + +#include +#include +#include +#include + +#include "common/map_cacher.hpp" +#include "common/hobject.h" +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/object.h" +#include "os/ObjectStore.h" +#include "osd/OSDMap.h" + +class OSDriver : public MapCacher::StoreDriver { + ObjectStore *os; + ObjectStore::CollectionHandle ch; + ghobject_t hoid; + +public: + class OSTransaction : public MapCacher::Transaction { + friend class OSDriver; + coll_t cid; + ghobject_t hoid; + ObjectStore::Transaction *t; + OSTransaction( + const coll_t &cid, + const ghobject_t &hoid, + ObjectStore::Transaction *t) + : cid(cid), hoid(hoid), t(t) {} + public: + void set_keys( + const std::map &to_set) override { + t->omap_setkeys(cid, hoid, to_set); + } + void remove_keys( + const std::set &to_remove) override { + t->omap_rmkeys(cid, hoid, to_remove); + } + void add_callback( + Context *c) override { + t->register_on_applied(c); + } + }; + + OSTransaction get_transaction( + ObjectStore::Transaction *t) { + return OSTransaction(ch->cid, hoid, t); + } + + OSDriver(ObjectStore *os, const coll_t& cid, const ghobject_t &hoid) : + os(os), + hoid(hoid) { + ch = os->open_collection(cid); + } + int get_keys( + const std::set &keys, + std::map *out) override; + int get_next( + const std::string &key, + std::pair *next) override; +}; + +/** + * SnapMapper + * + * Manages two mappings: + * 1) hobject_t -> {snapid} + * 2) snapid -> {hobject_t} + * + * We accomplish this using two sets of keys: + * 1) OBJECT_PREFIX + obj.str() -> encoding of object_snaps + * 2) MAPPING_PREFIX + poolid + snapid_t + obj.str() -> encoding of std::pair + * + * The on disk strings and encodings are implemented in to_raw, to_raw_key, + * from_raw, to_object_key. + * + * The object -> {snapid} mapping is primarily included so that the + * SnapMapper state can be verified against the external PG state during + * scrub etc. + * + * The 2) mapping is arranged such that all objects in a particular + * snap will sort together, and so that all objects in a pg for a + * particular snap will group under up to 8 prefixes. 
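+ *
+ * (The value under a 2) key decodes to a (snapid_t, hobject_t) pair via the
+ * Mapping helper below; the value under a 1) key is an encoded object_snaps,
+ * i.e. the object together with its std::set of snapids.)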
+ */ +class SnapMapper { + friend class MapperVerifier; +public: + CephContext* cct; + struct object_snaps { + hobject_t oid; + std::set snaps; + object_snaps(hobject_t oid, const std::set &snaps) + : oid(oid), snaps(snaps) {} + object_snaps() {} + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bp); + }; + + struct Mapping { + snapid_t snap; + hobject_t hoid; + explicit Mapping(const std::pair &in) + : snap(in.first), hoid(in.second) {} + Mapping() : snap(0) {} + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(snap, bl); + encode(hoid, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(1, bl); + decode(snap, bl); + decode(hoid, bl); + DECODE_FINISH(bl); + } + }; + + static const std::string LEGACY_MAPPING_PREFIX; + static const std::string MAPPING_PREFIX; + static const std::string OBJECT_PREFIX; + static const char *PURGED_SNAP_EPOCH_PREFIX; + static const char *PURGED_SNAP_PREFIX; + + struct Scrubber { + CephContext *cct; + ObjectStore *store; + ObjectStore::CollectionHandle ch; + ghobject_t mapping_hoid; + ghobject_t purged_snaps_hoid; + + ObjectMap::ObjectMapIterator psit; + int64_t pool; + snapid_t begin, end; + + bool _parse_p(); ///< advance the purged_snaps pointer + + ObjectMap::ObjectMapIterator mapit; + Mapping mapping; + shard_id_t shard; + + bool _parse_m(); ///< advance the (object) mapper pointer + + std::vector> stray; + + Scrubber( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t mapping_hoid, + ghobject_t purged_snaps_hoid) + : cct(cct), + store(store), + ch(ch), + mapping_hoid(mapping_hoid), + purged_snaps_hoid(purged_snaps_hoid) {} + + void run(); + }; + + static std::string convert_legacy_key( + const std::string& old_key, + const bufferlist& value); + + static int convert_legacy( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t hoid, + unsigned max); + + static void record_purged_snaps( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t hoid, + ObjectStore::Transaction *t, + std::map> purged_snaps); + static void scrub_purged_snaps( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t mapper_hoid, + ghobject_t purged_snaps_hoid); + +private: + static int _lookup_purged_snap( + CephContext *cct, + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + const ghobject_t& hoid, + int64_t pool, snapid_t snap, + snapid_t *begin, snapid_t *end); + static void make_purged_snap_key_value( + int64_t pool, snapid_t begin, + snapid_t end, std::map *m); + static std::string make_purged_snap_key(int64_t pool, snapid_t last); + + + MapCacher::MapCacher backend; + + static std::string get_legacy_prefix(snapid_t snap); + std::string to_legacy_raw_key( + const std::pair &to_map); + static bool is_legacy_mapping(const std::string &to_test); + + static std::string get_prefix(int64_t pool, snapid_t snap); + std::string to_raw_key( + const std::pair &to_map); + + std::pair to_raw( + const std::pair &to_map); + + static bool is_mapping(const std::string &to_test); + + static std::pair from_raw( + const std::pair &image); + + std::string to_object_key(const hobject_t &hoid); + + int get_snaps(const hobject_t &oid, object_snaps *out); + + void set_snaps( + const hobject_t &oid, + const object_snaps &out, + MapCacher::Transaction *t); + + void clear_snaps( + const hobject_t &oid, + 
MapCacher::Transaction *t); + + // True if hoid belongs in this mapping based on mask_bits and match + bool check(const hobject_t &hoid) const; + + int _remove_oid( + const hobject_t &oid, ///< [in] oid to remove + MapCacher::Transaction *t ///< [out] transaction + ); + +public: + static std::string make_shard_prefix(shard_id_t shard) { + if (shard == shard_id_t::NO_SHARD) + return std::string(); + char buf[20]; + int r = snprintf(buf, sizeof(buf), ".%x", (int)shard); + ceph_assert(r < (int)sizeof(buf)); + return std::string(buf, r) + '_'; + } + uint32_t mask_bits; + const uint32_t match; + std::string last_key_checked; + const int64_t pool; + const shard_id_t shard; + const std::string shard_prefix; + SnapMapper( + CephContext* cct, + MapCacher::StoreDriver *driver, + uint32_t match, ///< [in] pgid + uint32_t bits, ///< [in] current split bits + int64_t pool, ///< [in] pool + shard_id_t shard ///< [in] shard + ) + : cct(cct), backend(driver), mask_bits(bits), match(match), pool(pool), + shard(shard), shard_prefix(make_shard_prefix(shard)) { + update_bits(mask_bits); + } + + std::set prefixes; + /// Update bits in case of pg split or merge + void update_bits( + uint32_t new_bits ///< [in] new split bits + ) { + mask_bits = new_bits; + std::set _prefixes = hobject_t::get_prefixes( + mask_bits, + match, + pool); + prefixes.clear(); + for (auto i = _prefixes.begin(); i != _prefixes.end(); ++i) { + prefixes.insert(shard_prefix + *i); + } + } + + /// Update snaps for oid, empty new_snaps removes the mapping + int update_snaps( + const hobject_t &oid, ///< [in] oid to update + const std::set &new_snaps, ///< [in] new snap std::set + const std::set *old_snaps, ///< [in] old snaps (for debugging) + MapCacher::Transaction *t ///< [out] transaction + ); ///@ return error, 0 on success + + /// Add mapping for oid, must not already be mapped + void add_oid( + const hobject_t &oid, ///< [in] oid to add + const std::set& new_snaps, ///< [in] snaps + MapCacher::Transaction *t ///< [out] transaction + ); + + /// Returns first object with snap as a snap + int get_next_objects_to_trim( + snapid_t snap, ///< [in] snap to check + unsigned max, ///< [in] max to get + std::vector *out ///< [out] next objects to trim (must be empty) + ); ///< @return error, -ENOENT if no more objects + + /// Remove mapping for oid + int remove_oid( + const hobject_t &oid, ///< [in] oid to remove + MapCacher::Transaction *t ///< [out] transaction + ); ///< @return error, -ENOENT if the object is not mapped + + /// Get snaps for oid + int get_snaps( + const hobject_t &oid, ///< [in] oid to get snaps for + std::set *snaps ///< [out] snaps + ); ///< @return error, -ENOENT if oid is not recorded +}; +WRITE_CLASS_ENCODER(SnapMapper::object_snaps) +WRITE_CLASS_ENCODER(SnapMapper::Mapping) + +#endif diff --git a/src/osd/TierAgentState.h b/src/osd/TierAgentState.h new file mode 100644 index 000000000..28e1598a9 --- /dev/null +++ b/src/osd/TierAgentState.h @@ -0,0 +1,128 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OSD_TIERAGENT_H +#define CEPH_OSD_TIERAGENT_H + +#include +#include +#include +#include + +#include "common/Formatter.h" +#include "common/histogram.h" +#include "common/hobject.h" + +#include "osd/HitSet.h" + +struct TierAgentState { + /// current position iterating across pool + hobject_t position; + /// Count of agent_work since "start" position of object hash space + int started; + hobject_t start; + bool delaying; + + /// histogram of ages we've encountered + pow2_hist_t temp_hist; + int hist_age; + + /// past HitSet(s) (not current) + std::map hit_set_map; + + /// a few recent things we've seen that are clean + std::list recent_clean; + + enum flush_mode_t { + FLUSH_MODE_IDLE, // nothing to flush + FLUSH_MODE_LOW, // flush dirty objects with a low speed + FLUSH_MODE_HIGH, //flush dirty objects with a high speed + } flush_mode; ///< current flush behavior + static const char *get_flush_mode_name(flush_mode_t m) { + switch (m) { + case FLUSH_MODE_IDLE: return "idle"; + case FLUSH_MODE_LOW: return "low"; + case FLUSH_MODE_HIGH: return "high"; + default: ceph_abort_msg("bad flush mode"); + } + } + const char *get_flush_mode_name() const { + return get_flush_mode_name(flush_mode); + } + + enum evict_mode_t { + EVICT_MODE_IDLE, // no need to evict anything + EVICT_MODE_SOME, // evict some things as we are near the target + EVICT_MODE_FULL, // evict anything + } evict_mode; ///< current evict behavior + static const char *get_evict_mode_name(evict_mode_t m) { + switch (m) { + case EVICT_MODE_IDLE: return "idle"; + case EVICT_MODE_SOME: return "some"; + case EVICT_MODE_FULL: return "full"; + default: ceph_abort_msg("bad evict mode"); + } + } + const char *get_evict_mode_name() const { + return get_evict_mode_name(evict_mode); + } + + /// approximate ratio of objects (assuming they are uniformly + /// distributed) that i should aim to evict. 
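+ /// Stays at 0 while evict_mode is EVICT_MODE_IDLE and is recomputed by
+ /// the tier agent each time it re-evaluates the pool's fullness.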
+ unsigned evict_effort; + + TierAgentState() + : started(0), + delaying(false), + hist_age(0), + flush_mode(FLUSH_MODE_IDLE), + evict_mode(EVICT_MODE_IDLE), + evict_effort(0) + {} + + /// false if we have any work to do + bool is_idle() const { + return + delaying || + (flush_mode == FLUSH_MODE_IDLE && + evict_mode == EVICT_MODE_IDLE); + } + + /// add archived HitSet + void add_hit_set(time_t start, HitSetRef hs) { + hit_set_map.insert(std::make_pair(start, hs)); + } + + /// remove old/trimmed HitSet + void remove_oldest_hit_set() { + if (!hit_set_map.empty()) + hit_set_map.erase(hit_set_map.begin()); + } + + /// discard all open hit sets + void discard_hit_sets() { + hit_set_map.clear(); + } + + void dump(ceph::Formatter *f) const { + f->dump_string("flush_mode", get_flush_mode_name()); + f->dump_string("evict_mode", get_evict_mode_name()); + f->dump_unsigned("evict_effort", evict_effort); + f->dump_stream("position") << position; + f->open_object_section("temp_hist"); + temp_hist.dump(f); + f->close_section(); + } +}; + +#endif diff --git a/src/osd/Watch.cc b/src/osd/Watch.cc new file mode 100644 index 000000000..78aae6e2d --- /dev/null +++ b/src/osd/Watch.cc @@ -0,0 +1,550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +#include "PG.h" + +#include "include/types.h" +#include "messages/MWatchNotify.h" + +#include + +#include "OSD.h" +#include "PrimaryLogPG.h" +#include "Watch.h" +#include "Session.h" + +#include "common/config.h" + +#define dout_context osd->cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +using std::list; +using std::make_pair; +using std::pair; +using std::ostream; +using std::set; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; + +struct CancelableContext : public Context { + virtual void cancel() = 0; +}; + + +static ostream& _prefix( + std::ostream* _dout, + Notify *notify) { + return notify->gen_dbg_prefix(*_dout); +} + +Notify::Notify( + ConnectionRef client, + uint64_t client_gid, + bufferlist &payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd) + : client(client), + client_gid(client_gid), + complete(false), + discarded(false), + timed_out(false), + payload(payload), + timeout(timeout), + cookie(cookie), + notify_id(notify_id), + version(version), + osd(osd), + cb(nullptr) {} + +NotifyRef Notify::makeNotifyRef( + ConnectionRef client, + uint64_t client_gid, + bufferlist &payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd) { + NotifyRef ret( + new Notify( + client, client_gid, + payload, timeout, + cookie, notify_id, + version, osd)); + ret->set_self(ret); + return ret; +} + +class NotifyTimeoutCB : public CancelableContext { + NotifyRef notif; + bool canceled; // protected by notif lock +public: + explicit NotifyTimeoutCB(NotifyRef notif) : notif(notif), canceled(false) {} + void finish(int) override { + notif->osd->watch_lock.unlock(); + notif->lock.lock(); + if (!canceled) + notif->do_timeout(); // drops lock + else + notif->lock.unlock(); + notif->osd->watch_lock.lock(); + } + void cancel() override { + ceph_assert(ceph_mutex_is_locked(notif->lock)); + canceled = true; + } +}; + +void Notify::do_timeout() +{ + ceph_assert(ceph_mutex_is_locked(lock)); + dout(10) << "timeout" << dendl; + cb = nullptr; + if (is_discarded()) { + lock.unlock(); + return; + } + + timed_out = true; // we will send the client an error code + maybe_complete_notify(); 
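+
+ // At this point maybe_complete_notify() has (because timed_out is set)
+ // already sent the client a CEPH_WATCH_EVENT_NOTIFY_COMPLETE carrying
+ // -ETIMEDOUT plus the list of watchers that never acked.  What is left
+ // is to detach this notify from those watchers; each Watch is protected
+ // by its PG's lock, so we drop our own lock first and take the PG lock
+ // per watcher below.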
+ ceph_assert(complete); + set _watchers; + _watchers.swap(watchers); + lock.unlock(); + + for (auto i = _watchers.begin(); i != _watchers.end(); ++i) { + boost::intrusive_ptr pg((*i)->get_pg()); + pg->lock(); + if (!(*i)->is_discarded()) { + (*i)->cancel_notify(self.lock()); + } + pg->unlock(); + } +} + +void Notify::register_cb() +{ + ceph_assert(ceph_mutex_is_locked(lock)); + { + std::lock_guard l{osd->watch_lock}; + cb = new NotifyTimeoutCB(self.lock()); + if (!osd->watch_timer.add_event_after(timeout, cb)) { + cb = nullptr; + } + } +} + +void Notify::unregister_cb() +{ + ceph_assert(ceph_mutex_is_locked(lock)); + if (!cb) + return; + cb->cancel(); + { + std::lock_guard l{osd->watch_lock}; + osd->watch_timer.cancel_event(cb); + cb = nullptr; + } +} + +void Notify::start_watcher(WatchRef watch) +{ + std::lock_guard l(lock); + dout(10) << "start_watcher" << dendl; + watchers.insert(watch); +} + +void Notify::complete_watcher(WatchRef watch, bufferlist& reply_bl) +{ + std::lock_guard l(lock); + dout(10) << "complete_watcher" << dendl; + if (is_discarded()) + return; + ceph_assert(watchers.count(watch)); + watchers.erase(watch); + notify_replies.insert(make_pair(make_pair(watch->get_watcher_gid(), + watch->get_cookie()), + reply_bl)); + maybe_complete_notify(); +} + +void Notify::complete_watcher_remove(WatchRef watch) +{ + std::lock_guard l(lock); + dout(10) << __func__ << dendl; + if (is_discarded()) + return; + ceph_assert(watchers.count(watch)); + watchers.erase(watch); + maybe_complete_notify(); +} + +void Notify::maybe_complete_notify() +{ + dout(10) << "maybe_complete_notify -- " + << watchers.size() + << " in progress watchers " << dendl; + if (watchers.empty() || timed_out) { + // prepare reply + bufferlist bl; + encode(notify_replies, bl); + list > missed; + for (auto p = watchers.begin(); p != watchers.end(); ++p) { + missed.push_back(make_pair((*p)->get_watcher_gid(), + (*p)->get_cookie())); + } + encode(missed, bl); + + bufferlist empty; + auto* const reply = new MWatchNotify( + cookie, + version, + notify_id, + CEPH_WATCH_EVENT_NOTIFY_COMPLETE, + empty, + client_gid); + reply->set_data(bl); + if (timed_out) + reply->return_code = -ETIMEDOUT; + client->send_message(reply); + unregister_cb(); + + complete = true; + } +} + +void Notify::discard() +{ + std::lock_guard l(lock); + discarded = true; + unregister_cb(); + watchers.clear(); +} + +void Notify::init() +{ + std::lock_guard l(lock); + register_cb(); + maybe_complete_notify(); +} + +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, watch.get()) + +static ostream& _prefix( + std::ostream* _dout, + Watch *watch) { + return watch->gen_dbg_prefix(*_dout); +} + +class HandleWatchTimeout : public CancelableContext { + WatchRef watch; +public: + bool canceled; // protected by watch->pg->lock + explicit HandleWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {} + void cancel() override { + canceled = true; + } + void finish(int) override { ceph_abort(); /* not used */ } + void complete(int) override { + OSDService *osd(watch->osd); + ldout(osd->cct, 10) << "HandleWatchTimeout" << dendl; + boost::intrusive_ptr pg(watch->pg); + osd->watch_lock.unlock(); + pg->lock(); + watch->cb = nullptr; + if (!watch->is_discarded() && !canceled) + watch->pg->handle_watch_timeout(watch); + delete this; // ~Watch requires pg lock! 
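+
+ // Lock-ordering note: the timer invokes complete() with watch_lock
+ // held, but handle_watch_timeout() needs the PG lock, so watch_lock is
+ // dropped above before pg->lock() and re-taken below before returning
+ // to the timer.  The intrusive_ptr taken above keeps the PG alive
+ // across that window, and `delete this` happens while the PG lock is
+ // still held because ~Watch requires it.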
+ pg->unlock(); + osd->watch_lock.lock(); + } +}; + +class HandleDelayedWatchTimeout : public CancelableContext { + WatchRef watch; +public: + bool canceled; + explicit HandleDelayedWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {} + void cancel() override { + canceled = true; + } + void finish(int) override { + OSDService *osd(watch->osd); + dout(10) << "HandleWatchTimeoutDelayed" << dendl; + ceph_assert(watch->pg->is_locked()); + watch->cb = nullptr; + if (!watch->is_discarded() && !canceled) + watch->pg->handle_watch_timeout(watch); + } +}; + +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +std::ostream& Watch::gen_dbg_prefix(std::ostream& out) { + return pg->gen_prefix(out) << " -- Watch(" + << make_pair(cookie, entity) << ") "; +} + +Watch::Watch( + PrimaryLogPG *pg, + OSDService *osd, + ObjectContextRef obc, + uint32_t timeout, + uint64_t cookie, + entity_name_t entity, + const entity_addr_t &addr) + : cb(NULL), + osd(osd), + pg(pg), + obc(obc), + timeout(timeout), + cookie(cookie), + addr(addr), + will_ping(false), + entity(entity), + discarded(false) { + dout(10) << "Watch()" << dendl; +} + +Watch::~Watch() { + dout(10) << "~Watch" << dendl; + // users must have called remove() or discard() prior to this point + ceph_assert(!obc); + ceph_assert(!is_connected()); +} + +Context *Watch::get_delayed_cb() +{ + ceph_assert(!cb); + cb = new HandleDelayedWatchTimeout(self.lock()); + return cb; +} + +void Watch::register_cb() +{ + std::lock_guard l(osd->watch_lock); + if (cb) { + dout(15) << "re-registering callback, timeout: " << timeout << dendl; + cb->cancel(); + osd->watch_timer.cancel_event(cb); + } else { + dout(15) << "registering callback, timeout: " << timeout << dendl; + } + cb = new HandleWatchTimeout(self.lock()); + if (!osd->watch_timer.add_event_after(timeout, cb)) { + cb = nullptr; + } +} + +void Watch::unregister_cb() +{ + dout(15) << "unregister_cb" << dendl; + if (!cb) + return; + dout(15) << "actually registered, cancelling" << dendl; + cb->cancel(); + { + std::lock_guard l(osd->watch_lock); + osd->watch_timer.cancel_event(cb); // harmless if not registered with timer + } + cb = nullptr; +} + +void Watch::got_ping(utime_t t) +{ + last_ping = t; + if (is_connected()) { + register_cb(); + } +} + +void Watch::connect(ConnectionRef con, bool _will_ping) +{ + if (is_connected(con.get())) { + dout(10) << __func__ << " con " << con << " - already connected" << dendl; + return; + } + dout(10) << __func__ << " con " << con << dendl; + conn = con; + will_ping = _will_ping; + auto priv = con->get_priv(); + if (priv) { + auto sessionref = static_cast(priv.get()); + sessionref->wstate.addWatch(self.lock()); + priv.reset(); + for (auto i = in_progress_notifies.begin(); + i != in_progress_notifies.end(); + ++i) { + send_notify(i->second); + } + } + if (will_ping) { + last_ping = ceph_clock_now(); + register_cb(); + } else { + unregister_cb(); + } +} + +void Watch::disconnect() +{ + dout(10) << "disconnect (con was " << conn << ")" << dendl; + conn = ConnectionRef(); + if (!will_ping) + register_cb(); +} + +void Watch::discard() +{ + dout(10) << "discard" << dendl; + for (auto i = in_progress_notifies.begin(); + i != in_progress_notifies.end(); + ++i) { + i->second->discard(); + } + discard_state(); +} + +void Watch::discard_state() +{ + ceph_assert(pg->is_locked()); + ceph_assert(!discarded); + ceph_assert(obc); + in_progress_notifies.clear(); + unregister_cb(); + discarded = true; + if (is_connected()) { + if (auto priv 
= conn->get_priv(); priv) { + auto session = static_cast(priv.get()); + session->wstate.removeWatch(self.lock()); + } + conn = ConnectionRef(); + } + obc = ObjectContextRef(); +} + +bool Watch::is_discarded() const +{ + return discarded; +} + +void Watch::remove(bool send_disconnect) +{ + dout(10) << "remove" << dendl; + if (send_disconnect && is_connected()) { + bufferlist empty; + MWatchNotify *reply(new MWatchNotify(cookie, 0, 0, + CEPH_WATCH_EVENT_DISCONNECT, empty)); + conn->send_message(reply); + } + for (auto i = in_progress_notifies.begin(); + i != in_progress_notifies.end(); + ++i) { + i->second->complete_watcher_remove(self.lock()); + } + discard_state(); +} + +void Watch::start_notify(NotifyRef notif) +{ + ceph_assert(in_progress_notifies.find(notif->notify_id) == + in_progress_notifies.end()); + if (will_ping) { + utime_t cutoff = ceph_clock_now(); + cutoff.sec_ref() -= timeout; + if (last_ping < cutoff) { + dout(10) << __func__ << " " << notif->notify_id + << " last_ping " << last_ping << " < cutoff " << cutoff + << ", disconnecting" << dendl; + disconnect(); + return; + } + } + dout(10) << "start_notify " << notif->notify_id << dendl; + in_progress_notifies[notif->notify_id] = notif; + notif->start_watcher(self.lock()); + if (is_connected()) + send_notify(notif); +} + +void Watch::cancel_notify(NotifyRef notif) +{ + dout(10) << "cancel_notify " << notif->notify_id << dendl; + in_progress_notifies.erase(notif->notify_id); +} + +void Watch::send_notify(NotifyRef notif) +{ + dout(10) << "send_notify" << dendl; + MWatchNotify *notify_msg = new MWatchNotify( + cookie, + notif->version, + notif->notify_id, + CEPH_WATCH_EVENT_NOTIFY, + notif->payload, + notif->client_gid); + conn->send_message(notify_msg); +} + +void Watch::notify_ack(uint64_t notify_id, bufferlist& reply_bl) +{ + dout(10) << "notify_ack" << dendl; + auto i = in_progress_notifies.find(notify_id); + if (i != in_progress_notifies.end()) { + i->second->complete_watcher(self.lock(), reply_bl); + in_progress_notifies.erase(i); + } +} + +WatchRef Watch::makeWatchRef( + PrimaryLogPG *pg, OSDService *osd, + ObjectContextRef obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t& addr) +{ + WatchRef ret(new Watch(pg, osd, obc, timeout, cookie, entity, addr)); + ret->set_self(ret); + return ret; +} + +void WatchConState::addWatch(WatchRef watch) +{ + std::lock_guard l(lock); + watches.insert(watch); +} + +void WatchConState::removeWatch(WatchRef watch) +{ + std::lock_guard l(lock); + watches.erase(watch); +} + +void WatchConState::reset(Connection *con) +{ + set _watches; + { + std::lock_guard l(lock); + _watches.swap(watches); + } + for (set::iterator i = _watches.begin(); + i != _watches.end(); + ++i) { + boost::intrusive_ptr pg((*i)->get_pg()); + pg->lock(); + if (!(*i)->is_discarded()) { + if ((*i)->is_connected(con)) { + (*i)->disconnect(); + } else { + lgeneric_derr(cct) << __func__ << " not still connected to " << (*i) << dendl; + } + } + pg->unlock(); + } +} diff --git a/src/osd/Watch.h b/src/osd/Watch.h new file mode 100644 index 000000000..8d6d93a7d --- /dev/null +++ b/src/osd/Watch.h @@ -0,0 +1,291 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ +#ifndef CEPH_WATCH_H +#define CEPH_WATCH_H + +#include +#include "msg/Connection.h" +#include "include/Context.h" + +enum WatcherState { + WATCHER_PENDING, + WATCHER_NOTIFIED, +}; + +class OSDService; +class PrimaryLogPG; +void intrusive_ptr_add_ref(PrimaryLogPG *pg); +void intrusive_ptr_release(PrimaryLogPG *pg); +struct ObjectContext; +class MWatchNotify; + +class Watch; +typedef std::shared_ptr WatchRef; +typedef std::weak_ptr WWatchRef; + +class Notify; +typedef std::shared_ptr NotifyRef; +typedef std::weak_ptr WNotifyRef; + +struct CancelableContext; + +/** + * Notify tracks the progress of a particular notify + * + * References are held by Watch and the timeout callback. + */ +class Notify { + friend class NotifyTimeoutCB; + friend class Watch; + WNotifyRef self; + ConnectionRef client; + uint64_t client_gid; + bool complete; + bool discarded; + bool timed_out; ///< true if the notify timed out + std::set watchers; + + ceph::buffer::list payload; + uint32_t timeout; + uint64_t cookie; + uint64_t notify_id; + uint64_t version; + + OSDService *osd; + CancelableContext *cb; + ceph::mutex lock = ceph::make_mutex("Notify::lock"); + + /// (gid,cookie) -> reply_bl for everyone who acked the notify + std::multimap, ceph::buffer::list> notify_replies; + + /// true if this notify is being discarded + bool is_discarded() { + return discarded || complete; + } + + /// Sends notify completion if watchers.empty() or timeout + void maybe_complete_notify(); + + /// Called on Notify timeout + void do_timeout(); + + Notify( + ConnectionRef client, + uint64_t client_gid, + ceph::buffer::list& payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd); + + /// registers a timeout callback with the watch_timer + void register_cb(); + + /// removes the timeout callback, called on completion or cancellation + void unregister_cb(); +public: + + std::ostream& gen_dbg_prefix(std::ostream& out) { + return out << "Notify(" << std::make_pair(cookie, notify_id) << " " + << " watchers=" << watchers.size() + << ") "; + } + void set_self(NotifyRef _self) { + self = _self; + } + static NotifyRef makeNotifyRef( + ConnectionRef client, + uint64_t client_gid, + ceph::buffer::list &payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd); + + /// Call after creation to initialize + void init(); + + /// Called once per watcher prior to init() + void start_watcher( + WatchRef watcher ///< [in] watcher to complete + ); + + /// Called once per NotifyAck + void complete_watcher( + WatchRef watcher, ///< [in] watcher to complete + ceph::buffer::list& reply_bl ///< [in] reply buffer from the notified watcher + ); + /// Called when a watcher unregisters or times out + void complete_watcher_remove( + WatchRef watcher ///< [in] watcher to complete + ); + + /// Called when the notify is canceled due to a new peering interval + void discard(); +}; + +/** + * Watch is a mapping between a Connection and an ObjectContext + * + * References are held by ObjectContext and the timeout callback + */ +class HandleWatchTimeout; +class HandleDelayedWatchTimeout; +class Watch { + WWatchRef self; + friend class HandleWatchTimeout; + friend class HandleDelayedWatchTimeout; + ConnectionRef conn; + CancelableContext *cb; + + OSDService *osd; + boost::intrusive_ptr pg; + std::shared_ptr obc; + + std::map in_progress_notifies; + + // Could have watch_info_t here, but this file includes osd_types.h + uint32_t timeout; ///< 
timeout in seconds + uint64_t cookie; + entity_addr_t addr; + + bool will_ping; ///< is client new enough to ping the watch + utime_t last_ping; ///< last client ping + + entity_name_t entity; + bool discarded; + + Watch( + PrimaryLogPG *pg, OSDService *osd, + std::shared_ptr obc, uint32_t timeout, + uint64_t cookie, entity_name_t entity, + const entity_addr_t& addr); + + /// Registers the timeout callback with watch_timer + void register_cb(); + + /// send a Notify message when connected for notif + void send_notify(NotifyRef notif); + + /// Cleans up state on discard or remove (including Connection state, obc) + void discard_state(); +public: + /// Unregisters the timeout callback + void unregister_cb(); + + /// note receipt of a ping + void got_ping(utime_t t); + utime_t get_last_ping() const { + return last_ping; + } + + /// True if currently connected + bool is_connected() const { + return conn.get() != NULL; + } + bool is_connected(Connection *con) const { + return conn.get() == con; + } + + /// NOTE: must be called with pg lock held + ~Watch(); + + uint64_t get_watcher_gid() const { + return entity.num(); + } + + std::ostream& gen_dbg_prefix(std::ostream& out); + static WatchRef makeWatchRef( + PrimaryLogPG *pg, OSDService *osd, + std::shared_ptr obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t &addr); + void set_self(WatchRef _self) { + self = _self; + } + + /// Does not grant a ref count! + boost::intrusive_ptr get_pg() { return pg; } + + std::shared_ptr get_obc() { return obc; } + + uint64_t get_cookie() const { return cookie; } + entity_name_t get_entity() const { return entity; } + entity_addr_t get_peer_addr() const { return addr; } + uint32_t get_timeout() const { return timeout; } + + /// Generates context for use if watch timeout is delayed by scrub or recovery + Context *get_delayed_cb(); + + /// Transitions Watch to connected, unregister_cb, resends pending Notifies + void connect( + ConnectionRef con, ///< [in] Reference to new connection + bool will_ping ///< [in] client is new and will send pings + ); + + /// Transitions watch to disconnected, register_cb + void disconnect(); + + /// Called if Watch state is discarded due to new peering interval + void discard(); + + /// True if removed or discarded + bool is_discarded() const; + + /// Called on unwatch + void remove(bool send_disconnect); + + /// Adds notif as in-progress notify + void start_notify( + NotifyRef notif ///< [in] Reference to new in-progress notify + ); + + /// Removes timed out notify + void cancel_notify( + NotifyRef notif ///< [in] notify which timed out + ); + + /// Call when notify_ack received on notify_id + void notify_ack( + uint64_t notify_id, ///< [in] id of acked notify + ceph::buffer::list& reply_bl ///< [in] notify reply buffer + ); +}; + +/** + * Holds weak refs to Watch structures corresponding to a connection + * Lives in the Session object of an OSD connection + */ +class WatchConState { + ceph::mutex lock = ceph::make_mutex("WatchConState"); + std::set watches; +public: + CephContext* cct; + explicit WatchConState(CephContext* cct) : cct(cct) {} + + /// Add a watch + void addWatch( + WatchRef watch ///< [in] Ref to new watch object + ); + + /// Remove a watch + void removeWatch( + WatchRef watch ///< [in] Ref to watch object to remove + ); + + /// Called on session reset, disconnects watchers + void reset(Connection *con); +}; + +#endif diff --git a/src/osd/error_code.cc b/src/osd/error_code.cc new file mode 100644 index 000000000..97f0012fd --- 
/dev/null +++ b/src/osd/error_code.cc @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat + * Author: Adam C. Emerson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include + +#include "common/error_code.h" +#include "common/errno.h" +#include "error_code.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnon-virtual-dtor" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnon-virtual-dtor" +class osd_error_category : public ceph::converting_category { +public: + osd_error_category(){} + const char* name() const noexcept override; + const char* message(int ev, char*, std::size_t) const noexcept override; + std::string message(int ev) const override; + boost::system::error_condition default_error_condition(int ev) const noexcept + override; + bool equivalent(int ev, const boost::system::error_condition& c) const + noexcept override; + using ceph::converting_category::equivalent; + int from_code(int ev) const noexcept override; +}; +#pragma GCC diagnostic pop +#pragma clang diagnostic pop + +const char* osd_error_category::name() const noexcept { + return "osd"; +} + +const char* osd_error_category::message(int ev, char* buf, + std::size_t len) const noexcept { + if (ev == 0) + return "No error"; + + switch (static_cast(ev)) { + case osd_errc::old_snapc: + return "ORDERSNAP flag set; writer has old snapc"; + case osd_errc::blocklisted: + return "Blocklisted"; + } + + if (len) { + auto s = cpp_strerror(ev); + auto n = s.copy(buf, len - 1); + *(buf + n) = '\0'; + } + return buf; +} + +std::string osd_error_category::message(int ev) const { + if (ev == 0) + return "No error"; + + switch (static_cast(ev)) { + case osd_errc::old_snapc: + return "ORDERSNAP flag set; writer has old snapc"; + case osd_errc::blocklisted: + return "Blocklisted"; + } + + return cpp_strerror(ev); +} + +boost::system::error_condition osd_error_category::default_error_condition(int ev) const noexcept { + if (ev == static_cast(osd_errc::old_snapc) || + ev == static_cast(osd_errc::blocklisted)) + return { ev, *this }; + else + return { ev, boost::system::generic_category() }; +} + +bool osd_error_category::equivalent(int ev, const boost::system::error_condition& c) const noexcept { + switch (static_cast(ev)) { + case osd_errc::old_snapc: + return c == boost::system::errc::invalid_argument; + case osd_errc::blocklisted: + return c == boost::system::errc::operation_not_permitted; + } + return default_error_condition(ev) == c; +} + +int osd_error_category::from_code(int ev) const noexcept { + return -ev; +} + +const boost::system::error_category& osd_category() noexcept { + static const osd_error_category c; + return c; +} diff --git a/src/osd/error_code.h b/src/osd/error_code.h new file mode 100644 index 000000000..d36e79db4 --- /dev/null +++ b/src/osd/error_code.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat + * Author: Adam C. 
Emerson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include + +#include "include/rados.h" + +const boost::system::error_category& osd_category() noexcept; + +// Since the OSD mostly uses POSIX error codes plus a couple +// additions, this will be a degenerate error category for now that +// mostly forwards to POSIX. + +enum class osd_errc { + old_snapc = 85, /* ORDERSNAP flag set; writer has old snapc*/ + blocklisted = 108 /* blocklisted */ +}; + +namespace boost::system { +template<> +struct is_error_code_enum<::osd_errc> { + static const bool value = true; +}; + +template<> +struct is_error_condition_enum<::osd_errc> { + static const bool value = false; +}; +} + +// implicit conversion: +inline boost::system::error_code make_error_code(osd_errc e) noexcept { + return { static_cast(e), osd_category() }; +} + +// explicit conversion: +inline boost::system::error_condition make_error_condition(osd_errc e) noexcept { + return { static_cast(e), osd_category() }; +} diff --git a/src/osd/objclass.cc b/src/osd/objclass.cc new file mode 100644 index 000000000..274f5e063 --- /dev/null +++ b/src/osd/objclass.cc @@ -0,0 +1,702 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "common/ceph_context.h" +#include "common/ceph_releases.h" +#include "common/config.h" +#include "common/debug.h" + +#include "objclass/objclass.h" +#include "osd/PrimaryLogPG.h" + +#include "osd/ClassHandler.h" + +#include "auth/Crypto.h" +#include "common/armor.h" + +#define dout_context ClassHandler::get_instance().cct + +using std::map; +using std::set; +using std::string; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::real_time; + + +int cls_call(cls_method_context_t hctx, const char *cls, const char *method, + char *indata, int datalen, char **outdata, int *outdatalen) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + bufferlist idata; + vector nops(1); + OSDOp& op = nops[0]; + int r; + + op.op.op = CEPH_OSD_OP_CALL; + op.op.cls.class_len = strlen(cls); + op.op.cls.method_len = strlen(method); + op.op.cls.indata_len = datalen; + op.indata.append(cls, op.op.cls.class_len); + op.indata.append(method, op.op.cls.method_len); + op.indata.append(indata, datalen); + r = (*pctx)->pg->do_osd_ops(*pctx, nops); + if (r < 0) + return r; + + *outdata = (char *)malloc(op.outdata.length()); + if (!*outdata) + return -ENOMEM; + memcpy(*outdata, op.outdata.c_str(), op.outdata.length()); + *outdatalen = op.outdata.length(); + + return r; +} + +int cls_getxattr(cls_method_context_t hctx, const char *name, + char **outdata, int *outdatalen) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector nops(1); + OSDOp& op = nops[0]; + int r; + + op.op.op = CEPH_OSD_OP_GETXATTR; + op.op.xattr.name_len = strlen(name); + op.indata.append(name, op.op.xattr.name_len); + r = (*pctx)->pg->do_osd_ops(*pctx, nops); + if (r < 0) + return r; + + *outdata = (char *)malloc(op.outdata.length()); + if (!*outdata) + return -ENOMEM; + memcpy(*outdata, op.outdata.c_str(), op.outdata.length()); + *outdatalen = op.outdata.length(); + + return r; +} + +int cls_setxattr(cls_method_context_t hctx, const char *name, + const char *value, int val_len) +{ + PrimaryLogPG::OpContext **pctx = 
(PrimaryLogPG::OpContext **)hctx; + vector nops(1); + OSDOp& op = nops[0]; + int r; + + op.op.op = CEPH_OSD_OP_SETXATTR; + op.op.xattr.name_len = strlen(name); + op.op.xattr.value_len = val_len; + op.indata.append(name, op.op.xattr.name_len); + op.indata.append(value, val_len); + r = (*pctx)->pg->do_osd_ops(*pctx, nops); + + return r; +} + +int cls_read(cls_method_context_t hctx, int ofs, int len, + char **outdata, int *outdatalen) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_SYNC_READ; + ops[0].op.extent.offset = ofs; + ops[0].op.extent.length = len; + int r = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (r < 0) + return r; + + *outdata = (char *)malloc(ops[0].outdata.length()); + if (!*outdata) + return -ENOMEM; + memcpy(*outdata, ops[0].outdata.c_str(), ops[0].outdata.length()); + *outdatalen = ops[0].outdata.length(); + + return *outdatalen; +} + +int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) +{ + PrimaryLogPG::OpContext **pctx = static_cast(hctx); + *origin = (*pctx)->op->get_req()->get_orig_source_inst(); + return 0; +} + +int cls_cxx_create(cls_method_context_t hctx, bool exclusive) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_CREATE; + ops[0].op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0); + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_remove(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_DELETE; + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + int ret; + ops[0].op.op = CEPH_OSD_OP_STAT; + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + auto iter = ops[0].outdata.cbegin(); + utime_t ut; + uint64_t s; + try { + decode(s, iter); + decode(ut, iter); + } catch (ceph::buffer::error& err) { + return -EIO; + } + if (size) + *size = s; + if (mtime) + *mtime = ut.sec(); + return 0; +} + +int cls_cxx_stat2(cls_method_context_t hctx, uint64_t *size, ceph::real_time *mtime) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + int ret; + ops[0].op.op = CEPH_OSD_OP_STAT; + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + auto iter = ops[0].outdata.cbegin(); + real_time ut; + uint64_t s; + try { + decode(s, iter); + decode(ut, iter); + } catch (ceph::buffer::error& err) { + return -EIO; + } + if (size) + *size = s; + if (mtime) + *mtime = ut; + return 0; +} + +int cls_cxx_read2(cls_method_context_t hctx, int ofs, int len, + bufferlist *outbl, uint32_t op_flags) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + int ret; + ops[0].op.op = CEPH_OSD_OP_SYNC_READ; + ops[0].op.extent.offset = ofs; + ops[0].op.extent.length = len; + ops[0].op.flags = op_flags; + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + *outbl = std::move(ops[0].outdata); + return outbl->length(); +} + +int cls_cxx_write2(cls_method_context_t hctx, int ofs, int len, + bufferlist *inbl, uint32_t op_flags) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_WRITE; + ops[0].op.extent.offset = ofs; + ops[0].op.extent.length = len; + ops[0].op.flags = op_flags; + 
ops[0].indata = *inbl; + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_write_full(cls_method_context_t hctx, bufferlist *inbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_WRITEFULL; + ops[0].op.extent.offset = 0; + ops[0].op.extent.length = inbl->length(); + ops[0].indata = *inbl; + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_replace(cls_method_context_t hctx, int ofs, int len, bufferlist *inbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(2); + ops[0].op.op = CEPH_OSD_OP_TRUNCATE; + ops[0].op.extent.offset = 0; + ops[0].op.extent.length = 0; + ops[1].op.op = CEPH_OSD_OP_WRITE; + ops[1].op.extent.offset = ofs; + ops[1].op.extent.length = len; + ops[1].indata = *inbl; + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_truncate(cls_method_context_t hctx, int ofs) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_TRUNCATE; + ops[0].op.extent.offset = ofs; + ops[0].op.extent.length = 0; + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_write_zero(cls_method_context_t hctx, int ofs, int len) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_ZERO; + ops[0].op.extent.offset = ofs; + ops[0].op.extent.length = len; + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_getxattr(cls_method_context_t hctx, const char *name, + bufferlist *outbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector nops(1); + OSDOp& op = nops[0]; + int r; + + op.op.op = CEPH_OSD_OP_GETXATTR; + op.op.xattr.name_len = strlen(name); + op.indata.append(name, op.op.xattr.name_len); + r = (*pctx)->pg->do_osd_ops(*pctx, nops); + if (r < 0) + return r; + + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_getxattrs(cls_method_context_t hctx, map *attrset) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector nops(1); + OSDOp& op = nops[0]; + int r; + + op.op.op = CEPH_OSD_OP_GETXATTRS; + r = (*pctx)->pg->do_osd_ops(*pctx, nops); + if (r < 0) + return r; + + auto iter = op.outdata.cbegin(); + try { + decode(*attrset, iter); + } catch (ceph::buffer::error& err) { + return -EIO; + } + return 0; +} + +int cls_cxx_setxattr(cls_method_context_t hctx, const char *name, + bufferlist *inbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector nops(1); + OSDOp& op = nops[0]; + int r; + + op.op.op = CEPH_OSD_OP_SETXATTR; + op.op.xattr.name_len = strlen(name); + op.op.xattr.value_len = inbl->length(); + op.indata.append(name, op.op.xattr.name_len); + op.indata.append(*inbl); + r = (*pctx)->pg->do_osd_ops(*pctx, nops); + + return r; +} + +int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + ops[0].op.op = CEPH_OSD_OP_ROLLBACK; + ops[0].op.snap.snapid = snapid; + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_map_get_all_vals(cls_method_context_t hctx, map* vals, + bool *more) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + int ret; + + string start_after; + string filter_prefix; + uint64_t max = (uint64_t)-1; + + encode(start_after, op.indata); + encode(max, op.indata); + encode(filter_prefix, op.indata); + + op.op.op = 
CEPH_OSD_OP_OMAPGETVALS; + + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + + auto iter = op.outdata.cbegin(); + try { + decode(*vals, iter); + decode(*more, iter); + } catch (ceph::buffer::error& err) { + return -EIO; + } + return vals->size(); +} + +int cls_cxx_map_get_keys(cls_method_context_t hctx, const string &start_obj, + uint64_t max_to_get, set *keys, + bool *more) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + int ret; + + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + + op.op.op = CEPH_OSD_OP_OMAPGETKEYS; + + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + + auto iter = op.outdata.cbegin(); + try { + decode(*keys, iter); + decode(*more, iter); + } catch (ceph::buffer::error& err) { + return -EIO; + } + return keys->size(); +} + +int cls_cxx_map_get_vals(cls_method_context_t hctx, const string &start_obj, + const string &filter_prefix, uint64_t max_to_get, + map *vals, bool *more) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + int ret; + + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + encode(filter_prefix, op.indata); + + op.op.op = CEPH_OSD_OP_OMAPGETVALS; + + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + + auto iter = op.outdata.cbegin(); + try { + decode(*vals, iter); + decode(*more, iter); + } catch (ceph::buffer::error& err) { + return -EIO; + } + return vals->size(); +} + +int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + int ret; + op.op.op = CEPH_OSD_OP_OMAPGETHEADER; + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + + *outbl = std::move(op.outdata); + + return 0; +} + +int cls_cxx_map_get_val(cls_method_context_t hctx, const string &key, + bufferlist *outbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + int ret; + + set k; + k.insert(key); + encode(k, op.indata); + + op.op.op = CEPH_OSD_OP_OMAPGETVALSBYKEYS; + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + + auto iter = op.outdata.cbegin(); + try { + map m; + + decode(m, iter); + map::iterator iter = m.begin(); + if (iter == m.end()) + return -ENOENT; + + *outbl = iter->second; + } catch (ceph::buffer::error& e) { + return -EIO; + } + return 0; +} + +int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx, + const std::set &keys, + std::map *map) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + int ret; + + encode(keys, op.indata); + + op.op.op = CEPH_OSD_OP_OMAPGETVALSBYKEYS; + ret = (*pctx)->pg->do_osd_ops(*pctx, ops); + if (ret < 0) + return ret; + + auto iter = op.outdata.cbegin(); + try { + decode(*map, iter); + } catch (buffer::error& e) { + return -EIO; + } + return 0; +} + +int cls_cxx_map_set_val(cls_method_context_t hctx, const string &key, + bufferlist *inbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + bufferlist& update_bl = op.indata; + map m; + m[key] = *inbl; + encode(m, update_bl); + + op.op.op = CEPH_OSD_OP_OMAPSETVALS; + + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_map_set_vals(cls_method_context_t hctx, + const std::map *map) +{ + 
PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + bufferlist& update_bl = op.indata; + encode(*map, update_bl); + + op.op.op = CEPH_OSD_OP_OMAPSETVALS; + + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_map_clear(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + + op.op.op = CEPH_OSD_OP_OMAPCLEAR; + + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + op.indata = std::move(*inbl); + + op.op.op = CEPH_OSD_OP_OMAPSETHEADER; + + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_map_remove_range(cls_method_context_t hctx, + const std::string& key_begin, + const std::string& key_end) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + bufferlist& update_bl = op.indata; + + ::encode(key_begin, update_bl); + ::encode(key_end, update_bl); + + op.op.op = CEPH_OSD_OP_OMAPRMKEYRANGE; + + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector ops(1); + OSDOp& op = ops[0]; + bufferlist& update_bl = op.indata; + set to_rm; + to_rm.insert(key); + + encode(to_rm, update_bl); + + op.op.op = CEPH_OSD_OP_OMAPRMKEYS; + + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_cxx_list_watchers(cls_method_context_t hctx, + obj_list_watch_response_t *watchers) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + vector nops(1); + OSDOp& op = nops[0]; + int r; + + op.op.op = CEPH_OSD_OP_LIST_WATCHERS; + r = (*pctx)->pg->do_osd_ops(*pctx, nops); + if (r < 0) + return r; + + auto iter = op.outdata.cbegin(); + try { + decode(*watchers, iter); + } catch (ceph::buffer::error& err) { + return -EIO; + } + return 0; +} + +uint64_t cls_current_version(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + + return ctx->pg->get_last_user_version(); +} + + +int cls_current_subop_num(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + + return ctx->processed_subop_count; +} + +uint64_t cls_get_features(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + return ctx->pg->get_osdmap()->get_up_osd_features(); +} + +uint64_t cls_get_client_features(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + return ctx->op->get_req()->get_connection()->get_features(); +} + +ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + return ctx->pg->get_osdmap()->require_osd_release; +} + +ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + return ctx->pg->get_osdmap()->get_require_min_compat_client(); +} + +int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) { + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + if (!ctx->new_obs.exists || (ctx->new_obs.oi.is_whiteout() && + ctx->obc->ssc->snapset.clones.empty())) { + return -ENOENT; + } + *snap_seq = 
ctx->obc->ssc->snapset.seq; + return 0; +} + +int cls_cxx_chunk_write_and_set(cls_method_context_t hctx, int ofs, int len, + bufferlist *write_inbl, uint32_t op_flags, + bufferlist *set_inbl, int set_len) +{ + PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx; + char cname[] = "cas"; + char method[] = "chunk_set"; + + vector ops(2); + ops[0].op.op = CEPH_OSD_OP_WRITE; + ops[0].op.extent.offset = ofs; + ops[0].op.extent.length = len; + ops[0].op.flags = op_flags; + ops[0].indata = *write_inbl; + + ops[1].op.op = CEPH_OSD_OP_CALL; + ops[1].op.cls.class_len = strlen(cname); + ops[1].op.cls.method_len = strlen(method); + ops[1].op.cls.indata_len = set_len; + ops[1].indata.append(cname, ops[1].op.cls.class_len); + ops[1].indata.append(method, ops[1].op.cls.method_len); + ops[1].indata.append(*set_inbl); + + return (*pctx)->pg->do_osd_ops(*pctx, ops); +} + +int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + return ctx->pg->get_manifest_ref_count(ctx->obc, fp_oid, ctx->op); +} + +uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) { + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + + return ctx->pg->get_min_alloc_size(); +} + +uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx) +{ + PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx; + + return ctx->pg->get_pool().stripe_width; +} diff --git a/src/osd/object_state.h b/src/osd/object_state.h new file mode 100644 index 000000000..31987d2a4 --- /dev/null +++ b/src/osd/object_state.h @@ -0,0 +1,190 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd_types.h" + +struct ObjectState { + object_info_t oi; + bool exists; ///< the stored object exists (i.e., we will remember the object_info_t) + + ObjectState() : exists(false) {} + + ObjectState(const object_info_t &oi_, bool exists_) + : oi(oi_), exists(exists_) {} + ObjectState(object_info_t &&oi_, bool exists_) + : oi(std::move(oi_)), exists(exists_) {} + ObjectState(const hobject_t &obj) : oi(obj), exists(false) {} +}; + +struct RWState { + enum State { + RWNONE, + RWREAD, + RWWRITE, + RWEXCL, + }; + static const char *get_state_name(State s) { + switch (s) { + case RWNONE: return "none"; + case RWREAD: return "read"; + case RWWRITE: return "write"; + case RWEXCL: return "excl"; + default: return "???"; + } + } + const char *get_state_name() const { + return get_state_name(state); + } + + int count; ///< number of readers or writers + int waiters = 0; ///< number waiting + + State state:4; ///< rw state + /// if set, restart backfill when we can get a read lock + bool recovery_read_marker:1; + /// if set, requeue snaptrim on lock release + bool snaptrimmer_write_marker:1; + + RWState() + : count(0), + state(RWNONE), + recovery_read_marker(false), + snaptrimmer_write_marker(false) + {} + + /// this function adjusts the counts if necessary + bool get_read_lock() { + // don't starve anybody! + if (waiters > 0) { + return false; + } + switch (state) { + case RWNONE: + ceph_assert(count == 0); + state = RWREAD; + // fall through + case RWREAD: + count++; + return true; + case RWWRITE: + return false; + case RWEXCL: + return false; + default: + ceph_abort_msg("unhandled case"); + return false; + } + } + + bool get_write_lock(bool greedy=false) { + if (!greedy) { + // don't starve anybody! 
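+ // A non-greedy writer backs off whenever someone is already queued on
+ // this lock or recovery has flagged that it wants the next read slot
+ // (recovery_read_marker), so a stream of writers cannot starve earlier
+ // waiters or backfill.  Greedy callers skip this fairness check and
+ // only consult the state machine below.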
+ if (waiters > 0 || + recovery_read_marker) { + return false; + } + } + switch (state) { + case RWNONE: + ceph_assert(count == 0); + state = RWWRITE; + // fall through + case RWWRITE: + count++; + return true; + case RWREAD: + return false; + case RWEXCL: + return false; + default: + ceph_abort_msg("unhandled case"); + return false; + } + } + bool get_excl_lock() { + switch (state) { + case RWNONE: + ceph_assert(count == 0); + state = RWEXCL; + count = 1; + return true; + case RWWRITE: + return false; + case RWREAD: + return false; + case RWEXCL: + return false; + default: + ceph_abort_msg("unhandled case"); + return false; + } + } + /// same as get_write_lock, but ignore starvation + bool take_write_lock() { + if (state == RWWRITE) { + count++; + return true; + } + return get_write_lock(); + } + bool dec() { + ceph_assert(count > 0); + count--; + if (count == 0) { + state = RWNONE; + return true; + } else { + return false; + } + } + bool put_read() { + ceph_assert(state == RWREAD); + return dec(); + } + bool put_write() { + ceph_assert(state == RWWRITE); + return dec(); + } + bool put_excl() { + ceph_assert(state == RWEXCL); + return dec(); + } + void inc_waiters() { + ++waiters; + } + void release_waiters() { + waiters = 0; + } + void dec_waiters(int count) { + ceph_assert(waiters >= count); + waiters -= count; + } + bool empty() const { return state == RWNONE; } + + bool get_snaptrimmer_write(bool mark_if_unsuccessful) { + if (get_write_lock()) { + return true; + } else { + if (mark_if_unsuccessful) + snaptrimmer_write_marker = true; + return false; + } + } + bool get_recovery_read() { + recovery_read_marker = true; + if (get_read_lock()) { + return true; + } + return false; + } +}; + +inline std::ostream& operator<<(std::ostream& out, const RWState& rw) +{ + return out << "rwstate(" << rw.get_state_name() + << " n=" << rw.count + << " w=" << rw.waiters + << ")"; +} diff --git a/src/osd/osd_internal_types.h b/src/osd/osd_internal_types.h new file mode 100644 index 000000000..17f4f3146 --- /dev/null +++ b/src/osd/osd_internal_types.h @@ -0,0 +1,320 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OSD_INTERNAL_TYPES_H +#define CEPH_OSD_INTERNAL_TYPES_H + +#include "osd_types.h" +#include "OpRequest.h" +#include "object_state.h" + +/* + * keep tabs on object modifications that are in flight. + * we need to know the projected existence, size, snapset, + * etc., because we don't send writes down to disk until after + * replicas ack. + */ + +struct SnapSetContext { + hobject_t oid; + SnapSet snapset; + int ref; + bool registered : 1; + bool exists : 1; + + explicit SnapSetContext(const hobject_t& o) : + oid(o), ref(0), registered(false), exists(true) { } +}; +struct ObjectContext; +typedef std::shared_ptr ObjectContextRef; + +struct ObjectContext { + ObjectState obs; + + SnapSetContext *ssc; // may be null + + Context *destructor_callback; + +public: + + // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers. + std::map, WatchRef> watchers; + + // attr cache + std::map attr_cache; + + RWState rwstate; + std::list waiters; ///< ops waiting on state change + bool get_read(OpRequestRef& op) { + if (rwstate.get_read_lock()) { + return true; + } // else + // Now we really need to bump up the ref-counter. 
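+ // The op is parked on this obc until the current holder drops the
+ // lock: put_read()/put_write()/put_excl() call wake(), which splices
+ // `waiters` onto the caller's requeue list and clears the waiter
+ // count, giving every queued op another shot at the lock.  A typical
+ // caller pattern (a sketch; see PrimaryLogPG for the real call sites):
+ //
+ //   if (!obc->get_read(op))
+ //     return;                          // op now sits in obc->waiters
+ //   ... perform the read ...
+ //   std::list<OpRequestRef> requeue;
+ //   obc->put_read(&requeue);
+ //   requeue_ops(requeue);              // hypothetical requeue helper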
+ waiters.emplace_back(op); + rwstate.inc_waiters(); + return false; + } + bool get_write(OpRequestRef& op, bool greedy=false) { + if (rwstate.get_write_lock(greedy)) { + return true; + } // else + if (op) { + waiters.emplace_back(op); + rwstate.inc_waiters(); + } + return false; + } + bool get_excl(OpRequestRef& op) { + if (rwstate.get_excl_lock()) { + return true; + } // else + if (op) { + waiters.emplace_back(op); + rwstate.inc_waiters(); + } + return false; + } + void wake(std::list *requeue) { + rwstate.release_waiters(); + requeue->splice(requeue->end(), waiters); + } + void put_read(std::list *requeue) { + if (rwstate.put_read()) { + wake(requeue); + } + } + void put_write(std::list *requeue) { + if (rwstate.put_write()) { + wake(requeue); + } + } + void put_excl(std::list *requeue) { + if (rwstate.put_excl()) { + wake(requeue); + } + } + bool empty() const { return rwstate.empty(); } + + bool get_lock_type(OpRequestRef& op, RWState::State type) { + switch (type) { + case RWState::RWWRITE: + return get_write(op); + case RWState::RWREAD: + return get_read(op); + case RWState::RWEXCL: + return get_excl(op); + default: + ceph_abort_msg("invalid lock type"); + return true; + } + } + bool get_write_greedy(OpRequestRef& op) { + return get_write(op, true); + } + bool get_snaptrimmer_write(bool mark_if_unsuccessful) { + return rwstate.get_snaptrimmer_write(mark_if_unsuccessful); + } + bool get_recovery_read() { + return rwstate.get_recovery_read(); + } + bool try_get_read_lock() { + return rwstate.get_read_lock(); + } + void drop_recovery_read(std::list *ls) { + ceph_assert(rwstate.recovery_read_marker); + put_read(ls); + rwstate.recovery_read_marker = false; + } + void put_lock_type( + RWState::State type, + std::list *to_wake, + bool *requeue_recovery, + bool *requeue_snaptrimmer) { + switch (type) { + case RWState::RWWRITE: + put_write(to_wake); + break; + case RWState::RWREAD: + put_read(to_wake); + break; + case RWState::RWEXCL: + put_excl(to_wake); + break; + default: + ceph_abort_msg("invalid lock type"); + } + if (rwstate.empty() && rwstate.recovery_read_marker) { + rwstate.recovery_read_marker = false; + *requeue_recovery = true; + } + if (rwstate.empty() && rwstate.snaptrimmer_write_marker) { + rwstate.snaptrimmer_write_marker = false; + *requeue_snaptrimmer = true; + } + } + bool is_request_pending() { + return !rwstate.empty(); + } + + ObjectContext() + : ssc(NULL), + destructor_callback(0), + blocked(false), requeue_scrub_on_unblock(false) {} + + ~ObjectContext() { + ceph_assert(rwstate.empty()); + if (destructor_callback) + destructor_callback->complete(0); + } + + void start_block() { + ceph_assert(!blocked); + blocked = true; + } + void stop_block() { + ceph_assert(blocked); + blocked = false; + } + bool is_blocked() const { + return blocked; + } + + /// in-progress copyfrom ops for this object + bool blocked:1; + bool requeue_scrub_on_unblock:1; // true if we need to requeue scrub on unblock + +}; + +inline std::ostream& operator<<(std::ostream& out, const ObjectState& obs) +{ + out << obs.oi.soid; + if (!obs.exists) + out << "(dne)"; + return out; +} + +inline std::ostream& operator<<(std::ostream& out, const ObjectContext& obc) +{ + return out << "obc(" << obc.obs << " " << obc.rwstate << ")"; +} + +class ObcLockManager { + struct ObjectLockState { + ObjectContextRef obc; + RWState::State type; + ObjectLockState( + ObjectContextRef obc, + RWState::State type) + : obc(std::move(obc)), type(type) {} + }; + std::map locks; +public: + ObcLockManager() = default; + 
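Aside, for illustration only: ObjectContext layers an op queue on top of RWState; an op that cannot take the lock is appended to waiters (and counted via inc_waiters()), and the final put_* splices the whole list onto the caller's requeue list in arrival order. A self-contained sketch of that queue-and-splice pattern, with plain strings standing in for OpRequestRef; MiniLockQueue is a hypothetical name.

#include <cassert>
#include <iostream>
#include <list>
#include <string>

struct MiniLockQueue {
  bool held = false;
  std::list<std::string> waiters;     // ops queued behind the lock

  // Try to take the lock; if it is busy (or others are queued), queue the op.
  bool get(const std::string& op) {
    if (!held && waiters.empty()) {   // don't jump ahead of queued ops
      held = true;
      return true;
    }
    waiters.push_back(op);
    return false;
  }

  // Release the lock and hand every queued op back to the caller for
  // requeueing, preserving arrival order (the same shape as wake() above).
  void put(std::list<std::string>* requeue) {
    assert(held);
    held = false;
    requeue->splice(requeue->end(), waiters);
  }
};

int main() {
  MiniLockQueue lock;
  std::list<std::string> requeue;
  assert(lock.get("op1"));            // op1 takes the lock
  assert(!lock.get("op2"));           // op2 queues
  assert(!lock.get("op3"));           // op3 queues behind op2
  lock.put(&requeue);                 // release wakes both, in order
  assert(requeue.front() == "op2" && requeue.back() == "op3");
  std::cout << "woke " << requeue.size() << " ops" << std::endl;
  return 0;
}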
ObcLockManager(ObcLockManager &&) = default; + ObcLockManager(const ObcLockManager &) = delete; + ObcLockManager &operator=(ObcLockManager &&) = default; + bool empty() const { + return locks.empty(); + } + bool get_lock_type( + RWState::State type, + const hobject_t &hoid, + ObjectContextRef& obc, + OpRequestRef& op) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->get_lock_type(op, type)) { + locks.insert(std::make_pair(hoid, ObjectLockState(obc, type))); + return true; + } else { + return false; + } + } + /// Get write lock, ignore starvation + bool take_write_lock( + const hobject_t &hoid, + ObjectContextRef obc) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->rwstate.take_write_lock()) { + locks.insert( + std::make_pair( + hoid, ObjectLockState(obc, RWState::RWWRITE))); + return true; + } else { + return false; + } + } + /// Get write lock for snap trim + bool get_snaptrimmer_write( + const hobject_t &hoid, + ObjectContextRef obc, + bool mark_if_unsuccessful) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->get_snaptrimmer_write(mark_if_unsuccessful)) { + locks.insert( + std::make_pair( + hoid, ObjectLockState(obc, RWState::RWWRITE))); + return true; + } else { + return false; + } + } + /// Get write lock greedy + bool get_write_greedy( + const hobject_t &hoid, + ObjectContextRef obc, + OpRequestRef op) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->get_write_greedy(op)) { + locks.insert( + std::make_pair( + hoid, ObjectLockState(obc, RWState::RWWRITE))); + return true; + } else { + return false; + } + } + + /// try get read lock + bool try_get_read_lock( + const hobject_t &hoid, + ObjectContextRef obc) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->try_get_read_lock()) { + locks.insert( + std::make_pair( + hoid, + ObjectLockState(obc, RWState::RWREAD))); + return true; + } else { + return false; + } + } + + void put_locks( + std::list > > *to_requeue, + bool *requeue_recovery, + bool *requeue_snaptrimmer) { + for (auto& p: locks) { + std::list _to_requeue; + p.second.obc->put_lock_type( + p.second.type, + &_to_requeue, + requeue_recovery, + requeue_snaptrimmer); + if (to_requeue) { + // We can safely std::move here as the whole `locks` is going + // to die just after the loop. 
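Aside, for illustration only: ObcLockManager records which lock type it took on each object so that one put_locks() call can later release everything and gather the ops that need requeueing. A reduced sketch of that bookkeeping, keyed by object name instead of hobject_t and with a trivial single-holder lock in place of ObjectContext; MiniObc and MiniLockManager are hypothetical names.

#include <cassert>
#include <iostream>
#include <list>
#include <map>
#include <string>

// Minimal stand-ins: a per-object lock with a waiter queue, and a manager
// that remembers every lock it took so they can all be dropped at once.
struct MiniObc {
  bool held = false;
  std::list<std::string> waiters;
  bool get() { if (held) return false; held = true; return true; }
  void put(std::list<std::string>* requeue) {
    held = false;
    requeue->splice(requeue->end(), waiters);
  }
};

class MiniLockManager {
  std::map<std::string, MiniObc*> locks;    // object name -> locked obc
public:
  bool lock(const std::string& oid, MiniObc* obc) {
    assert(locks.find(oid) == locks.end()); // never lock one object twice
    if (!obc->get()) return false;
    locks.emplace(oid, obc);
    return true;
  }
  // Release every held lock and collect the ops that were waiting on them.
  void put_locks(std::list<std::string>* to_requeue) {
    for (auto& p : locks) p.second->put(to_requeue);
    locks.clear();                          // manager must end up empty
  }
  ~MiniLockManager() { assert(locks.empty()); }
};

int main() {
  MiniObc a, b;
  MiniLockManager mgr;
  assert(mgr.lock("obj_a", &a));
  assert(mgr.lock("obj_b", &b));
  b.waiters.push_back("queued-op");         // an op that found obj_b busy
  std::list<std::string> requeue;
  mgr.put_locks(&requeue);                  // drops both locks, collects the waiter
  assert(requeue.size() == 1 && requeue.front() == "queued-op");
  std::cout << "released all locks" << std::endl;
  return 0;
}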
+ to_requeue->emplace_back(std::move(p.second.obc), + std::move(_to_requeue)); + } + } + locks.clear(); + } + ~ObcLockManager() { + ceph_assert(locks.empty()); + } +}; + + + +#endif diff --git a/src/osd/osd_op_util.cc b/src/osd/osd_op_util.cc new file mode 100644 index 000000000..54c590ee2 --- /dev/null +++ b/src/osd/osd_op_util.cc @@ -0,0 +1,263 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd/osd_op_util.h" + +#include "osd/ClassHandler.h" +#include "messages/MOSDOp.h" + +using std::ostream; +using std::string; +using std::vector; + +using ceph::bufferlist; + +bool OpInfo::check_rmw(int flag) const { + ceph_assert(rmw_flags != 0); + return rmw_flags & flag; +} +bool OpInfo::may_read() const { + return need_read_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_READ); +} +bool OpInfo::may_write() const { + return need_write_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_WRITE); +} +bool OpInfo::may_cache() const { return check_rmw(CEPH_OSD_RMW_FLAG_CACHE); } +bool OpInfo::rwordered_forced() const { + return check_rmw(CEPH_OSD_RMW_FLAG_RWORDERED); +} +bool OpInfo::rwordered() const { + return may_write() || may_cache() || rwordered_forced(); +} + +bool OpInfo::includes_pg_op() const { + return check_rmw(CEPH_OSD_RMW_FLAG_PGOP); +} +bool OpInfo::need_read_cap() const { + return check_rmw(CEPH_OSD_RMW_FLAG_READ); +} +bool OpInfo::need_write_cap() const { + return check_rmw(CEPH_OSD_RMW_FLAG_WRITE); +} +bool OpInfo::need_promote() const { + return check_rmw(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE); +} +bool OpInfo::need_skip_handle_cache() const { + return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE); +} +bool OpInfo::need_skip_promote() const { + return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE); +} +bool OpInfo::allows_returnvec() const { + return check_rmw(CEPH_OSD_RMW_FLAG_RETURNVEC); +} + +void OpInfo::set_rmw_flags(int flags) { + rmw_flags |= flags; +} + +void OpInfo::set_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_READ); } +void OpInfo::set_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_WRITE); } +void OpInfo::set_class_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_READ); } +void OpInfo::set_class_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_WRITE); } +void OpInfo::set_pg_op() { set_rmw_flags(CEPH_OSD_RMW_FLAG_PGOP); } +void OpInfo::set_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CACHE); } +void OpInfo::set_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE); } +void OpInfo::set_skip_handle_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE); } +void OpInfo::set_skip_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE); } +void OpInfo::set_force_rwordered() { set_rmw_flags(CEPH_OSD_RMW_FLAG_RWORDERED); } +void OpInfo::set_returnvec() { set_rmw_flags(CEPH_OSD_RMW_FLAG_RETURNVEC); } + + +int OpInfo::set_from_op( + const MOSDOp *m, + const OSDMap &osdmap) +{ + vector::const_iterator iter; + + // client flags have no bearing on whether an op is a read, write, etc. + clear(); + + if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) { + set_force_rwordered(); + } + if (m->has_flag(CEPH_OSD_FLAG_RETURNVEC)) { + set_returnvec(); + } + + // set bits based on op codes, called methods. + for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) { + if ((iter->op.op == CEPH_OSD_OP_WATCH && + iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) { + /* This a bit odd. PING isn't actually a write. It can't + * result in an update to the object_info. PINGs also aren't + * resent, so there's no reason to write out a log entry. 
+ * + * However, we pipeline them behind writes, so let's force + * the write_ordered flag. + */ + set_force_rwordered(); + } else { + if (ceph_osd_op_mode_modify(iter->op.op)) + set_write(); + } + if (ceph_osd_op_mode_read(iter->op.op)) + set_read(); + + // set READ flag if there are src_oids + if (iter->soid.oid.name.length()) + set_read(); + + // set PGOP flag if there are PG ops + if (ceph_osd_op_type_pg(iter->op.op)) + set_pg_op(); + + if (ceph_osd_op_mode_cache(iter->op.op)) + set_cache(); + + // check for ec base pool + int64_t poolid = m->get_pg().pool(); + const pg_pool_t *pool = osdmap.get_pg_pool(poolid); + if (pool && pool->is_tier()) { + const pg_pool_t *base_pool = osdmap.get_pg_pool(pool->tier_of); + if (base_pool && base_pool->require_rollback()) { + if ((iter->op.op != CEPH_OSD_OP_READ) && + (iter->op.op != CEPH_OSD_OP_CHECKSUM) && + (iter->op.op != CEPH_OSD_OP_CMPEXT) && + (iter->op.op != CEPH_OSD_OP_STAT) && + (iter->op.op != CEPH_OSD_OP_ISDIRTY) && + (iter->op.op != CEPH_OSD_OP_UNDIRTY) && + (iter->op.op != CEPH_OSD_OP_GETXATTR) && + (iter->op.op != CEPH_OSD_OP_GETXATTRS) && + (iter->op.op != CEPH_OSD_OP_CMPXATTR) && + (iter->op.op != CEPH_OSD_OP_ASSERT_VER) && + (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) && + (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) && + (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) && + (iter->op.op != CEPH_OSD_OP_WRITEFULL) && + (iter->op.op != CEPH_OSD_OP_ROLLBACK) && + (iter->op.op != CEPH_OSD_OP_CREATE) && + (iter->op.op != CEPH_OSD_OP_DELETE) && + (iter->op.op != CEPH_OSD_OP_SETXATTR) && + (iter->op.op != CEPH_OSD_OP_RMXATTR) && + (iter->op.op != CEPH_OSD_OP_STARTSYNC) && + (iter->op.op != CEPH_OSD_OP_COPY_GET) && + (iter->op.op != CEPH_OSD_OP_COPY_FROM) && + (iter->op.op != CEPH_OSD_OP_COPY_FROM2)) { + set_promote(); + } + } + } + + switch (iter->op.op) { + case CEPH_OSD_OP_CALL: + { + bufferlist::iterator bp = const_cast(iter->indata).begin(); + int is_write, is_read; + string cname, mname; + bp.copy(iter->op.cls.class_len, cname); + bp.copy(iter->op.cls.method_len, mname); + + ClassHandler::ClassData *cls; + int r = ClassHandler::get_instance().open_class(cname, &cls); + if (r) { + if (r == -ENOENT) + r = -EOPNOTSUPP; + else if (r != -EPERM) // propagate permission errors + r = -EIO; + return r; + } + int flags = cls->get_method_flags(mname); + if (flags < 0) { + if (flags == -ENOENT) + r = -EOPNOTSUPP; + else + r = flags; + return r; + } + is_read = flags & CLS_METHOD_RD; + is_write = flags & CLS_METHOD_WR; + bool is_promote = flags & CLS_METHOD_PROMOTE; + + if (is_read) + set_class_read(); + if (is_write) + set_class_write(); + if (is_promote) + set_promote(); + add_class(std::move(cname), std::move(mname), is_read, is_write, + cls->allowed); + break; + } + + case CEPH_OSD_OP_WATCH: + // force the read bit for watch since it is depends on previous + // watch state (and may return early if the watch exists) or, in + // the case of ping, is simply a read op. + set_read(); + // fall through + case CEPH_OSD_OP_NOTIFY: + case CEPH_OSD_OP_NOTIFY_ACK: + { + set_promote(); + break; + } + + case CEPH_OSD_OP_DELETE: + // if we get a delete with FAILOK we can skip handle cache. without + // FAILOK we still need to promote (or do something smarter) to + // determine whether to return ENOENT or 0. 
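Aside, for illustration only: set_from_op() boils a whole MOSDOp down to a small bitmask; each sub-op contributes READ/WRITE/CACHE/PGOP/... bits, and later questions such as may_write() are just flag tests, with an all-zero mask rejected as -EINVAL. A tiny sketch of that accumulate-then-query pattern; the flag values and MiniOpInfo/MiniOp names here are made up for the example, the real constants are the CEPH_OSD_RMW_FLAG_* and CEPH_OSD_OP_* definitions used above.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical flag values for illustration only.
enum : uint64_t {
  FLAG_READ        = 1 << 0,
  FLAG_WRITE       = 1 << 1,
  FLAG_CLASS_READ  = 1 << 2,
  FLAG_CLASS_WRITE = 1 << 3,
  FLAG_CACHE       = 1 << 4,
};

struct MiniOpInfo {
  uint64_t rmw_flags = 0;
  void set(uint64_t f) { rmw_flags |= f; }
  bool check(uint64_t f) const { return rmw_flags & f; }
  bool may_read() const  { return check(FLAG_READ)  || check(FLAG_CLASS_READ); }
  bool may_write() const { return check(FLAG_WRITE) || check(FLAG_CLASS_WRITE); }
};

// Pretend op codes; a real request carries CEPH_OSD_OP_* codes instead.
enum class MiniOp { Read, Write, Call };

int main() {
  MiniOpInfo info;
  std::vector<MiniOp> ops = {MiniOp::Read, MiniOp::Write};
  for (auto op : ops) {
    if (op == MiniOp::Read)  info.set(FLAG_READ);
    if (op == MiniOp::Write) info.set(FLAG_WRITE);
    if (op == MiniOp::Call)  info.set(FLAG_CLASS_READ); // depends on method flags
  }
  assert(info.may_read() && info.may_write());
  // An empty mask means nothing recognizable was requested (set_from_op
  // returns -EINVAL in that case).
  assert(MiniOpInfo{}.rmw_flags == 0);
  std::cout << "rmw flags: 0x" << std::hex << info.rmw_flags << std::endl;
  return 0;
}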
+ if (iter == m->ops.begin() && + iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) { + set_skip_handle_cache(); + } + // skip promotion when proxying a delete op + if (m->ops.size() == 1) { + set_skip_promote(); + } + break; + + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + case CEPH_OSD_OP_CACHE_FLUSH: + case CEPH_OSD_OP_CACHE_EVICT: + // If try_flush/flush/evict is the only op, can skip handle cache. + if (m->ops.size() == 1) { + set_skip_handle_cache(); + } + break; + + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_CHECKSUM: + case CEPH_OSD_OP_WRITEFULL: + if (m->ops.size() == 1 && + (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE || + iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) { + set_skip_promote(); + } + break; + + // force promotion when pin an object in cache tier + case CEPH_OSD_OP_CACHE_PIN: + set_promote(); + break; + + default: + break; + } + } + + if (rmw_flags == 0) + return -EINVAL; + + return 0; + +} + +ostream& operator<<(ostream& out, const OpInfo::ClassInfo& i) +{ + out << "class " << i.class_name << " method " << i.method_name + << " rd " << i.read << " wr " << i.write << " allowed " << i.allowed; + return out; +} diff --git a/src/osd/osd_op_util.h b/src/osd/osd_op_util.h new file mode 100644 index 000000000..5fb568e40 --- /dev/null +++ b/src/osd/osd_op_util.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +#include "osd/OSDMap.h" + +#include "messages/MOSDOp.h" + +class OpInfo { +public: + struct ClassInfo { + ClassInfo(std::string&& class_name, std::string&& method_name, + bool read, bool write, bool allowed) : + class_name(std::move(class_name)), method_name(std::move(method_name)), + read(read), write(write), allowed(allowed) + {} + const std::string class_name; + const std::string method_name; + const bool read, write, allowed; + }; + +private: + uint64_t rmw_flags = 0; + std::vector classes; + + void set_rmw_flags(int flags); + + void add_class(std::string&& class_name, std::string&& method_name, + bool read, bool write, bool allowed) { + classes.emplace_back(std::move(class_name), std::move(method_name), + read, write, allowed); + } + +public: + + void clear() { + rmw_flags = 0; + } + + uint64_t get_flags() const { + return rmw_flags; + } + + bool check_rmw(int flag) const ; + bool may_read() const; + bool may_write() const; + bool may_cache() const; + bool rwordered_forced() const; + bool rwordered() const; + bool includes_pg_op() const; + bool need_read_cap() const; + bool need_write_cap() const; + bool need_promote() const; + bool need_skip_handle_cache() const; + bool need_skip_promote() const; + bool allows_returnvec() const; + + void set_read(); + void set_write(); + void set_cache(); + void set_class_read(); + void set_class_write(); + void set_pg_op(); + void set_promote(); + void set_skip_handle_cache(); + void set_skip_promote(); + void set_force_rwordered(); + void set_returnvec(); + + int set_from_op( + const MOSDOp *m, + const OSDMap &osdmap); + + std::vector get_classes() const { + return classes; + } +}; + +std::ostream& operator<<(std::ostream& out, const OpInfo::ClassInfo& i); diff --git a/src/osd/osd_perf_counters.cc b/src/osd/osd_perf_counters.cc new file mode 100644 index 000000000..ed63b4d3f --- /dev/null +++ b/src/osd/osd_perf_counters.cc @@ -0,0 +1,321 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"osd_perf_counters.h" +#include "include/common_fwd.h" + + +PerfCounters *build_osd_logger(CephContext *cct) { + PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last); + + // Latency axis configuration for op histograms, values are in nanoseconds + PerfHistogramCommon::axis_config_d op_hist_x_axis_config{ + "Latency (usec)", + PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale + 0, ///< Start at 0 + 100000, ///< Quantization unit is 100usec + 32, ///< Enough to cover much longer than slow requests + }; + + // Op size axis configuration for op histograms, values are in bytes + PerfHistogramCommon::axis_config_d op_hist_y_axis_config{ + "Request size (bytes)", + PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale + 0, ///< Start at 0 + 512, ///< Quantization unit is 512 bytes + 32, ///< Enough to cover requests larger than GB + }; + + + // All the basic OSD operation stats are to be considered useful + osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + osd_plb.add_u64( + l_osd_op_wip, "op_wip", + "Replication operations currently being processed (primary)"); + osd_plb.add_u64_counter( + l_osd_op, "op", + "Client operations", + "ops", PerfCountersBuilder::PRIO_CRITICAL); + osd_plb.add_u64_counter( + l_osd_op_inb, "op_in_bytes", + "Client operations total write size", + "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); + osd_plb.add_u64_counter( + l_osd_op_outb, "op_out_bytes", + "Client operations total read size", + "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_op_lat, "op_latency", + "Latency of client operations (including queue time)", + "l", 9); + osd_plb.add_time_avg( + l_osd_op_process_lat, "op_process_latency", + "Latency of client operations (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_prepare_lat, "op_prepare_latency", + "Latency of client operations (excluding queue time and wait for finished)"); + + osd_plb.add_u64_counter( + l_osd_op_r, "op_r", "Client read operations"); + osd_plb.add_u64_counter( + l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_op_r_lat, "op_r_latency", + "Latency of read operation (including queue time)"); + osd_plb.add_u64_counter_histogram( + l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of operation latency (including queue time) + data read"); + osd_plb.add_time_avg( + l_osd_op_r_process_lat, "op_r_process_latency", + "Latency of read operation (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_r_prepare_lat, "op_r_prepare_latency", + "Latency of read operations (excluding queue time and wait for finished)"); + osd_plb.add_u64_counter( + l_osd_op_w, "op_w", "Client write operations"); + osd_plb.add_u64_counter( + l_osd_op_w_inb, "op_w_in_bytes", "Client data written"); + osd_plb.add_time_avg( + l_osd_op_w_lat, "op_w_latency", + "Latency of write operation (including queue time)"); + osd_plb.add_u64_counter_histogram( + l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of operation latency (including queue time) + data written"); + osd_plb.add_time_avg( + l_osd_op_w_process_lat, "op_w_process_latency", + "Latency of write operation (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_w_prepare_lat, "op_w_prepare_latency", + "Latency of write operations 
(excluding queue time and wait for finished)"); + osd_plb.add_u64_counter( + l_osd_op_rw, "op_rw", + "Client read-modify-write operations"); + osd_plb.add_u64_counter( + l_osd_op_rw_inb, "op_rw_in_bytes", + "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_u64_counter( + l_osd_op_rw_outb,"op_rw_out_bytes", + "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_op_rw_lat, "op_rw_latency", + "Latency of read-modify-write operation (including queue time)"); + osd_plb.add_u64_counter_histogram( + l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of rw operation latency (including queue time) + data written"); + osd_plb.add_u64_counter_histogram( + l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of rw operation latency (including queue time) + data read"); + osd_plb.add_time_avg( + l_osd_op_rw_process_lat, "op_rw_process_latency", + "Latency of read-modify-write operation (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_rw_prepare_lat, "op_rw_prepare_latency", + "Latency of read-modify-write operations (excluding queue time and wait for finished)"); + + // Now we move on to some more obscure stats, revert to assuming things + // are low priority unless otherwise specified. + osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); + + osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat", + "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency + osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat", + "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency + + osd_plb.add_u64_counter( + l_osd_sop, "subop", "Suboperations"); + osd_plb.add_u64_counter( + l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency"); + + osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes"); + osd_plb.add_u64_counter( + l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency"); + osd_plb.add_u64_counter( + l_osd_sop_pull, "subop_pull", "Suboperations pull requests"); + osd_plb.add_time_avg( + l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency"); + osd_plb.add_u64_counter( + l_osd_sop_push, "subop_push", "Suboperations push messages"); + osd_plb.add_u64_counter( + l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency"); + + osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent"); + osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent"); + osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES)); + + osd_plb.add_u64_counter( + l_osd_rop, "recovery_ops", + "Started recovery operations", + "rop", PerfCountersBuilder::PRIO_INTERESTING); + + osd_plb.add_u64_counter( + l_osd_rbytes, "recovery_bytes", + "recovery bytes", + "rbt", 
PerfCountersBuilder::PRIO_INTERESTING); + + osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load"); + osd_plb.add_u64( + l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache"); + osd_plb.add_u64( + l_osd_cached_crc_adjusted, "cached_crc_adjusted", + "Total number getting crc from crc_cache with adjusting"); + osd_plb.add_u64(l_osd_missed_crc, "missed_crc", + "Total number of crc cache misses"); + + osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups", + "pgs", PerfCountersBuilder::PRIO_USEFUL); + osd_plb.add_u64( + l_osd_pg_primary, "numpg_primary", + "Placement groups for which this osd is primary"); + osd_plb.add_u64( + l_osd_pg_replica, "numpg_replica", + "Placement groups for which this osd is replica"); + osd_plb.add_u64( + l_osd_pg_stray, "numpg_stray", + "Placement groups ready to be deleted from this osd"); + osd_plb.add_u64( + l_osd_pg_removing, "numpg_removing", + "Placement groups queued for local deletion", "pgsr", + PerfCountersBuilder::PRIO_USEFUL); + osd_plb.add_u64( + l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to"); + osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages"); + osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs"); + osd_plb.add_u64_counter( + l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); + osd_plb.add_u64_counter( + l_osd_waiting_for_map, "messages_delayed_for_map", + "Operations waiting for OSD map"); + + osd_plb.add_u64_counter( + l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit"); + osd_plb.add_u64_counter( + l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss"); + osd_plb.add_u64_counter( + l_osd_map_cache_miss_low, "osd_map_cache_miss_low", + "osdmap cache miss below cache lower bound"); + osd_plb.add_u64_avg( + l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg", + "osdmap cache miss, avg distance below cache lower bound"); + osd_plb.add_u64_counter( + l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit", + "OSDMap buffer cache hits"); + osd_plb.add_u64_counter( + l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss", + "OSDMap buffer cache misses"); + + osd_plb.add_u64( + l_osd_stat_bytes, "stat_bytes", "OSD size", "size", + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_u64( + l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used", + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES)); + + osd_plb.add_u64_counter( + l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations"); + + osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions"); + osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes"); + osd_plb.add_u64_counter( + l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes"); + osd_plb.add_u64_counter( + l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts"); + osd_plb.add_u64_counter( + l_osd_tier_try_flush_fail, "tier_try_flush_fail", + "Failed tier flush attempts"); + osd_plb.add_u64_counter( + l_osd_tier_evict, "tier_evict", "Tier evictions"); + osd_plb.add_u64_counter( + l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts"); + osd_plb.add_u64_counter( + l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set"); + osd_plb.add_u64_counter( + l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned"); + osd_plb.add_u64_counter( + l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)"); + osd_plb.add_u64_counter( + 
l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads"); + osd_plb.add_u64_counter( + l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes"); + + osd_plb.add_u64_counter( + l_osd_agent_wake, "agent_wake", "Tiering agent wake up"); + osd_plb.add_u64_counter( + l_osd_agent_skip, "agent_skip", "Objects skipped by agent"); + osd_plb.add_u64_counter( + l_osd_agent_flush, "agent_flush", "Tiering agent flushes"); + osd_plb.add_u64_counter( + l_osd_agent_evict, "agent_evict", "Tiering agent evictions"); + + osd_plb.add_u64_counter( + l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits"); + osd_plb.add_u64_counter( + l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups"); + + osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit"); + osd_plb.add_time_avg( + l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency"); + osd_plb.add_time_avg( + l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency"); + osd_plb.add_time_avg( + l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency"); + + osd_plb.add_u64_counter( + l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)"); + osd_plb.add_u64_counter( + l_osd_pg_fastinfo, "osd_pg_fastinfo", + "PG updated its info using fastinfo attr"); + osd_plb.add_u64_counter( + l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr"); + + return osd_plb.create_perf_counters(); +} + + +PerfCounters *build_recoverystate_perf(CephContext *cct) { + PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last); + + rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency"); + rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency"); + rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency"); + rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency"); + rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency"); + rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency"); + rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency"); + rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency"); + rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency"); + rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency"); + rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency"); + rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency"); + 
rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency"); + rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency"); + rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency"); + rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency"); + rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency"); + rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency"); + rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency"); + rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency"); + rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency"); + rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency"); + rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency"); + rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency"); + rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency"); + rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency"); + + return rs_perf.create_perf_counters(); +} diff --git a/src/osd/osd_perf_counters.h b/src/osd/osd_perf_counters.h new file mode 100644 index 000000000..9966a7f7d --- /dev/null +++ b/src/osd/osd_perf_counters.h @@ -0,0 +1,163 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/common_fwd.h" +#include "common/perf_counters.h" + +enum { + l_osd_first = 10000, + l_osd_op_wip, + l_osd_op, + l_osd_op_inb, + l_osd_op_outb, + l_osd_op_lat, + l_osd_op_process_lat, + l_osd_op_prepare_lat, + l_osd_op_r, + l_osd_op_r_outb, + l_osd_op_r_lat, + l_osd_op_r_lat_outb_hist, + l_osd_op_r_process_lat, + l_osd_op_r_prepare_lat, + l_osd_op_w, + l_osd_op_w_inb, + l_osd_op_w_lat, + l_osd_op_w_lat_inb_hist, + l_osd_op_w_process_lat, + l_osd_op_w_prepare_lat, + l_osd_op_rw, + l_osd_op_rw_inb, + l_osd_op_rw_outb, + l_osd_op_rw_lat, + l_osd_op_rw_lat_inb_hist, + l_osd_op_rw_lat_outb_hist, + l_osd_op_rw_process_lat, + l_osd_op_rw_prepare_lat, + + l_osd_op_before_queue_op_lat, + l_osd_op_before_dequeue_op_lat, + + l_osd_sop, + l_osd_sop_inb, + l_osd_sop_lat, + l_osd_sop_w, + l_osd_sop_w_inb, + l_osd_sop_w_lat, + l_osd_sop_pull, + l_osd_sop_pull_lat, + l_osd_sop_push, + l_osd_sop_push_inb, + l_osd_sop_push_lat, + + l_osd_pull, + l_osd_push, + l_osd_push_outb, + + l_osd_rop, + l_osd_rbytes, + + l_osd_loadavg, + l_osd_cached_crc, + l_osd_cached_crc_adjusted, + l_osd_missed_crc, + + l_osd_pg, + l_osd_pg_primary, + l_osd_pg_replica, + l_osd_pg_stray, + l_osd_pg_removing, + l_osd_hb_to, + l_osd_map, + l_osd_mape, + l_osd_mape_dup, + + l_osd_waiting_for_map, + + l_osd_map_cache_hit, + l_osd_map_cache_miss, + l_osd_map_cache_miss_low, + l_osd_map_cache_miss_low_avg, + l_osd_map_bl_cache_hit, + l_osd_map_bl_cache_miss, + + l_osd_stat_bytes, + l_osd_stat_bytes_used, + l_osd_stat_bytes_avail, + + l_osd_copyfrom, + + l_osd_tier_promote, + l_osd_tier_flush, + l_osd_tier_flush_fail, 
+ l_osd_tier_try_flush, + l_osd_tier_try_flush_fail, + l_osd_tier_evict, + l_osd_tier_whiteout, + l_osd_tier_dirty, + l_osd_tier_clean, + l_osd_tier_delay, + l_osd_tier_proxy_read, + l_osd_tier_proxy_write, + + l_osd_agent_wake, + l_osd_agent_skip, + l_osd_agent_flush, + l_osd_agent_evict, + + l_osd_object_ctx_cache_hit, + l_osd_object_ctx_cache_total, + + l_osd_op_cache_hit, + l_osd_tier_flush_lat, + l_osd_tier_promote_lat, + l_osd_tier_r_lat, + + l_osd_pg_info, + l_osd_pg_fastinfo, + l_osd_pg_biginfo, + + l_osd_last, +}; + +PerfCounters *build_osd_logger(CephContext *cct); + +// PeeringState perf counters +enum { + rs_first = 20000, + rs_initial_latency, + rs_started_latency, + rs_reset_latency, + rs_start_latency, + rs_primary_latency, + rs_peering_latency, + rs_backfilling_latency, + rs_waitremotebackfillreserved_latency, + rs_waitlocalbackfillreserved_latency, + rs_notbackfilling_latency, + rs_repnotrecovering_latency, + rs_repwaitrecoveryreserved_latency, + rs_repwaitbackfillreserved_latency, + rs_reprecovering_latency, + rs_activating_latency, + rs_waitlocalrecoveryreserved_latency, + rs_waitremoterecoveryreserved_latency, + rs_recovering_latency, + rs_recovered_latency, + rs_clean_latency, + rs_active_latency, + rs_replicaactive_latency, + rs_stray_latency, + rs_getinfo_latency, + rs_getlog_latency, + rs_waitactingchange_latency, + rs_incomplete_latency, + rs_down_latency, + rs_getmissing_latency, + rs_waitupthru_latency, + rs_notrecovering_latency, + rs_last, +}; + +PerfCounters *build_recoverystate_perf(CephContext *cct); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc new file mode 100644 index 000000000..13358560f --- /dev/null +++ b/src/osd/osd_types.cc @@ -0,0 +1,7212 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +#include "include/ceph_features.h" +#include "include/encoding.h" +#include "include/stringify.h" +extern "C" { +#include "crush/hash.h" +} + +#include "common/Formatter.h" +#include "common/StackStringStream.h" +#include "OSDMap.h" +#include "osd_types.h" +#include "os/Transaction.h" + +using std::list; +using std::make_pair; +using std::map; +using std::ostream; +using std::pair; +using std::set; +using std::string; +using std::unique_ptr; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::decode_nohead; +using ceph::encode; +using ceph::encode_nohead; +using ceph::Formatter; +using ceph::make_timespan; +using ceph::JSONFormatter; + +using namespace std::literals; + +const char *ceph_osd_flag_name(unsigned flag) +{ + switch (flag) { + case CEPH_OSD_FLAG_ACK: return "ack"; + case CEPH_OSD_FLAG_ONNVRAM: return "onnvram"; + case CEPH_OSD_FLAG_ONDISK: return "ondisk"; + case CEPH_OSD_FLAG_RETRY: return "retry"; + case CEPH_OSD_FLAG_READ: return "read"; + case CEPH_OSD_FLAG_WRITE: return "write"; + case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap"; + case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old"; + case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads"; + case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec"; + case CEPH_OSD_FLAG_PGOP: return "pgop"; + case CEPH_OSD_FLAG_EXEC: return "exec"; + case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public"; + case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads"; + case CEPH_OSD_FLAG_RWORDERED: return "rwordered"; + case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache"; + case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks"; + case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay"; + case CEPH_OSD_FLAG_FLUSH: return "flush"; + case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone"; + case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc"; + case CEPH_OSD_FLAG_REDIRECTED: return "redirected"; + case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected"; + case CEPH_OSD_FLAG_FULL_TRY: return "full_try"; + case CEPH_OSD_FLAG_FULL_FORCE: return "full_force"; + case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect"; + case CEPH_OSD_FLAG_RETURNVEC: return "returnvec"; + default: return "???"; + } +} + +string ceph_osd_flag_string(unsigned flags) +{ + string s; + for (unsigned i=0; i<32; ++i) { + if (flags & (1u<dump_string("alert", s); + } + } +} + +// -- osd_reqid_t -- +void osd_reqid_t::dump(Formatter *f) const +{ + f->dump_stream("name") << name; + f->dump_int("inc", inc); + f->dump_unsigned("tid", tid); +} + +void osd_reqid_t::generate_test_instances(list& o) +{ + o.push_back(new osd_reqid_t); + o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678)); +} + +// -- object_locator_t -- + +void object_locator_t::encode(ceph::buffer::list& bl) const +{ + // verify that nobody's corrupted the locator + ceph_assert(hash == -1 || key.empty()); + __u8 encode_compat = 3; + ENCODE_START(6, encode_compat, bl); + encode(pool, bl); + int32_t preferred = -1; // tell old code there is no preferred osd (-1). 
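Aside, for illustration only: object_locator_t follows the usual versioned-encoding pattern, where the writer records a struct version (plus the oldest compatible version) and the reader branches on struct_v, filling defaults for fields that older writers never emitted (hash = -1 below). A stripped-down sketch of the same idea using a plain byte vector instead of bufferlist and the ENCODE_START/DECODE_START macros; MiniLocator, put_u32 and get_u32 are hypothetical helpers for the example.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy wire helpers: append/read fixed-width little-endian integers.
static void put_u32(std::vector<uint8_t>& b, uint32_t v) {
  for (int i = 0; i < 4; ++i) b.push_back((v >> (8 * i)) & 0xff);
}
static uint32_t get_u32(const std::vector<uint8_t>& b, size_t& off) {
  uint32_t v = 0;
  for (int i = 0; i < 4; ++i) v |= uint32_t(b[off++]) << (8 * i);
  return v;
}

struct MiniLocator {
  int32_t pool = -1;
  int32_t hash = -1;   // only present since "version 6" in this toy scheme

  // Writer: emit a struct version first, then the fields that version has.
  std::vector<uint8_t> encode(uint8_t version) const {
    std::vector<uint8_t> bl;
    bl.push_back(version);
    put_u32(bl, uint32_t(pool));
    if (version >= 6) put_u32(bl, uint32_t(hash));
    return bl;
  }
  // Reader: branch on struct_v and default fields older writers lack.
  static MiniLocator decode(const std::vector<uint8_t>& bl) {
    size_t off = 0;
    uint8_t struct_v = bl[off++];
    MiniLocator loc;
    loc.pool = int32_t(get_u32(bl, off));
    loc.hash = (struct_v >= 6) ? int32_t(get_u32(bl, off)) : -1;
    return loc;
  }
};

int main() {
  MiniLocator l;
  l.pool = 7;
  l.hash = 0x1234;
  MiniLocator new_path = MiniLocator::decode(l.encode(6));
  MiniLocator old_path = MiniLocator::decode(l.encode(3));  // pre-hash writer
  assert(new_path.hash == 0x1234);
  assert(old_path.hash == -1);   // defaulted, just like struct_v < 6 above
  std::cout << "pool " << new_path.pool << std::endl;
  return 0;
}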
+ encode(preferred, bl); + encode(key, bl); + encode(nspace, bl); + encode(hash, bl); + if (hash != -1) + encode_compat = std::max(encode_compat, 6); // need to interpret the hash + ENCODE_FINISH_NEW_COMPAT(bl, encode_compat); +} + +void object_locator_t::decode(ceph::buffer::list::const_iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p); + if (struct_v < 2) { + int32_t op; + decode(op, p); + pool = op; + int16_t pref; + decode(pref, p); + } else { + decode(pool, p); + int32_t preferred; + decode(preferred, p); + } + decode(key, p); + if (struct_v >= 5) + decode(nspace, p); + if (struct_v >= 6) + decode(hash, p); + else + hash = -1; + DECODE_FINISH(p); + // verify that nobody's corrupted the locator + ceph_assert(hash == -1 || key.empty()); +} + +void object_locator_t::dump(Formatter *f) const +{ + f->dump_int("pool", pool); + f->dump_string("key", key); + f->dump_string("namespace", nspace); + f->dump_int("hash", hash); +} + +void object_locator_t::generate_test_instances(list& o) +{ + o.push_back(new object_locator_t); + o.push_back(new object_locator_t(123)); + o.push_back(new object_locator_t(123, 876)); + o.push_back(new object_locator_t(1, "n2")); + o.push_back(new object_locator_t(1234, "", "key")); + o.push_back(new object_locator_t(12, "n1", "key2")); +} + +// -- request_redirect_t -- +void request_redirect_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(redirect_locator, bl); + encode(redirect_object, bl); + // legacy of the removed osd_instructions member + encode((uint32_t)0, bl); + ENCODE_FINISH(bl); +} + +void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + uint32_t legacy_osd_instructions_len; + decode(redirect_locator, bl); + decode(redirect_object, bl); + decode(legacy_osd_instructions_len, bl); + if (legacy_osd_instructions_len) { + bl += legacy_osd_instructions_len; + } + DECODE_FINISH(bl); +} + +void request_redirect_t::dump(Formatter *f) const +{ + f->dump_string("object", redirect_object); + f->open_object_section("locator"); + redirect_locator.dump(f); + f->close_section(); // locator +} + +void request_redirect_t::generate_test_instances(list& o) +{ + object_locator_t loc(1, "redir_obj"); + o.push_back(new request_redirect_t()); + o.push_back(new request_redirect_t(loc, 0)); + o.push_back(new request_redirect_t(loc, "redir_obj")); + o.push_back(new request_redirect_t(loc)); +} + +void objectstore_perf_stat_t::dump(Formatter *f) const +{ + // *_ms values just for compatibility. + f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0); + f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0); + f->dump_unsigned("commit_latency_ns", os_commit_latency_ns); + f->dump_unsigned("apply_latency_ns", os_apply_latency_ns); +} + +void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const +{ + uint8_t target_v = 2; + if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) { + target_v = 1; + } + ENCODE_START(target_v, target_v, bl); + if (target_v >= 2) { + encode(os_commit_latency_ns, bl); + encode(os_apply_latency_ns, bl); + } else { + constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count(); + uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS; + uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS; + encode(commit_latency_ms, bl); // for compatibility with older monitor. + encode(apply_latency_ms, bl); // for compatibility with older monitor. 
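Aside, for illustration only: objectstore_perf_stat_t keeps latencies in nanoseconds but must still talk to monitors without the OS_PERF_STAT_NS feature, so the encoder downgrades to whole milliseconds on the legacy path and the decoder scales old payloads back up, losing sub-millisecond detail. A small self-contained sketch of just that unit conversion, independent of the feature-bit and encoding machinery.

#include <cassert>
#include <chrono>
#include <cstdint>
#include <iostream>

int main() {
  using namespace std::chrono_literals;
  constexpr uint64_t NS_PER_MS = std::chrono::nanoseconds(1ms).count();

  uint64_t commit_latency_ns = 20'000'000;        // 20 ms measured internally
  // Legacy peer: send whole milliseconds only.
  uint32_t commit_latency_ms = commit_latency_ns / NS_PER_MS;
  assert(commit_latency_ms == 20);

  // Decoding a legacy payload scales back up; this round trip is exact here.
  uint64_t round_tripped_ns = uint64_t(commit_latency_ms) * NS_PER_MS;
  assert(round_tripped_ns == 20'000'000);

  // Anything below 1 ms is truncated away on the legacy path.
  uint64_t fine_ns = 20'700'000;                  // 20.7 ms
  uint64_t lossy_ns = (fine_ns / NS_PER_MS) * NS_PER_MS;
  assert(lossy_ns == 20'000'000);
  std::cout << "legacy path keeps " << lossy_ns << " ns" << std::endl;
  return 0;
}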
+ } + ENCODE_FINISH(bl); +} + +void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(2, bl); + if (struct_v >= 2) { + decode(os_commit_latency_ns, bl); + decode(os_apply_latency_ns, bl); + } else { + uint32_t commit_latency_ms; + uint32_t apply_latency_ms; + decode(commit_latency_ms, bl); + decode(apply_latency_ms, bl); + constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count(); + os_commit_latency_ns = commit_latency_ms * NS_PER_MS; + os_apply_latency_ns = apply_latency_ms * NS_PER_MS; + } + DECODE_FINISH(bl); +} + +void objectstore_perf_stat_t::generate_test_instances(std::list& o) +{ + o.push_back(new objectstore_perf_stat_t()); + o.push_back(new objectstore_perf_stat_t()); + o.back()->os_commit_latency_ns = 20000000; + o.back()->os_apply_latency_ns = 30000000; +} + +// -- osd_stat_t -- +void osd_stat_t::dump(Formatter *f, bool with_net) const +{ + f->dump_unsigned("up_from", up_from); + f->dump_unsigned("seq", seq); + f->dump_unsigned("num_pgs", num_pgs); + f->dump_unsigned("num_osds", num_osds); + f->dump_unsigned("num_per_pool_osds", num_per_pool_osds); + f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds); + + /// dump legacy stats fields to ensure backward compatibility. + f->dump_unsigned("kb", statfs.kb()); + f->dump_unsigned("kb_used", statfs.kb_used_raw()); + f->dump_unsigned("kb_used_data", statfs.kb_used_data()); + f->dump_unsigned("kb_used_omap", statfs.kb_used_omap()); + f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata()); + f->dump_unsigned("kb_avail", statfs.kb_avail()); + //////////////////// + + f->open_object_section("statfs"); + statfs.dump(f); + f->close_section(); + f->open_array_section("hb_peers"); + for (auto p : hb_peers) + f->dump_int("osd", p); + f->close_section(); + f->dump_int("snap_trim_queue_len", snap_trim_queue_len); + f->dump_int("num_snap_trimming", num_snap_trimming); + f->dump_int("num_shards_repaired", num_shards_repaired); + f->open_object_section("op_queue_age_hist"); + op_queue_age_hist.dump(f); + f->close_section(); + f->open_object_section("perf_stat"); + os_perf_stat.dump(f); + f->close_section(); + f->open_array_section("alerts"); + ::dump(f, os_alerts); + f->close_section(); + if (with_net) { + dump_ping_time(f); + } +} + +void osd_stat_t::dump_ping_time(Formatter *f) const +{ + f->open_array_section("network_ping_times"); + for (auto &i : hb_pingtime) { + f->open_object_section("entry"); + f->dump_int("osd", i.first); + const time_t lu(i.second.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + f->dump_string("last update", lustr); + f->open_array_section("interfaces"); + f->open_object_section("interface"); + f->dump_string("interface", "back"); + f->open_object_section("average"); + f->dump_float("1min", i.second.back_pingtime[0]/1000.0); + f->dump_float("5min", i.second.back_pingtime[1]/1000.0); + f->dump_float("15min", i.second.back_pingtime[2]/1000.0); + f->close_section(); // average + f->open_object_section("min"); + f->dump_float("1min", i.second.back_min[0]/1000.0); + f->dump_float("5min", i.second.back_min[1]/1000.0); + f->dump_float("15min", i.second.back_min[2]/1000.0); + f->close_section(); // min + f->open_object_section("max"); + f->dump_float("1min", i.second.back_max[0]/1000.0); + f->dump_float("5min", i.second.back_max[1]/1000.0); + f->dump_float("15min", i.second.back_max[2]/1000.0); + f->close_section(); // max + f->dump_float("last", i.second.back_last/1000.0); + 
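Aside, for illustration only: dump_ping_time() below shows the Formatter usage discipline, where every open_object_section()/open_array_section() must be paired with a close_section(), and the nesting (entry -> interfaces -> interface -> average/min/max) becomes the structure the caller sees. A toy sketch that only tracks that nesting; MiniFormatter is a hypothetical stand-in for ceph::Formatter, not its API.

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in that prints an indented outline and checks that sections
// are opened and closed in a balanced way.
class MiniFormatter {
  std::vector<std::string> stack;
  std::string indent() const { return std::string(2 * stack.size(), ' '); }
public:
  void open_section(const std::string& name) {
    std::cout << indent() << name << ":" << std::endl;
    stack.push_back(name);
  }
  void dump_float(const std::string& name, double v) {
    std::cout << indent() << name << " = " << v << std::endl;
  }
  void close_section() {
    assert(!stack.empty());        // closing without an open would be a bug
    stack.pop_back();
  }
  bool balanced() const { return stack.empty(); }
};

int main() {
  MiniFormatter f;
  f.open_section("entry");
  f.open_section("interfaces");
  f.open_section("back");
  f.open_section("average");
  f.dump_float("1min", 1.0);
  f.dump_float("5min", 0.9);
  f.close_section();               // average
  f.close_section();               // back
  f.close_section();               // interfaces
  f.close_section();               // entry
  assert(f.balanced());
  return 0;
}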
f->close_section(); // interface + + if (i.second.front_pingtime[0] != 0) { + f->open_object_section("interface"); + f->dump_string("interface", "front"); + f->open_object_section("average"); + f->dump_float("1min", i.second.front_pingtime[0]/1000.0); + f->dump_float("5min", i.second.front_pingtime[1]/1000.0); + f->dump_float("15min", i.second.front_pingtime[2]/1000.0); + f->close_section(); // average + f->open_object_section("min"); + f->dump_float("1min", i.second.front_min[0]/1000.0); + f->dump_float("5min", i.second.front_min[1]/1000.0); + f->dump_float("15min", i.second.front_min[2]/1000.0); + f->close_section(); // min + f->open_object_section("max"); + f->dump_float("1min", i.second.front_max[0]/1000.0); + f->dump_float("5min", i.second.front_max[1]/1000.0); + f->dump_float("15min", i.second.front_max[2]/1000.0); + f->close_section(); // max + f->dump_float("last", i.second.front_last/1000.0); + f->close_section(); // interface + } + f->close_section(); // interfaces + f->close_section(); // entry + } + f->close_section(); // network_ping_time +} + +void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(14, 2, bl); + + //////// for compatibility //////// + int64_t kb = statfs.kb(); + int64_t kb_used = statfs.kb_used_raw(); + int64_t kb_avail = statfs.kb_avail(); + encode(kb, bl); + encode(kb_used, bl); + encode(kb_avail, bl); + /////////////////////////////////// + + encode(snap_trim_queue_len, bl); + encode(num_snap_trimming, bl); + encode(hb_peers, bl); + encode((uint32_t)0, bl); + encode(op_queue_age_hist, bl); + encode(os_perf_stat, bl, features); + encode(up_from, bl); + encode(seq, bl); + encode(num_pgs, bl); + + //////// for compatibility //////// + int64_t kb_used_data = statfs.kb_used_data(); + int64_t kb_used_omap = statfs.kb_used_omap(); + int64_t kb_used_meta = statfs.kb_used_internal_metadata(); + encode(kb_used_data, bl); + encode(kb_used_omap, bl); + encode(kb_used_meta, bl); + encode(statfs, bl); + /////////////////////////////////// + encode(os_alerts, bl); + encode(num_shards_repaired, bl); + encode(num_osds, bl); + encode(num_per_pool_osds, bl); + encode(num_per_pool_omap_osds, bl); + + // hb_pingtime map + encode((int)hb_pingtime.size(), bl); + for (auto i : hb_pingtime) { + encode(i.first, bl); // osd + encode(i.second.last_update, bl); + encode(i.second.back_pingtime[0], bl); + encode(i.second.back_pingtime[1], bl); + encode(i.second.back_pingtime[2], bl); + encode(i.second.back_min[0], bl); + encode(i.second.back_min[1], bl); + encode(i.second.back_min[2], bl); + encode(i.second.back_max[0], bl); + encode(i.second.back_max[1], bl); + encode(i.second.back_max[2], bl); + encode(i.second.back_last, bl); + encode(i.second.front_pingtime[0], bl); + encode(i.second.front_pingtime[1], bl); + encode(i.second.front_pingtime[2], bl); + encode(i.second.front_min[0], bl); + encode(i.second.front_min[1], bl); + encode(i.second.front_min[2], bl); + encode(i.second.front_max[0], bl); + encode(i.second.front_max[1], bl); + encode(i.second.front_max[2], bl); + encode(i.second.front_last, bl); + } + ENCODE_FINISH(bl); +} + +void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + int64_t kb, kb_used,kb_avail; + int64_t kb_used_data, kb_used_omap, kb_used_meta; + DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl); + decode(kb, bl); + decode(kb_used, bl); + decode(kb_avail, bl); + decode(snap_trim_queue_len, bl); + decode(num_snap_trimming, bl); + decode(hb_peers, bl); + vector num_hb_out; + decode(num_hb_out, bl); + if (struct_v >= 3) + 
decode(op_queue_age_hist, bl); + if (struct_v >= 4) + decode(os_perf_stat, bl); + if (struct_v >= 6) { + decode(up_from, bl); + decode(seq, bl); + } + if (struct_v >= 7) { + decode(num_pgs, bl); + } + if (struct_v >= 8) { + decode(kb_used_data, bl); + decode(kb_used_omap, bl); + decode(kb_used_meta, bl); + } else { + kb_used_data = kb_used; + kb_used_omap = 0; + kb_used_meta = 0; + } + if (struct_v >= 9) { + decode(statfs, bl); + } else { + statfs.reset(); + statfs.total = kb << 10; + statfs.available = kb_avail << 10; + // actually it's totally unexpected to have ststfs.total < statfs.available + // here but unfortunately legacy generate_test_instances produced such a + // case hence inserting some handling rather than assert + statfs.internally_reserved = + statfs.total > statfs.available ? statfs.total - statfs.available : 0; + kb_used <<= 10; + if ((int64_t)statfs.internally_reserved > kb_used) { + statfs.internally_reserved -= kb_used; + } else { + statfs.internally_reserved = 0; + } + statfs.allocated = kb_used_data << 10; + statfs.omap_allocated = kb_used_omap << 10; + statfs.internal_metadata = kb_used_meta << 10; + } + if (struct_v >= 10) { + decode(os_alerts, bl); + } else { + os_alerts.clear(); + } + if (struct_v >= 11) { + decode(num_shards_repaired, bl); + } else { + num_shards_repaired = 0; + } + if (struct_v >= 12) { + decode(num_osds, bl); + decode(num_per_pool_osds, bl); + } else { + num_osds = 0; + num_per_pool_osds = 0; + } + if (struct_v >= 13) { + decode(num_per_pool_omap_osds, bl); + } else { + num_per_pool_omap_osds = 0; + } + hb_pingtime.clear(); + if (struct_v >= 14) { + int count; + decode(count, bl); + for (int i = 0 ; i < count ; i++) { + int osd; + decode(osd, bl); + struct Interfaces ifs; + decode(ifs.last_update, bl); + decode(ifs.back_pingtime[0],bl); + decode(ifs.back_pingtime[1], bl); + decode(ifs.back_pingtime[2], bl); + decode(ifs.back_min[0],bl); + decode(ifs.back_min[1], bl); + decode(ifs.back_min[2], bl); + decode(ifs.back_max[0],bl); + decode(ifs.back_max[1], bl); + decode(ifs.back_max[2], bl); + decode(ifs.back_last, bl); + decode(ifs.front_pingtime[0], bl); + decode(ifs.front_pingtime[1], bl); + decode(ifs.front_pingtime[2], bl); + decode(ifs.front_min[0], bl); + decode(ifs.front_min[1], bl); + decode(ifs.front_min[2], bl); + decode(ifs.front_max[0], bl); + decode(ifs.front_max[1], bl); + decode(ifs.front_max[2], bl); + decode(ifs.front_last, bl); + hb_pingtime[osd] = ifs; + } + } + DECODE_FINISH(bl); +} + +void osd_stat_t::generate_test_instances(std::list& o) +{ + o.push_back(new osd_stat_t); + + o.push_back(new osd_stat_t); + list ll; + store_statfs_t::generate_test_instances(ll); + o.back()->statfs = *ll.back(); + o.back()->hb_peers.push_back(7); + o.back()->snap_trim_queue_len = 8; + o.back()->num_snap_trimming = 99; + o.back()->num_shards_repaired = 101; + o.back()->os_alerts[0].emplace( + "some alert", "some alert details"); + o.back()->os_alerts[1].emplace( + "some alert2", "some alert2 details"); + struct Interfaces gen_interfaces = { + 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, + { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 }; + o.back()->hb_pingtime[20] = gen_interfaces; + gen_interfaces = { + 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; + o.back()->hb_pingtime[30] = gen_interfaces; +} + +// -- pg_t -- + +int pg_t::print(char *o, int maxlen) const +{ + return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps()); +} + +bool pg_t::parse(const 
char *s) +{ + uint64_t ppool; + uint32_t pseed; + int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed); + if (r < 2) + return false; + m_pool = ppool; + m_seed = pseed; + return true; +} + +bool spg_t::parse(const char *s) +{ + shard = shard_id_t::NO_SHARD; + uint64_t ppool; + uint32_t pseed; + uint32_t pshard; + int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed); + if (r < 2) + return false; + pgid.set_pool(ppool); + pgid.set_ps(pseed); + + const char *p = strchr(s, 's'); + if (p) { + r = sscanf(p, "s%u", &pshard); + if (r == 1) { + shard = shard_id_t(pshard); + } else { + return false; + } + } + return true; +} + +char *spg_t::calc_name(char *buf, const char *suffix_backwords) const +{ + while (*suffix_backwords) + *--buf = *suffix_backwords++; + + if (!is_no_shard()) { + buf = ritoa((uint8_t)shard.id, buf); + *--buf = 's'; + } + + return pgid.calc_name(buf, ""); +} + +ostream& operator<<(ostream& out, const spg_t &pg) +{ + char buf[spg_t::calc_name_buf_size]; + buf[spg_t::calc_name_buf_size - 1] = '\0'; + out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, ""); + return out; +} + +pg_t pg_t::get_ancestor(unsigned old_pg_num) const +{ + int old_bits = cbits(old_pg_num); + int old_mask = (1 << old_bits) - 1; + pg_t ret = *this; + ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask); + return ret; +} + +bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set *children) const +{ + //ceph_assert(m_seed < old_pg_num); + if (m_seed >= old_pg_num) { + // degenerate case + return false; + } + if (new_pg_num <= old_pg_num) + return false; + + bool split = false; + if (true) { + unsigned old_bits = cbits(old_pg_num); + unsigned old_mask = (1 << old_bits) - 1; + for (unsigned n = 1; ; n++) { + unsigned next_bit = (n << (old_bits-1)); + unsigned s = next_bit | m_seed; + + if (s < old_pg_num || s == m_seed) + continue; + if (s >= new_pg_num) + break; + if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) { + split = true; + if (children) + children->insert(pg_t(s, m_pool)); + } + } + } + if (false) { + // brute force + int old_bits = cbits(old_pg_num); + int old_mask = (1 << old_bits) - 1; + for (unsigned x = old_pg_num; x < new_pg_num; ++x) { + unsigned o = ceph_stable_mod(x, old_pg_num, old_mask); + if (o == m_seed) { + split = true; + children->insert(pg_t(x, m_pool)); + } + } + } + return split; +} + +unsigned pg_t::get_split_bits(unsigned pg_num) const { + if (pg_num == 1) + return 0; + ceph_assert(pg_num > 1); + + // Find unique p such that pg_num \in [2^(p-1), 2^p) + unsigned p = cbits(pg_num); + ceph_assert(p); // silence coverity #751330 + + if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1)))) + return p; + else + return p - 1; +} + +bool pg_t::is_merge_source( + unsigned old_pg_num, + unsigned new_pg_num, + pg_t *parent) const +{ + if (m_seed < old_pg_num && + m_seed >= new_pg_num) { + if (parent) { + pg_t t = *this; + while (t.m_seed >= new_pg_num) { + t = t.get_parent(); + } + *parent = t; + } + return true; + } + return false; +} + +pg_t pg_t::get_parent() const +{ + unsigned bits = cbits(m_seed); + ceph_assert(bits); + pg_t retval = *this; + retval.m_seed &= ~((~0)<<(bits - 1)); + return retval; +} + +hobject_t pg_t::get_hobj_start() const +{ + return hobject_t(object_t(), string(), 0, m_seed, m_pool, + string()); +} + +hobject_t pg_t::get_hobj_end(unsigned pg_num) const +{ + // note: this assumes a bitwise sort; with the legacy nibblewise + // sort a PG did not always cover a single contiguous range of the + // 
(bit-reversed) hash range. + unsigned bits = get_split_bits(pg_num); + uint64_t rev_start = hobject_t::_reverse_bits(m_seed); + uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1; + if (rev_end >= 0x100000000) { + ceph_assert(rev_end == 0x100000000); + return hobject_t::get_max(); + } else { + return hobject_t(object_t(), string(), CEPH_NOSNAP, + hobject_t::_reverse_bits(rev_end), m_pool, + string()); + } +} + +void pg_t::dump(Formatter *f) const +{ + f->dump_unsigned("pool", m_pool); + f->dump_unsigned("seed", m_seed); +} + +void pg_t::generate_test_instances(list& o) +{ + o.push_back(new pg_t); + o.push_back(new pg_t(1, 2)); + o.push_back(new pg_t(13123, 3)); + o.push_back(new pg_t(131223, 4)); +} + +char *pg_t::calc_name(char *buf, const char *suffix_backwords) const +{ + while (*suffix_backwords) + *--buf = *suffix_backwords++; + + buf = ritoa(m_seed, buf); + + *--buf = '.'; + + return ritoa(m_pool, buf); +} + +ostream& operator<<(ostream& out, const pg_t &pg) +{ + char buf[pg_t::calc_name_buf_size]; + buf[pg_t::calc_name_buf_size - 1] = '\0'; + out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, ""); + return out; +} + + +// -- coll_t -- + +void coll_t::calc_str() +{ + switch (type) { + case TYPE_META: + strcpy(_str_buff, "meta"); + _str = _str_buff; + break; + case TYPE_PG: + _str_buff[spg_t::calc_name_buf_size - 1] = '\0'; + _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_"); + break; + case TYPE_PG_TEMP: + _str_buff[spg_t::calc_name_buf_size - 1] = '\0'; + _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_"); + break; + default: + ceph_abort_msg("unknown collection type"); + } +} + +bool coll_t::parse(const std::string& s) +{ + if (s == "meta") { + type = TYPE_META; + pgid = spg_t(); + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + if (s.find("_head") == s.length() - 5 && + pgid.parse(s.substr(0, s.length() - 5))) { + type = TYPE_PG; + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + if (s.find("_TEMP") == s.length() - 5 && + pgid.parse(s.substr(0, s.length() - 5))) { + type = TYPE_PG_TEMP; + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + return false; +} + +void coll_t::encode(ceph::buffer::list& bl) const +{ + using ceph::encode; + // when changing this, remember to update encoded_size() too. + if (is_temp()) { + // can't express this as v2... + __u8 struct_v = 3; + encode(struct_v, bl); + encode(to_str(), bl); + } else { + __u8 struct_v = 2; + encode(struct_v, bl); + encode((__u8)type, bl); + encode(pgid, bl); + snapid_t snap = CEPH_NOSNAP; + encode(snap, bl); + } +} + +size_t coll_t::encoded_size() const +{ + size_t r = sizeof(__u8); + if (is_temp()) { + // v3 + r += sizeof(__u32); + if (_str) { + r += strlen(_str); + } + } else { + // v2 + // 1. type + r += sizeof(__u8); + // 2. pgid + // - encoding header + r += sizeof(ceph_le32) + 2 * sizeof(__u8); + // - pg_t + r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t); + // - shard_id_t + r += sizeof(int8_t); + // 3. 
snapid_t + r += sizeof(uint64_t); + } + + return r; +} + +void coll_t::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + __u8 struct_v; + decode(struct_v, bl); + switch (struct_v) { + case 1: + { + snapid_t snap; + decode(pgid, bl); + decode(snap, bl); + + // infer the type + if (pgid == spg_t() && snap == 0) { + type = TYPE_META; + } else { + type = TYPE_PG; + } + removal_seq = 0; + } + break; + + case 2: + { + __u8 _type; + snapid_t snap; + decode(_type, bl); + decode(pgid, bl); + decode(snap, bl); + type = (type_t)_type; + removal_seq = 0; + } + break; + + case 3: + { + string str; + decode(str, bl); + bool ok = parse(str); + if (!ok) + throw std::domain_error(std::string("unable to parse pg ") + str); + } + break; + + default: + { + CachedStackStringStream css; + *css << "coll_t::decode(): don't know how to decode version " + << struct_v; + throw std::domain_error(css->str()); + } + } +} + +void coll_t::dump(Formatter *f) const +{ + f->dump_unsigned("type_id", (unsigned)type); + if (type != TYPE_META) + f->dump_stream("pgid") << pgid; + f->dump_string("name", to_str()); +} + +void coll_t::generate_test_instances(list& o) +{ + o.push_back(new coll_t()); + o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD))); + o.push_back(new coll_t(o.back()->get_temp())); + o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12)))); + o.push_back(new coll_t(o.back()->get_temp())); + o.push_back(new coll_t()); +} + +// --- + +std::string pg_vector_string(const vector &a) +{ + CachedStackStringStream css; + *css << "["; + for (auto i = a.cbegin(); i != a.cend(); ++i) { + if (i != a.begin()) + *css << ","; + if (*i != CRUSH_ITEM_NONE) + *css << *i; + else + *css << "NONE"; + } + *css << "]"; + return css->str(); +} + +std::string pg_state_string(uint64_t state) +{ + CachedStackStringStream css; + if (state & PG_STATE_STALE) + *css << "stale+"; + if (state & PG_STATE_CREATING) + *css << "creating+"; + if (state & PG_STATE_ACTIVE) + *css << "active+"; + if (state & PG_STATE_ACTIVATING) + *css << "activating+"; + if (state & PG_STATE_CLEAN) + *css << "clean+"; + if (state & PG_STATE_RECOVERY_WAIT) + *css << "recovery_wait+"; + if (state & PG_STATE_RECOVERY_TOOFULL) + *css << "recovery_toofull+"; + if (state & PG_STATE_RECOVERING) + *css << "recovering+"; + if (state & PG_STATE_FORCED_RECOVERY) + *css << "forced_recovery+"; + if (state & PG_STATE_DOWN) + *css << "down+"; + if (state & PG_STATE_RECOVERY_UNFOUND) + *css << "recovery_unfound+"; + if (state & PG_STATE_BACKFILL_UNFOUND) + *css << "backfill_unfound+"; + if (state & PG_STATE_UNDERSIZED) + *css << "undersized+"; + if (state & PG_STATE_DEGRADED) + *css << "degraded+"; + if (state & PG_STATE_REMAPPED) + *css << "remapped+"; + if (state & PG_STATE_PREMERGE) + *css << "premerge+"; + if (state & PG_STATE_SCRUBBING) + *css << "scrubbing+"; + if (state & PG_STATE_DEEP_SCRUB) + *css << "deep+"; + if (state & PG_STATE_INCONSISTENT) + *css << "inconsistent+"; + if (state & PG_STATE_PEERING) + *css << "peering+"; + if (state & PG_STATE_REPAIR) + *css << "repair+"; + if (state & PG_STATE_BACKFILL_WAIT) + *css << "backfill_wait+"; + if (state & PG_STATE_BACKFILLING) + *css << "backfilling+"; + if (state & PG_STATE_FORCED_BACKFILL) + *css << "forced_backfill+"; + if (state & PG_STATE_BACKFILL_TOOFULL) + *css << "backfill_toofull+"; + if (state & PG_STATE_INCOMPLETE) + *css << "incomplete+"; + if (state & PG_STATE_PEERED) + *css << "peered+"; + if (state & PG_STATE_SNAPTRIM) + *css << "snaptrim+"; + if (state & 
PG_STATE_SNAPTRIM_WAIT) + *css << "snaptrim_wait+"; + if (state & PG_STATE_SNAPTRIM_ERROR) + *css << "snaptrim_error+"; + if (state & PG_STATE_FAILED_REPAIR) + *css << "failed_repair+"; + if (state & PG_STATE_LAGGY) + *css << "laggy+"; + if (state & PG_STATE_WAIT) + *css << "wait+"; + auto ret = css->str(); + if (ret.length() > 0) + ret.resize(ret.length() - 1); + else + ret = "unknown"; + return ret; +} + +std::optional pg_string_state(const std::string& state) +{ + std::optional type; + if (state == "active") + type = PG_STATE_ACTIVE; + else if (state == "clean") + type = PG_STATE_CLEAN; + else if (state == "down") + type = PG_STATE_DOWN; + else if (state == "recovery_unfound") + type = PG_STATE_RECOVERY_UNFOUND; + else if (state == "backfill_unfound") + type = PG_STATE_BACKFILL_UNFOUND; + else if (state == "premerge") + type = PG_STATE_PREMERGE; + else if (state == "scrubbing") + type = PG_STATE_SCRUBBING; + else if (state == "degraded") + type = PG_STATE_DEGRADED; + else if (state == "inconsistent") + type = PG_STATE_INCONSISTENT; + else if (state == "peering") + type = PG_STATE_PEERING; + else if (state == "repair") + type = PG_STATE_REPAIR; + else if (state == "recovering") + type = PG_STATE_RECOVERING; + else if (state == "forced_recovery") + type = PG_STATE_FORCED_RECOVERY; + else if (state == "backfill_wait") + type = PG_STATE_BACKFILL_WAIT; + else if (state == "incomplete") + type = PG_STATE_INCOMPLETE; + else if (state == "stale") + type = PG_STATE_STALE; + else if (state == "remapped") + type = PG_STATE_REMAPPED; + else if (state == "deep") + type = PG_STATE_DEEP_SCRUB; + else if (state == "backfilling") + type = PG_STATE_BACKFILLING; + else if (state == "forced_backfill") + type = PG_STATE_FORCED_BACKFILL; + else if (state == "backfill_toofull") + type = PG_STATE_BACKFILL_TOOFULL; + else if (state == "recovery_wait") + type = PG_STATE_RECOVERY_WAIT; + else if (state == "recovery_toofull") + type = PG_STATE_RECOVERY_TOOFULL; + else if (state == "undersized") + type = PG_STATE_UNDERSIZED; + else if (state == "activating") + type = PG_STATE_ACTIVATING; + else if (state == "peered") + type = PG_STATE_PEERED; + else if (state == "snaptrim") + type = PG_STATE_SNAPTRIM; + else if (state == "snaptrim_wait") + type = PG_STATE_SNAPTRIM_WAIT; + else if (state == "snaptrim_error") + type = PG_STATE_SNAPTRIM_ERROR; + else if (state == "creating") + type = PG_STATE_CREATING; + else if (state == "failed_repair") + type = PG_STATE_FAILED_REPAIR; + else if (state == "laggy") + type = PG_STATE_LAGGY; + else if (state == "wait") + type = PG_STATE_WAIT; + else if (state == "unknown") + type = 0; + else + type = std::nullopt; + return type; +} + +// -- eversion_t -- +string eversion_t::get_key_name() const +{ + std::string key(32, ' '); + get_key_name(&key[0]); + key.resize(31); // remove the null terminator + return key; +} + +// -- pool_snap_info_t -- +void pool_snap_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("snapid", snapid); + f->dump_stream("stamp") << stamp; + f->dump_string("name", name); +} + +void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGPOOL3) == 0) { + __u8 struct_v = 1; + encode(struct_v, bl); + encode(snapid, bl); + encode(stamp, bl); + encode(name, bl); + return; + } + ENCODE_START(2, 2, bl); + encode(snapid, bl); + encode(stamp, bl); + encode(name, bl); + ENCODE_FINISH(bl); +} + +void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + 
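+  // Decoding sketch (assuming the usual Ceph encoding macros): the v1
+  // encoding emitted above when CEPH_FEATURE_PGPOOL3 is absent carries no
+  // length header, so the LEGACY_COMPAT_LEN form below only expects a
+  // length word for struct_v >= 2.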
DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(snapid, bl); + decode(stamp, bl); + decode(name, bl); + DECODE_FINISH(bl); +} + +void pool_snap_info_t::generate_test_instances(list& o) +{ + o.push_back(new pool_snap_info_t); + o.push_back(new pool_snap_info_t); + o.back()->snapid = 1; + o.back()->stamp = utime_t(1, 2); + o.back()->name = "foo"; +} + +// -- pool_opts_t -- + +// The order of items in the list is important, therefore, +// you should always add to the end of the list when adding new options. + +typedef std::map opt_mapping_t; +static opt_mapping_t opt_mapping = boost::assign::map_list_of + ("scrub_min_interval", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE)) + ("scrub_max_interval", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE)) + ("deep_scrub_interval", pool_opts_t::opt_desc_t( + pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE)) + ("recovery_priority", pool_opts_t::opt_desc_t( + pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT)) + ("recovery_op_priority", pool_opts_t::opt_desc_t( + pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT)) + ("scrub_priority", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT)) + ("compression_mode", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR)) + ("compression_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR)) + ("compression_required_ratio", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE)) + ("compression_max_blob_size", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT)) + ("compression_min_blob_size", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT)) + ("csum_type", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_TYPE, pool_opts_t::INT)) + ("csum_max_block", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT)) + ("csum_min_block", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT)) + ("fingerprint_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR)) + ("pg_num_min", pool_opts_t::opt_desc_t( + pool_opts_t::PG_NUM_MIN, pool_opts_t::INT)) + ("target_size_bytes", pool_opts_t::opt_desc_t( + pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT)) + ("target_size_ratio", pool_opts_t::opt_desc_t( + pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE)) + ("pg_autoscale_bias", pool_opts_t::opt_desc_t( + pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE)) + ("read_lease_interval", pool_opts_t::opt_desc_t( + pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE)) + ("dedup_tier", pool_opts_t::opt_desc_t( + pool_opts_t::DEDUP_TIER, pool_opts_t::INT)) + ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::DEDUP_CHUNK_ALGORITHM, pool_opts_t::STR)) + ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t( + pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT)) + ("pg_num_max", pool_opts_t::opt_desc_t( + pool_opts_t::PG_NUM_MAX, pool_opts_t::INT)); + +bool pool_opts_t::is_opt_name(const std::string& name) +{ + return opt_mapping.count(name); +} + +pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) +{ + auto i = opt_mapping.find(name); + ceph_assert(i != opt_mapping.end()); + return i->second; +} + +bool pool_opts_t::is_set(pool_opts_t::key_t key) const +{ + return opts.count(key); +} + +const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const +{ + 
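+  // get() asserts the key is present (see below); a hypothetical caller
+  // would normally guard it with is_set(), e.g.:
+  //   if (opts.is_set(pool_opts_t::RECOVERY_PRIORITY)) {
+  //     const pool_opts_t::value_t& v = opts.get(pool_opts_t::RECOVERY_PRIORITY);
+  //   }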
auto i = opts.find(key); + ceph_assert(i != opts.end()); + return i->second; +} + +bool pool_opts_t::unset(pool_opts_t::key_t key) { + return opts.erase(key) > 0; +} + +class pool_opts_dumper_t : public boost::static_visitor<> { +public: + pool_opts_dumper_t(const std::string& name_, Formatter* f_) : + name(name_.c_str()), f(f_) {} + + void operator()(std::string s) const { + f->dump_string(name, s); + } + void operator()(int64_t i) const { + f->dump_int(name, i); + } + void operator()(double d) const { + f->dump_float(name, d); + } + +private: + const char* name; + Formatter* f; +}; + +void pool_opts_t::dump(const std::string& name, Formatter* f) const +{ + const opt_desc_t& desc = get_opt_desc(name); + auto i = opts.find(desc.key); + if (i == opts.end()) { + return; + } + boost::apply_visitor(pool_opts_dumper_t(name, f), i->second); +} + +void pool_opts_t::dump(Formatter* f) const +{ + for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) { + const std::string& name = i->first; + const opt_desc_t& desc = i->second; + auto j = opts.find(desc.key); + if (j == opts.end()) { + continue; + } + boost::apply_visitor(pool_opts_dumper_t(name, f), j->second); + } +} + +class pool_opts_encoder_t : public boost::static_visitor<> { +public: + explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features) + : bl(bl_), + features(features) {} + + void operator()(const std::string &s) const { + encode(static_cast(pool_opts_t::STR), bl); + encode(s, bl); + } + void operator()(int64_t i) const { + encode(static_cast(pool_opts_t::INT), bl); + if (HAVE_FEATURE(features, SERVER_NAUTILUS)) { + encode(i, bl); + } else { + encode(static_cast(i), bl); + } + } + void operator()(double d) const { + encode(static_cast(pool_opts_t::DOUBLE), bl); + encode(d, bl); + } + +private: + ceph::buffer::list& bl; + uint64_t features; +}; + +void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + unsigned v = 2; + if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 1; + } + ENCODE_START(v, 1, bl); + uint32_t n = static_cast(opts.size()); + encode(n, bl); + for (auto i = opts.cbegin(); i != opts.cend(); ++i) { + encode(static_cast(i->first), bl); + boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second); + } + ENCODE_FINISH(bl); +} + +void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + __u32 n; + decode(n, bl); + opts.clear(); + while (n--) { + int32_t k, t; + decode(k, bl); + decode(t, bl); + if (t == STR) { + std::string s; + decode(s, bl); + opts[static_cast(k)] = s; + } else if (t == INT) { + int64_t i; + if (struct_v >= 2) { + decode(i, bl); + } else { + int ii; + decode(ii, bl); + i = ii; + } + opts[static_cast(k)] = i; + } else if (t == DOUBLE) { + double d; + decode(d, bl); + opts[static_cast(k)] = d; + } else { + ceph_assert(!"invalid type"); + } + } + DECODE_FINISH(bl); +} + +ostream& operator<<(ostream& out, const pool_opts_t& opts) +{ + for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) { + const std::string& name = i->first; + const pool_opts_t::opt_desc_t& desc = i->second; + auto j = opts.opts.find(desc.key); + if (j == opts.opts.end()) { + continue; + } + out << " " << name << " " << j->second; + } + return out; +} + +// -- pg_pool_t -- + +const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs"); +const char *pg_pool_t::APPLICATION_NAME_RBD("rbd"); +const char *pg_pool_t::APPLICATION_NAME_RGW("rgw"); + +void pg_pool_t::dump(Formatter *f) const +{ + f->dump_stream("create_time") << get_create_time(); + 
f->dump_unsigned("flags", get_flags()); + f->dump_string("flags_names", get_flags_string()); + f->dump_int("type", get_type()); + f->dump_int("size", get_size()); + f->dump_int("min_size", get_min_size()); + f->dump_int("crush_rule", get_crush_rule()); + f->dump_int("peering_crush_bucket_count", peering_crush_bucket_count); + f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target); + f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier); + f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member); + f->dump_int("object_hash", get_object_hash()); + f->dump_string("pg_autoscale_mode", + get_pg_autoscale_mode_name(pg_autoscale_mode)); + f->dump_unsigned("pg_num", get_pg_num()); + f->dump_unsigned("pg_placement_num", get_pgp_num()); + f->dump_unsigned("pg_placement_num_target", get_pgp_num_target()); + f->dump_unsigned("pg_num_target", get_pg_num_target()); + f->dump_unsigned("pg_num_pending", get_pg_num_pending()); + f->dump_object("last_pg_merge_meta", last_pg_merge_meta); + f->dump_stream("last_change") << get_last_change(); + f->dump_stream("last_force_op_resend") << get_last_force_op_resend(); + f->dump_stream("last_force_op_resend_prenautilus") + << get_last_force_op_resend_prenautilus(); + f->dump_stream("last_force_op_resend_preluminous") + << get_last_force_op_resend_preluminous(); + f->dump_unsigned("auid", get_auid()); + f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged"); + f->dump_unsigned("snap_seq", get_snap_seq()); + f->dump_unsigned("snap_epoch", get_snap_epoch()); + f->open_array_section("pool_snaps"); + for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) { + f->open_object_section("pool_snap_info"); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_stream("removed_snaps") << removed_snaps; + f->dump_unsigned("quota_max_bytes", quota_max_bytes); + f->dump_unsigned("quota_max_objects", quota_max_objects); + f->open_array_section("tiers"); + for (auto p = tiers.cbegin(); p != tiers.cend(); ++p) + f->dump_unsigned("pool_id", *p); + f->close_section(); + f->dump_int("tier_of", tier_of); + f->dump_int("read_tier", read_tier); + f->dump_int("write_tier", write_tier); + f->dump_string("cache_mode", get_cache_mode_name()); + f->dump_unsigned("target_max_bytes", target_max_bytes); + f->dump_unsigned("target_max_objects", target_max_objects); + f->dump_unsigned("cache_target_dirty_ratio_micro", + cache_target_dirty_ratio_micro); + f->dump_unsigned("cache_target_dirty_high_ratio_micro", + cache_target_dirty_high_ratio_micro); + f->dump_unsigned("cache_target_full_ratio_micro", + cache_target_full_ratio_micro); + f->dump_unsigned("cache_min_flush_age", cache_min_flush_age); + f->dump_unsigned("cache_min_evict_age", cache_min_evict_age); + f->dump_string("erasure_code_profile", erasure_code_profile); + f->open_object_section("hit_set_params"); + hit_set_params.dump(f); + f->close_section(); // hit_set_params + f->dump_unsigned("hit_set_period", hit_set_period); + f->dump_unsigned("hit_set_count", hit_set_count); + f->dump_bool("use_gmt_hitset", use_gmt_hitset); + f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); + f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote); + f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate); + f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n); + f->open_array_section("grade_table"); + for (unsigned i = 0; i < hit_set_count; ++i) + 
f->dump_unsigned("value", get_grade(i)); + f->close_section(); + f->dump_unsigned("stripe_width", get_stripe_width()); + f->dump_unsigned("expected_num_objects", expected_num_objects); + f->dump_bool("fast_read", fast_read); + f->open_object_section("options"); + opts.dump(f); + f->close_section(); // options + f->open_object_section("application_metadata"); + for (auto &app_pair : application_metadata) { + f->open_object_section(app_pair.first.c_str()); + for (auto &kv_pair : app_pair.second) { + f->dump_string(kv_pair.first.c_str(), kv_pair.second); + } + f->close_section(); // application + } + f->close_section(); // application_metadata +} + +void pg_pool_t::convert_to_pg_shards(const vector &from, set* to) const { + for (size_t i = 0; i < from.size(); ++i) { + if (from[i] != CRUSH_ITEM_NONE) { + to->insert( + pg_shard_t( + from[i], + is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + } +} + +void pg_pool_t::calc_pg_masks() +{ + pg_num_mask = (1 << cbits(pg_num-1)) - 1; + pgp_num_mask = (1 << cbits(pgp_num-1)) - 1; +} + +unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const +{ + if (pg_num == pg_num_mask + 1) + return pg_num; // power-of-2 split + unsigned mask = pg_num_mask >> 1; + if ((pgid.ps() & mask) < (pg_num & mask)) + return pg_num_mask + 1; // smaller bin size (already split) + else + return (pg_num_mask + 1) >> 1; // bigger bin (not yet split) +} + +bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const +{ + if (pg_num_pending >= pg_num) { + return false; + } + if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) { + if (target) { + *target = false; + } + return true; + } + for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) { + if (pg_t(ps, pgid.pool()).get_parent() == pgid) { + if (target) { + *target = true; + } + return true; + } + } + return false; +} + +/* + * we have two snap modes: + * - pool snaps + * - snap existence/non-existence defined by snaps[] and snap_seq + * - user managed snaps + * - existence tracked by librados user + */ +bool pg_pool_t::is_pool_snaps_mode() const +{ + return has_flag(FLAG_POOL_SNAPS); +} + +bool pg_pool_t::is_unmanaged_snaps_mode() const +{ + return has_flag(FLAG_SELFMANAGED_SNAPS); +} + +bool pg_pool_t::is_removed_snap(snapid_t s) const +{ + if (is_pool_snaps_mode()) + return s <= get_snap_seq() && snaps.count(s) == 0; + else + return removed_snaps.contains(s); +} + +snapid_t pg_pool_t::snap_exists(std::string_view s) const +{ + for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) + if (p->second.name == s) + return p->second.snapid; + return 0; +} + +void pg_pool_t::add_snap(const char *n, utime_t stamp) +{ + ceph_assert(!is_unmanaged_snaps_mode()); + flags |= FLAG_POOL_SNAPS; + snapid_t s = get_snap_seq() + 1; + snap_seq = s; + snaps[s].snapid = s; + snaps[s].name = n; + snaps[s].stamp = stamp; +} + +uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat) +{ + ceph_assert(!is_pool_snaps_mode()); + if (snap_seq == 0) { + if (preoctopus_compat) { + // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after + // mimic this field is not decoded but our flag is set; pre-mimic, we + // have a non-empty removed_snaps to signifiy a non-pool-snaps pool. 
+ removed_snaps.insert(snapid_t(1)); + } + snap_seq = 1; + } + flags |= FLAG_SELFMANAGED_SNAPS; + snap_seq = snap_seq + 1; + return snap_seq; +} + +void pg_pool_t::remove_snap(snapid_t s) +{ + ceph_assert(snaps.count(s)); + snaps.erase(s); + snap_seq = snap_seq + 1; +} + +void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat) +{ + ceph_assert(is_unmanaged_snaps_mode()); + ++snap_seq; + if (preoctopus_compat) { + removed_snaps.insert(s); + // try to add in the new seq, just to try to keep the interval_set contiguous + if (!removed_snaps.contains(get_snap_seq())) { + removed_snaps.insert(get_snap_seq()); + } + } +} + +SnapContext pg_pool_t::get_snap_context() const +{ + vector s(snaps.size()); + unsigned i = 0; + for (auto p = snaps.crbegin(); p != snaps.crend(); ++p) + s[i++] = p->first; + return SnapContext(get_snap_seq(), s); +} + +uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const +{ + if (ns.empty()) + return ceph_str_hash(object_hash, key.data(), key.length()); + int nsl = ns.length(); + int len = key.length() + nsl + 1; + char buf[len]; + memcpy(&buf[0], ns.data(), nsl); + buf[nsl] = '\037'; + memcpy(&buf[nsl+1], key.data(), key.length()); + return ceph_str_hash(object_hash, &buf[0], len); +} + +uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const +{ + return ceph_stable_mod(v, pg_num, pg_num_mask); +} + +/* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ +pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const +{ + pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask)); + return pg; +} + +/* + * map raw pg (full precision ps) into a placement seed. include + * pool id in that value so that different pools don't use the same + * seeds. + */ +ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const +{ + if (flags & FLAG_HASHPSPOOL) { + // Hash the pool id so that pool PGs do not overlap. + return + crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask), + pg.pool()); + } else { + // Legacy behavior; add ps and pool together. This is not a great + // idea because the PGs from each pool will essentially overlap on + // top of each other: 0.5 == 1.4 == 2.3 == ... + return + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + + pg.pool(); + } +} + +uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const +{ + uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123); + if (pg_num == pg_num_mask + 1) { + r &= ~pg_num_mask; + } else { + unsigned smaller_mask = pg_num_mask >> 1; + if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) { + r &= ~pg_num_mask; + } else { + r &= ~smaller_mask; + } + } + r |= pg.ps(); + return r; +} + +void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGPOOL3) == 0) { + // this encoding matches the old struct ceph_pg_pool + __u8 struct_v = 2; + encode(struct_v, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
+ encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + + __u32 n = snaps.size(); + encode(n, bl); + n = removed_snaps.num_intervals(); + encode(n, bl); + + encode(auid, bl); + + encode_nohead(snaps, bl, features); + encode_nohead(removed_snaps, bl); + return; + } + + if ((features & CEPH_FEATURE_OSDENC) == 0) { + __u8 struct_v = 4; + encode(struct_v, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. + encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + encode(flags, bl); + encode((uint32_t)0, bl); // crash_replay_interval + return; + } + + if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) { + // we simply added last_force_op_resend here, which is a fully + // backward compatible change. however, encoding the same map + // differently between monitors triggers scrub noise (even though + // they are decodable without the feature), so let's be pendantic + // about it. + ENCODE_START(14, 5, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. + encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + encode(flags, bl); + encode((uint32_t)0, bl); // crash_replay_interval + encode(min_size, bl); + encode(quota_max_bytes, bl); + encode(quota_max_objects, bl); + encode(tiers, bl); + encode(tier_of, bl); + __u8 c = cache_mode; + encode(c, bl); + encode(read_tier, bl); + encode(write_tier, bl); + encode(properties, bl); + encode(hit_set_params, bl); + encode(hit_set_period, bl); + encode(hit_set_count, bl); + encode(stripe_width, bl); + encode(target_max_bytes, bl); + encode(target_max_objects, bl); + encode(cache_target_dirty_ratio_micro, bl); + encode(cache_target_full_ratio_micro, bl); + encode(cache_min_flush_age, bl); + encode(cache_min_evict_age, bl); + encode(erasure_code_profile, bl); + ENCODE_FINISH(bl); + return; + } + + uint8_t v = 30; + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) { + // this was the first post-hammer thing we added; if it's missing, encode + // like hammer. + v = 21; + } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 24; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 26; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 27; + } else if (!is_stretch_pool()) { + v = 29; + } + + ENCODE_START(v, 5, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
+ encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + if (v >= 27) { + encode(flags, bl); + } else { + auto tmp = flags; + tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING); + encode(tmp, bl); + } + encode((uint32_t)0, bl); // crash_replay_interval + encode(min_size, bl); + encode(quota_max_bytes, bl); + encode(quota_max_objects, bl); + encode(tiers, bl); + encode(tier_of, bl); + __u8 c = cache_mode; + encode(c, bl); + encode(read_tier, bl); + encode(write_tier, bl); + encode(properties, bl); + encode(hit_set_params, bl); + encode(hit_set_period, bl); + encode(hit_set_count, bl); + encode(stripe_width, bl); + encode(target_max_bytes, bl); + encode(target_max_objects, bl); + encode(cache_target_dirty_ratio_micro, bl); + encode(cache_target_full_ratio_micro, bl); + encode(cache_min_flush_age, bl); + encode(cache_min_evict_age, bl); + encode(erasure_code_profile, bl); + encode(last_force_op_resend_preluminous, bl); + encode(min_read_recency_for_promote, bl); + encode(expected_num_objects, bl); + if (v >= 19) { + encode(cache_target_dirty_high_ratio_micro, bl); + } + if (v >= 20) { + encode(min_write_recency_for_promote, bl); + } + if (v >= 21) { + encode(use_gmt_hitset, bl); + } + if (v >= 22) { + encode(fast_read, bl); + } + if (v >= 23) { + encode(hit_set_grade_decay_rate, bl); + encode(hit_set_search_last_n, bl); + } + if (v >= 24) { + encode(opts, bl, features); + } + if (v >= 25) { + encode(last_force_op_resend_prenautilus, bl); + } + if (v >= 26) { + encode(application_metadata, bl); + } + if (v >= 27) { + encode(create_time, bl); + } + if (v >= 28) { + encode(pg_num_target, bl); + encode(pgp_num_target, bl); + encode(pg_num_pending, bl); + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01] + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01] + encode(last_force_op_resend, bl); + encode(pg_autoscale_mode, bl); + } + if (v >= 29) { + encode(last_pg_merge_meta, bl); + } + if (v >= 30) { + encode(peering_crush_bucket_count, bl); + encode(peering_crush_bucket_target, bl); + encode(peering_crush_bucket_barrier, bl); + encode(peering_crush_mandatory_member, bl); + } + ENCODE_FINISH(bl); +} + +void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl); + decode(type, bl); + decode(size, bl); + decode(crush_rule, bl); + decode(object_hash, bl); + decode(pg_num, bl); + decode(pgp_num, bl); + { + __u32 lpg_num, lpgp_num; + decode(lpg_num, bl); + decode(lpgp_num, bl); + } + decode(last_change, bl); + decode(snap_seq, bl); + decode(snap_epoch, bl); + + if (struct_v >= 3) { + decode(snaps, bl); + decode(removed_snaps, bl); + decode(auid, bl); + } else { + __u32 n, m; + decode(n, bl); + decode(m, bl); + decode(auid, bl); + decode_nohead(n, snaps, bl); + decode_nohead(m, removed_snaps, bl); + } + + if (struct_v >= 4) { + decode(flags, bl); + uint32_t crash_replay_interval; + decode(crash_replay_interval, bl); + } else { + flags = 0; + } + // upgrade path for selfmanaged vs pool snaps + if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) { + if (!removed_snaps.empty()) { + flags |= FLAG_SELFMANAGED_SNAPS; + } else { + flags |= FLAG_POOL_SNAPS; + } + } + if (struct_v >= 7) { + decode(min_size, bl); + } else { + min_size = size - size/2; + } + if (struct_v >= 8) { + decode(quota_max_bytes, bl); + decode(quota_max_objects, 
bl); + } + if (struct_v >= 9) { + decode(tiers, bl); + decode(tier_of, bl); + __u8 v; + decode(v, bl); + cache_mode = (cache_mode_t)v; + decode(read_tier, bl); + decode(write_tier, bl); + } + if (struct_v >= 10) { + decode(properties, bl); + } + if (struct_v >= 11) { + decode(hit_set_params, bl); + decode(hit_set_period, bl); + decode(hit_set_count, bl); + } else { + pg_pool_t def; + hit_set_period = def.hit_set_period; + hit_set_count = def.hit_set_count; + } + if (struct_v >= 12) { + decode(stripe_width, bl); + } else { + set_stripe_width(0); + } + if (struct_v >= 13) { + decode(target_max_bytes, bl); + decode(target_max_objects, bl); + decode(cache_target_dirty_ratio_micro, bl); + decode(cache_target_full_ratio_micro, bl); + decode(cache_min_flush_age, bl); + decode(cache_min_evict_age, bl); + } else { + target_max_bytes = 0; + target_max_objects = 0; + cache_target_dirty_ratio_micro = 0; + cache_target_full_ratio_micro = 0; + cache_min_flush_age = 0; + cache_min_evict_age = 0; + } + if (struct_v >= 14) { + decode(erasure_code_profile, bl); + } + if (struct_v >= 15) { + decode(last_force_op_resend_preluminous, bl); + } else { + last_force_op_resend_preluminous = 0; + } + if (struct_v >= 16) { + decode(min_read_recency_for_promote, bl); + } else { + min_read_recency_for_promote = 1; + } + if (struct_v >= 17) { + decode(expected_num_objects, bl); + } else { + expected_num_objects = 0; + } + if (struct_v >= 19) { + decode(cache_target_dirty_high_ratio_micro, bl); + } else { + cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro; + } + if (struct_v >= 20) { + decode(min_write_recency_for_promote, bl); + } else { + min_write_recency_for_promote = 1; + } + if (struct_v >= 21) { + decode(use_gmt_hitset, bl); + } else { + use_gmt_hitset = false; + } + if (struct_v >= 22) { + decode(fast_read, bl); + } else { + fast_read = false; + } + if (struct_v >= 23) { + decode(hit_set_grade_decay_rate, bl); + decode(hit_set_search_last_n, bl); + } else { + hit_set_grade_decay_rate = 0; + hit_set_search_last_n = 1; + } + if (struct_v >= 24) { + decode(opts, bl); + } + if (struct_v >= 25) { + decode(last_force_op_resend_prenautilus, bl); + } else { + last_force_op_resend_prenautilus = last_force_op_resend_preluminous; + } + if (struct_v >= 26) { + decode(application_metadata, bl); + } + if (struct_v >= 27) { + decode(create_time, bl); + } + if (struct_v >= 28) { + decode(pg_num_target, bl); + decode(pgp_num_target, bl); + decode(pg_num_pending, bl); + epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started; + decode(old_merge_last_epoch_started, bl); + decode(old_merge_last_epoch_clean, bl); + decode(last_force_op_resend, bl); + decode(pg_autoscale_mode, bl); + if (struct_v >= 29) { + decode(last_pg_merge_meta, bl); + } else { + last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean; + last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started; + } + } else { + pg_num_target = pg_num; + pgp_num_target = pgp_num; + pg_num_pending = pg_num; + last_force_op_resend = last_force_op_resend_prenautilus; + pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade + } + if (struct_v >= 30) { + decode(peering_crush_bucket_count, bl); + decode(peering_crush_bucket_target, bl); + decode(peering_crush_bucket_barrier, bl); + decode(peering_crush_mandatory_member, bl); + } + DECODE_FINISH(bl); + calc_pg_masks(); + calc_grade_table(); +} + +bool pg_pool_t::stretch_set_can_peer(const set& want, const OSDMap& osdmap, + std::ostream * out) const +{ + if 
(!is_stretch_pool()) return true; + const uint32_t barrier_id = peering_crush_bucket_barrier; + const uint32_t barrier_count = peering_crush_bucket_count; + set ancestors; + const shared_ptr& crush = osdmap.crush; + for (int osdid : want) { + int ancestor = crush->get_parent_of_type(osdid, barrier_id, + crush_rule); + ancestors.insert(ancestor); + } + if (ancestors.size() < barrier_count) { + if (out) { + *out << __func__ << ": not enough crush buckets with OSDs in want set " + << want; + } + return false; + } else if (peering_crush_mandatory_member != CRUSH_ITEM_NONE && + !ancestors.count(peering_crush_mandatory_member)) { + if (out) { + *out << __func__ << ": missing mandatory crush bucket member " + << peering_crush_mandatory_member; + } + return false; + } + return true; +} + +void pg_pool_t::generate_test_instances(list& o) +{ + pg_pool_t a; + o.push_back(new pg_pool_t(a)); + + a.create_time = utime_t(4,5); + a.type = TYPE_REPLICATED; + a.size = 2; + a.crush_rule = 3; + a.object_hash = 4; + a.pg_num = 6; + a.pgp_num = 4; + a.pgp_num_target = 4; + a.pg_num_target = 5; + a.pg_num_pending = 5; + a.last_pg_merge_meta.last_epoch_started = 2; + a.last_pg_merge_meta.last_epoch_clean = 2; + a.last_change = 9; + a.last_force_op_resend = 123823; + a.last_force_op_resend_preluminous = 123824; + a.snap_seq = 10; + a.snap_epoch = 11; + a.flags = FLAG_POOL_SNAPS; + a.auid = 12; + a.quota_max_bytes = 473; + a.quota_max_objects = 474; + o.push_back(new pg_pool_t(a)); + + a.snaps[3].name = "asdf"; + a.snaps[3].snapid = 3; + a.snaps[3].stamp = utime_t(123, 4); + a.snaps[6].name = "qwer"; + a.snaps[6].snapid = 6; + a.snaps[6].stamp = utime_t(23423, 4); + o.push_back(new pg_pool_t(a)); + + a.flags = FLAG_SELFMANAGED_SNAPS; + a.snaps.clear(); + a.removed_snaps.insert(2); + a.quota_max_bytes = 2473; + a.quota_max_objects = 4374; + a.tiers.insert(0); + a.tiers.insert(1); + a.tier_of = 2; + a.cache_mode = CACHEMODE_WRITEBACK; + a.read_tier = 1; + a.write_tier = 1; + a.hit_set_params = HitSet::Params(new BloomHitSet::Params); + a.hit_set_period = 3600; + a.hit_set_count = 8; + a.min_read_recency_for_promote = 1; + a.min_write_recency_for_promote = 1; + a.hit_set_grade_decay_rate = 50; + a.hit_set_search_last_n = 1; + a.calc_grade_table(); + a.set_stripe_width(12345); + a.target_max_bytes = 1238132132; + a.target_max_objects = 1232132; + a.cache_target_dirty_ratio_micro = 187232; + a.cache_target_dirty_high_ratio_micro = 309856; + a.cache_target_full_ratio_micro = 987222; + a.cache_min_flush_age = 231; + a.cache_min_evict_age = 2321; + a.erasure_code_profile = "profile in osdmap"; + a.expected_num_objects = 123456; + a.fast_read = false; + a.application_metadata = {{"rbd", {{"key", "value"}}}}; + o.push_back(new pg_pool_t(a)); +} + +ostream& operator<<(ostream& out, const pg_pool_t& p) +{ + out << p.get_type_name(); + if (p.get_type_name() == "erasure") { + out << " profile " << p.erasure_code_profile; + } + out << " size " << p.get_size() + << " min_size " << p.get_min_size() + << " crush_rule " << p.get_crush_rule() + << " object_hash " << p.get_object_hash_name() + << " pg_num " << p.get_pg_num() + << " pgp_num " << p.get_pgp_num(); + if (p.get_pg_num_target() != p.get_pg_num()) { + out << " pg_num_target " << p.get_pg_num_target(); + } + if (p.get_pgp_num_target() != p.get_pgp_num()) { + out << " pgp_num_target " << p.get_pgp_num_target(); + } + if (p.get_pg_num_pending() != p.get_pg_num()) { + out << " pg_num_pending " << p.get_pg_num_pending(); + } + if (p.pg_autoscale_mode != 
pg_pool_t::pg_autoscale_mode_t::UNKNOWN) { + out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode); + } + out << " last_change " << p.get_last_change(); + if (p.get_last_force_op_resend() || + p.get_last_force_op_resend_prenautilus() || + p.get_last_force_op_resend_preluminous()) + out << " lfor " << p.get_last_force_op_resend() << "/" + << p.get_last_force_op_resend_prenautilus() << "/" + << p.get_last_force_op_resend_preluminous(); + if (p.get_auid()) + out << " owner " << p.get_auid(); + if (p.flags) + out << " flags " << p.get_flags_string(); + if (p.quota_max_bytes) + out << " max_bytes " << p.quota_max_bytes; + if (p.quota_max_objects) + out << " max_objects " << p.quota_max_objects; + if (!p.tiers.empty()) + out << " tiers " << p.tiers; + if (p.is_tier()) + out << " tier_of " << p.tier_of; + if (p.has_read_tier()) + out << " read_tier " << p.read_tier; + if (p.has_write_tier()) + out << " write_tier " << p.write_tier; + if (p.cache_mode) + out << " cache_mode " << p.get_cache_mode_name(); + if (p.target_max_bytes) + out << " target_bytes " << p.target_max_bytes; + if (p.target_max_objects) + out << " target_objects " << p.target_max_objects; + if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) { + out << " hit_set " << p.hit_set_params + << " " << p.hit_set_period << "s" + << " x" << p.hit_set_count << " decay_rate " + << p.hit_set_grade_decay_rate + << " search_last_n " << p.hit_set_search_last_n; + } + if (p.min_read_recency_for_promote) + out << " min_read_recency_for_promote " << p.min_read_recency_for_promote; + if (p.min_write_recency_for_promote) + out << " min_write_recency_for_promote " << p.min_write_recency_for_promote; + out << " stripe_width " << p.get_stripe_width(); + if (p.expected_num_objects) + out << " expected_num_objects " << p.expected_num_objects; + if (p.fast_read) + out << " fast_read " << p.fast_read; + out << p.opts; + if (!p.application_metadata.empty()) { + out << " application "; + for (auto it = p.application_metadata.begin(); + it != p.application_metadata.end(); ++it) { + if (it != p.application_metadata.begin()) + out << ","; + out << it->first; + } + } + return out; +} + + +// -- object_stat_sum_t -- + +void object_stat_sum_t::dump(Formatter *f) const +{ + f->dump_int("num_bytes", num_bytes); + f->dump_int("num_objects", num_objects); + f->dump_int("num_object_clones", num_object_clones); + f->dump_int("num_object_copies", num_object_copies); + f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary); + f->dump_int("num_objects_missing", num_objects_missing); + f->dump_int("num_objects_degraded", num_objects_degraded); + f->dump_int("num_objects_misplaced", num_objects_misplaced); + f->dump_int("num_objects_unfound", num_objects_unfound); + f->dump_int("num_objects_dirty", num_objects_dirty); + f->dump_int("num_whiteouts", num_whiteouts); + f->dump_int("num_read", num_rd); + f->dump_int("num_read_kb", num_rd_kb); + f->dump_int("num_write", num_wr); + f->dump_int("num_write_kb", num_wr_kb); + f->dump_int("num_scrub_errors", num_scrub_errors); + f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors); + f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors); + f->dump_int("num_objects_recovered", num_objects_recovered); + f->dump_int("num_bytes_recovered", num_bytes_recovered); + f->dump_int("num_keys_recovered", num_keys_recovered); + f->dump_int("num_objects_omap", num_objects_omap); + f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive); + 
f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive); + f->dump_int("num_flush", num_flush); + f->dump_int("num_flush_kb", num_flush_kb); + f->dump_int("num_evict", num_evict); + f->dump_int("num_evict_kb", num_evict_kb); + f->dump_int("num_promote", num_promote); + f->dump_int("num_flush_mode_high", num_flush_mode_high); + f->dump_int("num_flush_mode_low", num_flush_mode_low); + f->dump_int("num_evict_mode_some", num_evict_mode_some); + f->dump_int("num_evict_mode_full", num_evict_mode_full); + f->dump_int("num_objects_pinned", num_objects_pinned); + f->dump_int("num_legacy_snapsets", num_legacy_snapsets); + f->dump_int("num_large_omap_objects", num_large_omap_objects); + f->dump_int("num_objects_manifest", num_objects_manifest); + f->dump_int("num_omap_bytes", num_omap_bytes); + f->dump_int("num_omap_keys", num_omap_keys); + f->dump_int("num_objects_repaired", num_objects_repaired); +} + +void object_stat_sum_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(20, 14, bl); +#if defined(CEPH_LITTLE_ENDIAN) + bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t)); +#else + encode(num_bytes, bl); + encode(num_objects, bl); + encode(num_object_clones, bl); + encode(num_object_copies, bl); + encode(num_objects_missing_on_primary, bl); + encode(num_objects_degraded, bl); + encode(num_objects_unfound, bl); + encode(num_rd, bl); + encode(num_rd_kb, bl); + encode(num_wr, bl); + encode(num_wr_kb, bl); + encode(num_scrub_errors, bl); + encode(num_objects_recovered, bl); + encode(num_bytes_recovered, bl); + encode(num_keys_recovered, bl); + encode(num_shallow_scrub_errors, bl); + encode(num_deep_scrub_errors, bl); + encode(num_objects_dirty, bl); + encode(num_whiteouts, bl); + encode(num_objects_omap, bl); + encode(num_objects_hit_set_archive, bl); + encode(num_objects_misplaced, bl); + encode(num_bytes_hit_set_archive, bl); + encode(num_flush, bl); + encode(num_flush_kb, bl); + encode(num_evict, bl); + encode(num_evict_kb, bl); + encode(num_promote, bl); + encode(num_flush_mode_high, bl); + encode(num_flush_mode_low, bl); + encode(num_evict_mode_some, bl); + encode(num_evict_mode_full, bl); + encode(num_objects_pinned, bl); + encode(num_objects_missing, bl); + encode(num_legacy_snapsets, bl); + encode(num_large_omap_objects, bl); + encode(num_objects_manifest, bl); + encode(num_omap_bytes, bl); + encode(num_omap_keys, bl); + encode(num_objects_repaired, bl); +#endif + ENCODE_FINISH(bl); +} + +void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl) +{ + bool decode_finish = false; + static const int STAT_SUM_DECODE_VERSION = 20; + DECODE_START(STAT_SUM_DECODE_VERSION, bl); +#if defined(CEPH_LITTLE_ENDIAN) + if (struct_v == STAT_SUM_DECODE_VERSION) { + bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes)); + decode_finish = true; + } +#endif + if (!decode_finish) { + decode(num_bytes, bl); + decode(num_objects, bl); + decode(num_object_clones, bl); + decode(num_object_copies, bl); + decode(num_objects_missing_on_primary, bl); + decode(num_objects_degraded, bl); + decode(num_objects_unfound, bl); + decode(num_rd, bl); + decode(num_rd_kb, bl); + decode(num_wr, bl); + decode(num_wr_kb, bl); + decode(num_scrub_errors, bl); + decode(num_objects_recovered, bl); + decode(num_bytes_recovered, bl); + decode(num_keys_recovered, bl); + decode(num_shallow_scrub_errors, bl); + decode(num_deep_scrub_errors, bl); + decode(num_objects_dirty, bl); + decode(num_whiteouts, bl); + decode(num_objects_omap, bl); + decode(num_objects_hit_set_archive, bl); + 
decode(num_objects_misplaced, bl); + decode(num_bytes_hit_set_archive, bl); + decode(num_flush, bl); + decode(num_flush_kb, bl); + decode(num_evict, bl); + decode(num_evict_kb, bl); + decode(num_promote, bl); + decode(num_flush_mode_high, bl); + decode(num_flush_mode_low, bl); + decode(num_evict_mode_some, bl); + decode(num_evict_mode_full, bl); + decode(num_objects_pinned, bl); + decode(num_objects_missing, bl); + if (struct_v >= 16) { + decode(num_legacy_snapsets, bl); + } else { + num_legacy_snapsets = num_object_clones; // upper bound + } + if (struct_v >= 17) { + decode(num_large_omap_objects, bl); + } + if (struct_v >= 18) { + decode(num_objects_manifest, bl); + } + if (struct_v >= 19) { + decode(num_omap_bytes, bl); + decode(num_omap_keys, bl); + } + if (struct_v >= 20) { + decode(num_objects_repaired, bl); + } + } + DECODE_FINISH(bl); +} + +void object_stat_sum_t::generate_test_instances(list& o) +{ + object_stat_sum_t a; + + a.num_bytes = 1; + a.num_objects = 3; + a.num_object_clones = 4; + a.num_object_copies = 5; + a.num_objects_missing_on_primary = 6; + a.num_objects_missing = 123; + a.num_objects_degraded = 7; + a.num_objects_unfound = 8; + a.num_rd = 9; a.num_rd_kb = 10; + a.num_wr = 11; a.num_wr_kb = 12; + a.num_objects_recovered = 14; + a.num_bytes_recovered = 15; + a.num_keys_recovered = 16; + a.num_deep_scrub_errors = 17; + a.num_shallow_scrub_errors = 18; + a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors; + a.num_objects_dirty = 21; + a.num_whiteouts = 22; + a.num_objects_misplaced = 1232; + a.num_objects_hit_set_archive = 2; + a.num_bytes_hit_set_archive = 27; + a.num_flush = 5; + a.num_flush_kb = 6; + a.num_evict = 7; + a.num_evict_kb = 8; + a.num_promote = 9; + a.num_flush_mode_high = 0; + a.num_flush_mode_low = 1; + a.num_evict_mode_some = 1; + a.num_evict_mode_full = 0; + a.num_objects_pinned = 20; + a.num_large_omap_objects = 5; + a.num_objects_manifest = 2; + a.num_omap_bytes = 20000; + a.num_omap_keys = 200; + a.num_objects_repaired = 300; + o.push_back(new object_stat_sum_t(a)); +} + +void object_stat_sum_t::add(const object_stat_sum_t& o) +{ + num_bytes += o.num_bytes; + num_objects += o.num_objects; + num_object_clones += o.num_object_clones; + num_object_copies += o.num_object_copies; + num_objects_missing_on_primary += o.num_objects_missing_on_primary; + num_objects_missing += o.num_objects_missing; + num_objects_degraded += o.num_objects_degraded; + num_objects_misplaced += o.num_objects_misplaced; + num_rd += o.num_rd; + num_rd_kb += o.num_rd_kb; + num_wr += o.num_wr; + num_wr_kb += o.num_wr_kb; + num_objects_unfound += o.num_objects_unfound; + num_scrub_errors += o.num_scrub_errors; + num_shallow_scrub_errors += o.num_shallow_scrub_errors; + num_deep_scrub_errors += o.num_deep_scrub_errors; + num_objects_recovered += o.num_objects_recovered; + num_bytes_recovered += o.num_bytes_recovered; + num_keys_recovered += o.num_keys_recovered; + num_objects_dirty += o.num_objects_dirty; + num_whiteouts += o.num_whiteouts; + num_objects_omap += o.num_objects_omap; + num_objects_hit_set_archive += o.num_objects_hit_set_archive; + num_bytes_hit_set_archive += o.num_bytes_hit_set_archive; + num_flush += o.num_flush; + num_flush_kb += o.num_flush_kb; + num_evict += o.num_evict; + num_evict_kb += o.num_evict_kb; + num_promote += o.num_promote; + num_flush_mode_high += o.num_flush_mode_high; + num_flush_mode_low += o.num_flush_mode_low; + num_evict_mode_some += o.num_evict_mode_some; + num_evict_mode_full += o.num_evict_mode_full; + 
num_objects_pinned += o.num_objects_pinned; + num_legacy_snapsets += o.num_legacy_snapsets; + num_large_omap_objects += o.num_large_omap_objects; + num_objects_manifest += o.num_objects_manifest; + num_omap_bytes += o.num_omap_bytes; + num_omap_keys += o.num_omap_keys; + num_objects_repaired += o.num_objects_repaired; +} + +void object_stat_sum_t::sub(const object_stat_sum_t& o) +{ + num_bytes -= o.num_bytes; + num_objects -= o.num_objects; + num_object_clones -= o.num_object_clones; + num_object_copies -= o.num_object_copies; + num_objects_missing_on_primary -= o.num_objects_missing_on_primary; + num_objects_missing -= o.num_objects_missing; + num_objects_degraded -= o.num_objects_degraded; + num_objects_misplaced -= o.num_objects_misplaced; + num_rd -= o.num_rd; + num_rd_kb -= o.num_rd_kb; + num_wr -= o.num_wr; + num_wr_kb -= o.num_wr_kb; + num_objects_unfound -= o.num_objects_unfound; + num_scrub_errors -= o.num_scrub_errors; + num_shallow_scrub_errors -= o.num_shallow_scrub_errors; + num_deep_scrub_errors -= o.num_deep_scrub_errors; + num_objects_recovered -= o.num_objects_recovered; + num_bytes_recovered -= o.num_bytes_recovered; + num_keys_recovered -= o.num_keys_recovered; + num_objects_dirty -= o.num_objects_dirty; + num_whiteouts -= o.num_whiteouts; + num_objects_omap -= o.num_objects_omap; + num_objects_hit_set_archive -= o.num_objects_hit_set_archive; + num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive; + num_flush -= o.num_flush; + num_flush_kb -= o.num_flush_kb; + num_evict -= o.num_evict; + num_evict_kb -= o.num_evict_kb; + num_promote -= o.num_promote; + num_flush_mode_high -= o.num_flush_mode_high; + num_flush_mode_low -= o.num_flush_mode_low; + num_evict_mode_some -= o.num_evict_mode_some; + num_evict_mode_full -= o.num_evict_mode_full; + num_objects_pinned -= o.num_objects_pinned; + num_legacy_snapsets -= o.num_legacy_snapsets; + num_large_omap_objects -= o.num_large_omap_objects; + num_objects_manifest -= o.num_objects_manifest; + num_omap_bytes -= o.num_omap_bytes; + num_omap_keys -= o.num_omap_keys; + num_objects_repaired -= o.num_objects_repaired; +} + +bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r) +{ + return + l.num_bytes == r.num_bytes && + l.num_objects == r.num_objects && + l.num_object_clones == r.num_object_clones && + l.num_object_copies == r.num_object_copies && + l.num_objects_missing_on_primary == r.num_objects_missing_on_primary && + l.num_objects_missing == r.num_objects_missing && + l.num_objects_degraded == r.num_objects_degraded && + l.num_objects_misplaced == r.num_objects_misplaced && + l.num_objects_unfound == r.num_objects_unfound && + l.num_rd == r.num_rd && + l.num_rd_kb == r.num_rd_kb && + l.num_wr == r.num_wr && + l.num_wr_kb == r.num_wr_kb && + l.num_scrub_errors == r.num_scrub_errors && + l.num_shallow_scrub_errors == r.num_shallow_scrub_errors && + l.num_deep_scrub_errors == r.num_deep_scrub_errors && + l.num_objects_recovered == r.num_objects_recovered && + l.num_bytes_recovered == r.num_bytes_recovered && + l.num_keys_recovered == r.num_keys_recovered && + l.num_objects_dirty == r.num_objects_dirty && + l.num_whiteouts == r.num_whiteouts && + l.num_objects_omap == r.num_objects_omap && + l.num_objects_hit_set_archive == r.num_objects_hit_set_archive && + l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive && + l.num_flush == r.num_flush && + l.num_flush_kb == r.num_flush_kb && + l.num_evict == r.num_evict && + l.num_evict_kb == r.num_evict_kb && + l.num_promote == r.num_promote && + 
l.num_flush_mode_high == r.num_flush_mode_high && + l.num_flush_mode_low == r.num_flush_mode_low && + l.num_evict_mode_some == r.num_evict_mode_some && + l.num_evict_mode_full == r.num_evict_mode_full && + l.num_objects_pinned == r.num_objects_pinned && + l.num_legacy_snapsets == r.num_legacy_snapsets && + l.num_large_omap_objects == r.num_large_omap_objects && + l.num_objects_manifest == r.num_objects_manifest && + l.num_omap_bytes == r.num_omap_bytes && + l.num_omap_keys == r.num_omap_keys && + l.num_objects_repaired == r.num_objects_repaired; +} + +// -- object_stat_collection_t -- + +void object_stat_collection_t::dump(Formatter *f) const +{ + f->open_object_section("stat_sum"); + sum.dump(f); + f->close_section(); +} + +void object_stat_collection_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(2, 2, bl); + encode(sum, bl); + encode((__u32)0, bl); + ENCODE_FINISH(bl); +} + +void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(sum, bl); + { + map cat_sum; + decode(cat_sum, bl); + } + DECODE_FINISH(bl); +} + +void object_stat_collection_t::generate_test_instances(list& o) +{ + object_stat_collection_t a; + o.push_back(new object_stat_collection_t(a)); + list l; + object_stat_sum_t::generate_test_instances(l); + for (auto p = l.begin(); p != l.end(); ++p) { + a.add(**p); + o.push_back(new object_stat_collection_t(a)); + } +} + + +// -- pg_stat_t -- + +bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const +{ + if (primary && osd == acting_primary) { + return true; + } else if (!primary) { + for(auto it = acting.cbegin(); it != acting.cend(); ++it) + { + if (*it == osd) + return true; + } + } + return false; +} + +void pg_stat_t::dump(Formatter *f) const +{ + f->dump_stream("version") << version; + f->dump_unsigned("reported_seq", reported_seq); + f->dump_unsigned("reported_epoch", reported_epoch); + f->dump_string("state", pg_state_string(state)); + f->dump_stream("last_fresh") << last_fresh; + f->dump_stream("last_change") << last_change; + f->dump_stream("last_active") << last_active; + f->dump_stream("last_peered") << last_peered; + f->dump_stream("last_clean") << last_clean; + f->dump_stream("last_became_active") << last_became_active; + f->dump_stream("last_became_peered") << last_became_peered; + f->dump_stream("last_unstale") << last_unstale; + f->dump_stream("last_undegraded") << last_undegraded; + f->dump_stream("last_fullsized") << last_fullsized; + f->dump_unsigned("mapping_epoch", mapping_epoch); + f->dump_stream("log_start") << log_start; + f->dump_stream("ondisk_log_start") << ondisk_log_start; + f->dump_unsigned("created", created); + f->dump_unsigned("last_epoch_clean", last_epoch_clean); + f->dump_stream("parent") << parent; + f->dump_unsigned("parent_split_bits", parent_split_bits); + f->dump_stream("last_scrub") << last_scrub; + f->dump_stream("last_scrub_stamp") << last_scrub_stamp; + f->dump_stream("last_deep_scrub") << last_deep_scrub; + f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp; + f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp; + f->dump_int("log_size", log_size); + f->dump_int("ondisk_log_size", ondisk_log_size); + f->dump_bool("stats_invalid", stats_invalid); + f->dump_bool("dirty_stats_invalid", dirty_stats_invalid); + f->dump_bool("omap_stats_invalid", omap_stats_invalid); + f->dump_bool("hitset_stats_invalid", hitset_stats_invalid); + f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid); + 
f->dump_bool("pin_stats_invalid", pin_stats_invalid); + f->dump_bool("manifest_stats_invalid", manifest_stats_invalid); + f->dump_unsigned("snaptrimq_len", snaptrimq_len); + stats.dump(f); + f->open_array_section("up"); + for (auto p = up.cbegin(); p != up.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (auto p = acting.cbegin(); p != acting.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("avail_no_missing"); + for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p) + f->dump_stream("shard") << *p; + f->close_section(); + f->open_array_section("object_location_counts"); + for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) { + f->open_object_section("entry"); + f->dump_stream("shards") << p->first; + f->dump_int("objects", p->second); + f->close_section(); + } + f->close_section(); + f->open_array_section("blocked_by"); + for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("up_primary", up_primary); + f->dump_int("acting_primary", acting_primary); + f->open_array_section("purged_snaps"); + for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) { + f->open_object_section("interval"); + f->dump_stream("start") << i.get_start(); + f->dump_stream("length") << i.get_len(); + f->close_section(); + } + f->close_section(); +} + +void pg_stat_t::dump_brief(Formatter *f) const +{ + f->dump_string("state", pg_state_string(state)); + f->open_array_section("up"); + for (auto p = up.cbegin(); p != up.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (auto p = acting.cbegin(); p != acting.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("up_primary", up_primary); + f->dump_int("acting_primary", acting_primary); +} + +void pg_stat_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(26, 22, bl); + encode(version, bl); + encode(reported_seq, bl); + encode(reported_epoch, bl); + encode((__u32)state, bl); // for older peers + encode(log_start, bl); + encode(ondisk_log_start, bl); + encode(created, bl); + encode(last_epoch_clean, bl); + encode(parent, bl); + encode(parent_split_bits, bl); + encode(last_scrub, bl); + encode(last_scrub_stamp, bl); + encode(stats, bl); + encode(log_size, bl); + encode(ondisk_log_size, bl); + encode(up, bl); + encode(acting, bl); + encode(last_fresh, bl); + encode(last_change, bl); + encode(last_active, bl); + encode(last_clean, bl); + encode(last_unstale, bl); + encode(mapping_epoch, bl); + encode(last_deep_scrub, bl); + encode(last_deep_scrub_stamp, bl); + encode(stats_invalid, bl); + encode(last_clean_scrub_stamp, bl); + encode(last_became_active, bl); + encode(dirty_stats_invalid, bl); + encode(up_primary, bl); + encode(acting_primary, bl); + encode(omap_stats_invalid, bl); + encode(hitset_stats_invalid, bl); + encode(blocked_by, bl); + encode(last_undegraded, bl); + encode(last_fullsized, bl); + encode(hitset_bytes_stats_invalid, bl); + encode(last_peered, bl); + encode(last_became_peered, bl); + encode(pin_stats_invalid, bl); + encode(snaptrimq_len, bl); + __u32 top_state = (state >> 32); + encode(top_state, bl); + encode(purged_snaps, bl); + encode(manifest_stats_invalid, bl); + encode(avail_no_missing, bl); + encode(object_location_counts, bl); + ENCODE_FINISH(bl); +} + +void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + bool tmp; + uint32_t 
old_state; + DECODE_START(26, bl); + decode(version, bl); + decode(reported_seq, bl); + decode(reported_epoch, bl); + decode(old_state, bl); + decode(log_start, bl); + decode(ondisk_log_start, bl); + decode(created, bl); + decode(last_epoch_clean, bl); + decode(parent, bl); + decode(parent_split_bits, bl); + decode(last_scrub, bl); + decode(last_scrub_stamp, bl); + decode(stats, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + decode(up, bl); + decode(acting, bl); + decode(last_fresh, bl); + decode(last_change, bl); + decode(last_active, bl); + decode(last_clean, bl); + decode(last_unstale, bl); + decode(mapping_epoch, bl); + decode(last_deep_scrub, bl); + decode(last_deep_scrub_stamp, bl); + decode(tmp, bl); + stats_invalid = tmp; + decode(last_clean_scrub_stamp, bl); + decode(last_became_active, bl); + decode(tmp, bl); + dirty_stats_invalid = tmp; + decode(up_primary, bl); + decode(acting_primary, bl); + decode(tmp, bl); + omap_stats_invalid = tmp; + decode(tmp, bl); + hitset_stats_invalid = tmp; + decode(blocked_by, bl); + decode(last_undegraded, bl); + decode(last_fullsized, bl); + decode(tmp, bl); + hitset_bytes_stats_invalid = tmp; + decode(last_peered, bl); + decode(last_became_peered, bl); + decode(tmp, bl); + pin_stats_invalid = tmp; + if (struct_v >= 23) { + decode(snaptrimq_len, bl); + if (struct_v >= 24) { + __u32 top_state; + decode(top_state, bl); + state = (uint64_t)old_state | ((uint64_t)top_state << 32); + decode(purged_snaps, bl); + } else { + state = old_state; + } + if (struct_v >= 25) { + decode(tmp, bl); + manifest_stats_invalid = tmp; + } else { + manifest_stats_invalid = true; + } + if (struct_v >= 26) { + decode(avail_no_missing, bl); + decode(object_location_counts, bl); + } + } + DECODE_FINISH(bl); +} + +void pg_stat_t::generate_test_instances(list& o) +{ + pg_stat_t a; + o.push_back(new pg_stat_t(a)); + + a.version = eversion_t(1, 3); + a.reported_epoch = 1; + a.reported_seq = 2; + a.state = 123; + a.mapping_epoch = 998; + a.last_fresh = utime_t(1002, 1); + a.last_change = utime_t(1002, 2); + a.last_active = utime_t(1002, 3); + a.last_clean = utime_t(1002, 4); + a.last_unstale = utime_t(1002, 5); + a.last_undegraded = utime_t(1002, 7); + a.last_fullsized = utime_t(1002, 8); + a.log_start = eversion_t(1, 4); + a.ondisk_log_start = eversion_t(1, 5); + a.created = 6; + a.last_epoch_clean = 7; + a.parent = pg_t(1, 2); + a.parent_split_bits = 12; + a.last_scrub = eversion_t(9, 10); + a.last_scrub_stamp = utime_t(11, 12); + a.last_deep_scrub = eversion_t(13, 14); + a.last_deep_scrub_stamp = utime_t(15, 16); + a.last_clean_scrub_stamp = utime_t(17, 18); + a.snaptrimq_len = 1048576; + list l; + object_stat_collection_t::generate_test_instances(l); + a.stats = *l.back(); + a.log_size = 99; + a.ondisk_log_size = 88; + a.up.push_back(123); + a.up_primary = 123; + a.acting.push_back(456); + a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD)); + set sset = { pg_shard_t(0), pg_shard_t(1) }; + a.object_location_counts.insert(make_pair(sset, 10)); + sset.insert(pg_shard_t(2)); + a.object_location_counts.insert(make_pair(sset, 5)); + a.acting_primary = 456; + o.push_back(new pg_stat_t(a)); + + a.up.push_back(124); + a.up_primary = 124; + a.acting.push_back(124); + a.acting_primary = 124; + a.blocked_by.push_back(155); + a.blocked_by.push_back(156); + o.push_back(new pg_stat_t(a)); +} + +bool operator==(const pg_stat_t& l, const pg_stat_t& r) +{ + return + l.version == r.version && + l.reported_seq == r.reported_seq && + l.reported_epoch == 
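// [editor's note] Standalone illustrative sketch, not part of this patch: pg_stat_t's
// state grew past 32 bits, so encode() keeps emitting the low 32 bits where older peers
// expect them and appends the upper half ("top_state") only for struct_v >= 24; decode()
// above stitches the halves back together. The helpers below just show the bit surgery:
#include <cstdint>

inline void split_state(uint64_t state, uint32_t& low, uint32_t& high) {
  low  = static_cast<uint32_t>(state);        // what pre-v24 peers decode
  high = static_cast<uint32_t>(state >> 32);  // appended for newer peers
}

inline uint64_t join_state(uint32_t low, uint32_t high) {
  return static_cast<uint64_t>(low) | (static_cast<uint64_t>(high) << 32);
}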
r.reported_epoch && + l.state == r.state && + l.last_fresh == r.last_fresh && + l.last_change == r.last_change && + l.last_active == r.last_active && + l.last_peered == r.last_peered && + l.last_clean == r.last_clean && + l.last_unstale == r.last_unstale && + l.last_undegraded == r.last_undegraded && + l.last_fullsized == r.last_fullsized && + l.log_start == r.log_start && + l.ondisk_log_start == r.ondisk_log_start && + l.created == r.created && + l.last_epoch_clean == r.last_epoch_clean && + l.parent == r.parent && + l.parent_split_bits == r.parent_split_bits && + l.last_scrub == r.last_scrub && + l.last_deep_scrub == r.last_deep_scrub && + l.last_scrub_stamp == r.last_scrub_stamp && + l.last_deep_scrub_stamp == r.last_deep_scrub_stamp && + l.last_clean_scrub_stamp == r.last_clean_scrub_stamp && + l.stats == r.stats && + l.stats_invalid == r.stats_invalid && + l.log_size == r.log_size && + l.ondisk_log_size == r.ondisk_log_size && + l.up == r.up && + l.acting == r.acting && + l.avail_no_missing == r.avail_no_missing && + l.object_location_counts == r.object_location_counts && + l.mapping_epoch == r.mapping_epoch && + l.blocked_by == r.blocked_by && + l.last_became_active == r.last_became_active && + l.last_became_peered == r.last_became_peered && + l.dirty_stats_invalid == r.dirty_stats_invalid && + l.omap_stats_invalid == r.omap_stats_invalid && + l.hitset_stats_invalid == r.hitset_stats_invalid && + l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid && + l.up_primary == r.up_primary && + l.acting_primary == r.acting_primary && + l.pin_stats_invalid == r.pin_stats_invalid && + l.manifest_stats_invalid == r.manifest_stats_invalid && + l.purged_snaps == r.purged_snaps && + l.snaptrimq_len == r.snaptrimq_len; +} + +// -- store_statfs_t -- + +bool store_statfs_t::operator==(const store_statfs_t& other) const +{ + return total == other.total + && available == other.available + && allocated == other.allocated + && internally_reserved == other.internally_reserved + && data_stored == other.data_stored + && data_compressed == other.data_compressed + && data_compressed_allocated == other.data_compressed_allocated + && data_compressed_original == other.data_compressed_original + && omap_allocated == other.omap_allocated + && internal_metadata == other.internal_metadata; +} + +void store_statfs_t::dump(Formatter *f) const +{ + f->dump_int("total", total); + f->dump_int("available", available); + f->dump_int("internally_reserved", internally_reserved); + f->dump_int("allocated", allocated); + f->dump_int("data_stored", data_stored); + f->dump_int("data_compressed", data_compressed); + f->dump_int("data_compressed_allocated", data_compressed_allocated); + f->dump_int("data_compressed_original", data_compressed_original); + f->dump_int("omap_allocated", omap_allocated); + f->dump_int("internal_metadata", internal_metadata); +} + +ostream& operator<<(ostream& out, const store_statfs_t &s) +{ + out << std::hex + << "store_statfs(0x" << s.available + << "/0x" << s.internally_reserved + << "/0x" << s.total + << ", data 0x" << s.data_stored + << "/0x" << s.allocated + << ", compress 0x" << s.data_compressed + << "/0x" << s.data_compressed_allocated + << "/0x" << s.data_compressed_original + << ", omap 0x" << s.omap_allocated + << ", meta 0x" << s.internal_metadata + << std::dec + << ")"; + return out; +} + +void store_statfs_t::generate_test_instances(list& o) +{ + store_statfs_t a; + o.push_back(new store_statfs_t(a)); + a.total = 234; + a.available = 123; + a.internally_reserved = 33; + 
a.allocated = 32; + a.data_stored = 44; + a.data_compressed = 21; + a.data_compressed_allocated = 12; + a.data_compressed_original = 13; + a.omap_allocated = 14; + a.internal_metadata = 15; + o.push_back(new store_statfs_t(a)); +} + +// -- pool_stat_t -- + +void pool_stat_t::dump(Formatter *f) const +{ + stats.dump(f); + f->open_object_section("store_stats"); + store_stats.dump(f); + f->close_section(); + f->dump_int("log_size", log_size); + f->dump_int("ondisk_log_size", ondisk_log_size); + f->dump_int("up", up); + f->dump_int("acting", acting); + f->dump_int("num_store_stats", num_store_stats); +} + +void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDENC) == 0) { + __u8 v = 4; + encode(v, bl); + encode(stats, bl); + encode(log_size, bl); + encode(ondisk_log_size, bl); + return; + } + + ENCODE_START(7, 5, bl); + encode(stats, bl); + encode(log_size, bl); + encode(ondisk_log_size, bl); + encode(up, bl); + encode(acting, bl); + encode(store_stats, bl); + encode(num_store_stats, bl); + ENCODE_FINISH(bl); +} + +void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); + if (struct_v >= 4) { + decode(stats, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + if (struct_v >= 6) { + decode(up, bl); + decode(acting, bl); + } else { + up = 0; + acting = 0; + } + if (struct_v >= 7) { + decode(store_stats, bl); + decode(num_store_stats, bl); + } else { + store_stats.reset(); + num_store_stats = 0; + } + + } else { + decode(stats.sum.num_bytes, bl); + uint64_t num_kb; + decode(num_kb, bl); + decode(stats.sum.num_objects, bl); + decode(stats.sum.num_object_clones, bl); + decode(stats.sum.num_object_copies, bl); + decode(stats.sum.num_objects_missing_on_primary, bl); + decode(stats.sum.num_objects_degraded, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + if (struct_v >= 2) { + decode(stats.sum.num_rd, bl); + decode(stats.sum.num_rd_kb, bl); + decode(stats.sum.num_wr, bl); + decode(stats.sum.num_wr_kb, bl); + } + if (struct_v >= 3) { + decode(stats.sum.num_objects_unfound, bl); + } + } + DECODE_FINISH(bl); +} + +void pool_stat_t::generate_test_instances(list& o) +{ + pool_stat_t a; + o.push_back(new pool_stat_t(a)); + + list l; + object_stat_collection_t::generate_test_instances(l); + list ll; + store_statfs_t::generate_test_instances(ll); + a.stats = *l.back(); + a.store_stats = *ll.back(); + a.log_size = 123; + a.ondisk_log_size = 456; + a.acting = 3; + a.up = 4; + a.num_store_stats = 1; + o.push_back(new pool_stat_t(a)); +} + + +// -- pg_history_t -- + +void pg_history_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(10, 4, bl); + encode(epoch_created, bl); + encode(last_epoch_started, bl); + encode(last_epoch_clean, bl); + encode(last_epoch_split, bl); + encode(same_interval_since, bl); + encode(same_up_since, bl); + encode(same_primary_since, bl); + encode(last_scrub, bl); + encode(last_scrub_stamp, bl); + encode(last_deep_scrub, bl); + encode(last_deep_scrub_stamp, bl); + encode(last_clean_scrub_stamp, bl); + encode(last_epoch_marked_full, bl); + encode(last_interval_started, bl); + encode(last_interval_clean, bl); + encode(epoch_pool_created, bl); + encode(prior_readable_until_ub, bl); + ENCODE_FINISH(bl); +} + +void pg_history_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl); + decode(epoch_created, bl); + decode(last_epoch_started, bl); + if (struct_v >= 3) + 
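// [editor's note] Standalone illustrative sketch, not part of this patch:
// pool_stat_t::encode() above picks a wire layout from the peer's feature bits -- without
// CEPH_FEATURE_OSDENC it falls back to the bare v=4 layout with no versioned envelope or
// newer fields. The feature constant and field set below are invented; only the branching
// pattern is the point.
#include <cstdint>
#include <vector>

constexpr uint64_t FEATURE_NEW_ENC = 1ull << 13;   // assumed bit, for illustration only

inline void encode_stats(uint64_t peer_features, uint32_t base_field,
                         uint32_t newer_field, std::vector<uint32_t>& out) {
  if ((peer_features & FEATURE_NEW_ENC) == 0) {
    out.push_back(4);            // legacy version byte; old field set only
    out.push_back(base_field);
    return;
  }
  out.push_back(7);              // current version; extended field set
  out.push_back(base_field);
  out.push_back(newer_field);
}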
decode(last_epoch_clean, bl); + else + last_epoch_clean = last_epoch_started; // careful, it's a lie! + decode(last_epoch_split, bl); + decode(same_interval_since, bl); + decode(same_up_since, bl); + decode(same_primary_since, bl); + if (struct_v >= 2) { + decode(last_scrub, bl); + decode(last_scrub_stamp, bl); + } + if (struct_v >= 5) { + decode(last_deep_scrub, bl); + decode(last_deep_scrub_stamp, bl); + } + if (struct_v >= 6) { + decode(last_clean_scrub_stamp, bl); + } + if (struct_v >= 7) { + decode(last_epoch_marked_full, bl); + } + if (struct_v >= 8) { + decode(last_interval_started, bl); + decode(last_interval_clean, bl); + } else { + if (last_epoch_started >= same_interval_since) { + last_interval_started = same_interval_since; + } else { + last_interval_started = last_epoch_started; // best guess + } + if (last_epoch_clean >= same_interval_since) { + last_interval_clean = same_interval_since; + } else { + last_interval_clean = last_epoch_clean; // best guess + } + } + if (struct_v >= 9) { + decode(epoch_pool_created, bl); + } else { + epoch_pool_created = epoch_created; + } + if (struct_v >= 10) { + decode(prior_readable_until_ub, bl); + } + DECODE_FINISH(bl); +} + +void pg_history_t::dump(Formatter *f) const +{ + f->dump_int("epoch_created", epoch_created); + f->dump_int("epoch_pool_created", epoch_pool_created); + f->dump_int("last_epoch_started", last_epoch_started); + f->dump_int("last_interval_started", last_interval_started); + f->dump_int("last_epoch_clean", last_epoch_clean); + f->dump_int("last_interval_clean", last_interval_clean); + f->dump_int("last_epoch_split", last_epoch_split); + f->dump_int("last_epoch_marked_full", last_epoch_marked_full); + f->dump_int("same_up_since", same_up_since); + f->dump_int("same_interval_since", same_interval_since); + f->dump_int("same_primary_since", same_primary_since); + f->dump_stream("last_scrub") << last_scrub; + f->dump_stream("last_scrub_stamp") << last_scrub_stamp; + f->dump_stream("last_deep_scrub") << last_deep_scrub; + f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp; + f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp; + f->dump_float( + "prior_readable_until_ub", + std::chrono::duration(prior_readable_until_ub).count()); +} + +void pg_history_t::generate_test_instances(list& o) +{ + o.push_back(new pg_history_t); + o.push_back(new pg_history_t); + o.back()->epoch_created = 1; + o.back()->epoch_pool_created = 1; + o.back()->last_epoch_started = 2; + o.back()->last_interval_started = 2; + o.back()->last_epoch_clean = 3; + o.back()->last_interval_clean = 2; + o.back()->last_epoch_split = 4; + o.back()->prior_readable_until_ub = make_timespan(3.1415); + o.back()->same_up_since = 5; + o.back()->same_interval_since = 6; + o.back()->same_primary_since = 7; + o.back()->last_scrub = eversion_t(8, 9); + o.back()->last_scrub_stamp = utime_t(10, 11); + o.back()->last_deep_scrub = eversion_t(12, 13); + o.back()->last_deep_scrub_stamp = utime_t(14, 15); + o.back()->last_clean_scrub_stamp = utime_t(16, 17); + o.back()->last_epoch_marked_full = 18; +} + + +// -- pg_info_t -- + +void pg_info_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(32, 26, bl); + encode(pgid.pgid, bl); + encode(last_update, bl); + encode(last_complete, bl); + encode(log_tail, bl); + encode(hobject_t(), bl); // old (nibblewise) last_backfill + encode(stats, bl); + history.encode(bl); + encode(purged_snaps, bl); + encode(last_epoch_started, bl); + encode(last_user_version, bl); + encode(hit_set, bl); + encode(pgid.shard, 
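// [editor's note] Standalone illustrative sketch, not part of this patch:
// pg_history_t::dump() above reports prior_readable_until_ub as floating-point seconds.
// Assuming the timespan is a std::chrono duration (nanoseconds here), the conversion is a
// cast to a double-based seconds duration before calling count():
#include <chrono>

inline double to_seconds(std::chrono::nanoseconds ts) {
  return std::chrono::duration<double>(ts).count();   // 1'500'000'000 ns -> 1.5
}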
bl); + encode(last_backfill, bl); + encode(true, bl); // was last_backfill_bitwise + encode(last_interval_started, bl); + ENCODE_FINISH(bl); +} + +void pg_info_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(32, bl); + decode(pgid.pgid, bl); + decode(last_update, bl); + decode(last_complete, bl); + decode(log_tail, bl); + { + hobject_t old_last_backfill; + decode(old_last_backfill, bl); + } + decode(stats, bl); + history.decode(bl); + decode(purged_snaps, bl); + decode(last_epoch_started, bl); + decode(last_user_version, bl); + decode(hit_set, bl); + decode(pgid.shard, bl); + decode(last_backfill, bl); + { + bool last_backfill_bitwise; + decode(last_backfill_bitwise, bl); + // note: we may see a false value here since the default value for + // the member was false, so it often didn't get set to true until + // peering progressed. + } + if (struct_v >= 32) { + decode(last_interval_started, bl); + } else { + last_interval_started = last_epoch_started; + } + DECODE_FINISH(bl); +} + +// -- pg_info_t -- + +void pg_info_t::dump(Formatter *f) const +{ + f->dump_stream("pgid") << pgid; + f->dump_stream("last_update") << last_update; + f->dump_stream("last_complete") << last_complete; + f->dump_stream("log_tail") << log_tail; + f->dump_int("last_user_version", last_user_version); + f->dump_stream("last_backfill") << last_backfill; + f->open_array_section("purged_snaps"); + for (interval_set::const_iterator i=purged_snaps.begin(); + i != purged_snaps.end(); + ++i) { + f->open_object_section("purged_snap_interval"); + f->dump_stream("start") << i.get_start(); + f->dump_stream("length") << i.get_len(); + f->close_section(); + } + f->close_section(); + f->open_object_section("history"); + history.dump(f); + f->close_section(); + f->open_object_section("stats"); + stats.dump(f); + f->close_section(); + + f->dump_int("empty", is_empty()); + f->dump_int("dne", dne()); + f->dump_int("incomplete", is_incomplete()); + f->dump_int("last_epoch_started", last_epoch_started); + + f->open_object_section("hit_set_history"); + hit_set.dump(f); + f->close_section(); +} + +void pg_info_t::generate_test_instances(list& o) +{ + o.push_back(new pg_info_t); + o.push_back(new pg_info_t); + list h; + pg_history_t::generate_test_instances(h); + o.back()->history = *h.back(); + o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD); + o.back()->last_update = eversion_t(3, 4); + o.back()->last_complete = eversion_t(5, 6); + o.back()->last_user_version = 2; + o.back()->log_tail = eversion_t(7, 8); + o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, ""); + { + list s; + pg_stat_t::generate_test_instances(s); + o.back()->stats = *s.back(); + } + { + list s; + pg_hit_set_history_t::generate_test_instances(s); + o.back()->hit_set = *s.back(); + } +} + +// -- pg_notify_t -- +void pg_notify_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(3, 2, bl); + encode(query_epoch, bl); + encode(epoch_sent, bl); + encode(info, bl); + encode(to, bl); + encode(from, bl); + encode(past_intervals, bl); + ENCODE_FINISH(bl); +} + +void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(3, bl); + decode(query_epoch, bl); + decode(epoch_sent, bl); + decode(info, bl); + decode(to, bl); + decode(from, bl); + if (struct_v >= 3) { + decode(past_intervals, bl); + } + DECODE_FINISH(bl); +} + +void pg_notify_t::dump(Formatter *f) const +{ + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_unsigned("query_epoch", query_epoch); + f->dump_unsigned("epoch_sent", 
epoch_sent); + { + f->open_object_section("info"); + info.dump(f); + f->close_section(); + } + f->dump_object("past_intervals", past_intervals); +} + +void pg_notify_t::generate_test_instances(list& o) +{ + o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, + pg_info_t(), PastIntervals())); + o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, + pg_info_t(), PastIntervals())); +} + +ostream &operator<<(ostream &lhs, const pg_notify_t ¬ify) +{ + lhs << "(query:" << notify.query_epoch + << " sent:" << notify.epoch_sent + << " " << notify.info; + if (notify.from != shard_id_t::NO_SHARD || + notify.to != shard_id_t::NO_SHARD) + lhs << " " << (unsigned)notify.from + << "->" << (unsigned)notify.to; + lhs << " " << notify.past_intervals; + return lhs << ")"; +} + +// -- pg_interval_t -- + +void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(4, 2, bl); + encode(first, bl); + encode(last, bl); + encode(up, bl); + encode(acting, bl); + encode(maybe_went_rw, bl); + encode(primary, bl); + encode(up_primary, bl); + ENCODE_FINISH(bl); +} + +void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl); + decode(first, bl); + decode(last, bl); + decode(up, bl); + decode(acting, bl); + decode(maybe_went_rw, bl); + if (struct_v >= 3) { + decode(primary, bl); + } else { + if (acting.size()) + primary = acting[0]; + } + if (struct_v >= 4) { + decode(up_primary, bl); + } else { + if (up.size()) + up_primary = up[0]; + } + DECODE_FINISH(bl); +} + +void PastIntervals::pg_interval_t::dump(Formatter *f) const +{ + f->dump_unsigned("first", first); + f->dump_unsigned("last", last); + f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0); + f->open_array_section("up"); + for (auto p = up.cbegin(); p != up.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (auto p = acting.cbegin(); p != acting.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("primary", primary); + f->dump_int("up_primary", up_primary); +} + +void PastIntervals::pg_interval_t::generate_test_instances(list& o) +{ + o.push_back(new pg_interval_t); + o.push_back(new pg_interval_t); + o.back()->up.push_back(1); + o.back()->acting.push_back(2); + o.back()->acting.push_back(3); + o.back()->first = 4; + o.back()->last = 5; + o.back()->maybe_went_rw = true; +} + +WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t) + + +/** + * pi_compact_rep + * + * PastIntervals only needs to be able to answer two questions: + * 1) Where should the primary look for unfound objects? + * 2) List a set of subsets of the OSDs such that contacting at least + * one from each subset guarantees we speak to at least one witness + * of any completed write. + * + * Crucially, 2) does not require keeping *all* past intervals. Certainly, + * we don't need to keep any where maybe_went_rw would be false. We also + * needn't keep two intervals where the actingset in one is a subset + * of the other (only need to keep the smaller of the two sets). In order + * to accurately trim the set of intervals as last_epoch_started changes + * without rebuilding the set from scratch, we'll retain the larger set + * if it in an older interval. 
+ */ +struct compact_interval_t { + epoch_t first; + epoch_t last; + set acting; + bool supersedes(const compact_interval_t &other) { + for (auto &&i: acting) { + if (!other.acting.count(i)) + return false; + } + return true; + } + void dump(Formatter *f) const { + f->open_object_section("compact_interval_t"); + f->dump_stream("first") << first; + f->dump_stream("last") << last; + f->dump_stream("acting") << acting; + f->close_section(); + } + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(last, bl); + encode(acting, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(1, bl); + decode(first, bl); + decode(last, bl); + decode(acting, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list & o) { + /* Not going to be used, we'll generate pi_compact_rep directly */ + } +}; +ostream &operator<<(ostream &o, const compact_interval_t &rhs) +{ + return o << "([" << rhs.first << "," << rhs.last + << "] acting " << rhs.acting << ")"; +} +WRITE_CLASS_ENCODER(compact_interval_t) + +class pi_compact_rep : public PastIntervals::interval_rep { + epoch_t first = 0; + epoch_t last = 0; // inclusive + set all_participants; + list intervals; + pi_compact_rep( + bool ec_pool, + std::list &&intervals) { + for (auto &&i: intervals) + add_interval(ec_pool, i); + } +public: + pi_compact_rep() = default; + pi_compact_rep(const pi_compact_rep &) = default; + pi_compact_rep(pi_compact_rep &&) = default; + pi_compact_rep &operator=(const pi_compact_rep &) = default; + pi_compact_rep &operator=(pi_compact_rep &&) = default; + + size_t size() const override { return intervals.size(); } + bool empty() const override { + return first > last || (first == 0 && last == 0); + } + void clear() override { + *this = pi_compact_rep(); + } + pair get_bounds() const override { + return make_pair(first, last + 1); + } + void adjust_start_backwards(epoch_t last_epoch_clean) override { + first = last_epoch_clean; + } + + set get_all_participants( + bool ec_pool) const override { + return all_participants; + } + void add_interval( + bool ec_pool, const PastIntervals::pg_interval_t &interval) override { + if (first == 0) + first = interval.first; + ceph_assert(interval.last > last); + last = interval.last; + set acting; + for (unsigned i = 0; i < interval.acting.size(); ++i) { + if (interval.acting[i] == CRUSH_ITEM_NONE) + continue; + acting.insert( + pg_shard_t( + interval.acting[i], + ec_pool ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + all_participants.insert(acting.begin(), acting.end()); + if (!interval.maybe_went_rw) + return; + intervals.push_back( + compact_interval_t{interval.first, interval.last, acting}); + auto plast = intervals.end(); + --plast; + for (auto cur = intervals.begin(); cur != plast; ) { + if (plast->supersedes(*cur)) { + intervals.erase(cur++); + } else { + ++cur; + } + } + } + unique_ptr clone() const override { + return unique_ptr(new pi_compact_rep(*this)); + } + ostream &print(ostream &out) const override { + return out << "([" << first << "," << last + << "] all_participants=" << all_participants + << " intervals=" << intervals << ")"; + } + void encode(ceph::buffer::list &bl) const override { + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(last, bl); + encode(all_participants, bl); + encode(intervals, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) override { + DECODE_START(1, bl); + decode(first, bl); + decode(last, bl); + decode(all_participants, bl); + decode(intervals, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const override { + f->open_object_section("PastIntervals::compact_rep"); + f->dump_stream("first") << first; + f->dump_stream("last") << last; + f->open_array_section("all_participants"); + for (auto& i : all_participants) { + f->dump_object("pg_shard", i); + } + f->close_section(); + f->open_array_section("intervals"); + for (auto &&i: intervals) { + i.dump(f); + } + f->close_section(); + f->close_section(); + } + static void generate_test_instances(list &o) { + using ival = PastIntervals::pg_interval_t; + using ivallst = std::list; + o.push_back( + new pi_compact_rep( + true, ivallst + { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0} + , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1} + , ival{{ 2}, { 2}, 31, 35, false, 2, 2} + , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0} + })); + o.push_back( + new pi_compact_rep( + false, ivallst + { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0} + , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1} + , ival{{ 2}, { 2}, 31, 35, false, 2, 2} + , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0} + })); + o.push_back( + new pi_compact_rep( + true, ivallst + { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1} + , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0} + , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2} + , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0} + })); + } + void iterate_mayberw_back_to( + epoch_t les, + std::function &)> &&f) const override { + for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) { + if (i->last < les) + break; + f(i->first, i->acting); + } + } + virtual ~pi_compact_rep() override {} +}; +WRITE_CLASS_ENCODER(pi_compact_rep) + +PastIntervals::PastIntervals() +{ + past_intervals.reset(new pi_compact_rep); +} + +PastIntervals::PastIntervals(const PastIntervals &rhs) + : past_intervals(rhs.past_intervals ? 
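// [editor's note] Standalone illustrative sketch, not part of this patch: supersedes()
// above is a subset test -- interval A supersedes B when every OSD in A's acting set also
// appears in B's, so probing A's (smaller) set already covers B -- and add_interval()
// drops any older interval superseded by the one just appended. A reduced model over
// plain std types (mini_interval is invented):
#include <algorithm>
#include <iterator>
#include <list>
#include <set>

struct mini_interval {
  unsigned first = 0, last = 0;
  std::set<int> acting;
  bool supersedes(const mini_interval& other) const {
    // true when our acting set is a subset of the other's
    return std::includes(other.acting.begin(), other.acting.end(),
                         acting.begin(), acting.end());
  }
};

inline void add_and_trim(std::list<mini_interval>& intervals, mini_interval next) {
  intervals.push_back(std::move(next));
  const mini_interval& newest = intervals.back();
  for (auto it = intervals.begin(); std::next(it) != intervals.end(); ) {
    if (newest.supersedes(*it))
      it = intervals.erase(it);    // older, larger acting set adds no new witnesses
    else
      ++it;
  }
}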
+ rhs.past_intervals->clone() : + nullptr) {} + +PastIntervals &PastIntervals::operator=(const PastIntervals &rhs) +{ + PastIntervals other(rhs); + swap(other); + return *this; +} + +ostream& operator<<(ostream& out, const PastIntervals &i) +{ + if (i.past_intervals) { + return i.past_intervals->print(out); + } else { + return out << "(empty)"; + } +} + +ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i) +{ + return out << "PriorSet(" + << "ec_pool: " << i.ec_pool + << ", probe: " << i.probe + << ", down: " << i.down + << ", blocked_by: " << i.blocked_by + << ", pg_down: " << i.pg_down + << ")"; +} + +void PastIntervals::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + __u8 type = 0; + decode(type, bl); + switch (type) { + case 0: + break; + case 1: + ceph_abort_msg("pi_simple_rep support removed post-luminous"); + break; + case 2: + past_intervals.reset(new pi_compact_rep); + past_intervals->decode(bl); + break; + } + DECODE_FINISH(bl); +} + +void PastIntervals::generate_test_instances(list &o) +{ + { + list compact; + pi_compact_rep::generate_test_instances(compact); + for (auto &&i: compact) { + // takes ownership of contents + o.push_back(new PastIntervals(i)); + } + } + return; +} + +bool PastIntervals::is_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector &old_acting, + const vector &new_acting, + int old_up_primary, + int new_up_primary, + const vector &old_up, + const vector &new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + unsigned old_pg_num, + unsigned new_pg_num, + unsigned old_pg_num_pending, + unsigned new_pg_num_pending, + bool old_sort_bitwise, + bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, + uint32_t old_crush_count, + uint32_t new_crush_count, + uint32_t old_crush_target, + uint32_t new_crush_target, + uint32_t old_crush_barrier, + uint32_t new_crush_barrier, + int32_t old_crush_member, + int32_t new_crush_member, + pg_t pgid) { + return old_acting_primary != new_acting_primary || + new_acting != old_acting || + old_up_primary != new_up_primary || + new_up != old_up || + old_min_size != new_min_size || + old_size != new_size || + pgid.is_split(old_pg_num, new_pg_num, 0) || + // (is or was) pre-merge source + pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) || + pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) || + // merge source + pgid.is_merge_source(old_pg_num, new_pg_num, 0) || + // (is or was) pre-merge target + pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) || + pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) || + // merge target + pgid.is_merge_target(old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise || + old_recovery_deletes != new_recovery_deletes || + old_crush_count != new_crush_count || + old_crush_target != new_crush_target || + old_crush_barrier != new_crush_barrier || + old_crush_member != new_crush_member; +} + +bool PastIntervals::is_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector &old_acting, + const vector &new_acting, + int old_up_primary, + int new_up_primary, + const vector &old_up, + const vector &new_up, + const OSDMap *osdmap, + const OSDMap *lastmap, + pg_t pgid) +{ + const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool()); + if (!plast) { + return false; // after pool is deleted there are no more interval changes + } + const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool()); + if (!pi) { + return 
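// [editor's note] Standalone illustrative sketch, not part of this patch:
// is_new_interval() above is a pure predicate -- a fresh past interval begins whenever
// anything that could change which OSDs may have served writes changes (acting/up sets or
// primaries, pool size/min_size, PG splits or merges, sort order, recovery_deletes, the
// stretch-mode crush settings). A reduced model with just a few of those inputs:
#include <vector>

struct mapping_snapshot {
  int acting_primary = -1, up_primary = -1;
  std::vector<int> acting, up;
  unsigned size = 0, min_size = 0, pg_num = 0;
};

inline bool mapping_changed(const mapping_snapshot& o, const mapping_snapshot& n) {
  return o.acting_primary != n.acting_primary ||
         o.acting != n.acting ||
         o.up_primary != n.up_primary ||
         o.up != n.up ||
         o.size != n.size ||
         o.min_size != n.min_size ||
         o.pg_num != n.pg_num;   // the real check also covers merges and flag flips
}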
true; // pool was deleted this epoch -> (final!) interval change + } + return + is_new_interval(old_acting_primary, + new_acting_primary, + old_acting, + new_acting, + old_up_primary, + new_up_primary, + old_up, + new_up, + plast->size, + pi->size, + plast->min_size, + pi->min_size, + plast->get_pg_num(), + pi->get_pg_num(), + plast->get_pg_num_pending(), + pi->get_pg_num_pending(), + lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE), + osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE), + lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), + osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), + plast->peering_crush_bucket_count, pi->peering_crush_bucket_count, + plast->peering_crush_bucket_target, pi->peering_crush_bucket_target, + plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier, + plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member, + pgid); +} + +bool PastIntervals::check_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector &old_acting, + const vector &new_acting, + int old_up_primary, + int new_up_primary, + const vector &old_up, + const vector &new_up, + epoch_t same_interval_since, + epoch_t last_epoch_clean, + const OSDMap *osdmap, + const OSDMap *lastmap, + pg_t pgid, + const IsPGRecoverablePredicate &could_have_gone_active, + PastIntervals *past_intervals, + std::ostream *out) +{ + /* + * We have to be careful to gracefully deal with situations like + * so. Say we have a power outage or something that takes out both + * OSDs, but the monitor doesn't mark them down in the same epoch. + * The history may look like + * + * 1: A B + * 2: B + * 3: let's say B dies for good, too (say, from the power spike) + * 4: A + * + * which makes it look like B may have applied updates to the PG + * that we need in order to proceed. This sucks... + * + * To minimize the risk of this happening, we CANNOT go active if + * _any_ OSDs in the prior set are down until we send an MOSDAlive + * to the monitor such that the OSDMap sets osd_up_thru to an epoch. + * Then, we have something like + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: + * 4: A + * + * -> we can ignore B, bc it couldn't have gone active (up_thru still 0). + * + * or, + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: B up_thru[B]=2 + * 4: + * 5: A + * + * -> we must wait for B, bc it was alive through 2, and could have + * written to the pg. + * + * If B is really dead, then an administrator will need to manually + * intervene by marking the OSD as "lost." + */ + + // remember past interval + // NOTE: a change in the up set primary triggers an interval + // change, even though the interval members in the pg_interval_t + // do not change. 
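// [editor's note] Standalone illustrative sketch, not part of this patch: the comment
// above boils down to one check per interval, implemented just below -- the interval
// "maybe went rw" if the primary's up_from..up_thru window reaches the interval's first
// epoch, or if last_epoch_clean falls inside the interval (a clean PG implies writes
// could have completed). Distilled:
using epoch_t = unsigned;   // stand-in for the codebase's epoch type

inline bool interval_maybe_went_rw(epoch_t first, epoch_t last,
                                   epoch_t primary_up_from, epoch_t primary_up_thru,
                                   epoch_t last_epoch_clean) {
  if (primary_up_from <= first && primary_up_thru >= first)
    return true;   // primary was up at the start and its up_thru covers the interval start
  if (last_epoch_clean >= first && last_epoch_clean <= last)
    return true;   // the PG went clean inside this interval
  return false;
}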
+ ceph_assert(past_intervals); + ceph_assert(past_intervals->past_intervals); + if (is_new_interval( + old_acting_primary, + new_acting_primary, + old_acting, + new_acting, + old_up_primary, + new_up_primary, + old_up, + new_up, + osdmap, + lastmap, + pgid)) { + pg_interval_t i; + i.first = same_interval_since; + i.last = osdmap->get_epoch() - 1; + ceph_assert(i.first <= i.last); + i.acting = old_acting; + i.up = old_up; + i.primary = old_acting_primary; + i.up_primary = old_up_primary; + + unsigned num_acting = 0; + for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p) + if (*p != CRUSH_ITEM_NONE) + ++num_acting; + + ceph_assert(lastmap->get_pools().count(pgid.pool())); + const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second; + set old_acting_shards; + old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards); + + if (num_acting && + i.primary != -1 && + num_acting >= old_pg_pool.min_size && + (!old_pg_pool.is_stretch_pool() || + old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) && + could_have_gone_active(old_acting_shards)) { + if (out) + *out << __func__ << " " << i + << " up_thru " << lastmap->get_up_thru(i.primary) + << " up_from " << lastmap->get_up_from(i.primary) + << " last_epoch_clean " << last_epoch_clean; + if (lastmap->get_up_thru(i.primary) >= i.first && + lastmap->get_up_from(i.primary) <= i.first) { + i.maybe_went_rw = true; + if (out) + *out << " " << i + << " : primary up " << lastmap->get_up_from(i.primary) + << "-" << lastmap->get_up_thru(i.primary) + << " includes interval" + << std::endl; + } else if (last_epoch_clean >= i.first && + last_epoch_clean <= i.last) { + // If the last_epoch_clean is included in this interval, then + // the pg must have been rw (for recovery to have completed). + // This is important because we won't know the _real_ + // first_epoch because we stop at last_epoch_clean, and we + // don't want the oldest interval to randomly have + // maybe_went_rw false depending on the relative up_thru vs + // last_epoch_clean timing. + i.maybe_went_rw = true; + if (out) + *out << " " << i + << " : includes last_epoch_clean " << last_epoch_clean + << " and presumed to have been rw" + << std::endl; + } else { + i.maybe_went_rw = false; + if (out) + *out << " " << i + << " : primary up " << lastmap->get_up_from(i.primary) + << "-" << lastmap->get_up_thru(i.primary) + << " does not include interval" + << std::endl; + } + } else { + i.maybe_went_rw = false; + if (out) + *out << __func__ << " " << i << " : acting set is too small" << std::endl; + } + past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i); + return true; + } else { + return false; + } +} + +// true if the given map affects the prior set +bool PastIntervals::PriorSet::affected_by_map( + const OSDMap &osdmap, + const DoutPrefixProvider *dpp) const +{ + for (auto p = probe.begin(); p != probe.end(); ++p) { + int o = p->osd; + + // did someone in the prior set go down? + if (osdmap.is_down(o) && down.count(o) == 0) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl; + return true; + } + + // did a down osd in cur get (re)marked as lost? + auto r = blocked_by.find(o); + if (r != blocked_by.end()) { + if (!osdmap.exists(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl; + return true; + } + if (osdmap.get_info(o).lost_at != r->second) { + ldpp_dout(dpp, 10) << "affected_by_map osd." 
<< o << " (re)marked as lost" << dendl; + return true; + } + } + } + + // did someone in the prior down set go up? + for (auto p = down.cbegin(); p != down.cend(); ++p) { + int o = *p; + + if (osdmap.is_up(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl; + return true; + } + + // did someone in the prior set get lost or destroyed? + if (!osdmap.exists(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl; + return true; + } + // did a down osd in down get (re)marked as lost? + auto r = blocked_by.find(o); + if (r != blocked_by.end()) { + if (osdmap.get_info(o).lost_at != r->second) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl; + return true; + } + } + } + + return false; +} + +ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i) +{ + out << "interval(" << i.first << "-" << i.last + << " up " << i.up << "(" << i.up_primary << ")" + << " acting " << i.acting << "(" << i.primary << ")"; + if (i.maybe_went_rw) + out << " maybe_went_rw"; + out << ")"; + return out; +} + + + +// -- pg_query_t -- + +void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const { + ENCODE_START(3, 3, bl); + encode(type, bl); + encode(since, bl); + history.encode(bl); + encode(epoch_sent, bl); + encode(to, bl); + encode(from, bl); + ENCODE_FINISH(bl); +} + +void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(3, bl); + decode(type, bl); + decode(since, bl); + history.decode(bl); + decode(epoch_sent, bl); + decode(to, bl); + decode(from, bl); + DECODE_FINISH(bl); +} + +void pg_query_t::dump(Formatter *f) const +{ + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_string("type", get_type_name()); + f->dump_stream("since") << since; + f->dump_stream("epoch_sent") << epoch_sent; + f->open_object_section("history"); + history.dump(f); + f->close_section(); +} +void pg_query_t::generate_test_instances(list& o) +{ + o.push_back(new pg_query_t()); + list h; + pg_history_t::generate_test_instances(h); + o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0), + eversion_t(4, 5), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::FULLLOG, + shard_id_t::NO_SHARD, shard_id_t::NO_SHARD, + *h.back(), 5)); +} + +// -- pg_lease_t -- + +void pg_lease_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(readable_until, bl); + encode(readable_until_ub, bl); + encode(interval, bl); + ENCODE_FINISH(bl); +} + +void pg_lease_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(readable_until, p); + decode(readable_until_ub, p); + decode(interval, p); + DECODE_FINISH(p); +} + +void pg_lease_t::dump(Formatter *f) const +{ + f->dump_stream("readable_until") << readable_until; + f->dump_stream("readable_until_ub") << readable_until_ub; + f->dump_stream("interval") << interval; +} + +void pg_lease_t::generate_test_instances(std::list& o) +{ + o.push_back(new pg_lease_t()); + o.push_back(new pg_lease_t()); + o.back()->readable_until = make_timespan(1.5); + o.back()->readable_until_ub = make_timespan(3.4); + o.back()->interval = make_timespan(1.0); +} + +// -- pg_lease_ack_t -- + +void pg_lease_ack_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(readable_until_ub, bl); + ENCODE_FINISH(bl); +} + 
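// [editor's note] Standalone illustrative sketch, not part of this patch:
// affected_by_map() above answers "does this newer OSDMap invalidate the computed prior
// set?" -- yes if a probed OSD went down, a previously-down OSD came back up or no longer
// exists, or an OSD we are blocked on picked up a different lost_at stamp. A reduced
// model over plain containers (osd_state and the parameter shapes are invented):
#include <map>
#include <set>

enum class osd_state { up, down, gone };

inline bool prior_set_affected(
    const std::set<int>& probe,                        // OSDs we plan to query
    const std::set<int>& down,                         // OSDs already known down
    const std::map<int, unsigned>& blocked_by,         // osd -> lost_at we relied on
    const std::map<int, osd_state>& new_map,           // state in the newer map
    const std::map<int, unsigned>& new_lost_at) {
  auto state_of = [&](int o) {
    auto it = new_map.find(o);
    return it == new_map.end() ? osd_state::gone : it->second;
  };
  for (int o : probe)
    if (state_of(o) == osd_state::down && !down.count(o))
      return true;                                     // a probe target just went down
  for (int o : down)
    if (state_of(o) != osd_state::down)
      return true;                                     // came back up or disappeared
  for (const auto& [o, lost_at] : blocked_by) {
    auto it = new_lost_at.find(o);
    if (it != new_lost_at.end() && it->second != lost_at)
      return true;                                     // (re)marked lost since we looked
  }
  return false;
}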
+void pg_lease_ack_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(readable_until_ub, p); + DECODE_FINISH(p); +} + +void pg_lease_ack_t::dump(Formatter *f) const +{ + f->dump_stream("readable_until_ub") << readable_until_ub; +} + +void pg_lease_ack_t::generate_test_instances(std::list& o) +{ + o.push_back(new pg_lease_ack_t()); + o.push_back(new pg_lease_ack_t()); + o.back()->readable_until_ub = make_timespan(3.4); +} + + +// -- ObjectModDesc -- +void ObjectModDesc::visit(Visitor *visitor) const +{ + auto bp = bl.cbegin(); + try { + while (!bp.end()) { + DECODE_START(max_required_version, bp); + uint8_t code; + decode(code, bp); + switch (code) { + case APPEND: { + uint64_t size; + decode(size, bp); + visitor->append(size); + break; + } + case SETATTRS: { + map > attrs; + decode(attrs, bp); + visitor->setattrs(attrs); + break; + } + case DELETE: { + version_t old_version; + decode(old_version, bp); + visitor->rmobject(old_version); + break; + } + case CREATE: { + visitor->create(); + break; + } + case UPDATE_SNAPS: { + set snaps; + decode(snaps, bp); + visitor->update_snaps(snaps); + break; + } + case TRY_DELETE: { + version_t old_version; + decode(old_version, bp); + visitor->try_rmobject(old_version); + break; + } + case ROLLBACK_EXTENTS: { + vector > extents; + version_t gen; + decode(gen, bp); + decode(extents, bp); + visitor->rollback_extents(gen,extents); + break; + } + default: + ceph_abort_msg("Invalid rollback code"); + } + DECODE_FINISH(bp); + } + } catch (...) { + ceph_abort_msg("Invalid encoding"); + } +} + +struct DumpVisitor : public ObjectModDesc::Visitor { + Formatter *f; + explicit DumpVisitor(Formatter *f) : f(f) {} + void append(uint64_t old_size) override { + f->open_object_section("op"); + f->dump_string("code", "APPEND"); + f->dump_unsigned("old_size", old_size); + f->close_section(); + } + void setattrs(map > &attrs) override { + f->open_object_section("op"); + f->dump_string("code", "SETATTRS"); + f->open_array_section("attrs"); + for (auto i = attrs.begin(); i != attrs.end(); ++i) { + f->dump_string("attr_name", i->first); + } + f->close_section(); + f->close_section(); + } + void rmobject(version_t old_version) override { + f->open_object_section("op"); + f->dump_string("code", "RMOBJECT"); + f->dump_unsigned("old_version", old_version); + f->close_section(); + } + void try_rmobject(version_t old_version) override { + f->open_object_section("op"); + f->dump_string("code", "TRY_RMOBJECT"); + f->dump_unsigned("old_version", old_version); + f->close_section(); + } + void create() override { + f->open_object_section("op"); + f->dump_string("code", "CREATE"); + f->close_section(); + } + void update_snaps(const set &snaps) override { + f->open_object_section("op"); + f->dump_string("code", "UPDATE_SNAPS"); + f->dump_stream("snaps") << snaps; + f->close_section(); + } + void rollback_extents( + version_t gen, + const vector > &extents) override { + f->open_object_section("op"); + f->dump_string("code", "ROLLBACK_EXTENTS"); + f->dump_unsigned("gen", gen); + f->dump_stream("snaps") << extents; + f->close_section(); + } +}; + +void ObjectModDesc::dump(Formatter *f) const +{ + f->open_object_section("object_mod_desc"); + f->dump_bool("can_local_rollback", can_local_rollback); + f->dump_bool("rollback_info_completed", rollback_info_completed); + { + f->open_array_section("ops"); + DumpVisitor vis(f); + visit(&vis); + f->close_section(); + } + f->close_section(); +} + +void ObjectModDesc::generate_test_instances(list& o) +{ + map > attrs; + 
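// [editor's note] Standalone illustrative sketch, not part of this patch: ObjectModDesc
// keeps its rollback ops as an opaque encoded stream; visit() walks that stream and calls
// one Visitor hook per op code, which is how rollback handling and the DumpVisitor above
// share a single decoder. A reduced model with two op codes (names invented):
#include <cstdint>
#include <utility>
#include <vector>

enum class rb_op : uint8_t { APPEND = 1, RMOBJECT = 2 };

struct rb_visitor {
  virtual void append(uint64_t old_size) = 0;        // size before the append
  virtual void rmobject(uint64_t old_version) = 0;   // version of the deleted object
  virtual ~rb_visitor() = default;
};

// ops reduced to (code, argument) pairs instead of a real encoded buffer
inline void visit_ops(const std::vector<std::pair<rb_op, uint64_t>>& ops, rb_visitor& v) {
  for (const auto& [code, arg] : ops) {
    switch (code) {
    case rb_op::APPEND:   v.append(arg);   break;
    case rb_op::RMOBJECT: v.rmobject(arg); break;
    }
  }
}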
attrs[OI_ATTR]; + attrs[SS_ATTR]; + attrs["asdf"]; + o.push_back(new ObjectModDesc()); + o.back()->append(100); + o.back()->setattrs(attrs); + o.push_back(new ObjectModDesc()); + o.back()->rmobject(1001); + o.push_back(new ObjectModDesc()); + o.back()->create(); + o.back()->setattrs(attrs); + o.push_back(new ObjectModDesc()); + o.back()->create(); + o.back()->setattrs(attrs); + o.back()->mark_unrollbackable(); + o.back()->append(1000); +} + +void ObjectModDesc::encode(ceph::buffer::list &_bl) const +{ + ENCODE_START(max_required_version, max_required_version, _bl); + encode(can_local_rollback, _bl); + encode(rollback_info_completed, _bl); + encode(bl, _bl); + ENCODE_FINISH(_bl); +} +void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl) +{ + DECODE_START(2, _bl); + max_required_version = struct_v; + decode(can_local_rollback, _bl); + decode(rollback_info_completed, _bl); + decode(bl, _bl); + // ensure bl does not pin a larger ceph::buffer in memory + bl.rebuild(); + bl.reassign_to_mempool(mempool::mempool_osd_pglog); + DECODE_FINISH(_bl); +} + +std::atomic ObjectCleanRegions::max_num_intervals = {10}; + +void ObjectCleanRegions::set_max_num_intervals(uint32_t num) +{ + max_num_intervals = num; +} + +void ObjectCleanRegions::trim() +{ + while(clean_offsets.num_intervals() > max_num_intervals) { + typename interval_set::iterator shortest_interval = clean_offsets.begin(); + if (shortest_interval == clean_offsets.end()) + break; + for (typename interval_set::iterator it = clean_offsets.begin(); + it != clean_offsets.end(); + ++it) { + if (it.get_len() < shortest_interval.get_len()) + shortest_interval = it; + } + clean_offsets.erase(shortest_interval); + } +} + +void ObjectCleanRegions::merge(const ObjectCleanRegions &other) +{ + clean_offsets.intersection_of(other.clean_offsets); + clean_omap = clean_omap && other.clean_omap; + trim(); +} + +void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len) +{ + interval_set clean_region; + clean_region.insert(0, (uint64_t)-1); + clean_region.erase(offset, len); + clean_offsets.intersection_of(clean_region); + trim(); +} + +bool ObjectCleanRegions::is_clean_region(uint64_t offset, uint64_t len) const +{ + return clean_offsets.contains(offset, len); +} + +void ObjectCleanRegions::mark_omap_dirty() +{ + clean_omap = false; +} + +void ObjectCleanRegions::mark_object_new() +{ + new_object = true; +} + +void ObjectCleanRegions::mark_fully_dirty() +{ + mark_data_region_dirty(0, (uint64_t)-1); + mark_omap_dirty(); + mark_object_new(); +} + +interval_set ObjectCleanRegions::get_dirty_regions() const +{ + interval_set dirty_region; + dirty_region.insert(0, (uint64_t)-1); + dirty_region.subtract(clean_offsets); + return dirty_region; +} + +bool ObjectCleanRegions::omap_is_dirty() const +{ + return !clean_omap; +} + +bool ObjectCleanRegions::object_is_exist() const +{ + return !new_object; +} + +void ObjectCleanRegions::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + using ceph::encode; + encode(clean_offsets, bl); + encode(clean_omap, bl); + encode(new_object, bl); + ENCODE_FINISH(bl); +} + +void ObjectCleanRegions::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + using ceph::decode; + decode(clean_offsets, bl); + decode(clean_omap, bl); + decode(new_object, bl); + DECODE_FINISH(bl); +} + +void ObjectCleanRegions::dump(Formatter *f) const +{ + f->open_object_section("object_clean_regions"); + f->dump_stream("clean_offsets") << clean_offsets; + f->dump_bool("clean_omap", clean_omap); + 
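// [editor's note] Standalone illustrative sketch, not part of this patch:
// ObjectCleanRegions caps how many clean extents it tracks; trim() above repeatedly drops
// the *shortest* clean run once the cap is exceeded, treating that range as dirty -- a
// little extra recovery I/O in exchange for bounded metadata. A reduced model:
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

using extent = std::pair<uint64_t, uint64_t>;        // (offset, length)

inline void trim_clean(std::vector<extent>& clean, std::size_t max_intervals) {
  while (clean.size() > max_intervals) {
    auto shortest = std::min_element(
        clean.begin(), clean.end(),
        [](const extent& a, const extent& b) { return a.second < b.second; });
    clean.erase(shortest);                           // now counted as dirty
  }
}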
f->dump_bool("new_object", new_object); + f->close_section(); +} + +void ObjectCleanRegions::generate_test_instances(list& o) +{ + o.push_back(new ObjectCleanRegions()); + o.push_back(new ObjectCleanRegions()); + o.back()->mark_data_region_dirty(4096, 40960); + o.back()->mark_omap_dirty(); + o.back()->mark_object_new(); +} + +ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr) +{ + return out << "clean_offsets: " << ocr.clean_offsets + << ", clean_omap: " << ocr.clean_omap + << ", new_object: " << ocr.new_object; +} + +// -- pg_log_entry_t -- + +string pg_log_entry_t::get_key_name() const +{ + return version.get_key_name(); +} + +void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const +{ + using ceph::encode; + ceph::buffer::list ebl(sizeof(*this)*2); + this->encode(ebl); + __u32 crc = ebl.crc32c(0); + encode(ebl, bl); + encode(crc, bl); +} + +void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p) +{ + using ceph::decode; + ceph::buffer::list bl; + decode(bl, p); + __u32 crc; + decode(crc, p); + if (crc != bl.crc32c(0)) + throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t"); + auto q = bl.cbegin(); + this->decode(q); +} + +void pg_log_entry_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(14, 4, bl); + encode(op, bl); + encode(soid, bl); + encode(version, bl); + + /** + * Added with reverting_to: + * Previous code used prior_version to encode + * what we now call reverting_to. This will + * allow older code to decode reverting_to + * into prior_version as expected. + */ + if (op == LOST_REVERT) + encode(reverting_to, bl); + else + encode(prior_version, bl); + + encode(reqid, bl); + encode(mtime, bl); + if (op == LOST_REVERT) + encode(prior_version, bl); + encode(snaps, bl); + encode(user_version, bl); + encode(mod_desc, bl); + encode(extra_reqids, bl); + if (op == ERROR) + encode(return_code, bl); + if (!extra_reqids.empty()) + encode(extra_reqid_return_codes, bl); + encode(clean_regions, bl); + if (op != ERROR) + encode(return_code, bl); + encode(op_returns, bl); + ENCODE_FINISH(bl); +} + +void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl); + decode(op, bl); + if (struct_v < 2) { + sobject_t old_soid; + decode(old_soid, bl); + soid.oid = old_soid.oid; + soid.snap = old_soid.snap; + invalid_hash = true; + } else { + decode(soid, bl); + } + if (struct_v < 3) + invalid_hash = true; + decode(version, bl); + + if (struct_v >= 6 && op == LOST_REVERT) + decode(reverting_to, bl); + else + decode(prior_version, bl); + + decode(reqid, bl); + + decode(mtime, bl); + if (struct_v < 5) + invalid_pool = true; + + if (op == LOST_REVERT) { + if (struct_v >= 6) { + decode(prior_version, bl); + } else { + reverting_to = prior_version; + } + } + if (struct_v >= 7 || // for v >= 7, this is for all ops. + op == CLONE) { // for v < 7, it's only present for CLONE. 
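// [editor's note] Standalone illustrative sketch, not part of this patch:
// encode_with_checksum()/decode_with_checksum() above wrap the already-encoded entry in a
// (payload, crc32c) pair so on-disk log corruption surfaces as an error at read time
// instead of being decoded silently. A minimal model with a toy checksum standing in for
// bufferlist::crc32c:
#include <cstdint>
#include <stdexcept>
#include <string>
#include <utility>

inline uint32_t toy_checksum(const std::string& s) {  // stand-in for crc32c
  uint32_t h = 0;
  for (unsigned char c : s) h = h * 131u + c;
  return h;
}

inline std::pair<std::string, uint32_t> encode_with_crc(const std::string& payload) {
  return {payload, toy_checksum(payload)};
}

inline std::string decode_with_crc(const std::string& payload, uint32_t crc) {
  if (toy_checksum(payload) != crc)
    throw std::runtime_error("bad checksum on log entry");
  return payload;
}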
+ decode(snaps, bl); + // ensure snaps does not pin a larger ceph::buffer in memory + snaps.rebuild(); + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + if (struct_v >= 8) + decode(user_version, bl); + else + user_version = version.version; + + if (struct_v >= 9) + decode(mod_desc, bl); + else + mod_desc.mark_unrollbackable(); + if (struct_v >= 10) + decode(extra_reqids, bl); + if (struct_v >= 11 && op == ERROR) + decode(return_code, bl); + if (struct_v >= 12 && !extra_reqids.empty()) + decode(extra_reqid_return_codes, bl); + if (struct_v >= 13) + decode(clean_regions, bl); + else + clean_regions.mark_fully_dirty(); + if (struct_v >= 14) { + if (op != ERROR) { + decode(return_code, bl); + } + decode(op_returns, bl); + } + DECODE_FINISH(bl); +} + +void pg_log_entry_t::dump(Formatter *f) const +{ + f->dump_string("op", get_op_name()); + f->dump_stream("object") << soid; + f->dump_stream("version") << version; + f->dump_stream("prior_version") << prior_version; + f->dump_stream("reqid") << reqid; + f->open_array_section("extra_reqids"); + uint32_t idx = 0; + for (auto p = extra_reqids.begin(); + p != extra_reqids.end(); + ++idx, ++p) { + f->open_object_section("extra_reqid"); + f->dump_stream("reqid") << p->first; + f->dump_stream("user_version") << p->second; + auto it = extra_reqid_return_codes.find(idx); + if (it != extra_reqid_return_codes.end()) { + f->dump_int("return_code", it->second); + } + f->close_section(); + } + f->close_section(); + f->dump_stream("mtime") << mtime; + f->dump_int("return_code", return_code); + if (!op_returns.empty()) { + f->open_array_section("op_returns"); + for (auto& i : op_returns) { + f->dump_object("op", i); + } + f->close_section(); + } + if (snaps.length() > 0) { + vector v; + ceph::buffer::list c = snaps; + auto p = c.cbegin(); + try { + using ceph::decode; + decode(v, p); + } catch (...) { + v.clear(); + } + f->open_object_section("snaps"); + for (auto p = v.begin(); p != v.end(); ++p) + f->dump_unsigned("snap", *p); + f->close_section(); + } + { + f->open_object_section("mod_desc"); + mod_desc.dump(f); + f->close_section(); + } + { + f->open_object_section("clean_regions"); + clean_regions.dump(f); + f->close_section(); + } +} + +void pg_log_entry_t::generate_test_instances(list& o) +{ + o.push_back(new pg_log_entry_t()); + hobject_t oid(object_t("objname"), "key", 123, 456, 0, ""); + o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4), + 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + utime_t(8,9), 0)); + o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4), + 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + utime_t(8,9), -ENOENT)); +} + +ostream& operator<<(ostream& out, const pg_log_entry_t& e) +{ + out << e.version << " (" << e.prior_version << ") " + << std::left << std::setw(8) << e.get_op_name() << ' ' + << e.soid << " by " << e.reqid << " " << e.mtime + << " " << e.return_code; + if (!e.op_returns.empty()) { + out << " " << e.op_returns; + } + if (e.snaps.length()) { + vector snaps; + ceph::buffer::list c = e.snaps; + auto p = c.cbegin(); + try { + decode(snaps, p); + } catch (...) 
{ + snaps.clear(); + } + out << " snaps " << snaps; + } + out << " ObjectCleanRegions " << e.clean_regions; + return out; +} + +// -- pg_log_dup_t -- + +std::string pg_log_dup_t::get_key_name() const +{ + static const char prefix[] = "dup_"; + std::string key(36, ' '); + memcpy(&key[0], prefix, 4); + version.get_key_name(&key[4]); + key.resize(35); // remove the null terminator + return key; +} + +void pg_log_dup_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(2, 1, bl); + encode(reqid, bl); + encode(version, bl); + encode(user_version, bl); + encode(return_code, bl); + encode(op_returns, bl); + ENCODE_FINISH(bl); +} + +void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(2, bl); + decode(reqid, bl); + decode(version, bl); + decode(user_version, bl); + decode(return_code, bl); + if (struct_v >= 2) { + decode(op_returns, bl); + } + DECODE_FINISH(bl); +} + +void pg_log_dup_t::dump(Formatter *f) const +{ + f->dump_stream("reqid") << reqid; + f->dump_stream("version") << version; + f->dump_stream("user_version") << user_version; + f->dump_stream("return_code") << return_code; + if (!op_returns.empty()) { + f->open_array_section("op_returns"); + for (auto& i : op_returns) { + f->dump_object("op", i); + } + f->close_section(); + } +} + +void pg_log_dup_t::generate_test_instances(list& o) +{ + o.push_back(new pg_log_dup_t()); + o.push_back(new pg_log_dup_t(eversion_t(1,2), + 1, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + 0)); + o.push_back(new pg_log_dup_t(eversion_t(1,2), + 2, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + -ENOENT)); +} + + +std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) { + out << "log_dup(reqid=" << e.reqid << + " v=" << e.version << " uv=" << e.user_version << + " rc=" << e.return_code; + if (!e.op_returns.empty()) { + out << " " << e.op_returns; + } + return out << ")"; +} + + +// -- pg_log_t -- + +// out: pg_log_t that only has entries that apply to import_pgid using curmap +// reject: Entries rejected from "in" are in the reject.log. Other fields not set. 
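// [editor's note] Standalone illustrative sketch, not part of this patch: pg_log_dup_t
// above keeps only (reqid, version, user_version, return_code) for entries already trimmed
// from the full log, so a resent client request can still be recognized and answered with
// its original outcome rather than re-executed. The lookup shape below is invented for
// illustration:
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

struct dup_result { uint64_t user_version; int return_code; };

struct dup_index {
  std::unordered_map<std::string, dup_result> by_reqid;   // reqid -> original outcome
  std::optional<dup_result> lookup(const std::string& reqid) const {
    auto it = by_reqid.find(reqid);
    if (it == by_reqid.end())
      return std::nullopt;   // not a known replay (or older than the dup window)
    return it->second;
  }
};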
+void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap, + const string &hit_set_namespace, const pg_log_t &in, + pg_log_t &out, pg_log_t &reject) +{ + out = in; + out.log.clear(); + reject.log.clear(); + + for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) { + + // Reject pg log entries for temporary objects + if (i->soid.is_temp()) { + reject.log.push_back(*i); + continue; + } + + if (i->soid.nspace != hit_set_namespace) { + object_t oid = i->soid.oid; + object_locator_t loc(i->soid); + pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc); + pg_t pgid = curmap.raw_pg_to_pg(raw_pgid); + + if (import_pgid.pgid == pgid) { + out.log.push_back(*i); + } else { + reject.log.push_back(*i); + } + } else { + out.log.push_back(*i); + } + } +} + +void pg_log_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(7, 3, bl); + encode(head, bl); + encode(tail, bl); + encode(log, bl); + encode(can_rollback_to, bl); + encode(rollback_info_trimmed_to, bl); + encode(dups, bl); + ENCODE_FINISH(bl); +} + +void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool) +{ + DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl); + decode(head, bl); + decode(tail, bl); + if (struct_v < 2) { + bool backlog; + decode(backlog, bl); + } + decode(log, bl); + if (struct_v >= 5) + decode(can_rollback_to, bl); + + if (struct_v >= 6) + decode(rollback_info_trimmed_to, bl); + else + rollback_info_trimmed_to = tail; + + if (struct_v >= 7) + decode(dups, bl); + + DECODE_FINISH(bl); + + // handle hobject_t format change + if (struct_v < 4) { + for (auto i = log.begin(); i != log.end(); ++i) { + if (!i->soid.is_max() && i->soid.pool == -1) + i->soid.pool = pool; + } + } +} + +void pg_log_t::dump(Formatter *f) const +{ + f->dump_stream("head") << head; + f->dump_stream("tail") << tail; + f->open_array_section("log"); + for (auto p = log.cbegin(); p != log.cend(); ++p) { + f->open_object_section("entry"); + p->dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("dups"); + for (const auto& entry : dups) { + f->open_object_section("entry"); + entry.dump(f); + f->close_section(); + } + f->close_section(); +} + +void pg_log_t::generate_test_instances(list& o) +{ + o.push_back(new pg_log_t); + + // this is nonsensical: + o.push_back(new pg_log_t); + o.back()->head = eversion_t(1,2); + o.back()->tail = eversion_t(3,4); + list e; + pg_log_entry_t::generate_test_instances(e); + for (auto p = e.begin(); p != e.end(); ++p) + o.back()->log.push_back(**p); +} + +static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups) +{ + auto earliest_dup_version = + target.head.version < maxdups ? 
0u : target.head.version - maxdups + 1; + lgeneric_subdout(cct, osd, 20) << __func__ << " earliest_dup_version " + << earliest_dup_version << dendl; + + for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) { + if (d->version.version >= earliest_dup_version) { + lgeneric_subdout(cct, osd, 20) + << "copy_up_to/copy_after copy dup version " + << d->version << dendl; + target.dups.push_back(pg_log_dup_t(*d)); + } + } + + for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) { + ceph_assert(i->version > other.tail); + if (i->version > target.tail) + break; + if (i->version.version >= earliest_dup_version) { + lgeneric_subdout(cct, osd, 20) + << "copy_up_to/copy_after copy dup from log version " + << i->version << dendl; + target.dups.push_back(pg_log_dup_t(*i)); + } + } +} + + +void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v) +{ + can_rollback_to = other.can_rollback_to; + head = other.head; + tail = other.tail; + lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; + for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) { + ceph_assert(i->version > other.tail); + if (i->version <= v) { + // make tail accurate. + tail = i->version; + break; + } + lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl; + log.push_front(*i); + } + _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked); + lgeneric_subdout(cct, osd, 20) << __func__ << " END v " << v + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; +} + +void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max) +{ + can_rollback_to = other.can_rollback_to; + int n = 0; + head = other.head; + tail = other.tail; + lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; + for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) { + ceph_assert(i->version > other.tail); + if (n++ >= max) { + tail = i->version; + break; + } + lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl; + log.push_front(*i); + } + _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked); + lgeneric_subdout(cct, osd, 20) << __func__ << " END max " << max + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; +} + +ostream& pg_log_t::print(ostream& out) const +{ + out << *this << std::endl; + for (auto p = log.cbegin(); p != log.cend(); ++p) + out << *p << std::endl; + for (const auto& entry : dups) { + out << " dup entry: " << entry << std::endl; + } + return out; +} + +// -- pg_missing_t -- + +ostream& operator<<(ostream& out, const pg_missing_item& i) +{ + out << i.need; + if (i.have != eversion_t()) + out << "(" << i.have << ")"; + out << " flags = " << i.flag_str() + << " " << i.clean_regions; + return out; +} + +// -- object_copy_cursor_t -- + +void object_copy_cursor_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(attr_complete, bl); + encode(data_offset, bl); + encode(data_complete, bl); + encode(omap_offset, bl); + encode(omap_complete, bl); + ENCODE_FINISH(bl); +} + +void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(attr_complete, bl); + decode(data_offset, bl); + decode(data_complete, bl); + decode(omap_offset, bl); + 
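
The retention window computed at the top of _handle_dups() above keeps only dup entries whose version lies within the last osd_pg_log_dups_tracked versions below the log head (and keeps everything while the head is still below that count). A minimal standalone sketch of that rule, using plain integers in place of eversion_t; retained_dups() and its parameters are illustrative names, not Ceph's API:

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for pg log dup entries: just their version numbers.
// Mirrors the window computed in _handle_dups():
//   earliest = (head < tracked) ? 0 : head - tracked + 1
std::vector<uint64_t> retained_dups(const std::vector<uint64_t>& dups,
                                    uint64_t head_version,
                                    uint64_t tracked) {
  const uint64_t earliest =
      head_version < tracked ? 0u : head_version - tracked + 1;
  std::vector<uint64_t> kept;
  for (uint64_t v : dups) {
    if (v >= earliest)        // only versions inside the tracked window survive
      kept.push_back(v);
  }
  return kept;
}

int main() {
  // head at version 100, tracking the last 10 versions: 91..100 survive.
  for (uint64_t v : retained_dups({42, 90, 91, 95, 100}, 100, 10))
    std::cout << v << ' ';    // prints: 91 95 100
  std::cout << '\n';
}

The same window is applied twice in _handle_dups(): once to the other log's dups and once to log entries at or below the target tail that are promoted to dups.
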
decode(omap_complete, bl); + DECODE_FINISH(bl); +} + +void object_copy_cursor_t::dump(Formatter *f) const +{ + f->dump_unsigned("attr_complete", (int)attr_complete); + f->dump_unsigned("data_offset", data_offset); + f->dump_unsigned("data_complete", (int)data_complete); + f->dump_string("omap_offset", omap_offset); + f->dump_unsigned("omap_complete", (int)omap_complete); +} + +void object_copy_cursor_t::generate_test_instances(list& o) +{ + o.push_back(new object_copy_cursor_t); + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_offset = 123; + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_complete = true; + o.back()->omap_offset = "foo"; + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_complete = true; + o.back()->omap_complete = true; +} + +// -- object_copy_data_t -- + +void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + ENCODE_START(8, 5, bl); + encode(size, bl); + encode(mtime, bl); + encode(attrs, bl); + encode(data, bl); + encode(omap_data, bl); + encode(cursor, bl); + encode(omap_header, bl); + encode(snaps, bl); + encode(snap_seq, bl); + encode(flags, bl); + encode(data_digest, bl); + encode(omap_digest, bl); + encode(reqids, bl); + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(reqid_return_codes, bl); + ENCODE_FINISH(bl); +} + +void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(8, bl); + if (struct_v < 5) { + // old + decode(size, bl); + decode(mtime, bl); + { + string category; + decode(category, bl); // no longer used + } + decode(attrs, bl); + decode(data, bl); + { + map omap; + decode(omap, bl); + omap_data.clear(); + if (!omap.empty()) { + using ceph::encode; + encode(omap, omap_data); + } + } + decode(cursor, bl); + if (struct_v >= 2) + decode(omap_header, bl); + if (struct_v >= 3) { + decode(snaps, bl); + decode(snap_seq, bl); + } else { + snaps.clear(); + snap_seq = 0; + } + if (struct_v >= 4) { + decode(flags, bl); + decode(data_digest, bl); + decode(omap_digest, bl); + } + } else { + // current + decode(size, bl); + decode(mtime, bl); + decode(attrs, bl); + decode(data, bl); + decode(omap_data, bl); + decode(cursor, bl); + decode(omap_header, bl); + decode(snaps, bl); + decode(snap_seq, bl); + if (struct_v >= 4) { + decode(flags, bl); + decode(data_digest, bl); + decode(omap_digest, bl); + } + if (struct_v >= 6) { + decode(reqids, bl); + } + if (struct_v >= 7) { + decode(truncate_seq, bl); + decode(truncate_size, bl); + } + if (struct_v >= 8) { + decode(reqid_return_codes, bl); + } + } + DECODE_FINISH(bl); +} + +void object_copy_data_t::generate_test_instances(list& o) +{ + o.push_back(new object_copy_data_t()); + + list cursors; + object_copy_cursor_t::generate_test_instances(cursors); + auto ci = cursors.begin(); + o.back()->cursor = **(ci++); + + o.push_back(new object_copy_data_t()); + o.back()->cursor = **(ci++); + + o.push_back(new object_copy_data_t()); + o.back()->size = 1234; + o.back()->mtime.set_from_double(1234); + ceph::buffer::ptr bp("there", 5); + ceph::buffer::list bl; + bl.push_back(bp); + o.back()->attrs["hello"] = bl; + ceph::buffer::ptr bp2("not", 3); + ceph::buffer::list bl2; + bl2.push_back(bp2); + map omap; + omap["why"] = bl2; + using ceph::encode; + encode(omap, o.back()->omap_data); + ceph::buffer::ptr databp("iamsomedatatocontain", 20); + o.back()->data.push_back(databp); + o.back()->omap_header.append("this is an omap header"); 
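
object_copy_cursor_t above is the resume token for a chunked object copy: attributes are copied first, then object data tracked by a byte offset, then omap tracked by the last key copied, with a completion flag per phase. The sketch below drives such a cursor over toy in-memory data; CopyCursor, copy_some() and the one-key-per-call omap step are illustrative assumptions, not Ceph's implementation:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Simplified stand-in for object_copy_cursor_t: progress through three phases.
struct CopyCursor {
  bool attr_complete = false;
  uint64_t data_offset = 0;      // next byte of object data to copy
  bool data_complete = false;
  std::string omap_offset;       // last omap key copied ("" = start)
  bool omap_complete = false;
  bool done() const { return attr_complete && data_complete && omap_complete; }
};

// One "chunk" of the copy: advances the cursor by at most max_bytes of data
// or one omap key per call, modelling a resumable, cursor-driven copy.
void copy_some(const std::string& data,
               const std::map<std::string, std::string>& omap,
               CopyCursor& c, uint64_t max_bytes) {
  if (!c.attr_complete) {        // phase 1: attributes (one shot here)
    c.attr_complete = true;
    return;
  }
  if (!c.data_complete) {        // phase 2: object data, max_bytes at a time
    c.data_offset = std::min<uint64_t>(c.data_offset + max_bytes, data.size());
    if (c.data_offset == data.size()) c.data_complete = true;
    return;
  }
  if (!c.omap_complete) {        // phase 3: omap, one key per call
    auto it = omap.upper_bound(c.omap_offset);
    if (it == omap.end()) { c.omap_complete = true; return; }
    c.omap_offset = it->first;
  }
}

int main() {
  CopyCursor c;
  std::map<std::string, std::string> omap{{"a", "1"}, {"b", "2"}};
  int steps = 0;
  while (!c.done()) { copy_some("0123456789", omap, c, 4); ++steps; }
  std::cout << "copied in " << steps << " steps\n";  // 1 attr + 3 data + 3 omap
}

Because the cursor is plain data that encodes and decodes (as above), the copy can stop after any step and resume later from the same point.
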
+ o.back()->snaps.push_back(123); + o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t())); +} + +void object_copy_data_t::dump(Formatter *f) const +{ + f->open_object_section("cursor"); + cursor.dump(f); + f->close_section(); // cursor + f->dump_int("size", size); + f->dump_stream("mtime") << mtime; + /* we should really print out the attrs here, but ceph::buffer::list + const-correctness prevents that */ + f->dump_int("attrs_size", attrs.size()); + f->dump_int("flags", flags); + f->dump_unsigned("data_digest", data_digest); + f->dump_unsigned("omap_digest", omap_digest); + f->dump_int("omap_data_length", omap_data.length()); + f->dump_int("omap_header_length", omap_header.length()); + f->dump_int("data_length", data.length()); + f->open_array_section("snaps"); + for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) + f->dump_unsigned("snap", *p); + f->close_section(); + f->open_array_section("reqids"); + uint32_t idx = 0; + for (auto p = reqids.begin(); + p != reqids.end(); + ++idx, ++p) { + f->open_object_section("extra_reqid"); + f->dump_stream("reqid") << p->first; + f->dump_stream("user_version") << p->second; + auto it = reqid_return_codes.find(idx); + if (it != reqid_return_codes.end()) { + f->dump_int("return_code", it->second); + } + f->close_section(); + } + f->close_section(); +} + +// -- pg_create_t -- + +void pg_create_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + encode(created, bl); + encode(parent, bl); + encode(split_bits, bl); + ENCODE_FINISH(bl); +} + +void pg_create_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(created, bl); + decode(parent, bl); + decode(split_bits, bl); + DECODE_FINISH(bl); +} + +void pg_create_t::dump(Formatter *f) const +{ + f->dump_unsigned("created", created); + f->dump_stream("parent") << parent; + f->dump_int("split_bits", split_bits); +} + +void pg_create_t::generate_test_instances(list& o) +{ + o.push_back(new pg_create_t); + o.push_back(new pg_create_t(1, pg_t(3, 4), 2)); +} + + +// -- pg_hit_set_info_t -- + +void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(2, 1, bl); + encode(begin, bl); + encode(end, bl); + encode(version, bl); + encode(using_gmt, bl); + ENCODE_FINISH(bl); +} + +void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p) +{ + DECODE_START(2, p); + decode(begin, p); + decode(end, p); + decode(version, p); + if (struct_v >= 2) { + decode(using_gmt, p); + } else { + using_gmt = false; + } + DECODE_FINISH(p); +} + +void pg_hit_set_info_t::dump(Formatter *f) const +{ + f->dump_stream("begin") << begin; + f->dump_stream("end") << end; + f->dump_stream("version") << version; + f->dump_stream("using_gmt") << using_gmt; +} + +void pg_hit_set_info_t::generate_test_instances(list& ls) +{ + ls.push_back(new pg_hit_set_info_t); + ls.push_back(new pg_hit_set_info_t); + ls.back()->begin = utime_t(1, 2); + ls.back()->end = utime_t(3, 4); +} + + +// -- pg_hit_set_history_t -- + +void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(current_last_update, bl); + { + utime_t dummy_stamp; + encode(dummy_stamp, bl); + } + { + pg_hit_set_info_t dummy_info; + encode(dummy_info, bl); + } + encode(history, bl); + ENCODE_FINISH(bl); +} + +void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p) +{ + DECODE_START(1, p); + decode(current_last_update, p); + { + utime_t dummy_stamp; + decode(dummy_stamp, p); + } + { + pg_hit_set_info_t dummy_info; + decode(dummy_info, p); + } + 
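
The dummy_stamp/dummy_info pairs in pg_hit_set_history_t::encode() and decode() above preserve slots for fields that are no longer used, so peers decoding against the old layout still find the bytes they expect. A standalone sketch of that placeholder pattern with a toy little-endian encoder; the History type and its fields are hypothetical:

#include <cstdint>
#include <iostream>
#include <vector>

// Trivial little-endian u32 append/read -- stand-ins for Ceph's encode/decode.
void put_u32(std::vector<uint8_t>& out, uint32_t v) {
  for (int i = 0; i < 4; ++i) out.push_back((v >> (8 * i)) & 0xff);
}
uint32_t get_u32(const std::vector<uint8_t>& in, size_t& pos) {
  uint32_t v = 0;
  for (int i = 0; i < 4; ++i) v |= uint32_t(in[pos++]) << (8 * i);
  return v;
}

// A struct that dropped a field ("stamp") but keeps its slot on the wire, so
// decoders written against the old layout still read something sensible.
struct History {
  uint32_t last_update = 0;
  uint32_t count = 0;

  void encode(std::vector<uint8_t>& out) const {
    put_u32(out, last_update);
    put_u32(out, 0);            // dummy placeholder where "stamp" used to live
    put_u32(out, count);
  }
  void decode(const std::vector<uint8_t>& in, size_t& pos) {
    last_update = get_u32(in, pos);
    (void)get_u32(in, pos);     // skip the placeholder
    count = get_u32(in, pos);
  }
};

int main() {
  History a, b;
  a.last_update = 7;
  a.count = 3;
  std::vector<uint8_t> buf;
  a.encode(buf);
  size_t pos = 0;
  b.decode(buf, pos);
  std::cout << b.last_update << ' ' << b.count << '\n';   // 7 3
}
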
decode(history, p); + DECODE_FINISH(p); +} + +void pg_hit_set_history_t::dump(Formatter *f) const +{ + f->dump_stream("current_last_update") << current_last_update; + f->open_array_section("history"); + for (auto p = history.cbegin(); p != history.cend(); ++p) { + f->open_object_section("info"); + p->dump(f); + f->close_section(); + } + f->close_section(); +} + +void pg_hit_set_history_t::generate_test_instances(list& ls) +{ + ls.push_back(new pg_hit_set_history_t); + ls.push_back(new pg_hit_set_history_t); + ls.back()->current_last_update = eversion_t(1, 2); + ls.back()->history.push_back(pg_hit_set_info_t()); +} + +// -- OSDSuperblock -- + +void OSDSuperblock::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(9, 5, bl); + encode(cluster_fsid, bl); + encode(whoami, bl); + encode(current_epoch, bl); + encode(oldest_map, bl); + encode(newest_map, bl); + encode(weight, bl); + compat_features.encode(bl); + encode(clean_thru, bl); + encode(mounted, bl); + encode(osd_fsid, bl); + encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full + encode((uint32_t)0, bl); // map pool_last_epoch_marked_full + encode(purged_snaps_last, bl); + encode(last_purged_snaps_scrub, bl); + ENCODE_FINISH(bl); +} + +void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl); + if (struct_v < 3) { + string magic; + decode(magic, bl); + } + decode(cluster_fsid, bl); + decode(whoami, bl); + decode(current_epoch, bl); + decode(oldest_map, bl); + decode(newest_map, bl); + decode(weight, bl); + if (struct_v >= 2) { + compat_features.decode(bl); + } else { //upgrade it! + compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + } + decode(clean_thru, bl); + decode(mounted, bl); + if (struct_v >= 4) + decode(osd_fsid, bl); + if (struct_v >= 6) { + epoch_t last_map_marked_full; + decode(last_map_marked_full, bl); + } + if (struct_v >= 7) { + map pool_last_map_marked_full; + decode(pool_last_map_marked_full, bl); + } + if (struct_v >= 9) { + decode(purged_snaps_last, bl); + decode(last_purged_snaps_scrub, bl); + } else { + purged_snaps_last = 0; + } + DECODE_FINISH(bl); +} + +void OSDSuperblock::dump(Formatter *f) const +{ + f->dump_stream("cluster_fsid") << cluster_fsid; + f->dump_stream("osd_fsid") << osd_fsid; + f->dump_int("whoami", whoami); + f->dump_int("current_epoch", current_epoch); + f->dump_int("oldest_map", oldest_map); + f->dump_int("newest_map", newest_map); + f->dump_float("weight", weight); + f->open_object_section("compat"); + compat_features.dump(f); + f->close_section(); + f->dump_int("clean_thru", clean_thru); + f->dump_int("last_epoch_mounted", mounted); + f->dump_unsigned("purged_snaps_last", purged_snaps_last); + f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub; +} + +void OSDSuperblock::generate_test_instances(list& o) +{ + OSDSuperblock z; + o.push_back(new OSDSuperblock(z)); + z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101"); + z.osd_fsid.parse("02020202-0202-0202-0202-020202020202"); + z.whoami = 3; + z.current_epoch = 4; + z.oldest_map = 5; + z.newest_map = 9; + z.mounted = 8; + z.clean_thru = 7; + o.push_back(new OSDSuperblock(z)); + o.push_back(new OSDSuperblock(z)); +} + +// -- SnapSet -- + +void SnapSet::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(3, 2, bl); + encode(seq, bl); + encode(true, bl); // head_exists + encode(snaps, bl); + encode(clones, bl); + encode(clone_overlap, bl); + encode(clone_size, bl); + encode(clone_snaps, bl); + ENCODE_FINISH(bl); +} + +void 
SnapSet::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(seq, bl); + bl += 1u; // skip legacy head_exists (always true) + decode(snaps, bl); + decode(clones, bl); + decode(clone_overlap, bl); + decode(clone_size, bl); + if (struct_v >= 3) { + decode(clone_snaps, bl); + } else { + clone_snaps.clear(); + } + DECODE_FINISH(bl); +} + +void SnapSet::dump(Formatter *f) const +{ + f->dump_unsigned("seq", seq); + f->open_array_section("clones"); + for (auto p = clones.cbegin(); p != clones.cend(); ++p) { + f->open_object_section("clone"); + f->dump_unsigned("snap", *p); + auto cs = clone_size.find(*p); + if (cs != clone_size.end()) + f->dump_unsigned("size", cs->second); + else + f->dump_string("size", "????"); + auto co = clone_overlap.find(*p); + if (co != clone_overlap.end()) + f->dump_stream("overlap") << co->second; + else + f->dump_stream("overlap") << "????"; + auto q = clone_snaps.find(*p); + if (q != clone_snaps.end()) { + f->open_array_section("snaps"); + for (auto s : q->second) { + f->dump_unsigned("snap", s); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +void SnapSet::generate_test_instances(list& o) +{ + o.push_back(new SnapSet); + o.push_back(new SnapSet); + o.back()->seq = 123; + o.back()->snaps.push_back(123); + o.back()->snaps.push_back(12); + o.push_back(new SnapSet); + o.back()->seq = 123; + o.back()->snaps.push_back(123); + o.back()->snaps.push_back(12); + o.back()->clones.push_back(12); + o.back()->clone_size[12] = 12345; + o.back()->clone_overlap[12]; + o.back()->clone_snaps[12] = {12, 10, 8}; +} + +ostream& operator<<(ostream& out, const SnapSet& cs) +{ + return out << cs.seq << "=" << cs.snaps << ":" + << cs.clone_snaps; +} + +void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy) +{ + // NOTE: our reconstruction of snaps (and the snapc) is not strictly + // correct: it will not include snaps that still logically exist + // but for which there was no clone that is defined. For all + // practical purposes this doesn't matter, since we only use that + // information to clone on the OSD, and we have already moved + // forward past that part of the object history. + + seq = ss.seq; + set _snaps; + set _clones; + for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) { + if (p->cloneid != librados::SNAP_HEAD) { + _clones.insert(p->cloneid); + _snaps.insert(p->snaps.begin(), p->snaps.end()); + clone_size[p->cloneid] = p->size; + clone_overlap[p->cloneid]; // the entry must exist, even if it's empty. 
+ for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q) + clone_overlap[p->cloneid].insert(q->first, q->second); + if (!legacy) { + // p->snaps is ascending; clone_snaps is descending + vector& v = clone_snaps[p->cloneid]; + for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) { + v.push_back(*q); + } + } + } + } + + // ascending + clones.clear(); + clones.reserve(_clones.size()); + for (auto p = _clones.begin(); p != _clones.end(); ++p) + clones.push_back(*p); + + // descending + snaps.clear(); + snaps.reserve(_snaps.size()); + for (auto p = _snaps.rbegin(); + p != _snaps.rend(); ++p) + snaps.push_back(*p); +} + +uint64_t SnapSet::get_clone_bytes(snapid_t clone) const +{ + ceph_assert(clone_size.count(clone)); + uint64_t size = clone_size.find(clone)->second; + ceph_assert(clone_overlap.count(clone)); + const interval_set &overlap = clone_overlap.find(clone)->second; + ceph_assert(size >= (uint64_t)overlap.size()); + return size - overlap.size(); +} + +void SnapSet::filter(const pg_pool_t &pinfo) +{ + vector oldsnaps; + oldsnaps.swap(snaps); + for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) { + if (!pinfo.is_removed_snap(*i)) + snaps.push_back(*i); + } +} + +SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const +{ + SnapSet ss = *this; + ss.filter(pinfo); + return ss; +} + +// -- watch_info_t -- + +void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + ENCODE_START(4, 3, bl); + encode(cookie, bl); + encode(timeout_seconds, bl); + encode(addr, bl, features); + ENCODE_FINISH(bl); +} + +void watch_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); + decode(cookie, bl); + if (struct_v < 2) { + uint64_t ver; + decode(ver, bl); + } + decode(timeout_seconds, bl); + if (struct_v >= 4) { + decode(addr, bl); + } + DECODE_FINISH(bl); +} + +void watch_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("cookie", cookie); + f->dump_unsigned("timeout_seconds", timeout_seconds); + f->open_object_section("addr"); + addr.dump(f); + f->close_section(); +} + +void watch_info_t::generate_test_instances(list& o) +{ + o.push_back(new watch_info_t); + o.push_back(new watch_info_t); + o.back()->cookie = 123; + o.back()->timeout_seconds = 99; + entity_addr_t ea; + ea.set_type(entity_addr_t::TYPE_LEGACY); + ea.set_nonce(1); + ea.set_family(AF_INET); + ea.set_in4_quad(0, 127); + ea.set_in4_quad(1, 0); + ea.set_in4_quad(2, 1); + ea.set_in4_quad(3, 2); + ea.set_port(2); + o.back()->addr = ea; +} + +// -- chunk_info_t -- + +void chunk_info_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(offset, bl); + encode(length, bl); + encode(oid, bl); + __u32 _flags = flags; + encode(_flags, bl); + ENCODE_FINISH(bl); +} + +void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(offset, bl); + decode(length, bl); + decode(oid, bl); + __u32 _flags; + decode(_flags, bl); + flags = (cflag_t)_flags; + DECODE_FINISH(bl); +} + +void chunk_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("length", length); + f->open_object_section("oid"); + oid.dump(f); + f->close_section(); + f->dump_unsigned("flags", flags); +} + + +bool chunk_info_t::operator==(const chunk_info_t& cit) const +{ + if (has_fingerprint()) { + if (oid.oid.name == cit.oid.oid.name) { + return true; + } + } else { + if (offset == cit.offset && length == cit.length && + oid.oid.name == cit.oid.oid.name) { + return true; + } + + } + return false; +} + +bool operator==(const 
std::pair & l, + const std::pair & r) +{ + return l.first == r.first && + l.second == r.second; +} + +ostream& operator<<(ostream& out, const chunk_info_t& ci) +{ + return out << "(len: " << ci.length << " oid: " << ci.oid + << " offset: " << ci.offset + << " flags: " << ci.get_flag_string(ci.flags) << ")"; +} + +// -- object_manifest_t -- + +std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci) +{ + return out << ci.ref_delta << std::endl; +} + +void object_manifest_t::calc_refs_to_inc_on_set( + const object_manifest_t* _g, + const object_manifest_t* _l, + object_ref_delta_t &refs) const +{ + /* avoid to increment the same reference on adjacent clones */ + auto iter = chunk_map.begin(); + auto find_chunk = [](decltype(iter) &i, const object_manifest_t* cur) + -> bool { + if (cur) { + auto c = cur->chunk_map.find(i->first); + if (c != cur->chunk_map.end() && c->second == i->second) { + return true; + + } + } + return false; + }; + + /* If at least a same chunk exists on either _g or _l, do not increment + * the reference + * + * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) aaa, <- set_chunk + * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc + * --> incremnt the reference + * + * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) ccc, <- set_chunk + * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc + * --> do not need to increment + * + * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) ccc, <- set_chunk + * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * --> decrement the reference of ccc + * + */ + for (; iter != chunk_map.end(); ++iter) { + auto found_g = find_chunk(iter, _g); + auto found_l = find_chunk(iter, _l); + if (!found_g && !found_l) { + refs.inc_ref(iter->second.oid); + } else if (found_g && found_l) { + refs.dec_ref(iter->second.oid); + } + } +} + +void object_manifest_t::calc_refs_to_drop_on_modify( + const object_manifest_t* _l, + const ObjectCleanRegions& clean_regions, + object_ref_delta_t &refs) const +{ + for (auto &p : chunk_map) { + if (!clean_regions.is_clean_region(p.first, p.second.length)) { + // has previous snapshot + if (_l) { + /* + * Let's assume that there is a manifest snapshotted object which has three chunks + * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc + * + * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because + * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks + * (head: [6, 2) and 20: [6, 2)) are different. + * + */ + auto c = _l->chunk_map.find(p.first); + if (c != _l->chunk_map.end()) { + if (p.second == c->second) { + continue; + } + } + refs.dec_ref(p.second.oid); + } else { + // decrement the reference of the updated chunks if the manifest object has no snapshot + refs.dec_ref(p.second.oid); + } + } + } +} + +void object_manifest_t::calc_refs_to_drop_on_removal( + const object_manifest_t* _g, + const object_manifest_t* _l, + object_ref_delta_t &refs) const +{ + /* At a high level, the rule is that consecutive clones with the same reference + * at the same offset share a reference. As such, removing *this may result + * in removing references in two cases: + * 1) *this has a reference which it shares with neither _g nor _l + * 2) _g and _l have a reference which they share with each other but not + * *this. + * + * For a particular offset, both 1 and 2 can happen. + * + * Notably, this means that to evaluate the reference change from removing + * the object with *this, we only need to look at the two adjacent clones. 
+ */ + + // Paper over possibly missing _g or _l -- nullopt is semantically the same + // as an empty chunk_map + static const object_manifest_t empty; + const object_manifest_t &g = _g ? *_g : empty; + const object_manifest_t &l = _l ? *_l : empty; + + auto giter = g.chunk_map.begin(); + auto iter = chunk_map.begin(); + auto liter = l.chunk_map.begin(); + + // Translate iter, map pair to the current offset, end() -> max + auto get_offset = [](decltype(iter) &i, const object_manifest_t &manifest) + -> uint64_t { + return i == manifest.chunk_map.end() ? + std::numeric_limits::max() : i->first; + }; + + /* If current matches the offset at iter, returns the chunk at *iter + * and increments iter. Otherwise, returns nullptr. + * + * current will always be derived from the min of *giter, *iter, and + * *liter on each cycle, so the result will be that each loop iteration + * will pick up all chunks at the offest being considered, each offset + * will be considered once, and all offsets will be considered. + */ + auto get_chunk = []( + uint64_t current, decltype(iter) &i, const object_manifest_t &manifest) + -> const chunk_info_t * { + if (i == manifest.chunk_map.end() || current != i->first) { + return nullptr; + } else { + return &(i++)->second; + } + }; + + while (giter != g.chunk_map.end() || + iter != chunk_map.end() || + liter != l.chunk_map.end()) { + auto current = std::min( + std::min(get_offset(giter, g), get_offset(iter, *this)), + get_offset(liter, l)); + + auto gchunk = get_chunk(current, giter, g); + auto chunk = get_chunk(current, iter, *this); + auto lchunk = get_chunk(current, liter, l); + + if (gchunk && lchunk && *gchunk == *lchunk && + (!chunk || *gchunk != *chunk)) { + // case 1 from above: l and g match, chunk does not + refs.dec_ref(gchunk->oid); + } + + if (chunk && + (!gchunk || chunk->oid != gchunk->oid) && + (!lchunk || chunk->oid != lchunk->oid)) { + // case 2 from above: *this matches neither + refs.dec_ref(chunk->oid); + } + } +} + +void object_manifest_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(type, bl); + switch (type) { + case TYPE_NONE: break; + case TYPE_REDIRECT: + encode(redirect_target, bl); + break; + case TYPE_CHUNKED: + encode(chunk_map, bl); + break; + default: + ceph_abort(); + } + ENCODE_FINISH(bl); +} + +void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(type, bl); + switch (type) { + case TYPE_NONE: break; + case TYPE_REDIRECT: + decode(redirect_target, bl); + break; + case TYPE_CHUNKED: + decode(chunk_map, bl); + break; + default: + ceph_abort(); + } + DECODE_FINISH(bl); +} + +void object_manifest_t::dump(Formatter *f) const +{ + f->dump_unsigned("type", type); + if (type == TYPE_REDIRECT) { + f->open_object_section("redirect_target"); + redirect_target.dump(f); + f->close_section(); + } else if (type == TYPE_CHUNKED) { + f->open_array_section("chunk_map"); + for (auto& p : chunk_map) { + f->open_object_section("chunk"); + f->dump_unsigned("offset", p.first); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + } +} + +void object_manifest_t::generate_test_instances(list& o) +{ + o.push_back(new object_manifest_t()); + o.back()->type = TYPE_REDIRECT; +} + +ostream& operator<<(ostream& out, const object_manifest_t& om) +{ + out << "manifest(" << om.get_type_name(); + if (om.is_redirect()) { + out << " " << om.redirect_target; + } else if (om.is_chunked()) { + out << " " << om.chunk_map; + } + out << ")"; + return out; +} + +// -- object_info_t 
-- + +void object_info_t::copy_user_bits(const object_info_t& other) +{ + // these bits are copied from head->clone. + size = other.size; + mtime = other.mtime; + local_mtime = other.local_mtime; + last_reqid = other.last_reqid; + truncate_seq = other.truncate_seq; + truncate_size = other.truncate_size; + flags = other.flags; + user_version = other.user_version; + data_digest = other.data_digest; + omap_digest = other.omap_digest; +} + +void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + object_locator_t myoloc(soid); + map old_watchers; + for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) { + old_watchers.insert(make_pair(i->first.second, i->second)); + } + ENCODE_START(17, 8, bl); + encode(soid, bl); + encode(myoloc, bl); //Retained for compatibility + encode((__u32)0, bl); // was category, no longer used + encode(version, bl); + encode(prior_version, bl); + encode(last_reqid, bl); + encode(size, bl); + encode(mtime, bl); + if (soid.snap == CEPH_NOSNAP) + encode(osd_reqid_t(), bl); // used to be wrlock_by + else + encode((uint32_t)0, bl); // was legacy_snaps + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(is_lost(), bl); + encode(old_watchers, bl, features); + /* shenanigans to avoid breaking backwards compatibility in the disk format. + * When we can, switch this out for simply putting the version_t on disk. */ + eversion_t user_eversion(0, user_version); + encode(user_eversion, bl); + encode(test_flag(FLAG_USES_TMAP), bl); + encode(watchers, bl, features); + __u32 _flags = flags; + encode(_flags, bl); + encode(local_mtime, bl); + encode(data_digest, bl); + encode(omap_digest, bl); + encode(expected_object_size, bl); + encode(expected_write_size, bl); + encode(alloc_hint_flags, bl); + if (has_manifest()) { + encode(manifest, bl); + } + ENCODE_FINISH(bl); +} + +void object_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + object_locator_t myoloc; + DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl); + map old_watchers; + decode(soid, bl); + decode(myoloc, bl); + { + string category; + decode(category, bl); // no longer used + } + decode(version, bl); + decode(prior_version, bl); + decode(last_reqid, bl); + decode(size, bl); + decode(mtime, bl); + if (soid.snap == CEPH_NOSNAP) { + osd_reqid_t wrlock_by; + decode(wrlock_by, bl); + } else { + vector legacy_snaps; + decode(legacy_snaps, bl); + } + decode(truncate_seq, bl); + decode(truncate_size, bl); + + // if this is struct_v >= 13, we will overwrite this + // below since this field is just here for backwards + // compatibility + __u8 lo; + decode(lo, bl); + flags = (flag_t)lo; + + decode(old_watchers, bl); + eversion_t user_eversion; + decode(user_eversion, bl); + user_version = user_eversion.version; + + if (struct_v >= 9) { + bool uses_tmap = false; + decode(uses_tmap, bl); + if (uses_tmap) + set_flag(FLAG_USES_TMAP); + } else { + set_flag(FLAG_USES_TMAP); + } + if (struct_v < 10) + soid.pool = myoloc.pool; + if (struct_v >= 11) { + decode(watchers, bl); + } else { + for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) { + watchers.insert( + make_pair( + make_pair(i->second.cookie, i->first), i->second)); + } + } + if (struct_v >= 13) { + __u32 _flags; + decode(_flags, bl); + flags = (flag_t)_flags; + } + if (struct_v >= 14) { + decode(local_mtime, bl); + } else { + local_mtime = utime_t(); + } + if (struct_v >= 15) { + decode(data_digest, bl); + decode(omap_digest, bl); + } else { + data_digest = omap_digest = -1; + clear_flag(FLAG_DATA_DIGEST); + 
clear_flag(FLAG_OMAP_DIGEST); + } + if (struct_v >= 16) { + decode(expected_object_size, bl); + decode(expected_write_size, bl); + decode(alloc_hint_flags, bl); + } else { + expected_object_size = 0; + expected_write_size = 0; + alloc_hint_flags = 0; + } + if (struct_v >= 17) { + if (has_manifest()) { + decode(manifest, bl); + } + } + DECODE_FINISH(bl); +} + +void object_info_t::dump(Formatter *f) const +{ + f->open_object_section("oid"); + soid.dump(f); + f->close_section(); + f->dump_stream("version") << version; + f->dump_stream("prior_version") << prior_version; + f->dump_stream("last_reqid") << last_reqid; + f->dump_unsigned("user_version", user_version); + f->dump_unsigned("size", size); + f->dump_stream("mtime") << mtime; + f->dump_stream("local_mtime") << local_mtime; + f->dump_unsigned("lost", (int)is_lost()); + vector sv = get_flag_vector(flags); + f->open_array_section("flags"); + for (auto str: sv) + f->dump_string("flags", str); + f->close_section(); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_format("data_digest", "0x%08x", data_digest); + f->dump_format("omap_digest", "0x%08x", omap_digest); + f->dump_unsigned("expected_object_size", expected_object_size); + f->dump_unsigned("expected_write_size", expected_write_size); + f->dump_unsigned("alloc_hint_flags", alloc_hint_flags); + f->dump_object("manifest", manifest); + f->open_object_section("watchers"); + for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) { + CachedStackStringStream css; + *css << p->first.second; + f->open_object_section(css->strv()); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void object_info_t::generate_test_instances(list& o) +{ + o.push_back(new object_info_t()); + + // fixme +} + + +ostream& operator<<(ostream& out, const object_info_t& oi) +{ + out << oi.soid << "(" << oi.version + << " " << oi.last_reqid; + if (oi.flags) + out << " " << oi.get_flag_string(); + out << " s " << oi.size; + out << " uv " << oi.user_version; + if (oi.is_data_digest()) + out << " dd " << std::hex << oi.data_digest << std::dec; + if (oi.is_omap_digest()) + out << " od " << std::hex << oi.omap_digest << std::dec; + out << " alloc_hint [" << oi.expected_object_size + << " " << oi.expected_write_size + << " " << oi.alloc_hint_flags << "]"; + if (oi.has_manifest()) + out << " " << oi.manifest; + out << ")"; + return out; +} + +// -- ObjectRecovery -- +void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(data_complete, bl); + encode(data_recovered_to, bl); + encode(omap_recovered_to, bl); + encode(omap_complete, bl); + ENCODE_FINISH(bl); +} + +void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(first, bl); + decode(data_complete, bl); + decode(data_recovered_to, bl); + decode(omap_recovered_to, bl); + decode(omap_complete, bl); + DECODE_FINISH(bl); +} + +ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog) +{ + return prog.print(out); +} + +void ObjectRecoveryProgress::generate_test_instances( + list& o) +{ + o.push_back(new ObjectRecoveryProgress); + o.back()->first = false; + o.back()->data_complete = true; + o.back()->omap_complete = true; + o.back()->data_recovered_to = 100; + + o.push_back(new ObjectRecoveryProgress); + o.back()->first = true; + o.back()->data_complete = false; + o.back()->omap_complete = false; + o.back()->data_recovered_to = 0; +} + +ostream 
&ObjectRecoveryProgress::print(ostream &out) const +{ + return out << "ObjectRecoveryProgress(" + << ( first ? "" : "!" ) << "first, " + << "data_recovered_to:" << data_recovered_to + << ", data_complete:" << ( data_complete ? "true" : "false" ) + << ", omap_recovered_to:" << omap_recovered_to + << ", omap_complete:" << ( omap_complete ? "true" : "false" ) + << ", error:" << ( error ? "true" : "false" ) + << ")"; +} + +void ObjectRecoveryProgress::dump(Formatter *f) const +{ + f->dump_int("first?", first); + f->dump_int("data_complete?", data_complete); + f->dump_unsigned("data_recovered_to", data_recovered_to); + f->dump_int("omap_complete?", omap_complete); + f->dump_string("omap_recovered_to", omap_recovered_to); +} + +void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(3, 1, bl); + encode(soid, bl); + encode(version, bl); + encode(size, bl); + encode(oi, bl, features); + encode(ss, bl); + encode(copy_subset, bl); + encode(clone_subset, bl); + encode(object_exist, bl); + ENCODE_FINISH(bl); +} + +void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl, + int64_t pool) +{ + DECODE_START(3, bl); + decode(soid, bl); + decode(version, bl); + decode(size, bl); + decode(oi, bl); + decode(ss, bl); + decode(copy_subset, bl); + decode(clone_subset, bl); + if (struct_v > 2) + decode(object_exist, bl); + else + object_exist = false; + DECODE_FINISH(bl); + if (struct_v < 2) { + if (!soid.is_max() && soid.pool == -1) + soid.pool = pool; + map> tmp; + tmp.swap(clone_subset); + for (auto i = tmp.begin(); i != tmp.end(); ++i) { + hobject_t first(i->first); + if (!first.is_max() && first.pool == -1) + first.pool = pool; + clone_subset[first].swap(i->second); + } + } +} + +void ObjectRecoveryInfo::generate_test_instances( + list& o) +{ + o.push_back(new ObjectRecoveryInfo); + o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP)); + o.back()->version = eversion_t(0,0); + o.back()->size = 100; + o.back()->object_exist = false; +} + + +void ObjectRecoveryInfo::dump(Formatter *f) const +{ + f->dump_stream("object") << soid; + f->dump_stream("at_version") << version; + f->dump_stream("size") << size; + { + f->open_object_section("object_info"); + oi.dump(f); + f->close_section(); + } + { + f->open_object_section("snapset"); + ss.dump(f); + f->close_section(); + } + f->dump_stream("copy_subset") << copy_subset; + f->dump_stream("clone_subset") << clone_subset; + f->dump_stream("object_exist") << object_exist; +} + +ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf) +{ + return inf.print(out); +} + +ostream &ObjectRecoveryInfo::print(ostream &out) const +{ + return out << "ObjectRecoveryInfo(" + << soid << "@" << version + << ", size: " << size + << ", copy_subset: " << copy_subset + << ", clone_subset: " << clone_subset + << ", snapset: " << ss + << ", object_exist: " << object_exist + << ")"; +} + +// -- PushReplyOp -- +void PushReplyOp::generate_test_instances(list &o) +{ + o.push_back(new PushReplyOp); + o.push_back(new PushReplyOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.push_back(new PushReplyOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); +} + +void PushReplyOp::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + ENCODE_FINISH(bl); +} + +void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + DECODE_FINISH(bl); +} + +void PushReplyOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; +} + 
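
ObjectRecoveryInfo::decode() above shows the upgrade path for encodings that predate pool-aware object ids: any key still carrying pool == -1 adopts the PG's pool, and because that changes the map key, clone_subset is swapped into a temporary and rebuilt entry by entry. Below is a minimal standalone sketch of that swap-and-rekey idiom; ObjId and fix_pool_keys() are simplified illustrative names, not Ceph types:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <utility>

// Simplified object id: name plus pool, where pool == -1 means "not yet set".
struct ObjId {
  std::string name;
  int64_t pool = -1;
  bool operator<(const ObjId& o) const {
    return std::tie(name, pool) < std::tie(o.name, o.pool);
  }
};

// std::map keys cannot be mutated in place, so swap the contents out and
// re-insert each entry under its corrected key -- the same shape as the
// clone_subset fix-up in ObjectRecoveryInfo::decode().
template <typename V>
void fix_pool_keys(std::map<ObjId, V>& m, int64_t pool) {
  std::map<ObjId, V> tmp;
  tmp.swap(m);                      // m is now empty; tmp holds the old entries
  for (auto& [key, value] : tmp) {
    ObjId fixed = key;
    if (fixed.pool == -1)
      fixed.pool = pool;            // legacy entry: adopt the PG's pool
    m[fixed] = std::move(value);
  }
}

int main() {
  std::map<ObjId, int> clone_subset;
  clone_subset[ObjId{"obj_a", -1}] = 1;   // legacy key, pool unknown
  clone_subset[ObjId{"obj_b", 3}] = 2;    // already pool-qualified
  fix_pool_keys(clone_subset, 3);
  for (const auto& [k, v] : clone_subset)
    std::cout << k.name << " pool=" << k.pool << " -> " << v << '\n';
}
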
+ostream &PushReplyOp::print(ostream &out) const +{ + return out + << "PushReplyOp(" << soid + << ")"; +} + +ostream& operator<<(ostream& out, const PushReplyOp &op) +{ + return op.print(out); +} + +uint64_t PushReplyOp::cost(CephContext *cct) const +{ + + return cct->_conf->osd_push_per_object_cost + + cct->_conf->osd_recovery_max_chunk; +} + +// -- PullOp -- +void PullOp::generate_test_instances(list &o) +{ + o.push_back(new PullOp); + o.push_back(new PullOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.back()->recovery_info.version = eversion_t(3, 10); + o.push_back(new PullOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); + o.back()->recovery_info.version = eversion_t(0, 0); +} + +void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + encode(recovery_info, bl, features); + encode(recovery_progress, bl); + ENCODE_FINISH(bl); +} + +void PullOp::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + decode(recovery_info, bl); + decode(recovery_progress, bl); + DECODE_FINISH(bl); +} + +void PullOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + { + f->open_object_section("recovery_progress"); + recovery_progress.dump(f); + f->close_section(); + } +} + +ostream &PullOp::print(ostream &out) const +{ + return out + << "PullOp(" << soid + << ", recovery_info: " << recovery_info + << ", recovery_progress: " << recovery_progress + << ")"; +} + +ostream& operator<<(ostream& out, const PullOp &op) +{ + return op.print(out); +} + +uint64_t PullOp::cost(CephContext *cct) const +{ + return cct->_conf->osd_push_per_object_cost + + cct->_conf->osd_recovery_max_chunk; +} + +// -- PushOp -- +void PushOp::generate_test_instances(list &o) +{ + o.push_back(new PushOp); + o.push_back(new PushOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.back()->version = eversion_t(3, 10); + o.push_back(new PushOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); + o.back()->version = eversion_t(0, 0); +} + +void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + encode(version, bl); + encode(data, bl); + encode(data_included, bl); + encode(omap_header, bl); + encode(omap_entries, bl); + encode(attrset, bl); + encode(recovery_info, bl, features); + encode(after_progress, bl); + encode(before_progress, bl); + ENCODE_FINISH(bl); +} + +void PushOp::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + decode(version, bl); + decode(data, bl); + decode(data_included, bl); + decode(omap_header, bl); + decode(omap_entries, bl); + decode(attrset, bl); + decode(recovery_info, bl); + decode(after_progress, bl); + decode(before_progress, bl); + DECODE_FINISH(bl); +} + +void PushOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; + f->dump_stream("version") << version; + f->dump_int("data_len", data.length()); + f->dump_stream("data_included") << data_included; + f->dump_int("omap_header_len", omap_header.length()); + f->dump_int("omap_entries_len", omap_entries.size()); + f->dump_int("attrset_len", attrset.size()); + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + { + f->open_object_section("after_progress"); + after_progress.dump(f); + f->close_section(); + } + { + 
f->open_object_section("before_progress"); + before_progress.dump(f); + f->close_section(); + } +} + +ostream &PushOp::print(ostream &out) const +{ + return out + << "PushOp(" << soid + << ", version: " << version + << ", data_included: " << data_included + << ", data_size: " << data.length() + << ", omap_header_size: " << omap_header.length() + << ", omap_entries_size: " << omap_entries.size() + << ", attrset_size: " << attrset.size() + << ", recovery_info: " << recovery_info + << ", after_progress: " << after_progress + << ", before_progress: " << before_progress + << ")"; +} + +ostream& operator<<(ostream& out, const PushOp &op) +{ + return op.print(out); +} + +uint64_t PushOp::cost(CephContext *cct) const +{ + uint64_t cost = data_included.size(); + for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) { + cost += i->second.length(); + } + cost += cct->_conf->osd_push_per_object_cost; + return cost; +} + +// -- ScrubMap -- + +void ScrubMap::merge_incr(const ScrubMap &l) +{ + ceph_assert(valid_through == l.incr_since); + valid_through = l.valid_through; + + for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){ + if (p->second.negative) { + auto q = objects.find(p->first); + if (q != objects.end()) { + objects.erase(q); + } + } else { + objects[p->first] = p->second; + } + } +} + +void ScrubMap::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(3, 2, bl); + encode(objects, bl); + encode((__u32)0, bl); // used to be attrs; now deprecated + ceph::buffer::list old_logbl; // not used + encode(old_logbl, bl); + encode(valid_through, bl); + encode(incr_since, bl); + ENCODE_FINISH(bl); +} + +void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(objects, bl); + { + map attrs; // deprecated + decode(attrs, bl); + } + ceph::buffer::list old_logbl; // not used + decode(old_logbl, bl); + decode(valid_through, bl); + decode(incr_since, bl); + DECODE_FINISH(bl); + + // handle hobject_t upgrade + if (struct_v < 3) { + map tmp; + tmp.swap(objects); + for (auto i = tmp.begin(); i != tmp.end(); ++i) { + hobject_t first(i->first); + if (!first.is_max() && first.pool == -1) + first.pool = pool; + objects[first] = i->second; + } + } +} + +void ScrubMap::dump(Formatter *f) const +{ + f->dump_stream("valid_through") << valid_through; + f->dump_stream("incremental_since") << incr_since; + f->open_array_section("objects"); + for (auto p = objects.cbegin(); p != objects.cend(); ++p) { + f->open_object_section("object"); + f->dump_string("name", p->first.oid.name); + f->dump_unsigned("hash", p->first.get_hash()); + f->dump_string("key", p->first.get_key()); + f->dump_int("snapid", p->first.snap); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void ScrubMap::generate_test_instances(list& o) +{ + o.push_back(new ScrubMap); + o.push_back(new ScrubMap); + o.back()->valid_through = eversion_t(1, 2); + o.back()->incr_since = eversion_t(3, 4); + list obj; + object::generate_test_instances(obj); + o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back(); + obj.pop_back(); + o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back(); +} + +// -- ScrubMap::object -- + +void ScrubMap::object::encode(ceph::buffer::list& bl) const +{ + bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch; + ENCODE_START(10, 7, bl); + encode(size, bl); + encode(negative, bl); + encode(attrs, bl); + encode(digest, bl); + 
encode(digest_present, bl); + encode((uint32_t)0, bl); // obsolete nlinks + encode((uint32_t)0, bl); // snapcolls + encode(omap_digest, bl); + encode(omap_digest_present, bl); + encode(compat_read_error, bl); + encode(stat_error, bl); + encode(read_error, bl); + encode(ec_hash_mismatch, bl); + encode(ec_size_mismatch, bl); + encode(large_omap_object_found, bl); + encode(large_omap_object_key_count, bl); + encode(large_omap_object_value_size, bl); + encode(object_omap_bytes, bl); + encode(object_omap_keys, bl); + ENCODE_FINISH(bl); +} + +void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(10, bl); + decode(size, bl); + bool tmp, compat_read_error = false; + decode(tmp, bl); + negative = tmp; + decode(attrs, bl); + decode(digest, bl); + decode(tmp, bl); + digest_present = tmp; + { + uint32_t nlinks; + decode(nlinks, bl); + set snapcolls; + decode(snapcolls, bl); + } + decode(omap_digest, bl); + decode(tmp, bl); + omap_digest_present = tmp; + decode(compat_read_error, bl); + decode(tmp, bl); + stat_error = tmp; + if (struct_v >= 8) { + decode(tmp, bl); + read_error = tmp; + decode(tmp, bl); + ec_hash_mismatch = tmp; + decode(tmp, bl); + ec_size_mismatch = tmp; + } + // If older encoder found a read_error, set read_error + if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch) + read_error = true; + if (struct_v >= 9) { + decode(tmp, bl); + large_omap_object_found = tmp; + decode(large_omap_object_key_count, bl); + decode(large_omap_object_value_size, bl); + } + if (struct_v >= 10) { + decode(object_omap_bytes, bl); + decode(object_omap_keys, bl); + } + DECODE_FINISH(bl); +} + +void ScrubMap::object::dump(Formatter *f) const +{ + f->dump_int("size", size); + f->dump_int("negative", negative); + f->open_array_section("attrs"); + for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) { + f->open_object_section("attr"); + f->dump_string("name", p->first); + f->dump_int("length", p->second.length()); + f->close_section(); + } + f->close_section(); +} + +void ScrubMap::object::generate_test_instances(list& o) +{ + o.push_back(new object); + o.push_back(new object); + o.back()->negative = true; + o.push_back(new object); + o.back()->size = 123; + o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3); + o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6); +} + +// -- OSDOp -- + +ostream& operator<<(ostream& out, const OSDOp& op) +{ + out << ceph_osd_op_name(op.op.op); + if (ceph_osd_op_type_data(op.op.op)) { + // data extent + switch (op.op.op) { + case CEPH_OSD_OP_ASSERT_VER: + out << " v" << op.op.assert_ver.ver; + break; + case CEPH_OSD_OP_TRUNCATE: + out << " " << op.op.extent.offset; + break; + case CEPH_OSD_OP_MASKTRUNC: + case CEPH_OSD_OP_TRIMTRUNC: + out << " " << op.op.extent.truncate_seq << "@" + << (int64_t)op.op.extent.truncate_size; + break; + case CEPH_OSD_OP_ROLLBACK: + out << " " << snapid_t(op.op.snap.snapid); + break; + case CEPH_OSD_OP_WATCH: + out << " " << ceph_osd_watch_op_name(op.op.watch.op) + << " cookie " << op.op.watch.cookie; + if (op.op.watch.gen) + out << " gen " << op.op.watch.gen; + break; + case CEPH_OSD_OP_NOTIFY: + out << " cookie " << op.op.notify.cookie; + break; + case CEPH_OSD_OP_COPY_GET: + out << " max " << op.op.copy_get.max; + break; + case CEPH_OSD_OP_COPY_FROM: + out << " ver " << op.op.copy_from.src_version; + break; + case CEPH_OSD_OP_SETALLOCHINT: + out << " object_size " << op.op.alloc_hint.expected_object_size + << " write_size " << op.op.alloc_hint.expected_write_size; + break; + case 
CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_MAPEXT: + case CEPH_OSD_OP_CMPEXT: + out << " " << op.op.extent.offset << "~" << op.op.extent.length; + if (op.op.extent.truncate_seq) + out << " [" << op.op.extent.truncate_seq << "@" + << (int64_t)op.op.extent.truncate_size << "]"; + if (op.op.flags) + out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]"; + default: + // don't show any arg info + break; + } + } else if (ceph_osd_op_type_attr(op.op.op)) { + // xattr name + if (op.op.xattr.name_len && op.indata.length()) { + out << " "; + op.indata.write(0, op.op.xattr.name_len, out); + } + if (op.op.xattr.value_len) + out << " (" << op.op.xattr.value_len << ")"; + if (op.op.op == CEPH_OSD_OP_CMPXATTR) + out << " op " << (int)op.op.xattr.cmp_op + << " mode " << (int)op.op.xattr.cmp_mode; + } else if (ceph_osd_op_type_exec(op.op.op)) { + // class.method + if (op.op.cls.class_len && op.indata.length()) { + out << " "; + op.indata.write(0, op.op.cls.class_len, out); + out << "."; + op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out); + } + } else if (ceph_osd_op_type_pg(op.op.op)) { + switch (op.op.op) { + case CEPH_OSD_OP_PGLS: + case CEPH_OSD_OP_PGLS_FILTER: + case CEPH_OSD_OP_PGNLS: + case CEPH_OSD_OP_PGNLS_FILTER: + out << " start_epoch " << op.op.pgls.start_epoch; + break; + case CEPH_OSD_OP_PG_HITSET_LS: + break; + case CEPH_OSD_OP_PG_HITSET_GET: + out << " " << utime_t(op.op.hit_set_get.stamp); + break; + case CEPH_OSD_OP_SCRUBLS: + break; + } + } + if (op.indata.length()) { + out << " in=" << op.indata.length() << "b"; + } + if (op.outdata.length()) { + out << " out=" << op.outdata.length() << "b"; + } + return out; +} + + +void OSDOp::split_osd_op_vector_out_data(vector& ops, ceph::buffer::list& in) +{ + auto datap = in.begin(); + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].op.payload_len) { + datap.copy(ops[i].op.payload_len, ops[i].outdata); + } + } +} + +void OSDOp::merge_osd_op_vector_out_data(vector& ops, ceph::buffer::list& out) +{ + for (unsigned i = 0; i < ops.size(); i++) { + ops[i].op.payload_len = ops[i].outdata.length(); + if (ops[i].outdata.length()) { + out.append(ops[i].outdata); + } + } +} + +int prepare_info_keymap( + CephContext* cct, + map *km, + string *key_to_remove, + epoch_t epoch, + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + bool dirty_big_info, + bool dirty_epoch, + bool try_fast_info, + PerfCounters *logger, + DoutPrefixProvider *dpp) +{ + if (dirty_epoch) { + encode(epoch, (*km)[string(epoch_key)]); + } + + if (logger) + logger->inc(l_osd_pg_info); + + // try to do info efficiently? 
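
merge_osd_op_vector_out_data() and split_osd_op_vector_out_data() above are a pack/unpack pair: merging concatenates each op's outdata and records its length in payload_len, while splitting walks the combined buffer and hands payload_len bytes back to each op in order. A minimal round-trip sketch using std::vector<uint8_t> in place of bufferlist; the Op struct and helper names are illustrative only:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Simplified op: only the fields the pack/unpack round trip needs.
struct Op {
  uint32_t payload_len = 0;
  std::vector<uint8_t> outdata;
};

// merge: concatenate every op's outdata, remembering each length.
std::vector<uint8_t> merge_out_data(std::vector<Op>& ops) {
  std::vector<uint8_t> out;
  for (auto& op : ops) {
    op.payload_len = op.outdata.size();
    out.insert(out.end(), op.outdata.begin(), op.outdata.end());
  }
  return out;
}

// split: walk the combined buffer, giving each op payload_len bytes back.
void split_out_data(std::vector<Op>& ops, const std::vector<uint8_t>& in) {
  size_t pos = 0;
  for (auto& op : ops) {
    op.outdata.assign(in.begin() + pos, in.begin() + pos + op.payload_len);
    pos += op.payload_len;
  }
}

int main() {
  std::vector<Op> ops(2);
  ops[0].outdata = {'h', 'i'};
  ops[1].outdata = {'c', 'e', 'p', 'h'};
  auto wire = merge_out_data(ops);      // "hiceph", lengths 2 and 4

  std::vector<Op> decoded(2);
  decoded[0].payload_len = 2;           // lengths travel with the op vector
  decoded[1].payload_len = 4;
  split_out_data(decoded, wire);
  std::cout << std::string(decoded[1].outdata.begin(),
                           decoded[1].outdata.end()) << '\n';  // ceph
}
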
+ if (!dirty_big_info && try_fast_info && + info.last_update > last_written_info.last_update) { + pg_fast_info_t fast; + fast.populate_from(info); + bool did = fast.try_apply_to(&last_written_info); + ceph_assert(did); // we verified last_update increased above + if (info == last_written_info) { + encode(fast, (*km)[string(fastinfo_key)]); + if (logger) + logger->inc(l_osd_pg_fastinfo); + return 0; + } + if (dpp) { + ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n"; + { + JSONFormatter jf(true); + jf.dump_object("info", info); + jf.flush(*_dout); + } + { + *_dout << "\nlast_written_info:\n"; + JSONFormatter jf(true); + jf.dump_object("last_written_info", last_written_info); + jf.flush(*_dout); + } + *_dout << dendl; + } + } else if (info.last_update <= last_written_info.last_update) { + // clean up any potentially stale fastinfo key resulting from last_update + // not moving forwards (e.g., a backwards jump during peering) + *key_to_remove = fastinfo_key; + } + + last_written_info = info; + + // info. store purged_snaps separately. + interval_set purged_snaps; + purged_snaps.swap(info.purged_snaps); + encode(info, (*km)[string(info_key)]); + purged_snaps.swap(info.purged_snaps); + + if (dirty_big_info) { + // potentially big stuff + bufferlist& bigbl = (*km)[string(biginfo_key)]; + encode(past_intervals, bigbl); + encode(info.purged_snaps, bigbl); + //dout(20) << "write_info bigbl " << bigbl.length() << dendl; + if (logger) + logger->inc(l_osd_pg_biginfo); + } + + return 0; +} + +void create_pg_collection( + ceph::os::Transaction& t, spg_t pgid, int bits) +{ + coll_t coll(pgid); + t.create_collection(coll, bits); +} + +void init_pg_ondisk( + ceph::os::Transaction& t, + spg_t pgid, + const pg_pool_t *pool) +{ + coll_t coll(pgid); + if (pool) { + // Give a hint to the PG collection + bufferlist hint; + uint32_t pg_num = pool->get_pg_num(); + uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num; + encode(pg_num, hint); + encode(expected_num_objects_pg, hint); + uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS; + t.collection_hint(coll, hint_type, hint); + } + + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + t.touch(coll, pgmeta_oid); + map values; + __u8 struct_v = pg_latest_struct_v; + encode(struct_v, values[string(infover_key)]); + t.omap_setkeys(coll, pgmeta_oid, values); +} + +PGLSFilter::PGLSFilter() : cct(nullptr) +{ +} + +PGLSFilter::~PGLSFilter() +{ +} + +int PGLSPlainFilter::init(ceph::bufferlist::const_iterator ¶ms) +{ + try { + decode(xattr, params); + decode(val, params); + } catch (ceph::buffer::error &e) { + return -EINVAL; + } + return 0; +} + +bool PGLSPlainFilter::filter(const hobject_t& obj, + const ceph::bufferlist& xattr_data) const +{ + return xattr_data.contents_equal(val.c_str(), val.size()); +} diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h new file mode 100644 index 000000000..93645c5f2 --- /dev/null +++ b/src/osd/osd_types.h @@ -0,0 +1,6568 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OSD_TYPES_H +#define CEPH_OSD_TYPES_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "include/rados/rados_types.hpp" +#include "include/mempool.h" + +#include "msg/msg_types.h" +#include "include/compat.h" +#include "include/types.h" +#include "include/utime.h" +#include "include/CompatSet.h" +#include "common/ceph_context.h" +#include "common/histogram.h" +#include "include/interval_set.h" +#include "include/inline_memory.h" +#include "common/Formatter.h" +#include "common/bloom_filter.hpp" +#include "common/hobject.h" +#include "common/snap_types.h" +#include "HitSet.h" +#include "Watch.h" +#include "include/cmp.h" +#include "librados/ListObjectImpl.h" +#include "compressor/Compressor.h" +#include "osd_perf_counters.h" + +#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026" + +#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)") +#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object") +#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator") +#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean") +#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories") +#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool") +#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog") +#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper") +#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects") +#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints") +#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object") +#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set") +#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr") +#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set") +#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure") + + +/// pool priority range set by user +#define OSD_POOL_PRIORITY_MAX 10 +#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX + +/// min recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_MIN 0 + +/// base backfill priority for MBackfillReserve +#define OSD_BACKFILL_PRIORITY_BASE 100 + +/// base backfill priority for MBackfillReserve (degraded PG) +#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140 + +/// base recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_BASE 180 + +/// base backfill priority for MBackfillReserve (inactive PG) +#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220 + +/// base recovery priority for MRecoveryReserve (inactive PG) +#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220 + +/// max manually/automatically set recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_MAX 253 + +/// backfill priority for MBackfillReserve, when forced manually +#define OSD_BACKFILL_PRIORITY_FORCED 254 + +/// recovery priority for MRecoveryReserve, when forced manually +#define OSD_RECOVERY_PRIORITY_FORCED 255 + +/// priority for pg deletion when osd is not fullish +#define OSD_DELETE_PRIORITY_NORMAL 179 + +/// priority for pg deletion when osd is 
approaching full +#define OSD_DELETE_PRIORITY_FULLISH 219 + +/// priority when more full +#define OSD_DELETE_PRIORITY_FULL 255 + +static std::map<int, int> max_prio_map = { + {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1}, + {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1}, + {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1}, + {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}, + {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX} +}; + +typedef hobject_t collection_list_handle_t; + +/// convert a single CEPH_OSD_FLAG_* to a std::string +const char *ceph_osd_flag_name(unsigned flag); +/// convert a single CEPH_OSD_OP_FLAG_* to a std::string +const char *ceph_osd_op_flag_name(unsigned flag); + +/// convert CEPH_OSD_FLAG_* op flags to a std::string +std::string ceph_osd_flag_string(unsigned flags); +/// convert CEPH_OSD_OP_FLAG_* op flags to a std::string +std::string ceph_osd_op_flag_string(unsigned flags); +/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string +std::string ceph_osd_alloc_hint_flag_string(unsigned flags); + +typedef std::map<std::string, std::string> osd_alert_list_t; +/// map osd id -> alert_list_t +typedef std::map<int, osd_alert_list_t> osd_alerts_t; +void dump(ceph::Formatter* f, const osd_alerts_t& alerts); + + +typedef interval_set< + snapid_t, + mempool::osdmap::flat_map<snapid_t, snapid_t>> snap_interval_set_t; + + +/** + * osd request identifier + * + * caller name + incarnation# + tid to uniquely identify this request. + */ +struct osd_reqid_t { + entity_name_t name; // who + ceph_tid_t tid; + int32_t inc; // incarnation + + osd_reqid_t() + : tid(0), inc(0) + {} + osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t) + : name(a), tid(t), inc(i) + {} + + DENC(osd_reqid_t, v, p) { + DENC_START(2, 2, p); + denc(v.name, p); + denc(v.tid, p); + denc(v.inc, p); + DENC_FINISH(p); + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<osd_reqid_t*>& o); +}; +WRITE_CLASS_DENC(osd_reqid_t) + + + +struct pg_shard_t { + static const int32_t NO_OSD = 0x7fffffff; + int32_t osd; + shard_id_t shard; + pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {} + explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {} + pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {} + bool is_undefined() const { + return osd == -1; + } + std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const { + f->dump_unsigned("osd", osd); + if (shard != shard_id_t::NO_SHARD) { + f->dump_unsigned("shard", shard); + } + } +}; +WRITE_CLASS_ENCODER(pg_shard_t) +WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard) +WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard) +std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs); + +using HobjToShardSetMapping = std::map<hobject_t, std::set<pg_shard_t>>; + +class IsPGRecoverablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const std::set<pg_shard_t> &have) const = 0; + virtual ~IsPGRecoverablePredicate() {} +}; + +class IsPGReadablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const std::set<pg_shard_t> &have) const = 0; + virtual ~IsPGReadablePredicate() {} +}; + +inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) { + return out << r.name << "."
<< r.inc << ":" << r.tid; +} + +inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); +} +inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); +} +inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid < r.tid); +} +inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); +} +inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); } +inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); } + +namespace std { + template<> struct hash { + size_t operator()(const osd_reqid_t &r) const { + static hash H; + return H(r.name.num() ^ r.tid ^ r.inc); + } + }; +} // namespace std + + +// ----- + +// a locator constrains the placement of an object. mainly, which pool +// does it go in. +struct object_locator_t { + // You specify either the hash or the key -- not both + std::int64_t pool; ///< pool id + std::string key; ///< key string (if non-empty) + std::string nspace; ///< namespace + std::int64_t hash; ///< hash position (if >= 0) + + explicit object_locator_t() + : pool(-1), hash(-1) {} + explicit object_locator_t(int64_t po) + : pool(po), hash(-1) {} + explicit object_locator_t(int64_t po, int64_t ps) + : pool(po), hash(ps) {} + explicit object_locator_t(int64_t po, std::string_view ns) + : pool(po), nspace(ns), hash(-1) {} + explicit object_locator_t(int64_t po, std::string_view ns, int64_t ps) + : pool(po), nspace(ns), hash(ps) {} + explicit object_locator_t(int64_t po, std::string_view ns, std::string_view s) + : pool(po), key(s), nspace(ns), hash(-1) {} + explicit object_locator_t(const hobject_t& soid) + : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {} + + int64_t get_pool() const { + return pool; + } + + void clear() { + pool = -1; + key = ""; + nspace = ""; + hash = -1; + } + + bool empty() const { + return pool == -1; + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(object_locator_t) + +inline bool operator==(const object_locator_t& l, const object_locator_t& r) { + return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash; +} +inline bool operator!=(const object_locator_t& l, const object_locator_t& r) { + return !(l == r); +} + +inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc) +{ + out << "@" << loc.pool; + if (loc.nspace.length()) + out << ";" << loc.nspace; + if (loc.key.length()) + out << ":" << loc.key; + return out; +} + +struct request_redirect_t { +private: + object_locator_t redirect_locator; ///< this is authoritative + std::string redirect_object; ///< If non-empty, the request goes to this object name + + friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir); +public: + + request_redirect_t() {} + explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) : + redirect_locator(orig) { redirect_locator.pool = rpool; } + explicit request_redirect_t(const object_locator_t& rloc) : + redirect_locator(rloc) {} + explicit 
request_redirect_t(const object_locator_t& orig, + const std::string& robj) : + redirect_locator(orig), redirect_object(robj) {} + + bool empty() const { return redirect_locator.empty() && + redirect_object.empty(); } + + void combine_with_locator(object_locator_t& orig, std::string& obj) const { + orig = redirect_locator; + if (!redirect_object.empty()) + obj = redirect_object; + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(request_redirect_t) + +inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) { + out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}"; + return out; +} + +// Internal OSD op flags - set by the OSD based on the op types +enum { + CEPH_OSD_RMW_FLAG_READ = (1 << 1), + CEPH_OSD_RMW_FLAG_WRITE = (1 << 2), + CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3), + CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4), + CEPH_OSD_RMW_FLAG_PGOP = (1 << 5), + CEPH_OSD_RMW_FLAG_CACHE = (1 << 6), + CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7), + CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8), + CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9), + CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10), + CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11), +}; + + +// pg stuff + +#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))) + +// placement seed (a hash value) +typedef uint32_t ps_t; + +// old (v1) pg_t encoding (wrap old struct ceph_pg) +struct old_pg_t { + ceph_pg v; + void encode(ceph::buffer::list& bl) const { + ceph::encode_raw(v, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + ceph::decode_raw(v, bl); + } +}; +WRITE_CLASS_ENCODER(old_pg_t) + +// placement group id +struct pg_t { + uint64_t m_pool; + uint32_t m_seed; + + pg_t() : m_pool(0), m_seed(0) {} + pg_t(ps_t seed, uint64_t pool) : + m_pool(pool), m_seed(seed) {} + // cppcheck-suppress noExplicitConstructor + pg_t(const ceph_pg& cpg) : + m_pool(cpg.pool), m_seed(cpg.ps) {} + + // cppcheck-suppress noExplicitConstructor + pg_t(const old_pg_t& opg) { + *this = opg.v; + } + + old_pg_t get_old_pg() const { + old_pg_t o; + ceph_assert(m_pool < 0xffffffffull); + o.v.pool = m_pool; + o.v.ps = m_seed; + o.v.preferred = (__s16)-1; + return o; + } + + ps_t ps() const { + return m_seed; + } + int64_t pool() const { + return m_pool; + } + + static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0' + char *calc_name(char *buf, const char *suffix_backwords) const; + + void set_ps(ps_t p) { + m_seed = p; + } + void set_pool(uint64_t p) { + m_pool = p; + } + + pg_t get_parent() const; + pg_t get_ancestor(unsigned old_pg_num) const; + + int print(char *o, int maxlen) const; + bool parse(const char *s); + + bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set *pchildren) const; + + bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const; + bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const { + return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr); + } + + /** + * Returns b such that for all object o: + * ~((~0)< p.ps()) { + return 1; + } else { + return 0; + } + } + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(m_pool, bl); + encode(m_seed, bl); + encode((int32_t)-1, bl); 
// was preferred + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + __u8 v; + decode(v, bl); + decode(m_pool, bl); + decode(m_seed, bl); + bl += sizeof(int32_t); // was preferred + } + void decode_old(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + old_pg_t opg; + decode(opg, bl); + *this = opg; + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(pg_t) + +inline bool operator<(const pg_t& l, const pg_t& r) { + return l.compare(r) < 0; +} +inline bool operator<=(const pg_t& l, const pg_t& r) { + return l.compare(r) <= 0; +} +inline bool operator==(const pg_t& l, const pg_t& r) { + return l.compare(r) == 0; +} +inline bool operator!=(const pg_t& l, const pg_t& r) { + return l.compare(r) != 0; +} +inline bool operator>(const pg_t& l, const pg_t& r) { + return l.compare(r) > 0; +} +inline bool operator>=(const pg_t& l, const pg_t& r) { + return l.compare(r) >= 0; +} + +std::ostream& operator<<(std::ostream& out, const pg_t &pg); + +namespace std { + template<> struct hash< pg_t > + { + size_t operator()( const pg_t& x ) const + { + static hash H; + // xor (s32)-1 in there to preserve original m_preferred result (paranoia!) + return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1)); + } + }; +} // namespace std + +struct spg_t { + pg_t pgid; + shard_id_t shard; + spg_t() : shard(shard_id_t::NO_SHARD) {} + spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {} + explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {} + unsigned get_split_bits(unsigned pg_num) const { + return pgid.get_split_bits(pg_num); + } + spg_t get_parent() const { + return spg_t(pgid.get_parent(), shard); + } + ps_t ps() const { + return pgid.ps(); + } + uint64_t pool() const { + return pgid.pool(); + } + void reset_shard(shard_id_t s) { + shard = s; + } + + static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255"); + char *calc_name(char *buf, const char *suffix_backwords) const; + + bool parse(const char *s); + bool parse(const std::string& s) { + return parse(s.c_str()); + } + + spg_t get_ancestor(unsigned old_pg_num) const { + return spg_t(pgid.get_ancestor(old_pg_num), shard); + } + + bool is_split(unsigned old_pg_num, unsigned new_pg_num, + std::set *pchildren) const { + std::set _children; + std::set *children = pchildren ? 
&_children : NULL; + bool is_split = pgid.is_split(old_pg_num, new_pg_num, children); + if (pchildren && is_split) { + for (std::set::iterator i = _children.begin(); + i != _children.end(); + ++i) { + pchildren->insert(spg_t(*i, shard)); + } + } + return is_split; + } + bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const { + return pgid.is_merge_target(old_pg_num, new_pg_num); + } + bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, + spg_t *parent) const { + spg_t out = *this; + bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid); + if (r && parent) { + *parent = out; + } + return r; + } + + bool is_no_shard() const { + return shard == shard_id_t::NO_SHARD; + } + + ghobject_t make_pgmeta_oid() const { + return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard); + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(pgid, bl); + encode(shard, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(pgid, bl); + decode(shard, bl); + DECODE_FINISH(bl); + } + + ghobject_t make_temp_ghobject(const std::string& name) const { + return ghobject_t( + hobject_t(object_t(name), "", CEPH_NOSNAP, + pgid.ps(), + hobject_t::get_temp_pool(pgid.pool()), + ""), + ghobject_t::NO_GEN, + shard); + } + + unsigned hash_to_shard(unsigned num_shards) const { + return ps() % num_shards; + } +}; +WRITE_CLASS_ENCODER(spg_t) +WRITE_EQ_OPERATORS_2(spg_t, pgid, shard) +WRITE_CMP_OPERATORS_2(spg_t, pgid, shard) + +namespace std { + template<> struct hash< spg_t > + { + size_t operator()( const spg_t& x ) const + { + static hash H; + return H(hash()(x.pgid) ^ x.shard); + } + }; +} // namespace std + +std::ostream& operator<<(std::ostream& out, const spg_t &pg); + +// ---------------------- + +class coll_t { + enum type_t { + TYPE_META = 0, + TYPE_LEGACY_TEMP = 1, /* no longer used */ + TYPE_PG = 2, + TYPE_PG_TEMP = 3, + }; + type_t type; + spg_t pgid; + uint64_t removal_seq; // note: deprecated, not encoded + + char _str_buff[spg_t::calc_name_buf_size]; + char *_str; + + void calc_str(); + + coll_t(type_t t, spg_t p, uint64_t r) + : type(t), pgid(p), removal_seq(r) { + calc_str(); + } + +public: + coll_t() : type(TYPE_META), removal_seq(0) + { + calc_str(); + } + + coll_t(const coll_t& other) + : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) { + calc_str(); + } + + explicit coll_t(spg_t pgid) + : type(TYPE_PG), pgid(pgid), removal_seq(0) + { + calc_str(); + } + + coll_t& operator=(const coll_t& rhs) + { + this->type = rhs.type; + this->pgid = rhs.pgid; + this->removal_seq = rhs.removal_seq; + this->calc_str(); + return *this; + } + + // named constructors + static coll_t meta() { + return coll_t(); + } + static coll_t pg(spg_t p) { + return coll_t(p); + } + + const std::string to_str() const { + return std::string(_str); + } + const char *c_str() const { + return _str; + } + + bool parse(const std::string& s); + + int operator<(const coll_t &rhs) const { + return type < rhs.type || + (type == rhs.type && pgid < rhs.pgid); + } + + bool is_meta() const { + return type == TYPE_META; + } + bool is_pg_prefix(spg_t *pgid_) const { + if (type == TYPE_PG || type == TYPE_PG_TEMP) { + *pgid_ = pgid; + return true; + } + return false; + } + bool is_pg() const { + return type == TYPE_PG; + } + bool is_pg(spg_t *pgid_) const { + if (type == TYPE_PG) { + *pgid_ = pgid; + return true; + } + return false; + } + bool is_temp() const { + return type == TYPE_PG_TEMP; + } + bool 
is_temp(spg_t *pgid_) const { + if (type == TYPE_PG_TEMP) { + *pgid_ = pgid; + return true; + } + return false; + } + int64_t pool() const { + return pgid.pool(); + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + size_t encoded_size() const; + + inline bool operator==(const coll_t& rhs) const { + // only compare type if meta + if (type != rhs.type) + return false; + if (type == TYPE_META) + return true; + return type == rhs.type && pgid == rhs.pgid; + } + inline bool operator!=(const coll_t& rhs) const { + return !(*this == rhs); + } + + // get a TEMP collection that corresponds to the current collection, + // which we presume is a pg collection. + coll_t get_temp() const { + ceph_assert(type == TYPE_PG); + return coll_t(TYPE_PG_TEMP, pgid, 0); + } + + ghobject_t get_min_hobj() const { + ghobject_t o; + switch (type) { + case TYPE_PG: + o.hobj.pool = pgid.pool(); + o.set_shard(pgid.shard); + break; + case TYPE_META: + o.hobj.pool = -1; + break; + default: + break; + } + return o; + } + + unsigned hash_to_shard(unsigned num_shards) const { + if (type == TYPE_PG) + return pgid.hash_to_shard(num_shards); + return 0; // whatever. + } + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; + +WRITE_CLASS_ENCODER(coll_t) + +inline std::ostream& operator<<(std::ostream& out, const coll_t& c) { + out << c.to_str(); + return out; +} + +namespace std { + template<> struct hash { + size_t operator()(const coll_t &c) const { + size_t h = 0; + std::string str(c.to_str()); + std::string::const_iterator end(str.end()); + for (std::string::const_iterator s = str.begin(); s != end; ++s) { + h += *s; + h += (h << 10); + h ^= (h >> 6); + } + h += (h << 3); + h ^= (h >> 11); + h += (h << 15); + return h; + } + }; +} // namespace std + +inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol) +{ + out << pg_t(ol.ol_pgid); + int su = ol.ol_stripe_unit; + if (su) + out << ".su=" << su; + return out; +} + + + +// compound rados version type +/* WARNING: If add member in eversion_t, please make sure the encode/decode function + * work well. For little-endian machine, we should make sure there is no padding + * in 32-bit machine and 64-bit machine. 
+ */ +class eversion_t { +public: + version_t version; + epoch_t epoch; + __u32 __pad; + eversion_t() : version(0), epoch(0), __pad(0) {} + eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {} + + // cppcheck-suppress noExplicitConstructor + eversion_t(const ceph_eversion& ce) : + version(ce.version), + epoch(ce.epoch), + __pad(0) { } + + explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); } + + static const eversion_t& max() { + static const eversion_t max(-1,-1); + return max; + } + + operator ceph_eversion() { + ceph_eversion c; + c.epoch = epoch; + c.version = version; + return c; + } + + std::string get_key_name() const; + + // key must point to the beginning of a block of 32 chars + inline void get_key_name(char* key) const { + // Below is equivalent of sprintf("%010u.%020llu"); + key[31] = 0; + ritoa(version, key + 31); + key[10] = '.'; + ritoa(epoch, key + 10); + } + + void encode(ceph::buffer::list &bl) const { +#if defined(CEPH_LITTLE_ENDIAN) + bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t)); +#else + using ceph::encode; + encode(version, bl); + encode(epoch, bl); +#endif + } + void decode(ceph::buffer::list::const_iterator &bl) { +#if defined(CEPH_LITTLE_ENDIAN) + bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this); +#else + using ceph::decode; + decode(version, bl); + decode(epoch, bl); +#endif + } + void decode(ceph::buffer::list& bl) { + auto p = std::cbegin(bl); + decode(p); + } +}; +WRITE_CLASS_ENCODER(eversion_t) + +inline bool operator==(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) && (l.version == r.version); +} +inline bool operator!=(const eversion_t& l, const eversion_t& r) { + return (l.epoch != r.epoch) || (l.version != r.version); +} +inline bool operator<(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); +} +inline bool operator<=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); +} +inline bool operator>(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); +} +inline bool operator>=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? 
(l.version >= r.version):(l.epoch >= r.epoch); +} +inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) { + return out << e.epoch << "'" << e.version; +} + +/** + * objectstore_perf_stat_t + * + * current perf information about the osd + */ +struct objectstore_perf_stat_t { + // cur_op_latency is in ns since double add/sub are not associative + uint64_t os_commit_latency_ns; + uint64_t os_apply_latency_ns; + + objectstore_perf_stat_t() : + os_commit_latency_ns(0), os_apply_latency_ns(0) {} + + bool operator==(const objectstore_perf_stat_t &r) const { + return os_commit_latency_ns == r.os_commit_latency_ns && + os_apply_latency_ns == r.os_apply_latency_ns; + } + + void add(const objectstore_perf_stat_t &o) { + os_commit_latency_ns += o.os_commit_latency_ns; + os_apply_latency_ns += o.os_apply_latency_ns; + } + void sub(const objectstore_perf_stat_t &o) { + os_commit_latency_ns -= o.os_commit_latency_ns; + os_apply_latency_ns -= o.os_apply_latency_ns; + } + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t) + +/* + * pg states + */ +#define PG_STATE_CREATING (1ULL << 0) // creating +#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too) +#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas. +#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline +#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound +#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound +#define PG_STATE_PREMERGE (1ULL << 7) // i am prepare to merging +#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing +//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub +#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy +#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be) +#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering +#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub +#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects +#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill +#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed. +#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown. 
+#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH +#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files +#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content +#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full +#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations +#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size +#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active +#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover +#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps +#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps +#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full +#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps +#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other +#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other +#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors +#define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreadable due to slow/delayed pings +#define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire + +std::string pg_state_string(uint64_t state); +std::string pg_vector_string(const std::vector<int32_t> &a); +std::optional<uint64_t> pg_string_state(const std::string& state); + + +/* + * pool_snap_info_t + * + * attributes for a single pool snapshot. + */ +struct pool_snap_info_t { + snapid_t snapid; + utime_t stamp; + std::string name; + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list<pool_snap_info_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t) + +inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) { + return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')'; +} + + +/* + * pool_opts_t + * + * pool options. + */ + +// The order of items in the list is important; therefore, +// you should always add to the end of the list when adding new options.
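+// Illustrative usage sketch (the `opts` instance and values here are hypothetical, + // and it assumes PG_NUM_MIN is registered as an integer-typed option and + // TARGET_SIZE_RATIO as a double; the key->type registration is defined out of line): + // pool_opts_t opts; + // opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(16)); + // opts.set(pool_opts_t::TARGET_SIZE_RATIO, 0.25); + // int64_t pg_num_min = 0; + // if (opts.is_set(pool_opts_t::PG_NUM_MIN)) + //   opts.get(pool_opts_t::PG_NUM_MIN, &pg_num_min); + // double ratio = opts.value_or(pool_opts_t::TARGET_SIZE_RATIO, 0.0);  // default when unset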
+ +class pool_opts_t { +public: + enum key_t { + SCRUB_MIN_INTERVAL, + SCRUB_MAX_INTERVAL, + DEEP_SCRUB_INTERVAL, + RECOVERY_PRIORITY, + RECOVERY_OP_PRIORITY, + SCRUB_PRIORITY, + COMPRESSION_MODE, + COMPRESSION_ALGORITHM, + COMPRESSION_REQUIRED_RATIO, + COMPRESSION_MAX_BLOB_SIZE, + COMPRESSION_MIN_BLOB_SIZE, + CSUM_TYPE, + CSUM_MAX_BLOCK, + CSUM_MIN_BLOCK, + FINGERPRINT_ALGORITHM, + PG_NUM_MIN, // min pg_num + TARGET_SIZE_BYTES, // total bytes in pool + TARGET_SIZE_RATIO, // fraction of total cluster + PG_AUTOSCALE_BIAS, + READ_LEASE_INTERVAL, + DEDUP_TIER, + DEDUP_CHUNK_ALGORITHM, + DEDUP_CDC_CHUNK_SIZE, + PG_NUM_MAX, // max pg_num + }; + + enum type_t { + STR, + INT, + DOUBLE, + }; + + struct opt_desc_t { + key_t key; + type_t type; + + opt_desc_t(key_t k, type_t t) : key(k), type(t) {} + + bool operator==(const opt_desc_t& rhs) const { + return key == rhs.key && type == rhs.type; + } + }; + + typedef boost::variant value_t; + + static bool is_opt_name(const std::string& name); + static opt_desc_t get_opt_desc(const std::string& name); + + pool_opts_t() : opts() {} + + bool is_set(key_t key) const; + + template + void set(key_t key, const T &val) { + value_t value = val; + opts[key] = value; + } + + template + bool get(key_t key, T *val) const { + opts_t::const_iterator i = opts.find(key); + if (i == opts.end()) { + return false; + } + *val = boost::get(i->second); + return true; + } + + template + T value_or(key_t key, T&& default_value) const { + auto i = opts.find(key); + if (i == opts.end()) { + return std::forward(default_value); + } + return boost::get(i->second); + } + + const value_t& get(key_t key) const; + + bool unset(key_t key); + + void dump(const std::string& name, ceph::Formatter *f) const; + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + +private: + typedef std::map opts_t; + opts_t opts; + + friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_opts_t) + +struct pg_merge_meta_t { + pg_t source_pgid; + epoch_t ready_epoch = 0; + epoch_t last_epoch_started = 0; + epoch_t last_epoch_clean = 0; + eversion_t source_version; + eversion_t target_version; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(source_pgid, bl); + encode(ready_epoch, bl); + encode(last_epoch_started, bl); + encode(last_epoch_clean, bl); + encode(source_version, bl); + encode(target_version, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(source_pgid, p); + decode(ready_epoch, p); + decode(last_epoch_started, p); + decode(last_epoch_clean, p); + decode(source_version, p); + decode(target_version, p); + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("source_pgid") << source_pgid; + f->dump_unsigned("ready_epoch", ready_epoch); + f->dump_unsigned("last_epoch_started", last_epoch_started); + f->dump_unsigned("last_epoch_clean", last_epoch_clean); + f->dump_stream("source_version") << source_version; + f->dump_stream("target_version") << target_version; + } +}; +WRITE_CLASS_ENCODER(pg_merge_meta_t) + +class OSDMap; + +/* + * pg_pool + */ +struct pg_pool_t { + static const char *APPLICATION_NAME_CEPHFS; + static const char *APPLICATION_NAME_RBD; + static const char *APPLICATION_NAME_RGW; + + enum { + TYPE_REPLICATED = 1, // replication + //TYPE_RAID4 = 2, // raid4 (never implemented) + TYPE_ERASURE = 3, // 
erasure-coded + }; + static constexpr uint32_t pg_CRUSH_ITEM_NONE = 0x7fffffff; /* can't import crush.h here */ + static std::string_view get_type_name(int t) { + switch (t) { + case TYPE_REPLICATED: return "replicated"; + //case TYPE_RAID4: return "raid4"; + case TYPE_ERASURE: return "erasure"; + default: return "???"; + } + } + std::string_view get_type_name() const { + return get_type_name(type); + } + + enum { + FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding) + FLAG_FULL = 1<<1, // pool is full + FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled + FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay) + FLAG_NODELETE = 1<<4, // pool can't be deleted + FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed + FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed + FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED + FLAG_NOSCRUB = 1<<8, // block periodic scrub + FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub + FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too + FLAG_NEARFULL = 1<<11, // pool is nearfull + FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull + FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps + FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps + FLAG_CREATING = 1<<15, // initial pool PGs are being created + FLAG_BULK = 1<<17, //pool is large + }; + + static const char *get_flag_name(int f) { + switch (f) { + case FLAG_HASHPSPOOL: return "hashpspool"; + case FLAG_FULL: return "full"; + case FLAG_EC_OVERWRITES: return "ec_overwrites"; + case FLAG_INCOMPLETE_CLONES: return "incomplete_clones"; + case FLAG_NODELETE: return "nodelete"; + case FLAG_NOPGCHANGE: return "nopgchange"; + case FLAG_NOSIZECHANGE: return "nosizechange"; + case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed"; + case FLAG_NOSCRUB: return "noscrub"; + case FLAG_NODEEP_SCRUB: return "nodeep-scrub"; + case FLAG_FULL_QUOTA: return "full_quota"; + case FLAG_NEARFULL: return "nearfull"; + case FLAG_BACKFILLFULL: return "backfillfull"; + case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps"; + case FLAG_POOL_SNAPS: return "pool_snaps"; + case FLAG_CREATING: return "creating"; + case FLAG_BULK: return "bulk"; + default: return "???"; + } + } + static std::string get_flags_string(uint64_t f) { + std::string s; + for (unsigned n=0; f && n<64; ++n) { + if (f & (1ull << n)) { + if (s.length()) + s += ","; + s += get_flag_name(1ull << n); + } + } + return s; + } + std::string get_flags_string() const { + return get_flags_string(flags); + } + static uint64_t get_flag_by_name(const std::string& name) { + if (name == "hashpspool") + return FLAG_HASHPSPOOL; + if (name == "full") + return FLAG_FULL; + if (name == "ec_overwrites") + return FLAG_EC_OVERWRITES; + if (name == "incomplete_clones") + return FLAG_INCOMPLETE_CLONES; + if (name == "nodelete") + return FLAG_NODELETE; + if (name == "nopgchange") + return FLAG_NOPGCHANGE; + if (name == "nosizechange") + return FLAG_NOSIZECHANGE; + if (name == "write_fadvise_dontneed") + return FLAG_WRITE_FADVISE_DONTNEED; + if (name == "noscrub") + return FLAG_NOSCRUB; + if (name == "nodeep-scrub") + return FLAG_NODEEP_SCRUB; + if (name == "full_quota") + return FLAG_FULL_QUOTA; + if (name == "nearfull") + return FLAG_NEARFULL; + if (name == "backfillfull") + return FLAG_BACKFILLFULL; + if (name == "selfmanaged_snaps") + return FLAG_SELFMANAGED_SNAPS; 
+ if (name == "pool_snaps") + return FLAG_POOL_SNAPS; + if (name == "creating") + return FLAG_CREATING; + if (name == "bulk") + return FLAG_BULK; + return 0; + } + + /// converts the acting/up vector to a set of pg shards + void convert_to_pg_shards(const std::vector &from, std::set* to) const; + + typedef enum { + CACHEMODE_NONE = 0, ///< no caching + CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later + CACHEMODE_FORWARD = 2, ///< forward if not in cache + CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent] + CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later + CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later + CACHEMODE_PROXY = 6, ///< proxy if not in cache + } cache_mode_t; + static const char *get_cache_mode_name(cache_mode_t m) { + switch (m) { + case CACHEMODE_NONE: return "none"; + case CACHEMODE_WRITEBACK: return "writeback"; + case CACHEMODE_FORWARD: return "forward"; + case CACHEMODE_READONLY: return "readonly"; + case CACHEMODE_READFORWARD: return "readforward"; + case CACHEMODE_READPROXY: return "readproxy"; + case CACHEMODE_PROXY: return "proxy"; + default: return "unknown"; + } + } + static cache_mode_t get_cache_mode_from_str(const std::string& s) { + if (s == "none") + return CACHEMODE_NONE; + if (s == "writeback") + return CACHEMODE_WRITEBACK; + if (s == "forward") + return CACHEMODE_FORWARD; + if (s == "readonly") + return CACHEMODE_READONLY; + if (s == "readforward") + return CACHEMODE_READFORWARD; + if (s == "readproxy") + return CACHEMODE_READPROXY; + if (s == "proxy") + return CACHEMODE_PROXY; + return (cache_mode_t)-1; + } + const char *get_cache_mode_name() const { + return get_cache_mode_name(cache_mode); + } + bool cache_mode_requires_hit_set() const { + switch (cache_mode) { + case CACHEMODE_NONE: + case CACHEMODE_FORWARD: + case CACHEMODE_READONLY: + case CACHEMODE_PROXY: + return false; + case CACHEMODE_WRITEBACK: + case CACHEMODE_READFORWARD: + case CACHEMODE_READPROXY: + return true; + default: + ceph_abort_msg("implement me"); + } + } + + enum class pg_autoscale_mode_t : uint8_t { + OFF = 0, + WARN = 1, + ON = 2, + UNKNOWN = UINT8_MAX, + }; + static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) { + switch (m) { + case pg_autoscale_mode_t::OFF: return "off"; + case pg_autoscale_mode_t::ON: return "on"; + case pg_autoscale_mode_t::WARN: return "warn"; + default: return "???"; + } + } + static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) { + if (m == "off") { + return pg_autoscale_mode_t::OFF; + } + if (m == "warn") { + return pg_autoscale_mode_t::WARN; + } + if (m == "on") { + return pg_autoscale_mode_t::ON; + } + return pg_autoscale_mode_t::UNKNOWN; + } + + utime_t create_time; + uint64_t flags = 0; ///< FLAG_* + __u8 type = 0; ///< TYPE_* + __u8 size = 0, min_size = 0; ///< number of osds in each pg + __u8 crush_rule = 0; ///< crush placement rule + __u8 object_hash = 0; ///< hash mapping object name to ps + pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN; + +private: + __u32 pg_num = 0, pgp_num = 0; ///< number of pgs + __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to + __u32 pg_num_target = 0; ///< pg_num we should converge toward + __u32 pgp_num_target = 0; ///< pgp_num we should converge toward + +public: + std::map properties; ///< OBSOLETE + std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap + epoch_t last_change = 0; ///< most recent epoch changed, exclusing 
snapshot changes + // If non-zero, require OSDs in at least this many different instances... + uint32_t peering_crush_bucket_count = 0; + // of this bucket type... + uint32_t peering_crush_bucket_barrier = 0; + // including this one + int32_t peering_crush_mandatory_member = pg_CRUSH_ITEM_NONE; + // The per-bucket replica count is calculated with this "target" + // instead of the above crush_bucket_count. This means we can maintain a + // target size of 4 without attempting to place them all in 1 DC + uint32_t peering_crush_bucket_target = 0; + /// last epoch that forced clients to resend + epoch_t last_force_op_resend = 0; + /// last epoch that forced clients to resend (pre-nautilus clients only) + epoch_t last_force_op_resend_prenautilus = 0; + /// last epoch that forced clients to resend (pre-luminous clients only) + epoch_t last_force_op_resend_preluminous = 0; + + /// metadata for the most recent PG merge + pg_merge_meta_t last_pg_merge_meta; + + snapid_t snap_seq = 0; ///< seq for per-pool snapshot + epoch_t snap_epoch = 0; ///< osdmap epoch of last snap + uint64_t auid = 0; ///< who owns the pg + + uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool + uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool + + /* + * Pool snaps (global to this pool). These define a SnapContext for + * the pool, unless the client manually specifies an alternate + * context. + */ + std::map snaps; + /* + * Alternatively, if we are defining non-pool snaps (e.g. via the + * Ceph MDS), we must track @removed_snaps (since @snaps is not + * used). Snaps and removed_snaps are to be used exclusive of each + * other! + */ + interval_set removed_snaps; + + unsigned pg_num_mask = 0, pgp_num_mask = 0; + + std::set tiers; ///< pools that are tiers of us + int64_t tier_of = -1; ///< pool for which we are a tier + // Note that write wins for read+write ops + int64_t read_tier = -1; ///< pool/tier for objecter to direct reads to + int64_t write_tier = -1; ///< pool/tier for objecter to direct writes to + cache_mode_t cache_mode = CACHEMODE_NONE; ///< cache pool mode + + bool is_tier() const { return tier_of >= 0; } + bool has_tiers() const { return !tiers.empty(); } + void clear_tier() { + tier_of = -1; + clear_read_tier(); + clear_write_tier(); + clear_tier_tunables(); + } + bool has_read_tier() const { return read_tier >= 0; } + void clear_read_tier() { read_tier = -1; } + bool has_write_tier() const { return write_tier >= 0; } + void clear_write_tier() { write_tier = -1; } + void clear_tier_tunables() { + if (cache_mode != CACHEMODE_NONE) + flags |= FLAG_INCOMPLETE_CLONES; + cache_mode = CACHEMODE_NONE; + + target_max_bytes = 0; + target_max_objects = 0; + cache_target_dirty_ratio_micro = 0; + cache_target_dirty_high_ratio_micro = 0; + cache_target_full_ratio_micro = 0; + hit_set_params = HitSet::Params(); + hit_set_period = 0; + hit_set_count = 0; + hit_set_grade_decay_rate = 0; + hit_set_search_last_n = 0; + grade_table.resize(0); + } + + bool is_stretch_pool() const { + return peering_crush_bucket_count != 0; + } + + bool stretch_set_can_peer(const set& want, const OSDMap& osdmap, + std::ostream *out) const; + bool stretch_set_can_peer(const vector& want, const OSDMap& osdmap, + std::ostream *out) const { + if (!is_stretch_pool()) return true; + set swant; + for (auto i : want) swant.insert(i); + return stretch_set_can_peer(swant, osdmap, out); + } + + uint64_t target_max_bytes = 0; ///< tiering: target max pool size + uint64_t target_max_objects = 0; ///< tiering: 
target max pool size + + uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty + uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed + uint32_t cache_target_full_ratio_micro = 0; ///< cache: fraction of target to fill before we evict in earnest + + uint32_t cache_min_flush_age = 0; ///< minimum age (seconds) before we can flush + uint32_t cache_min_evict_age = 0; ///< minimum age (seconds) before we can evict + + HitSet::Params hit_set_params; ///< The HitSet params to use on this pool + uint32_t hit_set_period = 0; ///< periodicity of HitSet segments (seconds) + uint32_t hit_set_count = 0; ///< number of periods to retain + bool use_gmt_hitset = true; ///< use gmt to name the hitset archive object + uint32_t min_read_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on read + uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on write + uint32_t hit_set_grade_decay_rate = 0; ///< current hit_set has highest priority on objects + ///< temperature count,the follow hit_set's priority decay + ///< by this params than pre hit_set + uint32_t hit_set_search_last_n = 0; ///< accumulate atmost N hit_sets for temperature + + uint32_t stripe_width = 0; ///< erasure coded stripe size in bytes + + uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates + ///< user does not specify any expected value + bool fast_read = false; ///< whether turn on fast read on the pool or not + + pool_opts_t opts; ///< options + + typedef enum { + TYPE_FINGERPRINT_NONE = 0, + TYPE_FINGERPRINT_SHA1 = 1, + TYPE_FINGERPRINT_SHA256 = 2, + TYPE_FINGERPRINT_SHA512 = 3, + } fingerprint_t; + static fingerprint_t get_fingerprint_from_str(const std::string& s) { + if (s == "none") + return TYPE_FINGERPRINT_NONE; + if (s == "sha1") + return TYPE_FINGERPRINT_SHA1; + if (s == "sha256") + return TYPE_FINGERPRINT_SHA256; + if (s == "sha512") + return TYPE_FINGERPRINT_SHA512; + return (fingerprint_t)-1; + } + const fingerprint_t get_fingerprint_type() const { + std::string fp_str; + opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str); + return get_fingerprint_from_str(fp_str); + } + const char *get_fingerprint_name() const { + std::string fp_str; + fingerprint_t fp_t; + opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str); + fp_t = get_fingerprint_from_str(fp_str); + return get_fingerprint_name(fp_t); + } + static const char *get_fingerprint_name(fingerprint_t m) { + switch (m) { + case TYPE_FINGERPRINT_NONE: return "none"; + case TYPE_FINGERPRINT_SHA1: return "sha1"; + case TYPE_FINGERPRINT_SHA256: return "sha256"; + case TYPE_FINGERPRINT_SHA512: return "sha512"; + default: return "unknown"; + } + } + + typedef enum { + TYPE_DEDUP_CHUNK_NONE = 0, + TYPE_DEDUP_CHUNK_FASTCDC = 1, + TYPE_DEDUP_CHUNK_FIXEDCDC = 2, + } dedup_chunk_algo_t; + static dedup_chunk_algo_t get_dedup_chunk_algorithm_from_str(const std::string& s) { + if (s == "none") + return TYPE_DEDUP_CHUNK_NONE; + if (s == "fastcdc") + return TYPE_DEDUP_CHUNK_FASTCDC; + if (s == "fixed") + return TYPE_DEDUP_CHUNK_FIXEDCDC; + return (dedup_chunk_algo_t)-1; + } + const dedup_chunk_algo_t get_dedup_chunk_algorithm_type() const { + std::string algo_str; + opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &algo_str); + return get_dedup_chunk_algorithm_from_str(algo_str); + } + const char *get_dedup_chunk_algorithm_name() const { + std::string dedup_chunk_algo_str; + 
dedup_chunk_algo_t dedup_chunk_algo_t; + opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &dedup_chunk_algo_str); + dedup_chunk_algo_t = get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str); + return get_dedup_chunk_algorithm_name(dedup_chunk_algo_t); + } + static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m) { + switch (m) { + case TYPE_DEDUP_CHUNK_NONE: return "none"; + case TYPE_DEDUP_CHUNK_FASTCDC: return "fastcdc"; + case TYPE_DEDUP_CHUNK_FIXEDCDC: return "fixed"; + default: return "unknown"; + } + } + + int64_t get_dedup_tier() const { + int64_t tier_id = 0; + opts.get(pool_opts_t::DEDUP_TIER, &tier_id); + return tier_id; + } + int64_t get_dedup_cdc_chunk_size() const { + int64_t chunk_size = 0; + opts.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE, &chunk_size); + return chunk_size; + } + + /// application -> key/value metadata + std::map> application_metadata; + +private: + std::vector grade_table; + +public: + uint32_t get_grade(unsigned i) const { + if (grade_table.size() <= i) + return 0; + return grade_table[i]; + } + void calc_grade_table() { + unsigned v = 1000000; + grade_table.resize(hit_set_count); + for (unsigned i = 0; i < hit_set_count; i++) { + v = v * (1 - (hit_set_grade_decay_rate / 100.0)); + grade_table[i] = v; + } + } + + pg_pool_t() = default; + + void dump(ceph::Formatter *f) const; + + const utime_t &get_create_time() const { return create_time; } + uint64_t get_flags() const { return flags; } + bool has_flag(uint64_t f) const { return flags & f; } + void set_flag(uint64_t f) { flags |= f; } + void unset_flag(uint64_t f) { flags &= ~f; } + + bool require_rollback() const { + return is_erasure(); + } + + /// true if incomplete clones may be present + bool allow_incomplete_clones() const { + return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES); + } + + unsigned get_type() const { return type; } + unsigned get_size() const { return size; } + unsigned get_min_size() const { return min_size; } + int get_crush_rule() const { return crush_rule; } + int get_object_hash() const { return object_hash; } + const char *get_object_hash_name() const { + return ceph_str_hash_name(get_object_hash()); + } + epoch_t get_last_change() const { return last_change; } + epoch_t get_last_force_op_resend() const { return last_force_op_resend; } + epoch_t get_last_force_op_resend_prenautilus() const { + return last_force_op_resend_prenautilus; + } + epoch_t get_last_force_op_resend_preluminous() const { + return last_force_op_resend_preluminous; + } + epoch_t get_snap_epoch() const { return snap_epoch; } + snapid_t get_snap_seq() const { return snap_seq; } + uint64_t get_auid() const { return auid; } + + void set_snap_seq(snapid_t s) { snap_seq = s; } + void set_snap_epoch(epoch_t e) { snap_epoch = e; } + + void set_stripe_width(uint32_t s) { stripe_width = s; } + uint32_t get_stripe_width() const { return stripe_width; } + + bool is_replicated() const { return get_type() == TYPE_REPLICATED; } + bool is_erasure() const { return get_type() == TYPE_ERASURE; } + + bool supports_omap() const { + return !(get_type() == TYPE_ERASURE); + } + + bool requires_aligned_append() const { + return is_erasure() && !has_flag(FLAG_EC_OVERWRITES); + } + uint64_t required_alignment() const { return stripe_width; } + + bool allows_ecoverwrites() const { + return has_flag(FLAG_EC_OVERWRITES); + } + + bool can_shift_osds() const { + switch (get_type()) { + case TYPE_REPLICATED: + return true; + case TYPE_ERASURE: + return false; + default: + ceph_abort_msg("unhandled pool type"); 
+ } + } + + unsigned get_pg_num() const { return pg_num; } + unsigned get_pgp_num() const { return pgp_num; } + unsigned get_pg_num_target() const { return pg_num_target; } + unsigned get_pgp_num_target() const { return pgp_num_target; } + unsigned get_pg_num_pending() const { return pg_num_pending; } + + unsigned get_pg_num_mask() const { return pg_num_mask; } + unsigned get_pgp_num_mask() const { return pgp_num_mask; } + + // if pg_num is not a multiple of two, pgs are not equally sized. + // return, for a given pg, the fraction (denominator) of the total + // pool size that it represents. + unsigned get_pg_num_divisor(pg_t pgid) const; + + bool is_pending_merge(pg_t pgid, bool *target) const; + + void set_pg_num(int p) { + pg_num = p; + pg_num_pending = p; + calc_pg_masks(); + } + void set_pgp_num(int p) { + pgp_num = p; + calc_pg_masks(); + } + void set_pg_num_pending(int p) { + pg_num_pending = p; + calc_pg_masks(); + } + void set_pg_num_target(int p) { + pg_num_target = p; + } + void set_pgp_num_target(int p) { + pgp_num_target = p; + } + void dec_pg_num(pg_t source_pgid, + epoch_t ready_epoch, + eversion_t source_version, + eversion_t target_version, + epoch_t last_epoch_started, + epoch_t last_epoch_clean) { + --pg_num; + last_pg_merge_meta.source_pgid = source_pgid; + last_pg_merge_meta.ready_epoch = ready_epoch; + last_pg_merge_meta.source_version = source_version; + last_pg_merge_meta.target_version = target_version; + last_pg_merge_meta.last_epoch_started = last_epoch_started; + last_pg_merge_meta.last_epoch_clean = last_epoch_clean; + calc_pg_masks(); + } + + void set_quota_max_bytes(uint64_t m) { + quota_max_bytes = m; + } + uint64_t get_quota_max_bytes() { + return quota_max_bytes; + } + + void set_quota_max_objects(uint64_t m) { + quota_max_objects = m; + } + uint64_t get_quota_max_objects() { + return quota_max_objects; + } + + void set_last_force_op_resend(uint64_t t) { + last_force_op_resend = t; + last_force_op_resend_prenautilus = t; + last_force_op_resend_preluminous = t; + } + + void calc_pg_masks(); + + /* + * we have two snap modes: + * - pool global snaps + * - snap existence/non-existence defined by snaps[] and snap_seq + * - user managed snaps + * - removal governed by removed_snaps + * + * we know which mode we're using based on whether removed_snaps is empty. + * If nothing has been created, both functions report false. + */ + bool is_pool_snaps_mode() const; + bool is_unmanaged_snaps_mode() const; + bool is_removed_snap(snapid_t s) const; + + snapid_t snap_exists(std::string_view s) const; + void add_snap(const char *n, utime_t stamp); + uint64_t add_unmanaged_snap(bool preoctopus_compat); + void remove_snap(snapid_t s); + void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat); + + SnapContext get_snap_context() const; + + /// hash a object name+namespace key to a hash position + uint32_t hash_key(const std::string& key, const std::string& ns) const; + + /// round a hash position down to a pg num + uint32_t raw_hash_to_pg(uint32_t v) const; + + /* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ + pg_t raw_pg_to_pg(pg_t pg) const; + + /* + * map raw pg (full precision ps) into a placement seed. include + * pool id in that value so that different pools don't use the same + * seeds. 
+ */ + ps_t raw_pg_to_pps(pg_t pg) const; + + /// choose a random hash position within a pg + uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const; + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pg_pool_t) + +std::ostream& operator<<(std::ostream& out, const pg_pool_t& p); + + +/** + * a summation of object stats + * + * This is just a container for object stats; we don't know what for. + * + * If you add members in object_stat_sum_t, you should make sure there are + * not padding among these members. + * You should also modify the padding_check function. + + */ +struct object_stat_sum_t { + /************************************************************************** + * WARNING: be sure to update operator==, floor, and split when + * adding/removing fields! + **************************************************************************/ + int64_t num_bytes; // in bytes + int64_t num_objects; + int64_t num_object_clones; + int64_t num_object_copies; // num_objects * num_replicas + int64_t num_objects_missing_on_primary; + int64_t num_objects_degraded; + int64_t num_objects_unfound; + int64_t num_rd; + int64_t num_rd_kb; + int64_t num_wr; + int64_t num_wr_kb; + int64_t num_scrub_errors; // total deep and shallow scrub errors + int64_t num_objects_recovered; + int64_t num_bytes_recovered; + int64_t num_keys_recovered; + int64_t num_shallow_scrub_errors; + int64_t num_deep_scrub_errors; + int64_t num_objects_dirty; + int64_t num_whiteouts; + int64_t num_objects_omap; + int64_t num_objects_hit_set_archive; + int64_t num_objects_misplaced; + int64_t num_bytes_hit_set_archive; + int64_t num_flush; + int64_t num_flush_kb; + int64_t num_evict; + int64_t num_evict_kb; + int64_t num_promote; + int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0 + int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0 + int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0 + int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0 + int64_t num_objects_pinned; + int64_t num_objects_missing; + int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets + int64_t num_large_omap_objects = 0; + int64_t num_objects_manifest = 0; + int64_t num_omap_bytes = 0; + int64_t num_omap_keys = 0; + int64_t num_objects_repaired = 0; + + object_stat_sum_t() + : num_bytes(0), + num_objects(0), num_object_clones(0), num_object_copies(0), + num_objects_missing_on_primary(0), num_objects_degraded(0), + num_objects_unfound(0), + num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0), + num_scrub_errors(0), + num_objects_recovered(0), + num_bytes_recovered(0), + num_keys_recovered(0), + num_shallow_scrub_errors(0), + num_deep_scrub_errors(0), + num_objects_dirty(0), + num_whiteouts(0), + num_objects_omap(0), + num_objects_hit_set_archive(0), + num_objects_misplaced(0), + num_bytes_hit_set_archive(0), + num_flush(0), + num_flush_kb(0), + num_evict(0), + num_evict_kb(0), + num_promote(0), + num_flush_mode_high(0), num_flush_mode_low(0), + num_evict_mode_some(0), num_evict_mode_full(0), + num_objects_pinned(0), + num_objects_missing(0), + num_legacy_snapsets(0) + {} + + void floor(int64_t f) { +#define FLOOR(x) if (x < f) x = f + FLOOR(num_bytes); + FLOOR(num_objects); + FLOOR(num_object_clones); + FLOOR(num_object_copies); + FLOOR(num_objects_missing_on_primary); + FLOOR(num_objects_missing); + 
FLOOR(num_objects_degraded); + FLOOR(num_objects_misplaced); + FLOOR(num_objects_unfound); + FLOOR(num_rd); + FLOOR(num_rd_kb); + FLOOR(num_wr); + FLOOR(num_wr_kb); + FLOOR(num_large_omap_objects); + FLOOR(num_objects_manifest); + FLOOR(num_omap_bytes); + FLOOR(num_omap_keys); + FLOOR(num_shallow_scrub_errors); + FLOOR(num_deep_scrub_errors); + num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors; + FLOOR(num_objects_recovered); + FLOOR(num_bytes_recovered); + FLOOR(num_keys_recovered); + FLOOR(num_objects_dirty); + FLOOR(num_whiteouts); + FLOOR(num_objects_omap); + FLOOR(num_objects_hit_set_archive); + FLOOR(num_bytes_hit_set_archive); + FLOOR(num_flush); + FLOOR(num_flush_kb); + FLOOR(num_evict); + FLOOR(num_evict_kb); + FLOOR(num_promote); + FLOOR(num_flush_mode_high); + FLOOR(num_flush_mode_low); + FLOOR(num_evict_mode_some); + FLOOR(num_evict_mode_full); + FLOOR(num_objects_pinned); + FLOOR(num_legacy_snapsets); + FLOOR(num_objects_repaired); +#undef FLOOR + } + + void split(std::vector &out) const { +#define SPLIT(PARAM) \ + for (unsigned i = 0; i < out.size(); ++i) { \ + out[i].PARAM = PARAM / out.size(); \ + if (i < (PARAM % out.size())) { \ + out[i].PARAM++; \ + } \ + } +#define SPLIT_PRESERVE_NONZERO(PARAM) \ + for (unsigned i = 0; i < out.size(); ++i) { \ + if (PARAM) \ + out[i].PARAM = 1 + PARAM / out.size(); \ + else \ + out[i].PARAM = 0; \ + } + + SPLIT(num_bytes); + SPLIT(num_objects); + SPLIT(num_object_clones); + SPLIT(num_object_copies); + SPLIT(num_objects_missing_on_primary); + SPLIT(num_objects_missing); + SPLIT(num_objects_degraded); + SPLIT(num_objects_misplaced); + SPLIT(num_objects_unfound); + SPLIT(num_rd); + SPLIT(num_rd_kb); + SPLIT(num_wr); + SPLIT(num_wr_kb); + SPLIT(num_large_omap_objects); + SPLIT(num_objects_manifest); + SPLIT(num_omap_bytes); + SPLIT(num_omap_keys); + SPLIT(num_objects_repaired); + SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors); + SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors); + for (unsigned i = 0; i < out.size(); ++i) { + out[i].num_scrub_errors = out[i].num_shallow_scrub_errors + + out[i].num_deep_scrub_errors; + } + SPLIT(num_objects_recovered); + SPLIT(num_bytes_recovered); + SPLIT(num_keys_recovered); + SPLIT(num_objects_dirty); + SPLIT(num_whiteouts); + SPLIT(num_objects_omap); + SPLIT(num_objects_hit_set_archive); + SPLIT(num_bytes_hit_set_archive); + SPLIT(num_flush); + SPLIT(num_flush_kb); + SPLIT(num_evict); + SPLIT(num_evict_kb); + SPLIT(num_promote); + SPLIT(num_flush_mode_high); + SPLIT(num_flush_mode_low); + SPLIT(num_evict_mode_some); + SPLIT(num_evict_mode_full); + SPLIT(num_objects_pinned); + SPLIT_PRESERVE_NONZERO(num_legacy_snapsets); +#undef SPLIT +#undef SPLIT_PRESERVE_NONZERO + } + + void clear() { + // FIPS zeroization audit 20191117: this memset is not security related. 
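+ // (Zeroing the raw bytes is equivalent to resetting every counter:
+ // object_stat_sum_t is a flat aggregate of integer fields with no
+ // padding -- see padding_check() below.)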
+ memset(this, 0, sizeof(*this)); + } + + void calc_copies(int nrep) { + num_object_copies = nrep * num_objects; + } + + bool is_zero() const { + return mem_is_zero((char*)this, sizeof(*this)); + } + + void add(const object_stat_sum_t& o); + void sub(const object_stat_sum_t& o); + + void dump(ceph::Formatter *f) const; + void padding_check() { + static_assert( + sizeof(object_stat_sum_t) == + sizeof(num_bytes) + + sizeof(num_objects) + + sizeof(num_object_clones) + + sizeof(num_object_copies) + + sizeof(num_objects_missing_on_primary) + + sizeof(num_objects_degraded) + + sizeof(num_objects_unfound) + + sizeof(num_rd) + + sizeof(num_rd_kb) + + sizeof(num_wr) + + sizeof(num_wr_kb) + + sizeof(num_scrub_errors) + + sizeof(num_large_omap_objects) + + sizeof(num_objects_manifest) + + sizeof(num_omap_bytes) + + sizeof(num_omap_keys) + + sizeof(num_objects_repaired) + + sizeof(num_objects_recovered) + + sizeof(num_bytes_recovered) + + sizeof(num_keys_recovered) + + sizeof(num_shallow_scrub_errors) + + sizeof(num_deep_scrub_errors) + + sizeof(num_objects_dirty) + + sizeof(num_whiteouts) + + sizeof(num_objects_omap) + + sizeof(num_objects_hit_set_archive) + + sizeof(num_objects_misplaced) + + sizeof(num_bytes_hit_set_archive) + + sizeof(num_flush) + + sizeof(num_flush_kb) + + sizeof(num_evict) + + sizeof(num_evict_kb) + + sizeof(num_promote) + + sizeof(num_flush_mode_high) + + sizeof(num_flush_mode_low) + + sizeof(num_evict_mode_some) + + sizeof(num_evict_mode_full) + + sizeof(num_objects_pinned) + + sizeof(num_objects_missing) + + sizeof(num_legacy_snapsets) + , + "object_stat_sum_t have padding"); + } + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(object_stat_sum_t) + +bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r); + +/** + * a collection of object stat sums + * + * This is a collection of stat sums over different categories. + */ +struct object_stat_collection_t { + /************************************************************************** + * WARNING: be sure to update the operator== when adding/removing fields! * + **************************************************************************/ + object_stat_sum_t sum; + + void calc_copies(int nrep) { + sum.calc_copies(nrep); + } + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list& o); + + bool is_zero() const { + return sum.is_zero(); + } + + void clear() { + sum.clear(); + } + + void floor(int64_t f) { + sum.floor(f); + } + + void add(const object_stat_sum_t& o) { + sum.add(o); + } + + void add(const object_stat_collection_t& o) { + sum.add(o.sum); + } + void sub(const object_stat_collection_t& o) { + sum.sub(o.sum); + } +}; +WRITE_CLASS_ENCODER(object_stat_collection_t) + +inline bool operator==(const object_stat_collection_t& l, + const object_stat_collection_t& r) { + return l.sum == r.sum; +} + + +/** pg_stat + * aggregate stats for a single PG. + */ +struct pg_stat_t { + /************************************************************************** + * WARNING: be sure to update the operator== when adding/removing fields! 
* + **************************************************************************/ + eversion_t version; + version_t reported_seq; // sequence number + epoch_t reported_epoch; // epoch of this report + uint64_t state; + utime_t last_fresh; // last reported + utime_t last_change; // new state != previous state + utime_t last_active; // state & PG_STATE_ACTIVE + utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED + utime_t last_clean; // state & PG_STATE_CLEAN + utime_t last_unstale; // (state & PG_STATE_STALE) == 0 + utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0 + utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0 + + eversion_t log_start; // (log_start,version] + eversion_t ondisk_log_start; // there may be more on disk + + epoch_t created; + epoch_t last_epoch_clean; + pg_t parent; + __u32 parent_split_bits; + + eversion_t last_scrub; + eversion_t last_deep_scrub; + utime_t last_scrub_stamp; + utime_t last_deep_scrub_stamp; + utime_t last_clean_scrub_stamp; + + object_stat_collection_t stats; + + int64_t log_size; + int64_t ondisk_log_size; // >= active_log_size + + std::vector up, acting; + std::vector avail_no_missing; + std::map< std::set, int32_t > object_location_counts; + epoch_t mapping_epoch; + + std::vector blocked_by; ///< osds on which the pg is blocked + + interval_set purged_snaps; ///< recently removed snaps that we've purged + + utime_t last_became_active; + utime_t last_became_peered; + + /// up, acting primaries + int32_t up_primary; + int32_t acting_primary; + + // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is + // absurd already, so cap it to 2^32 and save 4 bytes at the same time + uint32_t snaptrimq_len; + + bool stats_invalid:1; + /// true if num_objects_dirty is not accurate (because it was not + /// maintained starting from pool creation) + bool dirty_stats_invalid:1; + bool omap_stats_invalid:1; + bool hitset_stats_invalid:1; + bool hitset_bytes_stats_invalid:1; + bool pin_stats_invalid:1; + bool manifest_stats_invalid:1; + + pg_stat_t() + : reported_seq(0), + reported_epoch(0), + state(0), + created(0), last_epoch_clean(0), + parent_split_bits(0), + log_size(0), ondisk_log_size(0), + mapping_epoch(0), + up_primary(-1), + acting_primary(-1), + snaptrimq_len(0), + stats_invalid(false), + dirty_stats_invalid(false), + omap_stats_invalid(false), + hitset_stats_invalid(false), + hitset_bytes_stats_invalid(false), + pin_stats_invalid(false), + manifest_stats_invalid(false) + { } + + epoch_t get_effective_last_epoch_clean() const { + if (state & PG_STATE_CLEAN) { + // we are clean as of this report, and should thus take the + // reported epoch + return reported_epoch; + } else { + return last_epoch_clean; + } + } + + std::pair get_version_pair() const { + return { reported_epoch, reported_seq }; + } + + void floor(int64_t f) { + stats.floor(f); + if (log_size < f) + log_size = f; + if (ondisk_log_size < f) + ondisk_log_size = f; + if (snaptrimq_len < f) + snaptrimq_len = f; + } + + void add_sub_invalid_flags(const pg_stat_t& o) { + // adding (or subtracting!) 
invalid stats render our stats invalid too + stats_invalid |= o.stats_invalid; + dirty_stats_invalid |= o.dirty_stats_invalid; + omap_stats_invalid |= o.omap_stats_invalid; + hitset_stats_invalid |= o.hitset_stats_invalid; + hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid; + pin_stats_invalid |= o.pin_stats_invalid; + manifest_stats_invalid |= o.manifest_stats_invalid; + } + void add(const pg_stat_t& o) { + stats.add(o.stats); + log_size += o.log_size; + ondisk_log_size += o.ondisk_log_size; + snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len, + (uint64_t)(1ull << 31)); + add_sub_invalid_flags(o); + } + void sub(const pg_stat_t& o) { + stats.sub(o.stats); + log_size -= o.log_size; + ondisk_log_size -= o.ondisk_log_size; + if (o.snaptrimq_len < snaptrimq_len) { + snaptrimq_len -= o.snaptrimq_len; + } else { + snaptrimq_len = 0; + } + add_sub_invalid_flags(o); + } + + bool is_acting_osd(int32_t osd, bool primary) const; + void dump(ceph::Formatter *f) const; + void dump_brief(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(pg_stat_t) + +bool operator==(const pg_stat_t& l, const pg_stat_t& r); + +/** store_statfs_t + * ObjectStore full statfs information + */ +struct store_statfs_t +{ + uint64_t total = 0; ///< Total bytes + uint64_t available = 0; ///< Free bytes available + uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes + + int64_t allocated = 0; ///< Bytes allocated by the store + + int64_t data_stored = 0; ///< Bytes actually stored by the user + int64_t data_compressed = 0; ///< Bytes stored after compression + int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data + int64_t data_compressed_original = 0; ///< Bytes that were compressed + + int64_t omap_allocated = 0; ///< approx usage of omap data + int64_t internal_metadata = 0; ///< approx usage of internal metadata + + void reset() { + *this = store_statfs_t(); + } + void floor(int64_t f) { +#define FLOOR(x) if (int64_t(x) < f) x = f + FLOOR(total); + FLOOR(available); + FLOOR(internally_reserved); + FLOOR(allocated); + FLOOR(data_stored); + FLOOR(data_compressed); + FLOOR(data_compressed_allocated); + FLOOR(data_compressed_original); + + FLOOR(omap_allocated); + FLOOR(internal_metadata); +#undef FLOOR + } + + bool operator ==(const store_statfs_t& other) const; + bool is_zero() const { + return *this == store_statfs_t(); + } + + uint64_t get_used() const { + return total - available - internally_reserved; + } + + // this accumulates both actually used and statfs's internally_reserved + uint64_t get_used_raw() const { + return total - available; + } + + float get_used_raw_ratio() const { + if (total) { + return (float)get_used_raw() / (float)total; + } else { + return 0.0; + } + } + + // helpers to ease legacy code porting + uint64_t kb_avail() const { + return available >> 10; + } + uint64_t kb() const { + return total >> 10; + } + uint64_t kb_used() const { + return (total - available - internally_reserved) >> 10; + } + uint64_t kb_used_raw() const { + return get_used_raw() >> 10; + } + + uint64_t kb_used_data() const { + return allocated >> 10; + } + uint64_t kb_used_omap() const { + return omap_allocated >> 10; + } + + uint64_t kb_used_internal_metadata() const { + return internal_metadata >> 10; + } + + void add(const store_statfs_t& o) { + total += o.total; + available += o.available; + 
internally_reserved += o.internally_reserved; + allocated += o.allocated; + data_stored += o.data_stored; + data_compressed += o.data_compressed; + data_compressed_allocated += o.data_compressed_allocated; + data_compressed_original += o.data_compressed_original; + omap_allocated += o.omap_allocated; + internal_metadata += o.internal_metadata; + } + void sub(const store_statfs_t& o) { + total -= o.total; + available -= o.available; + internally_reserved -= o.internally_reserved; + allocated -= o.allocated; + data_stored -= o.data_stored; + data_compressed -= o.data_compressed; + data_compressed_allocated -= o.data_compressed_allocated; + data_compressed_original -= o.data_compressed_original; + omap_allocated -= o.omap_allocated; + internal_metadata -= o.internal_metadata; + } + void dump(ceph::Formatter *f) const; + DENC(store_statfs_t, v, p) { + DENC_START(1, 1, p); + denc(v.total, p); + denc(v.available, p); + denc(v.internally_reserved, p); + denc(v.allocated, p); + denc(v.data_stored, p); + denc(v.data_compressed, p); + denc(v.data_compressed_allocated, p); + denc(v.data_compressed_original, p); + denc(v.omap_allocated, p); + denc(v.internal_metadata, p); + DENC_FINISH(p); + } + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_DENC(store_statfs_t) + +std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs); + +/** osd_stat + * aggregate stats for an osd + */ +struct osd_stat_t { + store_statfs_t statfs; + std::vector hb_peers; + int32_t snap_trim_queue_len, num_snap_trimming; + uint64_t num_shards_repaired; + + pow2_hist_t op_queue_age_hist; + + objectstore_perf_stat_t os_perf_stat; + osd_alerts_t os_alerts; + + epoch_t up_from = 0; + uint64_t seq = 0; + + uint32_t num_pgs = 0; + + uint32_t num_osds = 0; + uint32_t num_per_pool_osds = 0; + uint32_t num_per_pool_omap_osds = 0; + + struct Interfaces { + uint32_t last_update; // in seconds + uint32_t back_pingtime[3]; + uint32_t back_min[3]; + uint32_t back_max[3]; + uint32_t back_last; + uint32_t front_pingtime[3]; + uint32_t front_min[3]; + uint32_t front_max[3]; + uint32_t front_last; + }; + std::map hb_pingtime; ///< map of osd id to Interfaces + + osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0), + num_shards_repaired(0) {} + + void add(const osd_stat_t& o) { + statfs.add(o.statfs); + snap_trim_queue_len += o.snap_trim_queue_len; + num_snap_trimming += o.num_snap_trimming; + num_shards_repaired += o.num_shards_repaired; + op_queue_age_hist.add(o.op_queue_age_hist); + os_perf_stat.add(o.os_perf_stat); + num_pgs += o.num_pgs; + num_osds += o.num_osds; + num_per_pool_osds += o.num_per_pool_osds; + num_per_pool_omap_osds += o.num_per_pool_omap_osds; + for (const auto& a : o.os_alerts) { + auto& target = os_alerts[a.first]; + for (auto& i : a.second) { + target.emplace(i.first, i.second); + } + } + } + void sub(const osd_stat_t& o) { + statfs.sub(o.statfs); + snap_trim_queue_len -= o.snap_trim_queue_len; + num_snap_trimming -= o.num_snap_trimming; + num_shards_repaired -= o.num_shards_repaired; + op_queue_age_hist.sub(o.op_queue_age_hist); + os_perf_stat.sub(o.os_perf_stat); + num_pgs -= o.num_pgs; + num_osds -= o.num_osds; + num_per_pool_osds -= o.num_per_pool_osds; + num_per_pool_omap_osds -= o.num_per_pool_omap_osds; + for (const auto& a : o.os_alerts) { + auto& target = os_alerts[a.first]; + for (auto& i : a.second) { + target.erase(i.first); + } + if (target.empty()) { + os_alerts.erase(a.first); + } + } + } + void dump(ceph::Formatter *f, bool with_net = true) const; + void 
dump_ping_time(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(osd_stat_t) + +inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) { + return l.statfs == r.statfs && + l.snap_trim_queue_len == r.snap_trim_queue_len && + l.num_snap_trimming == r.num_snap_trimming && + l.num_shards_repaired == r.num_shards_repaired && + l.hb_peers == r.hb_peers && + l.op_queue_age_hist == r.op_queue_age_hist && + l.os_perf_stat == r.os_perf_stat && + l.num_pgs == r.num_pgs && + l.num_osds == r.num_osds && + l.num_per_pool_osds == r.num_per_pool_osds && + l.num_per_pool_omap_osds == r.num_per_pool_omap_osds; +} +inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) { + return !(l == r); +} + +inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) { + return out << "osd_stat(" << s.statfs << ", " + << "peers " << s.hb_peers + << " op hist " << s.op_queue_age_hist.h + << ")"; +} + +/* + * summation over an entire pool + */ +struct pool_stat_t { + object_stat_collection_t stats; + store_statfs_t store_stats; + int64_t log_size; + int64_t ondisk_log_size; // >= active_log_size + int32_t up; ///< number of up replicas or shards + int32_t acting; ///< number of acting replicas or shards + int32_t num_store_stats; ///< amount of store_stats accumulated + + pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0), + num_store_stats(0) + { } + + void floor(int64_t f) { + stats.floor(f); + store_stats.floor(f); + if (log_size < f) + log_size = f; + if (ondisk_log_size < f) + ondisk_log_size = f; + if (up < f) + up = f; + if (acting < f) + acting = f; + if (num_store_stats < f) + num_store_stats = f; + } + + void add(const store_statfs_t& o) { + store_stats.add(o); + ++num_store_stats; + } + void sub(const store_statfs_t& o) { + store_stats.sub(o); + --num_store_stats; + } + + void add(const pg_stat_t& o) { + stats.add(o.stats); + log_size += o.log_size; + ondisk_log_size += o.ondisk_log_size; + up += o.up.size(); + acting += o.acting.size(); + } + void sub(const pg_stat_t& o) { + stats.sub(o.stats); + log_size -= o.log_size; + ondisk_log_size -= o.ondisk_log_size; + up -= o.up.size(); + acting -= o.acting.size(); + } + + bool is_zero() const { + return (stats.is_zero() && + store_stats.is_zero() && + log_size == 0 && + ondisk_log_size == 0 && + up == 0 && + acting == 0 && + num_store_stats == 0); + } + + // helper accessors to retrieve used/netto bytes depending on the + // collection method: new per-pool objectstore report or legacy PG + // summation at OSD. + // In legacy mode used and netto values are the same. But for new per-pool + // collection 'used' provides amount of space ALLOCATED at all related OSDs + // and 'netto' is amount of stored user data. + uint64_t get_allocated_data_bytes(bool per_pool) const { + if (per_pool) { + return store_stats.allocated; + } else { + // legacy mode, use numbers from 'stats' + return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive; + } + } + uint64_t get_allocated_omap_bytes(bool per_pool_omap) const { + if (per_pool_omap) { + return store_stats.omap_allocated; + } else { + // omap is not broken out by pool by nautilus bluestore; report the + // scrub value. this will be imprecise in that it won't account for + // any storage overhead/efficiency. 
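+ // (num_omap_bytes is the logical key+value size gathered at scrub time;
+ // the space actually consumed in the omap store may differ in either
+ // direction.)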
+ return stats.sum.num_omap_bytes; + } + } + uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor + bool per_pool) const { + // NOTE: we need the space amp factor so that we can work backwards from + // the raw utilization to the amount of data that the user actually stored. + if (per_pool) { + return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0; + } else { + // legacy mode, use numbers from 'stats'. note that we do NOT use the + // raw_used_rate factor here because we are working from the PG stats + // directly. + return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive; + } + } + uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor + bool per_pool_omap) const { + if (per_pool_omap) { + return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0; + } else { + // omap usage is lazily reported during scrub; this value may lag. + return stats.sum.num_omap_bytes; + } + } + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_stat_t) + + +// ----------------------------------------- + +/** + * pg_hit_set_info_t - information about a single recorded HitSet + * + * Track basic metadata about a HitSet, like the number of insertions + * and the time range it covers. + */ +struct pg_hit_set_info_t { + utime_t begin, end; ///< time interval + eversion_t version; ///< version this HitSet object was written + bool using_gmt; ///< use gmt for creating the hit_set archive object name + + friend bool operator==(const pg_hit_set_info_t& l, + const pg_hit_set_info_t& r) { + return + l.begin == r.begin && + l.end == r.end && + l.version == r.version && + l.using_gmt == r.using_gmt; + } + + explicit pg_hit_set_info_t(bool using_gmt = true) + : using_gmt(using_gmt) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(pg_hit_set_info_t) + +/** + * pg_hit_set_history_t - information about a history of hitsets + * + * Include information about the currently accumulating hit set as well + * as archived/historical ones. + */ +struct pg_hit_set_history_t { + eversion_t current_last_update; ///< last version inserted into current set + std::list history; ///< archived sets, sorted oldest -> newest + + friend bool operator==(const pg_hit_set_history_t& l, + const pg_hit_set_history_t& r) { + return + l.current_last_update == r.current_last_update && + l.history == r.history; + } + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(pg_hit_set_history_t) + + +// ----------------------------------------- + +/** + * pg_history_t - information about recent pg peering/mapping history + * + * This is aggressively shared between OSDs to bound the amount of past + * history they need to worry about. 
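+ *
+ * Peers exchange their pg_history_t during peering and fold the copies
+ * together with merge() below, which keeps the newer (more conservative)
+ * of each pair of values.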
+ */
+struct pg_history_t {
+  epoch_t epoch_created = 0;       // epoch in which *pg* was created (pool or pg)
+  epoch_t epoch_pool_created = 0;  // epoch in which *pool* was created
+                                   // (note: may be pg creation epoch for
+                                   // pre-luminous clusters)
+  epoch_t last_epoch_started = 0;  // lower bound on last epoch started (anywhere, not necessarily locally)
+                                   // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
+  epoch_t last_interval_started = 0;  // first epoch of last_epoch_started interval
+  epoch_t last_epoch_clean = 0;       // lower bound on last epoch the PG was completely clean.
+  epoch_t last_interval_clean = 0;    // first epoch of last_epoch_clean interval
+  epoch_t last_epoch_split = 0;       // as parent or child
+  epoch_t last_epoch_marked_full = 0; // pool or cluster
+
+  /**
+   * In the event of a map discontinuity, same_*_since may reflect the first
+   * map the osd has seen in the new map sequence rather than the actual start
+   * of the interval. This is ok since a discontinuity at epoch e means there
+   * must have been a clean interval between e and now and that we cannot be
+   * in the active set during the interval containing e.
+   */
+  epoch_t same_up_since = 0;       // same up set since
+  epoch_t same_interval_since = 0; // same acting AND up set since
+  epoch_t same_primary_since = 0;  // same primary at least back through this epoch.
+
+  eversion_t last_scrub;
+  eversion_t last_deep_scrub;
+  utime_t last_scrub_stamp;
+  utime_t last_deep_scrub_stamp;
+  utime_t last_clean_scrub_stamp;
+
+  /// upper bound on how long prior interval readable (relative to encode time)
+  ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
+
+  friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
+    return
+      l.epoch_created == r.epoch_created &&
+      l.epoch_pool_created == r.epoch_pool_created &&
+      l.last_epoch_started == r.last_epoch_started &&
+      l.last_interval_started == r.last_interval_started &&
+      l.last_epoch_clean == r.last_epoch_clean &&
+      l.last_interval_clean == r.last_interval_clean &&
+      l.last_epoch_split == r.last_epoch_split &&
+      l.last_epoch_marked_full == r.last_epoch_marked_full &&
+      l.same_up_since == r.same_up_since &&
+      l.same_interval_since == r.same_interval_since &&
+      l.same_primary_since == r.same_primary_since &&
+      l.last_scrub == r.last_scrub &&
+      l.last_deep_scrub == r.last_deep_scrub &&
+      l.last_scrub_stamp == r.last_scrub_stamp &&
+      l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
+      l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
+      l.prior_readable_until_ub == r.prior_readable_until_ub;
+  }
+
+  pg_history_t() {}
+  pg_history_t(epoch_t created, utime_t stamp)
+    : epoch_created(created),
+      epoch_pool_created(created),
+      same_up_since(created),
+      same_interval_since(created),
+      same_primary_since(created),
+      last_scrub_stamp(stamp),
+      last_deep_scrub_stamp(stamp),
+      last_clean_scrub_stamp(stamp) {}
+
+  bool merge(const pg_history_t &other) {
+    // Here, we only update the fields which cannot be calculated from the OSDmap.
+    bool modified = false;
+    if (epoch_created < other.epoch_created) {
+      epoch_created = other.epoch_created;
+      modified = true;
+    }
+    if (epoch_pool_created < other.epoch_pool_created) {
+      // FIXME: for jewel compat only; this should either be 0 or always the
+      // same value across all pg instances.
+ epoch_pool_created = other.epoch_pool_created; + modified = true; + } + if (last_epoch_started < other.last_epoch_started) { + last_epoch_started = other.last_epoch_started; + modified = true; + } + if (last_interval_started < other.last_interval_started) { + last_interval_started = other.last_interval_started; + // if we are learning about a newer *started* interval, our + // readable_until_ub is obsolete + prior_readable_until_ub = other.prior_readable_until_ub; + modified = true; + } else if (other.last_interval_started == last_interval_started && + other.prior_readable_until_ub < prior_readable_until_ub) { + // if other is the *same* interval, than pull our upper bound in + // if they have a tighter bound. + prior_readable_until_ub = other.prior_readable_until_ub; + modified = true; + } + if (last_epoch_clean < other.last_epoch_clean) { + last_epoch_clean = other.last_epoch_clean; + modified = true; + } + if (last_interval_clean < other.last_interval_clean) { + last_interval_clean = other.last_interval_clean; + modified = true; + } + if (last_epoch_split < other.last_epoch_split) { + last_epoch_split = other.last_epoch_split; + modified = true; + } + if (last_epoch_marked_full < other.last_epoch_marked_full) { + last_epoch_marked_full = other.last_epoch_marked_full; + modified = true; + } + if (other.last_scrub > last_scrub) { + last_scrub = other.last_scrub; + modified = true; + } + if (other.last_scrub_stamp > last_scrub_stamp) { + last_scrub_stamp = other.last_scrub_stamp; + modified = true; + } + if (other.last_deep_scrub > last_deep_scrub) { + last_deep_scrub = other.last_deep_scrub; + modified = true; + } + if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) { + last_deep_scrub_stamp = other.last_deep_scrub_stamp; + modified = true; + } + if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) { + last_clean_scrub_stamp = other.last_clean_scrub_stamp; + modified = true; + } + return modified; + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + ceph::signedspan refresh_prior_readable_until_ub( + ceph::signedspan now, ///< now, relative to osd startup_time + ceph::signedspan ub) { ///< ub, relative to osd startup_time + if (now >= ub) { + // prior interval(s) are unreadable; we can zero the upper bound + prior_readable_until_ub = ceph::signedspan::zero(); + return ceph::signedspan::zero(); + } else { + prior_readable_until_ub = ub - now; + return ub; + } + } + ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) { + if (prior_readable_until_ub == ceph::signedspan::zero()) { + return ceph::signedspan::zero(); + } + return now + prior_readable_until_ub; + } +}; +WRITE_CLASS_ENCODER(pg_history_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) { + out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created + << " lis/c=" << h.last_interval_started + << "/" << h.last_interval_clean + << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean + << "/" << h.last_epoch_marked_full + << " sis=" << h.same_interval_since; + if (h.prior_readable_until_ub != ceph::timespan::zero()) { + out << " pruub=" << h.prior_readable_until_ub; + } + return out; +} + + +/** + * pg_info_t - summary of PG statistics. + * + * some notes: + * - last_complete implies we have all objects that existed as of that + * stamp, OR a newer object, OR have already applied a later delete. 
+ * - if last_complete >= log.tail, then we know pg contents thru log.head. + * otherwise, we have no idea what the pg is supposed to contain. + */ +struct pg_info_t { + spg_t pgid; + eversion_t last_update; ///< last object version applied to store. + eversion_t last_complete; ///< last version pg was complete through. + epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd + epoch_t last_interval_started; ///< first epoch of last_epoch_started interval + + version_t last_user_version; ///< last user object version applied to store + + eversion_t log_tail; ///< oldest log entry. + + hobject_t last_backfill; ///< objects >= this and < last_complete may be missing + + interval_set purged_snaps; + + pg_stat_t stats; + + pg_history_t history; + pg_hit_set_history_t hit_set; + + friend bool operator==(const pg_info_t& l, const pg_info_t& r) { + return + l.pgid == r.pgid && + l.last_update == r.last_update && + l.last_complete == r.last_complete && + l.last_epoch_started == r.last_epoch_started && + l.last_interval_started == r.last_interval_started && + l.last_user_version == r.last_user_version && + l.log_tail == r.log_tail && + l.last_backfill == r.last_backfill && + l.purged_snaps == r.purged_snaps && + l.stats == r.stats && + l.history == r.history && + l.hit_set == r.hit_set; + } + + pg_info_t() + : last_epoch_started(0), + last_interval_started(0), + last_user_version(0), + last_backfill(hobject_t::get_max()) + { } + // cppcheck-suppress noExplicitConstructor + pg_info_t(spg_t p) + : pgid(p), + last_epoch_started(0), + last_interval_started(0), + last_user_version(0), + last_backfill(hobject_t::get_max()) + { } + + void set_last_backfill(hobject_t pos) { + last_backfill = pos; + } + + bool is_empty() const { return last_update.version == 0; } + bool dne() const { return history.epoch_created == 0; } + + bool has_missing() const { return last_complete != last_update; } + bool is_incomplete() const { return !last_backfill.is_max(); } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(pg_info_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi) +{ + out << pgi.pgid << "("; + if (pgi.dne()) + out << " DNE"; + if (pgi.is_empty()) + out << " empty"; + else { + out << " v " << pgi.last_update; + if (pgi.last_complete != pgi.last_update) + out << " lc " << pgi.last_complete; + out << " (" << pgi.log_tail << "," << pgi.last_update << "]"; + } + if (pgi.is_incomplete()) + out << " lb " << pgi.last_backfill; + //out << " c " << pgi.epoch_created; + out << " local-lis/les=" << pgi.last_interval_started + << "/" << pgi.last_epoch_started; + out << " n=" << pgi.stats.stats.sum.num_objects; + out << " " << pgi.history + << ")"; + return out; +} + +/** + * pg_fast_info_t - common pg_info_t fields + * + * These are the fields of pg_info_t (and children) that are updated for + * most IO operations. + * + * ** WARNING ** + * Because we rely on these fields to be applied to the normal + * info struct, adding a new field here that is not also new in info + * means that we must set an incompat OSD feature bit! 
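+ *
+ * Rough usage sketch (variable names are illustrative, not the actual OSD
+ * code): the primary calls populate_from(info) to build the compact record
+ * it persists alongside most writes, and try_apply_to(&info) later folds
+ * such a record back into a full pg_info_t, returning false when the full
+ * info is already as new or newer.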
+ */ +struct pg_fast_info_t { + eversion_t last_update; + eversion_t last_complete; + version_t last_user_version; + struct { // pg_stat_t stats + eversion_t version; + version_t reported_seq; + utime_t last_fresh; + utime_t last_active; + utime_t last_peered; + utime_t last_clean; + utime_t last_unstale; + utime_t last_undegraded; + utime_t last_fullsized; + int64_t log_size; // (also ondisk_log_size, which has the same value) + struct { // object_stat_collection_t stats; + struct { // objct_stat_sum_t sum + int64_t num_bytes; // in bytes + int64_t num_objects; + int64_t num_object_copies; + int64_t num_rd; + int64_t num_rd_kb; + int64_t num_wr; + int64_t num_wr_kb; + int64_t num_objects_dirty; + } sum; + } stats; + } stats; + + void populate_from(const pg_info_t& info) { + last_update = info.last_update; + last_complete = info.last_complete; + last_user_version = info.last_user_version; + stats.version = info.stats.version; + stats.reported_seq = info.stats.reported_seq; + stats.last_fresh = info.stats.last_fresh; + stats.last_active = info.stats.last_active; + stats.last_peered = info.stats.last_peered; + stats.last_clean = info.stats.last_clean; + stats.last_unstale = info.stats.last_unstale; + stats.last_undegraded = info.stats.last_undegraded; + stats.last_fullsized = info.stats.last_fullsized; + stats.log_size = info.stats.log_size; + stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes; + stats.stats.sum.num_objects = info.stats.stats.sum.num_objects; + stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies; + stats.stats.sum.num_rd = info.stats.stats.sum.num_rd; + stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb; + stats.stats.sum.num_wr = info.stats.stats.sum.num_wr; + stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb; + stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty; + } + + bool try_apply_to(pg_info_t* info) { + if (last_update <= info->last_update) + return false; + info->last_update = last_update; + info->last_complete = last_complete; + info->last_user_version = last_user_version; + info->stats.version = stats.version; + info->stats.reported_seq = stats.reported_seq; + info->stats.last_fresh = stats.last_fresh; + info->stats.last_active = stats.last_active; + info->stats.last_peered = stats.last_peered; + info->stats.last_clean = stats.last_clean; + info->stats.last_unstale = stats.last_unstale; + info->stats.last_undegraded = stats.last_undegraded; + info->stats.last_fullsized = stats.last_fullsized; + info->stats.log_size = stats.log_size; + info->stats.ondisk_log_size = stats.log_size; + info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes; + info->stats.stats.sum.num_objects = stats.stats.sum.num_objects; + info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies; + info->stats.stats.sum.num_rd = stats.stats.sum.num_rd; + info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb; + info->stats.stats.sum.num_wr = stats.stats.sum.num_wr; + info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb; + info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty; + return true; + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(last_update, bl); + encode(last_complete, bl); + encode(last_user_version, bl); + encode(stats.version, bl); + encode(stats.reported_seq, bl); + encode(stats.last_fresh, bl); + encode(stats.last_active, bl); + encode(stats.last_peered, bl); + encode(stats.last_clean, bl); + encode(stats.last_unstale, 
bl); + encode(stats.last_undegraded, bl); + encode(stats.last_fullsized, bl); + encode(stats.log_size, bl); + encode(stats.stats.sum.num_bytes, bl); + encode(stats.stats.sum.num_objects, bl); + encode(stats.stats.sum.num_object_copies, bl); + encode(stats.stats.sum.num_rd, bl); + encode(stats.stats.sum.num_rd_kb, bl); + encode(stats.stats.sum.num_wr, bl); + encode(stats.stats.sum.num_wr_kb, bl); + encode(stats.stats.sum.num_objects_dirty, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(last_update, p); + decode(last_complete, p); + decode(last_user_version, p); + decode(stats.version, p); + decode(stats.reported_seq, p); + decode(stats.last_fresh, p); + decode(stats.last_active, p); + decode(stats.last_peered, p); + decode(stats.last_clean, p); + decode(stats.last_unstale, p); + decode(stats.last_undegraded, p); + decode(stats.last_fullsized, p); + decode(stats.log_size, p); + decode(stats.stats.sum.num_bytes, p); + decode(stats.stats.sum.num_objects, p); + decode(stats.stats.sum.num_object_copies, p); + decode(stats.stats.sum.num_rd, p); + decode(stats.stats.sum.num_rd_kb, p); + decode(stats.stats.sum.num_wr, p); + decode(stats.stats.sum.num_wr_kb, p); + decode(stats.stats.sum.num_objects_dirty, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(pg_fast_info_t) + + +/** + * PastIntervals -- information needed to determine the PriorSet and + * the might_have_unfound set + */ +class PastIntervals { +#ifdef WITH_SEASTAR + using OSDMapRef = boost::local_shared_ptr; +#else + using OSDMapRef = std::shared_ptr; +#endif +public: + struct pg_interval_t { + std::vector up, acting; + epoch_t first, last; + bool maybe_went_rw; + int32_t primary; + int32_t up_primary; + + pg_interval_t() + : first(0), last(0), + maybe_went_rw(false), + primary(-1), + up_primary(-1) + {} + + pg_interval_t( + std::vector &&up, + std::vector &&acting, + epoch_t first, + epoch_t last, + bool maybe_went_rw, + int32_t primary, + int32_t up_primary) + : up(up), acting(acting), first(first), last(last), + maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary) + {} + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + }; + + PastIntervals(); + PastIntervals(PastIntervals &&rhs) = default; + PastIntervals &operator=(PastIntervals &&rhs) = default; + + PastIntervals(const PastIntervals &rhs); + PastIntervals &operator=(const PastIntervals &rhs); + + class interval_rep { + public: + virtual size_t size() const = 0; + virtual bool empty() const = 0; + virtual void clear() = 0; + virtual std::pair get_bounds() const = 0; + virtual std::set get_all_participants( + bool ec_pool) const = 0; + virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0; + virtual std::unique_ptr clone() const = 0; + virtual std::ostream &print(std::ostream &out) const = 0; + virtual void encode(ceph::buffer::list &bl) const = 0; + virtual void decode(ceph::buffer::list::const_iterator &bl) = 0; + virtual void dump(ceph::Formatter *f) const = 0; + virtual void iterate_mayberw_back_to( + epoch_t les, + std::function &)> &&f) const = 0; + + virtual bool has_full_intervals() const { return false; } + virtual void iterate_all_intervals( + std::function &&f) const { + ceph_assert(!has_full_intervals()); + ceph_abort_msg("not valid for this implementation"); + } + virtual void adjust_start_backwards(epoch_t 
last_epoch_clean) = 0; + + virtual ~interval_rep() {} + }; + friend class pi_compact_rep; +private: + + std::unique_ptr past_intervals; + + explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {} + +public: + void add_interval(bool ec_pool, const pg_interval_t &interval) { + ceph_assert(past_intervals); + return past_intervals->add_interval(ec_pool, interval); + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + if (past_intervals) { + __u8 type = 2; + encode(type, bl); + past_intervals->encode(bl); + } else { + encode((__u8)0, bl); + } + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator &bl); + + void dump(ceph::Formatter *f) const { + ceph_assert(past_intervals); + past_intervals->dump(f); + } + static void generate_test_instances(std::list & o); + + /** + * Determines whether there is an interval change + */ + static bool is_new_interval( + int old_acting_primary, + int new_acting_primary, + const std::vector &old_acting, + const std::vector &new_acting, + int old_up_primary, + int new_up_primary, + const std::vector &old_up, + const std::vector &new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + unsigned old_pg_num, + unsigned new_pg_num, + unsigned old_pg_num_pending, + unsigned new_pg_num_pending, + bool old_sort_bitwise, + bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, + uint32_t old_crush_count, + uint32_t new_crush_count, + uint32_t old_crush_target, + uint32_t new_crush_target, + uint32_t old_crush_barrier, + uint32_t new_crush_barrier, + int32_t old_crush_member, + int32_t new_crush_member, + pg_t pgid + ); + + /** + * Determines whether there is an interval change + */ + static bool is_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of lastmap + const std::vector &old_acting, ///< [in] acting as of lastmap + const std::vector &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const std::vector &old_up, ///< [in] up as of lastmap + const std::vector &new_up, ///< [in] up as of osdmap + const OSDMap *osdmap, ///< [in] current map + const OSDMap *lastmap, ///< [in] last map + pg_t pgid ///< [in] pgid for pg + ); + + /** + * Integrates a new map into *past_intervals, returns true + * if an interval was closed out. 
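+ * (Typically called once per consumed OSDMap epoch with the PG's mapping
+ * under the previous and the current map; when is_new_interval() reports
+ * a change, the interval that just ended is appended to *past_intervals.)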
+ */ + static bool check_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of osdmap + const std::vector &old_acting, ///< [in] acting as of lastmap + const std::vector &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const std::vector &old_up, ///< [in] up as of lastmap + const std::vector &new_up, ///< [in] up as of osdmap + epoch_t same_interval_since, ///< [in] as of osdmap + epoch_t last_epoch_clean, ///< [in] current + const OSDMap *osdmap, ///< [in] current map + const OSDMap *lastmap, ///< [in] last map + pg_t pgid, ///< [in] pgid for pg + const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active + PastIntervals *past_intervals, ///< [out] intervals + std::ostream *out = 0 ///< [out] debug ostream + ); + static bool check_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of osdmap + const std::vector &old_acting, ///< [in] acting as of lastmap + const std::vector &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const std::vector &old_up, ///< [in] up as of lastmap + const std::vector &new_up, ///< [in] up as of osdmap + epoch_t same_interval_since, ///< [in] as of osdmap + epoch_t last_epoch_clean, ///< [in] current + OSDMapRef osdmap, ///< [in] current map + OSDMapRef lastmap, ///< [in] last map + pg_t pgid, ///< [in] pgid for pg + const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active + PastIntervals *past_intervals, ///< [out] intervals + std::ostream *out = 0 ///< [out] debug ostream + ) { + return check_new_interval( + old_acting_primary, new_acting_primary, + old_acting, new_acting, + old_up_primary, new_up_primary, + old_up, new_up, + same_interval_since, last_epoch_clean, + osdmap.get(), lastmap.get(), + pgid, + could_have_gone_active, + past_intervals, + out); + } + + friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i); + + template + void iterate_mayberw_back_to( + epoch_t les, + F &&f) const { + ceph_assert(past_intervals); + past_intervals->iterate_mayberw_back_to(les, std::forward(f)); + } + void clear() { + ceph_assert(past_intervals); + past_intervals->clear(); + } + + /** + * Should return a value which gives an indication of the amount + * of state contained + */ + size_t size() const { + ceph_assert(past_intervals); + return past_intervals->size(); + } + + bool empty() const { + ceph_assert(past_intervals); + return past_intervals->empty(); + } + + void swap(PastIntervals &other) { + using std::swap; + swap(other.past_intervals, past_intervals); + } + + /** + * Return all shards which have been in the acting set back to the + * latest epoch to which we have trimmed except for pg_whoami + */ + std::set get_might_have_unfound( + pg_shard_t pg_whoami, + bool ec_pool) const { + ceph_assert(past_intervals); + auto ret = past_intervals->get_all_participants(ec_pool); + ret.erase(pg_whoami); + return ret; + } + + /** + * Return all shards which we might want to talk to for peering + */ + std::set get_all_probe( + bool ec_pool) const { + ceph_assert(past_intervals); + return past_intervals->get_all_participants(ec_pool); + } + + /* Return the set of epochs [start, end) represented by the + * past_interval set. 
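+ * (end is normally the first epoch of the current interval; start may be
+ * pulled back toward last_epoch_clean via adjust_start_backwards().)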
+ */ + std::pair get_bounds() const { + ceph_assert(past_intervals); + return past_intervals->get_bounds(); + } + + void adjust_start_backwards(epoch_t last_epoch_clean) { + ceph_assert(past_intervals); + past_intervals->adjust_start_backwards(last_epoch_clean); + } + + enum osd_state_t { + UP, + DOWN, + DNE, + LOST + }; + struct PriorSet { + bool ec_pool = false; + std::set probe; ///< current+prior OSDs we need to probe. + std::set down; ///< down osds that would normally be in @a probe and might be interesting. + std::map blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set + + bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set. + const IsPGRecoverablePredicate* pcontdec = nullptr; + + PriorSet() = default; + PriorSet(PriorSet &&) = default; + PriorSet &operator=(PriorSet &&) = default; + + PriorSet &operator=(const PriorSet &) = delete; + PriorSet(const PriorSet &) = delete; + + bool operator==(const PriorSet &rhs) const { + return (ec_pool == rhs.ec_pool) && + (probe == rhs.probe) && + (down == rhs.down) && + (blocked_by == rhs.blocked_by) && + (pg_down == rhs.pg_down); + } + + bool affected_by_map( + const OSDMap &osdmap, + const DoutPrefixProvider *dpp) const; + + // For verifying tests + PriorSet( + bool ec_pool, + std::set probe, + std::set down, + std::map blocked_by, + bool pg_down, + const IsPGRecoverablePredicate *pcontdec) + : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by), + pg_down(pg_down), pcontdec(pcontdec) {} + + private: + template + PriorSet( + const PastIntervals &past_intervals, + bool ec_pool, + epoch_t last_epoch_started, + const IsPGRecoverablePredicate *c, + F f, + const std::vector &up, + const std::vector &acting, + const DoutPrefixProvider *dpp); + + friend class PastIntervals; + }; + + template + PriorSet get_prior_set(Args&&... args) const { + return PriorSet(*this, std::forward(args)...); + } +}; +WRITE_CLASS_ENCODER(PastIntervals) + +std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i); +std::ostream& operator<<(std::ostream& out, const PastIntervals &i); +std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i); + +template +PastIntervals::PriorSet::PriorSet( + const PastIntervals &past_intervals, + bool ec_pool, + epoch_t last_epoch_started, + const IsPGRecoverablePredicate *c, + F f, + const std::vector &up, + const std::vector &acting, + const DoutPrefixProvider *dpp) + : ec_pool(ec_pool), pg_down(false), pcontdec(c) +{ + /* + * We have to be careful to gracefully deal with situations like + * so. Say we have a power outage or something that takes out both + * OSDs, but the monitor doesn't mark them down in the same epoch. + * The history may look like + * + * 1: A B + * 2: B + * 3: let's say B dies for good, too (say, from the power spike) + * 4: A + * + * which makes it look like B may have applied updates to the PG + * that we need in order to proceed. This sucks... + * + * To minimize the risk of this happening, we CANNOT go active if + * _any_ OSDs in the prior set are down until we send an MOSDAlive + * to the monitor such that the OSDMap sets osd_up_thru to an epoch. + * Then, we have something like + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: + * 4: A + * + * -> we can ignore B, bc it couldn't have gone active (alive_thru + * still 0). 
+ * + * or, + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: B up_thru[B]=2 + * 4: + * 5: A + * + * -> we must wait for B, bc it was alive through 2, and could have + * written to the pg. + * + * If B is really dead, then an administrator will need to manually + * intervene by marking the OSD as "lost." + */ + + // Include current acting and up nodes... not because they may + // contain old data (this interval hasn't gone active, obviously), + // but because we want their pg_info to inform choose_acting(), and + // so that we know what they do/do not have explicitly before + // sending them any new info/logs/whatever. + for (unsigned i = 0; i < acting.size(); i++) { + if (acting[i] != pg_pool_t::pg_CRUSH_ITEM_NONE) + probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + // It may be possible to exclude the up nodes, but let's keep them in + // there for now. + for (unsigned i = 0; i < up.size(); i++) { + if (up[i] != pg_pool_t::pg_CRUSH_ITEM_NONE) + probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + + std::set all_probe = past_intervals.get_all_probe(ec_pool); + ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl; + for (auto &&i: all_probe) { + switch (f(0, i.osd, nullptr)) { + case UP: { + probe.insert(i); + break; + } + case DNE: + case LOST: + case DOWN: { + down.insert(i.osd); + break; + } + } + } + + past_intervals.iterate_mayberw_back_to( + last_epoch_started, + [&](epoch_t start, const std::set &acting) { + ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start + << ", acting: " << acting << dendl; + + // look at candidate osds during this interval. each falls into + // one of three categories: up, down (but potentially + // interesting), or lost (down, but we won't wait for it). + std::set up_now; + std::map candidate_blocked_by; + // any candidates down now (that might have useful data) + bool any_down_now = false; + + // consider ACTING osds + for (auto &&so: acting) { + epoch_t lost_at = 0; + switch (f(start, so.osd, &lost_at)) { + case UP: { + // include past acting osds if they are up. + up_now.insert(so); + break; + } + case DNE: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " no longer exists" << dendl; + break; + } + case LOST: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " is down, but lost_at " << lost_at << dendl; + up_now.insert(so); + break; + } + case DOWN: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " is down" << dendl; + candidate_blocked_by[so.osd] = lost_at; + any_down_now = true; + break; + } + } + } + + // if not enough osds survived this interval, and we may have gone rw, + // then we need to wait for one of those osds to recover to + // ensure that we haven't lost any information. + if (!(*pcontdec)(up_now) && any_down_now) { + // fixme: how do we identify a "clean" shutdown anyway? + ldpp_dout(dpp, 10) << "build_prior possibly went active+rw," + << " insufficient up; including down osds" << dendl; + ceph_assert(!candidate_blocked_by.empty()); + pg_down = true; + blocked_by.insert( + candidate_blocked_by.begin(), + candidate_blocked_by.end()); + } + }); + + ldpp_dout(dpp, 10) << "build_prior final: probe " << probe + << " down " << down + << " blocked_by " << blocked_by + << (pg_down ? 
" pg_down":"") + << dendl; +} + +struct pg_notify_t { + epoch_t query_epoch; + epoch_t epoch_sent; + pg_info_t info; + shard_id_t to; + shard_id_t from; + PastIntervals past_intervals; + pg_notify_t() : + query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD), + from(shard_id_t::NO_SHARD) {} + pg_notify_t( + shard_id_t to, + shard_id_t from, + epoch_t query_epoch, + epoch_t epoch_sent, + const pg_info_t &info, + const PastIntervals& pi) + : query_epoch(query_epoch), + epoch_sent(epoch_sent), + info(info), to(to), from(from), + past_intervals(pi) { + ceph_assert(from == info.pgid.shard); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list &o); +}; +WRITE_CLASS_ENCODER(pg_notify_t) +std::ostream &operator<<(std::ostream &lhs, const pg_notify_t ¬ify); + + +/** + * pg_query_t - used to ask a peer for information about a pg. + * + * note: if version=0, type=LOG, then we just provide our full log. + */ +struct pg_query_t { + enum { + INFO = 0, + LOG = 1, + MISSING = 4, + FULLLOG = 5, + }; + std::string_view get_type_name() const { + switch (type) { + case INFO: return "info"; + case LOG: return "log"; + case MISSING: return "missing"; + case FULLLOG: return "fulllog"; + default: return "???"; + } + } + + __s32 type; + eversion_t since; + pg_history_t history; + epoch_t epoch_sent; + shard_id_t to; + shard_id_t from; + + pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD), + from(shard_id_t::NO_SHARD) {} + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + const pg_history_t& h, + epoch_t epoch_sent) + : type(t), + history(h), + epoch_sent(epoch_sent), + to(to), from(from) { + ceph_assert(t != LOG); + } + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + eversion_t s, + const pg_history_t& h, + epoch_t epoch_sent) + : type(t), since(s), history(h), + epoch_sent(epoch_sent), to(to), from(from) { + ceph_assert(t == LOG); + } + + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pg_query_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) { + out << "query(" << q.get_type_name() << " " << q.since; + if (q.type == pg_query_t::LOG) + out << " " << q.history; + out << " epoch_sent " << q.epoch_sent; + out << ")"; + return out; +} + +/** + * pg_lease_t - readable lease metadata, from primary -> non-primary + * + * This metadata serves to increase either or both of the lease expiration + * and upper bound on the non-primary. 
+ */ +struct pg_lease_t { + /// pg readable_until value; replicas must not be readable beyond this + ceph::signedspan readable_until = ceph::signedspan::zero(); + + /// upper bound on any acting osd's readable_until + ceph::signedspan readable_until_ub = ceph::signedspan::zero(); + + /// duration of the lease (in case clock deltas aren't available) + ceph::signedspan interval = ceph::signedspan::zero(); + + pg_lease_t() {} + pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub, + ceph::signedspan i) + : readable_until(ru), + readable_until_ub(ruub), + interval(i) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + friend std::ostream& operator<<(std::ostream& out, const pg_lease_t& l) { + return out << "pg_lease(ru " << l.readable_until + << " ub " << l.readable_until_ub + << " int " << l.interval << ")"; + } +}; +WRITE_CLASS_ENCODER(pg_lease_t) + +/** + * pg_lease_ack_t - lease ack, from non-primary -> primary + * + * This metadata acknowledges to the primary what a non-primary's noted + * upper bound is. + */ +struct pg_lease_ack_t { + /// highest upper bound non-primary has recorded (primary's clock) + ceph::signedspan readable_until_ub = ceph::signedspan::zero(); + + pg_lease_ack_t() {} + pg_lease_ack_t(ceph::signedspan ub) + : readable_until_ub(ub) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + friend std::ostream& operator<<(std::ostream& out, const pg_lease_ack_t& l) { + return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")"; + } +}; +WRITE_CLASS_ENCODER(pg_lease_ack_t) + + + +class PGBackend; +class ObjectModDesc { + bool can_local_rollback; + bool rollback_info_completed; + + // version required to decode, reflected in encode/decode version + __u8 max_required_version = 1; +public: + class Visitor { + public: + virtual void append(uint64_t old_offset) {} + virtual void setattrs(std::map> &attrs) {} + virtual void rmobject(version_t old_version) {} + /** + * Used to support the unfound_lost_delete log event: if the stashed + * version exists, we unstash it, otherwise, we do nothing. 
This way + * each replica rolls back to whatever state it had prior to the attempt + * at mark unfound lost delete + */ + virtual void try_rmobject(version_t old_version) { + rmobject(old_version); + } + virtual void create() {} + virtual void update_snaps(const std::set &old_snaps) {} + virtual void rollback_extents( + version_t gen, + const std::vector > &extents) {} + virtual ~Visitor() {} + }; + void visit(Visitor *visitor) const; + mutable ceph::buffer::list bl; + enum ModID { + APPEND = 1, + SETATTRS = 2, + DELETE = 3, + CREATE = 4, + UPDATE_SNAPS = 5, + TRY_DELETE = 6, + ROLLBACK_EXTENTS = 7 + }; + ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) { + bl.reassign_to_mempool(mempool::mempool_osd_pglog); + } + void claim(ObjectModDesc &other) { + bl = std::move(other.bl); + can_local_rollback = other.can_local_rollback; + rollback_info_completed = other.rollback_info_completed; + } + void claim_append(ObjectModDesc &other) { + if (!can_local_rollback || rollback_info_completed) + return; + if (!other.can_local_rollback) { + mark_unrollbackable(); + return; + } + bl.claim_append(other.bl); + rollback_info_completed = other.rollback_info_completed; + } + void swap(ObjectModDesc &other) { + bl.swap(other.bl); + + using std::swap; + swap(other.can_local_rollback, can_local_rollback); + swap(other.rollback_info_completed, rollback_info_completed); + swap(other.max_required_version, max_required_version); + } + void append_id(ModID id) { + using ceph::encode; + uint8_t _id(id); + encode(_id, bl); + } + void append(uint64_t old_size) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(APPEND); + encode(old_size, bl); + ENCODE_FINISH(bl); + } + void setattrs(std::map> &old_attrs) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(SETATTRS); + encode(old_attrs, bl); + ENCODE_FINISH(bl); + } + bool rmobject(version_t deletion_version) { + if (!can_local_rollback || rollback_info_completed) + return false; + ENCODE_START(1, 1, bl); + append_id(DELETE); + encode(deletion_version, bl); + ENCODE_FINISH(bl); + rollback_info_completed = true; + return true; + } + bool try_rmobject(version_t deletion_version) { + if (!can_local_rollback || rollback_info_completed) + return false; + ENCODE_START(1, 1, bl); + append_id(TRY_DELETE); + encode(deletion_version, bl); + ENCODE_FINISH(bl); + rollback_info_completed = true; + return true; + } + void create() { + if (!can_local_rollback || rollback_info_completed) + return; + rollback_info_completed = true; + ENCODE_START(1, 1, bl); + append_id(CREATE); + ENCODE_FINISH(bl); + } + void update_snaps(const std::set &old_snaps) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(UPDATE_SNAPS); + encode(old_snaps, bl); + ENCODE_FINISH(bl); + } + void rollback_extents( + version_t gen, const std::vector > &extents) { + ceph_assert(can_local_rollback); + ceph_assert(!rollback_info_completed); + if (max_required_version < 2) + max_required_version = 2; + ENCODE_START(2, 2, bl); + append_id(ROLLBACK_EXTENTS); + encode(gen, bl); + encode(extents, bl); + ENCODE_FINISH(bl); + } + + // cannot be rolled back + void mark_unrollbackable() { + can_local_rollback = false; + bl.clear(); + } + bool can_rollback() const { + return can_local_rollback; + } + bool empty() const { + return can_local_rollback && (bl.length() == 0); + } + + bool requires_kraken() const { + return max_required_version >= 2; + 
} + + /** + * Create fresh copy of bl bytes to avoid keeping large buffers around + * in the case that bl contains ptrs which point into a much larger + * message buffer + */ + void trim_bl() const { + if (bl.length() > 0) + bl.rebuild(); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(ObjectModDesc) + +class ObjectCleanRegions { +private: + bool new_object; + bool clean_omap; + interval_set clean_offsets; + static std::atomic max_num_intervals; + + /** + * trim the number of intervals if clean_offsets.num_intervals() + * exceeds the given upbound max_num_intervals + * etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]} + * then new interval [30~10] will evict out the shortest one [20~5] + * finally, clean_offsets becomes {[5~10], [30~10]} + */ + void trim(); + friend std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr); +public: + ObjectCleanRegions() : new_object(false), clean_omap(true) { + clean_offsets.insert(0, (uint64_t)-1); + } + ObjectCleanRegions(uint64_t offset, uint64_t len, bool co) + : new_object(false), clean_omap(co) { + clean_offsets.insert(offset, len); + } + bool operator==(const ObjectCleanRegions &orc) const { + return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets; + } + static void set_max_num_intervals(uint32_t num); + void merge(const ObjectCleanRegions &other); + void mark_data_region_dirty(uint64_t offset, uint64_t len); + void mark_omap_dirty(); + void mark_object_new(); + void mark_fully_dirty(); + interval_set get_dirty_regions() const; + bool omap_is_dirty() const; + bool object_is_exist() const; + bool is_clean_region(uint64_t offset, uint64_t len) const; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(ObjectCleanRegions) +std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr); + + +struct OSDOp { + ceph_osd_op op; + sobject_t soid; + + ceph::buffer::list indata, outdata; + errorcode32_t rval = 0; + + OSDOp() { + // FIPS zeroization audit 20191115: this memset clean for security + memset(&op, 0, sizeof(ceph_osd_op)); + } + + OSDOp(const int op_code) { + // FIPS zeroization audit 20191115: this memset clean for security + memset(&op, 0, sizeof(ceph_osd_op)); + op.op = op_code; + } + + /** + * split a ceph::buffer::list into constituent indata members of a vector of OSDOps + * + * @param ops [out] vector of OSDOps + * @param in [in] combined data buffer + */ + template + static void split_osd_op_vector_in_data(V& ops, + ceph::buffer::list& in) { + ceph::buffer::list::iterator datap = in.begin(); + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].op.payload_len) { + datap.copy(ops[i].op.payload_len, ops[i].indata); + } + } + } + + /** + * merge indata members of a vector of OSDOp into a single ceph::buffer::list + * + * Notably this also encodes certain other OSDOp data into the data + * buffer, including the sobject_t soid. 
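+ *
+ * A rough sketch of the round trip (illustrative only, mirroring the two
+ * helpers defined in this struct):
+ *
+ *   OSDOp::merge_osd_op_vector_in_data(ops, payload);  // sets each op.op.payload_len
+ *                                                      // and appends op.indata
+ *   // ... payload travels alongside the op vector ...
+ *   OSDOp::split_osd_op_vector_in_data(ops, payload);  // copies payload_len bytes
+ *                                                      // back into each op.indata
+ *
+ * i.e. the payload_len recorded at merge time is what split relies on to carve
+ * the combined buffer back into per-op indata.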
+ * + * @param ops [in] vector of OSDOps + * @param out [out] combined data buffer + */ + template + static void merge_osd_op_vector_in_data(V& ops, ceph::buffer::list& out) { + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].indata.length()) { + ops[i].op.payload_len = ops[i].indata.length(); + out.append(ops[i].indata); + } + } + } + + /** + * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps + * + * @param ops [out] vector of OSDOps + * @param in [in] combined data buffer + */ + static void split_osd_op_vector_out_data(std::vector& ops, ceph::buffer::list& in); + + /** + * merge outdata members of a vector of OSDOps into a single ceph::buffer::list + * + * @param ops [in] vector of OSDOps + * @param out [out] combined data buffer + */ + static void merge_osd_op_vector_out_data(std::vector& ops, ceph::buffer::list& out); + + /** + * Clear data as much as possible, leave minimal data for historical op dump + * + * @param ops [in] vector of OSDOps + */ + template + static void clear_data(V& ops) { + for (unsigned i = 0; i < ops.size(); i++) { + OSDOp& op = ops[i]; + op.outdata.clear(); + if (ceph_osd_op_type_attr(op.op.op) && + op.op.xattr.name_len && + op.indata.length() >= op.op.xattr.name_len) { + ceph::buffer::list bl; + bl.push_back(ceph::buffer::ptr_node::create(op.op.xattr.name_len)); + bl.begin().copy_in(op.op.xattr.name_len, op.indata); + op.indata = std::move(bl); + } else if (ceph_osd_op_type_exec(op.op.op) && + op.op.cls.class_len && + op.indata.length() > + (op.op.cls.class_len + op.op.cls.method_len)) { + __u8 len = op.op.cls.class_len + op.op.cls.method_len; + ceph::buffer::list bl; + bl.push_back(ceph::buffer::ptr_node::create(len)); + bl.begin().copy_in(len, op.indata); + op.indata = std::move(bl); + } else { + op.indata.clear(); + } + } + } +}; +std::ostream& operator<<(std::ostream& out, const OSDOp& op); + +struct pg_log_op_return_item_t { + int32_t rval; + ceph::buffer::list bl; + void encode(ceph::buffer::list& p) const { + using ceph::encode; + encode(rval, p); + encode(bl, p); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(rval, p); + decode(bl, p); + } + void dump(ceph::Formatter *f) const { + f->dump_int("rval", rval); + f->dump_unsigned("bl_length", bl.length()); + } + friend bool operator==(const pg_log_op_return_item_t& lhs, + const pg_log_op_return_item_t& rhs) { + return lhs.rval == rhs.rval && + lhs.bl.contents_equal(rhs.bl); + } + friend bool operator!=(const pg_log_op_return_item_t& lhs, + const pg_log_op_return_item_t& rhs) { + return !(lhs == rhs); + } + friend std::ostream& operator<<(std::ostream& out, const pg_log_op_return_item_t& i) { + return out << "r=" << i.rval << "+" << i.bl.length() << "b"; + } +}; +WRITE_CLASS_ENCODER(pg_log_op_return_item_t) + +/** + * pg_log_entry_t - single entry/event in pg log + * + */ +struct pg_log_entry_t { + enum { + MODIFY = 1, // some unspecified modification (but not *all* modifications) + CLONE = 2, // cloned object from head + DELETE = 3, // deleted object + //BACKLOG = 4, // event invented by generate_backlog [obsolete] + LOST_REVERT = 5, // lost new version, revert to an older version. + LOST_DELETE = 6, // lost new version, revert to no object (deleted). 
+ LOST_MARK = 7, // lost new version, now EIO + PROMOTE = 8, // promoted object from another tier + CLEAN = 9, // mark an object clean + ERROR = 10, // write that returned an error + }; + static const char *get_op_name(int op) { + switch (op) { + case MODIFY: + return "modify"; + case PROMOTE: + return "promote"; + case CLONE: + return "clone"; + case DELETE: + return "delete"; + case LOST_REVERT: + return "l_revert"; + case LOST_DELETE: + return "l_delete"; + case LOST_MARK: + return "l_mark"; + case CLEAN: + return "clean"; + case ERROR: + return "error"; + default: + return "unknown"; + } + } + const char *get_op_name() const { + return get_op_name(op); + } + + // describes state for a locally-rollbackable entry + ObjectModDesc mod_desc; + ceph::buffer::list snaps; // only for clone entries + hobject_t soid; + osd_reqid_t reqid; // caller+tid to uniquely identify request + mempool::osd_pglog::vector > extra_reqids; + + /// map extra_reqids by index to error return code (if any) + mempool::osd_pglog::map extra_reqid_return_codes; + + eversion_t version, prior_version, reverting_to; + version_t user_version; // the user version for this entry + utime_t mtime; // this is the _user_ mtime, mind you + int32_t return_code; // only stored for ERRORs for dup detection + + std::vector op_returns; + + __s32 op; + bool invalid_hash; // only when decoding sobject_t based entries + bool invalid_pool; // only when decoding pool-less hobject based entries + ObjectCleanRegions clean_regions; + + pg_log_entry_t() + : user_version(0), return_code(0), op(0), + invalid_hash(false), invalid_pool(false) { + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + pg_log_entry_t(int _op, const hobject_t& _soid, + const eversion_t& v, const eversion_t& pv, + version_t uv, + const osd_reqid_t& rid, const utime_t& mt, + int return_code) + : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv), + mtime(mt), return_code(return_code), op(_op), + invalid_hash(false), invalid_pool(false) { + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + bool is_clone() const { return op == CLONE; } + bool is_modify() const { return op == MODIFY; } + bool is_promote() const { return op == PROMOTE; } + bool is_clean() const { return op == CLEAN; } + bool is_lost_revert() const { return op == LOST_REVERT; } + bool is_lost_delete() const { return op == LOST_DELETE; } + bool is_lost_mark() const { return op == LOST_MARK; } + bool is_error() const { return op == ERROR; } + + bool is_update() const { + return + is_clone() || is_modify() || is_promote() || is_clean() || + is_lost_revert() || is_lost_mark(); + } + bool is_delete() const { + return op == DELETE || op == LOST_DELETE; + } + + bool can_rollback() const { + return mod_desc.can_rollback(); + } + + void mark_unrollbackable() { + mod_desc.mark_unrollbackable(); + } + + bool requires_kraken() const { + return mod_desc.requires_kraken(); + } + + // Errors are only used for dup detection, whereas + // the index by objects is used by recovery, copy_get, + // and other facilities that don't expect or need to + // be aware of error entries. 
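+ // For example (hypothetical values): an ERROR entry constructed as
+ //   pg_log_entry_t e(pg_log_entry_t::ERROR, oid, eversion_t(8, 100),
+ //                    eversion_t(8, 99), 1, rid, mtime, -ENOENT);
+ // keeps reqid_is_indexed() true (so a resent request can be answered with the
+ // recorded -ENOENT) while object_is_indexed() is false, keeping it out of the
+ // per-object index used by recovery and copy_get.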
+ bool object_is_indexed() const { + return !is_error(); + } + + bool reqid_is_indexed() const { + return reqid != osd_reqid_t() && + (op == MODIFY || op == DELETE || op == ERROR); + } + + void set_op_returns(const std::vector& ops) { + op_returns.resize(ops.size()); + for (unsigned i = 0; i < ops.size(); ++i) { + op_returns[i].rval = ops[i].rval; + op_returns[i].bl = ops[i].outdata; + } + } + + std::string get_key_name() const; + void encode_with_checksum(ceph::buffer::list& bl) const; + void decode_with_checksum(ceph::buffer::list::const_iterator& p); + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + +}; +WRITE_CLASS_ENCODER(pg_log_entry_t) + +std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e); + +struct pg_log_dup_t { + osd_reqid_t reqid; // caller+tid to uniquely identify request + eversion_t version; + version_t user_version; // the user version for this entry + int32_t return_code; // only stored for ERRORs for dup detection + + std::vector op_returns; + + pg_log_dup_t() + : user_version(0), return_code(0) + {} + explicit pg_log_dup_t(const pg_log_entry_t& entry) + : reqid(entry.reqid), version(entry.version), + user_version(entry.user_version), + return_code(entry.return_code), + op_returns(entry.op_returns) + {} + pg_log_dup_t(const eversion_t& v, version_t uv, + const osd_reqid_t& rid, int return_code) + : reqid(rid), version(v), user_version(uv), + return_code(return_code) + {} + + std::string get_key_name() const; + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + bool operator==(const pg_log_dup_t &rhs) const { + return reqid == rhs.reqid && + version == rhs.version && + user_version == rhs.user_version && + return_code == rhs.return_code && + op_returns == rhs.op_returns; + } + bool operator!=(const pg_log_dup_t &rhs) const { + return !(*this == rhs); + } + + friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e); +}; +WRITE_CLASS_ENCODER(pg_log_dup_t) + +std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e); + +/** + * pg_log_t - incremental log of recent pg changes. + * + * serves as a recovery queue for recent changes. + */ +struct pg_log_t { + /* + * head - newest entry (update|delete) + * tail - entry previous to oldest (update|delete) for which we have + * complete negative information. + * i.e. we can infer pg contents for any store whose last_update >= tail. 
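+ *
+ * For example (informal): a log with tail = 4'10 and head = 6'42 describes the
+ * entries in (4'10, 6'42]. A peer whose last_update is 5'20 can be caught up
+ * from these log entries alone, while a peer whose last_update is older than
+ * the tail cannot be recovered from the log and needs a fuller resync
+ * (backfill).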
+ */ + eversion_t head; // newest entry + eversion_t tail; // version prior to oldest + +protected: + // We can rollback rollback-able entries > can_rollback_to + eversion_t can_rollback_to; + + // always <= can_rollback_to, indicates how far stashed rollback + // data can be found + eversion_t rollback_info_trimmed_to; + +public: + // the actual log + mempool::osd_pglog::list log; + + // entries just for dup op detection ordered oldest to newest + mempool::osd_pglog::list dups; + + pg_log_t() = default; + pg_log_t(const eversion_t &last_update, + const eversion_t &log_tail, + const eversion_t &can_rollback_to, + const eversion_t &rollback_info_trimmed_to, + mempool::osd_pglog::list &&entries, + mempool::osd_pglog::list &&dup_entries) + : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), + rollback_info_trimmed_to(rollback_info_trimmed_to), + log(std::move(entries)), dups(std::move(dup_entries)) {} + pg_log_t(const eversion_t &last_update, + const eversion_t &log_tail, + const eversion_t &can_rollback_to, + const eversion_t &rollback_info_trimmed_to, + const std::list &entries, + const std::list &dup_entries) + : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), + rollback_info_trimmed_to(rollback_info_trimmed_to) { + for (auto &&entry: entries) { + log.push_back(entry); + } + for (auto &&entry: dup_entries) { + dups.push_back(entry); + } + } + + void clear() { + eversion_t z; + rollback_info_trimmed_to = can_rollback_to = head = tail = z; + log.clear(); + dups.clear(); + } + + eversion_t get_rollback_info_trimmed_to() const { + return rollback_info_trimmed_to; + } + eversion_t get_can_rollback_to() const { + return can_rollback_to; + } + + + pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) { + mempool::osd_pglog::list oldlog, childlog; + oldlog.swap(log); + + eversion_t old_tail; + unsigned mask = ~((~0)<soid.get_hash() & mask) == child_pgid.m_seed) { + childlog.push_back(*i); + } else { + log.push_back(*i); + } + oldlog.erase(i++); + } + + // osd_reqid is unique, so it doesn't matter if there are extra + // dup entries in each pg. To avoid storing oid with the dup + // entries, just copy the whole list. + auto childdups(dups); + + return pg_log_t( + head, + tail, + can_rollback_to, + rollback_info_trimmed_to, + std::move(childlog), + std::move(childdups)); + } + + mempool::osd_pglog::list rewind_from_head(eversion_t newhead) { + ceph_assert(newhead >= tail); + + mempool::osd_pglog::list::iterator p = log.end(); + mempool::osd_pglog::list divergent; + while (true) { + if (p == log.begin()) { + // yikes, the whole thing is divergent! + using std::swap; + swap(divergent, log); + break; + } + --p; + if (p->version.version <= newhead.version) { + /* + * look at eversion.version here. we want to avoid a situation like: + * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529 + * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529 + * lower_bound = 100'9 + * i.e, same request, different version. If the eversion.version is > the + * lower_bound, we it is divergent. 
+ */ + ++p; + divergent.splice(divergent.begin(), log, p, log.end()); + break; + } + ceph_assert(p->version > newhead); + } + head = newhead; + + if (can_rollback_to > newhead) + can_rollback_to = newhead; + + if (rollback_info_trimmed_to > newhead) + rollback_info_trimmed_to = newhead; + + return divergent; + } + + void merge_from(const std::vector& slogs, eversion_t last_update) { + log.clear(); + + // sort and merge dups + std::multimap sorted; + for (auto& d : dups) { + sorted.emplace(d.version, d); + } + for (auto l : slogs) { + for (auto& d : l->dups) { + sorted.emplace(d.version, d); + } + } + dups.clear(); + for (auto& i : sorted) { + dups.push_back(i.second); + } + + head = last_update; + tail = last_update; + can_rollback_to = last_update; + rollback_info_trimmed_to = last_update; + } + + bool empty() const { + return log.empty(); + } + + bool null() const { + return head.version == 0 && head.epoch == 0; + } + + uint64_t approx_size() const { + return head.version - tail.version; + } + + static void filter_log(spg_t import_pgid, const OSDMap &curmap, + const std::string &hit_set_namespace, const pg_log_t &in, + pg_log_t &out, pg_log_t &reject); + + /** + * copy entries from the tail of another pg_log_t + * + * @param other pg_log_t to copy from + * @param from copy entries after this version + */ + void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from); + + /** + * copy up to N entries + * + * @param other source log + * @param max max number of entries to copy + */ + void copy_up_to(CephContext* cct, const pg_log_t &other, int max); + + std::ostream& print(std::ostream& out) const; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(pg_log_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log) +{ + out << "log((" << log.tail << "," << log.head << "], crt=" + << log.get_can_rollback_to() << ")"; + return out; +} + + +/** + * pg_missing_t - summary of missing objects. + * + * kept in memory, as a supplement to pg_log_t + * also used to pass missing info in messages. + */ +struct pg_missing_item { + eversion_t need, have; + ObjectCleanRegions clean_regions; + enum missing_flags_t { + FLAG_NONE = 0, + FLAG_DELETE = 1, + } flags; + pg_missing_item() : flags(FLAG_NONE) {} + explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version + pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) : + need(n), have(h) { + set_delete(is_delete); + if (old_style) + clean_regions.mark_fully_dirty(); + } + + void encode(ceph::buffer::list& bl, uint64_t features) const { + using ceph::encode; + if (HAVE_FEATURE(features, SERVER_OCTOPUS)) { + // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES、 + // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not + // possible. 
This can be replaced with the legacy encoding + encode(eversion_t(), bl); + encode(eversion_t(-1, -1), bl); + encode(need, bl); + encode(have, bl); + encode(static_cast(flags), bl); + encode(clean_regions, bl); + } else { + encode(eversion_t(), bl); + encode(need, bl); + encode(have, bl); + encode(static_cast(flags), bl); + } + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + eversion_t e, l; + decode(e, bl); + decode(l, bl); + if(l == eversion_t(-1, -1)) { + // support all + decode(need, bl); + decode(have, bl); + uint8_t f; + decode(f, bl); + flags = static_cast(f); + decode(clean_regions, bl); + } else { + // support OSD_RECOVERY_DELETES + need = l; + decode(have, bl); + uint8_t f; + decode(f, bl); + flags = static_cast(f); + clean_regions.mark_fully_dirty(); + } + } + + void set_delete(bool is_delete) { + flags = is_delete ? FLAG_DELETE : FLAG_NONE; + } + + bool is_delete() const { + return (flags & FLAG_DELETE) == FLAG_DELETE; + } + + std::string flag_str() const { + if (flags == FLAG_NONE) { + return "none"; + } else { + return "delete"; + } + } + + void dump(ceph::Formatter *f) const { + f->dump_stream("need") << need; + f->dump_stream("have") << have; + f->dump_stream("flags") << flag_str(); + f->dump_stream("clean_regions") << clean_regions; + } + static void generate_test_instances(std::list& o) { + o.push_back(new pg_missing_item); + o.push_back(new pg_missing_item); + o.back()->need = eversion_t(1, 2); + o.back()->have = eversion_t(1, 1); + o.push_back(new pg_missing_item); + o.back()->need = eversion_t(3, 5); + o.back()->have = eversion_t(3, 4); + o.back()->clean_regions.mark_data_region_dirty(4096, 8192); + o.back()->clean_regions.mark_omap_dirty(); + o.back()->flags = FLAG_DELETE; + } + bool operator==(const pg_missing_item &rhs) const { + return need == rhs.need && have == rhs.have && flags == rhs.flags; + } + bool operator!=(const pg_missing_item &rhs) const { + return !(*this == rhs); + } +}; +WRITE_CLASS_ENCODER_FEATURES(pg_missing_item) +std::ostream& operator<<(std::ostream& out, const pg_missing_item &item); + +class pg_missing_const_i { +public: + virtual const std::map & + get_items() const = 0; + virtual const std::map &get_rmissing() const = 0; + virtual bool get_may_include_deletes() const = 0; + virtual unsigned int num_missing() const = 0; + virtual bool have_missing() const = 0; + virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0; + virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0; + virtual ~pg_missing_const_i() {} +}; + + +template +class ChangeTracker { +public: + void changed(const hobject_t &obj) {} + template + void get_changed(F &&f) const {} + void flush() {} + bool is_clean() const { + return true; + } +}; +template <> +class ChangeTracker { + std::set _changed; +public: + void changed(const hobject_t &obj) { + _changed.insert(obj); + } + template + void get_changed(F &&f) const { + for (auto const &i: _changed) { + f(i); + } + } + void flush() { + _changed.clear(); + } + bool is_clean() const { + return _changed.empty(); + } +}; + +template +class pg_missing_set : public pg_missing_const_i { + using item = pg_missing_item; + std::map missing; // oid -> (need v, have v) + std::map rmissing; // v -> oid + ChangeTracker tracker; + +public: + pg_missing_set() = default; + + template + pg_missing_set(const missing_type &m) { + missing = m.get_items(); + rmissing = m.get_rmissing(); + may_include_deletes = m.get_may_include_deletes(); + for (auto &&i: missing) + 
tracker.changed(i.first); + } + + bool may_include_deletes = false; + + const std::map &get_items() const override { + return missing; + } + const std::map &get_rmissing() const override { + return rmissing; + } + bool get_may_include_deletes() const override { + return may_include_deletes; + } + unsigned int num_missing() const override { + return missing.size(); + } + bool have_missing() const override { + return !missing.empty(); + } + void merge(const pg_log_entry_t& e) { + auto miter = missing.find(e.soid); + if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have) + miter->second.clean_regions.merge(e.clean_regions); + } + bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override { + auto iter = missing.find(oid); + if (iter == missing.end()) + return false; + if (out) + *out = iter->second; + return true; + } + bool is_missing(const hobject_t& oid, eversion_t v) const override { + std::map::const_iterator m = + missing.find(oid); + if (m == missing.end()) + return false; + const item &item(m->second); + if (item.need > v) + return false; + return true; + } + eversion_t get_oldest_need() const { + if (missing.empty()) { + return eversion_t(); + } + auto it = missing.find(rmissing.begin()->second); + ceph_assert(it != missing.end()); + return it->second.need; + } + + void claim(pg_missing_set&& o) { + static_assert(!TrackChanges, "Can't use claim with TrackChanges"); + missing = std::move(o.missing); + rmissing = std::move(o.rmissing); + } + + /* + * this needs to be called in log order as we extend the log. it + * assumes missing is accurate up through the previous log entry. + */ + void add_next_event(const pg_log_entry_t& e) { + std::map::iterator missing_it; + missing_it = missing.find(e.soid); + bool is_missing_divergent_item = missing_it != missing.end(); + if (e.prior_version == eversion_t() || e.is_clone()) { + // new object. + if (is_missing_divergent_item) { // use iterator + rmissing.erase(missing_it->second.need.version); + // .have = nil + missing_it->second = item(e.version, eversion_t(), e.is_delete()); + missing_it->second.clean_regions.mark_fully_dirty(); + } else { + // create new element in missing map + // .have = nil + missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); + missing[e.soid].clean_regions.mark_fully_dirty(); + } + } else if (is_missing_divergent_item) { + // already missing (prior). + rmissing.erase((missing_it->second).need.version); + missing_it->second.need = e.version; // leave .have unchanged. 
+ missing_it->second.set_delete(e.is_delete()); + if (e.is_lost_revert()) + missing_it->second.clean_regions.mark_fully_dirty(); + else + missing_it->second.clean_regions.merge(e.clean_regions); + } else { + // not missing, we must have prior_version (if any) + ceph_assert(!is_missing_divergent_item); + missing[e.soid] = item(e.version, e.prior_version, e.is_delete()); + if (e.is_lost_revert()) + missing[e.soid].clean_regions.mark_fully_dirty(); + else + missing[e.soid].clean_regions = e.clean_regions; + } + rmissing[e.version.version] = e.soid; + tracker.changed(e.soid); + } + + void revise_need(hobject_t oid, eversion_t need, bool is_delete) { + auto p = missing.find(oid); + if (p != missing.end()) { + rmissing.erase((p->second).need.version); + p->second.need = need; // do not adjust .have + p->second.set_delete(is_delete); + p->second.clean_regions.mark_fully_dirty(); + } else { + missing[oid] = item(need, eversion_t(), is_delete); + missing[oid].clean_regions.mark_fully_dirty(); + } + rmissing[need.version] = oid; + + tracker.changed(oid); + } + + void revise_have(hobject_t oid, eversion_t have) { + auto p = missing.find(oid); + if (p != missing.end()) { + tracker.changed(oid); + (p->second).have = have; + } + } + + void mark_fully_dirty(const hobject_t& oid) { + auto p = missing.find(oid); + if (p != missing.end()) { + tracker.changed(oid); + (p->second).clean_regions.mark_fully_dirty(); + } + } + + void add(const hobject_t& oid, eversion_t need, eversion_t have, + bool is_delete) { + missing[oid] = item(need, have, is_delete, true); + rmissing[need.version] = oid; + tracker.changed(oid); + } + + void add(const hobject_t& oid, pg_missing_item&& item) { + rmissing[item.need.version] = oid; + missing.insert({oid, std::move(item)}); + tracker.changed(oid); + } + + void rm(const hobject_t& oid, eversion_t v) { + std::map::iterator p = missing.find(oid); + if (p != missing.end() && p->second.need <= v) + rm(p); + } + + void rm(std::map::const_iterator m) { + tracker.changed(m->first); + rmissing.erase(m->second.need.version); + missing.erase(m); + } + + void got(const hobject_t& oid, eversion_t v) { + std::map::iterator p = missing.find(oid); + ceph_assert(p != missing.end()); + ceph_assert(p->second.need <= v || p->second.is_delete()); + got(p); + } + + void got(std::map::const_iterator m) { + tracker.changed(m->first); + rmissing.erase(m->second.need.version); + missing.erase(m); + } + + void split_into( + pg_t child_pgid, + unsigned split_bits, + pg_missing_set *omissing) { + omissing->may_include_deletes = may_include_deletes; + unsigned mask = ~((~0)<::iterator i = missing.begin(); + i != missing.end(); + ) { + if ((i->first.get_hash() & mask) == child_pgid.m_seed) { + omissing->add(i->first, i->second.need, i->second.have, + i->second.is_delete()); + rm(i++); + } else { + ++i; + } + } + } + + void clear() { + for (auto const &i: missing) + tracker.changed(i.first); + missing.clear(); + rmissing.clear(); + } + + void encode(ceph::buffer::list &bl, uint64_t features) const { + ENCODE_START(5, 2, bl) + encode(missing, bl, features); + encode(may_include_deletes, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) { + for (auto const &i: missing) + tracker.changed(i.first); + DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl); + decode(missing, bl); + if (struct_v >= 4) { + decode(may_include_deletes, bl); + } + DECODE_FINISH(bl); + + if (struct_v < 3) { + // Handle hobject_t upgrade + std::map tmp; + for (std::map::iterator i = + 
missing.begin(); + i != missing.end(); + ) { + if (!i->first.is_max() && i->first.pool == -1) { + hobject_t to_insert(i->first); + to_insert.pool = pool; + tmp[to_insert] = i->second; + missing.erase(i++); + } else { + ++i; + } + } + missing.insert(tmp.begin(), tmp.end()); + } + + for (std::map::iterator it = + missing.begin(); + it != missing.end(); + ++it) + rmissing[it->second.need.version] = it->first; + for (auto const &i: missing) + tracker.changed(i.first); + } + void dump(ceph::Formatter *f) const { + f->open_array_section("missing"); + for (std::map::const_iterator p = + missing.begin(); p != missing.end(); ++p) { + f->open_object_section("item"); + f->dump_stream("object") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_bool("may_include_deletes", may_include_deletes); + } + template + void filter_objects(F &&f) { + for (auto i = missing.begin(); i != missing.end();) { + if (f(i->first)) { + rm(i++); + } else { + ++i; + } + } + } + static void generate_test_instances(std::list& o) { + o.push_back(new pg_missing_set); + o.back()->may_include_deletes = true; + o.push_back(new pg_missing_set); + o.back()->add( + hobject_t(object_t("foo"), "foo", 123, 456, 0, ""), + eversion_t(5, 6), eversion_t(5, 1), false); + o.back()->may_include_deletes = true; + o.push_back(new pg_missing_set); + o.back()->add( + hobject_t(object_t("foo"), "foo", 123, 456, 0, ""), + eversion_t(5, 6), eversion_t(5, 1), true); + o.back()->may_include_deletes = true; + } + template + void get_changed(F &&f) const { + tracker.get_changed(f); + } + void flush() { + tracker.flush(); + } + bool is_clean() const { + return tracker.is_clean(); + } + template + bool debug_verify_from_init( + const missing_t &init_missing, + std::ostream *oss) const { + if (!TrackChanges) + return true; + auto check_missing(init_missing.get_items()); + tracker.get_changed([&](const hobject_t &hoid) { + check_missing.erase(hoid); + if (missing.count(hoid)) { + check_missing.insert(*(missing.find(hoid))); + } + }); + bool ok = true; + if (check_missing.size() != missing.size()) { + if (oss) { + *oss << "Size mismatch, check: " << check_missing.size() + << ", actual: " << missing.size() << "\n"; + } + ok = false; + } + for (auto &i: missing) { + if (!check_missing.count(i.first)) { + if (oss) + *oss << "check_missing missing " << i.first << "\n"; + ok = false; + } else if (check_missing[i.first] != i.second) { + if (oss) + *oss << "check_missing missing item mismatch on " << i.first + << ", check: " << check_missing[i.first] + << ", actual: " << i.second << "\n"; + ok = false; + } + } + if (oss && !ok) { + *oss << "check_missing: " << check_missing << "\n"; + std::set changed; + tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); }); + *oss << "changed: " << changed << "\n"; + } + return ok; + } +}; +template +void encode( + const pg_missing_set &c, ceph::buffer::list &bl, uint64_t features=0) { + ENCODE_DUMP_PRE(); + c.encode(bl, features); + ENCODE_DUMP_POST(cl); +} +template +void decode(pg_missing_set &c, ceph::buffer::list::const_iterator &p) { + c.decode(p); +} +template +std::ostream& operator<<(std::ostream& out, const pg_missing_set &missing) +{ + out << "missing(" << missing.num_missing() + << " may_include_deletes = " << missing.may_include_deletes; + //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; + out << ")"; + return out; +} + +using pg_missing_t = pg_missing_set; +using pg_missing_tracker_t = pg_missing_set; + + + + +/** + * pg list objects 
response format + * + */ + +template +struct pg_nls_response_template { + collection_list_handle_t handle; + std::vector entries; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(handle, bl); + __u32 n = (__u32)entries.size(); + encode(n, bl); + for (auto i = entries.begin(); i != entries.end(); ++i) { + encode(i->nspace, bl); + encode(i->oid, bl); + encode(i->locator, bl); + } + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(handle, bl); + __u32 n; + decode(n, bl); + entries.clear(); + while (n--) { + T i; + decode(i.nspace, bl); + decode(i.oid, bl); + decode(i.locator, bl); + entries.push_back(i); + } + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("handle") << handle; + f->open_array_section("entries"); + for (auto p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("object"); + f->dump_string("namespace", p->nspace); + f->dump_string("object", p->oid); + f->dump_string("key", p->locator); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list*>& o) { + o.push_back(new pg_nls_response_template); + o.push_back(new pg_nls_response_template); + o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("", "one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("", "three", "")); + o.push_back(new pg_nls_response_template); + o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", "")); + o.push_back(new pg_nls_response_template); + o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("", "one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("", "three", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", "")); + } +}; + +using pg_nls_response_t = pg_nls_response_template; + +WRITE_CLASS_ENCODER(pg_nls_response_t) + +// For backwards compatibility with older OSD requests +struct pg_ls_response_t { + collection_list_handle_t handle; + std::list > entries; + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(handle, bl); + encode(entries, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + __u8 v; + decode(v, bl); + ceph_assert(v == 1); + decode(handle, bl); + decode(entries, bl); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("handle") << handle; + f->open_array_section("entries"); + for (std::list >::const_iterator p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("object"); + f->dump_stream("object") << p->first; + f->dump_string("key", p->second); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list& o) { + o.push_back(new pg_ls_response_t); + o.push_back(new pg_ls_response_t); + 
o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, ""); + o.back()->entries.push_back(std::make_pair(object_t("one"), std::string())); + o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey"))); + } +}; + +WRITE_CLASS_ENCODER(pg_ls_response_t) + +/** + * object_copy_cursor_t + */ +struct object_copy_cursor_t { + uint64_t data_offset; + std::string omap_offset; + bool attr_complete; + bool data_complete; + bool omap_complete; + + object_copy_cursor_t() + : data_offset(0), + attr_complete(false), + data_complete(false), + omap_complete(false) + {} + + bool is_initial() const { + return !attr_complete && data_offset == 0 && omap_offset.empty(); + } + bool is_complete() const { + return attr_complete && data_complete && omap_complete; + } + + static void generate_test_instances(std::list& o); + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER(object_copy_cursor_t) + +/** + * object_copy_data_t + * + * Return data from a copy request. The semantics are a little strange + * as a result of the encoding's heritage. + * + * In particular, the sender unconditionally fills in the cursor (from what + * it receives and sends), the size, and the mtime, but is responsible for + * figuring out whether it should put any data in the attrs, data, or + * omap members (corresponding to xattrs, object data, and the omap entries) + * based on external data (the client includes a max amount to return with + * the copy request). The client then looks into the attrs, data, and/or omap + * based on the contents of the cursor. + */ +struct object_copy_data_t { + enum { + FLAG_DATA_DIGEST = 1<<0, + FLAG_OMAP_DIGEST = 1<<1, + }; + object_copy_cursor_t cursor; + uint64_t size; + utime_t mtime; + uint32_t data_digest, omap_digest; + uint32_t flags; + std::map attrs; + ceph::buffer::list data; + ceph::buffer::list omap_header; + ceph::buffer::list omap_data; + + /// which snaps we are defined for (if a snap and not the head) + std::vector snaps; + /// latest snap seq for the object (if head) + snapid_t snap_seq; + + /// recent reqids on this object + mempool::osd_pglog::vector > reqids; + + /// map reqids by index to error return code (if any) + mempool::osd_pglog::map reqid_return_codes; + + uint64_t truncate_seq; + uint64_t truncate_size; + +public: + object_copy_data_t() : + size((uint64_t)-1), data_digest(-1), + omap_digest(-1), flags(0), + truncate_seq(0), + truncate_size(0) {} + + static void generate_test_instances(std::list& o); + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t) + +/** + * pg creation info + */ +struct pg_create_t { + epoch_t created; // epoch pg created + pg_t parent; // split from parent (if != pg_t()) + __s32 split_bits; + + pg_create_t() + : created(0), split_bits(0) {} + pg_create_t(unsigned c, pg_t p, int s) + : created(c), parent(p), split_bits(s) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(pg_create_t) + +// ----------------------------------------- + +class ObjectExtent { + /** + * ObjectExtents are used for specifying IO behavior against RADOS + * objects when one is using the ObjectCacher. 
+ * + * To use this in a real system, *every member* must be filled + * out correctly. In particular, make sure to initialize the + * oloc correctly, as its default values are deliberate poison + * and will cause internal ObjectCacher asserts. + * + * Similarly, your buffer_extents vector *must* specify a total + * size equal to your length. If the buffer_extents inadvertently + * contain less space than the length member specifies, you + * will get unintelligible asserts deep in the ObjectCacher. + * + * If you are trying to do testing and don't care about actual + * RADOS function, the simplest thing to do is to initialize + * the ObjectExtent (truncate_size can be 0), create a single entry + * in buffer_extents matching the length, and set oloc.pool to 0. + */ + public: + object_t oid; // object id + uint64_t objectno; + uint64_t offset; // in object + uint64_t length; // in object + uint64_t truncate_size; // in object + + object_locator_t oloc; // object locator (pool etc) + + std::vector > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) + + ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {} + ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) : + oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { } +}; + +inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex) +{ + return out << "extent(" + << ex.oid << " (" << ex.objectno << ") in " << ex.oloc + << " " << ex.offset << "~" << ex.length + << " -> " << ex.buffer_extents + << ")"; +} + + +// --------------------------------------- + +class OSDSuperblock { +public: + uuid_d cluster_fsid, osd_fsid; + int32_t whoami = -1; // my role in this fs. + epoch_t current_epoch = 0; // most recent epoch + epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have. + double weight = 0.0; + + CompatSet compat_features; + + // last interval over which i mounted and was then active + epoch_t mounted = 0; // last epoch i mounted + epoch_t clean_thru = 0; // epoch i was active and clean thru + + epoch_t purged_snaps_last = 0; + utime_t last_purged_snaps_scrub; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(OSDSuperblock) + +inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb) +{ + return out << "sb(" << sb.cluster_fsid + << " osd." << sb.whoami + << " " << sb.osd_fsid + << " e" << sb.current_epoch + << " [" << sb.oldest_map << "," << sb.newest_map << "]" + << " lci=[" << sb.mounted << "," << sb.clean_thru << "]" + << ")"; +} + + +// ------- + + + + + + +/* + * attached to object head. describes most recent snap context, and + * set of existing clones. + */ +struct SnapSet { + snapid_t seq; + // NOTE: this is for pre-octopus compatibility only! 
remove in Q release + std::vector snaps; // descending + std::vector clones; // ascending + std::map > clone_overlap; // overlap w/ next newest + std::map clone_size; + std::map> clone_snaps; // descending + + SnapSet() : seq(0) {} + explicit SnapSet(ceph::buffer::list& bl) { + auto p = std::cbegin(bl); + decode(p); + } + + /// populate SnapSet from a librados::snap_set_t + void from_snap_set(const librados::snap_set_t& ss, bool legacy); + + /// get space accounted to clone + uint64_t get_clone_bytes(snapid_t clone) const; + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + SnapContext get_ssc_as_of(snapid_t as_of) const { + SnapContext out; + out.seq = as_of; + for (auto p = clone_snaps.rbegin(); + p != clone_snaps.rend(); + ++p) { + for (auto snap : p->second) { + if (snap <= as_of) { + out.snaps.push_back(snap); + } + } + } + return out; + } + + + SnapSet get_filtered(const pg_pool_t &pinfo) const; + void filter(const pg_pool_t &pinfo); +}; +WRITE_CLASS_ENCODER(SnapSet) + +std::ostream& operator<<(std::ostream& out, const SnapSet& cs); + + + +#define OI_ATTR "_" +#define SS_ATTR "snapset" + +struct watch_info_t { + uint64_t cookie; + uint32_t timeout_seconds; + entity_addr_t addr; + + watch_info_t() : cookie(0), timeout_seconds(0) { } + watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {} + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER_FEATURES(watch_info_t) + +static inline bool operator==(const watch_info_t& l, const watch_info_t& r) { + return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds + && l.addr == r.addr; +} + +static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) { + return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s" + << " " << w.addr << ")"; +} + +struct notify_info_t { + uint64_t cookie; + uint64_t notify_id; + uint32_t timeout; + ceph::buffer::list bl; +}; + +static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) { + return out << "notify(cookie " << n.cookie + << " notify" << n.notify_id + << " " << n.timeout << "s)"; +} + +class object_ref_delta_t { + std::map ref_delta; + +public: + object_ref_delta_t() = default; + object_ref_delta_t(const object_ref_delta_t &) = default; + object_ref_delta_t(object_ref_delta_t &&) = default; + + object_ref_delta_t(decltype(ref_delta) &&ref_delta) + : ref_delta(std::move(ref_delta)) {} + object_ref_delta_t(const decltype(ref_delta) &ref_delta) + : ref_delta(ref_delta) {} + + object_ref_delta_t &operator=(const object_ref_delta_t &) = default; + object_ref_delta_t &operator=(object_ref_delta_t &&) = default; + + void dec_ref(const hobject_t &hoid, unsigned num=1) { + mut_ref(hoid, -num); + } + void inc_ref(const hobject_t &hoid, unsigned num=1) { + mut_ref(hoid, num); + } + void mut_ref(const hobject_t &hoid, int num) { + [[maybe_unused]] auto [iter, _] = ref_delta.try_emplace(hoid, 0); + iter->second += num; + if (iter->second == 0) + ref_delta.erase(iter); + } + + auto begin() const { return ref_delta.begin(); } + auto end() const { return ref_delta.end(); } + auto find(hobject_t &key) const { return ref_delta.find(key); } + + bool operator==(const 
object_ref_delta_t &rhs) const { + return ref_delta == rhs.ref_delta; + } + bool operator!=(const object_ref_delta_t &rhs) const { + return !(*this == rhs); + } + bool is_empty() { + return ref_delta.empty(); + } + uint64_t size() { + return ref_delta.size(); + } + friend std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci); +}; + +struct chunk_info_t { + typedef enum { + FLAG_DIRTY = 1, + FLAG_MISSING = 2, + FLAG_HAS_REFERENCE = 4, + FLAG_HAS_FINGERPRINT = 8, + } cflag_t; + uint32_t offset; + uint32_t length; + hobject_t oid; + cflag_t flags; // FLAG_* + + chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { } + chunk_info_t(uint32_t offset, uint32_t length, hobject_t oid) : + offset(offset), length(length), oid(oid), flags((cflag_t)0) { } + + static std::string get_flag_string(uint64_t flags) { + std::string r; + if (flags & FLAG_DIRTY) { + r += "|dirty"; + } + if (flags & FLAG_MISSING) { + r += "|missing"; + } + if (flags & FLAG_HAS_REFERENCE) { + r += "|has_reference"; + } + if (flags & FLAG_HAS_FINGERPRINT) { + r += "|has_fingerprint"; + } + if (r.length()) + return r.substr(1); + return r; + } + bool test_flag(cflag_t f) const { + return (flags & f) == f; + } + void set_flag(cflag_t f) { + flags = (cflag_t)(flags | f); + } + void set_flags(cflag_t f) { + flags = f; + } + void clear_flag(cflag_t f) { + flags = (cflag_t)(flags & ~f); + } + void clear_flags() { + flags = (cflag_t)0; + } + bool is_dirty() const { + return test_flag(FLAG_DIRTY); + } + bool is_missing() const { + return test_flag(FLAG_MISSING); + } + bool has_reference() const { + return test_flag(FLAG_HAS_REFERENCE); + } + bool has_fingerprint() const { + return test_flag(FLAG_HAS_FINGERPRINT); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci); + bool operator==(const chunk_info_t& cit) const; + bool operator!=(const chunk_info_t& cit) const { + return !(cit == *this); + } +}; +WRITE_CLASS_ENCODER(chunk_info_t) +std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci); + +struct object_info_t; +struct object_manifest_t { + enum { + TYPE_NONE = 0, + TYPE_REDIRECT = 1, + TYPE_CHUNKED = 2, + }; + uint8_t type; // redirect, chunked, ... + hobject_t redirect_target; + std::map chunk_map; + + object_manifest_t() : type(0) { } + object_manifest_t(uint8_t type, const hobject_t& redirect_target) + : type(type), redirect_target(redirect_target) { } + + bool is_empty() const { + return type == TYPE_NONE; + } + bool is_redirect() const { + return type == TYPE_REDIRECT; + } + bool is_chunked() const { + return type == TYPE_CHUNKED; + } + static std::string_view get_type_name(uint8_t m) { + switch (m) { + case TYPE_NONE: return "none"; + case TYPE_REDIRECT: return "redirect"; + case TYPE_CHUNKED: return "chunked"; + default: return "unknown"; + } + } + std::string_view get_type_name() const { + return get_type_name(type); + } + void clear() { + type = 0; + redirect_target = hobject_t(); + chunk_map.clear(); + } + + /** + * calc_refs_to_inc_on_set + * + * Takes a manifest and returns the set of refs to + * increment upon set-chunk + * + * l should be nullptr if there are no clones, or + * l and g may each be null if the corresponding clone does not exist. 
+ * *this contains the set of new references to set + * + */ + void calc_refs_to_inc_on_set( + const object_manifest_t* g, ///< [in] manifest for clone > *this + const object_manifest_t* l, ///< [in] manifest for clone < *this + object_ref_delta_t &delta ///< [out] set of refs to drop + ) const; + + /** + * calc_refs_to_drop_on_modify + * + * Takes a manifest and returns the set of refs to + * drop upon modification + * + * l should be nullptr if there are no clones, or + * l may be null if the corresponding clone does not exist. + * + */ + void calc_refs_to_drop_on_modify( + const object_manifest_t* l, ///< [in] manifest for previous clone + const ObjectCleanRegions& clean_regions, ///< [in] clean regions + object_ref_delta_t &delta ///< [out] set of refs to drop + ) const; + + /** + * calc_refs_to_drop_on_removal + * + * Takes the two adjacent manifests and returns the set of refs to + * drop upon removal of the clone containing *this. + * + * g should be nullptr if *this is on HEAD, l should be nullptr if + * *this is on the oldest clone (or head if there are no clones). + */ + void calc_refs_to_drop_on_removal( + const object_manifest_t* g, ///< [in] manifest for clone > *this + const object_manifest_t* l, ///< [in] manifest for clone < *this + object_ref_delta_t &delta ///< [out] set of refs to drop + ) const; + + static void generate_test_instances(std::list& o); + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi); +}; +WRITE_CLASS_ENCODER(object_manifest_t) +std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi); + +struct object_info_t { + hobject_t soid; + eversion_t version, prior_version; + version_t user_version; + osd_reqid_t last_reqid; + + uint64_t size; + utime_t mtime; + utime_t local_mtime; // local mtime + + // note: these are currently encoded into a total 16 bits; see + // encode()/decode() for the weirdness. 
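+ // For example (informal arithmetic): an object that is dirty and carries both
+ // digests would have flags == (FLAG_DIRTY | FLAG_DATA_DIGEST | FLAG_OMAP_DIGEST)
+ // == 0x4 | 0x10 | 0x20 == 0x34, which still fits in those 16 bits; new flags
+ // beyond bit 15 would require an encoding change.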
+ typedef enum { + FLAG_LOST = 1<<0, + FLAG_WHITEOUT = 1<<1, // object logically does not exist + FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied + FLAG_OMAP = 1<<3, // has (or may have) some/any omap data + FLAG_DATA_DIGEST = 1<<4, // has data crc + FLAG_OMAP_DIGEST = 1<<5, // has omap crc + FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier + FLAG_MANIFEST = 1<<7, // has manifest + FLAG_USES_TMAP = 1<<8, // deprecated; no longer used + FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference + } flag_t; + + flag_t flags; + + static std::string get_flag_string(flag_t flags) { + std::string s; + std::vector sv = get_flag_vector(flags); + for (auto ss : sv) { + s += std::string("|") + ss; + } + if (s.length()) + return s.substr(1); + return s; + } + static std::vector get_flag_vector(flag_t flags) { + std::vector sv; + if (flags & FLAG_LOST) + sv.insert(sv.end(), "lost"); + if (flags & FLAG_WHITEOUT) + sv.insert(sv.end(), "whiteout"); + if (flags & FLAG_DIRTY) + sv.insert(sv.end(), "dirty"); + if (flags & FLAG_USES_TMAP) + sv.insert(sv.end(), "uses_tmap"); + if (flags & FLAG_OMAP) + sv.insert(sv.end(), "omap"); + if (flags & FLAG_DATA_DIGEST) + sv.insert(sv.end(), "data_digest"); + if (flags & FLAG_OMAP_DIGEST) + sv.insert(sv.end(), "omap_digest"); + if (flags & FLAG_CACHE_PIN) + sv.insert(sv.end(), "cache_pin"); + if (flags & FLAG_MANIFEST) + sv.insert(sv.end(), "manifest"); + if (flags & FLAG_REDIRECT_HAS_REFERENCE) + sv.insert(sv.end(), "redirect_has_reference"); + return sv; + } + std::string get_flag_string() const { + return get_flag_string(flags); + } + + uint64_t truncate_seq, truncate_size; + + std::map, watch_info_t> watchers; + + // opportunistic checksums; may or may not be present + __u32 data_digest; ///< data crc32c + __u32 omap_digest; ///< omap crc32c + + // alloc hint attribute + uint64_t expected_object_size, expected_write_size; + uint32_t alloc_hint_flags; + + struct object_manifest_t manifest; + + void copy_user_bits(const object_info_t& other); + + bool test_flag(flag_t f) const { + return (flags & f) == f; + } + void set_flag(flag_t f) { + flags = (flag_t)(flags | f); + } + void clear_flag(flag_t f) { + flags = (flag_t)(flags & ~f); + } + bool is_lost() const { + return test_flag(FLAG_LOST); + } + bool is_whiteout() const { + return test_flag(FLAG_WHITEOUT); + } + bool is_dirty() const { + return test_flag(FLAG_DIRTY); + } + bool is_omap() const { + return test_flag(FLAG_OMAP); + } + bool is_data_digest() const { + return test_flag(FLAG_DATA_DIGEST); + } + bool is_omap_digest() const { + return test_flag(FLAG_OMAP_DIGEST); + } + bool is_cache_pinned() const { + return test_flag(FLAG_CACHE_PIN); + } + bool has_manifest() const { + return test_flag(FLAG_MANIFEST); + } + void set_data_digest(__u32 d) { + set_flag(FLAG_DATA_DIGEST); + data_digest = d; + } + void set_omap_digest(__u32 d) { + set_flag(FLAG_OMAP_DIGEST); + omap_digest = d; + } + void clear_data_digest() { + clear_flag(FLAG_DATA_DIGEST); + data_digest = -1; + } + void clear_omap_digest() { + clear_flag(FLAG_OMAP_DIGEST); + omap_digest = -1; + } + void new_object() { + clear_data_digest(); + clear_omap_digest(); + } + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void decode(const ceph::buffer::list& bl) { + auto p = std::cbegin(bl); + decode(p); + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + + explicit object_info_t() + : user_version(0), size(0), 
flags((flag_t)0), + truncate_seq(0), truncate_size(0), + data_digest(-1), omap_digest(-1), + expected_object_size(0), expected_write_size(0), + alloc_hint_flags(0) + {} + + explicit object_info_t(const hobject_t& s) + : soid(s), + user_version(0), size(0), flags((flag_t)0), + truncate_seq(0), truncate_size(0), + data_digest(-1), omap_digest(-1), + expected_object_size(0), expected_write_size(0), + alloc_hint_flags(0) + {} + + explicit object_info_t(ceph::buffer::list& bl) { + decode(bl); + } +}; +WRITE_CLASS_ENCODER_FEATURES(object_info_t) + +std::ostream& operator<<(std::ostream& out, const object_info_t& oi); + + + +// Object recovery +struct ObjectRecoveryInfo { + hobject_t soid; + eversion_t version; + uint64_t size; + object_info_t oi; + SnapSet ss; // only populated if soid is_snap() + interval_set copy_subset; + std::map> clone_subset; + bool object_exist; + + ObjectRecoveryInfo() : size(0), object_exist(true) { } + + static void generate_test_instances(std::list& o); + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo) +std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf); + +struct ObjectRecoveryProgress { + uint64_t data_recovered_to; + std::string omap_recovered_to; + bool first; + bool data_complete; + bool omap_complete; + bool error = false; + + ObjectRecoveryProgress() + : data_recovered_to(0), + first(true), + data_complete(false), omap_complete(false) { } + + bool is_complete(const ObjectRecoveryInfo& info) const { + return (data_recovered_to >= ( + info.copy_subset.empty() ? + 0 : info.copy_subset.range_end())) && + omap_complete; + } + + static void generate_test_instances(std::list& o); + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER(ObjectRecoveryProgress) +std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog); + +struct PushReplyOp { + hobject_t soid; + + static void generate_test_instances(std::list& o); + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; + + uint64_t cost(CephContext *cct) const; +}; +WRITE_CLASS_ENCODER(PushReplyOp) +std::ostream& operator<<(std::ostream& out, const PushReplyOp &op); + +struct PullOp { + hobject_t soid; + + ObjectRecoveryInfo recovery_info; + ObjectRecoveryProgress recovery_progress; + + static void generate_test_instances(std::list& o); + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; + + uint64_t cost(CephContext *cct) const; +}; +WRITE_CLASS_ENCODER_FEATURES(PullOp) +std::ostream& operator<<(std::ostream& out, const PullOp &op); + +struct PushOp { + hobject_t soid; + eversion_t version; + ceph::buffer::list data; + interval_set data_included; + ceph::buffer::list omap_header; + std::map omap_entries; + std::map attrset; + + ObjectRecoveryInfo recovery_info; + ObjectRecoveryProgress before_progress; + ObjectRecoveryProgress after_progress; + + static void generate_test_instances(std::list& o); + void 
encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; + + uint64_t cost(CephContext *cct) const; +}; +WRITE_CLASS_ENCODER_FEATURES(PushOp) +std::ostream& operator<<(std::ostream& out, const PushOp &op); + +enum class scrub_level_t : bool { shallow = false, deep = true }; +enum class scrub_type_t : bool { not_repair = false, do_repair = true }; + +/* + * summarize pg contents for purposes of a scrub + */ +struct ScrubMap { + struct object { + std::map attrs; + uint64_t size; + __u32 omap_digest; ///< omap crc32c + __u32 digest; ///< data crc32c + bool negative:1; + bool digest_present:1; + bool omap_digest_present:1; + bool read_error:1; + bool stat_error:1; + bool ec_hash_mismatch:1; + bool ec_size_mismatch:1; + bool large_omap_object_found:1; + uint64_t large_omap_object_key_count = 0; + uint64_t large_omap_object_value_size = 0; + uint64_t object_omap_bytes = 0; + uint64_t object_omap_keys = 0; + + object() : + // Init invalid size so it won't match if we get a stat EIO error + size(-1), omap_digest(0), digest(0), + negative(false), digest_present(false), omap_digest_present(false), + read_error(false), stat_error(false), ec_hash_mismatch(false), + ec_size_mismatch(false), large_omap_object_found(false) {} + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + }; + WRITE_CLASS_ENCODER(object) + + std::map objects; + eversion_t valid_through; + eversion_t incr_since; + bool has_large_omap_object_errors:1; + bool has_omap_keys:1; + + void merge_incr(const ScrubMap &l); + void clear_from(const hobject_t& start) { + objects.erase(objects.lower_bound(start), objects.end()); + } + void insert(const ScrubMap &r) { + objects.insert(r.objects.begin(), r.objects.end()); + } + void swap(ScrubMap &r) { + using std::swap; + swap(objects, r.objects); + swap(valid_through, r.valid_through); + swap(incr_since, r.incr_since); + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(ScrubMap::object) +WRITE_CLASS_ENCODER(ScrubMap) + +struct ScrubMapBuilder { + bool deep = false; + std::vector ls; + size_t pos = 0; + int64_t data_pos = 0; + std::string omap_pos; + int ret = 0; + ceph::buffer::hash data_hash, omap_hash; ///< accumulatinng hash value + uint64_t omap_keys = 0; + uint64_t omap_bytes = 0; + + bool empty() { + return ls.empty(); + } + bool done() { + return pos >= ls.size(); + } + void reset() { + *this = ScrubMapBuilder(); + } + + bool data_done() { + return data_pos < 0; + } + + void next_object() { + ++pos; + data_pos = 0; + omap_pos.clear(); + omap_keys = 0; + omap_bytes = 0; + } + + friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) { + out << "(" << pos.pos << "/" << pos.ls.size(); + if (pos.pos < pos.ls.size()) { + out << " " << pos.ls[pos.pos]; + } + if (pos.data_pos < 0) { + out << " byte " << pos.data_pos; + } + if (!pos.omap_pos.empty()) { + out << " key " << pos.omap_pos; + } + if (pos.deep) { + out << " deep"; + } + if (pos.ret) { + out << " ret " << pos.ret; + } + return out << ")"; + } +}; + +struct watch_item_t { + entity_name_t name; + uint64_t cookie; + uint32_t timeout_seconds; + 
entity_addr_t addr; + + watch_item_t() : cookie(0), timeout_seconds(0) { } + watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout, + const entity_addr_t& addr) + : name(name), cookie(cookie), timeout_seconds(timeout), + addr(addr) { } + + void encode(ceph::buffer::list &bl, uint64_t features) const { + ENCODE_START(2, 1, bl); + encode(name, bl); + encode(cookie, bl); + encode(timeout_seconds, bl); + encode(addr, bl, features); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(2, bl); + decode(name, bl); + decode(cookie, bl); + decode(timeout_seconds, bl); + if (struct_v >= 2) { + decode(addr, bl); + } + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("watcher") << name; + f->dump_int("cookie", cookie); + f->dump_int("timeout", timeout_seconds); + f->open_object_section("addr"); + addr.dump(f); + f->close_section(); + } + static void generate_test_instances(std::list& o) { + entity_addr_t ea; + ea.set_type(entity_addr_t::TYPE_LEGACY); + ea.set_nonce(1000); + ea.set_family(AF_INET); + ea.set_in4_quad(0, 127); + ea.set_in4_quad(1, 0); + ea.set_in4_quad(2, 0); + ea.set_in4_quad(3, 1); + ea.set_port(1024); + o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea)); + ea.set_nonce(1001); + ea.set_in4_quad(3, 2); + ea.set_port(1025); + o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea)); + } +}; +WRITE_CLASS_ENCODER_FEATURES(watch_item_t) + +struct obj_watch_item_t { + hobject_t obj; + watch_item_t wi; +}; + +/** + * obj list watch response format + * + */ +struct obj_list_watch_response_t { + std::list entries; + + void encode(ceph::buffer::list& bl, uint64_t features) const { + ENCODE_START(1, 1, bl); + encode(entries, bl, features); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(entries, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->open_array_section("entries"); + for (std::list::const_iterator p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("watch"); + p->dump(f); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list& o) { + entity_addr_t ea; + o.push_back(new obj_list_watch_response_t); + o.push_back(new obj_list_watch_response_t); + std::list test_watchers; + watch_item_t::generate_test_instances(test_watchers); + for (auto &e : test_watchers) { + o.back()->entries.push_back(*e); + delete e; + } + } +}; +WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t) + +struct clone_info { + snapid_t cloneid; + std::vector snaps; // ascending + std::vector< std::pair > overlap; + uint64_t size; + + clone_info() : cloneid(CEPH_NOSNAP), size(0) {} + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(cloneid, bl); + encode(snaps, bl); + encode(overlap, bl); + encode(size, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(cloneid, bl); + decode(snaps, bl); + decode(overlap, bl); + decode(size, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + if (cloneid == CEPH_NOSNAP) + f->dump_string("cloneid", "HEAD"); + else + f->dump_unsigned("cloneid", cloneid.val); + f->open_array_section("snapshots"); + for (std::vector::const_iterator p = snaps.begin(); p != snaps.end(); ++p) { + f->open_object_section("snap"); + f->dump_unsigned("id", p->val); + f->close_section(); + } + 
f->close_section(); + f->open_array_section("overlaps"); + for (std::vector< std::pair >::const_iterator q = overlap.begin(); + q != overlap.end(); ++q) { + f->open_object_section("overlap"); + f->dump_unsigned("offset", q->first); + f->dump_unsigned("length", q->second); + f->close_section(); + } + f->close_section(); + f->dump_unsigned("size", size); + } + static void generate_test_instances(std::list& o) { + o.push_back(new clone_info); + o.push_back(new clone_info); + o.back()->cloneid = 1; + o.back()->snaps.push_back(1); + o.back()->overlap.push_back(std::pair(0,4096)); + o.back()->overlap.push_back(std::pair(8192,4096)); + o.back()->size = 16384; + o.push_back(new clone_info); + o.back()->cloneid = CEPH_NOSNAP; + o.back()->size = 32768; + } +}; +WRITE_CLASS_ENCODER(clone_info) + +/** + * obj list snaps response format + * + */ +struct obj_list_snap_response_t { + std::vector clones; // ascending + snapid_t seq; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(clones, bl); + encode(seq, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(2, bl); + decode(clones, bl); + if (struct_v >= 2) + decode(seq, bl); + else + seq = CEPH_NOSNAP; + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->open_array_section("clones"); + for (std::vector::const_iterator p = clones.begin(); p != clones.end(); ++p) { + f->open_object_section("clone"); + p->dump(f); + f->close_section(); + } + f->dump_unsigned("seq", seq); + f->close_section(); + } + static void generate_test_instances(std::list& o) { + o.push_back(new obj_list_snap_response_t); + o.push_back(new obj_list_snap_response_t); + clone_info cl; + cl.cloneid = 1; + cl.snaps.push_back(1); + cl.overlap.push_back(std::pair(0,4096)); + cl.overlap.push_back(std::pair(8192,4096)); + cl.size = 16384; + o.back()->clones.push_back(cl); + cl.cloneid = CEPH_NOSNAP; + cl.snaps.clear(); + cl.overlap.clear(); + cl.size = 32768; + o.back()->clones.push_back(cl); + o.back()->seq = 123; + } +}; + +WRITE_CLASS_ENCODER(obj_list_snap_response_t) + +// PromoteCounter + +struct PromoteCounter { + std::atomic attempts{0}; + std::atomic objects{0}; + std::atomic bytes{0}; + + void attempt() { + attempts++; + } + + void finish(uint64_t size) { + objects++; + bytes += size; + } + + void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) { + *a = attempts; + *o = objects; + *b = bytes; + attempts = *a / 2; + objects = *o / 2; + bytes = *b / 2; + } +}; + +struct pool_pg_num_history_t { + /// last epoch updated + epoch_t epoch = 0; + /// poolid -> epoch -> pg_num + std::map> pg_nums; + /// pair(epoch, poolid) + std::set> deleted_pools; + + void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) { + pg_nums[pool][epoch] = pg_num; + } + void log_pool_delete(epoch_t epoch, int64_t pool) { + deleted_pools.insert(std::make_pair(epoch, pool)); + } + + /// prune history based on oldest osdmap epoch in the cluster + void prune(epoch_t oldest_epoch) { + auto i = deleted_pools.begin(); + while (i != deleted_pools.end()) { + if (i->first >= oldest_epoch) { + break; + } + pg_nums.erase(i->second); + i = deleted_pools.erase(i); + } + for (auto& j : pg_nums) { + auto k = j.second.lower_bound(oldest_epoch); + // keep this and the entry before it (just to be paranoid) + if (k != j.second.begin()) { + --k; + j.second.erase(j.second.begin(), k); + } + } + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(epoch, bl); + encode(pg_nums, bl); 
+ encode(deleted_pools, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(epoch, p); + decode(pg_nums, p); + decode(deleted_pools, p); + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("epoch", epoch); + f->open_object_section("pools"); + for (auto& i : pg_nums) { + f->open_object_section("pool"); + f->dump_unsigned("pool_id", i.first); + f->open_array_section("changes"); + for (auto& j : i.second) { + f->open_object_section("change"); + f->dump_unsigned("epoch", j.first); + f->dump_unsigned("pg_num", j.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("deleted_pools"); + for (auto& i : deleted_pools) { + f->open_object_section("deletion"); + f->dump_unsigned("pool_id", i.second); + f->dump_unsigned("epoch", i.first); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new pool_pg_num_history_t); + } + friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) { + return out << "pg_num_history(e" << h.epoch + << " pg_nums " << h.pg_nums + << " deleted_pools " << h.deleted_pools + << ")"; + } +}; +WRITE_CLASS_ENCODER(pool_pg_num_history_t) + +// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can +// easily skip them +static const std::string_view infover_key = "_infover"; +static const std::string_view info_key = "_info"; +static const std::string_view biginfo_key = "_biginfo"; +static const std::string_view epoch_key = "_epoch"; +static const std::string_view fastinfo_key = "_fastinfo"; + +static const __u8 pg_latest_struct_v = 10; +// v10 is the new past_intervals encoding +// v9 was fastinfo_key addition +// v8 was the move to a per-pg pgmeta object +// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad +// (first appeared in cuttlefish). +static const __u8 pg_compat_struct_v = 10; + +int prepare_info_keymap( + CephContext* cct, + std::map *km, + std::string *key_to_remove, + epoch_t epoch, + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + bool dirty_big_info, + bool dirty_epoch, + bool try_fast_info, + PerfCounters *logger = nullptr, + DoutPrefixProvider *dpp = nullptr); + +namespace ceph::os { + class Transaction; +}; + +void create_pg_collection( + ceph::os::Transaction& t, spg_t pgid, int bits); + +void init_pg_ondisk( + ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool); + +// omap specific stats +struct omap_stat_t { + int large_omap_objects; + int64_t omap_bytes; + int64_t omap_keys; +}; + +// filter for pg listings +class PGLSFilter { + CephContext* cct; +protected: + std::string xattr; +public: + PGLSFilter(); + virtual ~PGLSFilter(); + virtual bool filter(const hobject_t &obj, + const ceph::buffer::list& xattr_data) const = 0; + + /** + * Arguments passed from the RADOS client. Implementations must + * handle any encoding errors, and return an appropriate error code, + * or 0 on valid input. + */ + virtual int init(ceph::buffer::list::const_iterator ¶ms) = 0; + + /** + * xattr key, or empty string. 
If non-empty, this xattr will be fetched + * and the value passed into ::filter + */ + virtual const std::string& get_xattr() const { return xattr; } + + /** + * If true, objects without the named xattr (if xattr name is not empty) + * will be rejected without calling ::filter + */ + virtual bool reject_empty_xattr() const { return true; } +}; + +class PGLSPlainFilter : public PGLSFilter { + std::string val; +public: + int init(ceph::buffer::list::const_iterator ¶ms) override; + ~PGLSPlainFilter() override {} + bool filter(const hobject_t& obj, + const ceph::buffer::list& xattr_data) const override; +}; + +// alias name for this structure: +using missing_map_t = std::map, + std::optional>>; + +#endif diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc new file mode 100644 index 000000000..20ab0a1aa --- /dev/null +++ b/src/osd/pg_scrubber.cc @@ -0,0 +1,2384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=2 sw=2 smarttab + +#include "./pg_scrubber.h" // the '.' notation used to affect clang-format order + +#include +#include + +#include "debug.h" + +#include "common/errno.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDRepScrubMap.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrubReserve.h" + +#include "OSD.h" +#include "ScrubStore.h" +#include "scrub_machine.h" + +using namespace Scrub; +using namespace std::chrono; +using namespace std::chrono_literals; + +#define dout_context (m_osds->cct) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +template +static ostream& _prefix(std::ostream* _dout, T* t) +{ + return t->gen_prefix(*_dout); +} + +ostream& operator<<(ostream& out, const scrub_flags_t& sf) +{ + if (sf.auto_repair) + out << " AUTO_REPAIR"; + if (sf.check_repair) + out << " CHECK_REPAIR"; + if (sf.deep_scrub_on_error) + out << " DEEP_SCRUB_ON_ERROR"; + if (sf.required) + out << " REQ_SCRUB"; + + return out; +} + +ostream& operator<<(ostream& out, const requested_scrub_t& sf) +{ + if (sf.must_repair) + out << " MUST_REPAIR"; + if (sf.auto_repair) + out << " planned AUTO_REPAIR"; + if (sf.check_repair) + out << " planned CHECK_REPAIR"; + if (sf.deep_scrub_on_error) + out << " planned DEEP_SCRUB_ON_ERROR"; + if (sf.must_deep_scrub) + out << " MUST_DEEP_SCRUB"; + if (sf.must_scrub) + out << " MUST_SCRUB"; + if (sf.time_for_deep) + out << " TIME_FOR_DEEP"; + if (sf.need_auto) + out << " NEED_AUTO"; + if (sf.req_scrub) + out << " planned REQ_SCRUB"; + + return out; +} + +/* + * if the incoming message is from a previous interval, it must mean + * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard + * the stale message. + */ +bool PgScrubber::check_interval(epoch_t epoch_to_verify) +{ + return epoch_to_verify >= m_pg->get_same_interval_since(); +} + +bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify) +{ + if (!m_active) { + // not scrubbing. We can assume that the scrub was already terminated, and we + // can silently discard the incoming event. + return false; + } + + // is this a message from before we started this scrub? + if (epoch_to_verify < m_epoch_start) { + return false; + } + + // has a new interval started? + if (!check_interval(epoch_to_verify)) { + // if this is a new interval, on_change() has already terminated that + // old scrub. + return false; + } + + ceph_assert(is_primary()); + + // were we instructed to abort? 
+ return verify_against_abort(epoch_to_verify); +} + +bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify) +{ + if (!should_abort()) { + return true; + } + + dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify + << " vs last-aborted: " << m_last_aborted << dendl; + + // if we were not aware of the abort before - kill the scrub. + if (epoch_to_verify >= m_last_aborted) { + scrub_clear_state(); + m_last_aborted = std::max(epoch_to_verify, m_epoch_start); + } + return false; +} + +bool PgScrubber::should_abort() const +{ + if (m_flags.required) { + return false; // not stopping 'required' scrubs for configuration changes + } + + if (m_is_deep) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || + m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) { + dout(10) << "nodeep_scrub set, aborting" << dendl; + return true; + } + } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || + m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { + dout(10) << "noscrub set, aborting" << dendl; + return true; + } + + return false; +} + +// initiating state-machine events -------------------------------- + +/* + * a note re the checks performed before sending scrub-initiating messages: + * + * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that + * possibly were in the queue while the PG changed state and became unavailable for + * scrubbing: + * + * The check_interval() catches all major changes to the PG. As for the other conditions + * we may check (and see is_message_relevant() above): + * + * - we are not 'active' yet, so must not check against is_active(), and: + * + * - the 'abort' flags were just verified (when the triggering message was queued). As + * those are only modified in human speeds - they need not be queried again. + * + * Some of the considerations above are also relevant to the replica-side initiation + * ('StartReplica' & 'StartReplicaNoWait'). 
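+ *
+ * (check_interval() is what guards the dispatch of these two events; see
+ * initiate_regular_scrub() and initiate_scrub_after_repair() below.)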
+ */ + +void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued) +{ + dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + // we may have lost our Primary status while the message languished in the queue + if (check_interval(epoch_queued)) { + dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl; + reset_epoch(epoch_queued); + m_fsm->process_event(StartScrub{}); + dout(10) << "scrubber event --<< StartScrub" << dendl; + } else { + // and just in case snap trimming was blocked by the aborted scrub + m_pg->snap_trimmer_scrub_complete(); + clear_queued_or_active(); + } +} + +void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued) +{ + dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + // we may have lost our Primary status while the message languished in the queue + if (check_interval(epoch_queued)) { + dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl; + reset_epoch(epoch_queued); + m_fsm->process_event(AfterRepairScrub{}); + dout(10) << "scrubber event --<< AfterRepairScrub" << dendl; + } else { + m_pg->snap_trimmer_scrub_complete(); + clear_queued_or_active(); + } +} +void PgScrubber::send_scrub_unblock(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(Unblocked{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_scrub_resched(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(InternalSchedScrub{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued + << " token: " << token << dendl; + if (is_primary()) { + // shouldn't happen. Ignore + dout(1) << "got a replica scrub request while Primary!" << dendl; + return; + } + + if (check_interval(epoch_queued) && is_token_current(token)) { + // save us some time by not waiting for updates if there are none + // to wait for. Affects the transition from NotActive into either + // ReplicaWaitUpdates or ActiveReplica. 
+ if (pending_active_pushes()) + m_fsm->process_event(StartReplica{}); + else + m_fsm->process_event(StartReplicaNoWait{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued + << " token: " << token << dendl; + if (check_interval(epoch_queued) && is_token_current(token)) { + m_fsm->process_event(SchedReplica{}); // retest for map availability + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::active_pushes_notification(epoch_t epoch_queued) +{ + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(ActivePushesUpd{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::update_applied_notification(epoch_t epoch_queued) +{ + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(UpdatesApplied{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::digest_update_notification(epoch_t epoch_queued) +{ + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(DigestUpdate{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_local_map_done(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(Scrub::IntLocalMapDone{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(GotReplicas{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { + m_fsm->process_event(ReplicaPushesUpd{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_remotes_reserved(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + // note: scrub is not active yet + if (check_interval(epoch_queued)) { + m_fsm->process_event(RemotesReserved{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_reservation_failure(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { // do not check for 'active'! 
+ m_fsm->process_event(ReservationFailure{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_full_reset(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + + m_fsm->process_event(Scrub::FullReset{}); + + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_chunk_free(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { + m_fsm->process_event(Scrub::SelectedChunkFree{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_chunk_busy(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { + m_fsm->process_event(Scrub::ChunkIsBusy{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_get_next_chunk(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->process_event(Scrub::NextChunk{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + + // can't check for "active" + + m_fsm->process_event(Scrub::ScrubFinished{}); + + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_maps_compared(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + + m_fsm->process_event(Scrub::MapsCompared{}); + + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +// ----------------- + +bool PgScrubber::is_reserving() const +{ + return m_fsm->is_reserving(); +} + +void PgScrubber::reset_epoch(epoch_t epoch_queued) +{ + dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; + m_fsm->assert_not_active(); + + m_epoch_start = epoch_queued; + m_needs_sleep = true; + m_is_deep = state_test(PG_STATE_DEEP_SCRUB); + update_op_mode_text(); +} + +unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const +{ + unsigned int qu_priority = m_flags.priority; + + if (with_priority == Scrub::scrub_prio_t::high_priority) { + qu_priority = + std::max(qu_priority, (unsigned int)m_pg->get_cct()->_conf->osd_client_op_priority); + } + return qu_priority; +} + +unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const +{ + if (with_priority == Scrub::scrub_prio_t::high_priority) { + suggested_priority = std::max(suggested_priority, + (unsigned int)m_pg->cct->_conf->osd_client_op_priority); + } + return suggested_priority; +} + +// ///////////////////////////////////////////////////////////////////// // +// scrub-op registration handling + +bool PgScrubber::is_scrub_registered() const +{ + return !m_scrub_reg_stamp.is_zero(); +} + +void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags) +{ + if (!is_primary()) { + // normal. No warning is required. + return; + } + + dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? 
" + << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp + << dendl; + + ceph_assert(!is_scrub_registered()); + + utime_t reg_stamp; + bool must = false; + + if (request_flags.must_scrub || request_flags.need_auto) { + // Set the smallest time that isn't utime_t() + reg_stamp = PgScrubber::scrub_must_stamp(); + must = true; + } else if (m_pg->info.stats.stats_invalid && + m_pg->cct->_conf->osd_scrub_invalid_stats) { + reg_stamp = ceph_clock_now(); + must = true; + } else { + reg_stamp = m_pg->info.history.last_scrub_stamp; + } + + dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must + << " required:" << m_flags.required << " flags: " << request_flags + << " stamp: " << reg_stamp << dendl; + + const double scrub_min_interval = + m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0); + const double scrub_max_interval = + m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0); + + // note the sched_time, so we can locate this scrub, and remove it later + m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval, + scrub_max_interval, must); + dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time " + << m_scrub_reg_stamp << ", must = " << (int)must << dendl; +} + +void PgScrubber::unreg_next_scrub() +{ + if (is_scrub_registered()) { + dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl; + m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp); + m_scrub_reg_stamp = utime_t{}; + } +} + +void PgScrubber::scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) +{ + dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ") + << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ") + << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered() + << dendl; + + unreg_next_scrub(); + + req_flags.must_scrub = true; + req_flags.must_deep_scrub = + (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair); + req_flags.must_repair = (scrub_type == scrub_type_t::do_repair); + // User might intervene, so clear this + req_flags.need_auto = false; + req_flags.req_scrub = true; + + dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl; + + reg_next_scrub(req_flags); +} + +void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags) +{ + dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? " + << is_scrub_registered() << dendl; + + unreg_next_scrub(); + req_flags.need_auto = true; + reg_next_scrub(req_flags); +} + +bool PgScrubber::reserve_local() +{ + // try to create the reservation object (which translates into asking the + // OSD for the local scrub resource). If failing - undo it immediately + + m_local_osd_resource.emplace(m_osds); + if (m_local_osd_resource->is_reserved()) { + dout(15) << __func__ << ": local resources reserved" << dendl; + return true; + } + + dout(10) << __func__ << ": failed to reserve local scrub resources" << dendl; + m_local_osd_resource.reset(); + return false; +} + +// ---------------------------------------------------------------------------- + +bool PgScrubber::has_pg_marked_new_updates() const +{ + auto last_applied = m_pg->recovery_state.get_last_update_applied(); + dout(10) << __func__ << " recovery last: " << last_applied + << " vs. 
scrub's: " << m_subset_last_update << dendl; + + return last_applied >= m_subset_last_update; +} + +void PgScrubber::set_subset_last_update(eversion_t e) +{ + m_subset_last_update = e; + dout(15) << __func__ << " last-update: " << e << dendl; +} + +void PgScrubber::on_applied_when_primary(const eversion_t& applied_version) +{ + // we are only interested in updates if we are the Primary, and in state + // WaitLastUpdate + if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) { + m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops()); + dout(15) << __func__ << " update: " << applied_version + << " vs. required: " << m_subset_last_update << dendl; + } +} + +/* + * The selected range is set directly into 'm_start' and 'm_end' + * setting: + * - m_subset_last_update + * - m_max_end + * - end + * - start + */ +bool PgScrubber::select_range() +{ + m_primary_scrubmap = ScrubMap{}; + m_received_maps.clear(); + + /* get the start and end of our scrub chunk + * + * Our scrub chunk has an important restriction we're going to need to + * respect. We can't let head be start or end. + * Using a half-open interval means that if end == head, + * we'd scrub/lock head and the clone right next to head in different + * chunks which would allow us to miss clones created between + * scrubbing that chunk and scrubbing the chunk including head. + * This isn't true for any of the other clones since clones can + * only be created "just to the left of" head. There is one exception + * to this: promotion of clones which always happens to the left of the + * left-most clone, but promote_object checks the scrubber in that + * case, so it should be ok. Also, it's ok to "miss" clones at the + * left end of the range if we are a tier because they may legitimately + * not exist (see _scrub). + */ + int min_idx = std::max( + 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor()); + + int max_idx = std::max(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max / + preemption_data.chunk_divisor()); + + dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx + << " Div: " << preemption_data.chunk_divisor() << dendl; + + hobject_t start = m_start; + hobject_t candidate_end; + std::vector objects; + int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects, + &candidate_end); + ceph_assert(ret >= 0); + + if (!objects.empty()) { + + hobject_t back = objects.back(); + while (candidate_end.is_head() && candidate_end == back.get_head()) { + candidate_end = back; + objects.pop_back(); + if (objects.empty()) { + ceph_assert(0 == + "Somehow we got more than 2 objects which" + "have the same head but are not clones"); + } + back = objects.back(); + } + + if (candidate_end.is_head()) { + ceph_assert(candidate_end != back.get_head()); + candidate_end = candidate_end.get_object_boundary(); + } + + } else { + ceph_assert(candidate_end.is_max()); + } + + // is that range free for us? 
if not - we will be rescheduled later by whoever + // triggered us this time + + if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) { + // we'll be requeued by whatever made us unavailable for scrub + dout(10) << __func__ << ": scrub blocked somewhere in range " + << "[" << m_start << ", " << candidate_end << ")" << dendl; + return false; + } + + m_end = candidate_end; + if (m_end > m_max_end) + m_max_end = m_end; + + dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// " + << m_max_end << dendl; + return true; +} + +void PgScrubber::select_range_n_notify() +{ + if (select_range()) { + // the next chunk to handle is not blocked + dout(20) << __func__ << ": selection OK" << dendl; + m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority); + + } else { + // we will wait for the objects range to become available for scrubbing + dout(10) << __func__ << ": selected chunk is busy" << dendl; + m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority); + } +} + +bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid) +{ + if (soid < m_start || soid >= m_end) { + return false; + } + + dout(20) << __func__ << " " << soid << " can preempt? " + << preemption_data.is_preemptable() << " already preempted? " + << preemption_data.was_preempted() << dendl; + + if (preemption_data.was_preempted()) { + // otherwise - write requests arriving while 'already preempted' is set + // but 'preemptable' is not - will not be allowed to continue, and will + // not be requeued on time. + return false; + } + + if (preemption_data.is_preemptable()) { + + dout(10) << __func__ << " " << soid << " preempted" << dendl; + + // signal the preemption + preemption_data.do_preempt(); + m_end = m_start; // free the range we were scrubbing + + return false; + } + return true; +} + +bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end) +{ + // does [start, end] intersect [scrubber.start, scrubber.m_max_end) + return (start < m_max_end && end >= m_start); +} + +/** + * if we are required to sleep: + * arrange a callback sometimes later. + * be sure to be able to identify a stale callback. + * Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue) + * anyway. + */ +void PgScrubber::add_delayed_scheduling() +{ + m_end = m_start; // not blocking any range now + + milliseconds sleep_time{0ms}; + if (m_needs_sleep) { + double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required); + sleep_time = milliseconds{long(scrub_sleep)}; + } + dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? " + << m_needs_sleep << dendl; + + if (sleep_time.count()) { + // schedule a transition for some 'sleep_time' ms in the future + + m_needs_sleep = false; + m_sleep_started_at = ceph_clock_now(); + + // the following log line is used by osd-scrub-test.sh + dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl; + + // the 'delayer' for crimson is different. Will be factored out. 
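+    // The callback constructed below runs once the sleep interval elapses:
+    // it looks the PG up again (it may be gone by then), re-arms
+    // m_needs_sleep, clears m_sleep_started_at, and re-queues the scrub
+    // through the OSD queue at low priority.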
+ + spg_t pgid = m_pg->get_pgid(); + auto callbk = new LambdaContext([osds = m_osds, pgid, + scrbr = this]([[maybe_unused]] int r) mutable { + PGRef pg = osds->osd->lookup_lock_pg(pgid); + if (!pg) { + lgeneric_subdout(g_ceph_context, osd, 10) + << "scrub_requeue_callback: Could not find " + << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl; + return; + } + scrbr->m_needs_sleep = true; + lgeneric_dout(scrbr->get_pg_cct(), 7) + << "scrub_requeue_callback: slept for " + << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl; + + scrbr->m_sleep_started_at = utime_t{}; + osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority); + pg->unlock(); + }); + + std::lock_guard l(m_osds->sleep_lock); + m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk); + + } else { + // just a requeue + m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority); + } +} + +eversion_t PgScrubber::search_log_for_updates() const +{ + auto& projected = m_pg->projected_log.log; + auto pi = find_if( + projected.crbegin(), projected.crend(), + [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; }); + + if (pi != projected.crend()) + return pi->version; + + // there was no relevant update entry in the log + + auto& log = m_pg->recovery_state.get_pg_log().get_log().log; + auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool { + return e.soid >= m_start && e.soid < m_end; + }); + + if (p == log.crend()) + return eversion_t{}; + else + return p->version; +} + +void PgScrubber::get_replicas_maps(bool replica_can_preempt) +{ + dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/" + << m_interval_start + << " pg same_interval_since: " << m_pg->info.history.same_interval_since + << dendl; + + m_primary_scrubmap_pos.reset(); + + // ask replicas to scan and send maps + for (const auto& i : m_pg->get_actingset()) { + + if (i == m_pg_whoami) + continue; + + m_maps_status.mark_replica_map_request(i); + _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep, + replica_can_preempt); + } + + dout(10) << __func__ << " awaiting" << m_maps_status << dendl; +} + +bool PgScrubber::was_epoch_changed() const +{ + // for crimson we have m_pg->get_info().history.same_interval_since + dout(10) << __func__ << " epoch_start: " << m_interval_start + << " from pg: " << m_pg->get_history().same_interval_since << dendl; + + return m_interval_start < m_pg->get_history().same_interval_since; +} + +void PgScrubber::mark_local_map_ready() +{ + m_maps_status.mark_local_map_ready(); +} + +bool PgScrubber::are_all_maps_available() const +{ + return m_maps_status.are_all_maps_available(); +} + +std::string PgScrubber::dump_awaited_maps() const +{ + return m_maps_status.dump(); +} + +void PgScrubber::update_op_mode_text() +{ + auto visible_repair = state_test(PG_STATE_REPAIR); + m_mode_desc = (visible_repair ? "repair"sv : (m_is_deep ? "deep-scrub"sv : "scrub"sv)); + + dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false") + << ", internal: " << (m_is_repair ? "true" : "false") + << ". Displayed: " << m_mode_desc << dendl; +} + +void PgScrubber::_request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption) +{ + ceph_assert(replica != m_pg_whoami); + dout(10) << __func__ << " scrubmap from osd." << replica + << (deep ? 
" deep" : " shallow") << dendl; + + auto repscrubop = + new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version, + get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep, + allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub()); + + // default priority. We want the replica-scrub processed prior to any recovery + // or client io messages (we are holding a lock!) + m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); +} + +void PgScrubber::cleanup_store(ObjectStore::Transaction* t) +{ + if (!m_store) + return; + + struct OnComplete : Context { + std::unique_ptr store; + explicit OnComplete(std::unique_ptr&& store) : store(std::move(store)) + {} + void finish(int) override {} + }; + m_store->cleanup(t); + t->register_on_complete(new OnComplete(std::move(m_store))); + ceph_assert(!m_store); +} + +void PgScrubber::on_init() +{ + // going upwards from 'inactive' + ceph_assert(!is_scrub_active()); + + preemption_data.reset(); + m_pg->publish_stats_to_osd(); + m_interval_start = m_pg->get_history().same_interval_since; + + dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl; + + // create a new store + { + ObjectStore::Transaction t; + cleanup_store(&t); + m_store.reset( + Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + } + + m_start = m_pg->info.pgid.pgid.get_hobj_start(); + m_active = true; +} + +void PgScrubber::on_replica_init() +{ + m_active = true; +} + +void PgScrubber::_scan_snaps(ScrubMap& smap) +{ + hobject_t head; + SnapSet snapset; + + // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings + // in this function + dout(15) << "_scan_snaps starts" << dendl; + + for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) { + + const hobject_t& hoid = i->first; + ScrubMap::object& o = i->second; + + dout(20) << __func__ << " " << hoid << dendl; + + ceph_assert(!hoid.is_snapdir()); + if (hoid.is_head()) { + // parse the SnapSet + bufferlist bl; + if (o.attrs.find(SS_ATTR) == o.attrs.end()) { + continue; + } + bl.push_back(o.attrs[SS_ATTR]); + auto p = bl.cbegin(); + try { + decode(snapset, p); + } catch (...) { + continue; + } + head = hoid.get_head(); + continue; + } + + if (hoid.snap < CEPH_MAXSNAP) { + // check and if necessary fix snap_mapper + if (hoid.get_head() != head) { + derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl; + continue; + } + set obj_snaps; + auto p = snapset.clone_snaps.find(hoid.snap); + if (p == snapset.clone_snaps.end()) { + derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl; + continue; + } + obj_snaps.insert(p->second.begin(), p->second.end()); + set cur_snaps; + int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps); + if (r != 0 && r != -ENOENT) { + derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + if (r == -ENOENT || cur_snaps != obj_snaps) { + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t)); + if (r == 0) { + r = m_pg->snap_mapper.remove_oid(hoid, &_t); + if (r != 0) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + m_pg->osd->clog->error() + << "osd." 
<< m_pg->osd->whoami << " found snap mapper error on pg " + << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps + << ", oi: " << obj_snaps << "...repaired"; + } else { + m_pg->osd->clog->error() + << "osd." << m_pg->osd->whoami << " found snap mapper error on pg " + << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper" + << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r + << "...repaired"; + } + m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t); + + // wait for repair to apply to avoid confusing other bits of the system. + { + dout(15) << __func__ << " wait on repair!" << dendl; + + ceph::condition_variable my_cond; + ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock"); + int e = 0; + bool done; + + t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e)); + + e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t)); + if (e != 0) { + derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl; + } else { + std::unique_lock l{my_lock}; + my_cond.wait(l, [&done] { return done; }); + } + } + } + } + } +} + +int PgScrubber::build_primary_map_chunk() +{ + epoch_t map_building_since = m_pg->get_osdmap_epoch(); + dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl; + + auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start, + m_end, m_is_deep); + + if (ret == -EINPROGRESS) { + // reschedule another round of asking the backend to collect the scrub data + m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority); + } + return ret; +} + +int PgScrubber::build_replica_map_chunk() +{ + dout(10) << __func__ << " interval start: " << m_interval_start + << " current token: " << m_current_token << " epoch: " << m_epoch_start + << " deep: " << m_is_deep << dendl; + + auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end, + m_is_deep); + + switch (ret) { + + case -EINPROGRESS: + // must wait for the backend to finish. No external event source. + // (note: previous version used low priority here. Now switched to using the + // priority of the original message) + m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority, + m_flags.priority, m_current_token); + break; + + case 0: { + // finished! + m_cleaned_meta_map.clear_from(m_start); + m_cleaned_meta_map.insert(replica_scrubmap); + auto for_meta_scrub = clean_meta_map(); + _scan_snaps(for_meta_scrub); + + // the local map has been created. Send it to the primary. + // Note: once the message reaches the Primary, it may ask us for another + // chunk - and we better be done with the current scrub. Thus - the preparation of + // the reply message is separate, and we clear the scrub state before actually + // sending it. + + auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption); + replica_handling_done(); + dout(15) << __func__ << " chunk map sent " << dendl; + send_replica_map(reply); + } break; + + default: + // negative retval: build_scrub_map_chunk() signalled an error + // Pre-Pacific code ignored this option, treating it as a success. + // \todo Add an error flag in the returning message. + dout(1) << "Error! Aborting. 
ActiveReplica::react(SchedReplica) Ret: " << ret + << dendl; + replica_handling_done(); + // only in debug mode for now: + assert(false && "backend error"); + break; + }; + + return ret; +} + +int PgScrubber::build_scrub_map_chunk( + ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep) +{ + dout(10) << __func__ << " [" << start << "," << end << ") " + << " pos " << pos << " Deep: " << deep << dendl; + + // start + while (pos.empty()) { + + pos.deep = deep; + map.valid_through = m_pg->info.last_update; + + // objects + vector rollback_obs; + pos.ret = + m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs); + dout(10) << __func__ << " while pos empty " << pos.ret << dendl; + if (pos.ret < 0) { + dout(5) << "objects_list_range error: " << pos.ret << dendl; + return pos.ret; + } + dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl; + if (pos.ls.empty()) { + break; + } + m_pg->_scan_rollback_obs(rollback_obs); + pos.pos = 0; + return -EINPROGRESS; + } + + // scan objects + while (!pos.done()) { + + int r = m_pg->get_pgbackend()->be_scan_list(map, pos); + if (r == -EINPROGRESS) { + dout(20) << __func__ << " in progress" << dendl; + return r; + } + } + + // finish + dout(20) << __func__ << " finishing" << dendl; + ceph_assert(pos.done()); + m_pg->_repair_oinfo_oid(map); + + dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl; + return 0; +} + +/* + * Process: + * Building a map of objects suitable for snapshot validation. + * The data in m_cleaned_meta_map is the left over partial items that need to + * be completed before they can be processed. + * + * Snapshots in maps precede the head object, which is why we are scanning backwards. + */ +ScrubMap PgScrubber::clean_meta_map() +{ + ScrubMap for_meta_scrub; + + if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) { + m_cleaned_meta_map.swap(for_meta_scrub); + } else { + auto iter = m_cleaned_meta_map.objects.end(); + --iter; // not empty, see 'if' clause + auto begin = m_cleaned_meta_map.objects.begin(); + if (iter->first.has_snapset()) { + ++iter; + } else { + while (iter != begin) { + auto next = iter--; + if (next->first.get_head() != iter->first.get_head()) { + ++iter; + break; + } + } + } + for_meta_scrub.objects.insert(begin, iter); + m_cleaned_meta_map.objects.erase(begin, iter); + } + + return for_meta_scrub; +} + +void PgScrubber::run_callbacks() +{ + std::list to_run; + to_run.swap(m_callbacks); + + for (auto& tr : to_run) { + tr->complete(0); + } +} + +void PgScrubber::maps_compare_n_cleanup() +{ + scrub_compare_maps(); + m_start = m_end; + run_callbacks(); + requeue_waiting(); + m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority); +} + +Scrub::preemption_t& PgScrubber::get_preemptor() +{ + return preemption_data; +} + +/* + * Process note: called for the arriving "give me your map, replica!" request. Unlike + * the original implementation, we do not requeue the Op waiting for + * updates. Instead - we trigger the FSM. + */ +void PgScrubber::replica_scrub_op(OpRequestRef op) +{ + op->mark_started(); + auto msg = op->get_req(); + dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch + << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl; + + // are we still processing a previous scrub-map request without noticing that the + // interval changed? won't see it here, but rather at the reservation stage. 
+ + if (msg->map_epoch < m_pg->info.history.same_interval_since) { + dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch + << " < " << m_pg->info.history.same_interval_since << dendl; + + // is there a general sync issue? are we holding a stale reservation? + // not checking now - assuming we will actively react to interval change. + + return; + } + + if (is_queued_or_active()) { + // this is bug! + // Somehow, we have received a new scrub request from our Primary, before + // having finished with the previous one. Did we go through an interval + // change without reseting the FSM? Possible responses: + // - crashing (the original assert_not_active() implemented that one), or + // - trying to recover: + // - (logging enough information to debug this scenario) + // - reset the FSM. + m_osds->clog->warn() + << __func__ + << ": error: a second scrub-op received while handling the previous one"; + + scrub_clear_state(); + m_osds->clog->warn() << __func__ + << ": after a reset. Now handling the new OP"; + } + // make sure the FSM is at NotActive + m_fsm->assert_not_active(); + + replica_scrubmap = ScrubMap{}; + replica_scrubmap_pos = ScrubMapBuilder{}; + + m_replica_min_epoch = msg->min_epoch; + m_start = msg->start; + m_end = msg->end; + m_max_end = msg->end; + m_is_deep = msg->deep; + m_interval_start = m_pg->info.history.same_interval_since; + m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority + : Scrub::scrub_prio_t::low_priority; + m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority(); + + preemption_data.reset(); + preemption_data.force_preemptability(msg->allow_preemption); + + replica_scrubmap_pos.reset(); + + set_queued_or_active(); + m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, + m_flags.priority, m_current_token); +} + +void PgScrubber::set_op_parameters(requested_scrub_t& request) +{ + dout(10) << __func__ << " input: " << request << dendl; + + // write down the epoch of starting a new scrub. Will be used + // to discard stale messages from previous aborted scrubs. + m_epoch_start = m_pg->get_osdmap_epoch(); + + m_flags.check_repair = request.check_repair; + m_flags.auto_repair = request.auto_repair || request.need_auto; + m_flags.required = request.req_scrub || request.must_scrub; + + m_flags.priority = (request.must_scrub || request.need_auto) + ? get_pg_cct()->_conf->osd_requested_scrub_priority + : m_pg->get_scrub_priority(); + + state_set(PG_STATE_SCRUBBING); + + // will we be deep-scrubbing? + if (request.must_deep_scrub || request.need_auto || request.time_for_deep) { + state_set(PG_STATE_DEEP_SCRUB); + } + + // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e. + // deep-scrub with the auto_repair configuration flag set). m_is_repair value + // determines the scrubber behavior. + // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the + // PG status as appearing in the logs). 
+ m_is_repair = request.must_repair || m_flags.auto_repair; + if (request.must_repair) { + state_set(PG_STATE_REPAIR); + // not calling update_op_mode_text() yet, as m_is_deep not set yet + } + + // the publishing here seems to be required for tests synchronization + m_pg->publish_stats_to_osd(); + m_flags.deep_scrub_on_error = request.deep_scrub_on_error; +} + +void PgScrubber::scrub_compare_maps() +{ + dout(10) << __func__ << " has maps, analyzing" << dendl; + + // construct authoritative scrub map for type-specific scrubbing + m_cleaned_meta_map.insert(m_primary_scrubmap); + map, std::optional>> missing_digest; + + map maps; + maps[m_pg_whoami] = &m_primary_scrubmap; + + for (const auto& i : m_pg->get_actingset()) { + if (i == m_pg_whoami) + continue; + dout(2) << __func__ << " replica " << i << " has " + << m_received_maps[i].objects.size() << " items" << dendl; + maps[i] = &m_received_maps[i]; + } + + set master_set; + + // Construct master set + for (const auto& map : maps) { + for (const auto& i : map.second->objects) { + master_set.insert(i.first); + } + } + + stringstream ss; + m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss); + + if (!ss.str().empty()) { + m_osds->clog->warn(ss); + } + + if (m_pg->recovery_state.get_actingset().size() > 1) { + + dout(10) << __func__ << " comparing replica scrub maps" << dendl; + + // Map from object with errors to good peer + map> authoritative; + + dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has " + << m_primary_scrubmap.objects.size() << " items" << dendl; + + ss.str(""); + ss.clear(); + + m_pg->get_pgbackend()->be_compare_scrubmaps( + maps, master_set, m_is_repair, m_missing, m_inconsistent, + authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(), + m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss); + + if (!ss.str().empty()) { + m_osds->clog->error(ss); + } + + for (auto& i : authoritative) { + list> good_peers; + for (list::const_iterator j = i.second.begin(); j != i.second.end(); + ++j) { + good_peers.emplace_back(maps[*j]->objects[i.first], *j); + } + m_authoritative.emplace(i.first, good_peers); + } + + for (auto i = authoritative.begin(); i != authoritative.end(); ++i) { + m_cleaned_meta_map.objects.erase(i->first); + m_cleaned_meta_map.objects.insert( + *(maps[i->second.back()]->objects.find(i->first))); + } + } + + auto for_meta_scrub = clean_meta_map(); + + // ok, do the pg-type specific scrubbing + + // (Validates consistency of the object info and snap sets) + scrub_snapshot_metadata(for_meta_scrub, missing_digest); + + // Called here on the primary can use an authoritative map if it isn't the primary + _scan_snaps(for_meta_scrub); + + if (!m_store->empty()) { + + if (m_is_repair) { + dout(10) << __func__ << ": discarding scrub results" << dendl; + m_store->flush(nullptr); + } else { + dout(10) << __func__ << ": updating scrub object" << dendl; + ObjectStore::Transaction t; + m_store->flush(&t); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + } + } +} + +ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg( + PreemptionNoted was_preempted) +{ + dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl; + + auto reply = + make_message(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), + m_replica_min_epoch, m_pg_whoami); + + reply->preempted = (was_preempted == PreemptionNoted::preempted); + ::encode(replica_scrubmap, reply->get_data()); + + return ScrubMachineListener::MsgAndEpoch{reply, 
m_replica_min_epoch}; +} + +void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared) +{ + m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg, + preprepared.m_epoch, false); +} + +void PgScrubber::send_preempted_replica() +{ + auto reply = + make_message(spg_t{m_pg->info.pgid.pgid, m_pg->get_primary().shard}, + m_replica_min_epoch, m_pg_whoami); + + reply->preempted = true; + ::encode(replica_scrubmap, reply->get_data()); // must not skip this + m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch, false); +} + +/* + * - if the replica lets us know it was interrupted, we mark the chunk as interrupted. + * The state-machine will react to that when all replica maps are received. + * - when all maps are received, we signal the FSM with the GotReplicas event (see + * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the + * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to + * handle. + */ +void PgScrubber::map_from_replica(OpRequestRef op) +{ + auto m = op->get_req(); + dout(15) << __func__ << " " << *m << dendl; + + if (m->map_epoch < m_pg->info.history.same_interval_since) { + dout(10) << __func__ << " discarding old from " << m->map_epoch << " < " + << m_pg->info.history.same_interval_since << dendl; + return; + } + + auto p = const_cast(m->get_data()).cbegin(); + + m_received_maps[m->from].decode(p, m_pg->info.pgid.pool()); + dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl; + + auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from); + if (!is_ok) { + // previously an unexpected map was triggering an assert. Now, as scrubs can be + // aborted at any time, the chances of this happening have increased, and aborting is + // not justified + dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl; + return; + } + + if (m->preempted) { + dout(10) << __func__ << " replica was preempted, setting flag" << dendl; + preemption_data.do_preempt(); + } + + if (m_maps_status.are_all_maps_available()) { + dout(15) << __func__ << " all repl-maps available" << dendl; + m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops()); + } +} + +void PgScrubber::handle_scrub_reserve_request(OpRequestRef op) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + auto request_ep = op->get_req()->get_map_epoch(); + + /* + * if we are currently holding a reservation, then: + * either (1) we, the scrubber, did not yet notice an interval change. The remembered + * reservation epoch is from before our interval, and we can silently discard the + * reservation (no message is required). + * or: + * (2) the interval hasn't changed, but the same Primary that (we think) holds the + * lock just sent us a new request. Note that we know it's the same Primary, as + * otherwise the interval would have changed. + * Ostensibly we can discard & redo the reservation. But then we + * will be temporarily releasing the OSD resource - and might not be able to grab it + * again. Thus, we simply treat this as a successful new request + * (but mark the fact that if there is a previous request from the primary to + * scrub a specific chunk - that request is now defunct). 
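+   * In short: a stale request is not acked at all; a same-interval repeat is
+   * re-granted, but advance_token() is called first, so any chunk request
+   * still in flight from the earlier exchange is now ignored.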
+ */ + + if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) { + // we are holding a stale reservation from a past epoch + m_remote_osd_resource.reset(); + dout(10) << __func__ << " cleared existing stale reservation" << dendl; + } + + if (request_ep < m_pg->get_same_interval_since()) { + // will not ack stale requests + return; + } + + bool granted{false}; + if (m_remote_osd_resource.has_value()) { + + dout(10) << __func__ << " already reserved." << dendl; + + /* + * it might well be that we did not yet finish handling the latest scrub-op from + * our primary. This happens, for example, if 'noscrub' was set via a command, then + * reset. The primary in this scenario will remain in the same interval, but we do need + * to reset our internal state (otherwise - the first renewed 'give me your scrub map' + * from the primary will see us in active state, crashing the OSD). + */ + advance_token(); + granted = true; + + } else if (m_pg->cct->_conf->osd_scrub_during_recovery || + !m_osds->is_recovery_active()) { + m_remote_osd_resource.emplace(this, m_pg, m_osds, request_ep); + // OSD resources allocated? + granted = m_remote_osd_resource->is_reserved(); + if (!granted) { + // just forget it + m_remote_osd_resource.reset(); + dout(20) << __func__ << ": failed to reserve remotely" << dendl; + } + } + + dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl; + + Message* reply = new MOSDScrubReserve( + spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep, + granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami); + + m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection()); +} + +void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + if (m_reservations.has_value()) { + m_reservations->handle_reserve_grant(op, from); + } else { + dout(20) << __func__ << ": late/unsolicited reservation grant from osd " + << from << " (" << op << ")" << dendl; + } +} + +void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + if (m_reservations.has_value()) { + // there is an active reservation process. No action is required otherwise. + m_reservations->handle_reserve_reject(op, from); + } +} + +void PgScrubber::handle_scrub_reserve_release(OpRequestRef op) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + /* + * this specific scrub session has terminated. All incoming events carrying the old + * tag will be discarded. 
+ */ + advance_token(); + m_remote_osd_resource.reset(); +} + +void PgScrubber::discard_replica_reservations() +{ + dout(10) << __func__ << dendl; + if (m_reservations.has_value()) { + m_reservations->discard_all(); + } +} + +void PgScrubber::clear_scrub_reservations() +{ + dout(10) << __func__ << dendl; + m_reservations.reset(); // the remote reservations + m_local_osd_resource.reset(); // the local reservation + m_remote_osd_resource.reset(); // we as replica reserved for a Primary +} + +void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text) +{ + ceph_assert(m_pg->recovery_state.get_backfill_targets().empty()); + + std::vector> messages; + messages.reserve(m_pg->get_actingset().size()); + + epoch_t epch = get_osdmap_epoch(); + + for (auto& p : m_pg->get_actingset()) { + + if (p == m_pg_whoami) + continue; + + dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch + << dendl; + Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode, + m_pg_whoami); + messages.push_back(std::make_pair(p.osd, m)); + } + + if (!messages.empty()) { + m_osds->send_message_osd_cluster(messages, epch); + } +} + +void PgScrubber::unreserve_replicas() +{ + dout(10) << __func__ << dendl; + m_reservations.reset(); +} + +void PgScrubber::set_queued_or_active() +{ + m_queued_or_active = true; +} + +void PgScrubber::clear_queued_or_active() +{ + m_queued_or_active = false; +} + +bool PgScrubber::is_queued_or_active() const +{ + return m_queued_or_active; +} + +[[nodiscard]] bool PgScrubber::scrub_process_inconsistent() +{ + dout(10) << __func__ << ": checking authoritative (mode=" + << m_mode_desc << ", auth remaining #: " << m_authoritative.size() + << ")" << dendl; + + // authoritative only store objects which are missing or inconsistent. + if (!m_authoritative.empty()) { + + stringstream ss; + ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, " + << m_inconsistent.size() << " inconsistent objects"; + dout(2) << ss.str() << dendl; + m_osds->clog->error(ss); + + if (m_is_repair) { + state_clear(PG_STATE_CLEAN); + // we know we have a problem, so it's OK to set the user-visible flag + // even if we only reached here via auto-repair + state_set(PG_STATE_REPAIR); + update_op_mode_text(); + + for (const auto& [hobj, shrd_list] : m_authoritative) { + + auto missing_entry = m_missing.find(hobj); + + if (missing_entry != m_missing.end()) { + m_pg->repair_object(hobj, shrd_list, missing_entry->second); + m_fixed_count += missing_entry->second.size(); + } + + if (m_inconsistent.count(hobj)) { + m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]); + m_fixed_count += m_inconsistent[hobj].size(); + } + } + } + } + return (!m_authoritative.empty() && m_is_repair); +} + +/* + * note: only called for the Primary. + */ +void PgScrubber::scrub_finish() +{ + dout(10) << __func__ << " before flags: " << m_flags + << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair") + << ". 
deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl; + + ceph_assert(m_pg->is_locked()); + ceph_assert(is_queued_or_active()); + + m_pg->m_planned_scrub = requested_scrub_t{}; + + // if the repair request comes from auto-repair and large number of errors, + // we would like to cancel auto-repair + if (m_is_repair && m_flags.auto_repair && + m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { + + dout(10) << __func__ << " undoing the repair" << dendl; + state_clear(PG_STATE_REPAIR); // not expected to be set, anyway + m_is_repair = false; + update_op_mode_text(); + } + + bool do_auto_scrub = false; + + // if a regular scrub had errors within the limit, do a deep scrub to auto repair + if (m_flags.deep_scrub_on_error && !m_authoritative.empty() && + m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { + ceph_assert(!m_is_deep); + do_auto_scrub = true; + dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl; + } + + m_flags.deep_scrub_on_error = false; + + // type-specific finish (can tally more errors) + _scrub_finish(); + + bool has_error = scrub_process_inconsistent(); + + { + stringstream oss; + oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " "; + int total_errors = m_shallow_errors + m_deep_errors; + if (total_errors) + oss << total_errors << " errors"; + else + oss << "ok"; + if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors) + oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors + << " remaining deep scrub error details lost)"; + if (m_is_repair) + oss << ", " << m_fixed_count << " fixed"; + if (total_errors) + m_osds->clog->error(oss); + else + m_osds->clog->debug(oss); + } + + // Since we don't know which errors were fixed, we can only clear them + // when every one has been fixed. + if (m_is_repair) { + if (m_fixed_count == m_shallow_errors + m_deep_errors) { + + ceph_assert(m_is_deep); + m_shallow_errors = 0; + m_deep_errors = 0; + dout(20) << __func__ << " All may be fixed" << dendl; + + } else if (has_error) { + + // Deep scrub in order to get corrected error counts + m_pg->scrub_after_recovery = true; + m_pg->m_planned_scrub.req_scrub = + m_pg->m_planned_scrub.req_scrub || m_flags.required; + + dout(20) << __func__ << " Current 'required': " << m_flags.required + << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl; + + } else if (m_shallow_errors || m_deep_errors) { + + // We have errors but nothing can be fixed, so there is no repair + // possible. 
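+      // (presumably surfaced to operators as the 'failed_repair' PG state in
+      //  status output)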
+ state_set(PG_STATE_FAILED_REPAIR); + dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors) + << " error(s) present with no repair possible" << dendl; + } + } + + { + // finish up + ObjectStore::Transaction t; + m_pg->recovery_state.update_stats( + [this](auto& history, auto& stats) { + dout(10) << "m_pg->recovery_state.update_stats()" << dendl; + utime_t now = ceph_clock_now(); + history.last_scrub = m_pg->recovery_state.get_info().last_update; + history.last_scrub_stamp = now; + if (m_is_deep) { + history.last_deep_scrub = m_pg->recovery_state.get_info().last_update; + history.last_deep_scrub_stamp = now; + } + + if (m_is_deep) { + if ((m_shallow_errors == 0) && (m_deep_errors == 0)) + history.last_clean_scrub_stamp = now; + stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; + stats.stats.sum.num_deep_scrub_errors = m_deep_errors; + stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects; + stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes; + stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys; + dout(25) << "scrub_finish shard " << m_pg_whoami + << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes + << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl; + } else { + stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; + // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent + // because of deep-scrub errors + if (m_shallow_errors == 0) + history.last_clean_scrub_stamp = now; + } + stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors + + stats.stats.sum.num_deep_scrub_errors; + if (m_flags.check_repair) { + m_flags.check_repair = false; + if (m_pg->info.stats.stats.sum.num_scrub_errors) { + state_set(PG_STATE_FAILED_REPAIR); + dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors + << " error(s) still present after re-scrub" << dendl; + } + } + return true; + }, + &t); + int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + ceph_assert(tr == 0); + } + + if (has_error) { + m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared( + get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery()))); + } else { + m_is_repair = false; + state_clear(PG_STATE_REPAIR); + update_op_mode_text(); + } + + cleanup_on_finish(); + if (do_auto_scrub) { + request_rescrubbing(m_pg->m_planned_scrub); + } + + if (m_pg->is_active() && m_pg->is_primary()) { + m_pg->recovery_state.share_pg_info(); + } + + // we may have blocked the snap trimmer + m_pg->snap_trimmer_scrub_complete(); +} + +void PgScrubber::on_digest_updates() +{ + dout(10) << __func__ << " #pending: " << num_digest_updates_pending + << (m_end.is_max() ? " " : " ") + << (is_queued_or_active() ? "" : " ** not marked as scrubbing **") + << dendl; + + if (num_digest_updates_pending > 0) { + // do nothing for now. We will be called again when new updates arrive + return; + } + + // got all updates, and finished with this chunk. Any more? + if (m_end.is_max()) { + m_osds->queue_scrub_is_finished(m_pg); + } else { + // go get a new chunk (via "requeue") + preemption_data.reset(); + m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops()); + } +} + + +/* + * note that the flags-set fetched from the PG (m_pg->m_planned_scrub) + * is cleared once scrubbing starts; Some of the values dumped here are + * thus transitory. 
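+ * A hypothetical excerpt of the output, with field names taken from the code
+ * below and the values invented for illustration:
+ *   "scrubber": { "epoch_start": "312", "active": true, "start": "...",
+ *                 "deep": false, "shallow_errors": 0, "fixed": 0, ... }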
+ */ +void PgScrubber::dump(ceph::Formatter* f) const +{ + f->open_object_section("scrubber"); + f->dump_stream("epoch_start") << m_interval_start; + f->dump_bool("active", m_active); + if (m_active) { + f->dump_stream("start") << m_start; + f->dump_stream("end") << m_end; + f->dump_stream("m_max_end") << m_max_end; + f->dump_stream("subset_last_update") << m_subset_last_update; + f->dump_bool("deep", m_is_deep); + f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required)); + f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub); + f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair); + f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto); + f->dump_bool("req_scrub", m_flags.required); + f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep); + f->dump_bool("auto_repair", m_flags.auto_repair); + f->dump_bool("check_repair", m_flags.check_repair); + f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error); + f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t + f->dump_unsigned("priority", m_flags.priority); + f->dump_int("shallow_errors", m_shallow_errors); + f->dump_int("deep_errors", m_deep_errors); + f->dump_int("fixed", m_fixed_count); + { + f->open_array_section("waiting_on_whom"); + for (const auto& p : m_maps_status.get_awaited()) { + f->dump_stream("shard") << p; + } + f->close_section(); + } + } + f->close_section(); +} + + +void PgScrubber::handle_query_state(ceph::Formatter* f) +{ + dout(10) << __func__ << dendl; + + f->open_object_section("scrub"); + f->dump_stream("scrubber.epoch_start") << m_interval_start; + f->dump_bool("scrubber.active", m_active); + f->dump_stream("scrubber.start") << m_start; + f->dump_stream("scrubber.end") << m_end; + f->dump_stream("scrubber.m_max_end") << m_max_end; + f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update; + f->dump_bool("scrubber.deep", m_is_deep); + { + f->open_array_section("scrubber.waiting_on_whom"); + for (const auto& p : m_maps_status.get_awaited()) { + f->dump_stream("shard") << p; + } + f->close_section(); + } + + f->dump_string("comment", "DEPRECATED - may be removed in the next release"); + + f->close_section(); +} + +PgScrubber::~PgScrubber() = default; + +PgScrubber::PgScrubber(PG* pg) + : m_pg{pg} + , m_pg_id{pg->pg_id} + , m_osds{m_pg->osd} + , m_pg_whoami{pg->pg_whoami} + , preemption_data{pg} +{ + m_fsm = std::make_unique(m_pg, this); + m_fsm->initiate(); +} + +void PgScrubber::scrub_begin() +{ + stringstream ss; + ss << m_pg->info.pgid.pgid << " " << m_mode_desc << " starts"; + dout(2) << ss.str() << dendl; + m_osds->clog->debug(ss); +} + +void PgScrubber::reserve_replicas() +{ + dout(10) << __func__ << dendl; + m_reservations.emplace(m_pg, m_pg_whoami); +} + +void PgScrubber::cleanup_on_finish() +{ + dout(10) << __func__ << dendl; + ceph_assert(m_pg->is_locked()); + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + m_pg->publish_stats_to_osd(); + + clear_scrub_reservations(); + m_pg->publish_stats_to_osd(); + + requeue_waiting(); + + reset_internal_state(); + m_flags = scrub_flags_t{}; + + // type-specific state clear + _scrub_clear_state(); +} + +// uses process_event(), so must be invoked externally +void PgScrubber::scrub_clear_state() +{ + dout(10) << __func__ << dendl; + + clear_pgscrub_state(); + m_fsm->process_event(FullReset{}); +} + +/* + * note: does not access the state-machine + */ +void PgScrubber::clear_pgscrub_state() +{ + dout(10) << __func__ << dendl; + 
ceph_assert(m_pg->is_locked()); + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + + state_clear(PG_STATE_REPAIR); + + clear_scrub_reservations(); + m_pg->publish_stats_to_osd(); + + requeue_waiting(); + + reset_internal_state(); + m_flags = scrub_flags_t{}; + + // type-specific state clear + _scrub_clear_state(); +} + +void PgScrubber::replica_handling_done() +{ + dout(10) << __func__ << dendl; + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + + reset_internal_state(); + + m_pg->publish_stats_to_osd(); +} + +/* + * note: performs run_callbacks() + * note: reservations-related variables are not reset here + */ +void PgScrubber::reset_internal_state() +{ + dout(10) << __func__ << dendl; + + preemption_data.reset(); + m_maps_status.reset(); + m_received_maps.clear(); + + m_start = hobject_t{}; + m_end = hobject_t{}; + m_max_end = hobject_t{}; + m_subset_last_update = eversion_t{}; + m_shallow_errors = 0; + m_deep_errors = 0; + m_fixed_count = 0; + m_omap_stats = (const struct omap_stat_t){0}; + + run_callbacks(); + + m_inconsistent.clear(); + m_missing.clear(); + m_authoritative.clear(); + num_digest_updates_pending = 0; + m_primary_scrubmap = ScrubMap{}; + m_primary_scrubmap_pos.reset(); + replica_scrubmap = ScrubMap{}; + replica_scrubmap_pos.reset(); + m_cleaned_meta_map = ScrubMap{}; + m_needs_sleep = true; + m_sleep_started_at = utime_t{}; + + m_active = false; + clear_queued_or_active(); +} + +// note that only applicable to the Replica: +void PgScrubber::advance_token() +{ + dout(10) << __func__ << " was: " << m_current_token << dendl; + m_current_token++; + + // when advance_token() is called, it is assumed that no scrubbing takes place. + // We will, though, verify that. And if we are actually still handling a stale request - + // both our internal state and the FSM state will be cleared. + replica_handling_done(); + m_fsm->process_event(FullReset{}); +} + +bool PgScrubber::is_token_current(Scrub::act_token_t received_token) +{ + if (received_token == 0 || received_token == m_current_token) { + return true; + } + dout(5) << __func__ << " obsolete token (" << received_token + << " vs current " << m_current_token << dendl; + + return false; +} + +const OSDMapRef& PgScrubber::get_osdmap() const +{ + return m_pg->get_osdmap(); +} + +ostream& operator<<(ostream& out, const PgScrubber& scrubber) +{ + return out << scrubber.m_flags; +} + +std::ostream& PgScrubber::gen_prefix(std::ostream& out) const +{ + const auto fsm_state = m_fsm ? 
m_fsm->current_states_desc() : "- :"; + if (m_pg) { + return m_pg->gen_prefix(out) << "scrubber " << fsm_state << ": "; + } else { + return out << " scrubber [~] " << fsm_state << ": "; + } +} + +ostream& PgScrubber::show(ostream& out) const +{ + return out << " [ " << m_pg_id << ": " << m_flags << " ] "; +} + +// ///////////////////// preemption_data_t ////////////////////////////////// + +PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg} +{ + m_left = static_cast( + m_pg->get_cct()->_conf.get_val("osd_scrub_max_preemptions")); +} + +void PgScrubber::preemption_data_t::reset() +{ + std::lock_guard lk{m_preemption_lock}; + + m_preemptable = false; + m_preempted = false; + m_left = + static_cast(m_pg->cct->_conf.get_val("osd_scrub_max_preemptions")); + m_size_divisor = 1; +} + + +// ///////////////////// ReplicaReservations ////////////////////////////////// +namespace Scrub { + +void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch) +{ + auto m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, peer.shard), epoch, + MOSDScrubReserve::RELEASE, m_pg->pg_whoami); + m_osds->send_message_osd_cluster(peer.osd, m, epoch); +} + +ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami) + : m_pg{pg} + , m_acting_set{pg->get_actingset()} + , m_osds{m_pg->osd} + , m_pending{static_cast(m_acting_set.size()) - 1} +{ + epoch_t epoch = m_pg->get_osdmap_epoch(); + + { + std::stringstream prefix; + prefix << "osd." << m_osds->whoami << " ep: " << epoch + << " scrubber::ReplicaReservations pg[" << pg->pg_id << "]: "; + m_log_msg_prefix = prefix.str(); + } + + // handle the special case of no replicas + if (m_pending <= 0) { + // just signal the scrub state-machine to continue + send_all_done(); + + } else { + + for (auto p : m_acting_set) { + if (p == whoami) + continue; + auto m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epoch, + MOSDScrubReserve::REQUEST, m_pg->pg_whoami); + m_osds->send_message_osd_cluster(p.osd, m, epoch); + m_waited_for_peers.push_back(p); + dout(10) << __func__ << ": reserve " << p.osd << dendl; + } + } +} + +void ReplicaReservations::send_all_done() +{ + m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority); +} + +void ReplicaReservations::send_reject() +{ + m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority); +} + +void ReplicaReservations::discard_all() +{ + dout(10) << __func__ << ": " << m_reserved_peers << dendl; + + m_had_rejections = true; // preventing late-coming responses from triggering events + m_reserved_peers.clear(); + m_waited_for_peers.clear(); +} + +ReplicaReservations::~ReplicaReservations() +{ + m_had_rejections = true; // preventing late-coming responses from triggering events + + // send un-reserve messages to all reserved replicas. We do not wait for answer (there + // wouldn't be one). Other incoming messages will be discarded on the way, by our + // owner. + epoch_t epoch = m_pg->get_osdmap_epoch(); + + for (auto& p : m_reserved_peers) { + release_replica(p, epoch); + } + m_reserved_peers.clear(); + + // note: the release will follow on the heels of the request. When tried otherwise, + // grants that followed a reject arrived after the whole scrub machine-state was + // reset, causing leaked reservations. + for (auto& p : m_waited_for_peers) { + release_replica(p, epoch); + } + m_waited_for_peers.clear(); +} + +/** + * @ATTN we would not reach here if the ReplicaReservation object managed by the + * scrubber was reset. 
+ */ +void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << ": granted by " << from << dendl; + op->mark_started(); + + { + // reduce the amount of extra release messages. Not a must, but the log is cleaner + auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from); + if (w != m_waited_for_peers.end()) + m_waited_for_peers.erase(w); + } + + // are we forced to reject the reservation? + if (m_had_rejections) { + + dout(10) << __func__ << ": rejecting late-coming reservation from " + << from << dendl; + release_replica(from, m_pg->get_osdmap_epoch()); + + } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) != + m_reserved_peers.end()) { + + dout(10) << __func__ << ": already had osd." << from << " reserved" << dendl; + + } else { + + dout(10) << __func__ << ": osd." << from << " scrub reserve = success" + << dendl; + m_reserved_peers.push_back(from); + if (--m_pending == 0) { + send_all_done(); + } + } +} + +void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << ": rejected by " << from << dendl; + dout(15) << __func__ << ": " << *op->get_req() << dendl; + op->mark_started(); + + { + // reduce the amount of extra release messages. Not a must, but the log is cleaner + auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from); + if (w != m_waited_for_peers.end()) + m_waited_for_peers.erase(w); + } + + if (m_had_rejections) { + + // our failure was already handled when the first rejection arrived + dout(15) << __func__ << ": ignoring late-coming rejection from " + << from << dendl; + + } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) != + m_reserved_peers.end()) { + + dout(10) << __func__ << ": already had osd." << from << " reserved" << dendl; + + } else { + + dout(10) << __func__ << ": osd." << from << " scrub reserve = fail" << dendl; + m_had_rejections = true; // preventing any additional notifications + send_reject(); + } +} + +std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const +{ + return out << m_log_msg_prefix; +} + +// ///////////////////// LocalReservation ////////////////////////////////// + +// note: no dout()s in LocalReservation functions. Client logs interactions. 
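+/*
+ * An illustrative usage sketch (not part of the interface; 'osds' stands for
+ * the owning OSDService, as in PgScrubber::reserve_local()):
+ *
+ *   std::optional<Scrub::LocalReservation> budget;
+ *   budget.emplace(osds);              // try to take one local scrub slot
+ *   if (!budget->is_reserved()) {
+ *     budget.reset();                  // denied - nothing to give back
+ *   }
+ *   ...
+ *   budget.reset();                    // the destructor calls dec_scrubs_local()
+ */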
+LocalReservation::LocalReservation(OSDService* osds) + : m_osds{osds} +{ + if (m_osds->inc_scrubs_local()) { + // the failure is signalled by not having m_holding_local_reservation set + m_holding_local_reservation = true; + } +} + +LocalReservation::~LocalReservation() +{ + if (m_holding_local_reservation) { + m_holding_local_reservation = false; + m_osds->dec_scrubs_local(); + } +} + +// ///////////////////// ReservedByRemotePrimary /////////////////////////////// + +ReservedByRemotePrimary::ReservedByRemotePrimary(const PgScrubber* scrubber, + PG* pg, + OSDService* osds, + epoch_t epoch) + : m_scrubber{scrubber} + , m_pg{pg} + , m_osds{osds} + , m_reserved_at{epoch} +{ + if (!m_osds->inc_scrubs_remote()) { + dout(10) << __func__ << ": failed to reserve at Primary request" << dendl; + // the failure is signalled by not having m_reserved_by_remote_primary set + return; + } + + dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl; + m_reserved_by_remote_primary = true; +} + +bool ReservedByRemotePrimary::is_stale() const +{ + return m_reserved_at < m_pg->get_same_interval_since(); +} + +ReservedByRemotePrimary::~ReservedByRemotePrimary() +{ + if (m_reserved_by_remote_primary) { + m_reserved_by_remote_primary = false; + m_osds->dec_scrubs_remote(); + } +} + +std::ostream& ReservedByRemotePrimary::gen_prefix(std::ostream& out) const +{ + return m_scrubber->gen_prefix(out); +} + +// ///////////////////// MapsCollectionStatus //////////////////////////////// + +auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from) + -> std::tuple +{ + auto fe = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from); + if (fe != m_maps_awaited_for.end()) { + // we are indeed waiting for a map from this replica + m_maps_awaited_for.erase(fe); + return std::tuple{true, ""sv}; + } else { + return std::tuple{false, " unsolicited scrub-map"sv}; + } +} + +void MapsCollectionStatus::reset() +{ + *this = MapsCollectionStatus{}; +} + +std::string MapsCollectionStatus::dump() const +{ + std::string all; + for (const auto& rp : m_maps_awaited_for) { + all.append(rp.get_osd() + " "s); + } + return all; +} + +ostream& operator<<(ostream& out, const MapsCollectionStatus& sf) +{ + out << " [ "; + for (const auto& rp : sf.m_maps_awaited_for) { + out << rp.get_osd() << " "; + } + if (!sf.m_local_map_ready) { + out << " local "; + } + return out << " ] "; +} + +} // namespace Scrub diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h new file mode 100644 index 000000000..392a4a588 --- /dev/null +++ b/src/osd/pg_scrubber.h @@ -0,0 +1,821 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "PG.h" +#include "ScrubStore.h" +#include "scrub_machine_lstnr.h" +#include "scrubber_common.h" + +class Callback; + +namespace Scrub { +class ScrubMachine; +struct BuildMap; + +/** + * Reserving/freeing scrub resources at the replicas. + * + * When constructed - sends reservation requests to the acting_set. + * A rejection triggers a "couldn't acquire the replicas' scrub resources" event. + * All previous requests, whether already granted or not, are explicitly released. + * + * A note re performance: I've measured a few container alternatives for + * m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as + * expected. flat_set is only slightly better. 
Surprisingly - std::vector (with no + * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve. + */ +class ReplicaReservations { + using OrigSet = decltype(std::declval().get_actingset()); + + PG* m_pg; + OrigSet m_acting_set; + OSDService* m_osds; + std::vector m_waited_for_peers; + std::vector m_reserved_peers; + bool m_had_rejections{false}; + int m_pending{-1}; + + void release_replica(pg_shard_t peer, epoch_t epoch); + + void send_all_done(); ///< all reservations are granted + + /// notify the scrubber that we have failed to reserve replicas' resources + void send_reject(); + + public: + std::string m_log_msg_prefix; + + /** + * quietly discard all knowledge about existing reservations. No messages + * are sent to peers. + * To be used upon interval change, as we know the the running scrub is no longer + * relevant, and that the replicas had reset the reservations on their side. + */ + void discard_all(); + + ReplicaReservations(PG* pg, pg_shard_t whoami); + + ~ReplicaReservations(); + + void handle_reserve_grant(OpRequestRef op, pg_shard_t from); + + void handle_reserve_reject(OpRequestRef op, pg_shard_t from); + + std::ostream& gen_prefix(std::ostream& out) const; +}; + +/** + * wraps the local OSD scrub resource reservation in an RAII wrapper + */ +class LocalReservation { + OSDService* m_osds; + bool m_holding_local_reservation{false}; + + public: + LocalReservation(OSDService* osds); + ~LocalReservation(); + bool is_reserved() const { return m_holding_local_reservation; } +}; + +/** + * wraps the OSD resource we are using when reserved as a replica by a scrubbing master. + */ +class ReservedByRemotePrimary { + const PgScrubber* m_scrubber; ///< we will be using its gen_prefix() + PG* m_pg; + OSDService* m_osds; + bool m_reserved_by_remote_primary{false}; + const epoch_t m_reserved_at; + + public: + ReservedByRemotePrimary(const PgScrubber* scrubber, PG* pg, OSDService* osds, epoch_t epoch); + ~ReservedByRemotePrimary(); + [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; } + + /// compare the remembered reserved-at epoch to the current interval + [[nodiscard]] bool is_stale() const; + + std::ostream& gen_prefix(std::ostream& out) const; +}; + +/** + * Once all replicas' scrub maps are received, we go on to compare the maps. That is - + * unless we we have not yet completed building our own scrub map. MapsCollectionStatus + * combines the status of waiting for both the local map and the replicas, without + * resorting to adding dummy entries into a list. + */ +class MapsCollectionStatus { + + bool m_local_map_ready{false}; + std::vector m_maps_awaited_for; + + public: + [[nodiscard]] bool are_all_maps_available() const + { + return m_local_map_ready && m_maps_awaited_for.empty(); + } + + void mark_local_map_ready() { m_local_map_ready = true; } + + void mark_replica_map_request(pg_shard_t from_whom) + { + m_maps_awaited_for.push_back(from_whom); + } + + /// @returns true if indeed waiting for this one. Otherwise: an error string + auto mark_arriving_map(pg_shard_t from) -> std::tuple; + + std::vector get_awaited() const { return m_maps_awaited_for; } + + void reset(); + + std::string dump() const; + + friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf); +}; + +} // namespace Scrub + + +/** + * the scrub operation flags. Primary only. + * Set at scrub start. Checked in multiple locations - mostly + * at finish. 
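+ * For example, scrub_finish() consults auto_repair and deep_scrub_on_error
+ * when deciding whether to cancel an automatic repair or to schedule a
+ * follow-up deep scrub, and 'required' is carried over into the next planned
+ * scrub via req_scrub.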
+ */ +struct scrub_flags_t { + + unsigned int priority{0}; + + /** + * set by queue_scrub() if either planned_scrub.auto_repair or + * need_auto were set. + * Tested at scrub end. + */ + bool auto_repair{false}; + + /// this flag indicates that we are scrubbing post repair to verify everything is fixed + bool check_repair{false}; + + /// checked at the end of the scrub, to possibly initiate a deep-scrub + bool deep_scrub_on_error{false}; + + /** + * scrub must not be aborted. + * Set for explicitly requested scrubs, and for scrubs originated by the pairing + * process with the 'repair' flag set (in the RequestScrub event). + */ + bool required{false}; +}; + +ostream& operator<<(ostream& out, const scrub_flags_t& sf); + + +/** + * The part of PG-scrubbing code that isn't state-machine wiring. + * + * Why the separation? I wish to move to a different FSM implementation. Thus I + * am forced to strongly decouple the state-machine implementation details from + * the actual scrubbing code. + */ +class PgScrubber : public ScrubPgIF, public ScrubMachineListener { + + public: + explicit PgScrubber(PG* pg); + + // ------------------ the I/F exposed to the PG (ScrubPgIF) ------------- + + /// are we waiting for resource reservation grants form our replicas? + [[nodiscard]] bool is_reserving() const final; + + void initiate_regular_scrub(epoch_t epoch_queued) final; + + void initiate_scrub_after_repair(epoch_t epoch_queued) final; + + void send_scrub_resched(epoch_t epoch_queued) final; + + void active_pushes_notification(epoch_t epoch_queued) final; + + void update_applied_notification(epoch_t epoch_queued) final; + + void send_scrub_unblock(epoch_t epoch_queued) final; + + void digest_update_notification(epoch_t epoch_queued) final; + + void send_replica_maps_ready(epoch_t epoch_queued) final; + + void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_replica_pushes_upd(epoch_t epoch_queued) final; + /** + * The PG has updated its 'applied version'. It might be that we are waiting for this + * information: after selecting a range of objects to scrub, we've marked the latest + * version of these objects in m_subset_last_update. We will not start the map building + * before we know that the PG has reached this version. + */ + void on_applied_when_primary(const eversion_t& applied_version) final; + + void send_full_reset(epoch_t epoch_queued) final; + + void send_chunk_free(epoch_t epoch_queued) final; + + void send_chunk_busy(epoch_t epoch_queued) final; + + void send_local_map_done(epoch_t epoch_queued) final; + + void send_maps_compared(epoch_t epoch_queued) final; + + void send_get_next_chunk(epoch_t epoch_queued) final; + + void send_scrub_is_finished(epoch_t epoch_queued) final; + + /** + * we allow some number of preemptions of the scrub, which mean we do + * not block. Then we start to block. Once we start blocking, we do + * not stop until the scrub range is completed. 
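+   * (put differently: a client write that falls inside the chunk currently
+   * being scrubbed either preempts the scrub, while the preemption budget
+   * lasts, or is made to wait until the chunk is finished)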
+ */ + bool write_blocked_by_scrub(const hobject_t& soid) final; + + /// true if the given range intersects the scrub interval in any way + bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final; + + /** + * we are a replica being asked by the Primary to reserve OSD resources for + * scrubbing + */ + void handle_scrub_reserve_request(OpRequestRef op) final; + + void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_release(OpRequestRef op) final; + void discard_replica_reservations() final; + void clear_scrub_reservations() final; // PG::clear... fwds to here + void unreserve_replicas() final; + + // managing scrub op registration + + void reg_next_scrub(const requested_scrub_t& request_flags) final; + + void unreg_next_scrub() final; + + void scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) final; + + /** + * Reserve local scrub resources (managed by the OSD) + * + * Fails if OSD's local-scrubs budget was exhausted + * \returns were local resources reserved? + */ + bool reserve_local() final; + + void handle_query_state(ceph::Formatter* f) final; + + void dump(ceph::Formatter* f) const override; + + // used if we are a replica + + void replica_scrub_op(OpRequestRef op) final; + + /// the op priority, taken from the primary's request message + Scrub::scrub_prio_t replica_op_priority() const final + { + return m_replica_request_priority; + }; + + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const final; + /// the version that refers to m_flags.priority + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final; + + void add_callback(Context* context) final { m_callbacks.push_back(context); } + + [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc + { + return !m_callbacks.empty(); + } + + /// handle a message carrying a replica map + void map_from_replica(OpRequestRef op) final; + + void scrub_clear_state() final; + + bool is_queued_or_active() const final; + + /** + * add to scrub statistics, but only if the soid is below the scrub start + */ + virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) override + { + ceph_assert(false); + } + + /** + * finalize the parameters of the initiated scrubbing session: + * + * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set; + * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set. + */ + void set_op_parameters(requested_scrub_t& request) final; + + void cleanup_store(ObjectStore::Transaction* t) final; + + bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const override + { + return false; + } + + // ------------------------------------------------------------------------------------------- + // the I/F used by the state-machine (i.e. 
the implementation of ScrubMachineListener) + + [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); } + + void select_range_n_notify() final; + + /// walk the log to find the latest update that affects our chunk + eversion_t search_log_for_updates() const final; + + eversion_t get_last_update_applied() const final + { + return m_pg->recovery_state.get_last_update_applied(); + } + + int pending_active_pushes() const final { return m_pg->active_pushes; } + + void on_init() final; + void on_replica_init() final; + void replica_handling_done() final; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services + /// (thus can be called from FSM reactions) + void clear_pgscrub_state() final; + + /* + * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep' + * is asserted - after a configuration-dependent timeout. + */ + void add_delayed_scheduling() final; + + void get_replicas_maps(bool replica_can_preempt) final; + + void on_digest_updates() final; + + void scrub_begin() final; + + void scrub_finish() final; + + ScrubMachineListener::MsgAndEpoch + prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final; + + void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final; + + void send_preempted_replica() final; + + void send_remotes_reserved(epoch_t epoch_queued) final; + void send_reservation_failure(epoch_t epoch_queued) final; + + /** + * does the PG have newer updates than what we (the scrubber) know? + */ + [[nodiscard]] bool has_pg_marked_new_updates() const final; + + void set_subset_last_update(eversion_t e) final; + + void maps_compare_n_cleanup() final; + + Scrub::preemption_t& get_preemptor() final; + + int build_primary_map_chunk() final; + + int build_replica_map_chunk() final; + + void reserve_replicas() final; + + [[nodiscard]] bool was_epoch_changed() const final; + + void set_queued_or_active() final; + void clear_queued_or_active() final; + + void mark_local_map_ready() final; + + [[nodiscard]] bool are_all_maps_available() const final; + + std::string dump_awaited_maps() const final; + + std::ostream& gen_prefix(std::ostream& out) const final; + + protected: + bool state_test(uint64_t m) const { return m_pg->state_test(m); } + void state_set(uint64_t m) { m_pg->state_set(m); } + void state_clear(uint64_t m) { m_pg->state_clear(m); } + + [[nodiscard]] bool is_scrub_registered() const; + + virtual void _scrub_clear_state() {} + + utime_t m_scrub_reg_stamp; ///< stamp we registered for + + ostream& show(ostream& out) const override; + + public: + // ------------------------------------------------------------------------------------------- + + friend ostream& operator<<(ostream& out, const PgScrubber& scrubber); + + static utime_t scrub_must_stamp() { return utime_t(1, 1); } + + virtual ~PgScrubber(); // must be defined separately, in the .cc file + + [[nodiscard]] bool is_scrub_active() const final { return m_active; } + + private: + void reset_internal_state(); + + /** + * the current scrubbing operation is done. We should mark that fact, so that + * all events related to the previous operation can be discarded. 
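+   * (the token is attached to every queued replica-scrub event, and
+   * is_token_current() then filters out events still carrying the old value)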
+ */ + void advance_token(); + + bool is_token_current(Scrub::act_token_t received_token); + + void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); } + + void _scan_snaps(ScrubMap& smap); + + ScrubMap clean_meta_map(); + + /** + * mark down some parameters of the initiated scrub: + * - the epoch when started; + * - the depth of the scrub requested (from the PG_STATE variable) + */ + void reset_epoch(epoch_t epoch_queued); + + void run_callbacks(); + + // ----- methods used to verify the relevance of incoming events: + + /** + * is the incoming event still relevant, and should be processed? + * + * It isn't if: + * - (1) we are no longer 'actively scrubbing'; or + * - (2) the message is from an epoch prior to when we started the current scrub + * session; or + * - (3) the message epoch is from a previous interval; or + * - (4) the 'abort' configuration flags were set. + * + * For (1) & (2) - teh incoming message is discarded, w/o further action. + * + * For (3): (see check_interval() for a full description) if we have not reacted yet + * to this specific new interval, we do now: + * - replica reservations are silently discarded (we count on the replicas to notice + * the interval change and un-reserve themselves); + * - the scrubbing is halted. + * + * For (4): the message will be discarded, but also: + * if this is the first time we've noticed the 'abort' request, we perform the abort. + * + * \returns should the incoming event be processed? + */ + bool is_message_relevant(epoch_t epoch_to_verify); + + /** + * check the 'no scrub' configuration options. + */ + [[nodiscard]] bool should_abort() const; + + /** + * Check the 'no scrub' configuration flags. + * + * Reset everything if the abort was not handled before. + * @returns false if the message was discarded due to abort flag. + */ + [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify); + + [[nodiscard]] bool check_interval(epoch_t epoch_to_verify); + + epoch_t m_last_aborted{}; // last time we've noticed a request to abort + + /** + * return true if any inconsistency/missing is repaired, false otherwise + */ + [[nodiscard]] bool scrub_process_inconsistent(); + + void scrub_compare_maps(); + + bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always + ///< 'true', unless we just got out of a sleep period + + utime_t m_sleep_started_at; + + + // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed' + // to guarantee un-reserving when deleted. + std::optional m_reservations; + std::optional m_local_osd_resource; + + /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing + std::optional m_remote_osd_resource; + + void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when + // Active->NotActive + + protected: + PG* const m_pg; + + /** + * the derivative-specific scrub-finishing touches: + */ + virtual void _scrub_finish() {} + + /** + * Validate consistency of the object info and snap sets. + */ + virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) + {} + + // common code used by build_primary_map_chunk() and build_replica_map_chunk(): + int build_scrub_map_chunk(ScrubMap& map, // primary or replica? 
+ ScrubMapBuilder& pos, + hobject_t start, + hobject_t end, + bool deep); + + std::unique_ptr m_fsm; + const spg_t m_pg_id; ///< a local copy of m_pg->pg_id + OSDService* const m_osds; + const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; + + epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was first scheduled + /* + * the exact epoch when the scrubbing actually started (started here - cleared checks + * for no-scrub conf). Incoming events are verified against this, with stale events + * discarded. + */ + epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started + + /** + * (replica) a tag identifying a specific scrub "session". Incremented whenever the + * Primary releases the replica scrub resources. + * When the scrub session is terminated (even if the interval remains unchanged, as + * might happen following an asok no-scrub command), stale scrub-resched messages + * triggered by the backend will be discarded. + */ + Scrub::act_token_t m_current_token{1}; + + scrub_flags_t m_flags; + + bool m_active{false}; + + /** + * a flag designed to prevent the initiation of a second scrub on a PG for which scrubbing + * has been initiated. + * + * set once scrubbing was initiated (i.e. - even before the FSM event that + * will trigger a state-change out of Inactive was handled), and only reset + * once the FSM is back in Inactive. + * In other words - its ON period encompasses: + * - the time period covered today by 'queued', and + * - the time when m_active is set, and + * - all the time from scrub_finish() calling update_stats() till the + * FSM handles the 'finished' event + * + * Compared with 'm_active', this flag is asserted earlier and remains ON for longer. + */ + bool m_queued_or_active{false}; + + eversion_t m_subset_last_update{}; + + std::unique_ptr m_store; + + int num_digest_updates_pending{0}; + hobject_t m_start, m_end; ///< note: half-closed: [start,end) + + /// Returns reference to current osdmap + const OSDMapRef& get_osdmap() const; + + /// Returns epoch of current osdmap + epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); } + + CephContext* get_pg_cct() const { return m_pg->cct; } + + // collected statistics + int m_shallow_errors{0}; + int m_deep_errors{0}; + int m_fixed_count{0}; + + /// Maps from objects with errors to missing peers + HobjToShardSetMapping m_missing; + + protected: + /** + * 'm_is_deep' - is the running scrub a deep one? + * + * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is + * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is + * meaningful both for the primary and the replicas, and is used as a parameter when + * building the scrub maps. + */ + bool m_is_deep{false}; + + /** + * If set: affects the backend & scrubber-backend functions called after all + * scrub maps are available. + * + * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be + * a "user facing" status display only). + */ + bool m_is_repair{false}; + + /** + * User-readable summary of the scrubber's current mode of operation. Used for + * both osd.*.log and the cluster log. + * One of: + * "repair" + * "deep-scrub", + * "scrub + * + * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for + * auto_repair will show as "deep-scrub" and not as "repair" (until the first error + * is detected). 
+ */ + std::string_view m_mode_desc; + + void update_op_mode_text(); + +private: + + /** + * initiate a deep-scrub after the current scrub ended with errors. + */ + void request_rescrubbing(requested_scrub_t& req_flags); + + /* + * Select a range of objects to scrub. + * + * By: + * - setting tentative range based on conf and divisor + * - requesting a partial list of elements from the backend; + * - handling some head/clones issues + * + * The selected range is set directly into 'm_start' and 'm_end' + */ + bool select_range(); + + std::list m_callbacks; + + /** + * send a replica (un)reservation request to the acting set + * + * @param opcode - one of MOSDScrubReserve::REQUEST + * or MOSDScrubReserve::RELEASE + */ + void message_all_replicas(int32_t opcode, std::string_view op_text); + + hobject_t m_max_end; ///< Largest end that may have been sent to replicas + ScrubMap m_primary_scrubmap; + ScrubMapBuilder m_primary_scrubmap_pos; + + std::map m_received_maps; + + /// Cleaned std::map pending snap metadata scrub + ScrubMap m_cleaned_meta_map; + + void _request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption); + + + Scrub::MapsCollectionStatus m_maps_status; + + omap_stat_t m_omap_stats = (const struct omap_stat_t){0}; + + /// Maps from objects with errors to inconsistent peers + HobjToShardSetMapping m_inconsistent; + + /// Maps from object with errors to good peers + std::map>> m_authoritative; + + // ------------ members used if we are a replica + + epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message + + ScrubMapBuilder replica_scrubmap_pos; + ScrubMap replica_scrubmap; + + /** + * we mark the request priority as it arrived. It influences the queuing priority + * when we wait for local updates + */ + Scrub::scrub_prio_t m_replica_request_priority; + + /** + * the 'preemption' "state-machine". + * Note: I was considering an orthogonal sub-machine implementation, but as + * the state diagram is extremely simple, the added complexity wasn't justified. 
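+   * Rough shape of one preemption cycle, as implemented below: while the
+   * budget lasts, enable_preemption() re-arms preemptability; do_preempt()
+   * is invoked when a blocked write (or a preempted replica - see
+   * map_from_replica()) interrupts the scrub; adjust_parameters() then
+   * deducts one preemption from the budget and doubles m_size_divisor, so
+   * the next chunk selected is smaller.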
+ */ + class preemption_data_t : public Scrub::preemption_t { + public: + preemption_data_t(PG* pg); // the PG access is used for conf access (and logs) + + [[nodiscard]] bool is_preemptable() const final { return m_preemptable; } + + bool do_preempt() final + { + if (m_preempted || !m_preemptable) + return false; + + std::lock_guard lk{m_preemption_lock}; + if (!m_preemptable) + return false; + + m_preempted = true; + return true; + } + + /// same as 'do_preempt()' but w/o checks (as once a replica + /// was preempted, we cannot continue) + void replica_preempted() { m_preempted = true; } + + void enable_preemption() + { + std::lock_guard lk{m_preemption_lock}; + if (are_preemptions_left() && !m_preempted) { + m_preemptable = true; + } + } + + /// used by a replica to set preemptability state according to the Primary's request + void force_preemptability(bool is_allowed) + { + // note: no need to lock for a replica + m_preempted = false; + m_preemptable = is_allowed; + } + + bool disable_and_test() final + { + std::lock_guard lk{m_preemption_lock}; + m_preemptable = false; + return m_preempted; + } + + [[nodiscard]] bool was_preempted() const { return m_preempted; } + + [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; } + + void reset(); + + void adjust_parameters() final + { + std::lock_guard lk{m_preemption_lock}; + + if (m_preempted) { + m_preempted = false; + m_preemptable = adjust_left(); + } else { + m_preemptable = are_preemptions_left(); + } + } + + private: + PG* m_pg; + mutable std::mutex m_preemption_lock; + bool m_preemptable{false}; + bool m_preempted{false}; + int m_left; + size_t m_size_divisor{1}; + bool are_preemptions_left() const { return m_left > 0; } + + bool adjust_left() + { + if (m_left > 0) { + --m_left; + m_size_divisor *= 2; + } + return m_left > 0; + } + }; + + preemption_data_t preemption_data; +}; diff --git a/src/osd/recovery_types.cc b/src/osd/recovery_types.cc new file mode 100644 index 000000000..3dd49a82d --- /dev/null +++ b/src/osd/recovery_types.cc @@ -0,0 +1,16 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "recovery_types.h" + +ostream& operator<<(ostream& out, const BackfillInterval& bi) +{ + out << "BackfillInfo(" << bi.begin << "-" << bi.end + << " " << bi.objects.size() << " objects"; + if (!bi.objects.empty()) + out << " " << bi.objects; + out << ")"; + return out; +} + + diff --git a/src/osd/recovery_types.h b/src/osd/recovery_types.h new file mode 100644 index 000000000..73a621882 --- /dev/null +++ b/src/osd/recovery_types.h @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "osd_types.h" + +/** + * BackfillInterval + * + * Represents the objects in a range [begin, end) + * + * Possible states: + * 1) begin == end == hobject_t() indicates the the interval is unpopulated + * 2) Else, objects contains all objects in [begin, end) + */ +struct BackfillInterval { + // info about a backfill interval on a peer + eversion_t version; /// version at which the scan occurred + std::map objects; + hobject_t begin; + hobject_t end; + + /// clear content + void clear() { + *this = BackfillInterval(); + } + + /// clear objects std::list only + void clear_objects() { + objects.clear(); + } + + /// reinstantiate with a new start+end position and sort order + void reset(hobject_t start) { + clear(); + begin = end = start; + } + + /// true if there are no objects in this 
interval + bool empty() const { + return objects.empty(); + } + + /// true if interval extends to the end of the range + bool extends_to_end() const { + return end.is_max(); + } + + /// removes items <= soid and adjusts begin to the first object + void trim_to(const hobject_t &soid) { + trim(); + while (!objects.empty() && + objects.begin()->first <= soid) { + pop_front(); + } + } + + /// Adjusts begin to the first object + void trim() { + if (!objects.empty()) + begin = objects.begin()->first; + else + begin = end; + } + + /// drop first entry, and adjust @begin accordingly + void pop_front() { + ceph_assert(!objects.empty()); + objects.erase(objects.begin()); + trim(); + } + + /// dump + void dump(ceph::Formatter *f) const { + f->dump_stream("begin") << begin; + f->dump_stream("end") << end; + f->open_array_section("objects"); + for (std::map::const_iterator i = + objects.begin(); + i != objects.end(); + ++i) { + f->open_object_section("object"); + f->dump_stream("object") << i->first; + f->dump_stream("version") << i->second; + f->close_section(); + } + f->close_section(); + } +}; + +std::ostream &operator<<(std::ostream &out, const BackfillInterval &bi); + diff --git a/src/osd/scheduler/OpScheduler.cc b/src/osd/scheduler/OpScheduler.cc new file mode 100644 index 000000000..3ce6fdb55 --- /dev/null +++ b/src/osd/scheduler/OpScheduler.cc @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include + +#include "osd/scheduler/OpScheduler.h" + +#include "common/WeightedPriorityQueue.h" +#include "osd/scheduler/mClockScheduler.h" + +namespace ceph::osd::scheduler { + +OpSchedulerRef make_scheduler( + CephContext *cct, uint32_t num_shards, bool is_rotational) +{ + const std::string *type = &cct->_conf->osd_op_queue; + if (*type == "debug_random") { + static const std::string index_lookup[] = { "mclock_scheduler", + "wpq" }; + srand(time(NULL)); + unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); + type = &index_lookup[which]; + } + + if (*type == "wpq" ) { + // default is 'wpq' + return std::make_unique< + ClassedOpQueueScheduler>>( + cct, + cct->_conf->osd_op_pq_max_tokens_per_priority, + cct->_conf->osd_op_pq_min_cost + ); + } else if (*type == "mclock_scheduler") { + return std::make_unique(cct, num_shards, is_rotational); + } else { + ceph_assert("Invalid choice of wq" == 0); + } +} + +std::ostream &operator<<(std::ostream &lhs, const OpScheduler &rhs) { + rhs.print(lhs); + return lhs; +} + +} diff --git a/src/osd/scheduler/OpScheduler.h b/src/osd/scheduler/OpScheduler.h new file mode 100644 index 000000000..6e2bb5abd --- /dev/null +++ b/src/osd/scheduler/OpScheduler.h @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
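The trimming helpers in BackfillInterval (recovery_types.h above) are easiest to follow with concrete values. The sketch below is only an illustrative analogue, not code from the patch: plain ints stand in for hobject_t and eversion_t, and MiniInterval is a pared-down stand-in for the real struct.

#include <cassert>
#include <iostream>
#include <map>

// Simplified stand-in for BackfillInterval: int replaces hobject_t/eversion_t.
struct MiniInterval {
  std::map<int, int> objects;   // object id -> version
  int begin = 0;
  int end = 0;

  // Adjust 'begin' to the first remaining object (or to 'end' when drained).
  void trim() {
    begin = objects.empty() ? end : objects.begin()->first;
  }

  // Drop the first entry and re-anchor 'begin', as pop_front() does.
  void pop_front() {
    assert(!objects.empty());
    objects.erase(objects.begin());
    trim();
  }

  // Remove every object with id <= soid, as trim_to() does.
  void trim_to(int soid) {
    trim();
    while (!objects.empty() && objects.begin()->first <= soid) {
      pop_front();
    }
  }
};

int main() {
  MiniInterval bi;
  bi.begin = 1;
  bi.end = 10;
  bi.objects = {{1, 100}, {3, 101}, {7, 102}};

  bi.trim_to(3);                   // drops objects 1 and 3
  std::cout << bi.begin << "\n";   // prints 7: begin tracks the first survivor
  bi.pop_front();                  // drops object 7
  std::cout << bi.begin << "\n";   // prints 10: an emptied interval collapses to 'end'
}

The invariant the real code maintains is the same: 'begin' always points at the first object still to be backfilled, and once the interval is drained it collapses onto 'end'.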
+ * + */ + +#pragma once + +#include +#include + +#include "common/ceph_context.h" +#include "osd/scheduler/OpSchedulerItem.h" + +namespace ceph::osd::scheduler { + +using client = uint64_t; +using WorkItem = std::variant; + +/** + * Base interface for classes responsible for choosing + * op processing order in the OSD. + */ +class OpScheduler { +public: + // Enqueue op for scheduling + virtual void enqueue(OpSchedulerItem &&item) = 0; + + // Enqueue op for processing as though it were enqueued prior + // to other items already scheduled. + virtual void enqueue_front(OpSchedulerItem &&item) = 0; + + // Returns true iff there are no ops scheduled + virtual bool empty() const = 0; + + // Return next op to be processed + virtual WorkItem dequeue() = 0; + + // Dump formatted representation for the queue + virtual void dump(ceph::Formatter &f) const = 0; + + // Print human readable brief description with relevant parameters + virtual void print(std::ostream &out) const = 0; + + // Apply config changes to the scheduler (if any) + virtual void update_configuration() = 0; + + // Destructor + virtual ~OpScheduler() {}; +}; + +std::ostream &operator<<(std::ostream &lhs, const OpScheduler &); +using OpSchedulerRef = std::unique_ptr; + +OpSchedulerRef make_scheduler( + CephContext *cct, uint32_t num_shards, bool is_rotational); + +/** + * Implements OpScheduler in terms of OpQueue + * + * Templated on queue type to avoid dynamic dispatch, T should implement + * OpQueue. This adapter is mainly responsible for + * the boilerplate priority cutoff/strict concept which is needed for + * OpQueue based implementations. + */ +template +class ClassedOpQueueScheduler final : public OpScheduler { + unsigned cutoff; + T queue; + + static unsigned int get_io_prio_cut(CephContext *cct) { + if (cct->_conf->osd_op_queue_cut_off == "debug_random") { + srand(time(NULL)); + return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW; + } else if (cct->_conf->osd_op_queue_cut_off == "high") { + return CEPH_MSG_PRIO_HIGH; + } else { + // default / catch-all is 'low' + return CEPH_MSG_PRIO_LOW; + } + } +public: + template + ClassedOpQueueScheduler(CephContext *cct, Args&&... args) : + cutoff(get_io_prio_cut(cct)), + queue(std::forward(args)...) 
+ {} + + void enqueue(OpSchedulerItem &&item) final { + unsigned priority = item.get_priority(); + unsigned cost = item.get_cost(); + + if (priority >= cutoff) + queue.enqueue_strict( + item.get_owner(), priority, std::move(item)); + else + queue.enqueue( + item.get_owner(), priority, cost, std::move(item)); + } + + void enqueue_front(OpSchedulerItem &&item) final { + unsigned priority = item.get_priority(); + unsigned cost = item.get_cost(); + if (priority >= cutoff) + queue.enqueue_strict_front( + item.get_owner(), + priority, std::move(item)); + else + queue.enqueue_front( + item.get_owner(), + priority, cost, std::move(item)); + } + + bool empty() const final { + return queue.empty(); + } + + WorkItem dequeue() final { + return queue.dequeue(); + } + + void dump(ceph::Formatter &f) const final { + return queue.dump(&f); + } + + void print(std::ostream &out) const final { + out << "ClassedOpQueueScheduler(queue="; + queue.print(out); + out << ", cutoff=" << cutoff << ")"; + } + + void update_configuration() final { + // no-op + } + + ~ClassedOpQueueScheduler() final {}; +}; + +} diff --git a/src/osd/scheduler/OpSchedulerItem.cc b/src/osd/scheduler/OpSchedulerItem.cc new file mode 100644 index 000000000..27db1dfa3 --- /dev/null +++ b/src/osd/scheduler/OpSchedulerItem.cc @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "osd/scheduler/OpSchedulerItem.h" +#include "osd/OSD.h" +#ifdef HAVE_JAEGER +#include "common/tracer.h" +#endif + +namespace ceph::osd::scheduler { + +void PGOpItem::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ +#ifdef HAVE_JAEGER + auto PGOpItem_span = jaeger_tracing::child_span("PGOpItem::run", op->osd_parent_span); +#endif + osd->dequeue_op(pg, op, handle); + pg->unlock(); +} + +void PGPeeringItem::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + osd->dequeue_peering_evt(sdata, pg.get(), evt, handle); +} + +void PGSnapTrim::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + pg->snap_trimmer(epoch_queued); + pg->unlock(); +} + +void PGScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) +{ + pg->scrub(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubAfterRepair::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->recovery_scrub(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubResched::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_scrub_resched(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubResourcesOK::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_resources_granted(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubDenied::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_resources_denied(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubPushesUpdate::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_pushes_update(epoch_queued, handle); + pg->unlock(); +} + +void 
PGScrubAppliedUpdate::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_applied_update(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubUnblocked::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_unblocking(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubDigestUpdate::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_digest_update(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubGotLocalMap::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_local_map_ready(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubGotReplMaps::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_replmaps_ready(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubMapsCompared::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_maps_compared(epoch_queued, handle); + pg->unlock(); +} + +void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) +{ + pg->replica_scrub(epoch_queued, activation_index, handle); + pg->unlock(); +} + +void PGRepScrubResched::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->replica_scrub_resched(epoch_queued, activation_index, handle); + pg->unlock(); +} + +void PGScrubReplicaPushes::run([[maybe_unused]] OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_replica_pushes(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubScrubFinished::run([[maybe_unused]] OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_scrub_is_finished(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubGetNextChunk::run([[maybe_unused]] OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_get_next_chunk(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubChunkIsBusy::run([[maybe_unused]] OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_chunk_busy(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubChunkIsFree::run([[maybe_unused]] OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_chunk_free(epoch_queued, handle); + pg->unlock(); +} + +void PGRecovery::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, handle); + pg->unlock(); +} + +void PGRecoveryContext::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + c.release()->complete(handle); + pg->unlock(); +} + +void PGDelete::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + osd->dequeue_delete(sdata, pg.get(), epoch_queued, handle); +} + +void PGRecoveryMsg::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + osd->dequeue_op(pg, op, handle); + pg->unlock(); +} + +} diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h new file mode 100644 index 000000000..7ba59838e --- /dev/null +++ b/src/osd/scheduler/OpSchedulerItem.h @@ -0,0 +1,629 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. 
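The ClassedOpQueueScheduler adapter shown earlier routes everything at or above the priority cutoff into the wrapped queue's strict lane, which is always drained first. The toy two-deque model below makes that visible; it is a sketch only, and the real WeightedPriorityQueue additionally weighs owner and cost, which this ignores.

#include <deque>
#include <iostream>
#include <optional>
#include <string>

// Toy model of the strict/normal split performed by ClassedOpQueueScheduler.
struct ToyQueue {
  unsigned cutoff;                       // e.g. CEPH_MSG_PRIO_HIGH or _LOW
  std::deque<std::string> strict;        // priority >= cutoff
  std::deque<std::string> normal;        // everything else

  void enqueue(unsigned prio, std::string op) {
    if (prio >= cutoff)
      strict.push_back(std::move(op));   // corresponds to enqueue_strict()
    else
      normal.push_back(std::move(op));   // corresponds to enqueue()
  }

  std::optional<std::string> dequeue() {
    // Strict items take precedence over the normal queue by design.
    if (!strict.empty()) { auto op = strict.front(); strict.pop_front(); return op; }
    if (!normal.empty()) { auto op = normal.front(); normal.pop_front(); return op; }
    return std::nullopt;
  }
};

int main() {
  ToyQueue q{/*cutoff=*/196};            // 196 corresponds to CEPH_MSG_PRIO_HIGH
  q.enqueue(63, "client write");         // below cutoff -> normal queue
  q.enqueue(255, "peering event");       // above cutoff -> strict queue
  while (auto op = q.dequeue())
    std::cout << *op << "\n";            // "peering event" first, then "client write"
}

A "low" cutoff puts most OSD-internal traffic in the strict lane, while "high" keeps only the most urgent messages there; that is the effect osd_op_queue_cut_off controls.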
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include + +#include "include/types.h" +#include "include/utime.h" +#include "osd/OpRequest.h" +#include "osd/PG.h" +#include "osd/PGPeeringEvent.h" +#include "messages/MOSDOp.h" + + +class OSD; +class OSDShard; + +namespace ceph::osd::scheduler { + +enum class op_scheduler_class : uint8_t { + background_recovery = 0, + background_best_effort, + immediate, + client, +}; + +class OpSchedulerItem { +public: + class OrderLocker { + public: + using Ref = std::unique_ptr; + virtual void lock() = 0; + virtual void unlock() = 0; + virtual ~OrderLocker() {} + }; + + // Abstraction for operations queueable in the op queue + class OpQueueable { + public: + enum class op_type_t { + client_op, + peering_event, + bg_snaptrim, + bg_recovery, + bg_scrub, + bg_pg_delete + }; + using Ref = std::unique_ptr; + + /// Items with the same queue token will end up in the same shard + virtual uint32_t get_queue_token() const = 0; + + /* Items will be dequeued and locked atomically w.r.t. other items with the + * same ordering token */ + virtual const spg_t& get_ordering_token() const = 0; + virtual OrderLocker::Ref get_order_locker(PGRef pg) = 0; + virtual op_type_t get_op_type() const = 0; + virtual std::optional maybe_get_op() const { + return std::nullopt; + } + + virtual uint64_t get_reserved_pushes() const { + return 0; + } + + virtual bool is_peering() const { + return false; + } + virtual bool peering_requires_pg() const { + ceph_abort(); + } + virtual const PGCreateInfo *creates_pg() const { + return nullptr; + } + + virtual std::ostream &print(std::ostream &rhs) const = 0; + + virtual void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) = 0; + virtual op_scheduler_class get_scheduler_class() const = 0; + + virtual ~OpQueueable() {} + friend std::ostream& operator<<(std::ostream& out, const OpQueueable& q) { + return q.print(out); + } + + }; + +private: + OpQueueable::Ref qitem; + int cost; + unsigned priority; + utime_t start_time; + uint64_t owner; ///< global id (e.g., client.XXX) + epoch_t map_epoch; ///< an epoch we expect the PG to exist in + +public: + OpSchedulerItem( + OpQueueable::Ref &&item, + int cost, + unsigned priority, + utime_t start_time, + uint64_t owner, + epoch_t e) + : qitem(std::move(item)), + cost(cost), + priority(priority), + start_time(start_time), + owner(owner), + map_epoch(e) + {} + OpSchedulerItem(OpSchedulerItem &&) = default; + OpSchedulerItem(const OpSchedulerItem &) = delete; + OpSchedulerItem &operator=(OpSchedulerItem &&) = default; + OpSchedulerItem &operator=(const OpSchedulerItem &) = delete; + + OrderLocker::Ref get_order_locker(PGRef pg) { + return qitem->get_order_locker(pg); + } + uint32_t get_queue_token() const { + return qitem->get_queue_token(); + } + const spg_t& get_ordering_token() const { + return qitem->get_ordering_token(); + } + using op_type_t = OpQueueable::op_type_t; + OpQueueable::op_type_t get_op_type() const { + return qitem->get_op_type(); + } + std::optional maybe_get_op() const { + return qitem->maybe_get_op(); + } + uint64_t get_reserved_pushes() const { + return qitem->get_reserved_pushes(); + } + void run(OSD *osd, OSDShard *sdata,PGRef& pg, ThreadPool::TPHandle &handle) { + qitem->run(osd, sdata, pg, handle); + } + unsigned get_priority() const { return priority; } + int 
get_cost() const { return cost; } + utime_t get_start_time() const { return start_time; } + uint64_t get_owner() const { return owner; } + epoch_t get_map_epoch() const { return map_epoch; } + + bool is_peering() const { + return qitem->is_peering(); + } + + const PGCreateInfo *creates_pg() const { + return qitem->creates_pg(); + } + + bool peering_requires_pg() const { + return qitem->peering_requires_pg(); + } + + op_scheduler_class get_scheduler_class() const { + return qitem->get_scheduler_class(); + } + + friend std::ostream& operator<<(std::ostream& out, const OpSchedulerItem& item) { + out << "OpSchedulerItem(" + << item.get_ordering_token() << " " << *item.qitem + << " prio " << item.get_priority() + << " cost " << item.get_cost() + << " e" << item.get_map_epoch(); + if (item.get_reserved_pushes()) { + out << " reserved_pushes " << item.get_reserved_pushes(); + } + return out << ")"; + } +}; // class OpSchedulerItem + +/// Implements boilerplate for operations queued for the pg lock +class PGOpQueueable : public OpSchedulerItem::OpQueueable { + spg_t pgid; +protected: + const spg_t& get_pgid() const { + return pgid; + } +public: + explicit PGOpQueueable(spg_t pg) : pgid(pg) {} + uint32_t get_queue_token() const final { + return get_pgid().ps(); + } + + const spg_t& get_ordering_token() const final { + return get_pgid(); + } + + OpSchedulerItem::OrderLocker::Ref get_order_locker(PGRef pg) final { + class Locker : public OpSchedulerItem::OrderLocker { + PGRef pg; + public: + explicit Locker(PGRef pg) : pg(pg) {} + void lock() final { + pg->lock(); + } + void unlock() final { + pg->unlock(); + } + }; + return OpSchedulerItem::OrderLocker::Ref( + new Locker(pg)); + } +}; + +class PGOpItem : public PGOpQueueable { + OpRequestRef op; + + const MOSDOp *maybe_get_mosd_op() const { + auto req = op->get_req(); + if (req->get_type() == CEPH_MSG_OSD_OP) { + return op->get_req(); + } else { + return nullptr; + } + } + +public: + PGOpItem(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {} + op_type_t get_op_type() const final { + + return op_type_t::client_op; + } + + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGOpItem(op=" << *(op->get_req()) << ")"; + } + + std::optional maybe_get_op() const final { + return op; + } + + op_scheduler_class get_scheduler_class() const final { + auto type = op->get_req()->get_type(); + if (type == CEPH_MSG_OSD_OP || + type == CEPH_MSG_OSD_BACKOFF) { + return op_scheduler_class::client; + } else { + return op_scheduler_class::immediate; + } + } + + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; +}; + +class PGPeeringItem : public PGOpQueueable { + PGPeeringEventRef evt; +public: + PGPeeringItem(spg_t pg, PGPeeringEventRef e) : PGOpQueueable(pg), evt(e) {} + op_type_t get_op_type() const final { + return op_type_t::peering_event; + } + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGPeeringEvent(" << evt->get_desc() << ")"; + } + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + bool is_peering() const override { + return true; + } + bool peering_requires_pg() const override { + return evt->requires_pg; + } + const PGCreateInfo *creates_pg() const override { + return evt->create_info.get(); + } + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::immediate; + } +}; + +class PGSnapTrim : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGSnapTrim( + spg_t pg, + epoch_t epoch_queued) + 
: PGOpQueueable(pg), epoch_queued(epoch_queued) {} + op_type_t get_op_type() const final { + return op_type_t::bg_snaptrim; + } + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGSnapTrim(pgid=" << get_pgid() + << " epoch_queued=" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_best_effort; + } +}; + +class PGScrub : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGScrub( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), epoch_queued(epoch_queued) {} + op_type_t get_op_type() const final { + return op_type_t::bg_scrub; + } + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGScrub(pgid=" << get_pgid() + << "epoch_queued=" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_best_effort; + } +}; + +class PGScrubItem : public PGOpQueueable { + protected: + epoch_t epoch_queued; + Scrub::act_token_t activation_index; + std::string_view message_name; + PGScrubItem(spg_t pg, epoch_t epoch_queued, std::string_view derivative_name) + : PGOpQueueable{pg} + , epoch_queued{epoch_queued} + , activation_index{0} + , message_name{derivative_name} + {} + PGScrubItem(spg_t pg, + epoch_t epoch_queued, + Scrub::act_token_t op_index, + std::string_view derivative_name) + : PGOpQueueable{pg} + , epoch_queued{epoch_queued} + , activation_index{op_index} + , message_name{derivative_name} + {} + op_type_t get_op_type() const final { return op_type_t::bg_scrub; } + std::ostream& print(std::ostream& rhs) const final + { + return rhs << message_name << "(pgid=" << get_pgid() + << "epoch_queued=" << epoch_queued + << " scrub-token=" << activation_index << ")"; + } + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) override = 0; + op_scheduler_class get_scheduler_class() const final + { + return op_scheduler_class::background_best_effort; + } +}; + +class PGScrubResched : public PGScrubItem { + public: + PGScrubResched(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubResched"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +/** + * all replicas have granted our scrub resources request + */ +class PGScrubResourcesOK : public PGScrubItem { + public: + PGScrubResourcesOK(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubResourcesOK"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +/** + * scrub resources requests denied by replica(s) + */ +class PGScrubDenied : public PGScrubItem { + public: + PGScrubDenied(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubDenied"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +/** + * called when a repair process completes, to initiate scrubbing. No local/remote + * resources are allocated. 
+ */ +class PGScrubAfterRepair : public PGScrubItem { + public: + PGScrubAfterRepair(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubAfterRepair"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubPushesUpdate : public PGScrubItem { + public: + PGScrubPushesUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubPushesUpdate"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubAppliedUpdate : public PGScrubItem { + public: + PGScrubAppliedUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubAppliedUpdate"} + {} + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + [[maybe_unused]] ThreadPool::TPHandle& handle) final; +}; + +class PGScrubUnblocked : public PGScrubItem { + public: + PGScrubUnblocked(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubUnblocked"} + {} + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + [[maybe_unused]] ThreadPool::TPHandle& handle) final; +}; + +class PGScrubDigestUpdate : public PGScrubItem { + public: + PGScrubDigestUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubDigestUpdate"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubGotLocalMap : public PGScrubItem { + public: + PGScrubGotLocalMap(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubGotLocalMap"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubGotReplMaps : public PGScrubItem { + public: + PGScrubGotReplMaps(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubGotReplMaps"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubMapsCompared : public PGScrubItem { + public: + PGScrubMapsCompared(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubMapsCompared"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRepScrub : public PGScrubItem { + public: + PGRepScrub(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token) + : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrub"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRepScrubResched : public PGScrubItem { + public: + PGRepScrubResched(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token) + : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrubResched"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubReplicaPushes : public PGScrubItem { + public: + PGScrubReplicaPushes(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubReplicaPushes"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubScrubFinished : public PGScrubItem { + public: + PGScrubScrubFinished(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubScrubFinished"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubGetNextChunk : public PGScrubItem { + public: + PGScrubGetNextChunk(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubGetNextChunk"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& 
handle) final; +}; + +class PGScrubChunkIsBusy : public PGScrubItem { + public: + PGScrubChunkIsBusy(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsBusy"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubChunkIsFree : public PGScrubItem { + public: + PGScrubChunkIsFree(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsFree"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRecovery : public PGOpQueueable { + epoch_t epoch_queued; + uint64_t reserved_pushes; +public: + PGRecovery( + spg_t pg, + epoch_t epoch_queued, + uint64_t reserved_pushes) + : PGOpQueueable(pg), + epoch_queued(epoch_queued), + reserved_pushes(reserved_pushes) {} + op_type_t get_op_type() const final { + return op_type_t::bg_recovery; + } + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGRecovery(pgid=" << get_pgid() + << " epoch_queued=" << epoch_queued + << " reserved_pushes=" << reserved_pushes + << ")"; + } + uint64_t get_reserved_pushes() const final { + return reserved_pushes; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_recovery; + } +}; + +class PGRecoveryContext : public PGOpQueueable { + std::unique_ptr> c; + epoch_t epoch; +public: + PGRecoveryContext(spg_t pgid, + GenContext *c, epoch_t epoch) + : PGOpQueueable(pgid), + c(c), epoch(epoch) {} + op_type_t get_op_type() const final { + return op_type_t::bg_recovery; + } + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGRecoveryContext(pgid=" << get_pgid() + << " c=" << c.get() << " epoch=" << epoch + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_recovery; + } +}; + +class PGDelete : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGDelete( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), + epoch_queued(epoch_queued) {} + op_type_t get_op_type() const final { + return op_type_t::bg_pg_delete; + } + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGDelete(" << get_pgid() + << " e" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_best_effort; + } +}; + +class PGRecoveryMsg : public PGOpQueueable { + OpRequestRef op; + +public: + PGRecoveryMsg(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {} + op_type_t get_op_type() const final { + return op_type_t::bg_recovery; + } + + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGRecoveryMsg(op=" << *(op->get_req()) << ")"; + } + + std::optional maybe_get_op() const final { + return op; + } + + op_scheduler_class get_scheduler_class() const final { + auto priority = op->get_req()->get_priority(); + if (priority >= CEPH_MSG_PRIO_HIGH) { + return op_scheduler_class::immediate; + } + return op_scheduler_class::background_recovery; + } + + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; +}; + +} diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc new file mode 100644 index 000000000..f2f0ffc3d 
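Each OpQueueable subclass above answers get_scheduler_class(), and that answer is what later decides which scheduling bucket an item competes in. The condensed sketch below restates the two less obvious mappings as free-standing functions; the enum ordering matches op_scheduler_class in this header, but the helper names and the PRIO_HIGH constant are illustrative stand-ins rather than code from the patch.

#include <cstdint>
#include <iostream>

// Same ordering as op_scheduler_class in OpSchedulerItem.h.
enum class op_class : uint8_t { background_recovery, background_best_effort, immediate, client };

constexpr unsigned PRIO_HIGH = 196;   // stands in for CEPH_MSG_PRIO_HIGH

// Mirrors PGRecoveryMsg::get_scheduler_class(): urgent recovery messages jump
// the queue, everything else competes in the background_recovery bucket.
op_class classify_recovery_msg(unsigned msg_priority) {
  return msg_priority >= PRIO_HIGH ? op_class::immediate
                                   : op_class::background_recovery;
}

// Mirrors PGOpItem::get_scheduler_class(): only client I/O (and backoffs)
// is charged to the external client bucket.
op_class classify_client_item(bool is_osd_op_or_backoff) {
  return is_osd_op_or_backoff ? op_class::client : op_class::immediate;
}

int main() {
  std::cout << static_cast<int>(classify_recovery_msg(63)) << "\n";   // 0: background_recovery
  std::cout << static_cast<int>(classify_recovery_msg(255)) << "\n";  // 2: immediate
  std::cout << static_cast<int>(classify_client_item(true)) << "\n";  // 3: client
}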
--- /dev/null +++ b/src/osd/scheduler/mClockScheduler.cc @@ -0,0 +1,514 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include +#include + +#include "osd/scheduler/mClockScheduler.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout << "mClockScheduler: " + + +namespace ceph::osd::scheduler { + +mClockScheduler::mClockScheduler(CephContext *cct, + uint32_t num_shards, + bool is_rotational) + : cct(cct), + num_shards(num_shards), + is_rotational(is_rotational), + scheduler( + std::bind(&mClockScheduler::ClientRegistry::get_info, + &client_registry, + _1), + dmc::AtLimit::Wait, + cct->_conf.get_val("osd_mclock_scheduler_anticipation_timeout")) +{ + cct->_conf.add_observer(this); + ceph_assert(num_shards > 0); + set_max_osd_capacity(); + set_osd_mclock_cost_per_io(); + set_osd_mclock_cost_per_byte(); + set_mclock_profile(); + enable_mclock_profile_settings(); + client_registry.update_from_config(cct->_conf); +} + +void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf) +{ + default_external_client_info.update( + conf.get_val("osd_mclock_scheduler_client_res"), + conf.get_val("osd_mclock_scheduler_client_wgt"), + conf.get_val("osd_mclock_scheduler_client_lim")); + + internal_client_infos[ + static_cast(op_scheduler_class::background_recovery)].update( + conf.get_val("osd_mclock_scheduler_background_recovery_res"), + conf.get_val("osd_mclock_scheduler_background_recovery_wgt"), + conf.get_val("osd_mclock_scheduler_background_recovery_lim")); + + internal_client_infos[ + static_cast(op_scheduler_class::background_best_effort)].update( + conf.get_val("osd_mclock_scheduler_background_best_effort_res"), + conf.get_val("osd_mclock_scheduler_background_best_effort_wgt"), + conf.get_val("osd_mclock_scheduler_background_best_effort_lim")); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( + const client_profile_id_t &client) const +{ + auto ret = external_client_infos.find(client); + if (ret == external_client_infos.end()) + return &default_external_client_info; + else + return &(ret->second); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( + const scheduler_id_t &id) const { + switch (id.class_id) { + case op_scheduler_class::immediate: + ceph_assert(0 == "Cannot schedule immediate"); + return (dmc::ClientInfo*)nullptr; + case op_scheduler_class::client: + return get_external_client(id.client_profile_id); + default: + ceph_assert(static_cast(id.class_id) < internal_client_infos.size()); + return &internal_client_infos[static_cast(id.class_id)]; + } +} + +void mClockScheduler::set_max_osd_capacity() +{ + if (is_rotational) { + max_osd_capacity = + cct->_conf.get_val("osd_mclock_max_capacity_iops_hdd"); + } else { + max_osd_capacity = + cct->_conf.get_val("osd_mclock_max_capacity_iops_ssd"); + } + // Set per op-shard iops limit + max_osd_capacity /= num_shards; + dout(1) << __func__ << " #op shards: " << num_shards + << std::fixed << std::setprecision(2) + << " max osd capacity(iops) per 
shard: " << max_osd_capacity + << dendl; +} + +void mClockScheduler::set_osd_mclock_cost_per_io() +{ + std::chrono::seconds sec(1); + if (cct->_conf.get_val("osd_mclock_cost_per_io_usec")) { + osd_mclock_cost_per_io = + cct->_conf.get_val("osd_mclock_cost_per_io_usec"); + } else { + if (is_rotational) { + osd_mclock_cost_per_io = + cct->_conf.get_val("osd_mclock_cost_per_io_usec_hdd"); + // For HDDs, convert value to seconds + osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count(); + } else { + // For SSDs, convert value to milliseconds + osd_mclock_cost_per_io = + cct->_conf.get_val("osd_mclock_cost_per_io_usec_ssd"); + osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count(); + } + } + dout(1) << __func__ << " osd_mclock_cost_per_io: " + << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io + << dendl; +} + +void mClockScheduler::set_osd_mclock_cost_per_byte() +{ + std::chrono::seconds sec(1); + if (cct->_conf.get_val("osd_mclock_cost_per_byte_usec")) { + osd_mclock_cost_per_byte = + cct->_conf.get_val("osd_mclock_cost_per_byte_usec"); + } else { + if (is_rotational) { + osd_mclock_cost_per_byte = + cct->_conf.get_val("osd_mclock_cost_per_byte_usec_hdd"); + // For HDDs, convert value to seconds + osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count(); + } else { + osd_mclock_cost_per_byte = + cct->_conf.get_val("osd_mclock_cost_per_byte_usec_ssd"); + // For SSDs, convert value to milliseconds + osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count(); + } + } + dout(1) << __func__ << " osd_mclock_cost_per_byte: " + << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte + << dendl; +} + +void mClockScheduler::set_mclock_profile() +{ + mclock_profile = cct->_conf.get_val("osd_mclock_profile"); + dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl; +} + +std::string mClockScheduler::get_mclock_profile() +{ + return mclock_profile; +} + +void mClockScheduler::set_balanced_profile_allocations() +{ + // Client Allocation: + // reservation: 40% | weight: 1 | limit: 100% | + // Background Recovery Allocation: + // reservation: 40% | weight: 1 | limit: 150% | + // Background Best Effort Allocation: + // reservation: 20% | weight: 2 | limit: max | + + // Client + uint64_t client_res = static_cast( + std::round(0.40 * max_osd_capacity)); + uint64_t client_lim = static_cast( + std::round(max_osd_capacity)); + uint64_t client_wgt = default_min; + + // Background Recovery + uint64_t rec_res = static_cast( + std::round(0.40 * max_osd_capacity)); + uint64_t rec_lim = static_cast( + std::round(1.5 * max_osd_capacity)); + uint64_t rec_wgt = default_min; + + // Background Best Effort + uint64_t best_effort_res = static_cast( + std::round(0.20 * max_osd_capacity)); + uint64_t best_effort_lim = default_max; + uint64_t best_effort_wgt = 2; + + // Set the allocations for the mclock clients + client_allocs[ + static_cast(op_scheduler_class::client)].update( + client_res, + client_wgt, + client_lim); + client_allocs[ + static_cast(op_scheduler_class::background_recovery)].update( + rec_res, + rec_wgt, + rec_lim); + client_allocs[ + static_cast(op_scheduler_class::background_best_effort)].update( + best_effort_res, + best_effort_wgt, + best_effort_lim); +} + +void mClockScheduler::set_high_recovery_ops_profile_allocations() +{ + // Client Allocation: + // reservation: 30% | weight: 1 | limit: 80% | + // Background Recovery Allocation: + // reservation: 60% | weight: 2 | limit: 200% | + // Background Best Effort Allocation: + // 
reservation: 1 | weight: 2 | limit: max | + + // Client + uint64_t client_res = static_cast( + std::round(0.30 * max_osd_capacity)); + uint64_t client_lim = static_cast( + std::round(0.80 * max_osd_capacity)); + uint64_t client_wgt = default_min; + + // Background Recovery + uint64_t rec_res = static_cast( + std::round(0.60 * max_osd_capacity)); + uint64_t rec_lim = static_cast( + std::round(2.0 * max_osd_capacity)); + uint64_t rec_wgt = 2; + + // Background Best Effort + uint64_t best_effort_res = default_min; + uint64_t best_effort_lim = default_max; + uint64_t best_effort_wgt = 2; + + // Set the allocations for the mclock clients + client_allocs[ + static_cast(op_scheduler_class::client)].update( + client_res, + client_wgt, + client_lim); + client_allocs[ + static_cast(op_scheduler_class::background_recovery)].update( + rec_res, + rec_wgt, + rec_lim); + client_allocs[ + static_cast(op_scheduler_class::background_best_effort)].update( + best_effort_res, + best_effort_wgt, + best_effort_lim); +} + +void mClockScheduler::set_high_client_ops_profile_allocations() +{ + // Client Allocation: + // reservation: 50% | weight: 2 | limit: max | + // Background Recovery Allocation: + // reservation: 25% | weight: 1 | limit: 100% | + // Background Best Effort Allocation: + // reservation: 25% | weight: 2 | limit: max | + + // Client + uint64_t client_res = static_cast( + std::round(0.50 * max_osd_capacity)); + uint64_t client_wgt = 2; + uint64_t client_lim = default_max; + + // Background Recovery + uint64_t rec_res = static_cast( + std::round(0.25 * max_osd_capacity)); + uint64_t rec_lim = static_cast( + std::round(max_osd_capacity)); + uint64_t rec_wgt = default_min; + + // Background Best Effort + uint64_t best_effort_res = static_cast( + std::round(0.25 * max_osd_capacity)); + uint64_t best_effort_lim = default_max; + uint64_t best_effort_wgt = 2; + + // Set the allocations for the mclock clients + client_allocs[ + static_cast(op_scheduler_class::client)].update( + client_res, + client_wgt, + client_lim); + client_allocs[ + static_cast(op_scheduler_class::background_recovery)].update( + rec_res, + rec_wgt, + rec_lim); + client_allocs[ + static_cast(op_scheduler_class::background_best_effort)].update( + best_effort_res, + best_effort_wgt, + best_effort_lim); +} + +void mClockScheduler::enable_mclock_profile_settings() +{ + // Nothing to do for "custom" profile + if (mclock_profile == "custom") { + return; + } + + // Set mclock and ceph config options for the chosen profile + if (mclock_profile == "balanced") { + set_balanced_profile_allocations(); + } else if (mclock_profile == "high_recovery_ops") { + set_high_recovery_ops_profile_allocations(); + } else if (mclock_profile == "high_client_ops") { + set_high_client_ops_profile_allocations(); + } else { + ceph_assert("Invalid choice of mclock profile" == 0); + return; + } + + // Set the mclock config parameters + set_profile_config(); +} + +void mClockScheduler::set_profile_config() +{ + ClientAllocs client = client_allocs[ + static_cast(op_scheduler_class::client)]; + ClientAllocs rec = client_allocs[ + static_cast(op_scheduler_class::background_recovery)]; + ClientAllocs best_effort = client_allocs[ + static_cast(op_scheduler_class::background_best_effort)]; + + // Set external client params + cct->_conf.set_val("osd_mclock_scheduler_client_res", + std::to_string(client.res)); + cct->_conf.set_val("osd_mclock_scheduler_client_wgt", + std::to_string(client.wgt)); + cct->_conf.set_val("osd_mclock_scheduler_client_lim", + 
std::to_string(client.lim)); + + // Set background recovery client params + cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res", + std::to_string(rec.res)); + cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt", + std::to_string(rec.wgt)); + cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim", + std::to_string(rec.lim)); + + // Set background best effort client params + cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res", + std::to_string(best_effort.res)); + cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt", + std::to_string(best_effort.wgt)); + cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim", + std::to_string(best_effort.lim)); +} + +int mClockScheduler::calc_scaled_cost(int item_cost) +{ + // Calculate total scaled cost in secs + int scaled_cost = + std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost)); + return std::max(scaled_cost, 1); +} + +void mClockScheduler::update_configuration() +{ + // Apply configuration change. The expectation is that + // at least one of the tracked mclock config option keys + // is modified before calling this method. + cct->_conf.apply_changes(nullptr); +} + +void mClockScheduler::dump(ceph::Formatter &f) const +{ +} + +void mClockScheduler::enqueue(OpSchedulerItem&& item) +{ + auto id = get_scheduler_id(item); + + // TODO: move this check into OpSchedulerItem, handle backwards compat + if (op_scheduler_class::immediate == id.class_id) { + immediate.push_front(std::move(item)); + } else { + int cost = calc_scaled_cost(item.get_cost()); + // Add item to scheduler queue + scheduler.add_request( + std::move(item), + id, + cost); + } +} + +void mClockScheduler::enqueue_front(OpSchedulerItem&& item) +{ + immediate.push_back(std::move(item)); + // TODO: item may not be immediate, update mclock machinery to permit + // putting the item back in the queue +} + +WorkItem mClockScheduler::dequeue() +{ + if (!immediate.empty()) { + WorkItem work_item{std::move(immediate.back())}; + immediate.pop_back(); + return work_item; + } else { + mclock_queue_t::PullReq result = scheduler.pull_request(); + if (result.is_future()) { + return result.getTime(); + } else if (result.is_none()) { + ceph_assert( + 0 == "Impossible, must have checked empty() first"); + return {}; + } else { + ceph_assert(result.is_retn()); + + auto &retn = result.get_retn(); + return std::move(*retn.request); + } + } +} + +const char** mClockScheduler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_mclock_scheduler_client_res", + "osd_mclock_scheduler_client_wgt", + "osd_mclock_scheduler_client_lim", + "osd_mclock_scheduler_background_recovery_res", + "osd_mclock_scheduler_background_recovery_wgt", + "osd_mclock_scheduler_background_recovery_lim", + "osd_mclock_scheduler_background_best_effort_res", + "osd_mclock_scheduler_background_best_effort_wgt", + "osd_mclock_scheduler_background_best_effort_lim", + "osd_mclock_cost_per_io_usec", + "osd_mclock_cost_per_io_usec_hdd", + "osd_mclock_cost_per_io_usec_ssd", + "osd_mclock_cost_per_byte_usec", + "osd_mclock_cost_per_byte_usec_hdd", + "osd_mclock_cost_per_byte_usec_ssd", + "osd_mclock_max_capacity_iops_hdd", + "osd_mclock_max_capacity_iops_ssd", + "osd_mclock_profile", + NULL + }; + return KEYS; +} + +void mClockScheduler::handle_conf_change( + const ConfigProxy& conf, + const std::set &changed) +{ + if (changed.count("osd_mclock_cost_per_io_usec") || + changed.count("osd_mclock_cost_per_io_usec_hdd") || + 
changed.count("osd_mclock_cost_per_io_usec_ssd")) { + set_osd_mclock_cost_per_io(); + } + if (changed.count("osd_mclock_cost_per_byte_usec") || + changed.count("osd_mclock_cost_per_byte_usec_hdd") || + changed.count("osd_mclock_cost_per_byte_usec_ssd")) { + set_osd_mclock_cost_per_byte(); + } + if (changed.count("osd_mclock_max_capacity_iops_hdd") || + changed.count("osd_mclock_max_capacity_iops_ssd")) { + set_max_osd_capacity(); + if (mclock_profile != "custom") { + enable_mclock_profile_settings(); + client_registry.update_from_config(conf); + } + } + if (changed.count("osd_mclock_profile")) { + set_mclock_profile(); + if (mclock_profile != "custom") { + enable_mclock_profile_settings(); + client_registry.update_from_config(conf); + } + } + if (changed.count("osd_mclock_scheduler_client_res") || + changed.count("osd_mclock_scheduler_client_wgt") || + changed.count("osd_mclock_scheduler_client_lim") || + changed.count("osd_mclock_scheduler_background_recovery_res") || + changed.count("osd_mclock_scheduler_background_recovery_wgt") || + changed.count("osd_mclock_scheduler_background_recovery_lim") || + changed.count("osd_mclock_scheduler_background_best_effort_res") || + changed.count("osd_mclock_scheduler_background_best_effort_wgt") || + changed.count("osd_mclock_scheduler_background_best_effort_lim")) { + if (mclock_profile == "custom") { + client_registry.update_from_config(conf); + } + } +} + +mClockScheduler::~mClockScheduler() +{ + cct->_conf.remove_observer(this); +} + +} diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h new file mode 100644 index 000000000..32f3851ec --- /dev/null +++ b/src/osd/scheduler/mClockScheduler.h @@ -0,0 +1,204 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include +#include +#include + +#include "boost/variant.hpp" + +#include "dmclock/src/dmclock_server.h" + +#include "osd/scheduler/OpScheduler.h" +#include "common/config.h" +#include "include/cmp.h" +#include "common/ceph_context.h" +#include "common/mClockPriorityQueue.h" +#include "osd/scheduler/OpSchedulerItem.h" + + +namespace ceph::osd::scheduler { + +constexpr uint64_t default_min = 1; +constexpr uint64_t default_max = 999999; + +using client_id_t = uint64_t; +using profile_id_t = uint64_t; + +struct client_profile_id_t { + client_id_t client_id; + profile_id_t profile_id; +}; + +WRITE_EQ_OPERATORS_2(client_profile_id_t, client_id, profile_id) +WRITE_CMP_OPERATORS_2(client_profile_id_t, client_id, profile_id) + + +struct scheduler_id_t { + op_scheduler_class class_id; + client_profile_id_t client_profile_id; +}; + +WRITE_EQ_OPERATORS_2(scheduler_id_t, class_id, client_profile_id) +WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id) + +/** + * Scheduler implementation based on mclock. 
+ * + * TODO: explain configs + */ +class mClockScheduler : public OpScheduler, md_config_obs_t { + + CephContext *cct; + const uint32_t num_shards; + bool is_rotational; + double max_osd_capacity; + double osd_mclock_cost_per_io; + double osd_mclock_cost_per_byte; + std::string mclock_profile = "high_client_ops"; + struct ClientAllocs { + uint64_t res; + uint64_t wgt; + uint64_t lim; + + ClientAllocs(uint64_t _res, uint64_t _wgt, uint64_t _lim) { + update(_res, _wgt, _lim); + } + + inline void update(uint64_t _res, uint64_t _wgt, uint64_t _lim) { + res = _res; + wgt = _wgt; + lim = _lim; + } + }; + std::array< + ClientAllocs, + static_cast(op_scheduler_class::client) + 1 + > client_allocs = { + // Placeholder, get replaced with configured values + ClientAllocs(1, 1, 1), // background_recovery + ClientAllocs(1, 1, 1), // background_best_effort + ClientAllocs(1, 1, 1), // immediate (not used) + ClientAllocs(1, 1, 1) // client + }; + class ClientRegistry { + std::array< + crimson::dmclock::ClientInfo, + static_cast(op_scheduler_class::immediate) + > internal_client_infos = { + // Placeholder, gets replaced with configured values + crimson::dmclock::ClientInfo(1, 1, 1), + crimson::dmclock::ClientInfo(1, 1, 1) + }; + + crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1}; + std::map external_client_infos; + const crimson::dmclock::ClientInfo *get_external_client( + const client_profile_id_t &client) const; + public: + void update_from_config(const ConfigProxy &conf); + const crimson::dmclock::ClientInfo *get_info( + const scheduler_id_t &id) const; + } client_registry; + + using mclock_queue_t = crimson::dmclock::PullPriorityQueue< + scheduler_id_t, + OpSchedulerItem, + true, + true, + 2>; + mclock_queue_t scheduler; + std::list immediate; + + static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) { + return scheduler_id_t{ + item.get_scheduler_class(), + client_profile_id_t{ + item.get_owner(), + 0 + } + }; + } + +public: + mClockScheduler(CephContext *cct, uint32_t num_shards, bool is_rotational); + ~mClockScheduler() override; + + // Set the max osd capacity in iops + void set_max_osd_capacity(); + + // Set the cost per io for the osd + void set_osd_mclock_cost_per_io(); + + // Set the cost per byte for the osd + void set_osd_mclock_cost_per_byte(); + + // Set the mclock profile type to enable + void set_mclock_profile(); + + // Get the active mclock profile + std::string get_mclock_profile(); + + // Set "balanced" profile allocations + void set_balanced_profile_allocations(); + + // Set "high_recovery_ops" profile allocations + void set_high_recovery_ops_profile_allocations(); + + // Set "high_client_ops" profile allocations + void set_high_client_ops_profile_allocations(); + + // Set the mclock related config params based on the profile + void enable_mclock_profile_settings(); + + // Set mclock config parameter based on allocations + void set_profile_config(); + + // Calculate scale cost per item + int calc_scaled_cost(int cost); + + // Enqueue op in the back of the regular queue + void enqueue(OpSchedulerItem &&item) final; + + // Enqueue the op in the front of the regular queue + void enqueue_front(OpSchedulerItem &&item) final; + + // Return an op to be dispatch + WorkItem dequeue() final; + + // Returns if the queue is empty + bool empty() const final { + return immediate.empty() && scheduler.empty(); + } + + // Formatted output of the queue + void dump(ceph::Formatter &f) const final; + + void print(std::ostream &ostream) const final { + ostream << 
"mClockScheduler"; + } + + // Update data associated with the modified mclock config key(s) + void update_configuration() final; + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set &changed) final; +}; + +} diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc new file mode 100644 index 000000000..fff372081 --- /dev/null +++ b/src/osd/scrub_machine.cc @@ -0,0 +1,534 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "scrub_machine.h" + +#include +#include + +#include + +#include "OSD.h" +#include "OpRequest.h" +#include "ScrubStore.h" +#include "scrub_machine_lstnr.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout << " scrubberFSM " + +using namespace std::chrono; +using namespace std::chrono_literals; +namespace sc = boost::statechart; + +#define DECLARE_LOCALS \ + ScrubMachineListener* scrbr = context().m_scrbr; \ + std::ignore = scrbr; \ + auto pg_id = context().m_pg_id; \ + std::ignore = pg_id; + +namespace Scrub { + +// --------- trace/debug auxiliaries ------------------------------- + +void on_event_creation(std::string_view nm) +{ + dout(20) << " event: --vvvv---- " << nm << dendl; +} + +void on_event_discard(std::string_view nm) +{ + dout(20) << " event: --^^^^---- " << nm << dendl; +} + +std::string ScrubMachine::current_states_desc() const +{ + std::string sts{"<"}; + for (auto si = state_begin(); si != state_end(); ++si) { + const auto& siw{ *si }; // prevents a warning re side-effects + // the '7' is the size of the 'scrub::' + sts += boost::core::demangle(typeid(siw).name()).substr(7, std::string::npos) + "/"; + } + return sts + ">"; +} + +void ScrubMachine::assert_not_active() const +{ + ceph_assert(state_cast()); +} + +bool ScrubMachine::is_reserving() const +{ + return state_cast(); +} + +bool ScrubMachine::is_accepting_updates() const +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + ceph_assert(scrbr->is_primary()); + + return state_cast(); +} + +// for the rest of the code in this file - we know what PG we are dealing with: +#undef dout_prefix +#define dout_prefix _prefix(_dout, this->context()) + +template +static ostream& _prefix(std::ostream* _dout, T& t) +{ + return t.gen_prefix(*_dout); +} + +std::ostream& ScrubMachine::gen_prefix(std::ostream& out) const +{ + return m_scrbr->gen_prefix(out) << "FSM: "; +} + +// ////////////// the actual actions + +// ----------------------- NotActive ----------------------------------------- + +NotActive::NotActive(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> NotActive" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + scrbr->clear_queued_or_active(); +} + +// ----------------------- ReservingReplicas --------------------------------- + +ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> ReservingReplicas" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + scrbr->scrub_begin(); + scrbr->reserve_replicas(); +} + +sc::result ReservingReplicas::react(const ReservationFailure&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl; + + // the Scrubber must release all resources and abort the scrubbing + scrbr->clear_pgscrub_state(); + return transit(); +} + +/** + * note: the event poster is handling the scrubber reset + */ +sc::result 
ReservingReplicas::react(const FullReset&) +{ + dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl; + return transit(); +} + +// ----------------------- ActiveScrubbing ----------------------------------- + +ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> ActiveScrubbing" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + scrbr->on_init(); +} + +/** + * upon exiting the Active state + */ +ActiveScrubbing::~ActiveScrubbing() +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(15) << __func__ << dendl; + scrbr->unreserve_replicas(); + scrbr->clear_queued_or_active(); +} + +/* + * The only source of an InternalError event as of now is the BuildMap state, + * when encountering a backend error. + * We kill the scrub and reset the FSM. + */ +sc::result ActiveScrubbing::react(const InternalError&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << __func__ << dendl; + scrbr->clear_pgscrub_state(); + return transit(); +} + +sc::result ActiveScrubbing::react(const FullReset&) +{ + dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl; + // caller takes care of clearing the scrubber & FSM states + return transit(); +} + +// ----------------------- RangeBlocked ----------------------------------- + +/* + * Blocked. Will be released by kick_object_context_blocked() (or upon + * an abort) + */ +RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/RangeBlocked" << dendl; +} + +// ----------------------- PendingTimer ----------------------------------- + +/** + * Sleeping till timer reactivation - or just requeuing + */ +PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/PendingTimer" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + scrbr->add_delayed_scheduling(); +} + +// ----------------------- NewChunk ----------------------------------- + +/** + * Preconditions: + * - preemption data was set + * - epoch start was updated + */ +NewChunk::NewChunk(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/NewChunk" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + scrbr->get_preemptor().adjust_parameters(); + + // choose range to work on + // select_range_n_notify() will signal either SelectedChunkFree or + // ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the + // range to become available. 
+ scrbr->select_range_n_notify(); +} + +sc::result NewChunk::react(const SelectedChunkFree&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl; + + scrbr->set_subset_last_update(scrbr->search_log_for_updates()); + return transit(); +} + +// ----------------------- WaitPushes ----------------------------------- + +WaitPushes::WaitPushes(my_context ctx) : my_base(ctx) +{ + dout(10) << " -- state -->> Act/WaitPushes" << dendl; + post_event(ActivePushesUpd{}); +} + +/* + * Triggered externally, by the entity that had an update re pushes + */ +sc::result WaitPushes::react(const ActivePushesUpd&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: " + << scrbr->pending_active_pushes() << dendl; + + if (!scrbr->pending_active_pushes()) { + // done waiting + return transit(); + } + + return discard_event(); +} + +// ----------------------- WaitLastUpdate ----------------------------------- + +WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx) +{ + dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl; + post_event(UpdatesApplied{}); +} + +/** + * Note: + * Updates are locally readable immediately. Thus, on the replicas we do need + * to wait for the update notifications before scrubbing. For the Primary it's + * a bit different: on EC (and only there) rmw operations have an additional + * read roundtrip. That means that on the Primary we need to wait for + * last_update_applied (the replica side, even on EC, is still safe + * since the actual transaction will already be readable by commit time. + */ +void WaitLastUpdate::on_new_updates(const UpdatesApplied&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl; + + if (scrbr->has_pg_marked_new_updates()) { + post_event(InternalAllUpdates{}); + } else { + // will be requeued by op_applied + dout(10) << "wait for EC read/modify/writes to queue" << dendl; + } +} + +/* + * request maps from the replicas in the acting set + */ +sc::result WaitLastUpdate::react(const InternalAllUpdates&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl; + + scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable()); + return transit(); +} + +// ----------------------- BuildMap ----------------------------------- + +BuildMap::BuildMap(my_context ctx) : my_base(ctx) +{ + dout(10) << " -- state -->> Act/BuildMap" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + // no need to check for an epoch change, as all possible flows that brought us here have + // a check_interval() verification of their final event. + + if (scrbr->get_preemptor().was_preempted()) { + + // we were preempted, either directly or by a replica + dout(10) << __func__ << " preempted!!!" << dendl; + scrbr->mark_local_map_ready(); + post_event(IntBmPreempted{}); + + } else { + + auto ret = scrbr->build_primary_map_chunk(); + + if (ret == -EINPROGRESS) { + // must wait for the backend to finish. No specific event provided. + // build_primary_map_chunk() has already requeued us. + dout(20) << "waiting for the backend..." << dendl; + + } else if (ret < 0) { + + dout(10) << "BuildMap::BuildMap() Error! Aborting. 
Ret: " << ret << dendl; + post_event(InternalError{}); + + } else { + + // the local map was created + post_event(IntLocalMapDone{}); + } + } +} + +sc::result BuildMap::react(const IntLocalMapDone&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl; + + scrbr->mark_local_map_ready(); + return transit(); +} + +// ----------------------- DrainReplMaps ----------------------------------- + +DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/DrainReplMaps" << dendl; + // we may have received all maps already. Send the event that will make us check. + post_event(GotReplicas{}); +} + +sc::result DrainReplMaps::react(const GotReplicas&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl; + + if (scrbr->are_all_maps_available()) { + // NewChunk will handle the preemption that brought us to this state + return transit(); + } + + dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: " + << scrbr->dump_awaited_maps() << dendl; + return discard_event(); +} + +// ----------------------- WaitReplicas ----------------------------------- + +WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/WaitReplicas" << dendl; + post_event(GotReplicas{}); +} + +/** + * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state + * for a while even after we got all our maps, we must prevent are_all_maps_available() + * (actually - the code after the if()) from being called more than once. + * This is basically a separate state, but it's too transitory and artificial to justify + * the cost of a separate state. + + * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately + * after initiating the process. The actual termination of the maps comparing etc' is + * signalled via an event. As we share the code with "classic" OSD, here too + * maps_compare_n_cleanup() is responsible for signalling the completion of the + * processing. + */ +sc::result WaitReplicas::react(const GotReplicas&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl; + + if (!all_maps_already_called && scrbr->are_all_maps_available()) { + dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl; + + all_maps_already_called = true; + + // were we preempted? + if (scrbr->get_preemptor().disable_and_test()) { // a test&set + + + dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" 
<< dendl; + return transit(); + + } else { + + // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent: + scrbr->maps_compare_n_cleanup(); + return discard_event(); + } + } else { + return discard_event(); + } +} + +// ----------------------- WaitDigestUpdate ----------------------------------- + +WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl; + + // perform an initial check: maybe we already + // have all the updates we need: + // (note that DigestUpdate is usually an external event) + post_event(DigestUpdate{}); +} + +sc::result WaitDigestUpdate::react(const DigestUpdate&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl; + + // on_digest_updates() will either: + // - do nothing - if we are still waiting for updates, or + // - finish the scrubbing of the current chunk, and: + // - send NextChunk, or + // - send ScrubFinished + + scrbr->on_digest_updates(); + return discard_event(); +} + +sc::result WaitDigestUpdate::react(const ScrubFinished&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitDigestUpdate::react(const ScrubFinished&)" << dendl; + scrbr->scrub_finish(); + return transit(); +} + +ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub) + : m_pg_id{pg->pg_id}, m_scrbr{pg_scrub} +{ +} + +ScrubMachine::~ScrubMachine() = default; + +// -------- for replicas ----------------------------------------------------- + +// ----------------------- ReplicaWaitUpdates -------------------------------- + +ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + scrbr->on_replica_init(); +} + +/* + * Triggered externally, by the entity that had an update re pushes + */ +sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): " + << scrbr->pending_active_pushes() << dendl; + + if (scrbr->pending_active_pushes() == 0) { + + // done waiting + return transit(); + } + + return discard_event(); +} + +/** + * the event poster is handling the scrubber reset + */ +sc::result ReplicaWaitUpdates::react(const FullReset&) +{ + dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl; + return transit(); +} + +// ----------------------- ActiveReplica ----------------------------------- + +ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "-- state -->> ActiveReplica" << dendl; + scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates + post_event(SchedReplica{}); +} + +sc::result ActiveReplica::react(const SchedReplica&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? 
" + << scrbr->get_preemptor().is_preemptable() << dendl; + + if (scrbr->get_preemptor().was_preempted()) { + dout(10) << "replica scrub job preempted" << dendl; + + scrbr->send_preempted_replica(); + scrbr->replica_handling_done(); + return transit(); + } + + // start or check progress of build_replica_map_chunk() + auto ret_init = scrbr->build_replica_map_chunk(); + if (ret_init != -EINPROGRESS) { + return transit(); + } + + return discard_event(); +} + +/** + * the event poster is handling the scrubber reset + */ +sc::result ActiveReplica::react(const FullReset&) +{ + dout(10) << "ActiveReplica::react(const FullReset&)" << dendl; + return transit(); +} + +} // namespace Scrub diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h new file mode 100644 index 000000000..7f88a675a --- /dev/null +++ b/src/osd/scrub_machine.h @@ -0,0 +1,344 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/version.h" +#include "include/Context.h" + +#include "scrub_machine_lstnr.h" +#include "scrubber_common.h" + +using namespace std::string_literals; + +class PG; // holding a pointer to that one - just for testing +class PgScrubber; +namespace Scrub { + +namespace sc = ::boost::statechart; +namespace mpl = ::boost::mpl; + +// +// EVENTS +// + +void on_event_creation(std::string_view nm); +void on_event_discard(std::string_view nm); + +#define MEV(E) \ + struct E : sc::event { \ + inline static int actv{0}; \ + E() \ + { \ + if (!actv++) \ + on_event_creation(#E); \ + } \ + ~E() \ + { \ + if (!--actv) \ + on_event_discard(#E); \ + } \ + void print(std::ostream* out) const { *out << #E; } \ + std::string_view print() const { return #E; } \ + }; + +MEV(RemotesReserved) ///< all replicas have granted our reserve request + +MEV(ReservationFailure) ///< a reservation request has failed + +MEV(StartScrub) ///< initiate a new scrubbing session (relevant if we are a Primary) + +MEV(AfterRepairScrub) ///< initiate a new scrubbing session. Only triggered at Recovery + ///< completion. + +MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for + ///< scrubbing. Via the PGScrubUnblocked op + +MEV(InternalSchedScrub) + +MEV(SelectedChunkFree) + +MEV(ChunkIsBusy) + +MEV(ActivePushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery + ///< that is in-flight to the local ObjectStore +MEV(UpdatesApplied) ///< (Primary only) all updates are committed + +MEV(InternalAllUpdates) ///< the internal counterpart of UpdatesApplied + +MEV(GotReplicas) ///< got a map from a replica + +MEV(IntBmPreempted) ///< internal - BuildMap preempted. Required, as detected within the + ///< ctor + +MEV(InternalError) + +MEV(IntLocalMapDone) + +MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. See + ///< scrub_snapshot_metadata() + +MEV(MapsCompared) ///< (Crimson) maps_compare_n_cleanup() transactions are done + +MEV(StartReplica) ///< initiating replica scrub. + +MEV(StartReplicaNoWait) ///< 'start replica' when there are no pending updates + +MEV(SchedReplica) + +MEV(ReplicaPushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery + ///< that is in-flight to the local ObjectStore + +MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive) + +MEV(NextChunk) ///< finished handling this chunk. 
Go get the next one + +MEV(ScrubFinished) ///< all chunks handled + + +struct NotActive; ///< the quiescent state. No active scrubbing. +struct ReservingReplicas; ///< securing scrub resources from replicas' OSDs +struct ActiveScrubbing; ///< the active state for a Primary. A sub-machine. +struct ReplicaWaitUpdates; ///< an active state for a replica. Waiting for all active + ///< operations to finish. +struct ActiveReplica; ///< an active state for a replica. + + +class ScrubMachine : public sc::state_machine { + public: + friend class PgScrubber; + + public: + explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub); + ~ScrubMachine(); + + spg_t m_pg_id; + ScrubMachineListener* m_scrbr; + std::ostream& gen_prefix(std::ostream& out) const; + + std::string current_states_desc() const; + void assert_not_active() const; + [[nodiscard]] bool is_reserving() const; + [[nodiscard]] bool is_accepting_updates() const; +}; + +/** + * The Scrubber's base (quiescent) state. + * Scrubbing is triggered by one of the following events: + * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources + * reservation process. Will be issued by PG::scrub(), following a + * queued "PGScrub" op. + * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is + * not required to reserve resources. + * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming + * MOSDRepScrub message. + * + * note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting + * for replica resources to be acquired. But once replicas started using the + * resource-request to identify and tag the scrub session, this bypass cannot be + * supported anymore. + */ +struct NotActive : sc::state { + explicit NotActive(my_context ctx); + + using reactions = mpl::list, + // a scrubbing that was initiated at recovery completion, + // and requires no resource reservations: + sc::transition, + sc::transition, + sc::transition>; +}; + +struct ReservingReplicas : sc::state { + + explicit ReservingReplicas(my_context ctx); + using reactions = mpl::list, + // all replicas granted our resources request + sc::transition, + sc::custom_reaction>; + + sc::result react(const FullReset&); + + /// at least one replica denied us the scrub resources we've requested + sc::result react(const ReservationFailure&); +}; + + +// the "active" sub-states + +struct RangeBlocked; ///< the objects range is blocked +struct PendingTimer; ///< either delaying the scrub by some time and requeuing, or just + ///< requeue +struct NewChunk; ///< select a chunk to scrub, and verify its availability +struct WaitPushes; +struct WaitLastUpdate; +struct BuildMap; +struct DrainReplMaps; ///< a problem during BuildMap. Wait for all replicas to report, + ///< then restart. 
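+
+// A rough reading aid, reconstructed from the reactions and log messages in this
+// listing (the template arguments of the reaction lists are not reproduced here,
+// so exact signatures are omitted). The usual Primary-side flow is approximately:
+//
+//   NotActive --StartScrub--> ReservingReplicas --RemotesReserved--> ActiveScrubbing
+//   ActiveScrubbing: PendingTimer -> NewChunk -> WaitPushes -> WaitLastUpdate
+//                    -> BuildMap -> WaitReplicas -> WaitDigestUpdate
+//   WaitDigestUpdate --NextChunk--> PendingTimer (next chunk), or
+//                    --ScrubFinished--> NotActive (session done)
+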
+struct WaitReplicas;  ///< wait for all replicas to report
+struct WaitDigestUpdate;
+
+struct ActiveScrubbing : sc::state {
+
+  explicit ActiveScrubbing(my_context ctx);
+  ~ActiveScrubbing();
+
+  using reactions = mpl::list<
+    sc::custom_reaction,
+    sc::custom_reaction>;
+
+  sc::result react(const FullReset&);
+  sc::result react(const InternalError&);
+};
+
+struct RangeBlocked : sc::state {
+  explicit RangeBlocked(my_context ctx);
+  using reactions = mpl::list>;
+};
+
+struct PendingTimer : sc::state {
+
+  explicit PendingTimer(my_context ctx);
+
+  using reactions = mpl::list>;
+};
+
+struct NewChunk : sc::state {
+
+  explicit NewChunk(my_context ctx);
+
+  using reactions = mpl::list,
+    sc::custom_reaction>;
+
+  sc::result react(const SelectedChunkFree&);
+};
+
+/**
+ * initiate the update process for this chunk
+ *
+ * Wait for 'active_pushes' to clear.
+ * 'active_pushes' represents recovery that is in-flight to the local ObjectStore, hence
+ * scrub waits until the correct data is readable (in-flight data to the ObjectStore is
+ * not readable until written to disk, termed 'applied' here)
+ */
+struct WaitPushes : sc::state {
+
+  explicit WaitPushes(my_context ctx);
+
+  using reactions = mpl::list>;
+
+  sc::result react(const ActivePushesUpd&);
+};
+
+struct WaitLastUpdate : sc::state {
+
+  explicit WaitLastUpdate(my_context ctx);
+
+  void on_new_updates(const UpdatesApplied&);
+
+  using reactions = mpl::list,
+    sc::in_state_reaction>;
+
+  sc::result react(const InternalAllUpdates&);
+};
+
+struct BuildMap : sc::state {
+  explicit BuildMap(my_context ctx);
+
+  // possible error scenarios:
+  // - an error reported by the backend will trigger an 'InternalError' event,
+  //   handled by our parent state;
+  // - if preempted, we switch to DrainReplMaps, where we will wait for all
+  //   replicas to send their maps before acknowledging the preemption;
+  // - an interval change will be handled by the relevant 'send-event' functions,
+  //   and will be translated into a 'FullReset' event.
+ using reactions = + mpl::list, + sc::transition, // looping, waiting + // for the backend to + // finish + sc::custom_reaction>; + + sc::result react(const IntLocalMapDone&); +}; + +/* + * "drain" scrub-maps responses from replicas + */ +struct DrainReplMaps : sc::state { + explicit DrainReplMaps(my_context ctx); + + using reactions = + mpl::list // all replicas are accounted for + >; + + sc::result react(const GotReplicas&); +}; + +struct WaitReplicas : sc::state { + explicit WaitReplicas(my_context ctx); + + using reactions = + mpl::list, // all replicas are accounted for + sc::transition, + sc::deferral // might arrive before we've reached WDU + >; + + sc::result react(const GotReplicas&); + + bool all_maps_already_called{false}; // see comment in react code +}; + +struct WaitDigestUpdate : sc::state { + explicit WaitDigestUpdate(my_context ctx); + + using reactions = mpl::list, + sc::custom_reaction, + sc::transition>; + sc::result react(const DigestUpdate&); + sc::result react(const ScrubFinished&); +}; + +// ----------------------------- the "replica active" states ----------------------- + +/* + * Waiting for 'active_pushes' to complete + * + * When in this state: + * - the details of the Primary's request were internalized by PgScrubber; + * - 'active' scrubbing is set + */ +struct ReplicaWaitUpdates : sc::state { + explicit ReplicaWaitUpdates(my_context ctx); + using reactions = + mpl::list, sc::custom_reaction>; + + sc::result react(const ReplicaPushesUpd&); + sc::result react(const FullReset&); +}; + + +struct ActiveReplica : sc::state { + explicit ActiveReplica(my_context ctx); + using reactions = mpl::list, + sc::custom_reaction, + sc::transition>; + + sc::result react(const SchedReplica&); + sc::result react(const FullReset&); +}; + +} // namespace Scrub diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h new file mode 100644 index 000000000..8d9622b9b --- /dev/null +++ b/src/osd/scrub_machine_lstnr.h @@ -0,0 +1,164 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +/** + * \file the PgScrubber interface used by the scrub FSM + */ +#include "common/version.h" +#include "include/Context.h" + +#include "osd_types.h" + +namespace Scrub { + +enum class PreemptionNoted { no_preemption, preempted }; + +/// the interface exposed by the PgScrubber into its internal +/// preemption_data object +struct preemption_t { + + virtual ~preemption_t() = default; + + [[nodiscard]] virtual bool is_preemptable() const = 0; + + [[nodiscard]] virtual bool was_preempted() const = 0; + + virtual void adjust_parameters() = 0; + + /** + * Try to preempt the scrub. + * 'true' (i.e. - preempted) if: + * preemptable && not already preempted + */ + virtual bool do_preempt() = 0; + + /** + * disables preemptions. 
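+   * Apparently used as a test-and-set by the Primary - see
+   * WaitReplicas::react(const GotReplicas&) - once all replica maps have arrived,
+   * so that the chunk can no longer be preempted while the maps are being compared.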
+ * Returns 'true' if we were already preempted + */ + virtual bool disable_and_test() = 0; +}; + +} // namespace Scrub + +struct ScrubMachineListener { + + struct MsgAndEpoch { + MessageRef m_msg; + epoch_t m_epoch; + }; + + virtual ~ScrubMachineListener() = default; + + virtual void select_range_n_notify() = 0; + + [[nodiscard]] virtual bool is_primary() const = 0; + + /// walk the log to find the latest update that affects our chunk + virtual eversion_t search_log_for_updates() const = 0; + + virtual eversion_t get_last_update_applied() const = 0; + + virtual int pending_active_pushes() const = 0; + + virtual int build_primary_map_chunk() = 0; + + virtual int build_replica_map_chunk() = 0; + + virtual void on_init() = 0; + + virtual void on_replica_init() = 0; + + virtual void replica_handling_done() = 0; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services + /// (thus can be called from FSM reactions) + virtual void clear_pgscrub_state() = 0; + + /* + * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep' + * is asserted - after a configuration-dependent timeout. + */ + virtual void add_delayed_scheduling() = 0; + + /** + * Ask all replicas for their scrub maps for the current chunk. + */ + virtual void get_replicas_maps(bool replica_can_preempt) = 0; + + virtual void on_digest_updates() = 0; + + virtual void scrub_begin() = 0; + + /// the part that actually finalizes a scrub + virtual void scrub_finish() = 0; + + /** + * Prepare a MOSDRepScrubMap message carrying the requested scrub map + * @param was_preempted - were we preempted? + * @return the message, and the current value of 'm_replica_min_epoch' (which is + * used when sending the message, but will be overwritten before that). + */ + [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg( + Scrub::PreemptionNoted was_preempted) = 0; + + /** + * Send to the primary the pre-prepared message containing the requested map + */ + virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0; + + /** + * Let the primary know that we were preempted while trying to build the + * requested map. + */ + virtual void send_preempted_replica() = 0; + + [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0; + + virtual void set_subset_last_update(eversion_t e) = 0; + + [[nodiscard]] virtual bool was_epoch_changed() const = 0; + + virtual Scrub::preemption_t& get_preemptor() = 0; + + /** + * a "technical" collection of the steps performed once all + * rep maps are available: + * - the maps are compared + * - the scrub region markers (start_ & end_) are advanced + * - callbacks and ops that were pending are allowed to run + */ + virtual void maps_compare_n_cleanup() = 0; + + /** + * order the PgScrubber to initiate the process of reserving replicas' scrub + * resources. + */ + virtual void reserve_replicas() = 0; + + virtual void unreserve_replicas() = 0; + + /** + * Manipulate the 'I am being scrubbed now' Scrubber's flag + */ + virtual void set_queued_or_active() = 0; + virtual void clear_queued_or_active() = 0; + + /** + * the FSM interface into the "are we waiting for maps, either our own or from + * replicas" state. 
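+   * (in this listing, mark_local_map_ready() is driven from the BuildMap state,
+   *  while the query side is used by WaitReplicas and DrainReplMaps - see
+   *  scrub_machine.cc)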
+ * The FSM can only: + * - mark the local map as available, and + * - query status + */ + virtual void mark_local_map_ready() = 0; + + [[nodiscard]] virtual bool are_all_maps_available() const = 0; + + /// a log/debug interface + virtual std::string dump_awaited_maps() const = 0; + + /// exposed to be used by the scrub_machine logger + virtual std::ostream& gen_prefix(std::ostream& out) const = 0; +}; diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h new file mode 100644 index 000000000..65014b594 --- /dev/null +++ b/src/osd/scrubber_common.h @@ -0,0 +1,299 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include "common/scrub_types.h" +#include "include/types.h" +#include "os/ObjectStore.h" + +#include "OpRequest.h" + +namespace ceph { +class Formatter; +} + +namespace Scrub { + +/// high/low OP priority +enum class scrub_prio_t : bool { low_priority = false, high_priority = true }; + +/// Identifies a specific scrub activation within an interval, +/// see ScrubPGgIF::m_current_token +using act_token_t = uint32_t; + +} // namespace Scrub + + +/** + * Flags affecting the scheduling and behaviour of the *next* scrub. + * + * we hold two of these flag collections: one + * for the next scrub, and one frozen at initiation (i.e. in pg::queue_scrub()) + */ +struct requested_scrub_t { + + // flags to indicate explicitly requested scrubs (by admin): + // bool must_scrub, must_deep_scrub, must_repair, need_auto; + + /** + * 'must_scrub' is set by an admin command (or by need_auto). + * Affects the priority of the scrubbing, and the sleep periods + * during the scrub. + */ + bool must_scrub{false}; + + /** + * scrub must not be aborted. + * Set for explicitly requested scrubs, and for scrubs originated by the pairing + * process with the 'repair' flag set (in the RequestScrub event). + * + * Will be copied into the 'required' scrub flag upon scrub start. + */ + bool req_scrub{false}; + + /** + * Set from: + * - scrub_requested() with need_auto param set, which only happens in + * - scrub_finish() - if deep_scrub_on_error is set, and we have errors + * + * If set, will prevent the OSD from casually postponing our scrub. When scrubbing + * starts, will cause must_scrub, must_deep_scrub and auto_repair to be set. + */ + bool need_auto{false}; + + /** + * Set for scrub-after-recovery just before we initiate the recovery deep scrub, + * or if scrub_requested() was called with either need_auto ot repair. + * Affects PG_STATE_DEEP_SCRUB. + */ + bool must_deep_scrub{false}; + + /** + * (An intermediary flag used by pg::sched_scrub() on the first time + * a planned scrub has all its resources). Determines whether the next + * repair/scrub will be 'deep'. + * + * Note: 'dumped' by PgScrubber::dump() and such. In reality, being a + * temporary that is set and reset by the same operation, will never + * appear externally to be set + */ + bool time_for_deep{false}; + + bool deep_scrub_on_error{false}; + + /** + * If set, we should see must_deep_scrub and must_repair set, too + * + * - 'must_repair' is checked by the OSD when scheduling the scrubs. + * - also checked & cleared at pg::queue_scrub() + */ + bool must_repair{false}; + + /* + * the value of auto_repair is determined in sched_scrub() (once per scrub. previous + * value is not remembered). Set if + * - allowed by configuration and backend, and + * - must_scrub is not set (i.e. 
- this is a periodic scrub), + * - time_for_deep was just set + */ + bool auto_repair{false}; + + /** + * indicating that we are scrubbing post repair to verify everything is fixed. + * Otherwise - PG_STATE_FAILED_REPAIR will be asserted. + */ + bool check_repair{false}; +}; + +ostream& operator<<(ostream& out, const requested_scrub_t& sf); + +/** + * The interface used by the PG when requesting scrub-related info or services + */ +struct ScrubPgIF { + + virtual ~ScrubPgIF() = default; + + friend ostream& operator<<(ostream& out, const ScrubPgIF& s) { return s.show(out); } + + virtual ostream& show(ostream& out) const = 0; + + // --------------- triggering state-machine events: + + virtual void initiate_regular_scrub(epoch_t epoch_queued) = 0; + + virtual void initiate_scrub_after_repair(epoch_t epoch_queued) = 0; + + virtual void send_scrub_resched(epoch_t epoch_queued) = 0; + + virtual void active_pushes_notification(epoch_t epoch_queued) = 0; + + virtual void update_applied_notification(epoch_t epoch_queued) = 0; + + virtual void digest_update_notification(epoch_t epoch_queued) = 0; + + virtual void send_scrub_unblock(epoch_t epoch_queued) = 0; + + virtual void send_replica_maps_ready(epoch_t epoch_queued) = 0; + + virtual void send_replica_pushes_upd(epoch_t epoch_queued) = 0; + + virtual void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0; + + virtual void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0; + + virtual void on_applied_when_primary(const eversion_t &applied_version) = 0; + + virtual void send_full_reset(epoch_t epoch_queued) = 0; + + virtual void send_chunk_free(epoch_t epoch_queued) = 0; + + virtual void send_chunk_busy(epoch_t epoch_queued) = 0; + + virtual void send_local_map_done(epoch_t epoch_queued) = 0; + + virtual void send_get_next_chunk(epoch_t epoch_queued) = 0; + + virtual void send_scrub_is_finished(epoch_t epoch_queued) = 0; + + virtual void send_maps_compared(epoch_t epoch_queued) = 0; + + // -------------------------------------------------- + + [[nodiscard]] virtual bool are_callbacks_pending() + const = 0; // currently only used for an assert + + /** + * the scrubber is marked 'active': + * - for the primary: when all replica OSDs grant us the requested resources + * - for replicas: upon receiving the scrub request from the primary + */ + [[nodiscard]] virtual bool is_scrub_active() const = 0; + + /** + * 'true' until after the FSM processes the 'scrub-finished' event, + * and scrubbing is completely cleaned-up. + * + * In other words - holds longer than is_scrub_active(), thus preventing + * a rescrubbing of the same PG while the previous scrub has not fully + * terminated. + */ + [[nodiscard]] virtual bool is_queued_or_active() const = 0; + + /** + * Manipulate the 'scrubbing request has been queued, or - we are + * actually scrubbing' Scrubber's flag + */ + virtual void set_queued_or_active() = 0; + virtual void clear_queued_or_active() = 0; + + /// are we waiting for resource reservation grants form our replicas? 
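+  /// (i.e. - is the scrub FSM currently in its ReservingReplicas state? see
+  /// ScrubMachine::is_reserving() in scrub_machine.cc)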
+ [[nodiscard]] virtual bool is_reserving() const = 0; + + /// handle a message carrying a replica map + virtual void map_from_replica(OpRequestRef op) = 0; + + virtual void replica_scrub_op(OpRequestRef op) = 0; + + virtual void set_op_parameters(requested_scrub_t&) = 0; + + virtual void scrub_clear_state() = 0; + + virtual void handle_query_state(ceph::Formatter* f) = 0; + + virtual void dump(ceph::Formatter* f) const = 0; + + /** + * Return true if soid is currently being scrubbed and pending IOs should block. + * May have a side effect of preempting an in-progress scrub -- will return false + * in that case. + * + * @param soid object to check for ongoing scrub + * @return boolean whether a request on soid should block until scrub completion + */ + virtual bool write_blocked_by_scrub(const hobject_t& soid) = 0; + + /// Returns whether any objects in the range [begin, end] are being scrubbed + virtual bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) = 0; + + /// the op priority, taken from the primary's request message + virtual Scrub::scrub_prio_t replica_op_priority() const = 0; + + /// the priority of the on-going scrub (used when requeuing events) + virtual unsigned int scrub_requeue_priority( + Scrub::scrub_prio_t with_priority) const = 0; + virtual unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const = 0; + + virtual void add_callback(Context* context) = 0; + + /// add to scrub statistics, but only if the soid is below the scrub start + virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) = 0; + + /** + * the version of 'scrub_clear_state()' that does not try to invoke FSM services + * (thus can be called from FSM reactions) + */ + virtual void clear_pgscrub_state() = 0; + + /** + * triggers the 'RemotesReserved' (all replicas granted scrub resources) + * state-machine event + */ + virtual void send_remotes_reserved(epoch_t epoch_queued) = 0; + + /** + * triggers the 'ReservationFailure' (at least one replica denied us the requested + * resources) state-machine event + */ + virtual void send_reservation_failure(epoch_t epoch_queued) = 0; + + virtual void cleanup_store(ObjectStore::Transaction* t) = 0; + + virtual bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const = 0; + + // --------------- reservations ----------------------------------- + + /** + * message all replicas with a request to "unreserve" scrub + */ + virtual void unreserve_replicas() = 0; + + /** + * "forget" all replica reservations. No messages are sent to the + * previously-reserved. + * + * Used upon interval change. The replicas' state is guaranteed to + * be reset separately by the interval-change event. + */ + virtual void discard_replica_reservations() = 0; + + /** + * clear both local and OSD-managed resource reservation flags + */ + virtual void clear_scrub_reservations() = 0; + + /** + * Reserve local scrub resources (managed by the OSD) + * + * Fails if OSD's local-scrubs budget was exhausted + * \returns were local resources reserved? 
+ */ + virtual bool reserve_local() = 0; + + // on the replica: + virtual void handle_scrub_reserve_request(OpRequestRef op) = 0; + virtual void handle_scrub_reserve_release(OpRequestRef op) = 0; + + // and on the primary: + virtual void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) = 0; + virtual void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) = 0; + + virtual void reg_next_scrub(const requested_scrub_t& request_flags) = 0; + virtual void unreg_next_scrub() = 0; + virtual void scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) = 0; +}; -- cgit v1.2.3