author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000
commit    | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree      | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/osd
parent    | Initial commit. (diff)
Adding upstream version 14.2.21. (upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/osd')
60 files changed, 85484 insertions, 0 deletions
diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt new file mode 100644 index 00000000..a1b16b59 --- /dev/null +++ b/src/osd/CMakeLists.txt @@ -0,0 +1,73 @@ +set(osdc_osd_srcs + ${CMAKE_SOURCE_DIR}/src/osdc/Objecter.cc + ${CMAKE_SOURCE_DIR}/src/osdc/Striper.cc) + +if(WITH_OSD_INSTRUMENT_FUNCTIONS AND CMAKE_CXX_COMPILER_ID STREQUAL GNU) + set(GCC_C_FLAGS "-finstrument-functions") + set(GCC_C_FLAGS "${GCC_C_FLAGS} -finstrument-functions-exclude-function-list=_mm_loadu_si128,_mm_cmpeq_epi32,_mm_movemask_epi8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_C_FLAGS}") + set(osd_cyg_functions_src ${CMAKE_SOURCE_DIR}/src/tracing/cyg_profile_functions.c) +endif() + +set(osd_srcs + OSD.cc + Watch.cc + ClassHandler.cc + PG.cc + PGLog.cc + PrimaryLogPG.cc + ReplicatedBackend.cc + ECBackend.cc + ECTransaction.cc + PGBackend.cc + OSDCap.cc + Watch.cc + ClassHandler.cc + Session.cc + SnapMapper.cc + ScrubStore.cc + osd_types.cc + ECUtil.cc + ExtentCache.cc + mClockOpClassSupport.cc + mClockOpClassQueue.cc + mClockClientQueue.cc + OpQueueItem.cc + ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc + ${CMAKE_SOURCE_DIR}/src/objclass/class_api.cc + ${CMAKE_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc + ${osd_cyg_functions_src} + ${osdc_osd_srcs}) +if(HAS_VTA) + set_source_files_properties(osdcap.cc + PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) +endif() +add_library(osd STATIC ${osd_srcs} + $<TARGET_OBJECTS:global_common_objs>) +target_link_libraries(osd + PUBLIC dmclock::dmclock + PRIVATE + ${LEVELDB_LIBRARIES} + heap_profiler cpu_profiler ${CMAKE_DL_LIBS}) +if(WITH_LTTNG) + add_dependencies(osd osd-tp pg-tp) +endif() +if(WITH_LTTNG AND WITH_EVENTTRACE) + add_dependencies(osd eventtrace_tp) +endif() +if(WITH_OSD_INSTRUMENT_FUNCTIONS) + add_dependencies(osd cyg_profile_tp) +endif() + +# libcls_* are runtime dependencies +add_dependencies(osd cls_journal cls_hello cls_lock cls_log cls_numops + cls_refcount cls_timeindex cls_user cls_version cls_cas) +if(WITH_CEPHFS) + add_dependencies(osd cls_cephfs) +endif() +if(WITH_RBD) + add_dependencies(osd cls_rbd) +endif() +if(WITH_RADOSGW) + add_dependencies(osd cls_otp cls_rgw) +endif() diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc new file mode 100644 index 00000000..1b7f6686 --- /dev/null +++ b/src/osd/ClassHandler.cc @@ -0,0 +1,340 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "ClassHandler.h" +#include "common/errno.h" +#include "common/ceph_context.h" + +#include <dlfcn.h> + +#include <map> + +#if defined(__FreeBSD__) +#include <sys/param.h> +#endif + +#include "common/config.h" +#include "common/debug.h" + +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +#define CLS_PREFIX "libcls_" +#define CLS_SUFFIX ".so" + + +void ClassHandler::add_embedded_class(const string& cname) +{ + ceph_assert(mutex.is_locked()); + ClassData *cls = _get_class(cname, false); + ceph_assert(cls->status == ClassData::CLASS_UNKNOWN); + cls->status = ClassData::CLASS_INITIALIZING; +} + +int ClassHandler::open_class(const string& cname, ClassData **pcls) +{ + std::lock_guard lock(mutex); + ClassData *cls = _get_class(cname, true); + if (!cls) + return -EPERM; + if (cls->status != ClassData::CLASS_OPEN) { + int r = _load_class(cls); + if (r) + return r; + } + *pcls = cls; + return 0; +} + +int ClassHandler::open_all_classes() +{ + ldout(cct, 10) << __func__ << dendl; + DIR *dir = ::opendir(cct->_conf->osd_class_dir.c_str()); 
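The loop that follows scans osd_class_dir for plugins named with the CLS_PREFIX/CLS_SUFFIX convention defined above (libcls_<name>.so). A minimal standalone sketch of that filename-to-class-name mapping, separate from the commit and using only the standard library:

#include <optional>
#include <string>

// "libcls_rbd.so" -> "rbd"; returns nullopt for anything that does not match.
static std::optional<std::string> class_name_from_file(const std::string& fname)
{
  const std::string prefix = "libcls_";  // CLS_PREFIX
  const std::string suffix = ".so";      // CLS_SUFFIX
  if (fname.size() <= prefix.size() + suffix.size() ||
      fname.compare(0, prefix.size(), prefix) != 0 ||
      fname.compare(fname.size() - suffix.size(), suffix.size(), suffix) != 0)
    return std::nullopt;
  return fname.substr(prefix.size(),
                      fname.size() - prefix.size() - suffix.size());
}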
+ if (!dir) + return -errno; + + struct dirent *pde = nullptr; + int r = 0; + while ((pde = ::readdir(dir))) { + if (pde->d_name[0] == '.') + continue; + if (strlen(pde->d_name) > sizeof(CLS_PREFIX) - 1 + sizeof(CLS_SUFFIX) - 1 && + strncmp(pde->d_name, CLS_PREFIX, sizeof(CLS_PREFIX) - 1) == 0 && + strcmp(pde->d_name + strlen(pde->d_name) - (sizeof(CLS_SUFFIX) - 1), CLS_SUFFIX) == 0) { + char cname[PATH_MAX + 1]; + strncpy(cname, pde->d_name + sizeof(CLS_PREFIX) - 1, sizeof(cname) -1); + cname[strlen(cname) - (sizeof(CLS_SUFFIX) - 1)] = '\0'; + ldout(cct, 10) << __func__ << " found " << cname << dendl; + ClassData *cls; + // skip classes that aren't in 'osd class load list' + r = open_class(cname, &cls); + if (r < 0 && r != -EPERM) + goto out; + } + } + out: + closedir(dir); + return r; +} + +void ClassHandler::shutdown() +{ + for (auto& cls : classes) { + if (cls.second.handle) { + dlclose(cls.second.handle); + } + } + classes.clear(); +} + +/* + * Check if @cname is in the whitespace delimited list @list, or the @list + * contains the wildcard "*". + * + * This is expensive but doesn't consume memory for an index, and is performed + * only once when a class is loaded. + */ +bool ClassHandler::in_class_list(const std::string& cname, + const std::string& list) +{ + std::istringstream ss(list); + std::istream_iterator<std::string> begin{ss}; + std::istream_iterator<std::string> end{}; + + const std::vector<std::string> targets{cname, "*"}; + + auto it = std::find_first_of(begin, end, + targets.begin(), targets.end()); + + return it != end; +} + +ClassHandler::ClassData *ClassHandler::_get_class(const string& cname, + bool check_allowed) +{ + ClassData *cls; + map<string, ClassData>::iterator iter = classes.find(cname); + + if (iter != classes.end()) { + cls = &iter->second; + } else { + if (check_allowed && !in_class_list(cname, cct->_conf->osd_class_load_list)) { + ldout(cct, 0) << "_get_class not permitted to load " << cname << dendl; + return NULL; + } + cls = &classes[cname]; + ldout(cct, 10) << "_get_class adding new class name " << cname << " " << cls << dendl; + cls->name = cname; + cls->handler = this; + cls->whitelisted = in_class_list(cname, cct->_conf->osd_class_default_list); + } + return cls; +} + +int ClassHandler::_load_class(ClassData *cls) +{ + // already open + if (cls->status == ClassData::CLASS_OPEN) + return 0; + + if (cls->status == ClassData::CLASS_UNKNOWN || + cls->status == ClassData::CLASS_MISSING) { + char fname[PATH_MAX]; + snprintf(fname, sizeof(fname), "%s/" CLS_PREFIX "%s" CLS_SUFFIX, + cct->_conf->osd_class_dir.c_str(), + cls->name.c_str()); + ldout(cct, 10) << "_load_class " << cls->name << " from " << fname << dendl; + + cls->handle = dlopen(fname, RTLD_NOW); + if (!cls->handle) { + struct stat st; + int r = ::stat(fname, &st); + if (r < 0) { + r = -errno; + ldout(cct, 0) << __func__ << " could not stat class " << fname + << ": " << cpp_strerror(r) << dendl; + } else { + ldout(cct, 0) << "_load_class could not open class " << fname + << " (dlopen failed): " << dlerror() << dendl; + r = -EIO; + } + cls->status = ClassData::CLASS_MISSING; + return r; + } + + cls_deps_t *(*cls_deps)(); + cls_deps = (cls_deps_t *(*)())dlsym(cls->handle, "class_deps"); + if (cls_deps) { + cls_deps_t *deps = cls_deps(); + while (deps) { + if (!deps->name) + break; + ClassData *cls_dep = _get_class(deps->name, false); + cls->dependencies.insert(cls_dep); + if (cls_dep->status != ClassData::CLASS_OPEN) + cls->missing_dependencies.insert(cls_dep); + deps++; + } + } + } + + // 
resolve dependencies + set<ClassData*>::iterator p = cls->missing_dependencies.begin(); + while (p != cls->missing_dependencies.end()) { + ClassData *dc = *p; + int r = _load_class(dc); + if (r < 0) { + cls->status = ClassData::CLASS_MISSING_DEPS; + return r; + } + + ldout(cct, 10) << "_load_class " << cls->name << " satisfied dependency " << dc->name << dendl; + cls->missing_dependencies.erase(p++); + } + + // initialize + void (*cls_init)() = (void (*)())dlsym(cls->handle, "__cls_init"); + if (cls_init) { + cls->status = ClassData::CLASS_INITIALIZING; + cls_init(); + } + + ldout(cct, 10) << "_load_class " << cls->name << " success" << dendl; + cls->status = ClassData::CLASS_OPEN; + return 0; +} + + + +ClassHandler::ClassData *ClassHandler::register_class(const char *cname) +{ + ceph_assert(mutex.is_locked()); + + ClassData *cls = _get_class(cname, false); + ldout(cct, 10) << "register_class " << cname << " status " << cls->status << dendl; + + if (cls->status != ClassData::CLASS_INITIALIZING) { + ldout(cct, 0) << "class " << cname << " isn't loaded; is the class registering under the wrong name?" << dendl; + return NULL; + } + return cls; +} + +void ClassHandler::unregister_class(ClassHandler::ClassData *cls) +{ + /* FIXME: do we really need this one? */ +} + +ClassHandler::ClassMethod *ClassHandler::ClassData::register_method(const char *mname, + int flags, + cls_method_call_t func) +{ + /* no need for locking, called under the class_init mutex */ + if (!flags) { + lderr(handler->cct) << "register_method " << name << "." << mname + << " flags " << flags << " " << (void*)func + << " FAILED -- flags must be non-zero" << dendl; + return NULL; + } + ldout(handler->cct, 10) << "register_method " << name << "." << mname << " flags " << flags << " " << (void*)func << dendl; + ClassMethod& method = methods_map[mname]; + method.func = func; + method.name = mname; + method.flags = flags; + method.cls = this; + return &method; +} + +ClassHandler::ClassMethod *ClassHandler::ClassData::register_cxx_method(const char *mname, + int flags, + cls_method_cxx_call_t func) +{ + /* no need for locking, called under the class_init mutex */ + ldout(handler->cct, 10) << "register_cxx_method " << name << "." 
<< mname << " flags " << flags << " " << (void*)func << dendl; + ClassMethod& method = methods_map[mname]; + method.cxx_func = func; + method.name = mname; + method.flags = flags; + method.cls = this; + return &method; +} + +ClassHandler::ClassFilter *ClassHandler::ClassData::register_cxx_filter( + const std::string &filter_name, + cls_cxx_filter_factory_t fn) +{ + ClassFilter &filter = filters_map[filter_name]; + filter.fn = fn; + filter.name = filter_name; + filter.cls = this; + return &filter; +} + +ClassHandler::ClassMethod *ClassHandler::ClassData::_get_method(const char *mname) +{ + map<string, ClassHandler::ClassMethod>::iterator iter = methods_map.find(mname); + if (iter == methods_map.end()) + return NULL; + return &(iter->second); +} + +int ClassHandler::ClassData::get_method_flags(const char *mname) +{ + std::lock_guard l(handler->mutex); + ClassMethod *method = _get_method(mname); + if (!method) + return -ENOENT; + return method->flags; +} + +void ClassHandler::ClassData::unregister_method(ClassHandler::ClassMethod *method) +{ + /* no need for locking, called under the class_init mutex */ + map<string, ClassMethod>::iterator iter = methods_map.find(method->name); + if (iter == methods_map.end()) + return; + methods_map.erase(iter); +} + +void ClassHandler::ClassMethod::unregister() +{ + cls->unregister_method(this); +} + +void ClassHandler::ClassData::unregister_filter(ClassHandler::ClassFilter *filter) +{ + /* no need for locking, called under the class_init mutex */ + map<string, ClassFilter>::iterator iter = filters_map.find(filter->name); + if (iter == filters_map.end()) + return; + filters_map.erase(iter); +} + +void ClassHandler::ClassFilter::unregister() +{ + cls->unregister_filter(this); +} + +int ClassHandler::ClassMethod::exec(cls_method_context_t ctx, bufferlist& indata, bufferlist& outdata) +{ + int ret; + if (cxx_func) { + // C++ call version + ret = cxx_func(ctx, &indata, &outdata); + } else { + // C version + char *out = NULL; + int olen = 0; + ret = func(ctx, indata.c_str(), indata.length(), &out, &olen); + if (out) { + // assume *out was allocated via cls_alloc (which calls malloc!) 
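The comment just above marks the hand-off point for the C-call path: the method writes into a buffer it allocates with malloc(), and claim_malloc() adopts that allocation into the bufferlist without copying so it is freed exactly once. A hedged stand-in for the same ownership transfer in plain C++ (not the Ceph buffer API):

#include <cstdlib>
#include <memory>

// unique_ptr with std::free as the deleter models "claim the malloc'd buffer".
using malloc_buf = std::unique_ptr<char, decltype(&std::free)>;

static malloc_buf claim(char* out)
{
  return malloc_buf(out, &std::free);  // takes ownership; no copy is made
}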
+ buffer::ptr bp = buffer::claim_malloc(olen, out); + outdata.push_back(bp); + } + } + return ret; +} + diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h new file mode 100644 index 00000000..58a14225 --- /dev/null +++ b/src/osd/ClassHandler.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_CLASSHANDLER_H +#define CEPH_CLASSHANDLER_H + +#include "include/types.h" +#include "objclass/objclass.h" +#include "common/Mutex.h" + +//forward declaration +class CephContext; + +class ClassHandler +{ +public: + CephContext *cct; + + struct ClassData; + + struct ClassMethod { + struct ClassHandler::ClassData *cls; + string name; + int flags; + cls_method_call_t func; + cls_method_cxx_call_t cxx_func; + + int exec(cls_method_context_t ctx, bufferlist& indata, bufferlist& outdata); + void unregister(); + + int get_flags() { + std::lock_guard l(cls->handler->mutex); + return flags; + } + + ClassMethod() : cls(0), flags(0), func(0), cxx_func(0) {} + }; + + struct ClassFilter { + struct ClassHandler::ClassData *cls = nullptr; + std::string name; + cls_cxx_filter_factory_t fn; + + void unregister(); + + ClassFilter() : fn(0) + {} + }; + + struct ClassData { + enum Status { + CLASS_UNKNOWN, + CLASS_MISSING, // missing + CLASS_MISSING_DEPS, // missing dependencies + CLASS_INITIALIZING, // calling init() right now + CLASS_OPEN, // initialized, usable + } status; + + string name; + ClassHandler *handler; + void *handle; + + bool whitelisted = false; + + map<string, ClassMethod> methods_map; + map<string, ClassFilter> filters_map; + + set<ClassData *> dependencies; /* our dependencies */ + set<ClassData *> missing_dependencies; /* only missing dependencies */ + + ClassMethod *_get_method(const char *mname); + + ClassData() : status(CLASS_UNKNOWN), + handler(NULL), + handle(NULL) {} + ~ClassData() { } + + ClassMethod *register_method(const char *mname, int flags, cls_method_call_t func); + ClassMethod *register_cxx_method(const char *mname, int flags, cls_method_cxx_call_t func); + void unregister_method(ClassMethod *method); + + ClassFilter *register_cxx_filter( + const std::string &filter_name, + cls_cxx_filter_factory_t fn); + void unregister_filter(ClassFilter *method); + + ClassMethod *get_method(const char *mname) { + std::lock_guard l(handler->mutex); + return _get_method(mname); + } + int get_method_flags(const char *mname); + + ClassFilter *get_filter(const std::string &filter_name) + { + std::lock_guard l(handler->mutex); + std::map<std::string, ClassFilter>::iterator i = filters_map.find(filter_name); + if (i == filters_map.end()) { + return NULL; + } else { + return &(i->second); + } + } + }; + +private: + map<string, ClassData> classes; + + ClassData *_get_class(const string& cname, bool check_allowed); + int _load_class(ClassData *cls); + + static bool in_class_list(const std::string& cname, + const std::string& list); + +public: + Mutex mutex; + + explicit ClassHandler(CephContext *cct_) : cct(cct_), mutex("ClassHandler") {} + + int open_all_classes(); + + void add_embedded_class(const string& cname); + int open_class(const string& cname, ClassData **pcls); + + ClassData *register_class(const char *cname); + void unregister_class(ClassData *cls); + + void shutdown(); +}; + + +#endif diff --git a/src/osd/DynamicPerfStats.h b/src/osd/DynamicPerfStats.h new file mode 100644 index 00000000..aaef8684 --- /dev/null +++ b/src/osd/DynamicPerfStats.h @@ -0,0 +1,267 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef DYNAMIC_PERF_STATS_H +#define DYNAMIC_PERF_STATS_H + +#include "include/random.h" +#include "messages/MOSDOp.h" +#include "mgr/OSDPerfMetricTypes.h" +#include "osd/OSD.h" +#include "osd/OpRequest.h" + +class DynamicPerfStats { +public: + DynamicPerfStats() { + } + + DynamicPerfStats(const std::list<OSDPerfMetricQuery> &queries) { + for (auto &query : queries) { + data[query]; + } + } + + void merge(const DynamicPerfStats &dps) { + for (auto &query_it : dps.data) { + auto &query = query_it.first; + for (auto &key_it : query_it.second) { + auto &key = key_it.first; + auto counter_it = key_it.second.begin(); + auto update_counter_fnc = + [&counter_it](const PerformanceCounterDescriptor &d, + PerformanceCounter *c) { + c->first += counter_it->first; + c->second += counter_it->second; + counter_it++; + }; + + ceph_assert(key_it.second.size() >= data[query][key].size()); + query.update_counters(update_counter_fnc, &data[query][key]); + } + } + } + + void set_queries(const std::list<OSDPerfMetricQuery> &queries) { + std::map<OSDPerfMetricQuery, + std::map<OSDPerfMetricKey, PerformanceCounters>> new_data; + for (auto &query : queries) { + std::swap(new_data[query], data[query]); + } + std::swap(data, new_data); + } + + bool is_enabled() { + return !data.empty(); + } + + void add(const OSDService *osd, const pg_info_t &pg_info, const OpRequest& op, + uint64_t inb, uint64_t outb, const utime_t &latency) { + + auto update_counter_fnc = + [&op, inb, outb, &latency](const PerformanceCounterDescriptor &d, + PerformanceCounter *c) { + ceph_assert(d.is_supported()); + + switch(d.type) { + case PerformanceCounterType::OPS: + c->first++; + return; + case PerformanceCounterType::WRITE_OPS: + if (op.may_write() || op.may_cache()) { + c->first++; + } + return; + case PerformanceCounterType::READ_OPS: + if (op.may_read()) { + c->first++; + } + return; + case PerformanceCounterType::BYTES: + c->first += inb + outb; + return; + case PerformanceCounterType::WRITE_BYTES: + if (op.may_write() || op.may_cache()) { + c->first += inb; + } + return; + case PerformanceCounterType::READ_BYTES: + if (op.may_read()) { + c->first += outb; + } + return; + case PerformanceCounterType::LATENCY: + c->first += latency.to_nsec(); + c->second++; + return; + case PerformanceCounterType::WRITE_LATENCY: + if (op.may_write() || op.may_cache()) { + c->first += latency.to_nsec(); + c->second++; + } + return; + case PerformanceCounterType::READ_LATENCY: + if (op.may_read()) { + c->first += latency.to_nsec(); + c->second++; + } + return; + default: + ceph_abort_msg("unknown counter type"); + } + }; + + auto get_subkey_fnc = + [&osd, &pg_info, &op](const OSDPerfMetricSubKeyDescriptor &d, + OSDPerfMetricSubKey *sub_key) { + ceph_assert(d.is_supported()); + + auto m = static_cast<const MOSDOp*>(op.get_req()); + std::string match_string; + switch(d.type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + match_string = stringify(m->get_reqid().name); + break; + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + match_string = stringify(m->get_connection()->get_peer_addr()); + break; + case OSDPerfMetricSubKeyType::POOL_ID: + match_string = stringify(m->get_spg().pool()); + break; + case OSDPerfMetricSubKeyType::NAMESPACE: + match_string = m->get_hobj().nspace; + break; + case OSDPerfMetricSubKeyType::OSD_ID: + match_string = stringify(osd->get_nodeid()); + break; + case OSDPerfMetricSubKeyType::PG_ID: + match_string = stringify(pg_info.pgid); + break; + case 
OSDPerfMetricSubKeyType::OBJECT_NAME: + match_string = m->get_oid().name; + break; + case OSDPerfMetricSubKeyType::SNAP_ID: + match_string = stringify(m->get_snapid()); + break; + default: + ceph_abort_msg("unknown counter type"); + } + + std::smatch match; + if (!std::regex_search(match_string, match, d.regex)) { + return false; + } + if (match.size() <= 1) { + return false; + } + for (size_t i = 1; i < match.size(); i++) { + sub_key->push_back(match[i].str()); + } + return true; + }; + + for (auto &it : data) { + auto &query = it.first; + OSDPerfMetricKey key; + if (query.get_key(get_subkey_fnc, &key)) { + query.update_counters(update_counter_fnc, &it.second[key]); + } + } + } + + void add_to_reports( + const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &limits, + std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) { + for (auto &it : data) { + auto &query = it.first; + auto limit_it = limits.find(query); + if (limit_it == limits.end()) { + continue; + } + auto &query_limits = limit_it->second; + auto &counters = it.second; + auto &report = (*reports)[query]; + + query.get_performance_counter_descriptors( + &report.performance_counter_descriptors); + + auto &descriptors = report.performance_counter_descriptors; + ceph_assert(descriptors.size() > 0); + + if (!is_limited(query_limits, counters.size())) { + for (auto &it_counters : counters) { + auto &bl = report.group_packed_performance_counters[it_counters.first]; + query.pack_counters(it_counters.second, &bl); + } + continue; + } + + for (auto &limit : query_limits) { + size_t index = 0; + for (; index < descriptors.size(); index++) { + if (descriptors[index] == limit.order_by) { + break; + } + } + if (index == descriptors.size()) { + // should not happen + continue; + } + + // Weighted Random Sampling (Algorithm A-Chao): + // Select the first [0, max_count) samples, randomly replace + // with samples from [max_count, end) using weighted + // probability, and return [0, max_count) as the result. 
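The comment above names Algorithm A-Chao, which the code that follows applies to the counters map. A self-contained sketch of the same weighted reservoir sampling idea, assuming only the standard library (indices and weights stand in for the counter iterators; this is an illustration, not the code from the diff):

#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

std::vector<size_t> weighted_sample(const std::vector<uint64_t>& weights,
                                    size_t max_count)
{
  if (max_count == 0)
    return {};
  std::mt19937_64 rng{std::random_device{}()};
  std::vector<size_t> reservoir;
  reservoir.reserve(max_count);
  long double wsum = 0;
  for (size_t i = 0; i < weights.size(); ++i) {
    wsum += weights[i];
    if (reservoir.size() < max_count) {
      reservoir.push_back(i);                       // keep the first max_count
    } else if (wsum > 0 &&
               std::uniform_real_distribution<long double>(0, 1)(rng) <
                   weights[i] / wsum) {
      std::uniform_int_distribution<size_t> pick(0, max_count - 1);
      reservoir[pick(rng)] = i;                     // replace a uniform victim
    }
  }
  return reservoir;
}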
+ + ceph_assert(limit.max_count < counters.size()); + typedef std::map<OSDPerfMetricKey, PerformanceCounters>::iterator + Iterator; + std::vector<Iterator> counter_iterators; + counter_iterators.reserve(limit.max_count); + + Iterator it_counters = counters.begin(); + uint64_t wsum = 0; + for (size_t i = 0; i < limit.max_count; i++) { + wsum += it_counters->second[index].first; + counter_iterators.push_back(it_counters++); + } + for (; it_counters != counters.end(); it_counters++) { + wsum += it_counters->second[index].first; + if (ceph::util::generate_random_number(0, wsum) <= + it_counters->second[index].first) { + auto i = ceph::util::generate_random_number(0, limit.max_count - 1); + counter_iterators[i] = it_counters; + } + } + + for (auto it_counters : counter_iterators) { + auto &bl = + report.group_packed_performance_counters[it_counters->first]; + if (bl.length() == 0) { + query.pack_counters(it_counters->second, &bl); + } + } + } + } + } + +private: + static bool is_limited(const OSDPerfMetricLimits &limits, + size_t counters_size) { + if (limits.empty()) { + return false; + } + + for (auto &limit : limits) { + if (limit.max_count >= counters_size) { + return false; + } + } + + return true; + } + + std::map<OSDPerfMetricQuery, + std::map<OSDPerfMetricKey, PerformanceCounters>> data; +}; + +#endif // DYNAMIC_PERF_STATS_H diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc new file mode 100644 index 00000000..36a77cc7 --- /dev/null +++ b/src/osd/ECBackend.cc @@ -0,0 +1,2566 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <iostream> +#include <sstream> + +#include "ECBackend.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDECSubOpWrite.h" +#include "messages/MOSDECSubOpWriteReply.h" +#include "messages/MOSDECSubOpRead.h" +#include "messages/MOSDECSubOpReadReply.h" +#include "ECMsgTypes.h" + +#include "PrimaryLogPG.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, ECBackend *pgb) { + return pgb->get_parent()->gen_dbg_prefix(*_dout); +} + +struct ECRecoveryHandle : public PGBackend::RecoveryHandle { + list<ECBackend::RecoveryOp> ops; +}; + +ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs) { + switch (rhs.pipeline_state) { + case ECBackend::pipeline_state_t::CACHE_VALID: + return lhs << "CACHE_VALID"; + case ECBackend::pipeline_state_t::CACHE_INVALID: + return lhs << "CACHE_INVALID"; + default: + ceph_abort_msg("invalid pipeline state"); + } + return lhs; // unreachable +} + +static ostream &operator<<(ostream &lhs, const map<pg_shard_t, bufferlist> &rhs) +{ + lhs << "["; + for (map<pg_shard_t, bufferlist>::const_iterator i = rhs.begin(); + i != rhs.end(); + ++i) { + if (i != rhs.begin()) + lhs << ", "; + lhs << make_pair(i->first, i->second.length()); + } + return lhs << "]"; +} + +static ostream &operator<<(ostream &lhs, const map<int, bufferlist> &rhs) +{ + lhs << "["; + for (map<int, bufferlist>::const_iterator i = rhs.begin(); + i != rhs.end(); + ++i) { + if (i != rhs.begin()) + lhs << ", "; + lhs << make_pair(i->first, i->second.length()); + } + return lhs << "]"; +} + +static ostream &operator<<( + ostream &lhs, + const boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &rhs) +{ + return lhs << "(" << rhs.get<0>() << ", " + << rhs.get<1>() << ", " << rhs.get<2>() << ")"; +} + +ostream &operator<<(ostream &lhs, const ECBackend::read_request_t &rhs) +{ + return lhs << "read_request_t(to_read=[" << rhs.to_read << "]" + << ", need=" << rhs.need + << ", want_attrs=" << rhs.want_attrs + << ")"; +} + +ostream &operator<<(ostream &lhs, const ECBackend::read_result_t &rhs) +{ + lhs << "read_result_t(r=" << rhs.r + << ", errors=" << rhs.errors; + if (rhs.attrs) { + lhs << ", attrs=" << rhs.attrs.get(); + } else { + lhs << ", noattrs"; + } + return lhs << ", returned=" << rhs.returned << ")"; +} + +ostream &operator<<(ostream &lhs, const ECBackend::ReadOp &rhs) +{ + lhs << "ReadOp(tid=" << rhs.tid; + if (rhs.op && rhs.op->get_req()) { + lhs << ", op="; + rhs.op->get_req()->print(lhs); + } + return lhs << ", to_read=" << rhs.to_read + << ", complete=" << rhs.complete + << ", priority=" << rhs.priority + << ", obj_to_source=" << rhs.obj_to_source + << ", source_to_obj=" << rhs.source_to_obj + << ", in_progress=" << rhs.in_progress << ")"; +} + +void ECBackend::ReadOp::dump(Formatter *f) const +{ + f->dump_unsigned("tid", tid); + if (op && op->get_req()) { + f->dump_stream("op") << *(op->get_req()); + } + f->dump_stream("to_read") << to_read; + f->dump_stream("complete") << complete; + f->dump_int("priority", priority); + f->dump_stream("obj_to_source") << obj_to_source; + f->dump_stream("source_to_obj") << source_to_obj; + f->dump_stream("in_progress") << in_progress; +} + +ostream &operator<<(ostream &lhs, const ECBackend::Op &rhs) +{ + lhs << "Op(" << rhs.hoid + << " v=" << rhs.version + << " tt=" << rhs.trim_to + << " tid=" << rhs.tid + << " reqid=" << 
rhs.reqid; + if (rhs.client_op && rhs.client_op->get_req()) { + lhs << " client_op="; + rhs.client_op->get_req()->print(lhs); + } + lhs << " roll_forward_to=" << rhs.roll_forward_to + << " temp_added=" << rhs.temp_added + << " temp_cleared=" << rhs.temp_cleared + << " pending_read=" << rhs.pending_read + << " remote_read=" << rhs.remote_read + << " remote_read_result=" << rhs.remote_read_result + << " pending_apply=" << rhs.pending_apply + << " pending_commit=" << rhs.pending_commit + << " plan.to_read=" << rhs.plan.to_read + << " plan.will_write=" << rhs.plan.will_write + << ")"; + return lhs; +} + +ostream &operator<<(ostream &lhs, const ECBackend::RecoveryOp &rhs) +{ + return lhs << "RecoveryOp(" + << "hoid=" << rhs.hoid + << " v=" << rhs.v + << " missing_on=" << rhs.missing_on + << " missing_on_shards=" << rhs.missing_on_shards + << " recovery_info=" << rhs.recovery_info + << " recovery_progress=" << rhs.recovery_progress + << " obc refcount=" << rhs.obc.use_count() + << " state=" << ECBackend::RecoveryOp::tostr(rhs.state) + << " waiting_on_pushes=" << rhs.waiting_on_pushes + << " extent_requested=" << rhs.extent_requested + << ")"; +} + +void ECBackend::RecoveryOp::dump(Formatter *f) const +{ + f->dump_stream("hoid") << hoid; + f->dump_stream("v") << v; + f->dump_stream("missing_on") << missing_on; + f->dump_stream("missing_on_shards") << missing_on_shards; + f->dump_stream("recovery_info") << recovery_info; + f->dump_stream("recovery_progress") << recovery_progress; + f->dump_stream("state") << tostr(state); + f->dump_stream("waiting_on_pushes") << waiting_on_pushes; + f->dump_stream("extent_requested") << extent_requested; +} + +ECBackend::ECBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct, + ErasureCodeInterfaceRef ec_impl, + uint64_t stripe_width) + : PGBackend(cct, pg, store, coll, ch), + ec_impl(ec_impl), + sinfo(ec_impl->get_data_chunk_count(), stripe_width) { + ceph_assert((ec_impl->get_data_chunk_count() * + ec_impl->get_chunk_size(stripe_width)) == stripe_width); +} + +PGBackend::RecoveryHandle *ECBackend::open_recovery_op() +{ + return new ECRecoveryHandle; +} + +void ECBackend::_failed_push(const hobject_t &hoid, + pair<RecoveryMessages *, ECBackend::read_result_t &> &in) +{ + ECBackend::read_result_t &res = in.second; + dout(10) << __func__ << ": Read error " << hoid << " r=" + << res.r << " errors=" << res.errors << dendl; + dout(10) << __func__ << ": canceling recovery op for obj " << hoid + << dendl; + ceph_assert(recovery_ops.count(hoid)); + eversion_t v = recovery_ops[hoid].v; + recovery_ops.erase(hoid); + + list<pg_shard_t> fl; + for (auto&& i : res.errors) { + fl.push_back(i.first); + } + get_parent()->failed_push(fl, hoid); + get_parent()->backfill_add_missing(hoid, v); + get_parent()->finish_degraded_object(hoid); +} + +struct OnRecoveryReadComplete : + public GenContext<pair<RecoveryMessages*, ECBackend::read_result_t& > &> { + ECBackend *pg; + hobject_t hoid; + OnRecoveryReadComplete(ECBackend *pg, const hobject_t &hoid) + : pg(pg), hoid(hoid) {} + void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) override { + ECBackend::read_result_t &res = in.second; + if (!(res.r == 0 && res.errors.empty())) { + pg->_failed_push(hoid, in); + return; + } + ceph_assert(res.returned.size() == 1); + pg->handle_recovery_read_complete( + hoid, + res.returned.back(), + res.attrs, + in.first); + } +}; + +struct RecoveryMessages { + map<hobject_t, + ECBackend::read_request_t> 
reads; + map<hobject_t, set<int>> want_to_read; + void read( + ECBackend *ec, + const hobject_t &hoid, uint64_t off, uint64_t len, + set<int> &&_want_to_read, + const map<pg_shard_t, vector<pair<int, int>>> &need, + bool attrs) { + list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read; + to_read.push_back(boost::make_tuple(off, len, 0)); + ceph_assert(!reads.count(hoid)); + want_to_read.insert(make_pair(hoid, std::move(_want_to_read))); + reads.insert( + make_pair( + hoid, + ECBackend::read_request_t( + to_read, + need, + attrs, + new OnRecoveryReadComplete( + ec, + hoid)))); + } + + map<pg_shard_t, vector<PushOp> > pushes; + map<pg_shard_t, vector<PushReplyOp> > push_replies; + ObjectStore::Transaction t; + RecoveryMessages() {} + ~RecoveryMessages(){} +}; + +void ECBackend::handle_recovery_push( + const PushOp &op, + RecoveryMessages *m, + bool is_repair) +{ + if (get_parent()->check_failsafe_full()) { + dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl; + ceph_abort(); + } + + bool oneshot = op.before_progress.first && op.after_progress.data_complete; + ghobject_t tobj; + if (oneshot) { + tobj = ghobject_t(op.soid, ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard); + } else { + tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.soid, + op.version), + ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard); + if (op.before_progress.first) { + dout(10) << __func__ << ": Adding oid " + << tobj.hobj << " in the temp collection" << dendl; + add_temp_obj(tobj.hobj); + } + } + + if (op.before_progress.first) { + m->t.remove(coll, tobj); + m->t.touch(coll, tobj); + } + + if (!op.data_included.empty()) { + uint64_t start = op.data_included.range_start(); + uint64_t end = op.data_included.range_end(); + ceph_assert(op.data.length() == (end - start)); + + m->t.write( + coll, + tobj, + start, + op.data.length(), + op.data); + } else { + ceph_assert(op.data.length() == 0); + } + + if (get_parent()->pg_is_remote_backfilling()) { + get_parent()->pg_add_local_num_bytes(op.data.length()); + get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count()); + dout(10) << __func__ << " " << op.soid + << " add new actual data by " << op.data.length() + << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count() + << dendl; + } + + if (op.before_progress.first) { + ceph_assert(op.attrset.count(string("_"))); + m->t.setattrs( + coll, + tobj, + op.attrset); + } + + if (op.after_progress.data_complete && !oneshot) { + dout(10) << __func__ << ": Removing oid " + << tobj.hobj << " from the temp collection" << dendl; + clear_temp_obj(tobj.hobj); + m->t.remove(coll, ghobject_t( + op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + m->t.collection_move_rename( + coll, tobj, + coll, ghobject_t( + op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + } + if (op.after_progress.data_complete) { + if ((get_parent()->pgb_is_primary())) { + ceph_assert(recovery_ops.count(op.soid)); + ceph_assert(recovery_ops[op.soid].obc); + if (get_parent()->pg_is_repair()) + get_parent()->inc_osd_stat_repaired(); + get_parent()->on_local_recover( + op.soid, + op.recovery_info, + recovery_ops[op.soid].obc, + false, + &m->t); + } else { + // If primary told us this is a repair, bump osd_stat_t::num_objects_repaired + if (is_repair) + get_parent()->inc_osd_stat_repaired(); + get_parent()->on_local_recover( + op.soid, + op.recovery_info, + ObjectContextRef(), + false, + &m->t); + if (get_parent()->pg_is_remote_backfilling()) 
{ + struct stat st; + int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard), &st); + if (r == 0) { + get_parent()->pg_sub_local_num_bytes(st.st_size); + // XXX: This can be way overestimated for small objects + get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count()); + dout(10) << __func__ << " " << op.soid + << " sub actual data by " << st.st_size + << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count() + << dendl; + } + } + } + } + m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp()); + m->push_replies[get_parent()->primary_shard()].back().soid = op.soid; +} + +void ECBackend::handle_recovery_push_reply( + const PushReplyOp &op, + pg_shard_t from, + RecoveryMessages *m) +{ + if (!recovery_ops.count(op.soid)) + return; + RecoveryOp &rop = recovery_ops[op.soid]; + ceph_assert(rop.waiting_on_pushes.count(from)); + rop.waiting_on_pushes.erase(from); + continue_recovery_op(rop, m); +} + +void ECBackend::handle_recovery_read_complete( + const hobject_t &hoid, + boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read, + boost::optional<map<string, bufferlist> > attrs, + RecoveryMessages *m) +{ + dout(10) << __func__ << ": returned " << hoid << " " + << "(" << to_read.get<0>() + << ", " << to_read.get<1>() + << ", " << to_read.get<2>() + << ")" + << dendl; + ceph_assert(recovery_ops.count(hoid)); + RecoveryOp &op = recovery_ops[hoid]; + ceph_assert(op.returned_data.empty()); + map<int, bufferlist*> target; + for (set<shard_id_t>::iterator i = op.missing_on_shards.begin(); + i != op.missing_on_shards.end(); + ++i) { + target[*i] = &(op.returned_data[*i]); + } + map<int, bufferlist> from; + for(map<pg_shard_t, bufferlist>::iterator i = to_read.get<2>().begin(); + i != to_read.get<2>().end(); + ++i) { + from[i->first.shard].claim(i->second); + } + dout(10) << __func__ << ": " << from << dendl; + int r; + r = ECUtil::decode(sinfo, ec_impl, from, target); + ceph_assert(r == 0); + if (attrs) { + op.xattrs.swap(*attrs); + + if (!op.obc) { + // attrs only reference the origin bufferlist (decode from + // ECSubReadReply message) whose size is much greater than attrs + // in recovery. If obc cache it (get_obc maybe cache the attr), + // this causes the whole origin bufferlist would not be free + // until obc is evicted from obc cache. So rebuild the + // bufferlist before cache it. 
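The comment closing the block above explains why the attr values are rebuilt: each attr is a small slice that still references the large ECSubReadReply buffer, so caching it in the obc would pin the whole message. A hedged plain-C++ illustration of that pinning effect and of the copy that releases it (not the bufferlist API itself):

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

// A slice that shares ownership of a large backing buffer keeps the whole
// buffer alive for as long as the slice is cached.
struct Slice {
  std::shared_ptr<std::vector<char>> backing;  // e.g. the decoded reply message
  const char* data = nullptr;
  size_t len = 0;
};

// Copying the bytes into their own allocation (what rebuild() achieves for the
// cached attrs) lets the large backing buffer be freed.
static std::string detach(const Slice& s)
{
  return std::string(s.data, s.len);
}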
+ for (map<string, bufferlist>::iterator it = op.xattrs.begin(); + it != op.xattrs.end(); + ++it) { + it->second.rebuild(); + } + // Need to remove ECUtil::get_hinfo_key() since it should not leak out + // of the backend (see bug #12983) + map<string, bufferlist> sanitized_attrs(op.xattrs); + sanitized_attrs.erase(ECUtil::get_hinfo_key()); + op.obc = get_parent()->get_obc(hoid, sanitized_attrs); + ceph_assert(op.obc); + op.recovery_info.size = op.obc->obs.oi.size; + op.recovery_info.oi = op.obc->obs.oi; + } + + ECUtil::HashInfo hinfo(ec_impl->get_chunk_count()); + if (op.obc->obs.oi.size > 0) { + ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key())); + auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin(); + decode(hinfo, bp); + } + op.hinfo = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo); + } + ceph_assert(op.xattrs.size()); + ceph_assert(op.obc); + continue_recovery_op(op, m); +} + +struct SendPushReplies : public Context { + PGBackend::Listener *l; + epoch_t epoch; + map<int, MOSDPGPushReply*> replies; + SendPushReplies( + PGBackend::Listener *l, + epoch_t epoch, + map<int, MOSDPGPushReply*> &in) : l(l), epoch(epoch) { + replies.swap(in); + } + void finish(int) override { + for (map<int, MOSDPGPushReply*>::iterator i = replies.begin(); + i != replies.end(); + ++i) { + l->send_message_osd_cluster(i->first, i->second, epoch); + } + replies.clear(); + } + ~SendPushReplies() override { + for (map<int, MOSDPGPushReply*>::iterator i = replies.begin(); + i != replies.end(); + ++i) { + i->second->put(); + } + replies.clear(); + } +}; + +void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority) +{ + for (map<pg_shard_t, vector<PushOp> >::iterator i = m.pushes.begin(); + i != m.pushes.end(); + m.pushes.erase(i++)) { + MOSDPGPush *msg = new MOSDPGPush(); + msg->set_priority(priority); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->from = get_parent()->whoami_shard(); + msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard); + msg->pushes.swap(i->second); + msg->compute_cost(cct); + msg->is_repair = get_parent()->pg_is_repair(); + get_parent()->send_message( + i->first.osd, + msg); + } + map<int, MOSDPGPushReply*> replies; + for (map<pg_shard_t, vector<PushReplyOp> >::iterator i = + m.push_replies.begin(); + i != m.push_replies.end(); + m.push_replies.erase(i++)) { + MOSDPGPushReply *msg = new MOSDPGPushReply(); + msg->set_priority(priority); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->from = get_parent()->whoami_shard(); + msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard); + msg->replies.swap(i->second); + msg->compute_cost(cct); + replies.insert(make_pair(i->first.osd, msg)); + } + + if (!replies.empty()) { + (m.t).register_on_complete( + get_parent()->bless_context( + new SendPushReplies( + get_parent(), + get_osdmap_epoch(), + replies))); + get_parent()->queue_transaction(std::move(m.t)); + } + + if (m.reads.empty()) + return; + start_read_op( + priority, + m.want_to_read, + m.reads, + OpRequestRef(), + false, true); +} + +void ECBackend::continue_recovery_op( + RecoveryOp &op, + RecoveryMessages *m) +{ + dout(10) << __func__ << ": continuing " << op << dendl; + while (1) { + switch (op.state) { + case RecoveryOp::IDLE: { + // start read + op.state = RecoveryOp::READING; + ceph_assert(!op.recovery_progress.data_complete); + set<int> want(op.missing_on_shards.begin(), op.missing_on_shards.end()); + 
uint64_t from = op.recovery_progress.data_recovered_to; + uint64_t amount = get_recovery_chunk_size(); + + if (op.recovery_progress.first && op.obc) { + /* We've got the attrs and the hinfo, might as well use them */ + op.hinfo = get_hash_info(op.hoid); + ceph_assert(op.hinfo); + op.xattrs = op.obc->attr_cache; + encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]); + } + + map<pg_shard_t, vector<pair<int, int>>> to_read; + int r = get_min_avail_to_read_shards( + op.hoid, want, true, false, &to_read); + if (r != 0) { + // we must have lost a recovery source + ceph_assert(!op.recovery_progress.first); + dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid + << dendl; + get_parent()->cancel_pull(op.hoid); + recovery_ops.erase(op.hoid); + return; + } + m->read( + this, + op.hoid, + op.recovery_progress.data_recovered_to, + amount, + std::move(want), + to_read, + op.recovery_progress.first && !op.obc); + op.extent_requested = make_pair( + from, + amount); + dout(10) << __func__ << ": IDLE return " << op << dendl; + return; + } + case RecoveryOp::READING: { + // read completed, start write + ceph_assert(op.xattrs.size()); + ceph_assert(op.returned_data.size()); + op.state = RecoveryOp::WRITING; + ObjectRecoveryProgress after_progress = op.recovery_progress; + after_progress.data_recovered_to += op.extent_requested.second; + after_progress.first = false; + if (after_progress.data_recovered_to >= op.obc->obs.oi.size) { + after_progress.data_recovered_to = + sinfo.logical_to_next_stripe_offset( + op.obc->obs.oi.size); + after_progress.data_complete = true; + } + for (set<pg_shard_t>::iterator mi = op.missing_on.begin(); + mi != op.missing_on.end(); + ++mi) { + ceph_assert(op.returned_data.count(mi->shard)); + m->pushes[*mi].push_back(PushOp()); + PushOp &pop = m->pushes[*mi].back(); + pop.soid = op.hoid; + pop.version = op.v; + pop.data = op.returned_data[mi->shard]; + dout(10) << __func__ << ": before_progress=" << op.recovery_progress + << ", after_progress=" << after_progress + << ", pop.data.length()=" << pop.data.length() + << ", size=" << op.obc->obs.oi.size << dendl; + ceph_assert( + pop.data.length() == + sinfo.aligned_logical_offset_to_chunk_offset( + after_progress.data_recovered_to - + op.recovery_progress.data_recovered_to) + ); + if (pop.data.length()) + pop.data_included.insert( + sinfo.aligned_logical_offset_to_chunk_offset( + op.recovery_progress.data_recovered_to), + pop.data.length() + ); + if (op.recovery_progress.first) { + pop.attrset = op.xattrs; + } + pop.recovery_info = op.recovery_info; + pop.before_progress = op.recovery_progress; + pop.after_progress = after_progress; + if (*mi != get_parent()->primary_shard()) + get_parent()->begin_peer_recover( + *mi, + op.hoid); + } + op.returned_data.clear(); + op.waiting_on_pushes = op.missing_on; + op.recovery_progress = after_progress; + dout(10) << __func__ << ": READING return " << op << dendl; + return; + } + case RecoveryOp::WRITING: { + if (op.waiting_on_pushes.empty()) { + if (op.recovery_progress.data_complete) { + op.state = RecoveryOp::COMPLETE; + for (set<pg_shard_t>::iterator i = op.missing_on.begin(); + i != op.missing_on.end(); + ++i) { + if (*i != get_parent()->primary_shard()) { + dout(10) << __func__ << ": on_peer_recover on " << *i + << ", obj " << op.hoid << dendl; + get_parent()->on_peer_recover( + *i, + op.hoid, + op.recovery_info); + } + } + object_stat_sum_t stat; + stat.num_bytes_recovered = op.recovery_info.size; + stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ? 
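Recovery progress above is clamped with sinfo.logical_to_next_stripe_offset(), which (assuming the usual semantics of that helper) rounds the object size up to the next full stripe so the final push covers whole chunks. A hedged one-liner of that round-up, assuming stripe_width is the full stripe size:

#include <cstdint>

// Round a logical offset up to the next multiple of the stripe width.
static uint64_t round_up_to_stripe(uint64_t off, uint64_t stripe_width)
{
  return ((off + stripe_width - 1) / stripe_width) * stripe_width;
}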
+ stat.num_objects_recovered = 1; + if (get_parent()->pg_is_repair()) + stat.num_objects_repaired = 1; + get_parent()->on_global_recover(op.hoid, stat, false); + dout(10) << __func__ << ": WRITING return " << op << dendl; + recovery_ops.erase(op.hoid); + return; + } else { + op.state = RecoveryOp::IDLE; + dout(10) << __func__ << ": WRITING continue " << op << dendl; + continue; + } + } + return; + } + // should never be called once complete + case RecoveryOp::COMPLETE: + default: { + ceph_abort(); + }; + } + } +} + +void ECBackend::run_recovery_op( + RecoveryHandle *_h, + int priority) +{ + ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h); + RecoveryMessages m; + for (list<RecoveryOp>::iterator i = h->ops.begin(); + i != h->ops.end(); + ++i) { + dout(10) << __func__ << ": starting " << *i << dendl; + ceph_assert(!recovery_ops.count(i->hoid)); + RecoveryOp &op = recovery_ops.insert(make_pair(i->hoid, *i)).first->second; + continue_recovery_op(op, &m); + } + + dispatch_recovery_messages(m, priority); + send_recovery_deletes(priority, h->deletes); + delete _h; +} + +int ECBackend::recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *_h) +{ + ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h); + h->ops.push_back(RecoveryOp()); + h->ops.back().v = v; + h->ops.back().hoid = hoid; + h->ops.back().obc = obc; + h->ops.back().recovery_info.soid = hoid; + h->ops.back().recovery_info.version = v; + if (obc) { + h->ops.back().recovery_info.size = obc->obs.oi.size; + h->ops.back().recovery_info.oi = obc->obs.oi; + } + if (hoid.is_snap()) { + if (obc) { + ceph_assert(obc->ssc); + h->ops.back().recovery_info.ss = obc->ssc->snapset; + } else if (head) { + ceph_assert(head->ssc); + h->ops.back().recovery_info.ss = head->ssc->snapset; + } else { + ceph_abort_msg("neither obc nor head set for a snap object"); + } + } + h->ops.back().recovery_progress.omap_complete = true; + for (set<pg_shard_t>::const_iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + dout(10) << "checking " << *i << dendl; + if (get_parent()->get_shard_missing(*i).is_missing(hoid)) { + h->ops.back().missing_on.insert(*i); + h->ops.back().missing_on_shards.insert(i->shard); + } + } + dout(10) << __func__ << ": built op " << h->ops.back() << dendl; + return 0; +} + +bool ECBackend::can_handle_while_inactive( + OpRequestRef _op) +{ + return false; +} + +bool ECBackend::_handle_message( + OpRequestRef _op) +{ + dout(10) << __func__ << ": " << *_op->get_req() << dendl; + int priority = _op->get_req()->get_priority(); + switch (_op->get_req()->get_type()) { + case MSG_OSD_EC_WRITE: { + // NOTE: this is non-const because handle_sub_write modifies the embedded + // ObjectStore::Transaction in place (and then std::move's it). It does + // not conflict with ECSubWrite's operator<<. 
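The NOTE above is why _handle_message fetches a non-const pointer for MSG_OSD_EC_WRITE: the handler needs to move the embedded transaction out of the message rather than copy it. A hedged stand-in for that move-out pattern with illustrative types (not the Ceph message classes):

#include <utility>
#include <vector>

struct Txn      { std::vector<char> ops; };  // stands in for ObjectStore::Transaction
struct SubWrite { Txn t; };                  // stands in for ECSubWrite

// Taking the message by non-const reference lets the handler steal the payload.
static Txn take_transaction(SubWrite& w)
{
  return std::move(w.t);  // w.t is left empty; no copy of the op buffer
}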
+ MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>( + _op->get_nonconst_req()); + parent->maybe_preempt_replica_scrub(op->op.soid); + handle_sub_write(op->op.from, _op, op->op, _op->pg_trace); + return true; + } + case MSG_OSD_EC_WRITE_REPLY: { + const MOSDECSubOpWriteReply *op = static_cast<const MOSDECSubOpWriteReply*>( + _op->get_req()); + handle_sub_write_reply(op->op.from, op->op, _op->pg_trace); + return true; + } + case MSG_OSD_EC_READ: { + const MOSDECSubOpRead *op = static_cast<const MOSDECSubOpRead*>(_op->get_req()); + MOSDECSubOpReadReply *reply = new MOSDECSubOpReadReply; + reply->pgid = get_parent()->primary_spg_t(); + reply->map_epoch = get_osdmap_epoch(); + reply->min_epoch = get_parent()->get_interval_start_epoch(); + handle_sub_read(op->op.from, op->op, &(reply->op), _op->pg_trace); + reply->trace = _op->pg_trace; + get_parent()->send_message_osd_cluster( + op->op.from.osd, reply, get_osdmap_epoch()); + return true; + } + case MSG_OSD_EC_READ_REPLY: { + // NOTE: this is non-const because handle_sub_read_reply steals resulting + // buffers. It does not conflict with ECSubReadReply operator<<. + MOSDECSubOpReadReply *op = static_cast<MOSDECSubOpReadReply*>( + _op->get_nonconst_req()); + RecoveryMessages rm; + handle_sub_read_reply(op->op.from, op->op, &rm, _op->pg_trace); + dispatch_recovery_messages(rm, priority); + return true; + } + case MSG_OSD_PG_PUSH: { + const MOSDPGPush *op = static_cast<const MOSDPGPush *>(_op->get_req()); + RecoveryMessages rm; + for (vector<PushOp>::const_iterator i = op->pushes.begin(); + i != op->pushes.end(); + ++i) { + handle_recovery_push(*i, &rm, op->is_repair); + } + dispatch_recovery_messages(rm, priority); + return true; + } + case MSG_OSD_PG_PUSH_REPLY: { + const MOSDPGPushReply *op = static_cast<const MOSDPGPushReply *>( + _op->get_req()); + RecoveryMessages rm; + for (vector<PushReplyOp>::const_iterator i = op->replies.begin(); + i != op->replies.end(); + ++i) { + handle_recovery_push_reply(*i, op->from, &rm); + } + dispatch_recovery_messages(rm, priority); + return true; + } + default: + return false; + } + return false; +} + +struct SubWriteCommitted : public Context { + ECBackend *pg; + OpRequestRef msg; + ceph_tid_t tid; + eversion_t version; + eversion_t last_complete; + const ZTracer::Trace trace; + SubWriteCommitted( + ECBackend *pg, + OpRequestRef msg, + ceph_tid_t tid, + eversion_t version, + eversion_t last_complete, + const ZTracer::Trace &trace) + : pg(pg), msg(msg), tid(tid), + version(version), last_complete(last_complete), trace(trace) {} + void finish(int) override { + if (msg) + msg->mark_event("sub_op_committed"); + pg->sub_write_committed(tid, version, last_complete, trace); + } +}; +void ECBackend::sub_write_committed( + ceph_tid_t tid, eversion_t version, eversion_t last_complete, + const ZTracer::Trace &trace) { + if (get_parent()->pgb_is_primary()) { + ECSubWriteReply reply; + reply.tid = tid; + reply.last_complete = last_complete; + reply.committed = true; + reply.applied = true; + reply.from = get_parent()->whoami_shard(); + handle_sub_write_reply( + get_parent()->whoami_shard(), + reply, trace); + } else { + get_parent()->update_last_complete_ondisk(last_complete); + MOSDECSubOpWriteReply *r = new MOSDECSubOpWriteReply; + r->pgid = get_parent()->primary_spg_t(); + r->map_epoch = get_osdmap_epoch(); + r->min_epoch = get_parent()->get_interval_start_epoch(); + r->op.tid = tid; + r->op.last_complete = last_complete; + r->op.committed = true; + r->op.applied = true; + r->op.from = 
get_parent()->whoami_shard(); + r->set_priority(CEPH_MSG_PRIO_HIGH); + r->trace = trace; + r->trace.event("sending sub op commit"); + get_parent()->send_message_osd_cluster( + get_parent()->primary_shard().osd, r, get_osdmap_epoch()); + } +} + +void ECBackend::handle_sub_write( + pg_shard_t from, + OpRequestRef msg, + ECSubWrite &op, + const ZTracer::Trace &trace) +{ + if (msg) + msg->mark_event("sub_op_started"); + trace.event("handle_sub_write"); + if (!get_parent()->pgb_is_primary()) + get_parent()->update_stats(op.stats); + ObjectStore::Transaction localt; + if (!op.temp_added.empty()) { + add_temp_objs(op.temp_added); + } + if (op.backfill_or_async_recovery) { + for (set<hobject_t>::iterator i = op.temp_removed.begin(); + i != op.temp_removed.end(); + ++i) { + dout(10) << __func__ << ": removing object " << *i + << " since we won't get the transaction" << dendl; + localt.remove( + coll, + ghobject_t( + *i, + ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard)); + } + } + clear_temp_objs(op.temp_removed); + dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl; + // flag set to true during async recovery + bool async = false; + pg_missing_tracker_t pmissing = get_parent()->get_local_missing(); + if (pmissing.is_missing(op.soid)) { + async = true; + dout(30) << __func__ << " is_missing " << pmissing.is_missing(op.soid) << dendl; + for (auto &&e: op.log_entries) { + dout(30) << " add_next_event entry " << e << dendl; + get_parent()->add_local_next_event(e); + dout(30) << " entry is_delete " << e.is_delete() << dendl; + } + } + get_parent()->log_operation( + op.log_entries, + op.updated_hit_set_history, + op.trim_to, + op.roll_forward_to, + !op.backfill_or_async_recovery, + localt, + async); + + if (!get_parent()->pg_is_undersized() && + (unsigned)get_parent()->whoami_shard().shard >= + ec_impl->get_data_chunk_count()) + op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + + localt.register_on_commit( + get_parent()->bless_context( + new SubWriteCommitted( + this, msg, op.tid, + op.at_version, + get_parent()->get_info().last_complete, trace))); + vector<ObjectStore::Transaction> tls; + tls.reserve(2); + tls.push_back(std::move(op.t)); + tls.push_back(std::move(localt)); + get_parent()->queue_transactions(tls, msg); + dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl; + if (op.at_version != eversion_t()) { + // dummy rollforward transaction doesn't get at_version (and doesn't advance it) + get_parent()->op_applied(op.at_version); + } +} + +void ECBackend::handle_sub_read( + pg_shard_t from, + const ECSubRead &op, + ECSubReadReply *reply, + const ZTracer::Trace &trace) +{ + trace.event("handle sub read"); + shard_id_t shard = get_parent()->whoami_shard().shard; + for(auto i = op.to_read.begin(); + i != op.to_read.end(); + ++i) { + int r = 0; + for (auto j = i->second.begin(); j != i->second.end(); ++j) { + bufferlist bl; + if ((op.subchunks.find(i->first)->second.size() == 1) && + (op.subchunks.find(i->first)->second.front().second == + ec_impl->get_sub_chunk_count())) { + dout(25) << __func__ << " case1: reading the complete chunk/shard." << dendl; + r = store->read( + ch, + ghobject_t(i->first, ghobject_t::NO_GEN, shard), + j->get<0>(), + j->get<1>(), + bl, j->get<2>()); // Allow EIO return + } else { + dout(25) << __func__ << " case2: going to do fragmented read." 
<< dendl; + int subchunk_size = + sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count(); + bool error = false; + for (int m = 0; m < (int)j->get<1>() && !error; + m += sinfo.get_chunk_size()) { + for (auto &&k:op.subchunks.find(i->first)->second) { + bufferlist bl0; + r = store->read( + ch, + ghobject_t(i->first, ghobject_t::NO_GEN, shard), + j->get<0>() + m + (k.first)*subchunk_size, + (k.second)*subchunk_size, + bl0, j->get<2>()); + if (r < 0) { + error = true; + break; + } + bl.claim_append(bl0); + } + } + } + + if (r < 0) { + // if we are doing fast reads, it's possible for one of the shard + // reads to cross paths with another update and get a (harmless) + // ENOENT. Suppress the message to the cluster log in that case. + if (r == -ENOENT && get_parent()->get_pool().fast_read) { + dout(5) << __func__ << ": Error " << r + << " reading " << i->first << ", fast read, probably ok" + << dendl; + } else { + get_parent()->clog_error() << "Error " << r + << " reading object " + << i->first; + dout(5) << __func__ << ": Error " << r + << " reading " << i->first << dendl; + } + goto error; + } else { + dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl; + reply->buffers_read[i->first].push_back( + make_pair( + j->get<0>(), + bl) + ); + } + + if (!get_parent()->get_pool().allows_ecoverwrites()) { + // This shows that we still need deep scrub because large enough files + // are read in sections, so the digest check here won't be done here. + // Do NOT check osd_read_eio_on_bad_digest here. We need to report + // the state of our chunk in case other chunks could substitute. + ECUtil::HashInfoRef hinfo; + hinfo = get_hash_info(i->first); + if (!hinfo) { + r = -EIO; + get_parent()->clog_error() << "Corruption detected: object " + << i->first + << " is missing hash_info"; + dout(5) << __func__ << ": No hinfo for " << i->first << dendl; + goto error; + } + ceph_assert(hinfo->has_chunk_hash()); + if ((bl.length() == hinfo->get_total_chunk_size()) && + (j->get<0>() == 0)) { + dout(20) << __func__ << ": Checking hash of " << i->first << dendl; + bufferhash h(-1); + h << bl; + if (h.digest() != hinfo->get_chunk_hash(shard)) { + get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec; + dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl; + r = -EIO; + goto error; + } + } + } + } + continue; +error: + // Do NOT check osd_read_eio_on_bad_digest here. We need to report + // the state of our chunk in case other chunks could substitute. + reply->buffers_read.erase(i->first); + reply->errors[i->first] = r; + } + for (set<hobject_t>::iterator i = op.attrs_to_read.begin(); + i != op.attrs_to_read.end(); + ++i) { + dout(10) << __func__ << ": fulfilling attr request on " + << *i << dendl; + if (reply->errors.count(*i)) + continue; + int r = store->getattrs( + ch, + ghobject_t( + *i, ghobject_t::NO_GEN, shard), + reply->attrs_read[*i]); + if (r < 0) { + // If we read error, we should not return the attrs too. 
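handle_sub_read above verifies a whole chunk by hashing it and comparing against the per-shard digest stored in the object's hash_info, returning -EIO on mismatch so other shards can substitute. A hedged sketch of that check, with FNV-1a standing in for the crc32c-based bufferhash used by the real code:

#include <cerrno>
#include <cstddef>
#include <cstdint>

static uint32_t fnv1a(const char* p, size_t n)
{
  uint32_t h = 2166136261u;
  for (size_t i = 0; i < n; ++i) {
    h ^= static_cast<uint8_t>(p[i]);
    h *= 16777619u;
  }
  return h;
}

// 0 if the chunk matches its recorded digest, -EIO otherwise.
static int verify_chunk(const char* chunk, size_t len, uint32_t expected)
{
  return fnv1a(chunk, len) == expected ? 0 : -EIO;
}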
+ reply->attrs_read.erase(*i); + reply->buffers_read.erase(*i); + reply->errors[*i] = r; + } + } + reply->from = get_parent()->whoami_shard(); + reply->tid = op.tid; +} + +void ECBackend::handle_sub_write_reply( + pg_shard_t from, + const ECSubWriteReply &op, + const ZTracer::Trace &trace) +{ + map<ceph_tid_t, Op>::iterator i = tid_to_op_map.find(op.tid); + ceph_assert(i != tid_to_op_map.end()); + if (op.committed) { + trace.event("sub write committed"); + ceph_assert(i->second.pending_commit.count(from)); + i->second.pending_commit.erase(from); + if (from != get_parent()->whoami_shard()) { + get_parent()->update_peer_last_complete_ondisk(from, op.last_complete); + } + } + if (op.applied) { + trace.event("sub write applied"); + ceph_assert(i->second.pending_apply.count(from)); + i->second.pending_apply.erase(from); + } + + if (i->second.pending_commit.empty() && + i->second.on_all_commit && + // also wait for apply, to preserve ordering with luminous peers. + i->second.pending_apply.empty()) { + dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl; + i->second.on_all_commit->complete(0); + i->second.on_all_commit = 0; + i->second.trace.event("ec write all committed"); + } + check_ops(); +} + +void ECBackend::handle_sub_read_reply( + pg_shard_t from, + ECSubReadReply &op, + RecoveryMessages *m, + const ZTracer::Trace &trace) +{ + trace.event("ec sub read reply"); + dout(10) << __func__ << ": reply " << op << dendl; + map<ceph_tid_t, ReadOp>::iterator iter = tid_to_read_map.find(op.tid); + if (iter == tid_to_read_map.end()) { + //canceled + dout(20) << __func__ << ": dropped " << op << dendl; + return; + } + ReadOp &rop = iter->second; + for (auto i = op.buffers_read.begin(); + i != op.buffers_read.end(); + ++i) { + ceph_assert(!op.errors.count(i->first)); // If attribute error we better not have sent a buffer + if (!rop.to_read.count(i->first)) { + // We canceled this read! @see filter_read_op + dout(20) << __func__ << " to_read skipping" << dendl; + continue; + } + list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator req_iter = + rop.to_read.find(i->first)->second.to_read.begin(); + list< + boost::tuple< + uint64_t, uint64_t, map<pg_shard_t, bufferlist> > >::iterator riter = + rop.complete[i->first].returned.begin(); + for (list<pair<uint64_t, bufferlist> >::iterator j = i->second.begin(); + j != i->second.end(); + ++j, ++req_iter, ++riter) { + ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end()); + ceph_assert(riter != rop.complete[i->first].returned.end()); + pair<uint64_t, uint64_t> adjusted = + sinfo.aligned_offset_len_to_chunk( + make_pair(req_iter->get<0>(), req_iter->get<1>())); + ceph_assert(adjusted.first == j->first); + riter->get<2>()[from].claim(j->second); + } + } + for (auto i = op.attrs_read.begin(); + i != op.attrs_read.end(); + ++i) { + ceph_assert(!op.errors.count(i->first)); // if read error better not have sent an attribute + if (!rop.to_read.count(i->first)) { + // We canceled this read! 
@see filter_read_op + dout(20) << __func__ << " to_read skipping" << dendl; + continue; + } + rop.complete[i->first].attrs = map<string, bufferlist>(); + (*(rop.complete[i->first].attrs)).swap(i->second); + } + for (auto i = op.errors.begin(); + i != op.errors.end(); + ++i) { + rop.complete[i->first].errors.insert( + make_pair( + from, + i->second)); + dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl; + } + + map<pg_shard_t, set<ceph_tid_t> >::iterator siter = + shard_to_read_map.find(from); + ceph_assert(siter != shard_to_read_map.end()); + ceph_assert(siter->second.count(op.tid)); + siter->second.erase(op.tid); + + ceph_assert(rop.in_progress.count(from)); + rop.in_progress.erase(from); + unsigned is_complete = 0; + // For redundant reads check for completion as each shard comes in, + // or in a non-recovery read check for completion once all the shards read. + if (rop.do_redundant_reads || rop.in_progress.empty()) { + for (map<hobject_t, read_result_t>::const_iterator iter = + rop.complete.begin(); + iter != rop.complete.end(); + ++iter) { + set<int> have; + for (map<pg_shard_t, bufferlist>::const_iterator j = + iter->second.returned.front().get<2>().begin(); + j != iter->second.returned.front().get<2>().end(); + ++j) { + have.insert(j->first.shard); + dout(20) << __func__ << " have shard=" << j->first.shard << dendl; + } + map<int, vector<pair<int, int>>> dummy_minimum; + int err; + if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) { + dout(20) << __func__ << " minimum_to_decode failed" << dendl; + if (rop.in_progress.empty()) { + // If we don't have enough copies, try other pg_shard_ts if available. + // During recovery there may be multiple osds with copies of the same shard, + // so getting EIO from one may result in multiple passes through this code path. + if (!rop.do_redundant_reads) { + int r = send_all_remaining_reads(iter->first, rop); + if (r == 0) { + // We added to in_progress and not incrementing is_complete + continue; + } + // Couldn't read any additional shards so handle as completed with errors + } + // We don't want to confuse clients / RBD with objectstore error + // values in particular ENOENT. We may have different error returns + // from different shards, so we'll return minimum_to_decode() error + // (usually EIO) to reader. It is likely an error here is due to a + // damaged pg. 
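+ // Record the decode error for this object and count it as complete so the
+ // overall ReadOp can finish and surface the failure to the caller.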
+ rop.complete[iter->first].r = err; + ++is_complete; + } + } else { + ceph_assert(rop.complete[iter->first].r == 0); + if (!rop.complete[iter->first].errors.empty()) { + if (cct->_conf->osd_read_ec_check_for_errors) { + dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl; + err = rop.complete[iter->first].errors.begin()->second; + rop.complete[iter->first].r = err; + } else { + get_parent()->clog_warn() << "Error(s) ignored for " + << iter->first << " enough copies available"; + dout(10) << __func__ << " Error(s) ignored for " << iter->first + << " enough copies available" << dendl; + rop.complete[iter->first].errors.clear(); + } + } + ++is_complete; + } + } + } + if (rop.in_progress.empty() || is_complete == rop.complete.size()) { + dout(20) << __func__ << " Complete: " << rop << dendl; + rop.trace.event("ec read complete"); + complete_read_op(rop, m); + } else { + dout(10) << __func__ << " readop not complete: " << rop << dendl; + } +} + +void ECBackend::complete_read_op(ReadOp &rop, RecoveryMessages *m) +{ + map<hobject_t, read_request_t>::iterator reqiter = + rop.to_read.begin(); + map<hobject_t, read_result_t>::iterator resiter = + rop.complete.begin(); + ceph_assert(rop.to_read.size() == rop.complete.size()); + for (; reqiter != rop.to_read.end(); ++reqiter, ++resiter) { + if (reqiter->second.cb) { + pair<RecoveryMessages *, read_result_t &> arg( + m, resiter->second); + reqiter->second.cb->complete(arg); + reqiter->second.cb = nullptr; + } + } + // if the read op is over. clean all the data of this tid. + for (set<pg_shard_t>::iterator iter = rop.in_progress.begin(); + iter != rop.in_progress.end(); + iter++) { + shard_to_read_map[*iter].erase(rop.tid); + } + rop.in_progress.clear(); + tid_to_read_map.erase(rop.tid); +} + +struct FinishReadOp : public GenContext<ThreadPool::TPHandle&> { + ECBackend *ec; + ceph_tid_t tid; + FinishReadOp(ECBackend *ec, ceph_tid_t tid) : ec(ec), tid(tid) {} + void finish(ThreadPool::TPHandle &handle) override { + auto ropiter = ec->tid_to_read_map.find(tid); + ceph_assert(ropiter != ec->tid_to_read_map.end()); + int priority = ropiter->second.priority; + RecoveryMessages rm; + ec->complete_read_op(ropiter->second, &rm); + ec->dispatch_recovery_messages(rm, priority); + } +}; + +void ECBackend::filter_read_op( + const OSDMapRef& osdmap, + ReadOp &op) +{ + set<hobject_t> to_cancel; + for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin(); + i != op.source_to_obj.end(); + ++i) { + if (osdmap->is_down(i->first.osd)) { + to_cancel.insert(i->second.begin(), i->second.end()); + op.in_progress.erase(i->first); + continue; + } + } + + if (to_cancel.empty()) + return; + + for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin(); + i != op.source_to_obj.end(); + ) { + for (set<hobject_t>::iterator j = i->second.begin(); + j != i->second.end(); + ) { + if (to_cancel.count(*j)) + i->second.erase(j++); + else + ++j; + } + if (i->second.empty()) { + op.source_to_obj.erase(i++); + } else { + ceph_assert(!osdmap->is_down(i->first.osd)); + ++i; + } + } + + for (set<hobject_t>::iterator i = to_cancel.begin(); + i != to_cancel.end(); + ++i) { + get_parent()->cancel_pull(*i); + + ceph_assert(op.to_read.count(*i)); + read_request_t &req = op.to_read.find(*i)->second; + dout(10) << __func__ << ": canceling " << req + << " for obj " << *i << dendl; + ceph_assert(req.cb); + delete req.cb; + req.cb = nullptr; + + op.to_read.erase(*i); + op.complete.erase(*i); + recovery_ops.erase(*i); + } + + if 
(op.in_progress.empty()) { + get_parent()->schedule_recovery_work( + get_parent()->bless_unlocked_gencontext( + new FinishReadOp(this, op.tid))); + } +} + +void ECBackend::check_recovery_sources(const OSDMapRef& osdmap) +{ + set<ceph_tid_t> tids_to_filter; + for (map<pg_shard_t, set<ceph_tid_t> >::iterator + i = shard_to_read_map.begin(); + i != shard_to_read_map.end(); + ) { + if (osdmap->is_down(i->first.osd)) { + tids_to_filter.insert(i->second.begin(), i->second.end()); + shard_to_read_map.erase(i++); + } else { + ++i; + } + } + for (set<ceph_tid_t>::iterator i = tids_to_filter.begin(); + i != tids_to_filter.end(); + ++i) { + map<ceph_tid_t, ReadOp>::iterator j = tid_to_read_map.find(*i); + ceph_assert(j != tid_to_read_map.end()); + filter_read_op(osdmap, j->second); + } +} + +void ECBackend::on_change() +{ + dout(10) << __func__ << dendl; + + completed_to = eversion_t(); + committed_to = eversion_t(); + pipeline_state.clear(); + waiting_reads.clear(); + waiting_state.clear(); + waiting_commit.clear(); + for (auto &&op: tid_to_op_map) { + cache.release_write_pin(op.second.pin); + } + tid_to_op_map.clear(); + + for (map<ceph_tid_t, ReadOp>::iterator i = tid_to_read_map.begin(); + i != tid_to_read_map.end(); + ++i) { + dout(10) << __func__ << ": cancelling " << i->second << dendl; + for (map<hobject_t, read_request_t>::iterator j = + i->second.to_read.begin(); + j != i->second.to_read.end(); + ++j) { + delete j->second.cb; + j->second.cb = nullptr; + } + } + tid_to_read_map.clear(); + in_progress_client_reads.clear(); + shard_to_read_map.clear(); + clear_recovery_state(); +} + +void ECBackend::clear_recovery_state() +{ + recovery_ops.clear(); +} + +void ECBackend::dump_recovery_info(Formatter *f) const +{ + f->open_array_section("recovery_ops"); + for (map<hobject_t, RecoveryOp>::const_iterator i = recovery_ops.begin(); + i != recovery_ops.end(); + ++i) { + f->open_object_section("op"); + i->second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("read_ops"); + for (map<ceph_tid_t, ReadOp>::const_iterator i = tid_to_read_map.begin(); + i != tid_to_read_map.end(); + ++i) { + f->open_object_section("read_op"); + i->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void ECBackend::submit_transaction( + const hobject_t &hoid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&t, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const vector<pg_log_entry_t> &log_entries, + boost::optional<pg_hit_set_history_t> &hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef client_op + ) +{ + ceph_assert(!tid_to_op_map.count(tid)); + Op *op = &(tid_to_op_map[tid]); + op->hoid = hoid; + op->delta_stats = delta_stats; + op->version = at_version; + op->trim_to = trim_to; + op->roll_forward_to = std::max(roll_forward_to, committed_to); + op->log_entries = log_entries; + std::swap(op->updated_hit_set_history, hset_history); + op->on_all_commit = on_all_commit; + op->tid = tid; + op->reqid = reqid; + op->client_op = client_op; + if (client_op) + op->trace = client_op->pg_trace; + + dout(10) << __func__ << ": op " << *op << " starting" << dendl; + start_rmw(op, std::move(t)); +} + +void ECBackend::call_write_ordered(std::function<void(void)> &&cb) { + if (!waiting_state.empty()) { + waiting_state.back().on_write.emplace_back(std::move(cb)); + } else if (!waiting_reads.empty()) { + waiting_reads.back().on_write.emplace_back(std::move(cb)); + } else { + // 
Nothing earlier in the pipeline, just call it + cb(); + } +} + +void ECBackend::get_all_avail_shards( + const hobject_t &hoid, + const set<pg_shard_t> &error_shards, + set<int> &have, + map<shard_id_t, pg_shard_t> &shards, + bool for_recovery) +{ + for (set<pg_shard_t>::const_iterator i = + get_parent()->get_acting_shards().begin(); + i != get_parent()->get_acting_shards().end(); + ++i) { + dout(10) << __func__ << ": checking acting " << *i << dendl; + const pg_missing_t &missing = get_parent()->get_shard_missing(*i); + if (error_shards.find(*i) != error_shards.end()) + continue; + if (!missing.is_missing(hoid)) { + ceph_assert(!have.count(i->shard)); + have.insert(i->shard); + ceph_assert(!shards.count(i->shard)); + shards.insert(make_pair(i->shard, *i)); + } + } + + if (for_recovery) { + for (set<pg_shard_t>::const_iterator i = + get_parent()->get_backfill_shards().begin(); + i != get_parent()->get_backfill_shards().end(); + ++i) { + if (error_shards.find(*i) != error_shards.end()) + continue; + if (have.count(i->shard)) { + ceph_assert(shards.count(i->shard)); + continue; + } + dout(10) << __func__ << ": checking backfill " << *i << dendl; + ceph_assert(!shards.count(i->shard)); + const pg_info_t &info = get_parent()->get_shard_info(*i); + const pg_missing_t &missing = get_parent()->get_shard_missing(*i); + if (hoid < info.last_backfill && + !missing.is_missing(hoid)) { + have.insert(i->shard); + shards.insert(make_pair(i->shard, *i)); + } + } + + map<hobject_t, set<pg_shard_t>>::const_iterator miter = + get_parent()->get_missing_loc_shards().find(hoid); + if (miter != get_parent()->get_missing_loc_shards().end()) { + for (set<pg_shard_t>::iterator i = miter->second.begin(); + i != miter->second.end(); + ++i) { + dout(10) << __func__ << ": checking missing_loc " << *i << dendl; + auto m = get_parent()->maybe_get_shard_missing(*i); + if (m) { + ceph_assert(!(*m).is_missing(hoid)); + } + if (error_shards.find(*i) != error_shards.end()) + continue; + have.insert(i->shard); + shards.insert(make_pair(i->shard, *i)); + } + } + } +} + +int ECBackend::get_min_avail_to_read_shards( + const hobject_t &hoid, + const set<int> &want, + bool for_recovery, + bool do_redundant_reads, + map<pg_shard_t, vector<pair<int, int>>> *to_read) +{ + // Make sure we don't do redundant reads for recovery + ceph_assert(!for_recovery || !do_redundant_reads); + + set<int> have; + map<shard_id_t, pg_shard_t> shards; + set<pg_shard_t> error_shards; + + get_all_avail_shards(hoid, error_shards, have, shards, for_recovery); + + map<int, vector<pair<int, int>>> need; + int r = ec_impl->minimum_to_decode(want, have, &need); + if (r < 0) + return r; + + if (do_redundant_reads) { + vector<pair<int, int>> subchunks_list; + subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); + for (auto &&i: have) { + need[i] = subchunks_list; + } + } + + if (!to_read) + return 0; + + for (auto &&i:need) { + ceph_assert(shards.count(shard_id_t(i.first))); + to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second)); + } + return 0; +} + +int ECBackend::get_remaining_shards( + const hobject_t &hoid, + const set<int> &avail, + const set<int> &want, + const read_result_t &result, + map<pg_shard_t, vector<pair<int, int>>> *to_read, + bool for_recovery) +{ + ceph_assert(to_read); + + set<int> have; + map<shard_id_t, pg_shard_t> shards; + set<pg_shard_t> error_shards; + for (auto &p : result.errors) { + error_shards.insert(p.first); + } + + get_all_avail_shards(hoid, error_shards, have, shards, for_recovery); + + map<int, 
vector<pair<int, int>>> need; + int r = ec_impl->minimum_to_decode(want, have, &need); + if (r < 0) { + dout(0) << __func__ << " not enough shards left to try for " << hoid + << " read result was " << result << dendl; + return -EIO; + } + + set<int> shards_left; + for (auto p : need) { + if (avail.find(p.first) == avail.end()) { + shards_left.insert(p.first); + } + } + + vector<pair<int, int>> subchunks; + subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); + for (set<int>::iterator i = shards_left.begin(); + i != shards_left.end(); + ++i) { + ceph_assert(shards.count(shard_id_t(*i))); + ceph_assert(avail.find(*i) == avail.end()); + to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks)); + } + return 0; +} + +void ECBackend::start_read_op( + int priority, + map<hobject_t, set<int>> &want_to_read, + map<hobject_t, read_request_t> &to_read, + OpRequestRef _op, + bool do_redundant_reads, + bool for_recovery) +{ + ceph_tid_t tid = get_parent()->get_tid(); + ceph_assert(!tid_to_read_map.count(tid)); + auto &op = tid_to_read_map.emplace( + tid, + ReadOp( + priority, + tid, + do_redundant_reads, + for_recovery, + _op, + std::move(want_to_read), + std::move(to_read))).first->second; + dout(10) << __func__ << ": starting " << op << dendl; + if (_op) { + op.trace = _op->pg_trace; + op.trace.event("start ec read"); + } + do_read_op(op); +} + +void ECBackend::do_read_op(ReadOp &op) +{ + int priority = op.priority; + ceph_tid_t tid = op.tid; + + dout(10) << __func__ << ": starting read " << op << dendl; + + map<pg_shard_t, ECSubRead> messages; + for (map<hobject_t, read_request_t>::iterator i = op.to_read.begin(); + i != op.to_read.end(); + ++i) { + bool need_attrs = i->second.want_attrs; + + for (auto j = i->second.need.begin(); + j != i->second.need.end(); + ++j) { + if (need_attrs) { + messages[j->first].attrs_to_read.insert(i->first); + need_attrs = false; + } + messages[j->first].subchunks[i->first] = j->second; + op.obj_to_source[i->first].insert(j->first); + op.source_to_obj[j->first].insert(i->first); + } + for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator j = + i->second.to_read.begin(); + j != i->second.to_read.end(); + ++j) { + pair<uint64_t, uint64_t> chunk_off_len = + sinfo.aligned_offset_len_to_chunk(make_pair(j->get<0>(), j->get<1>())); + for (auto k = i->second.need.begin(); + k != i->second.need.end(); + ++k) { + messages[k->first].to_read[i->first].push_back( + boost::make_tuple( + chunk_off_len.first, + chunk_off_len.second, + j->get<2>())); + } + ceph_assert(!need_attrs); + } + } + + for (map<pg_shard_t, ECSubRead>::iterator i = messages.begin(); + i != messages.end(); + ++i) { + op.in_progress.insert(i->first); + shard_to_read_map[i->first].insert(op.tid); + i->second.tid = tid; + MOSDECSubOpRead *msg = new MOSDECSubOpRead; + msg->set_priority(priority); + msg->pgid = spg_t( + get_parent()->whoami_spg_t().pgid, + i->first.shard); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_interval_start_epoch(); + msg->op = i->second; + msg->op.from = get_parent()->whoami_shard(); + msg->op.tid = tid; + if (op.trace) { + // initialize a child span for this shard + msg->trace.init("ec sub read", nullptr, &op.trace); + msg->trace.keyval("shard", i->first.shard.id); + } + get_parent()->send_message_osd_cluster( + i->first.osd, + msg, + get_osdmap_epoch()); + } + dout(10) << __func__ << ": started " << op << dendl; +} + +ECUtil::HashInfoRef ECBackend::get_hash_info( + const hobject_t &hoid, bool checks, const 
map<string,bufferptr> *attrs) +{ + dout(10) << __func__ << ": Getting attr on " << hoid << dendl; + ECUtil::HashInfoRef ref = unstable_hashinfo_registry.lookup(hoid); + if (!ref) { + dout(10) << __func__ << ": not in cache " << hoid << dendl; + struct stat st; + int r = store->stat( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + &st); + ECUtil::HashInfo hinfo(ec_impl->get_chunk_count()); + // XXX: What does it mean if there is no object on disk? + if (r >= 0) { + dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl; + bufferlist bl; + if (attrs) { + map<string, bufferptr>::const_iterator k = attrs->find(ECUtil::get_hinfo_key()); + if (k == attrs->end()) { + dout(5) << __func__ << " " << hoid << " missing hinfo attr" << dendl; + } else { + bl.push_back(k->second); + } + } else { + r = store->getattr( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ECUtil::get_hinfo_key(), + bl); + if (r < 0) { + dout(5) << __func__ << ": getattr failed: " << cpp_strerror(r) << dendl; + bl.clear(); // just in case + } + } + if (bl.length() > 0) { + auto bp = bl.cbegin(); + try { + decode(hinfo, bp); + } catch(...) { + dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl; + return ECUtil::HashInfoRef(); + } + if (checks && hinfo.get_total_chunk_size() != (uint64_t)st.st_size) { + dout(0) << __func__ << ": Mismatch of total_chunk_size " + << hinfo.get_total_chunk_size() << dendl; + return ECUtil::HashInfoRef(); + } + } else if (st.st_size > 0) { // If empty object and no hinfo, create it + return ECUtil::HashInfoRef(); + } + } + ref = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo); + } + return ref; +} + +void ECBackend::start_rmw(Op *op, PGTransactionUPtr &&t) +{ + ceph_assert(op); + + op->plan = ECTransaction::get_write_plan( + sinfo, + std::move(t), + [&](const hobject_t &i) { + ECUtil::HashInfoRef ref = get_hash_info(i, false); + if (!ref) { + derr << __func__ << ": get_hash_info(" << i << ")" + << " returned a null pointer and there is no " + << " way to recover from such an error in this " + << " context" << dendl; + ceph_abort(); + } + return ref; + }, + get_parent()->get_dpp()); + + dout(10) << __func__ << ": " << *op << dendl; + + waiting_state.push_back(*op); + check_ops(); +} + +bool ECBackend::try_state_to_reads() +{ + if (waiting_state.empty()) + return false; + + Op *op = &(waiting_state.front()); + if (op->requires_rmw() && pipeline_state.cache_invalid()) { + ceph_assert(get_parent()->get_pool().allows_ecoverwrites()); + dout(20) << __func__ << ": blocking " << *op + << " because it requires an rmw and the cache is invalid " + << pipeline_state + << dendl; + return false; + } + + if (!pipeline_state.caching_enabled()) { + op->using_cache = false; + } else if (op->invalidates_cache()) { + dout(20) << __func__ << ": invalidating cache after this op" + << dendl; + pipeline_state.invalidate(); + } + + waiting_state.pop_front(); + waiting_reads.push_back(*op); + + if (op->using_cache) { + cache.open_write_pin(op->pin); + + extent_set empty; + for (auto &&hpair: op->plan.will_write) { + auto to_read_plan_iter = op->plan.to_read.find(hpair.first); + const extent_set &to_read_plan = + to_read_plan_iter == op->plan.to_read.end() ? 
+ empty : + to_read_plan_iter->second; + + extent_set remote_read = cache.reserve_extents_for_rmw( + hpair.first, + op->pin, + hpair.second, + to_read_plan); + + extent_set pending_read = to_read_plan; + pending_read.subtract(remote_read); + + if (!remote_read.empty()) { + op->remote_read[hpair.first] = std::move(remote_read); + } + if (!pending_read.empty()) { + op->pending_read[hpair.first] = std::move(pending_read); + } + } + } else { + op->remote_read = op->plan.to_read; + } + + dout(10) << __func__ << ": " << *op << dendl; + + if (!op->remote_read.empty()) { + ceph_assert(get_parent()->get_pool().allows_ecoverwrites()); + objects_read_async_no_cache( + op->remote_read, + [this, op](map<hobject_t,pair<int, extent_map> > &&results) { + for (auto &&i: results) { + op->remote_read_result.emplace(i.first, i.second.second); + } + check_ops(); + }); + } + + return true; +} + +bool ECBackend::try_reads_to_commit() +{ + if (waiting_reads.empty()) + return false; + Op *op = &(waiting_reads.front()); + if (op->read_in_progress()) + return false; + waiting_reads.pop_front(); + waiting_commit.push_back(*op); + + dout(10) << __func__ << ": starting commit on " << *op << dendl; + dout(20) << __func__ << ": " << cache << dendl; + + get_parent()->apply_stats( + op->hoid, + op->delta_stats); + + if (op->using_cache) { + for (auto &&hpair: op->pending_read) { + op->remote_read_result[hpair.first].insert( + cache.get_remaining_extents_for_rmw( + hpair.first, + op->pin, + hpair.second)); + } + op->pending_read.clear(); + } else { + ceph_assert(op->pending_read.empty()); + } + + map<shard_id_t, ObjectStore::Transaction> trans; + for (set<pg_shard_t>::const_iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + trans[i->shard]; + } + + op->trace.event("start ec write"); + + map<hobject_t,extent_map> written; + if (op->plan.t) { + ECTransaction::generate_transactions( + op->plan, + ec_impl, + get_parent()->get_info().pgid.pgid, + sinfo, + op->remote_read_result, + op->log_entries, + &written, + &trans, + &(op->temp_added), + &(op->temp_cleared), + get_parent()->get_dpp()); + } + + dout(20) << __func__ << ": " << cache << dendl; + dout(20) << __func__ << ": written: " << written << dendl; + dout(20) << __func__ << ": op: " << *op << dendl; + + if (!get_parent()->get_pool().allows_ecoverwrites()) { + for (auto &&i: op->log_entries) { + if (i.requires_kraken()) { + derr << __func__ << ": log entry " << i << " requires kraken" + << " but overwrites are not enabled!" 
<< dendl; + ceph_abort(); + } + } + } + + map<hobject_t,extent_set> written_set; + for (auto &&i: written) { + written_set[i.first] = i.second.get_interval_set(); + } + dout(20) << __func__ << ": written_set: " << written_set << dendl; + ceph_assert(written_set == op->plan.will_write); + + if (op->using_cache) { + for (auto &&hpair: written) { + dout(20) << __func__ << ": " << hpair << dendl; + cache.present_rmw_update(hpair.first, op->pin, hpair.second); + } + } + op->remote_read.clear(); + op->remote_read_result.clear(); + + ObjectStore::Transaction empty; + bool should_write_local = false; + ECSubWrite local_write_op; + set<pg_shard_t> backfill_shards = get_parent()->get_backfill_shards(); + for (set<pg_shard_t>::const_iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + op->pending_apply.insert(*i); + op->pending_commit.insert(*i); + map<shard_id_t, ObjectStore::Transaction>::iterator iter = + trans.find(i->shard); + ceph_assert(iter != trans.end()); + bool should_send = get_parent()->should_send_op(*i, op->hoid); + const pg_stat_t &stats = + (should_send || !backfill_shards.count(*i)) ? + get_info().stats : + parent->get_shard_info().find(*i)->second.stats; + + ECSubWrite sop( + get_parent()->whoami_shard(), + op->tid, + op->reqid, + op->hoid, + stats, + should_send ? iter->second : empty, + op->version, + op->trim_to, + op->roll_forward_to, + op->log_entries, + op->updated_hit_set_history, + op->temp_added, + op->temp_cleared, + !should_send); + + ZTracer::Trace trace; + if (op->trace) { + // initialize a child span for this shard + trace.init("ec sub write", nullptr, &op->trace); + trace.keyval("shard", i->shard.id); + } + + if (*i == get_parent()->whoami_shard()) { + should_write_local = true; + local_write_op.claim(sop); + } else { + MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop); + r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard); + r->map_epoch = get_osdmap_epoch(); + r->min_epoch = get_parent()->get_interval_start_epoch(); + r->trace = trace; + get_parent()->send_message_osd_cluster( + i->osd, r, get_osdmap_epoch()); + } + } + if (should_write_local) { + handle_sub_write( + get_parent()->whoami_shard(), + op->client_op, + local_write_op, + op->trace); + } + + for (auto i = op->on_write.begin(); + i != op->on_write.end(); + op->on_write.erase(i++)) { + (*i)(); + } + + return true; +} + +bool ECBackend::try_finish_rmw() +{ + if (waiting_commit.empty()) + return false; + Op *op = &(waiting_commit.front()); + if (op->write_in_progress()) + return false; + waiting_commit.pop_front(); + + dout(10) << __func__ << ": " << *op << dendl; + dout(20) << __func__ << ": " << cache << dendl; + + if (op->roll_forward_to > completed_to) + completed_to = op->roll_forward_to; + if (op->version > committed_to) + committed_to = op->version; + + if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) { + if (op->version > get_parent()->get_log().get_can_rollback_to() && + waiting_reads.empty() && + waiting_commit.empty()) { + // submit a dummy transaction to kick the rollforward + auto tid = get_parent()->get_tid(); + Op *nop = &(tid_to_op_map[tid]); + nop->hoid = op->hoid; + nop->trim_to = op->trim_to; + nop->roll_forward_to = op->version; + nop->tid = tid; + nop->reqid = op->reqid; + waiting_reads.push_back(*nop); + } + } + + if (op->using_cache) { + cache.release_write_pin(op->pin); + } + tid_to_op_map.erase(op->tid); + + if (waiting_reads.empty() && + waiting_commit.empty()) { 
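+ // Pipeline fully drained: any cache invalidation caused by earlier ops can be
+ // reset here, re-enabling the extent cache for subsequent writes.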
+ pipeline_state.clear(); + dout(20) << __func__ << ": clearing pipeline_state " + << pipeline_state + << dendl; + } + return true; +} + +void ECBackend::check_ops() +{ + while (try_state_to_reads() || + try_reads_to_commit() || + try_finish_rmw()); +} + +int ECBackend::objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) +{ + return -EOPNOTSUPP; +} + +void ECBackend::objects_read_async( + const hobject_t &hoid, + const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>, + pair<bufferlist*, Context*> > > &to_read, + Context *on_complete, + bool fast_read) +{ + map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > + reads; + + uint32_t flags = 0; + extent_set es; + for (list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>, + pair<bufferlist*, Context*> > >::const_iterator i = + to_read.begin(); + i != to_read.end(); + ++i) { + pair<uint64_t, uint64_t> tmp = + sinfo.offset_len_to_stripe_bounds( + make_pair(i->first.get<0>(), i->first.get<1>())); + + es.union_insert(tmp.first, tmp.second); + flags |= i->first.get<2>(); + } + + if (!es.empty()) { + auto &offsets = reads[hoid]; + for (auto j = es.begin(); + j != es.end(); + ++j) { + offsets.push_back( + boost::make_tuple( + j.get_start(), + j.get_len(), + flags)); + } + } + + struct cb { + ECBackend *ec; + hobject_t hoid; + list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>, + pair<bufferlist*, Context*> > > to_read; + unique_ptr<Context> on_complete; + cb(const cb&) = delete; + cb(cb &&) = default; + cb(ECBackend *ec, + const hobject_t &hoid, + const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>, + pair<bufferlist*, Context*> > > &to_read, + Context *on_complete) + : ec(ec), + hoid(hoid), + to_read(to_read), + on_complete(on_complete) {} + void operator()(map<hobject_t,pair<int, extent_map> > &&results) { + auto dpp = ec->get_parent()->get_dpp(); + ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results + << dendl; + ldpp_dout(dpp, 20) << "objects_read_async_cb: cache: " << ec->cache + << dendl; + + auto &got = results[hoid]; + + int r = 0; + for (auto &&read: to_read) { + if (got.first < 0) { + if (read.second.second) { + read.second.second->complete(got.first); + } + if (r == 0) + r = got.first; + } else { + ceph_assert(read.second.first); + uint64_t offset = read.first.get<0>(); + uint64_t length = read.first.get<1>(); + auto range = got.second.get_containing_range(offset, length); + ceph_assert(range.first != range.second); + ceph_assert(range.first.get_off() <= offset); + ldpp_dout(dpp, 30) << "offset: " << offset << dendl; + ldpp_dout(dpp, 30) << "range offset: " << range.first.get_off() << dendl; + ldpp_dout(dpp, 30) << "length: " << length << dendl; + ldpp_dout(dpp, 30) << "range length: " << range.first.get_len() << dendl; + ceph_assert( + (offset + length) <= + (range.first.get_off() + range.first.get_len())); + read.second.first->substr_of( + range.first.get_val(), + offset - range.first.get_off(), + length); + if (read.second.second) { + read.second.second->complete(length); + read.second.second = nullptr; + } + } + } + to_read.clear(); + if (on_complete) { + on_complete.release()->complete(r); + } + } + ~cb() { + for (auto &&i: to_read) { + delete i.second.second; + } + to_read.clear(); + } + }; + objects_read_and_reconstruct( + reads, + fast_read, + make_gen_lambda_context< + map<hobject_t,pair<int, extent_map> > &&, cb>( + cb(this, + hoid, + to_read, + on_complete))); +} + +struct CallClientContexts : + public 
GenContext<pair<RecoveryMessages*, ECBackend::read_result_t& > &> { + hobject_t hoid; + ECBackend *ec; + ECBackend::ClientAsyncReadStatus *status; + list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read; + CallClientContexts( + hobject_t hoid, + ECBackend *ec, + ECBackend::ClientAsyncReadStatus *status, + const list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read) + : hoid(hoid), ec(ec), status(status), to_read(to_read) {} + void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) override { + ECBackend::read_result_t &res = in.second; + extent_map result; + if (res.r != 0) + goto out; + ceph_assert(res.returned.size() == to_read.size()); + ceph_assert(res.errors.empty()); + for (auto &&read: to_read) { + pair<uint64_t, uint64_t> adjusted = + ec->sinfo.offset_len_to_stripe_bounds( + make_pair(read.get<0>(), read.get<1>())); + ceph_assert(res.returned.front().get<0>() == adjusted.first && + res.returned.front().get<1>() == adjusted.second); + map<int, bufferlist> to_decode; + bufferlist bl; + for (map<pg_shard_t, bufferlist>::iterator j = + res.returned.front().get<2>().begin(); + j != res.returned.front().get<2>().end(); + ++j) { + to_decode[j->first.shard].claim(j->second); + } + int r = ECUtil::decode( + ec->sinfo, + ec->ec_impl, + to_decode, + &bl); + if (r < 0) { + res.r = r; + goto out; + } + bufferlist trimmed; + trimmed.substr_of( + bl, + read.get<0>() - adjusted.first, + std::min(read.get<1>(), + bl.length() - (read.get<0>() - adjusted.first))); + result.insert( + read.get<0>(), trimmed.length(), std::move(trimmed)); + res.returned.pop_front(); + } +out: + status->complete_object(hoid, res.r, std::move(result)); + ec->kick_reads(); + } +}; + +void ECBackend::objects_read_and_reconstruct( + const map<hobject_t, + std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > + > &reads, + bool fast_read, + GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func) +{ + in_progress_client_reads.emplace_back( + reads.size(), std::move(func)); + if (!reads.size()) { + kick_reads(); + return; + } + + map<hobject_t, set<int>> obj_want_to_read; + set<int> want_to_read; + get_want_to_read_shards(&want_to_read); + + map<hobject_t, read_request_t> for_read_op; + for (auto &&to_read: reads) { + map<pg_shard_t, vector<pair<int, int>>> shards; + int r = get_min_avail_to_read_shards( + to_read.first, + want_to_read, + false, + fast_read, + &shards); + ceph_assert(r == 0); + + CallClientContexts *c = new CallClientContexts( + to_read.first, + this, + &(in_progress_client_reads.back()), + to_read.second); + for_read_op.insert( + make_pair( + to_read.first, + read_request_t( + to_read.second, + shards, + false, + c))); + obj_want_to_read.insert(make_pair(to_read.first, want_to_read)); + } + + start_read_op( + CEPH_MSG_PRIO_DEFAULT, + obj_want_to_read, + for_read_op, + OpRequestRef(), + fast_read, false); + return; +} + + +int ECBackend::send_all_remaining_reads( + const hobject_t &hoid, + ReadOp &rop) +{ + set<int> already_read; + const set<pg_shard_t>& ots = rop.obj_to_source[hoid]; + for (set<pg_shard_t>::iterator i = ots.begin(); i != ots.end(); ++i) + already_read.insert(i->shard); + dout(10) << __func__ << " have/error shards=" << already_read << dendl; + map<pg_shard_t, vector<pair<int, int>>> shards; + int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid], + rop.complete[hoid], &shards, rop.for_recovery); + if (r) + return r; + + list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets = + rop.to_read.find(hoid)->second.to_read; + 
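+ // The retry keeps the original extents and completion callback; only the set
+ // of shards to read from changes.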
GenContext<pair<RecoveryMessages *, read_result_t& > &> *c = + rop.to_read.find(hoid)->second.cb; + + // (Note cuixf) If we need to read attrs and we read failed, try to read again. + bool want_attrs = + rop.to_read.find(hoid)->second.want_attrs && + (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty()); + if (want_attrs) { + dout(10) << __func__ << " want attrs again" << dendl; + } + + rop.to_read.erase(hoid); + rop.to_read.insert(make_pair( + hoid, + read_request_t( + offsets, + shards, + want_attrs, + c))); + do_read_op(rop); + return 0; +} + +int ECBackend::objects_get_attrs( + const hobject_t &hoid, + map<string, bufferlist> *out) +{ + int r = store->getattrs( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + *out); + if (r < 0) + return r; + + for (map<string, bufferlist>::iterator i = out->begin(); + i != out->end(); + ) { + if (ECUtil::is_hinfo_key_string(i->first)) + out->erase(i++); + else + ++i; + } + return r; +} + +void ECBackend::rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t) +{ + ceph_assert(old_size % sinfo.get_stripe_width() == 0); + t->truncate( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + sinfo.aligned_logical_offset_to_chunk_offset( + old_size)); +} + +int ECBackend::be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) +{ + dout(10) << __func__ << " " << poid << " pos " << pos << dendl; + int r; + + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + utime_t sleeptime; + sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep); + if (sleeptime != utime_t()) { + lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl; + sleeptime.sleep(); + } + + if (pos.data_pos == 0) { + pos.data_hash = bufferhash(-1); + } + + uint64_t stride = cct->_conf->osd_deep_scrub_stride; + if (stride % sinfo.get_chunk_size()) + stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size()); + + bufferlist bl; + r = store->read( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + pos.data_pos, + stride, bl, + fadvise_flags); + if (r < 0) { + dout(20) << __func__ << " " << poid << " got " + << r << " on read, read_error" << dendl; + o.read_error = true; + return 0; + } + if (bl.length() % sinfo.get_chunk_size()) { + dout(20) << __func__ << " " << poid << " got " + << r << " on read, not chunk size " << sinfo.get_chunk_size() << " aligned" + << dendl; + o.read_error = true; + return 0; + } + if (r > 0) { + pos.data_hash << bl; + } + pos.data_pos += r; + if (r == (int)stride) { + return -EINPROGRESS; + } + + ECUtil::HashInfoRef hinfo = get_hash_info(poid, false, &o.attrs); + if (!hinfo) { + dout(0) << "_scan_list " << poid << " could not retrieve hash info" << dendl; + o.read_error = true; + o.digest_present = false; + return 0; + } else { + if (!get_parent()->get_pool().allows_ecoverwrites()) { + ceph_assert(hinfo->has_chunk_hash()); + if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) { + dout(0) << "_scan_list " << poid << " got incorrect size on read 0x" + << std::hex << pos + << " expected 0x" << hinfo->get_total_chunk_size() << std::dec + << dendl; + o.ec_size_mismatch = true; + return 0; + } + + if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != + pos.data_hash.digest()) { + dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x" + << std::hex << 
pos.data_hash.digest() << " != expected 0x" + << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) + << std::dec << dendl; + o.ec_hash_mismatch = true; + return 0; + } + + /* We checked above that we match our own stored hash. We cannot + * send a hash of the actual object, so instead we simply send + * our locally stored hash of shard 0 on the assumption that if + * we match our chunk hash and our recollection of the hash for + * chunk 0 matches that of our peers, there is likely no corruption. + */ + o.digest = hinfo->get_chunk_hash(0); + o.digest_present = true; + } else { + /* Hack! We must be using partial overwrites, and partial overwrites + * don't support deep-scrub yet + */ + o.digest = 0; + o.digest_present = true; + } + } + + o.omap_digest = -1; + o.omap_digest_present = true; + return 0; +} diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h new file mode 100644 index 00000000..e003a08c --- /dev/null +++ b/src/osd/ECBackend.h @@ -0,0 +1,690 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef ECBACKEND_H +#define ECBACKEND_H + +#include <boost/intrusive/set.hpp> +#include <boost/intrusive/list.hpp> + +#include "OSD.h" +#include "PGBackend.h" +#include "erasure-code/ErasureCodeInterface.h" +#include "ECUtil.h" +#include "ECTransaction.h" +#include "ExtentCache.h" + +//forward declaration +struct ECSubWrite; +struct ECSubWriteReply; +struct ECSubRead; +struct ECSubReadReply; + +struct RecoveryMessages; +class ECBackend : public PGBackend { +public: + RecoveryHandle *open_recovery_op() override; + + void run_recovery_op( + RecoveryHandle *h, + int priority + ) override; + + int recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *h + ) override; + + bool _handle_message( + OpRequestRef op + ) override; + bool can_handle_while_inactive( + OpRequestRef op + ) override; + friend struct SubWriteApplied; + friend struct SubWriteCommitted; + void sub_write_committed( + ceph_tid_t tid, + eversion_t version, + eversion_t last_complete, + const ZTracer::Trace &trace); + void handle_sub_write( + pg_shard_t from, + OpRequestRef msg, + ECSubWrite &op, + const ZTracer::Trace &trace + ); + void handle_sub_read( + pg_shard_t from, + const ECSubRead &op, + ECSubReadReply *reply, + const ZTracer::Trace &trace + ); + void handle_sub_write_reply( + pg_shard_t from, + const ECSubWriteReply &op, + const ZTracer::Trace &trace + ); + void handle_sub_read_reply( + pg_shard_t from, + ECSubReadReply &op, + RecoveryMessages *m, + const ZTracer::Trace &trace + ); + + /// @see ReadOp below + void check_recovery_sources(const OSDMapRef& osdmap) override; + + void on_change() override; + void clear_recovery_state() override; + + void dump_recovery_info(Formatter *f) const override; + + void call_write_ordered(std::function<void(void)> &&cb) override; + + void submit_transaction( + const hobject_t &hoid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&t, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const vector<pg_log_entry_t> &log_entries, + boost::optional<pg_hit_set_history_t> 
&hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef op + ) override; + + int objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) override; + + /** + * Async read mechanism + * + * Async reads use the same async read mechanism as does recovery. + * CallClientContexts is responsible for reconstructing the response + * buffer as well as for calling the callbacks. + * + * One tricky bit is that two reads may possibly not read from the same + * set of replicas. This could result in two reads completing in the + * wrong (from the interface user's point of view) order. Thus, we + * maintain a queue of in progress reads (@see in_progress_client_reads) + * to ensure that we always call the completion callback in order. + * + * Another subtly is that while we may read a degraded object, we will + * still only perform a client read from shards in the acting set. This + * ensures that we won't ever have to restart a client initiated read in + * check_recovery_sources. + */ + void objects_read_and_reconstruct( + const map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > + > &reads, + bool fast_read, + GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func); + + friend struct CallClientContexts; + struct ClientAsyncReadStatus { + unsigned objects_to_read; + GenContextURef<map<hobject_t,pair<int, extent_map> > &&> func; + map<hobject_t,pair<int, extent_map> > results; + explicit ClientAsyncReadStatus( + unsigned objects_to_read, + GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func) + : objects_to_read(objects_to_read), func(std::move(func)) {} + void complete_object( + const hobject_t &hoid, + int err, + extent_map &&buffers) { + ceph_assert(objects_to_read); + --objects_to_read; + ceph_assert(!results.count(hoid)); + results.emplace(hoid, make_pair(err, std::move(buffers))); + } + bool is_complete() const { + return objects_to_read == 0; + } + void run() { + func.release()->complete(std::move(results)); + } + }; + list<ClientAsyncReadStatus> in_progress_client_reads; + void objects_read_async( + const hobject_t &hoid, + const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>, + pair<bufferlist*, Context*> > > &to_read, + Context *on_complete, + bool fast_read = false) override; + + template <typename Func> + void objects_read_async_no_cache( + const map<hobject_t,extent_set> &to_read, + Func &&on_complete) { + map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > _to_read; + for (auto &&hpair: to_read) { + auto &l = _to_read[hpair.first]; + for (auto extent: hpair.second) { + l.emplace_back(extent.first, extent.second, 0); + } + } + objects_read_and_reconstruct( + _to_read, + false, + make_gen_lambda_context< + map<hobject_t,pair<int, extent_map> > &&, Func>( + std::forward<Func>(on_complete))); + } + void kick_reads() { + while (in_progress_client_reads.size() && + in_progress_client_reads.front().is_complete()) { + in_progress_client_reads.front().run(); + in_progress_client_reads.pop_front(); + } + } + +private: + friend struct ECRecoveryHandle; + uint64_t get_recovery_chunk_size() const { + return round_up_to(cct->_conf->osd_recovery_max_chunk, + sinfo.get_stripe_width()); + } + + void get_want_to_read_shards(set<int> *want_to_read) const { + const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping(); + for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) { + int chunk = (int)chunk_mapping.size() > i ? 
chunk_mapping[i] : i; + want_to_read->insert(chunk); + } + } + + /** + * Recovery + * + * Recovery uses the same underlying read mechanism as client reads + * with the slight difference that recovery reads may come from non + * acting shards. Thus, check_recovery_sources may wind up calling + * cancel_pull for a read originating with RecoveryOp. + * + * The recovery process is expressed as a state machine: + * - IDLE: Nothing is currently in progress, reads will be started and + * we will transition to READING + * - READING: We are awaiting a pending read op. Once complete, we will + * decode the buffers and proceed to WRITING + * - WRITING: We are awaiting a completed push. Once complete, we will + * either transition to COMPLETE or to IDLE to continue. + * - COMPLETE: complete + * + * We use the existing Push and PushReply messages and structures to + * handle actually shuffling the data over to the replicas. recovery_info + * and recovery_progress are expressed in terms of the logical offset + * space except for data_included which is in terms of the chunked object + * space (to match the passed buffer). + * + * xattrs are requested on the first read and used to initialize the + * object_context if missing on completion of the first read. + * + * In order to batch up reads and writes, we batch Push, PushReply, + * Transaction, and reads in a RecoveryMessages object which is passed + * among the recovery methods. + */ + struct RecoveryOp { + hobject_t hoid; + eversion_t v; + set<pg_shard_t> missing_on; + set<shard_id_t> missing_on_shards; + + ObjectRecoveryInfo recovery_info; + ObjectRecoveryProgress recovery_progress; + + enum state_t { IDLE, READING, WRITING, COMPLETE } state; + + static const char* tostr(state_t state) { + switch (state) { + case ECBackend::RecoveryOp::IDLE: + return "IDLE"; + break; + case ECBackend::RecoveryOp::READING: + return "READING"; + break; + case ECBackend::RecoveryOp::WRITING: + return "WRITING"; + break; + case ECBackend::RecoveryOp::COMPLETE: + return "COMPLETE"; + break; + default: + ceph_abort(); + return ""; + } + } + + // must be filled if state == WRITING + map<int, bufferlist> returned_data; + map<string, bufferlist> xattrs; + ECUtil::HashInfoRef hinfo; + ObjectContextRef obc; + set<pg_shard_t> waiting_on_pushes; + + // valid in state READING + pair<uint64_t, uint64_t> extent_requested; + + void dump(Formatter *f) const; + + RecoveryOp() : state(IDLE) {} + }; + friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs); + map<hobject_t, RecoveryOp> recovery_ops; + + void continue_recovery_op( + RecoveryOp &op, + RecoveryMessages *m); + void dispatch_recovery_messages(RecoveryMessages &m, int priority); + friend struct OnRecoveryReadComplete; + void handle_recovery_read_complete( + const hobject_t &hoid, + boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read, + boost::optional<map<string, bufferlist> > attrs, + RecoveryMessages *m); + void handle_recovery_push( + const PushOp &op, + RecoveryMessages *m, + bool is_repair); + void handle_recovery_push_reply( + const PushReplyOp &op, + pg_shard_t from, + RecoveryMessages *m); + void get_all_avail_shards( + const hobject_t &hoid, + const set<pg_shard_t> &error_shards, + set<int> &have, + map<shard_id_t, pg_shard_t> &shards, + bool for_recovery); + +public: + /** + * Low level async read mechanism + * + * To avoid duplicating the logic for requesting and waiting for + * multiple object shards, there is a common async read mechanism + * taking a map of hobject_t->read_request_t 
which defines callbacks + * taking read_result_ts as arguments. + * + * tid_to_read_map gives open read ops. check_recovery_sources uses + * shard_to_read_map and ReadOp::source_to_obj to restart reads + * involving down osds. + * + * The user is responsible for specifying replicas on which to read + * and for reassembling the buffer on the other side since client + * reads require the original object buffer while recovery only needs + * the missing pieces. + * + * Rather than handling reads on the primary directly, we simply send + * ourselves a message. This avoids a dedicated primary path for that + * part. + */ + struct read_result_t { + int r; + map<pg_shard_t, int> errors; + boost::optional<map<string, bufferlist> > attrs; + list< + boost::tuple< + uint64_t, uint64_t, map<pg_shard_t, bufferlist> > > returned; + read_result_t() : r(0) {} + }; + struct read_request_t { + const list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read; + const map<pg_shard_t, vector<pair<int, int>>> need; + const bool want_attrs; + GenContext<pair<RecoveryMessages *, read_result_t& > &> *cb; + read_request_t( + const list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read, + const map<pg_shard_t, vector<pair<int, int>>> &need, + bool want_attrs, + GenContext<pair<RecoveryMessages *, read_result_t& > &> *cb) + : to_read(to_read), need(need), want_attrs(want_attrs), + cb(cb) {} + }; + friend ostream &operator<<(ostream &lhs, const read_request_t &rhs); + + struct ReadOp { + int priority; + ceph_tid_t tid; + OpRequestRef op; // may be null if not on behalf of a client + // True if redundant reads are issued, false otherwise, + // this is useful to tradeoff some resources (redundant ops) for + // low latency read, especially on relatively idle cluster + bool do_redundant_reads; + // True if reading for recovery which could possibly reading only a subset + // of the available shards. 
+ bool for_recovery; + + ZTracer::Trace trace; + + map<hobject_t, set<int>> want_to_read; + map<hobject_t, read_request_t> to_read; + map<hobject_t, read_result_t> complete; + + map<hobject_t, set<pg_shard_t>> obj_to_source; + map<pg_shard_t, set<hobject_t> > source_to_obj; + + void dump(Formatter *f) const; + + set<pg_shard_t> in_progress; + + ReadOp( + int priority, + ceph_tid_t tid, + bool do_redundant_reads, + bool for_recovery, + OpRequestRef op, + map<hobject_t, set<int>> &&_want_to_read, + map<hobject_t, read_request_t> &&_to_read) + : priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads), + for_recovery(for_recovery), want_to_read(std::move(_want_to_read)), + to_read(std::move(_to_read)) { + for (auto &&hpair: to_read) { + auto &returned = complete[hpair.first].returned; + for (auto &&extent: hpair.second.to_read) { + returned.push_back( + boost::make_tuple( + extent.get<0>(), + extent.get<1>(), + map<pg_shard_t, bufferlist>())); + } + } + } + ReadOp() = delete; + ReadOp(const ReadOp &) = default; + ReadOp(ReadOp &&) = default; + }; + friend struct FinishReadOp; + void filter_read_op( + const OSDMapRef& osdmap, + ReadOp &op); + void complete_read_op(ReadOp &rop, RecoveryMessages *m); + friend ostream &operator<<(ostream &lhs, const ReadOp &rhs); + map<ceph_tid_t, ReadOp> tid_to_read_map; + map<pg_shard_t, set<ceph_tid_t> > shard_to_read_map; + void start_read_op( + int priority, + map<hobject_t, set<int>> &want_to_read, + map<hobject_t, read_request_t> &to_read, + OpRequestRef op, + bool do_redundant_reads, bool for_recovery); + + void do_read_op(ReadOp &rop); + int send_all_remaining_reads( + const hobject_t &hoid, + ReadOp &rop); + + + /** + * Client writes + * + * ECTransaction is responsible for generating a transaction for + * each shard to which we need to send the write. As required + * by the PGBackend interface, the ECBackend write mechanism + * passes trim information with the write and last_complete back + * with the reply. + * + * As with client reads, there is a possibility of out-of-order + * completions. Thus, callbacks and completion are called in order + * on the writing list. + */ + struct Op : boost::intrusive::list_base_hook<> { + /// From submit_transaction caller, describes operation + hobject_t hoid; + object_stat_sum_t delta_stats; + eversion_t version; + eversion_t trim_to; + boost::optional<pg_hit_set_history_t> updated_hit_set_history; + vector<pg_log_entry_t> log_entries; + ceph_tid_t tid; + osd_reqid_t reqid; + ZTracer::Trace trace; + + eversion_t roll_forward_to; /// Soon to be generated internally + + /// Ancillary also provided from submit_transaction caller + map<hobject_t, ObjectContextRef> obc_map; + + /// see call_write_ordered + std::list<std::function<void(void)> > on_write; + + /// Generated internally + set<hobject_t> temp_added; + set<hobject_t> temp_cleared; + + ECTransaction::WritePlan plan; + bool requires_rmw() const { return !plan.to_read.empty(); } + bool invalidates_cache() const { return plan.invalidates_cache; } + + // must be true if requires_rmw(), must be false if invalidates_cache() + bool using_cache = true; + + /// In progress read state; + map<hobject_t,extent_set> pending_read; // subset already being read + map<hobject_t,extent_set> remote_read; // subset we must read + map<hobject_t,extent_map> remote_read_result; + bool read_in_progress() const { + return !remote_read.empty() && remote_read_result.empty(); + } + + /// In progress write state. 
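+ /// pending_commit holds the shards that have not yet acknowledged the
+ /// sub-write as committed.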
+ set<pg_shard_t> pending_commit; + // we need pending_apply for pre-mimic peers so that we don't issue a + // read on a remote shard before it has applied a previous write. We can + // remove this after nautilus. + set<pg_shard_t> pending_apply; + bool write_in_progress() const { + return !pending_commit.empty() || !pending_apply.empty(); + } + + /// optional, may be null, for tracking purposes + OpRequestRef client_op; + + /// pin for cache + ExtentCache::write_pin pin; + + /// Callbacks + Context *on_all_commit = nullptr; + ~Op() { + delete on_all_commit; + } + }; + using op_list = boost::intrusive::list<Op>; + friend ostream &operator<<(ostream &lhs, const Op &rhs); + + ExtentCache cache; + map<ceph_tid_t, Op> tid_to_op_map; /// Owns Op structure + + /** + * We model the possible rmw states as a set of waitlists. + * All writes at this time complete in order, so a write blocked + * at waiting_state blocks all writes behind it as well (same for + * other states). + * + * Future work: We can break this up into a per-object pipeline + * (almost). First, provide an ordering token to submit_transaction + * and require that all operations within a single transaction take + * place on a subset of hobject_t space partitioned by that token + * (the hashid seem about right to me -- even works for temp objects + * if you recall that a temp object created for object head foo will + * only ever be referenced by other transactions on foo and aren't + * reused). Next, factor this part into a class and maintain one per + * ordering token. Next, fixup PrimaryLogPG's repop queue to be + * partitioned by ordering token. Finally, refactor the op pipeline + * so that the log entries passed into submit_transaction aren't + * versioned. We can't assign versions to them until we actually + * submit the operation. That's probably going to be the hard part. 
+ */ + class pipeline_state_t { + enum { + CACHE_VALID = 0, + CACHE_INVALID = 1 + } pipeline_state = CACHE_VALID; + public: + bool caching_enabled() const { + return pipeline_state == CACHE_VALID; + } + bool cache_invalid() const { + return !caching_enabled(); + } + void invalidate() { + pipeline_state = CACHE_INVALID; + } + void clear() { + pipeline_state = CACHE_VALID; + } + friend ostream &operator<<(ostream &lhs, const pipeline_state_t &rhs); + } pipeline_state; + + + op_list waiting_state; /// writes waiting on pipe_state + op_list waiting_reads; /// writes waiting on partial stripe reads + op_list waiting_commit; /// writes waiting on initial commit + eversion_t completed_to; + eversion_t committed_to; + void start_rmw(Op *op, PGTransactionUPtr &&t); + bool try_state_to_reads(); + bool try_reads_to_commit(); + bool try_finish_rmw(); + void check_ops(); + + ErasureCodeInterfaceRef ec_impl; + + + /** + * ECRecPred + * + * Determines the whether _have is sufficient to recover an object + */ + class ECRecPred : public IsPGRecoverablePredicate { + set<int> want; + ErasureCodeInterfaceRef ec_impl; + public: + explicit ECRecPred(ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) { + for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) { + want.insert(i); + } + } + bool operator()(const set<pg_shard_t> &_have) const override { + set<int> have; + for (set<pg_shard_t>::const_iterator i = _have.begin(); + i != _have.end(); + ++i) { + have.insert(i->shard); + } + map<int, vector<pair<int, int>>> min; + return ec_impl->minimum_to_decode(want, have, &min) == 0; + } + }; + IsPGRecoverablePredicate *get_is_recoverable_predicate() const override { + return new ECRecPred(ec_impl); + } + + int get_ec_data_chunk_count() const override { + return ec_impl->get_data_chunk_count(); + } + int get_ec_stripe_chunk_size() const override { + return sinfo.get_chunk_size(); + } + + /** + * ECReadPred + * + * Determines the whether _have is sufficient to read an object + */ + class ECReadPred : public IsPGReadablePredicate { + pg_shard_t whoami; + ECRecPred rec_pred; + public: + ECReadPred( + pg_shard_t whoami, + ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {} + bool operator()(const set<pg_shard_t> &_have) const override { + return _have.count(whoami) && rec_pred(_have); + } + }; + IsPGReadablePredicate *get_is_readable_predicate() const override { + return new ECReadPred(get_parent()->whoami_shard(), ec_impl); + } + + + const ECUtil::stripe_info_t sinfo; + /// If modified, ensure that the ref is held until the update is applied + SharedPtrRegistry<hobject_t, ECUtil::HashInfo> unstable_hashinfo_registry; + ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool checks = true, + const map<string,bufferptr> *attr = NULL); + +public: + ECBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct, + ErasureCodeInterfaceRef ec_impl, + uint64_t stripe_width); + + /// Returns to_read replicas sufficient to reconstruct want + int get_min_avail_to_read_shards( + const hobject_t &hoid, ///< [in] object + const set<int> &want, ///< [in] desired shards + bool for_recovery, ///< [in] true if we may use non-acting replicas + bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency + map<pg_shard_t, vector<pair<int, int>>> *to_read ///< [out] shards, corresponding subchunks to read + ); ///< @return error code, 0 on success + + int get_remaining_shards( + const hobject_t &hoid, + 
const set<int> &avail, + const set<int> &want, + const read_result_t &result, + map<pg_shard_t, vector<pair<int, int>>> *to_read, + bool for_recovery); + + int objects_get_attrs( + const hobject_t &hoid, + map<string, bufferlist> *out) override; + + void rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t) override; + + bool auto_repair_supported() const override { return true; } + + int be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) override; + uint64_t be_get_ondisk_size(uint64_t logical_size) override { + return sinfo.logical_to_next_chunk_offset(logical_size); + } + void _failed_push(const hobject_t &hoid, + pair<RecoveryMessages *, ECBackend::read_result_t &> &in); +}; +ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs); + +#endif diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc new file mode 100644 index 00000000..02c3a8e2 --- /dev/null +++ b/src/osd/ECMsgTypes.cc @@ -0,0 +1,410 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "ECMsgTypes.h" + +void ECSubWrite::encode(bufferlist &bl) const +{ + ENCODE_START(4, 1, bl); + encode(from, bl); + encode(tid, bl); + encode(reqid, bl); + encode(soid, bl); + encode(stats, bl); + encode(t, bl); + encode(at_version, bl); + encode(trim_to, bl); + encode(log_entries, bl); + encode(temp_added, bl); + encode(temp_removed, bl); + encode(updated_hit_set_history, bl); + encode(roll_forward_to, bl); + encode(backfill_or_async_recovery, bl); + ENCODE_FINISH(bl); +} + +void ECSubWrite::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(4, bl); + decode(from, bl); + decode(tid, bl); + decode(reqid, bl); + decode(soid, bl); + decode(stats, bl); + decode(t, bl); + decode(at_version, bl); + decode(trim_to, bl); + decode(log_entries, bl); + decode(temp_added, bl); + decode(temp_removed, bl); + if (struct_v >= 2) { + decode(updated_hit_set_history, bl); + } + if (struct_v >= 3) { + decode(roll_forward_to, bl); + } else { + roll_forward_to = trim_to; + } + if (struct_v >= 4) { + decode(backfill_or_async_recovery, bl); + } else { + // The old protocol used an empty transaction to indicate backfill or async_recovery + backfill_or_async_recovery = t.empty(); + } + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubWrite &rhs) +{ + lhs << "ECSubWrite(tid=" << rhs.tid + << ", reqid=" << rhs.reqid + << ", at_version=" << rhs.at_version + << ", trim_to=" << rhs.trim_to + << ", roll_forward_to=" << rhs.roll_forward_to; + if (rhs.updated_hit_set_history) + lhs << ", has_updated_hit_set_history"; + if (rhs.backfill_or_async_recovery) + lhs << ", backfill_or_async_recovery"; + return lhs << ")"; +} + +void ECSubWrite::dump(Formatter *f) const +{ + f->dump_unsigned("tid", tid); + f->dump_stream("reqid") << reqid; + f->dump_stream("at_version") << at_version; + f->dump_stream("trim_to") << trim_to; + f->dump_stream("roll_forward_to") << roll_forward_to; + f->dump_bool("has_updated_hit_set_history", + static_cast<bool>(updated_hit_set_history)); + f->dump_bool("backfill_or_async_recovery", backfill_or_async_recovery); +} + 
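ECSubWrite::encode/decode above follow Ceph's usual versioned-encoding pattern: bump the struct version when a field is added, and synthesize a default for older encodings on decode (roll_forward_to falls back to trim_to, backfill_or_async_recovery to t.empty()). The standalone sketch below shows the same idea with a hand-rolled byte format; Ceph's ENCODE_START/DECODE_START macros additionally record a compat version and payload length, which this toy deliberately omits.

#include <cstdint>
#include <vector>
#include <cassert>
#include <iostream>

struct ToyWrite {
  uint64_t tid = 0;
  uint64_t trim_to = 0;
  uint64_t roll_forward_to = 0;   // only present since version 3
};

static void put_u64(std::vector<uint8_t> &bl, uint64_t v) {
  for (int i = 0; i < 8; ++i) bl.push_back((v >> (8 * i)) & 0xff);
}
static uint64_t get_u64(const std::vector<uint8_t> &bl, size_t &off) {
  uint64_t v = 0;
  for (int i = 0; i < 8; ++i) v |= uint64_t(bl[off + i]) << (8 * i);
  off += 8;
  return v;
}

static std::vector<uint8_t> encode(const ToyWrite &w, uint8_t version) {
  std::vector<uint8_t> bl;
  bl.push_back(version);          // stand-in for ENCODE_START's version byte
  put_u64(bl, w.tid);
  put_u64(bl, w.trim_to);
  if (version >= 3)
    put_u64(bl, w.roll_forward_to);
  return bl;
}

static ToyWrite decode(const std::vector<uint8_t> &bl) {
  size_t off = 0;
  uint8_t struct_v = bl[off++];
  ToyWrite w;
  w.tid = get_u64(bl, off);
  w.trim_to = get_u64(bl, off);
  if (struct_v >= 3) {
    w.roll_forward_to = get_u64(bl, off);
  } else {
    // same fallback ECSubWrite::decode applies for old encodings
    w.roll_forward_to = w.trim_to;
  }
  return w;
}

int main() {
  ToyWrite w{7, 100, 250};
  ToyWrite from_old = decode(encode(w, 2));   // old encoding drops the field
  assert(from_old.roll_forward_to == 100);    // defaulted to trim_to
  ToyWrite from_new = decode(encode(w, 3));
  assert(from_new.roll_forward_to == 250);
  std::cout << "ok\n";
}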
+void ECSubWrite::generate_test_instances(list<ECSubWrite*> &o) +{ + o.push_back(new ECSubWrite()); + o.back()->tid = 1; + o.back()->at_version = eversion_t(2, 100); + o.back()->trim_to = eversion_t(1, 40); + o.push_back(new ECSubWrite()); + o.back()->tid = 4; + o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678); + o.back()->at_version = eversion_t(10, 300); + o.back()->trim_to = eversion_t(5, 42); + o.push_back(new ECSubWrite()); + o.back()->tid = 9; + o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678); + o.back()->at_version = eversion_t(10, 300); + o.back()->trim_to = eversion_t(5, 42); + o.back()->roll_forward_to = eversion_t(8, 250); +} + +void ECSubWriteReply::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(from, bl); + encode(tid, bl); + encode(last_complete, bl); + encode(committed, bl); + encode(applied, bl); + ENCODE_FINISH(bl); +} + +void ECSubWriteReply::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(from, bl); + decode(tid, bl); + decode(last_complete, bl); + decode(committed, bl); + decode(applied, bl); + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubWriteReply &rhs) +{ + return lhs + << "ECSubWriteReply(tid=" << rhs.tid + << ", last_complete=" << rhs.last_complete + << ", committed=" << rhs.committed + << ", applied=" << rhs.applied << ")"; +} + +void ECSubWriteReply::dump(Formatter *f) const +{ + f->dump_unsigned("tid", tid); + f->dump_stream("last_complete") << last_complete; + f->dump_bool("committed", committed); + f->dump_bool("applied", applied); +} + +void ECSubWriteReply::generate_test_instances(list<ECSubWriteReply*>& o) +{ + o.push_back(new ECSubWriteReply()); + o.back()->tid = 20; + o.back()->last_complete = eversion_t(100, 2000); + o.back()->committed = true; + o.push_back(new ECSubWriteReply()); + o.back()->tid = 80; + o.back()->last_complete = eversion_t(50, 200); + o.back()->applied = true; +} + +void ECSubRead::encode(bufferlist &bl, uint64_t features) const +{ + if ((features & CEPH_FEATURE_OSD_FADVISE_FLAGS) == 0) { + ENCODE_START(2, 1, bl); + encode(from, bl); + encode(tid, bl); + map<hobject_t, list<pair<uint64_t, uint64_t> >> tmp; + for (map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> >>::const_iterator m = to_read.begin(); + m != to_read.end(); ++m) { + list<pair<uint64_t, uint64_t> > tlist; + for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator l = m->second.begin(); + l != m->second.end(); ++l) { + tlist.push_back(std::make_pair(l->get<0>(), l->get<1>())); + } + tmp[m->first] = tlist; + } + encode(tmp, bl); + encode(attrs_to_read, bl); + encode(subchunks, bl); + ENCODE_FINISH(bl); + return; + } + + ENCODE_START(3, 2, bl); + encode(from, bl); + encode(tid, bl); + encode(to_read, bl); + encode(attrs_to_read, bl); + encode(subchunks, bl); + ENCODE_FINISH(bl); +} + +void ECSubRead::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(3, bl); + decode(from, bl); + decode(tid, bl); + if (struct_v == 1) { + map<hobject_t, list<pair<uint64_t, uint64_t> >>tmp; + decode(tmp, bl); + for (map<hobject_t, list<pair<uint64_t, uint64_t> >>::const_iterator m = tmp.begin(); + m != tmp.end(); ++m) { + list<boost::tuple<uint64_t, uint64_t, uint32_t> > tlist; + for (list<pair<uint64_t, uint64_t> > ::const_iterator l = m->second.begin(); + l != m->second.end(); ++l) { + tlist.push_back(boost::make_tuple(l->first, l->second, 0)); + } + to_read[m->first] = tlist; + } + } else { + decode(to_read, bl); + } + 
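When the peer lacks CEPH_FEATURE_OSD_FADVISE_FLAGS, ECSubRead::encode above downgrades each (offset, length, flags) extent to a plain (offset, length) pair and emits the older struct version; the decode path re-adds a zero flags value for version-1 encodings. Below is a standalone sketch of that conversion only, with std::tuple and flat vectors standing in for boost::tuple and the per-object maps.

#include <cstdint>
#include <tuple>
#include <utility>
#include <vector>
#include <cassert>

using Extent       = std::tuple<uint64_t, uint64_t, uint32_t>; // off, len, flags
using LegacyExtent = std::pair<uint64_t, uint64_t>;            // off, len

std::vector<LegacyExtent> downgrade(const std::vector<Extent> &in) {
  std::vector<LegacyExtent> out;
  out.reserve(in.size());
  for (const auto &e : in)
    out.emplace_back(std::get<0>(e), std::get<1>(e));  // flags are dropped
  return out;
}

std::vector<Extent> upgrade(const std::vector<LegacyExtent> &in) {
  std::vector<Extent> out;
  out.reserve(in.size());
  for (const auto &e : in)
    out.emplace_back(e.first, e.second, 0u);            // flags default to 0
  return out;
}

int main() {
  std::vector<Extent> extents;
  extents.emplace_back(100, 200, 4);
  extents.emplace_back(400, 600, 0);

  auto legacy   = downgrade(extents);      // what an old peer is sent
  auto restored = upgrade(legacy);         // what a new OSD reconstructs
  assert(std::get<2>(restored[0]) == 0);   // the original flag is not recoverable
}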
decode(attrs_to_read, bl); + if (struct_v > 2 && struct_v > struct_compat) { + decode(subchunks, bl); + } else { + for (auto &i : to_read) { + subchunks[i.first].push_back(make_pair(0, 1)); + } + } + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubRead &rhs) +{ + return lhs + << "ECSubRead(tid=" << rhs.tid + << ", to_read=" << rhs.to_read + << ", subchunks=" << rhs.subchunks + << ", attrs_to_read=" << rhs.attrs_to_read << ")"; +} + +void ECSubRead::dump(Formatter *f) const +{ + f->dump_stream("from") << from; + f->dump_unsigned("tid", tid); + f->open_array_section("objects"); + for (map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> >>::const_iterator i = + to_read.begin(); + i != to_read.end(); + ++i) { + f->open_object_section("object"); + f->dump_stream("oid") << i->first; + f->open_array_section("extents"); + for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator j = + i->second.begin(); + j != i->second.end(); + ++j) { + f->open_object_section("extent"); + f->dump_unsigned("off", j->get<0>()); + f->dump_unsigned("len", j->get<1>()); + f->dump_unsigned("flags", j->get<2>()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("object_attrs_requested"); + for (set<hobject_t>::const_iterator i = attrs_to_read.begin(); + i != attrs_to_read.end(); + ++i) { + f->open_object_section("object"); + f->dump_stream("oid") << *i; + f->close_section(); + } + f->close_section(); +} + +void ECSubRead::generate_test_instances(list<ECSubRead*>& o) +{ + hobject_t hoid1(sobject_t("asdf", 1)); + hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP)); + o.push_back(new ECSubRead()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 1; + o.back()->to_read[hoid1].push_back(boost::make_tuple(100, 200, 0)); + o.back()->to_read[hoid1].push_back(boost::make_tuple(400, 600, 0)); + o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0)); + o.back()->attrs_to_read.insert(hoid1); + o.push_back(new ECSubRead()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 300; + o.back()->to_read[hoid1].push_back(boost::make_tuple(300, 200, 0)); + o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0)); + o.back()->to_read[hoid2].push_back(boost::make_tuple(2000, 600, 0)); + o.back()->attrs_to_read.insert(hoid2); +} + +void ECSubReadReply::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(from, bl); + encode(tid, bl); + encode(buffers_read, bl); + encode(attrs_read, bl); + encode(errors, bl); + ENCODE_FINISH(bl); +} + +void ECSubReadReply::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(from, bl); + decode(tid, bl); + decode(buffers_read, bl); + decode(attrs_read, bl); + decode(errors, bl); + DECODE_FINISH(bl); +} + +std::ostream &operator<<( + std::ostream &lhs, const ECSubReadReply &rhs) +{ + return lhs + << "ECSubReadReply(tid=" << rhs.tid + << ", attrs_read=" << rhs.attrs_read.size() + << ")"; +} + +void ECSubReadReply::dump(Formatter *f) const +{ + f->dump_stream("from") << from; + f->dump_unsigned("tid", tid); + f->open_array_section("buffers_read"); + for (map<hobject_t, list<pair<uint64_t, bufferlist> >>::const_iterator i = + buffers_read.begin(); + i != buffers_read.end(); + ++i) { + f->open_object_section("object"); + f->dump_stream("oid") << i->first; + f->open_array_section("data"); + for (list<pair<uint64_t, bufferlist> >::const_iterator j = + i->second.begin(); + j != i->second.end(); + 
++j) { + f->open_object_section("extent"); + f->dump_unsigned("off", j->first); + f->dump_unsigned("buf_len", j->second.length()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("attrs_returned"); + for (map<hobject_t, map<string, bufferlist>>::const_iterator i = + attrs_read.begin(); + i != attrs_read.end(); + ++i) { + f->open_object_section("object_attrs"); + f->dump_stream("oid") << i->first; + f->open_array_section("attrs"); + for (map<string, bufferlist>::const_iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + f->open_object_section("attr"); + f->dump_string("attr", j->first); + f->dump_unsigned("val_len", j->second.length()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("errors"); + for (map<hobject_t, int>::const_iterator i = errors.begin(); + i != errors.end(); + ++i) { + f->open_object_section("error_pair"); + f->dump_stream("oid") << i->first; + f->dump_int("error", i->second); + f->close_section(); + } + f->close_section(); +} + +void ECSubReadReply::generate_test_instances(list<ECSubReadReply*>& o) +{ + hobject_t hoid1(sobject_t("asdf", 1)); + hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP)); + bufferlist bl; + bl.append_zero(100); + bufferlist bl2; + bl2.append_zero(200); + o.push_back(new ECSubReadReply()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 1; + o.back()->buffers_read[hoid1].push_back(make_pair(20, bl)); + o.back()->buffers_read[hoid1].push_back(make_pair(2000, bl2)); + o.back()->buffers_read[hoid2].push_back(make_pair(0, bl)); + o.back()->attrs_read[hoid1]["foo"] = bl; + o.back()->attrs_read[hoid1]["_"] = bl2; + o.push_back(new ECSubReadReply()); + o.back()->from = pg_shard_t(2, shard_id_t(-1)); + o.back()->tid = 300; + o.back()->buffers_read[hoid2].push_back(make_pair(0, bl2)); + o.back()->attrs_read[hoid2]["foo"] = bl; + o.back()->attrs_read[hoid2]["_"] = bl2; + o.back()->errors[hoid1] = -2; +} diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h new file mode 100644 index 00000000..40eda369 --- /dev/null +++ b/src/osd/ECMsgTypes.h @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef ECBMSGTYPES_H +#define ECBMSGTYPES_H + +#include "osd_types.h" +#include "include/buffer.h" +#include "os/ObjectStore.h" +#include "boost/tuple/tuple.hpp" + +struct ECSubWrite { + pg_shard_t from; + ceph_tid_t tid; + osd_reqid_t reqid; + hobject_t soid; + pg_stat_t stats; + ObjectStore::Transaction t; + eversion_t at_version; + eversion_t trim_to; + eversion_t roll_forward_to; + vector<pg_log_entry_t> log_entries; + set<hobject_t> temp_added; + set<hobject_t> temp_removed; + boost::optional<pg_hit_set_history_t> updated_hit_set_history; + bool backfill_or_async_recovery = false; + ECSubWrite() : tid(0) {} + ECSubWrite( + pg_shard_t from, + ceph_tid_t tid, + osd_reqid_t reqid, + hobject_t soid, + const pg_stat_t &stats, + const ObjectStore::Transaction &t, + eversion_t at_version, + eversion_t trim_to, + eversion_t roll_forward_to, + vector<pg_log_entry_t> log_entries, + boost::optional<pg_hit_set_history_t> updated_hit_set_history, + const set<hobject_t> &temp_added, + const set<hobject_t> &temp_removed, + bool backfill_or_async_recovery) + : from(from), tid(tid), reqid(reqid), + soid(soid), stats(stats), t(t), + at_version(at_version), + trim_to(trim_to), roll_forward_to(roll_forward_to), + log_entries(log_entries), + temp_added(temp_added), + temp_removed(temp_removed), + updated_hit_set_history(updated_hit_set_history), + backfill_or_async_recovery(backfill_or_async_recovery) + {} + void claim(ECSubWrite &other) { + from = other.from; + tid = other.tid; + reqid = other.reqid; + soid = other.soid; + stats = other.stats; + t.swap(other.t); + at_version = other.at_version; + trim_to = other.trim_to; + roll_forward_to = other.roll_forward_to; + log_entries.swap(other.log_entries); + temp_added.swap(other.temp_added); + temp_removed.swap(other.temp_removed); + updated_hit_set_history = other.updated_hit_set_history; + backfill_or_async_recovery = other.backfill_or_async_recovery; + } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ECSubWrite*>& o); +private: + // no outside copying -- slow + ECSubWrite(ECSubWrite& other); + const ECSubWrite& operator=(const ECSubWrite& other); +}; +WRITE_CLASS_ENCODER(ECSubWrite) + +struct ECSubWriteReply { + pg_shard_t from; + ceph_tid_t tid; + eversion_t last_complete; + bool committed; + bool applied; + ECSubWriteReply() : tid(0), committed(false), applied(false) {} + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ECSubWriteReply*>& o); +}; +WRITE_CLASS_ENCODER(ECSubWriteReply) + +struct ECSubRead { + pg_shard_t from; + ceph_tid_t tid; + map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> >> to_read; + set<hobject_t> attrs_to_read; + map<hobject_t, vector<pair<int, int>>> subchunks; + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ECSubRead*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(ECSubRead) + +struct ECSubReadReply { + pg_shard_t from; + ceph_tid_t tid; + map<hobject_t, list<pair<uint64_t, bufferlist> >> buffers_read; + map<hobject_t, map<string, bufferlist>> attrs_read; + map<hobject_t, int> errors; + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ECSubReadReply*>& 
o); +}; +WRITE_CLASS_ENCODER(ECSubReadReply) + +std::ostream &operator<<( + std::ostream &lhs, const ECSubWrite &rhs); +std::ostream &operator<<( + std::ostream &lhs, const ECSubWriteReply &rhs); +std::ostream &operator<<( + std::ostream &lhs, const ECSubRead &rhs); +std::ostream &operator<<( + std::ostream &lhs, const ECSubReadReply &rhs); + +#endif diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc new file mode 100644 index 00000000..ee791d63 --- /dev/null +++ b/src/osd/ECTransaction.cc @@ -0,0 +1,652 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <iostream> +#include <vector> +#include <sstream> + +#include "ECTransaction.h" +#include "ECUtil.h" +#include "os/ObjectStore.h" +#include "common/inline_variant.h" + + +void encode_and_write( + pg_t pgid, + const hobject_t &oid, + const ECUtil::stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ecimpl, + const set<int> &want, + uint64_t offset, + bufferlist bl, + uint32_t flags, + ECUtil::HashInfoRef hinfo, + extent_map &written, + map<shard_id_t, ObjectStore::Transaction> *transactions, + DoutPrefixProvider *dpp) { + const uint64_t before_size = hinfo->get_total_logical_size(sinfo); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(offset)); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(bl.length())); + ceph_assert(bl.length()); + + map<int, bufferlist> buffers; + int r = ECUtil::encode( + sinfo, ecimpl, bl, want, &buffers); + ceph_assert(r == 0); + + written.insert(offset, bl.length(), bl); + + ldpp_dout(dpp, 20) << __func__ << ": " << oid + << " new_size " + << offset + bl.length() + << dendl; + + if (offset >= before_size) { + ceph_assert(offset == before_size); + hinfo->append( + sinfo.aligned_logical_offset_to_chunk_offset(offset), + buffers); + } + + for (auto &&i : *transactions) { + ceph_assert(buffers.count(i.first)); + bufferlist &enc_bl = buffers[i.first]; + if (offset >= before_size) { + i.second.set_alloc_hint( + coll_t(spg_t(pgid, i.first)), + ghobject_t(oid, ghobject_t::NO_GEN, i.first), + 0, 0, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY); + } + i.second.write( + coll_t(spg_t(pgid, i.first)), + ghobject_t(oid, ghobject_t::NO_GEN, i.first), + sinfo.logical_to_prev_chunk_offset( + offset), + enc_bl.length(), + enc_bl, + flags); + } +} + +bool ECTransaction::requires_overwrite( + uint64_t prev_size, + const PGTransaction::ObjectOperation &op) { + // special handling for truncates to 0 + if (op.truncate && op.truncate->first == 0) + return false; + return op.is_none() && + ((!op.buffer_updates.empty() && + (op.buffer_updates.begin().get_off() < prev_size)) || + (op.truncate && + (op.truncate->first < prev_size))); +} + +void ECTransaction::generate_transactions( + WritePlan &plan, + ErasureCodeInterfaceRef &ecimpl, + pg_t pgid, + const ECUtil::stripe_info_t &sinfo, + const map<hobject_t,extent_map> &partial_extents, + vector<pg_log_entry_t> &entries, + map<hobject_t,extent_map> *written_map, + map<shard_id_t, ObjectStore::Transaction> *transactions, + set<hobject_t> *temp_added, + set<hobject_t> *temp_removed, + DoutPrefixProvider *dpp) +{ + ceph_assert(written_map); + 
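encode_and_write() above takes a stripe-aligned logical extent, erasure-codes it, and writes one chunk-sized piece per shard starting at logical_to_prev_chunk_offset(offset). The offset arithmetic is easy to check in isolation; the sketch below assumes a k=2 layout with chunk_size = stripe_width / k (as in ECUtil::stripe_info_t) and elides the actual encoding.

#include <cstdint>
#include <cassert>
#include <iostream>

int main() {
  const uint64_t k = 2;                       // data chunks per stripe (assumed)
  const uint64_t chunk_size = 4096;
  const uint64_t stripe_width = k * chunk_size;

  // a stripe-aligned logical write, e.g. one extent of to_write in
  // generate_transactions()
  const uint64_t logical_off = 2 * stripe_width;
  const uint64_t logical_len = 3 * stripe_width;
  assert(logical_off % stripe_width == 0 && logical_len % stripe_width == 0);

  // every shard (data and parity alike) receives the same chunk-space range:
  // one chunk per stripe of the logical extent
  const uint64_t shard_off = (logical_off / stripe_width) * chunk_size;
  const uint64_t shard_len = (logical_len / stripe_width) * chunk_size;

  std::cout << "logical " << logical_off << "~" << logical_len
            << " -> per-shard " << shard_off << "~" << shard_len << "\n";
  // prints: logical 16384~24576 -> per-shard 8192~12288
}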
ceph_assert(transactions); + ceph_assert(temp_added); + ceph_assert(temp_removed); + ceph_assert(plan.t); + auto &t = *(plan.t); + + auto &hash_infos = plan.hash_infos; + + map<hobject_t, pg_log_entry_t*> obj_to_log; + for (auto &&i: entries) { + obj_to_log.insert(make_pair(i.soid, &i)); + } + + t.safe_create_traverse( + [&](pair<const hobject_t, PGTransaction::ObjectOperation> &opair) { + const hobject_t &oid = opair.first; + auto &op = opair.second; + auto &obc_map = t.obc_map; + auto &written = (*written_map)[oid]; + + auto iter = obj_to_log.find(oid); + pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr; + + ObjectContextRef obc; + auto obiter = t.obc_map.find(oid); + if (obiter != t.obc_map.end()) { + obc = obiter->second; + } + if (entry) { + ceph_assert(obc); + } else { + ceph_assert(oid.is_temp()); + } + + ECUtil::HashInfoRef hinfo; + { + auto iter = hash_infos.find(oid); + ceph_assert(iter != hash_infos.end()); + hinfo = iter->second; + } + + if (oid.is_temp()) { + if (op.is_fresh_object()) { + temp_added->insert(oid); + } else if (op.is_delete()) { + temp_removed->insert(oid); + } + } + + if (entry && + entry->is_modify() && + op.updated_snaps) { + bufferlist bl(op.updated_snaps->second.size() * 8 + 8); + encode(op.updated_snaps->second, bl); + entry->snaps.swap(bl); + entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + ldpp_dout(dpp, 20) << "generate_transactions: " + << opair.first + << ", current size is " + << hinfo->get_total_logical_size(sinfo) + << " buffers are " + << op.buffer_updates + << dendl; + if (op.truncate) { + ldpp_dout(dpp, 20) << "generate_transactions: " + << " truncate is " + << *(op.truncate) + << dendl; + } + + if (entry && op.updated_snaps) { + entry->mod_desc.update_snaps(op.updated_snaps->first); + } + + map<string, boost::optional<bufferlist> > xattr_rollback; + ceph_assert(hinfo); + bufferlist old_hinfo; + encode(*hinfo, old_hinfo); + xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo; + + if (op.is_none() && op.truncate && op.truncate->first == 0) { + ceph_assert(op.truncate->first == 0); + ceph_assert(op.truncate->first == + op.truncate->second); + ceph_assert(entry); + ceph_assert(obc); + + if (op.truncate->first != op.truncate->second) { + op.truncate->first = op.truncate->second; + } else { + op.truncate = boost::none; + } + + op.delete_first = true; + op.init_type = PGTransaction::ObjectOperation::Init::Create(); + + if (obc) { + /* We need to reapply all of the cached xattrs. + * std::map insert fortunately only writes keys + * which don't already exist, so this should do + * the right thing. 
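The comment above leans on a standard-library guarantee: std::map range insert() never overwrites keys that already exist, so the pending attribute updates keep precedence over the values re-applied from the object context's attr_cache. A quick standalone check of that behaviour, with strings standing in for bufferlists:

#include <cassert>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> attr_updates{{"_", "new value"}};
  std::map<std::string, std::string> attr_cache{{"_", "old value"},
                                                {"snapset", "cached"}};

  // reapply everything from the cache; the pending update to "_" must win
  attr_updates.insert(attr_cache.begin(), attr_cache.end());

  assert(attr_updates.at("_") == "new value");     // not overwritten
  assert(attr_updates.at("snapset") == "cached");  // missing key filled in
}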
*/ + op.attr_updates.insert( + obc->attr_cache.begin(), + obc->attr_cache.end()); + } + } + + if (op.delete_first) { + /* We also want to remove the boost::none entries since + * the keys already won't exist */ + for (auto j = op.attr_updates.begin(); + j != op.attr_updates.end(); + ) { + if (j->second) { + ++j; + } else { + op.attr_updates.erase(j++); + } + } + /* Fill in all current entries for xattr rollback */ + if (obc) { + xattr_rollback.insert( + obc->attr_cache.begin(), + obc->attr_cache.end()); + obc->attr_cache.clear(); + } + if (entry) { + entry->mod_desc.rmobject(entry->version.version); + for (auto &&st: *transactions) { + st.second.collection_move_rename( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, entry->version.version, st.first)); + } + } else { + for (auto &&st: *transactions) { + st.second.remove( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + } + hinfo->clear(); + } + + if (op.is_fresh_object() && entry) { + entry->mod_desc.create(); + } + + match( + op.init_type, + [&](const PGTransaction::ObjectOperation::Init::None &) {}, + [&](const PGTransaction::ObjectOperation::Init::Create &op) { + for (auto &&st: *transactions) { + st.second.touch( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + }, + [&](const PGTransaction::ObjectOperation::Init::Clone &op) { + for (auto &&st: *transactions) { + st.second.clone( + coll_t(spg_t(pgid, st.first)), + ghobject_t(op.source, ghobject_t::NO_GEN, st.first), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + + auto siter = hash_infos.find(op.source); + ceph_assert(siter != hash_infos.end()); + hinfo->update_to(*(siter->second)); + + if (obc) { + auto cobciter = obc_map.find(op.source); + ceph_assert(cobciter != obc_map.end()); + obc->attr_cache = cobciter->second->attr_cache; + } + }, + [&](const PGTransaction::ObjectOperation::Init::Rename &op) { + ceph_assert(op.source.is_temp()); + for (auto &&st: *transactions) { + st.second.collection_move_rename( + coll_t(spg_t(pgid, st.first)), + ghobject_t(op.source, ghobject_t::NO_GEN, st.first), + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first)); + } + auto siter = hash_infos.find(op.source); + ceph_assert(siter != hash_infos.end()); + hinfo->update_to(*(siter->second)); + if (obc) { + auto cobciter = obc_map.find(op.source); + ceph_assert(cobciter == obc_map.end()); + obc->attr_cache.clear(); + } + }); + + // omap not supported (except 0, handled above) + ceph_assert(!(op.clear_omap)); + ceph_assert(!(op.omap_header)); + ceph_assert(op.omap_updates.empty()); + + if (!op.attr_updates.empty()) { + map<string, bufferlist> to_set; + for (auto &&j: op.attr_updates) { + if (j.second) { + to_set[j.first] = *(j.second); + } else { + for (auto &&st : *transactions) { + st.second.rmattr( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + j.first); + } + } + if (obc) { + auto citer = obc->attr_cache.find(j.first); + if (entry) { + if (citer != obc->attr_cache.end()) { + // won't overwrite anything we put in earlier + xattr_rollback.insert( + make_pair( + j.first, + boost::optional<bufferlist>(citer->second))); + } else { + // won't overwrite anything we put in earlier + xattr_rollback.insert( + make_pair( + j.first, + boost::none)); + } + } + if (j.second) { + obc->attr_cache[j.first] = *(j.second); + } else if (citer != obc->attr_cache.end()) { + 
obc->attr_cache.erase(citer); + } + } else { + ceph_assert(!entry); + } + } + for (auto &&st : *transactions) { + st.second.setattrs( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + to_set); + } + ceph_assert(!xattr_rollback.empty()); + } + if (entry && !xattr_rollback.empty()) { + entry->mod_desc.setattrs(xattr_rollback); + } + + if (op.alloc_hint) { + /* logical_to_next_chunk_offset() scales down both aligned and + * unaligned offsets + + * we don't bother to roll this back at this time for two reasons: + * 1) it's advisory + * 2) we don't track the old value */ + uint64_t object_size = sinfo.logical_to_next_chunk_offset( + op.alloc_hint->expected_object_size); + uint64_t write_size = sinfo.logical_to_next_chunk_offset( + op.alloc_hint->expected_write_size); + + for (auto &&st : *transactions) { + st.second.set_alloc_hint( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + object_size, + write_size, + op.alloc_hint->flags); + } + } + + extent_map to_write; + auto pextiter = partial_extents.find(oid); + if (pextiter != partial_extents.end()) { + to_write = pextiter->second; + } + + vector<pair<uint64_t, uint64_t> > rollback_extents; + const uint64_t orig_size = hinfo->get_total_logical_size(sinfo); + + uint64_t new_size = orig_size; + uint64_t append_after = new_size; + ldpp_dout(dpp, 20) << __func__ << ": new_size start " << new_size << dendl; + if (op.truncate && op.truncate->first < new_size) { + ceph_assert(!op.is_fresh_object()); + new_size = sinfo.logical_to_next_stripe_offset( + op.truncate->first); + ldpp_dout(dpp, 20) << __func__ << ": new_size truncate down " + << new_size << dendl; + if (new_size != op.truncate->first) { // 0 the unaligned part + bufferlist bl; + bl.append_zero(new_size - op.truncate->first); + to_write.insert( + op.truncate->first, + bl.length(), + bl); + append_after = sinfo.logical_to_prev_stripe_offset( + op.truncate->first); + } else { + append_after = new_size; + } + to_write.erase( + new_size, + std::numeric_limits<uint64_t>::max() - new_size); + + if (entry && !op.is_fresh_object()) { + uint64_t restore_from = sinfo.logical_to_prev_chunk_offset( + op.truncate->first); + uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset( + orig_size - + sinfo.logical_to_prev_stripe_offset(op.truncate->first)); + ceph_assert(rollback_extents.empty()); + + ldpp_dout(dpp, 20) << __func__ << ": saving extent " + << make_pair(restore_from, restore_len) + << dendl; + ldpp_dout(dpp, 20) << __func__ << ": truncating to " + << new_size + << dendl; + rollback_extents.emplace_back( + make_pair(restore_from, restore_len)); + for (auto &&st : *transactions) { + st.second.touch( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, entry->version.version, st.first)); + st.second.clone_range( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + ghobject_t(oid, entry->version.version, st.first), + restore_from, + restore_len, + restore_from); + + } + } else { + ldpp_dout(dpp, 20) << __func__ << ": not saving extents, fresh object" + << dendl; + } + for (auto &&st : *transactions) { + st.second.truncate( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + sinfo.aligned_logical_offset_to_chunk_offset(new_size)); + } + } + + uint32_t fadvise_flags = 0; + for (auto &&extent: op.buffer_updates) { + using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; + bufferlist bl; + match( + extent.get_val(), + [&](const BufferUpdate::Write &op) { 
+ bl = op.buffer; + fadvise_flags |= op.fadvise_flags; + }, + [&](const BufferUpdate::Zero &) { + bl.append_zero(extent.get_len()); + }, + [&](const BufferUpdate::CloneRange &) { + ceph_assert( + 0 == + "CloneRange is not allowed, do_op should have returned ENOTSUPP"); + }); + + uint64_t off = extent.get_off(); + uint64_t len = extent.get_len(); + uint64_t end = off + len; + ldpp_dout(dpp, 20) << __func__ << ": adding buffer_update " + << make_pair(off, len) + << dendl; + ceph_assert(len > 0); + if (off > new_size) { + ceph_assert(off > append_after); + bl.prepend_zero(off - new_size); + len += off - new_size; + ldpp_dout(dpp, 20) << __func__ << ": prepending zeroes to align " + << off << "->" << new_size + << dendl; + off = new_size; + } + if (!sinfo.logical_offset_is_stripe_aligned(end) && (end > append_after)) { + uint64_t aligned_end = sinfo.logical_to_next_stripe_offset( + end); + uint64_t tail = aligned_end - end; + bl.append_zero(tail); + ldpp_dout(dpp, 20) << __func__ << ": appending zeroes to align end " + << end << "->" << end+tail + << ", len: " << len << "->" << len+tail + << dendl; + end += tail; + len += tail; + } + + to_write.insert(off, len, bl); + if (end > new_size) + new_size = end; + } + + if (op.truncate && + op.truncate->second > new_size) { + ceph_assert(op.truncate->second > append_after); + uint64_t truncate_to = + sinfo.logical_to_next_stripe_offset( + op.truncate->second); + uint64_t zeroes = truncate_to - new_size; + bufferlist bl; + bl.append_zero(zeroes); + to_write.insert( + new_size, + zeroes, + bl); + new_size = truncate_to; + ldpp_dout(dpp, 20) << __func__ << ": truncating out to " + << truncate_to + << dendl; + } + + set<int> want; + for (unsigned i = 0; i < ecimpl->get_chunk_count(); ++i) { + want.insert(i); + } + auto to_overwrite = to_write.intersect(0, append_after); + ldpp_dout(dpp, 20) << __func__ << ": to_overwrite: " + << to_overwrite + << dendl; + for (auto &&extent: to_overwrite) { + ceph_assert(extent.get_off() + extent.get_len() <= append_after); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off())); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len())); + if (entry) { + uint64_t restore_from = sinfo.aligned_logical_offset_to_chunk_offset( + extent.get_off()); + uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset( + extent.get_len()); + ldpp_dout(dpp, 20) << __func__ << ": overwriting " + << restore_from << "~" << restore_len + << dendl; + if (rollback_extents.empty()) { + for (auto &&st : *transactions) { + st.second.touch( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, entry->version.version, st.first)); + } + } + rollback_extents.emplace_back(make_pair(restore_from, restore_len)); + for (auto &&st : *transactions) { + st.second.clone_range( + coll_t(spg_t(pgid, st.first)), + ghobject_t(oid, ghobject_t::NO_GEN, st.first), + ghobject_t(oid, entry->version.version, st.first), + restore_from, + restore_len, + restore_from); + } + } + encode_and_write( + pgid, + oid, + sinfo, + ecimpl, + want, + extent.get_off(), + extent.get_val(), + fadvise_flags, + hinfo, + written, + transactions, + dpp); + } + + auto to_append = to_write.intersect( + append_after, + std::numeric_limits<uint64_t>::max() - append_after); + ldpp_dout(dpp, 20) << __func__ << ": to_append: " + << to_append + << dendl; + for (auto &&extent: to_append) { + ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off())); + ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len())); + ldpp_dout(dpp, 20) << 
__func__ << ": appending " + << extent.get_off() << "~" << extent.get_len() + << dendl; + encode_and_write( + pgid, + oid, + sinfo, + ecimpl, + want, + extent.get_off(), + extent.get_val(), + fadvise_flags, + hinfo, + written, + transactions, + dpp); + } + + ldpp_dout(dpp, 20) << __func__ << ": " << oid + << " resetting hinfo to logical size " + << new_size + << dendl; + if (!rollback_extents.empty() && entry) { + if (entry) { + ldpp_dout(dpp, 20) << __func__ << ": " << oid + << " marking rollback extents " + << rollback_extents + << dendl; + entry->mod_desc.rollback_extents( + entry->version.version, rollback_extents); + } + hinfo->set_total_chunk_size_clear_hash( + sinfo.aligned_logical_offset_to_chunk_offset(new_size)); + } else { + ceph_assert(hinfo->get_total_logical_size(sinfo) == new_size); + } + + if (entry && !to_append.empty()) { + ldpp_dout(dpp, 20) << __func__ << ": marking append " + << append_after + << dendl; + entry->mod_desc.append(append_after); + } + + if (!op.is_delete()) { + bufferlist hbuf; + encode(*hinfo, hbuf); + for (auto &&i : *transactions) { + i.second.setattr( + coll_t(spg_t(pgid, i.first)), + ghobject_t(oid, ghobject_t::NO_GEN, i.first), + ECUtil::get_hinfo_key(), + hbuf); + } + } + }); +} diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h new file mode 100644 index 00000000..ae0faf5d --- /dev/null +++ b/src/osd/ECTransaction.h @@ -0,0 +1,199 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef ECTRANSACTION_H +#define ECTRANSACTION_H + +#include "OSD.h" +#include "PGBackend.h" +#include "ECUtil.h" +#include "erasure-code/ErasureCodeInterface.h" +#include "PGTransaction.h" +#include "ExtentCache.h" + +namespace ECTransaction { + struct WritePlan { + PGTransactionUPtr t; + bool invalidates_cache = false; // Yes, both are possible + map<hobject_t,extent_set> to_read; + map<hobject_t,extent_set> will_write; // superset of to_read + + map<hobject_t,ECUtil::HashInfoRef> hash_infos; + }; + + bool requires_overwrite( + uint64_t prev_size, + const PGTransaction::ObjectOperation &op); + + template <typename F> + WritePlan get_write_plan( + const ECUtil::stripe_info_t &sinfo, + PGTransactionUPtr &&t, + F &&get_hinfo, + DoutPrefixProvider *dpp) { + WritePlan plan; + t->safe_create_traverse( + [&](pair<const hobject_t, PGTransaction::ObjectOperation> &i) { + ECUtil::HashInfoRef hinfo = get_hinfo(i.first); + plan.hash_infos[i.first] = hinfo; + + uint64_t projected_size = + hinfo->get_projected_total_logical_size(sinfo); + + if (i.second.deletes_first()) { + ldpp_dout(dpp, 20) << __func__ << ": delete, setting projected size" + << " to 0" << dendl; + projected_size = 0; + } + + hobject_t source; + if (i.second.has_source(&source)) { + plan.invalidates_cache = true; + + ECUtil::HashInfoRef shinfo = get_hinfo(source); + projected_size = shinfo->get_projected_total_logical_size(sinfo); + plan.hash_infos[source] = shinfo; + } + + auto &will_write = plan.will_write[i.first]; + if (i.second.truncate && + i.second.truncate->first < projected_size) { + if (!(sinfo.logical_offset_is_stripe_aligned( + i.second.truncate->first))) { + plan.to_read[i.first].union_insert( + sinfo.logical_to_prev_stripe_offset(i.second.truncate->first), + sinfo.get_stripe_width()); + + ldpp_dout(dpp, 20) << __func__ << ": unaligned truncate" << dendl; + + will_write.union_insert( + sinfo.logical_to_prev_stripe_offset(i.second.truncate->first), + sinfo.get_stripe_width()); + } + projected_size = sinfo.logical_to_next_stripe_offset( + i.second.truncate->first); + } + + extent_set raw_write_set; + for (auto &&extent: i.second.buffer_updates) { + using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; + if (boost::get<BufferUpdate::CloneRange>(&(extent.get_val()))) { + ceph_assert( + 0 == + "CloneRange is not allowed, do_op should have returned ENOTSUPP"); + } + raw_write_set.insert(extent.get_off(), extent.get_len()); + } + + auto orig_size = projected_size; + for (auto extent = raw_write_set.begin(); + extent != raw_write_set.end(); + ++extent) { + uint64_t head_start = + sinfo.logical_to_prev_stripe_offset(extent.get_start()); + uint64_t head_finish = + sinfo.logical_to_next_stripe_offset(extent.get_start()); + if (head_start > projected_size) { + head_start = projected_size; + } + if (head_start != head_finish && + head_start < orig_size) { + ceph_assert(head_finish <= orig_size); + ceph_assert(head_finish - head_start == sinfo.get_stripe_width()); + ldpp_dout(dpp, 20) << __func__ << ": reading partial head stripe " + << head_start << "~" << sinfo.get_stripe_width() + << dendl; + plan.to_read[i.first].union_insert( + head_start, sinfo.get_stripe_width()); + } + + uint64_t tail_start = + sinfo.logical_to_prev_stripe_offset( + extent.get_start() + extent.get_len()); + uint64_t tail_finish = + sinfo.logical_to_next_stripe_offset( + extent.get_start() + extent.get_len()); + if (tail_start != tail_finish && + (head_start == head_finish || tail_start != head_start) && + tail_start < orig_size) { + 
ceph_assert(tail_finish <= orig_size); + ceph_assert(tail_finish - tail_start == sinfo.get_stripe_width()); + ldpp_dout(dpp, 20) << __func__ << ": reading partial tail stripe " + << tail_start << "~" << sinfo.get_stripe_width() + << dendl; + plan.to_read[i.first].union_insert( + tail_start, sinfo.get_stripe_width()); + } + + if (head_start != tail_finish) { + ceph_assert( + sinfo.logical_offset_is_stripe_aligned( + tail_finish - head_start) + ); + will_write.union_insert( + head_start, tail_finish - head_start); + if (tail_finish > projected_size) + projected_size = tail_finish; + } else { + ceph_assert(tail_finish <= projected_size); + } + } + + if (i.second.truncate && + i.second.truncate->second > projected_size) { + uint64_t truncating_to = + sinfo.logical_to_next_stripe_offset(i.second.truncate->second); + ldpp_dout(dpp, 20) << __func__ << ": truncating out to " + << truncating_to + << dendl; + will_write.union_insert(projected_size, + truncating_to - projected_size); + projected_size = truncating_to; + } + + ldpp_dout(dpp, 20) << __func__ << ": " << i.first + << " projected size " + << projected_size + << dendl; + hinfo->set_projected_total_logical_size( + sinfo, + projected_size); + + /* validate post conditions: + * to_read should have an entry for i.first iff it isn't empty + * and if we are reading from i.first, we can't be renaming or + * cloning it */ + ceph_assert(plan.to_read.count(i.first) == 0 || + (!plan.to_read.at(i.first).empty() && + !i.second.has_source())); + }); + plan.t = std::move(t); + return plan; + } + + void generate_transactions( + WritePlan &plan, + ErasureCodeInterfaceRef &ecimpl, + pg_t pgid, + const ECUtil::stripe_info_t &sinfo, + const map<hobject_t,extent_map> &partial_extents, + vector<pg_log_entry_t> &entries, + map<hobject_t,extent_map> *written, + map<shard_id_t, ObjectStore::Transaction> *transactions, + set<hobject_t> *temp_added, + set<hobject_t> *temp_removed, + DoutPrefixProvider *dpp); +}; + +#endif diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc new file mode 100644 index 00000000..311e8526 --- /dev/null +++ b/src/osd/ECUtil.cc @@ -0,0 +1,245 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include <errno.h> +#include "include/encoding.h" +#include "ECUtil.h" + +using namespace std; + +int ECUtil::decode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + map<int, bufferlist> &to_decode, + bufferlist *out) { + ceph_assert(to_decode.size()); + + uint64_t total_data_size = to_decode.begin()->second.length(); + ceph_assert(total_data_size % sinfo.get_chunk_size() == 0); + + ceph_assert(out); + ceph_assert(out->length() == 0); + + for (map<int, bufferlist>::iterator i = to_decode.begin(); + i != to_decode.end(); + ++i) { + ceph_assert(i->second.length() == total_data_size); + } + + if (total_data_size == 0) + return 0; + + for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) { + map<int, bufferlist> chunks; + for (map<int, bufferlist>::iterator j = to_decode.begin(); + j != to_decode.end(); + ++j) { + chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size()); + } + bufferlist bl; + int r = ec_impl->decode_concat(chunks, &bl); + ceph_assert(r == 0); + ceph_assert(bl.length() == sinfo.get_stripe_width()); + out->claim_append(bl); + } + return 0; +} + +int ECUtil::decode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + map<int, bufferlist> &to_decode, + map<int, bufferlist*> &out) { + + ceph_assert(to_decode.size()); + + for (auto &&i : to_decode) { + 
if(i.second.length() == 0) + return 0; + } + + set<int> need; + for (map<int, bufferlist*>::iterator i = out.begin(); + i != out.end(); + ++i) { + ceph_assert(i->second); + ceph_assert(i->second->length() == 0); + need.insert(i->first); + } + + set<int> avail; + for (auto &&i : to_decode) { + ceph_assert(i.second.length() != 0); + avail.insert(i.first); + } + + map<int, vector<pair<int, int>>> min; + int r = ec_impl->minimum_to_decode(need, avail, &min); + ceph_assert(r == 0); + + int chunks_count = 0; + int repair_data_per_chunk = 0; + int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count(); + + for (auto &&i : to_decode) { + auto found = min.find(i.first); + if (found != min.end()) { + int repair_subchunk_count = 0; + for (auto& subchunks : min[i.first]) { + repair_subchunk_count += subchunks.second; + } + repair_data_per_chunk = repair_subchunk_count * subchunk_size; + chunks_count = (int)i.second.length() / repair_data_per_chunk; + break; + } + } + + for (int i = 0; i < chunks_count; i++) { + map<int, bufferlist> chunks; + for (auto j = to_decode.begin(); + j != to_decode.end(); + ++j) { + chunks[j->first].substr_of(j->second, + i*repair_data_per_chunk, + repair_data_per_chunk); + } + map<int, bufferlist> out_bls; + r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size()); + ceph_assert(r == 0); + for (auto j = out.begin(); j != out.end(); ++j) { + ceph_assert(out_bls.count(j->first)); + ceph_assert(out_bls[j->first].length() == sinfo.get_chunk_size()); + j->second->claim_append(out_bls[j->first]); + } + } + for (auto &&i : out) { + ceph_assert(i.second->length() == chunks_count * sinfo.get_chunk_size()); + } + return 0; +} + +int ECUtil::encode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + bufferlist &in, + const set<int> &want, + map<int, bufferlist> *out) { + + uint64_t logical_size = in.length(); + + ceph_assert(logical_size % sinfo.get_stripe_width() == 0); + ceph_assert(out); + ceph_assert(out->empty()); + + if (logical_size == 0) + return 0; + + for (uint64_t i = 0; i < logical_size; i += sinfo.get_stripe_width()) { + map<int, bufferlist> encoded; + bufferlist buf; + buf.substr_of(in, i, sinfo.get_stripe_width()); + int r = ec_impl->encode(want, buf, &encoded); + ceph_assert(r == 0); + for (map<int, bufferlist>::iterator i = encoded.begin(); + i != encoded.end(); + ++i) { + ceph_assert(i->second.length() == sinfo.get_chunk_size()); + (*out)[i->first].claim_append(i->second); + } + } + + for (map<int, bufferlist>::iterator i = out->begin(); + i != out->end(); + ++i) { + ceph_assert(i->second.length() % sinfo.get_chunk_size() == 0); + ceph_assert( + sinfo.aligned_chunk_offset_to_logical_offset(i->second.length()) == + logical_size); + } + return 0; +} + +void ECUtil::HashInfo::append(uint64_t old_size, + map<int, bufferlist> &to_append) { + ceph_assert(old_size == total_chunk_size); + uint64_t size_to_append = to_append.begin()->second.length(); + if (has_chunk_hash()) { + ceph_assert(to_append.size() == cumulative_shard_hashes.size()); + for (map<int, bufferlist>::iterator i = to_append.begin(); + i != to_append.end(); + ++i) { + ceph_assert(size_to_append == i->second.length()); + ceph_assert((unsigned)i->first < cumulative_shard_hashes.size()); + uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]); + cumulative_shard_hashes[i->first] = new_hash; + } + } + total_chunk_size += size_to_append; +} + +void ECUtil::HashInfo::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(total_chunk_size, bl); 
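HashInfo::append() above keeps one running checksum per shard by seeding each crc32c call with the previous cumulative value. Plain CRC-32C has exactly the chaining property this needs: updating incrementally over appended chunks gives the same result as hashing the shard's whole contents at once. The bitwise implementation below is only an illustration (Ceph uses its own ceph_crc32c with a -1 initial seed, so the concrete values differ), but the chaining behaviour is the same.

#include <cstdint>
#include <cassert>
#include <string>
#include <iostream>

// Bitwise CRC-32C (Castagnoli, reflected polynomial 0x82F63B78). Passing the
// previous result as the seed continues the same running checksum.
uint32_t crc32c(const std::string &data, uint32_t seed = 0) {
  uint32_t crc = seed ^ 0xFFFFFFFFu;
  for (unsigned char c : data) {
    crc ^= c;
    for (int i = 0; i < 8; ++i)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78u : 0);
  }
  return crc ^ 0xFFFFFFFFu;
}

int main() {
  std::string chunk1(4096, 'a'), chunk2(4096, 'b');

  // cumulative per-shard hash, updated once per append, as in HashInfo::append
  uint32_t cumulative = 0;
  cumulative = crc32c(chunk1, cumulative);
  cumulative = crc32c(chunk2, cumulative);

  // equals hashing the shard's full contents in one pass
  assert(cumulative == crc32c(chunk1 + chunk2));
  std::cout << std::hex << cumulative << "\n";
}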
+ encode(cumulative_shard_hashes, bl); + ENCODE_FINISH(bl); +} + +void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(total_chunk_size, bl); + decode(cumulative_shard_hashes, bl); + projected_total_chunk_size = total_chunk_size; + DECODE_FINISH(bl); +} + +void ECUtil::HashInfo::dump(Formatter *f) const +{ + f->dump_unsigned("total_chunk_size", total_chunk_size); + f->open_array_section("cumulative_shard_hashes"); + for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) { + f->open_object_section("hash"); + f->dump_unsigned("shard", i); + f->dump_unsigned("hash", cumulative_shard_hashes[i]); + f->close_section(); + } + f->close_section(); +} + +namespace ECUtil { +std::ostream& operator<<(std::ostream& out, const HashInfo& hi) +{ + ostringstream hashes; + for (auto hash: hi.cumulative_shard_hashes) + hashes << " " << hex << hash; + return out << "tcs=" << hi.total_chunk_size << hashes.str(); +} +} + +void ECUtil::HashInfo::generate_test_instances(list<HashInfo*>& o) +{ + o.push_back(new HashInfo(3)); + { + bufferlist bl; + bl.append_zero(20); + map<int, bufferlist> buffers; + buffers[0] = bl; + buffers[1] = bl; + buffers[2] = bl; + o.back()->append(0, buffers); + o.back()->append(20, buffers); + } + o.push_back(new HashInfo(4)); +} + +const string HINFO_KEY = "hinfo_key"; + +bool ECUtil::is_hinfo_key_string(const string &key) +{ + return key == HINFO_KEY; +} + +const string &ECUtil::get_hinfo_key() +{ + return HINFO_KEY; +} diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h new file mode 100644 index 00000000..8e980e27 --- /dev/null +++ b/src/osd/ECUtil.h @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef ECUTIL_H +#define ECUTIL_H + +#include <ostream> +#include "erasure-code/ErasureCodeInterface.h" +#include "include/buffer_fwd.h" +#include "include/ceph_assert.h" +#include "include/encoding.h" +#include "common/Formatter.h" + +namespace ECUtil { + +class stripe_info_t { + const uint64_t stripe_width; + const uint64_t chunk_size; +public: + stripe_info_t(uint64_t stripe_size, uint64_t stripe_width) + : stripe_width(stripe_width), + chunk_size(stripe_width / stripe_size) { + ceph_assert(stripe_width % stripe_size == 0); + } + bool logical_offset_is_stripe_aligned(uint64_t logical) const { + return (logical % stripe_width) == 0; + } + uint64_t get_stripe_width() const { + return stripe_width; + } + uint64_t get_chunk_size() const { + return chunk_size; + } + uint64_t logical_to_prev_chunk_offset(uint64_t offset) const { + return (offset / stripe_width) * chunk_size; + } + uint64_t logical_to_next_chunk_offset(uint64_t offset) const { + return ((offset + stripe_width - 1)/ stripe_width) * chunk_size; + } + uint64_t logical_to_prev_stripe_offset(uint64_t offset) const { + return offset - (offset % stripe_width); + } + uint64_t logical_to_next_stripe_offset(uint64_t offset) const { + return ((offset % stripe_width) ? 
+ (offset - (offset % stripe_width) + stripe_width) : + offset); + } + uint64_t aligned_logical_offset_to_chunk_offset(uint64_t offset) const { + ceph_assert(offset % stripe_width == 0); + return (offset / stripe_width) * chunk_size; + } + uint64_t aligned_chunk_offset_to_logical_offset(uint64_t offset) const { + ceph_assert(offset % chunk_size == 0); + return (offset / chunk_size) * stripe_width; + } + std::pair<uint64_t, uint64_t> aligned_offset_len_to_chunk( + std::pair<uint64_t, uint64_t> in) const { + return std::make_pair( + aligned_logical_offset_to_chunk_offset(in.first), + aligned_logical_offset_to_chunk_offset(in.second)); + } + std::pair<uint64_t, uint64_t> offset_len_to_stripe_bounds( + std::pair<uint64_t, uint64_t> in) const { + uint64_t off = logical_to_prev_stripe_offset(in.first); + uint64_t len = logical_to_next_stripe_offset( + (in.first - off) + in.second); + return std::make_pair(off, len); + } +}; + +int decode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + std::map<int, bufferlist> &to_decode, + bufferlist *out); + +int decode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + std::map<int, bufferlist> &to_decode, + std::map<int, bufferlist*> &out); + +int encode( + const stripe_info_t &sinfo, + ErasureCodeInterfaceRef &ec_impl, + bufferlist &in, + const std::set<int> &want, + std::map<int, bufferlist> *out); + +class HashInfo { + uint64_t total_chunk_size = 0; + std::vector<uint32_t> cumulative_shard_hashes; + + // purely ephemeral, represents the size once all in-flight ops commit + uint64_t projected_total_chunk_size = 0; +public: + HashInfo() {} + explicit HashInfo(unsigned num_chunks) : + cumulative_shard_hashes(num_chunks, -1) {} + void append(uint64_t old_size, std::map<int, bufferlist> &to_append); + void clear() { + total_chunk_size = 0; + cumulative_shard_hashes = std::vector<uint32_t>( + cumulative_shard_hashes.size(), + -1); + } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<HashInfo*>& o); + uint32_t get_chunk_hash(int shard) const { + ceph_assert((unsigned)shard < cumulative_shard_hashes.size()); + return cumulative_shard_hashes[shard]; + } + uint64_t get_total_chunk_size() const { + return total_chunk_size; + } + uint64_t get_projected_total_chunk_size() const { + return projected_total_chunk_size; + } + uint64_t get_total_logical_size(const stripe_info_t &sinfo) const { + return get_total_chunk_size() * + (sinfo.get_stripe_width()/sinfo.get_chunk_size()); + } + uint64_t get_projected_total_logical_size(const stripe_info_t &sinfo) const { + return get_projected_total_chunk_size() * + (sinfo.get_stripe_width()/sinfo.get_chunk_size()); + } + void set_projected_total_logical_size( + const stripe_info_t &sinfo, + uint64_t logical_size) { + ceph_assert(sinfo.logical_offset_is_stripe_aligned(logical_size)); + projected_total_chunk_size = sinfo.aligned_logical_offset_to_chunk_offset( + logical_size); + } + void set_total_chunk_size_clear_hash(uint64_t new_chunk_size) { + cumulative_shard_hashes.clear(); + total_chunk_size = new_chunk_size; + } + bool has_chunk_hash() const { + return !cumulative_shard_hashes.empty(); + } + void update_to(const HashInfo &rhs) { + auto ptcs = projected_total_chunk_size; + *this = rhs; + projected_total_chunk_size = ptcs; + } + friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi); +}; + +typedef std::shared_ptr<HashInfo> HashInfoRef; + +bool 
is_hinfo_key_string(const std::string &key); +const std::string &get_hinfo_key(); + +WRITE_CLASS_ENCODER(ECUtil::HashInfo) +} +#endif diff --git a/src/osd/ExtentCache.cc b/src/osd/ExtentCache.cc new file mode 100644 index 00000000..a09fc86e --- /dev/null +++ b/src/osd/ExtentCache.cc @@ -0,0 +1,241 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "ExtentCache.h" + +void ExtentCache::extent::_link_pin_state(pin_state &pin_state) +{ + ceph_assert(parent_extent_set); + ceph_assert(!parent_pin_state); + parent_pin_state = &pin_state; + pin_state.pin_list.push_back(*this); +} + +void ExtentCache::extent::_unlink_pin_state() +{ + ceph_assert(parent_extent_set); + ceph_assert(parent_pin_state); + auto liter = pin_state::list::s_iterator_to(*this); + parent_pin_state->pin_list.erase(liter); + parent_pin_state = nullptr; +} + +void ExtentCache::extent::unlink() +{ + ceph_assert(parent_extent_set); + ceph_assert(parent_pin_state); + + _unlink_pin_state(); + + // remove from extent set + { + auto siter = object_extent_set::set::s_iterator_to(*this); + auto &set = object_extent_set::set::container_from_iterator(siter); + ceph_assert(&set == &(parent_extent_set->extent_set)); + set.erase(siter); + } + + parent_extent_set = nullptr; + ceph_assert(!parent_pin_state); +} + +void ExtentCache::extent::link( + object_extent_set &extent_set, + pin_state &pin_state) +{ + ceph_assert(!parent_extent_set); + parent_extent_set = &extent_set; + extent_set.extent_set.insert(*this); + + _link_pin_state(pin_state); +} + +void ExtentCache::extent::move( + pin_state &to) +{ + _unlink_pin_state(); + _link_pin_state(to); +} + +void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset) +{ + if (eset.extent_set.empty()) { + auto siter = cache_set::s_iterator_to(eset); + auto &set = cache_set::container_from_iterator(siter); + ceph_assert(&set == &per_object_caches); + + // per_object_caches owns eset + per_object_caches.erase(eset); + delete &eset; + } +} + +ExtentCache::object_extent_set &ExtentCache::get_or_create( + const hobject_t &oid) +{ + cache_set::insert_commit_data data; + auto p = per_object_caches.insert_check(oid, Cmp(), data); + if (p.second) { + auto *eset = new object_extent_set(oid); + per_object_caches.insert_commit(*eset, data); + return *eset; + } else { + return *(p.first); + } +} + +ExtentCache::object_extent_set *ExtentCache::get_if_exists( + const hobject_t &oid) +{ + cache_set::insert_commit_data data; + auto p = per_object_caches.insert_check(oid, Cmp(), data); + if (p.second) { + return nullptr; + } else { + return &*(p.first); + } +} + +std::pair< + ExtentCache::object_extent_set::set::iterator, + ExtentCache::object_extent_set::set::iterator + > ExtentCache::object_extent_set::get_containing_range( + uint64_t off, uint64_t len) +{ + // fst is first iterator with end after off (may be end) + auto fst = extent_set.upper_bound(off, uint_cmp()); + if (fst != extent_set.begin()) + --fst; + if (fst != extent_set.end() && off >= (fst->offset + fst->get_length())) + ++fst; + + // lst is first iterator with start >= off + len (may be end) + auto lst = extent_set.lower_bound(off + len, uint_cmp()); + return 
std::make_pair(fst, lst); +} + +extent_set ExtentCache::reserve_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const extent_set &to_write, + const extent_set &to_read) +{ + if (to_write.empty() && to_read.empty()) { + return extent_set(); + } + extent_set must_read; + auto &eset = get_or_create(oid); + extent_set missing; + for (auto &&res: to_write) { + eset.traverse_update( + pin, + res.first, + res.second, + [&](uint64_t off, uint64_t len, + extent *ext, object_extent_set::update_action *action) { + action->action = object_extent_set::update_action::UPDATE_PIN; + if (!ext) { + missing.insert(off, len); + } + }); + } + must_read.intersection_of( + to_read, + missing); + return must_read; +} + +extent_map ExtentCache::get_remaining_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const extent_set &to_get) +{ + if (to_get.empty()) { + return extent_map(); + } + extent_map ret; + auto &eset = get_or_create(oid); + for (auto &&res: to_get) { + bufferlist bl; + uint64_t cur = res.first; + eset.traverse_update( + pin, + res.first, + res.second, + [&](uint64_t off, uint64_t len, + extent *ext, object_extent_set::update_action *action) { + ceph_assert(off == cur); + cur = off + len; + action->action = object_extent_set::update_action::NONE; + ceph_assert(ext && ext->bl && ext->pinned_by_write()); + bl.substr_of( + *(ext->bl), + off - ext->offset, + len); + ret.insert(off, len, bl); + }); + } + return ret; +} + +void ExtentCache::present_rmw_update( + const hobject_t &oid, + write_pin &pin, + const extent_map &extents) +{ + if (extents.empty()) { + return; + } + auto &eset = get_or_create(oid); + for (auto &&res: extents) { + eset.traverse_update( + pin, + res.get_off(), + res.get_len(), + [&](uint64_t off, uint64_t len, + extent *ext, object_extent_set::update_action *action) { + action->action = object_extent_set::update_action::NONE; + ceph_assert(ext && ext->pinned_by_write()); + action->bl = bufferlist(); + action->bl->substr_of( + res.get_val(), + off - res.get_off(), + len); + }); + } +} + +ostream &ExtentCache::print(ostream &out) const +{ + out << "ExtentCache(" << std::endl; + for (auto esiter = per_object_caches.begin(); + esiter != per_object_caches.end(); + ++esiter) { + out << " Extents(" << esiter->oid << ")[" << std::endl; + for (auto exiter = esiter->extent_set.begin(); + exiter != esiter->extent_set.end(); + ++exiter) { + out << " Extent(" << exiter->offset + << "~" << exiter->get_length() + << ":" << exiter->pin_tid() + << ")" << std::endl; + } + } + return out << ")" << std::endl; +} + +ostream &operator<<(ostream &lhs, const ExtentCache &cache) +{ + return cache.print(lhs); +} diff --git a/src/osd/ExtentCache.h b/src/osd/ExtentCache.h new file mode 100644 index 00000000..7f6e3e2e --- /dev/null +++ b/src/osd/ExtentCache.h @@ -0,0 +1,491 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef EXTENT_CACHE_H +#define EXTENT_CACHE_H + +#include <map> +#include <list> +#include <vector> +#include <utility> +#include <boost/optional.hpp> +#include <boost/intrusive/set.hpp> +#include <boost/intrusive/list.hpp> +#include "include/interval_set.h" +#include "common/interval_map.h" +#include "include/buffer.h" +#include "common/hobject.h" + +/** + ExtentCache + + The main purpose of this cache is to ensure that we can pipeline + overlapping partial overwrites. + + To that end we need to ensure that an extent pinned for an operation is + live until that operation completes. However, a particular extent + might be pinned by multiple operations (several pipelined writes + on the same object). + + 1) When we complete an operation, we only look at extents owned only + by that operation. + 2) Per-extent overhead is fixed size. + 2) Per-operation metadata is fixed size. + + This is simple enough to realize with two main structures: + - extent: contains a pointer to the pin owning it and intrusive list + pointers to other extents owned by the same pin + - pin_state: contains the list head for extents owned by it + + This works as long as we only need to remember one "owner" for + each extent. To make this work, we'll need to leverage some + invariants guaranteed by higher layers: + + 1) Writes on a particular object must be ordered + 2) A particular object will have outstanding reads or writes, but not + both (note that you can have a read while a write is committed, but + not applied). + + Our strategy therefore will be to have whichever in-progress op will + finish "last" be the owner of a particular extent. For now, we won't + cache reads, so 2) simply means that we can assume that reads and + recovery operations imply no unstable extents on the object in + question. + + Write: WaitRead -> WaitCommit -> Complete + + Invariant 1) above actually indicates that we can't have writes + bypassing the WaitRead state while there are writes waiting on + Reads. Thus, the set of operations pinning a particular extent + must always complete in order or arrival. + + This suggests that a particular extent may be in only the following + states: + + + 0) Empty (not in the map at all) + 1) Write Pending N + - Some write with reqid <= N is currently fetching the data for + this extent + - The extent must persist until Write reqid N completes + - All ops pinning this extent are writes in the WaitRead state of + the Write pipeline (there must be an in progress write, so no + reads can be in progress). + 2) Write Pinned N: + - This extent has data corresponding to some reqid M <= N + - The extent must persist until Write reqid N commits + - All ops pinning this extent are writes in some Write + state (all are possible). Reads are not possible + in this state (or the others) due to 2). + + All of the above suggests that there are 3 things users can + ask of the cache corresponding to the 3 Write pipelines + states. 
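 (Editor's illustration, not part of the upstream file.) The three calls hinted at above are declared further down in this header; the sketch below shows one plausible call sequence for a single 8 KiB read-modify-write. The object, offsets, and the surrounding read/commit machinery are hypothetical, and interval_set::subtract / bufferlist::append_zero are assumed to behave as their names suggest.

   ExtentCache cache;
   ExtentCache::write_pin pin;
   cache.open_write_pin(pin);                      // WaitRead begins

   hobject_t oid;                                  // placeholder object
   extent_set to_write, to_read;
   to_write.insert(0, 8192);                       // range being rewritten
   to_read.insert(0, 8192);                        // data needed to rebuild it

   // Pin the write range and learn which reads are not already cached/pending.
   extent_set must_read =
     cache.reserve_extents_for_rmw(oid, pin, to_write, to_read);

   // ... issue backend reads for must_read; earlier writes present their
   // data first, so by now every other pinned extent carries a buffer ...

   extent_set cached_reads = to_read;
   cached_reads.subtract(must_read);               // to_read \ must_read
   extent_map cached =
     cache.get_remaining_extents_for_rmw(oid, pin, cached_reads);

   // Publish the new contents (WaitCommit); extents stay pinned until commit.
   bufferlist newdata;
   newdata.append_zero(8192);                      // stand-in for real stripe data
   extent_map written;
   written.insert(0, newdata.length(), newdata);
   cache.present_rmw_update(oid, pin, written);

   // On commit, drop the pin (Complete); unshared extents are released.
   cache.release_write_pin(pin);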
+ */ + +/// If someone wants these types, but not ExtentCache, move to another file +struct bl_split_merge { + bufferlist split( + uint64_t offset, + uint64_t length, + bufferlist &bl) const { + bufferlist out; + out.substr_of(bl, offset, length); + return out; + } + bool can_merge(const bufferlist &left, const bufferlist &right) const { + return true; + } + bufferlist merge(bufferlist &&left, bufferlist &&right) const { + bufferlist bl; + bl.claim(left); + bl.claim_append(right); + return bl; + } + uint64_t length(const bufferlist &b) const { return b.length(); } +}; +using extent_set = interval_set<uint64_t>; +using extent_map = interval_map<uint64_t, bufferlist, bl_split_merge>; + +class ExtentCache { + struct object_extent_set; + struct pin_state; +private: + + struct extent { + object_extent_set *parent_extent_set = nullptr; + pin_state *parent_pin_state = nullptr; + boost::intrusive::set_member_hook<> extent_set_member; + boost::intrusive::list_member_hook<> pin_list_member; + + uint64_t offset; + uint64_t length; + boost::optional<bufferlist> bl; + + uint64_t get_length() const { + return length; + } + + bool is_pending() const { + return bl == boost::none; + } + + bool pinned_by_write() const { + ceph_assert(parent_pin_state); + return parent_pin_state->is_write(); + } + + uint64_t pin_tid() const { + ceph_assert(parent_pin_state); + return parent_pin_state->tid; + } + + extent(uint64_t offset, bufferlist _bl) + : offset(offset), length(_bl.length()), bl(_bl) {} + + extent(uint64_t offset, uint64_t length) + : offset(offset), length(length) {} + + bool operator<(const extent &rhs) const { + return offset < rhs.offset; + } + private: + // can briefly violate the two link invariant, used in unlink() and move() + void _link_pin_state(pin_state &pin_state); + void _unlink_pin_state(); + public: + void unlink(); + void link(object_extent_set &parent_extent_set, pin_state &pin_state); + void move(pin_state &to); + }; + + struct object_extent_set : boost::intrusive::set_base_hook<> { + hobject_t oid; + explicit object_extent_set(const hobject_t &oid) : oid(oid) {} + + using set_member_options = boost::intrusive::member_hook< + extent, + boost::intrusive::set_member_hook<>, + &extent::extent_set_member>; + using set = boost::intrusive::set<extent, set_member_options>; + set extent_set; + + bool operator<(const object_extent_set &rhs) const { + return oid < rhs.oid; + } + + struct uint_cmp { + bool operator()(uint64_t lhs, const extent &rhs) const { + return lhs < rhs.offset; + } + bool operator()(const extent &lhs, uint64_t rhs) const { + return lhs.offset < rhs; + } + }; + std::pair<set::iterator, set::iterator> get_containing_range( + uint64_t offset, uint64_t length); + + void erase(uint64_t offset, uint64_t length); + + struct update_action { + enum type { + NONE, + UPDATE_PIN + }; + type action = NONE; + boost::optional<bufferlist> bl; + }; + template <typename F> + void traverse_update( + pin_state &pin, + uint64_t offset, + uint64_t length, + F &&f) { + auto range = get_containing_range(offset, length); + + if (range.first == range.second || range.first->offset > offset) { + uint64_t extlen = range.first == range.second ? + length : range.first->offset - offset; + + update_action action; + f(offset, extlen, nullptr, &action); + ceph_assert(!action.bl || action.bl->length() == extlen); + if (action.action == update_action::UPDATE_PIN) { + extent *ext = action.bl ? 
+ new extent(offset, *action.bl) : + new extent(offset, extlen); + ext->link(*this, pin); + } else { + ceph_assert(!action.bl); + } + } + + for (auto p = range.first; p != range.second;) { + extent *ext = &*p; + ++p; + + uint64_t extoff = std::max(ext->offset, offset); + uint64_t extlen = std::min( + ext->length - (extoff - ext->offset), + offset + length - extoff); + + update_action action; + f(extoff, extlen, ext, &action); + ceph_assert(!action.bl || action.bl->length() == extlen); + extent *final_extent = nullptr; + if (action.action == update_action::NONE) { + final_extent = ext; + } else { + pin_state *ps = ext->parent_pin_state; + ext->unlink(); + if ((ext->offset < offset) && + (ext->offset + ext->get_length() > offset)) { + extent *head = nullptr; + if (ext->bl) { + bufferlist bl; + bl.substr_of( + *(ext->bl), + 0, + offset - ext->offset); + head = new extent(ext->offset, bl); + } else { + head = new extent( + ext->offset, offset - ext->offset); + } + head->link(*this, *ps); + } + if ((ext->offset + ext->length > offset + length) && + (offset + length > ext->offset)) { + uint64_t nlen = + (ext->offset + ext->get_length()) - (offset + length); + extent *tail = nullptr; + if (ext->bl) { + bufferlist bl; + bl.substr_of( + *(ext->bl), + ext->get_length() - nlen, + nlen); + tail = new extent(offset + length, bl); + } else { + tail = new extent(offset + length, nlen); + } + tail->link(*this, *ps); + } + if (action.action == update_action::UPDATE_PIN) { + if (ext->bl) { + bufferlist bl; + bl.substr_of( + *(ext->bl), + extoff - ext->offset, + extlen); + final_extent = new ExtentCache::extent( + extoff, + bl); + } else { + final_extent = new ExtentCache::extent( + extoff, extlen); + } + final_extent->link(*this, pin); + } + delete ext; + } + + if (action.bl) { + ceph_assert(final_extent); + ceph_assert(final_extent->length == action.bl->length()); + final_extent->bl = *(action.bl); + } + + uint64_t next_off = p == range.second ? + offset + length : p->offset; + if (extoff + extlen < next_off) { + uint64_t tailoff = extoff + extlen; + uint64_t taillen = next_off - tailoff; + + update_action action; + f(tailoff, taillen, nullptr, &action); + ceph_assert(!action.bl || action.bl->length() == taillen); + if (action.action == update_action::UPDATE_PIN) { + extent *ext = action.bl ? 
+ new extent(tailoff, *action.bl) : + new extent(tailoff, taillen); + ext->link(*this, pin); + } else { + ceph_assert(!action.bl); + } + } + } + } + }; + struct Cmp { + bool operator()(const hobject_t &oid, const object_extent_set &rhs) const { + return oid < rhs.oid; + } + bool operator()(const object_extent_set &lhs, const hobject_t &oid) const { + return lhs.oid < oid; + } + }; + + object_extent_set &get_or_create(const hobject_t &oid); + object_extent_set *get_if_exists(const hobject_t &oid); + + void remove_and_destroy_if_empty(object_extent_set &set); + using cache_set = boost::intrusive::set<object_extent_set>; + cache_set per_object_caches; + + uint64_t next_write_tid = 1; + uint64_t next_read_tid = 1; + struct pin_state { + uint64_t tid = 0; + enum pin_type_t { + NONE, + WRITE, + }; + pin_type_t pin_type = NONE; + bool is_write() const { return pin_type == WRITE; } + + pin_state(const pin_state &other) = delete; + pin_state &operator=(const pin_state &other) = delete; + pin_state(pin_state &&other) = delete; + pin_state() = default; + + using list_member_options = boost::intrusive::member_hook< + extent, + boost::intrusive::list_member_hook<>, + &extent::pin_list_member>; + using list = boost::intrusive::list<extent, list_member_options>; + list pin_list; + ~pin_state() { + ceph_assert(pin_list.empty()); + ceph_assert(tid == 0); + ceph_assert(pin_type == NONE); + } + void _open(uint64_t in_tid, pin_type_t in_type) { + ceph_assert(pin_type == NONE); + ceph_assert(in_tid > 0); + tid = in_tid; + pin_type = in_type; + } + }; + + void release_pin(pin_state &p) { + for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) { + unique_ptr<extent> extent(&*iter); // we now own this + iter++; // unlink will invalidate + ceph_assert(extent->parent_extent_set); + auto &eset = *(extent->parent_extent_set); + extent->unlink(); + remove_and_destroy_if_empty(eset); + } + p.tid = 0; + p.pin_type = pin_state::NONE; + } + +public: + class write_pin : private pin_state { + friend class ExtentCache; + private: + void open(uint64_t in_tid) { + _open(in_tid, pin_state::WRITE); + } + public: + write_pin() : pin_state() {} + }; + + void open_write_pin(write_pin &pin) { + pin.open(next_write_tid++); + } + + /** + * Reserves extents required for rmw, and learn + * which need to be read + * + * Pins all extents in to_write. Returns subset of to_read not + * currently present in the cache. Caller must obtain those + * extents before calling get_remaining_extents_for_rmw. + * + * Transition table: + * - Empty -> Write Pending pin.reqid + * - Write Pending N -> Write Pending pin.reqid + * - Write Pinned N -> Write Pinned pin.reqid + * + * @param oid [in] object undergoing rmw + * @param pin [in,out] pin to use (obtained from create_write_pin) + * @param to_write [in] extents which will be written + * @param to_read [in] extents to read prior to write (must be subset + * of to_write) + * @return subset of to_read which isn't already present or pending + */ + extent_set reserve_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const extent_set &to_write, + const extent_set &to_read); + + /** + * Gets extents required for rmw not returned from + * reserve_extents_for_rmw + * + * Requested extents (to_get) must be the set to_read \ the set + * returned from reserve_extents_for_rmw. No transition table, + * all extents at this point must be present and already pinned + * for this pin by reserve_extents_for_rmw. 
+ * + * @param oid [in] object + * @param pin [in,out] pin associated with this IO + * @param to_get [in] extents to get (see above for restrictions) + * @return map of buffers from to_get + */ + extent_map get_remaining_extents_for_rmw( + const hobject_t &oid, + write_pin &pin, + const extent_set &to_get); + + /** + * Updates the cache to reflect the rmw write + * + * All presented extents must already have been specified in + * reserve_extents_for_rmw under to_write. + * + * Transition table: + * - Empty -> invalid, must call reserve_extents_for_rmw first + * - Write Pending N -> Write Pinned N, update buffer + * (assert N >= pin.reqid) + * - Write Pinned N -> Update buffer (assert N >= pin.reqid) + * + * @param oid [in] object + * @param pin [in,out] pin associated with this IO + * @param extents [in] map of buffers to update + * @return void + */ + void present_rmw_update( + const hobject_t &oid, + write_pin &pin, + const extent_map &extents); + + /** + * Release all buffers pinned by pin + */ + void release_write_pin( + write_pin &pin) { + release_pin(pin); + } + + ostream &print( + ostream &out) const; +}; + +ostream &operator<<(ostream &lhs, const ExtentCache &cache); + +#endif diff --git a/src/osd/HitSet.cc b/src/osd/HitSet.cc new file mode 100644 index 00000000..653c4448 --- /dev/null +++ b/src/osd/HitSet.cc @@ -0,0 +1,252 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank <info@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "HitSet.h" +#include "common/Formatter.h" + +// -- HitSet -- + +HitSet::HitSet(const HitSet::Params& params) + : sealed(false) +{ + switch (params.get_type()) { + case TYPE_BLOOM: + { + BloomHitSet::Params *p = + static_cast<BloomHitSet::Params*>(params.impl.get()); + impl.reset(new BloomHitSet(p)); + } + break; + + case TYPE_EXPLICIT_HASH: + impl.reset(new ExplicitHashHitSet(static_cast<ExplicitHashHitSet::Params*>(params.impl.get()))); + break; + + case TYPE_EXPLICIT_OBJECT: + impl.reset(new ExplicitObjectHitSet(static_cast<ExplicitObjectHitSet::Params*>(params.impl.get()))); + break; + + default: + assert (0 == "unknown HitSet type"); + } +} + +void HitSet::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(sealed, bl); + if (impl) { + encode((__u8)impl->get_type(), bl); + impl->encode(bl); + } else { + encode((__u8)TYPE_NONE, bl); + } + ENCODE_FINISH(bl); +} + +void HitSet::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(sealed, bl); + __u8 type; + decode(type, bl); + switch ((impl_type_t)type) { + case TYPE_EXPLICIT_HASH: + impl.reset(new ExplicitHashHitSet); + break; + case TYPE_EXPLICIT_OBJECT: + impl.reset(new ExplicitObjectHitSet); + break; + case TYPE_BLOOM: + impl.reset(new BloomHitSet); + break; + case TYPE_NONE: + impl.reset(NULL); + break; + default: + throw buffer::malformed_input("unrecognized HitMap type"); + } + if (impl) + impl->decode(bl); + DECODE_FINISH(bl); +} + +void HitSet::dump(Formatter *f) const +{ + f->dump_string("type", get_type_name()); + f->dump_string("sealed", sealed ? 
"yes" : "no"); + if (impl) + impl->dump(f); +} + +void HitSet::generate_test_instances(list<HitSet*>& o) +{ + o.push_back(new HitSet); + o.push_back(new HitSet(new BloomHitSet(10, .1, 1))); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + o.push_back(new HitSet(new ExplicitHashHitSet)); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + o.push_back(new HitSet(new ExplicitObjectHitSet)); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); +} + +HitSet::Params::Params(const Params& o) noexcept +{ + if (o.get_type() != TYPE_NONE) { + create_impl(o.get_type()); + // it's annoying to write virtual operator= methods; use encode/decode + // instead. + bufferlist bl; + o.impl->encode(bl); + auto p = bl.cbegin(); + impl->decode(p); + } // else we don't need to do anything +} + +const HitSet::Params& HitSet::Params::operator=(const Params& o) +{ + create_impl(o.get_type()); + if (o.impl) { + // it's annoying to write virtual operator= methods; use encode/decode + // instead. + bufferlist bl; + o.impl->encode(bl); + auto p = bl.cbegin(); + impl->decode(p); + } + return *this; +} + +void HitSet::Params::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + if (impl) { + encode((__u8)impl->get_type(), bl); + impl->encode(bl); + } else { + encode((__u8)TYPE_NONE, bl); + } + ENCODE_FINISH(bl); +} + +bool HitSet::Params::create_impl(impl_type_t type) +{ + switch ((impl_type_t)type) { + case TYPE_EXPLICIT_HASH: + impl.reset(new ExplicitHashHitSet::Params); + break; + case TYPE_EXPLICIT_OBJECT: + impl.reset(new ExplicitObjectHitSet::Params); + break; + case TYPE_BLOOM: + impl.reset(new BloomHitSet::Params); + break; + case TYPE_NONE: + impl.reset(NULL); + break; + default: + return false; + } + return true; +} + +void HitSet::Params::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + __u8 type; + decode(type, bl); + if (!create_impl((impl_type_t)type)) + throw buffer::malformed_input("unrecognized HitMap type"); + if (impl) + impl->decode(bl); + DECODE_FINISH(bl); +} + +void HitSet::Params::dump(Formatter *f) const +{ + f->dump_string("type", HitSet::get_type_name(get_type())); + if (impl) + impl->dump(f); +} + +void HitSet::Params::generate_test_instances(list<HitSet::Params*>& o) +{ +#define loop_hitset_params(kind) \ +{ \ + list<kind::Params*> params; \ + kind::Params::generate_test_instances(params); \ + for (list<kind::Params*>::iterator i = params.begin(); \ + i != params.end(); ++i) \ + o.push_back(new Params(*i)); \ +} + o.push_back(new Params); + o.push_back(new Params(new BloomHitSet::Params)); + loop_hitset_params(BloomHitSet); + o.push_back(new Params(new ExplicitHashHitSet::Params)); + loop_hitset_params(ExplicitHashHitSet); + o.push_back(new Params(new ExplicitObjectHitSet::Params)); + loop_hitset_params(ExplicitObjectHitSet); +} + +ostream& operator<<(ostream& out, const HitSet::Params& p) { + out << HitSet::get_type_name(p.get_type()); + if (p.impl) { + out << "{"; + p.impl->dump_stream(out); + } + out << "}"; + return out; +} + + +void ExplicitHashHitSet::dump(Formatter *f) const { + f->dump_unsigned("insert_count", count); + f->open_array_section("hash_set"); + for (ceph::unordered_set<uint32_t>::const_iterator p = 
hits.begin(); + p != hits.end(); + ++p) + f->dump_unsigned("hash", *p); + f->close_section(); +} + +void ExplicitObjectHitSet::dump(Formatter *f) const { + f->dump_unsigned("insert_count", count); + f->open_array_section("set"); + for (ceph::unordered_set<hobject_t>::const_iterator p = hits.begin(); + p != hits.end(); + ++p) { + f->open_object_section("object"); + p->dump(f); + f->close_section(); + } + f->close_section(); +} + +void BloomHitSet::Params::dump(Formatter *f) const { + f->dump_float("false_positive_probability", get_fpp()); + f->dump_int("target_size", target_size); + f->dump_int("seed", seed); +} + +void BloomHitSet::dump(Formatter *f) const { + f->open_object_section("bloom_filter"); + bloom.dump(f); + f->close_section(); +} diff --git a/src/osd/HitSet.h b/src/osd/HitSet.h new file mode 100644 index 00000000..7e50fd93 --- /dev/null +++ b/src/osd/HitSet.h @@ -0,0 +1,455 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank <info@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_HITSET_H +#define CEPH_OSD_HITSET_H + +#include <string_view> + +#include <boost/scoped_ptr.hpp> + +#include "include/encoding.h" +#include "include/unordered_set.h" +#include "common/bloom_filter.hpp" +#include "common/hobject.h" + +/** + * generic container for a HitSet + * + * Encapsulate a HitSetImpl of any type. Expose a generic interface + * to users and wrap the encoded object with a type so that it can be + * safely decoded later. 
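 (Editor's illustration, not part of the upstream header.) A minimal sketch of the wrap/encode/decode round trip this class provides, using only the interface declared below; the bloom parameters and object names are arbitrary.

   HitSet::Params params(new BloomHitSet::Params(0.01, 1000, 42)); // fpp, target size, seed
   HitSet hits(params);                        // impl picked from the params type

   hits.insert(hobject_t("foo", "", CEPH_NOSNAP, 123, 1, ""));
   hits.insert(hobject_t("bar", "", CEPH_NOSNAP, 456, 1, ""));
   bool seen = hits.contains(hobject_t("foo", "", CEPH_NOSNAP, 123, 1, ""));
   // 'seen' is true; a bloom-backed set may also report false positives.

   hits.seal();                                // let the impl compact itself

   bufferlist bl;
   hits.encode(bl);                            // type byte + impl payload
   HitSet decoded;
   auto p = bl.cbegin();
   decoded.decode(p);                          // impl recreated from the type byte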
+ */ + +class HitSet { +public: + typedef enum { + TYPE_NONE = 0, + TYPE_EXPLICIT_HASH = 1, + TYPE_EXPLICIT_OBJECT = 2, + TYPE_BLOOM = 3 + } impl_type_t; + + static std::string_view get_type_name(impl_type_t t) { + switch (t) { + case TYPE_NONE: return "none"; + case TYPE_EXPLICIT_HASH: return "explicit_hash"; + case TYPE_EXPLICIT_OBJECT: return "explicit_object"; + case TYPE_BLOOM: return "bloom"; + default: return "???"; + } + } + std::string_view get_type_name() const { + if (impl) + return get_type_name(impl->get_type()); + return get_type_name(TYPE_NONE); + } + + /// abstract interface for a HitSet implementation + class Impl { + public: + virtual impl_type_t get_type() const = 0; + virtual bool is_full() const = 0; + virtual void insert(const hobject_t& o) = 0; + virtual bool contains(const hobject_t& o) const = 0; + virtual unsigned insert_count() const = 0; + virtual unsigned approx_unique_insert_count() const = 0; + virtual void encode(bufferlist &bl) const = 0; + virtual void decode(bufferlist::const_iterator& p) = 0; + virtual void dump(Formatter *f) const = 0; + virtual Impl* clone() const = 0; + virtual void seal() {} + virtual ~Impl() {} + }; + + boost::scoped_ptr<Impl> impl; + bool sealed; + + class Params { + /// create an Impl* of the given type + bool create_impl(impl_type_t t); + + public: + class Impl { + public: + virtual impl_type_t get_type() const = 0; + virtual HitSet::Impl *get_new_impl() const = 0; + virtual void encode(bufferlist &bl) const {} + virtual void decode(bufferlist::const_iterator& p) {} + virtual void dump(Formatter *f) const {} + virtual void dump_stream(ostream& o) const {} + virtual ~Impl() {} + }; + + Params() {} + explicit Params(Impl *i) : impl(i) {} + virtual ~Params() {} + + boost::scoped_ptr<Params::Impl> impl; + + impl_type_t get_type() const { + if (impl) + return impl->get_type(); + return TYPE_NONE; + } + + Params(const Params& o) noexcept; + const Params& operator=(const Params& o); + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<HitSet::Params*>& o); + + friend ostream& operator<<(ostream& out, const HitSet::Params& p); + }; + + HitSet() : impl(NULL), sealed(false) {} + explicit HitSet(Impl *i) : impl(i), sealed(false) {} + explicit HitSet(const HitSet::Params& params); + + HitSet(const HitSet& o) { + sealed = o.sealed; + if (o.impl) + impl.reset(o.impl->clone()); + else + impl.reset(NULL); + } + const HitSet& operator=(const HitSet& o) { + sealed = o.sealed; + if (o.impl) + impl.reset(o.impl->clone()); + else + impl.reset(NULL); + return *this; + } + + + bool is_full() const { + return impl->is_full(); + } + /// insert a hash into the set + void insert(const hobject_t& o) { + impl->insert(o); + } + /// query whether a hash is in the set + bool contains(const hobject_t& o) const { + return impl->contains(o); + } + + unsigned insert_count() const { + return impl->insert_count(); + } + unsigned approx_unique_insert_count() const { + return impl->approx_unique_insert_count(); + } + void seal() { + ceph_assert(!sealed); + sealed = true; + impl->seal(); + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<HitSet*>& o); + +private: + void reset_to_type(impl_type_t type); +}; +WRITE_CLASS_ENCODER(HitSet) +WRITE_CLASS_ENCODER(HitSet::Params) + +typedef boost::shared_ptr<HitSet> HitSetRef; + +ostream& operator<<(ostream& 
out, const HitSet::Params& p); + +/** + * explicitly enumerate hash hits in the set + */ +class ExplicitHashHitSet : public HitSet::Impl { + uint64_t count; + ceph::unordered_set<uint32_t> hits; +public: + class Params : public HitSet::Params::Impl { + public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_HASH; + } + HitSet::Impl *get_new_impl() const override { + return new ExplicitHashHitSet; + } + static void generate_test_instances(list<Params*>& o) { + o.push_back(new Params); + } + }; + + ExplicitHashHitSet() : count(0) {} + explicit ExplicitHashHitSet(const ExplicitHashHitSet::Params *p) : count(0) {} + ExplicitHashHitSet(const ExplicitHashHitSet &o) : count(o.count), + hits(o.hits) {} + + HitSet::Impl *clone() const override { + return new ExplicitHashHitSet(*this); + } + + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_HASH; + } + bool is_full() const override { + return false; + } + void insert(const hobject_t& o) override { + hits.insert(o.get_hash()); + ++count; + } + bool contains(const hobject_t& o) const override { + return hits.count(o.get_hash()); + } + unsigned insert_count() const override { + return count; + } + unsigned approx_unique_insert_count() const override { + return hits.size(); + } + void encode(bufferlist &bl) const override { + ENCODE_START(1, 1, bl); + encode(count, bl); + encode(hits, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &bl) override { + DECODE_START(1, bl); + decode(count, bl); + decode(hits, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const override; + static void generate_test_instances(list<ExplicitHashHitSet*>& o) { + o.push_back(new ExplicitHashHitSet); + o.push_back(new ExplicitHashHitSet); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + } +}; +WRITE_CLASS_ENCODER(ExplicitHashHitSet) + +/** + * explicitly enumerate objects in the set + */ +class ExplicitObjectHitSet : public HitSet::Impl { + uint64_t count; + ceph::unordered_set<hobject_t> hits; +public: + class Params : public HitSet::Params::Impl { + public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_OBJECT; + } + HitSet::Impl *get_new_impl() const override { + return new ExplicitObjectHitSet; + } + static void generate_test_instances(list<Params*>& o) { + o.push_back(new Params); + } + }; + + ExplicitObjectHitSet() : count(0) {} + explicit ExplicitObjectHitSet(const ExplicitObjectHitSet::Params *p) : count(0) {} + ExplicitObjectHitSet(const ExplicitObjectHitSet &o) : count(o.count), + hits(o.hits) {} + + HitSet::Impl *clone() const override { + return new ExplicitObjectHitSet(*this); + } + + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_EXPLICIT_OBJECT; + } + bool is_full() const override { + return false; + } + void insert(const hobject_t& o) override { + hits.insert(o); + ++count; + } + bool contains(const hobject_t& o) const override { + return hits.count(o); + } + unsigned insert_count() const override { + return count; + } + unsigned approx_unique_insert_count() const override { + return hits.size(); + } + void encode(bufferlist &bl) const override { + ENCODE_START(1, 1, bl); + encode(count, bl); + encode(hits, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) override { + DECODE_START(1, bl); + decode(count, bl); + decode(hits, bl); + DECODE_FINISH(bl); + } + void 
dump(Formatter *f) const override; + static void generate_test_instances(list<ExplicitObjectHitSet*>& o) { + o.push_back(new ExplicitObjectHitSet); + o.push_back(new ExplicitObjectHitSet); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + } +}; +WRITE_CLASS_ENCODER(ExplicitObjectHitSet) + +/** + * use a bloom_filter to track hits to the set + */ +class BloomHitSet : public HitSet::Impl { + compressible_bloom_filter bloom; + +public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_BLOOM; + } + + class Params : public HitSet::Params::Impl { + public: + HitSet::impl_type_t get_type() const override { + return HitSet::TYPE_BLOOM; + } + HitSet::Impl *get_new_impl() const override { + return new BloomHitSet; + } + + uint32_t fpp_micro; ///< false positive probability / 1M + uint64_t target_size; ///< number of unique insertions we expect to this HitSet + uint64_t seed; ///< seed to use when initializing the bloom filter + + Params() + : fpp_micro(0), target_size(0), seed(0) {} + Params(double fpp, uint64_t t, uint64_t s) + : fpp_micro(fpp * 1000000.0), target_size(t), seed(s) {} + Params(const Params &o) + : fpp_micro(o.fpp_micro), + target_size(o.target_size), + seed(o.seed) {} + ~Params() override {} + + double get_fpp() const { + return (double)fpp_micro / 1000000.0; + } + void set_fpp(double f) { + fpp_micro = (unsigned)(llrintl(f * 1000000.0)); + } + + void encode(bufferlist& bl) const override { + ENCODE_START(1, 1, bl); + encode(fpp_micro, bl); + encode(target_size, bl); + encode(seed, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) override { + DECODE_START(1, bl); + decode(fpp_micro, bl); + decode(target_size, bl); + decode(seed, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const override; + void dump_stream(ostream& o) const override { + o << "false_positive_probability: " + << get_fpp() << ", target_size: " << target_size + << ", seed: " << seed; + } + static void generate_test_instances(list<Params*>& o) { + o.push_back(new Params); + o.push_back(new Params); + (*o.rbegin())->fpp_micro = 123456; + (*o.rbegin())->target_size = 300; + (*o.rbegin())->seed = 99; + } + }; + + BloomHitSet() {} + BloomHitSet(unsigned inserts, double fpp, int seed) + : bloom(inserts, fpp, seed) + {} + explicit BloomHitSet(const BloomHitSet::Params *p) : bloom(p->target_size, + p->get_fpp(), + p->seed) + {} + + BloomHitSet(const BloomHitSet &o) { + // oh god + bufferlist bl; + o.encode(bl); + auto bli = std::cbegin(bl); + this->decode(bli); + } + + HitSet::Impl *clone() const override { + return new BloomHitSet(*this); + } + + bool is_full() const override { + return bloom.is_full(); + } + + void insert(const hobject_t& o) override { + bloom.insert(o.get_hash()); + } + bool contains(const hobject_t& o) const override { + return bloom.contains(o.get_hash()); + } + unsigned insert_count() const override { + return bloom.element_count(); + } + unsigned approx_unique_insert_count() const override { + return bloom.approx_unique_element_count(); + } + void seal() override { + // aim for a density of .5 (50% of bit set) + double pc = bloom.density() * 2.0; + if (pc < 1.0) + bloom.compress(pc); + } + + void encode(bufferlist &bl) const override { + ENCODE_START(1, 1, bl); + encode(bloom, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) override { + DECODE_START(1, bl); + decode(bloom, bl); + DECODE_FINISH(bl); 
+ } + void dump(Formatter *f) const override; + static void generate_test_instances(list<BloomHitSet*>& o) { + o.push_back(new BloomHitSet); + o.push_back(new BloomHitSet(10, .1, 1)); + o.back()->insert(hobject_t()); + o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, "")); + o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, "")); + } +}; +WRITE_CLASS_ENCODER(BloomHitSet) + +#endif diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc new file mode 100644 index 00000000..9c59bce2 --- /dev/null +++ b/src/osd/OSD.cc @@ -0,0 +1,11546 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2017 OVH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "acconfig.h" + +#include <cctype> +#include <fstream> +#include <iostream> +#include <iterator> + +#include <unistd.h> +#include <sys/stat.h> +#include <signal.h> +#include <time.h> +#include <boost/scoped_ptr.hpp> +#include <boost/range/adaptor/reversed.hpp> + +#ifdef HAVE_SYS_PARAM_H +#include <sys/param.h> +#endif + +#ifdef HAVE_SYS_MOUNT_H +#include <sys/mount.h> +#endif + +#include "osd/PG.h" + +#include "include/types.h" +#include "include/compat.h" +#include "include/random.h" + +#include "OSD.h" +#include "OSDMap.h" +#include "Watch.h" +#include "osdc/Objecter.h" + +#include "common/errno.h" +#include "common/ceph_argparse.h" +#include "common/ceph_time.h" +#include "common/version.h" +#include "common/pick_address.h" +#include "common/blkdev.h" +#include "common/numa.h" + +#include "os/ObjectStore.h" +#ifdef HAVE_LIBFUSE +#include "os/FuseStore.h" +#endif + +#include "PrimaryLogPG.h" + +#include "msg/Messenger.h" +#include "msg/Message.h" + +#include "mon/MonClient.h" + +#include "messages/MLog.h" + +#include "messages/MGenericMessage.h" +#include "messages/MOSDPing.h" +#include "messages/MOSDFailure.h" +#include "messages/MOSDMarkMeDown.h" +#include "messages/MOSDFull.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDBeacon.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDPGTemp.h" +#include "messages/MOSDPGReadyToMerge.h" + +#include "messages/MOSDMap.h" +#include "messages/MMonGetOSDMap.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGQuery.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGRemove.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGCreate.h" +#include "messages/MOSDPGCreate2.h" +#include "messages/MOSDPGTrim.h" +#include "messages/MOSDPGScan.h" +#include "messages/MBackfillReserve.h" +#include "messages/MRecoveryReserve.h" +#include "messages/MOSDForceRecovery.h" +#include "messages/MOSDECSubOpWrite.h" +#include "messages/MOSDECSubOpWriteReply.h" +#include "messages/MOSDECSubOpRead.h" +#include "messages/MOSDECSubOpReadReply.h" +#include "messages/MOSDPGCreated.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" + +#include "messages/MOSDPeeringOp.h" + +#include "messages/MOSDAlive.h" + +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrub2.h" +#include "messages/MOSDRepScrub.h" + +#include 
"messages/MMonCommand.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" + +#include "messages/MPGStats.h" +#include "messages/MPGStatsAck.h" + +#include "messages/MWatchNotify.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGPull.h" + +#include "common/perf_counters.h" +#include "common/Timer.h" +#include "common/LogClient.h" +#include "common/AsyncReserver.h" +#include "common/HeartbeatMap.h" +#include "common/admin_socket.h" +#include "common/ceph_context.h" + +#include "global/signal_handler.h" +#include "global/pidfile.h" + +#include "include/color.h" +#include "perfglue/cpu_profiler.h" +#include "perfglue/heap_profiler.h" + +#include "osd/OpRequest.h" + +#include "auth/AuthAuthorizeHandler.h" +#include "auth/RotatingKeyRing.h" + +#include "objclass/objclass.h" + +#include "common/cmdparse.h" +#include "include/str_list.h" +#include "include/util.h" + +#include "include/ceph_assert.h" +#include "common/config.h" +#include "common/EventTrace.h" + +#include "json_spirit/json_spirit_reader.h" +#include "json_spirit/json_spirit_writer.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/osd.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch()) + + +static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) { + return *_dout << "osd." << whoami << " " << epoch << " "; +} + +//Initial features in new superblock. +//Features here are also automatically upgraded +CompatSet OSD::get_osd_initial_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES); + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} + +//Features are added here that this OSD supports. 
+CompatSet OSD::get_osd_compat_set() { + CompatSet compat = get_osd_initial_compat_set(); + //Any features here can be set in code, but not in initial superblock + compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + return compat; +} + +OSDService::OSDService(OSD *osd) : + osd(osd), + cct(osd->cct), + whoami(osd->whoami), store(osd->store), + log_client(osd->log_client), clog(osd->clog), + pg_recovery_stats(osd->pg_recovery_stats), + cluster_messenger(osd->cluster_messenger), + client_messenger(osd->client_messenger), + logger(osd->logger), + recoverystate_perf(osd->recoverystate_perf), + monc(osd->monc), + class_handler(osd->class_handler), + osd_max_object_size(cct->_conf, "osd_max_object_size"), + osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"), + publish_lock{ceph::make_mutex("OSDService::publish_lock")}, + pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")}, + max_oldest_map(0), + peer_map_epoch_lock("OSDService::peer_map_epoch_lock"), + sched_scrub_lock("OSDService::sched_scrub_lock"), + scrubs_local(0), + scrubs_remote(0), + agent_lock("OSDService::agent_lock"), + agent_valid_iterator(false), + agent_ops(0), + flush_mode_high_count(0), + agent_active(true), + agent_thread(this), + agent_stop_flag(false), + agent_timer_lock("OSDService::agent_timer_lock"), + agent_timer(osd->client_messenger->cct, agent_timer_lock), + last_recalibrate(ceph_clock_now()), + promote_max_objects(0), + promote_max_bytes(0), + objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)), + m_objecter_finishers(cct->_conf->osd_objecter_finishers), + watch_lock("OSDService::watch_lock"), + watch_timer(osd->client_messenger->cct, watch_lock), + next_notif_id(0), + recovery_request_lock("OSDService::recovery_request_lock"), + recovery_request_timer(cct, recovery_request_lock, false), + sleep_lock("OSDService::sleep_lock"), + sleep_timer(cct, sleep_lock, false), + reserver_finisher(cct), + local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills, + cct->_conf->osd_min_recovery_priority), + remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills, + cct->_conf->osd_min_recovery_priority), + pg_temp_lock("OSDService::pg_temp_lock"), + snap_reserver(cct, &reserver_finisher, + cct->_conf->osd_max_trimming_pgs), + recovery_lock("OSDService::recovery_lock"), + recovery_ops_active(0), + recovery_ops_reserved(0), + recovery_paused(false), + map_cache_lock("OSDService::map_cache_lock"), + map_cache(cct, cct->_conf->osd_map_cache_size), + map_bl_cache(cct->_conf->osd_map_cache_size), + map_bl_inc_cache(cct->_conf->osd_map_cache_size), + stat_lock("OSDService::stat_lock"), + full_status_lock("OSDService::full_status_lock"), + cur_state(NONE), + cur_ratio(0), physical_ratio(0), + epoch_lock("OSDService::epoch_lock"), + boot_epoch(0), up_epoch(0), bind_epoch(0), + is_stopping_lock("OSDService::is_stopping_lock") +#ifdef PG_DEBUG_REFS + , pgid_lock("OSDService::pgid_lock") +#endif +{ + objecter->init(); + + for (int i = 0; i < m_objecter_finishers; i++) { + ostringstream str; + str << "objecter-finisher-" << i; + Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher"); + objecter_finishers.push_back(fin); + } +} + +OSDService::~OSDService() +{ + delete objecter; + + for (auto f : objecter_finishers) { + delete f; + f = NULL; + } +} + + + +#ifdef PG_DEBUG_REFS +void OSDService::add_pgid(spg_t pgid, PG *pg){ + std::lock_guard l(pgid_lock); + if (!pgid_tracker.count(pgid)) { + live_pgs[pgid] = pg; + } + 
pgid_tracker[pgid]++; +} +void OSDService::remove_pgid(spg_t pgid, PG *pg) +{ + std::lock_guard l(pgid_lock); + ceph_assert(pgid_tracker.count(pgid)); + ceph_assert(pgid_tracker[pgid] > 0); + pgid_tracker[pgid]--; + if (pgid_tracker[pgid] == 0) { + pgid_tracker.erase(pgid); + live_pgs.erase(pgid); + } +} +void OSDService::dump_live_pgids() +{ + std::lock_guard l(pgid_lock); + derr << "live pgids:" << dendl; + for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin(); + i != pgid_tracker.cend(); + ++i) { + derr << "\t" << *i << dendl; + live_pgs[i->first]->dump_live_ids(); + } +} +#endif + + + +void OSDService::identify_splits_and_merges( + OSDMapRef old_map, + OSDMapRef new_map, + spg_t pgid, + set<pair<spg_t,epoch_t>> *split_children, + set<pair<spg_t,epoch_t>> *merge_pgs) +{ + if (!old_map->have_pg_pool(pgid.pool())) { + return; + } + int old_pgnum = old_map->get_pg_num(pgid.pool()); + auto p = osd->pg_num_history.pg_nums.find(pgid.pool()); + if (p == osd->pg_num_history.pg_nums.end()) { + return; + } + dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch() + << " to e" << new_map->get_epoch() + << " pg_nums " << p->second << dendl; + deque<spg_t> queue; + queue.push_back(pgid); + set<spg_t> did; + while (!queue.empty()) { + auto cur = queue.front(); + queue.pop_front(); + did.insert(cur); + unsigned pgnum = old_pgnum; + for (auto q = p->second.lower_bound(old_map->get_epoch()); + q != p->second.end() && + q->first <= new_map->get_epoch(); + ++q) { + if (pgnum < q->second) { + // split? + if (cur.ps() < pgnum) { + set<spg_t> children; + if (cur.is_split(pgnum, q->second, &children)) { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " children " << children << dendl; + for (auto i : children) { + split_children->insert(make_pair(i, q->first)); + if (!did.count(i)) + queue.push_back(i); + } + } + } else if (cur.ps() < q->second) { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is a child" << dendl; + // normally we'd capture this from the parent, but it's + // possible the parent doesn't exist yet (it will be + // fabricated to allow an intervening merge). note this PG + // as a split child here to be sure we catch it. + split_children->insert(make_pair(cur, q->first)); + } else { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is post-split, skipping" << dendl; + } + } else if (merge_pgs) { + // merge? 
+ if (cur.ps() >= q->second) { + if (cur.ps() < pgnum) { + spg_t parent; + if (cur.is_merge_source(pgnum, q->second, &parent)) { + set<spg_t> children; + parent.is_split(q->second, pgnum, &children); + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is merge source, target " << parent + << ", source(s) " << children << dendl; + merge_pgs->insert(make_pair(parent, q->first)); + if (!did.count(parent)) { + // queue (and re-scan) parent in case it might not exist yet + // and there are some future splits pending on it + queue.push_back(parent); + } + for (auto c : children) { + merge_pgs->insert(make_pair(c, q->first)); + if (!did.count(c)) + queue.push_back(c); + } + } + } else { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is beyond old pgnum, skipping" << dendl; + } + } else { + set<spg_t> children; + if (cur.is_split(q->second, pgnum, &children)) { + dout(20) << __func__ << " " << cur << " e" << q->first + << " pg_num " << pgnum << " -> " << q->second + << " is merge target, source " << children << dendl; + for (auto c : children) { + merge_pgs->insert(make_pair(c, q->first)); + if (!did.count(c)) + queue.push_back(c); + } + merge_pgs->insert(make_pair(cur, q->first)); + } + } + } + pgnum = q->second; + } + } +} + +void OSDService::need_heartbeat_peer_update() +{ + osd->need_heartbeat_peer_update(); +} + +void OSDService::start_shutdown() +{ + { + std::lock_guard l(agent_timer_lock); + agent_timer.shutdown(); + } + + { + std::lock_guard l(sleep_lock); + sleep_timer.shutdown(); + } + + { + std::lock_guard l(recovery_request_lock); + recovery_request_timer.shutdown(); + } +} + +void OSDService::shutdown_reserver() +{ + reserver_finisher.wait_for_empty(); + reserver_finisher.stop(); +} + +void OSDService::shutdown() +{ + { + std::lock_guard l(watch_lock); + watch_timer.shutdown(); + } + + objecter->shutdown(); + for (auto f : objecter_finishers) { + f->wait_for_empty(); + f->stop(); + } + + publish_map(OSDMapRef()); + next_osdmap = OSDMapRef(); +} + +void OSDService::init() +{ + reserver_finisher.start(); + for (auto f : objecter_finishers) { + f->start(); + } + objecter->set_client_incarnation(0); + + // deprioritize objecter in daemonperf output + objecter->get_logger()->set_prio_adjust(-3); + + watch_timer.init(); + agent_timer.init(); + + agent_thread.create("osd_srv_agent"); + + if (cct->_conf->osd_recovery_delay_start) + defer_recovery(cct->_conf->osd_recovery_delay_start); +} + +void OSDService::final_init() +{ + objecter->start(osdmap.get()); +} + +void OSDService::activate_map() +{ + // wake/unwake the tiering agent + agent_lock.Lock(); + agent_active = + !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) && + osd->is_active(); + agent_cond.Signal(); + agent_lock.Unlock(); +} + +void OSDService::request_osdmap_update(epoch_t e) +{ + osd->osdmap_subscribe(e, false); +} + +class AgentTimeoutCB : public Context { + PGRef pg; +public: + explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {} + void finish(int) override { + pg->agent_choose_mode_restart(); + } +}; + +void OSDService::agent_entry() +{ + dout(10) << __func__ << " start" << dendl; + agent_lock.Lock(); + + while (!agent_stop_flag) { + if (agent_queue.empty()) { + dout(20) << __func__ << " empty queue" << dendl; + agent_cond.Wait(agent_lock); + continue; + } + uint64_t level = agent_queue.rbegin()->first; + set<PGRef>& top = agent_queue.rbegin()->second; + dout(10) << __func__ + << " tiers " << agent_queue.size() 
+ << ", top is " << level + << " with pgs " << top.size() + << ", ops " << agent_ops << "/" + << cct->_conf->osd_agent_max_ops + << (agent_active ? " active" : " NOT ACTIVE") + << dendl; + dout(20) << __func__ << " oids " << agent_oids << dendl; + int max = cct->_conf->osd_agent_max_ops - agent_ops; + int agent_flush_quota = max; + if (!flush_mode_high_count) + agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops; + if (agent_flush_quota <= 0 || top.empty() || !agent_active) { + agent_cond.Wait(agent_lock); + continue; + } + + if (!agent_valid_iterator || agent_queue_pos == top.end()) { + agent_queue_pos = top.begin(); + agent_valid_iterator = true; + } + PGRef pg = *agent_queue_pos; + dout(10) << "high_count " << flush_mode_high_count + << " agent_ops " << agent_ops + << " flush_quota " << agent_flush_quota << dendl; + agent_lock.Unlock(); + if (!pg->agent_work(max, agent_flush_quota)) { + dout(10) << __func__ << " " << pg->pg_id + << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time + << " seconds" << dendl; + + osd->logger->inc(l_osd_tier_delay); + // Queue a timer to call agent_choose_mode for this pg in 5 seconds + agent_timer_lock.Lock(); + Context *cb = new AgentTimeoutCB(pg); + agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb); + agent_timer_lock.Unlock(); + } + agent_lock.Lock(); + } + agent_lock.Unlock(); + dout(10) << __func__ << " finish" << dendl; +} + +void OSDService::agent_stop() +{ + { + std::lock_guard l(agent_lock); + + // By this time all ops should be cancelled + ceph_assert(agent_ops == 0); + // By this time all PGs are shutdown and dequeued + if (!agent_queue.empty()) { + set<PGRef>& top = agent_queue.rbegin()->second; + derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl; + ceph_abort_msg("agent queue not empty"); + } + + agent_stop_flag = true; + agent_cond.Signal(); + } + agent_thread.join(); +} + +// ------------------------------------- + +void OSDService::promote_throttle_recalibrate() +{ + utime_t now = ceph_clock_now(); + double dur = now - last_recalibrate; + last_recalibrate = now; + unsigned prob = promote_probability_millis; + + uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec; + uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec; + + unsigned min_prob = 1; + + uint64_t attempts, obj, bytes; + promote_counter.sample_and_attenuate(&attempts, &obj, &bytes); + dout(10) << __func__ << " " << attempts << " attempts, promoted " + << obj << " objects and " << byte_u_t(bytes) << "; target " + << target_obj_sec << " obj/sec or " + << byte_u_t(target_bytes_sec) << "/sec" + << dendl; + + // calculate what the probability *should* be, given the targets + unsigned new_prob; + if (attempts && dur > 0) { + uint64_t avg_size = 1; + if (obj) + avg_size = std::max<uint64_t>(bytes / obj, 1); + unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts; + unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0 + / (double)attempts; + dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size " + << avg_size << dendl; + if (target_obj_sec && target_bytes_sec) + new_prob = std::min(po, pb); + else if (target_obj_sec) + new_prob = po; + else if (target_bytes_sec) + new_prob = pb; + else + new_prob = 1000; + } else { + new_prob = 1000; + } + dout(20) << __func__ << " new_prob " << new_prob << dendl; + + // correct for persistent skew between target rate and actual rate, adjust + double ratio = 1.0; + unsigned actual = 0; + if 
(attempts && obj) { + actual = obj * 1000 / attempts; + ratio = (double)actual / (double)prob; + new_prob = (double)new_prob / ratio; + } + new_prob = std::max(new_prob, min_prob); + new_prob = std::min(new_prob, 1000u); + + // adjust + prob = (prob + new_prob) / 2; + prob = std::max(prob, min_prob); + prob = std::min(prob, 1000u); + dout(10) << __func__ << " actual " << actual + << ", actual/prob ratio " << ratio + << ", adjusted new_prob " << new_prob + << ", prob " << promote_probability_millis << " -> " << prob + << dendl; + promote_probability_millis = prob; + + // set hard limits for this interval to mitigate stampedes + promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2; + promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2; +} + +// ------------------------------------- + +float OSDService::get_failsafe_full_ratio() +{ + float full_ratio = cct->_conf->osd_failsafe_full_ratio; + if (full_ratio > 1.0) full_ratio /= 100.0; + return full_ratio; +} + +OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject) +{ + // The OSDMap ratios take precendence. So if the failsafe is .95 and + // the admin sets the cluster full to .96, the failsafe moves up to .96 + // too. (Not that having failsafe == full is ideal, but it's better than + // dropping writes before the clusters appears full.) + OSDMapRef osdmap = get_osdmap(); + if (!osdmap || osdmap->get_epoch() == 0) { + return NONE; + } + float nearfull_ratio = osdmap->get_nearfull_ratio(); + float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio); + float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio); + float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio); + + if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) { + // use the failsafe for nearfull and full; the mon isn't using the + // flags anyway because we're mid-upgrade. + full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; + nearfull_ratio = failsafe_ratio; + } else if (full_ratio <= 0 || + backfillfull_ratio <= 0 || + nearfull_ratio <= 0) { + derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl; + // use failsafe flag. ick. the monitor did something wrong or the user + // did something stupid. 
+ full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; + nearfull_ratio = failsafe_ratio; + } + + if (injectfull_state > NONE && injectfull) { + inject = "(Injected)"; + return injectfull_state; + } else if (pratio > failsafe_ratio) { + return FAILSAFE; + } else if (ratio > full_ratio) { + return FULL; + } else if (ratio > backfillfull_ratio) { + return BACKFILLFULL; + } else if (pratio > nearfull_ratio) { + return NEARFULL; + } + return NONE; +} + +void OSDService::check_full_status(float ratio, float pratio) +{ + std::lock_guard l(full_status_lock); + + cur_ratio = ratio; + physical_ratio = pratio; + + string inject; + s_names new_state; + new_state = recalc_full_state(ratio, pratio, inject); + + dout(20) << __func__ << " cur ratio " << ratio + << ", physical ratio " << pratio + << ", new state " << get_full_state_name(new_state) + << " " << inject + << dendl; + + // warn + if (cur_state != new_state) { + dout(10) << __func__ << " " << get_full_state_name(cur_state) + << " -> " << get_full_state_name(new_state) << dendl; + if (new_state == FAILSAFE) { + clog->error() << "full status failsafe engaged, dropping updates, now " + << (int)roundf(ratio * 100) << "% full"; + } else if (cur_state == FAILSAFE) { + clog->error() << "full status failsafe disengaged, no longer dropping " + << "updates, now " << (int)roundf(ratio * 100) << "% full"; + } + cur_state = new_state; + } +} + +bool OSDService::need_fullness_update() +{ + OSDMapRef osdmap = get_osdmap(); + s_names cur = NONE; + if (osdmap->exists(whoami)) { + if (osdmap->get_state(whoami) & CEPH_OSD_FULL) { + cur = FULL; + } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) { + cur = BACKFILLFULL; + } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) { + cur = NEARFULL; + } + } + s_names want = NONE; + if (is_full()) + want = FULL; + else if (is_backfillfull()) + want = BACKFILLFULL; + else if (is_nearfull()) + want = NEARFULL; + return want != cur; +} + +bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const +{ + if (injectfull && injectfull_state >= type) { + // injectfull is either a count of the number of times to return failsafe full + // or if -1 then always return full + if (injectfull > 0) + --injectfull; + ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD (" + << (injectfull < 0 ? 
"set" : std::to_string(injectfull)) << ")" + << dendl; + return true; + } + return false; +} + +bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const +{ + std::lock_guard l(full_status_lock); + + if (_check_inject_full(dpp, type)) + return true; + + if (cur_state >= type) + ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio + << " physical " << physical_ratio << dendl; + + return cur_state >= type; +} + +bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat) +{ + ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl; + { + std::lock_guard l(full_status_lock); + if (_check_inject_full(dpp, type)) { + return true; + } + } + + float pratio; + float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used); + + string notused; + s_names tentative_state = recalc_full_state(ratio, pratio, notused); + + if (tentative_state >= type) + ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl; + + return tentative_state >= type; +} + +bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, FAILSAFE); +} + +bool OSDService::check_full(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, FULL); +} + +bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats) +{ + return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats); +} + +bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, BACKFILLFULL); +} + +bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const +{ + return _check_full(dpp, NEARFULL); +} + +bool OSDService::is_failsafe_full() const +{ + std::lock_guard l(full_status_lock); + return cur_state == FAILSAFE; +} + +bool OSDService::is_full() const +{ + std::lock_guard l(full_status_lock); + return cur_state >= FULL; +} + +bool OSDService::is_backfillfull() const +{ + std::lock_guard l(full_status_lock); + return cur_state >= BACKFILLFULL; +} + +bool OSDService::is_nearfull() const +{ + std::lock_guard l(full_status_lock); + return cur_state >= NEARFULL; +} + +void OSDService::set_injectfull(s_names type, int64_t count) +{ + std::lock_guard l(full_status_lock); + injectfull_state = type; + injectfull = count; +} + +void OSDService::set_statfs(const struct store_statfs_t &stbuf, + osd_alert_list_t& alerts) +{ + uint64_t bytes = stbuf.total; + uint64_t avail = stbuf.available; + uint64_t used = stbuf.get_used_raw(); + + // For testing fake statfs values so it doesn't matter if all + // OSDs are using the same partition. 
+ if (cct->_conf->fake_statfs_for_testing) { + uint64_t total_num_bytes = 0; + vector<PGRef> pgs; + osd->_get_pgs(&pgs); + for (auto p : pgs) { + total_num_bytes += p->get_stats_num_bytes(); + } + bytes = cct->_conf->fake_statfs_for_testing; + if (total_num_bytes < bytes) + avail = bytes - total_num_bytes; + else + avail = 0; + dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing + << " adjust available " << avail + << dendl; + used = bytes - avail; + } + + osd->logger->set(l_osd_stat_bytes, bytes); + osd->logger->set(l_osd_stat_bytes_used, used); + osd->logger->set(l_osd_stat_bytes_avail, avail); + + std::lock_guard l(stat_lock); + osd_stat.statfs = stbuf; + osd_stat.os_alerts.clear(); + osd_stat.os_alerts[whoami].swap(alerts); + if (cct->_conf->fake_statfs_for_testing) { + osd_stat.statfs.total = bytes; + osd_stat.statfs.available = avail; + // For testing don't want used to go negative, so clear reserved + osd_stat.statfs.internally_reserved = 0; + } +} + +osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers, + int num_pgs) +{ + utime_t now = ceph_clock_now(); + auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale"); + std::lock_guard l(stat_lock); + osd_stat.hb_peers.swap(hb_peers); + osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist); + osd_stat.num_pgs = num_pgs; + // Clean entries that aren't updated + // This is called often enough that we can just remove 1 at a time + for (auto i: osd_stat.hb_pingtime) { + if (i.second.last_update == 0) + continue; + if (stale_time && now.sec() - i.second.last_update > stale_time) { + dout(20) << __func__ << " time out heartbeat for osd " << i.first + << " last_update " << i.second.last_update << dendl; + osd_stat.hb_pingtime.erase(i.first); + break; + } + } + return osd_stat; +} + +void OSDService::inc_osd_stat_repaired() +{ + std::lock_guard l(stat_lock); + osd_stat.num_shards_repaired++; + return; +} + +void OSDService::set_osd_stat_repaired(int64_t count) +{ + std::lock_guard l(stat_lock); + osd_stat.num_shards_repaired = count; + return; +} + +float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, + uint64_t adjust_used) +{ + *pratio = + ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total); + + if (adjust_used) { + dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl; + if (new_stat.statfs.available > adjust_used) + new_stat.statfs.available -= adjust_used; + else + new_stat.statfs.available = 0; + dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl; + } + + // Check all pgs and adjust kb_used to include all pending backfill data + int backfill_adjusted = 0; + vector<PGRef> pgs; + osd->_get_pgs(&pgs); + for (auto p : pgs) { + backfill_adjusted += p->pg_stat_adjust(&new_stat); + } + if (backfill_adjusted) { + dout(20) << __func__ << " backfill adjusted " << new_stat << dendl; + } + return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total); +} + +bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on) +{ + OSDMapRef osdmap = get_osdmap(); + for (auto shard : missing_on) { + if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL) + return true; + } + return false; +} + +void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch) +{ + OSDMapRef next_map = get_nextmap_reserved(); + // service map is always newer/newest + ceph_assert(from_epoch <= next_map->get_epoch()); + + if (next_map->is_down(peer) || + next_map->get_info(peer).up_from 
> from_epoch) { + m->put(); + release_map(next_map); + return; + } + ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd( + next_map->get_cluster_addrs(peer)); + share_map_peer(peer, peer_con.get(), next_map); + peer_con->send_message(m); + release_map(next_map); +} + +ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch) +{ + OSDMapRef next_map = get_nextmap_reserved(); + // service map is always newer/newest + ceph_assert(from_epoch <= next_map->get_epoch()); + + if (next_map->is_down(peer) || + next_map->get_info(peer).up_from > from_epoch) { + release_map(next_map); + return NULL; + } + ConnectionRef con = osd->cluster_messenger->connect_to_osd( + next_map->get_cluster_addrs(peer)); + release_map(next_map); + return con; +} + +pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch) +{ + OSDMapRef next_map = get_nextmap_reserved(); + // service map is always newer/newest + ceph_assert(from_epoch <= next_map->get_epoch()); + + pair<ConnectionRef,ConnectionRef> ret; + if (next_map->is_down(peer) || + next_map->get_info(peer).up_from > from_epoch) { + release_map(next_map); + return ret; + } + ret.first = osd->hb_back_client_messenger->connect_to_osd( + next_map->get_hb_back_addrs(peer)); + ret.second = osd->hb_front_client_messenger->connect_to_osd( + next_map->get_hb_front_addrs(peer)); + release_map(next_map); + return ret; +} + +entity_name_t OSDService::get_cluster_msgr_name() const +{ + return cluster_messenger->get_myname(); +} + +void OSDService::queue_want_pg_temp(pg_t pgid, + const vector<int>& want, + bool forced) +{ + std::lock_guard l(pg_temp_lock); + auto p = pg_temp_pending.find(pgid); + if (p == pg_temp_pending.end() || + p->second.acting != want || + forced) { + pg_temp_wanted[pgid] = {want, forced}; + } +} + +void OSDService::remove_want_pg_temp(pg_t pgid) +{ + std::lock_guard l(pg_temp_lock); + pg_temp_wanted.erase(pgid); + pg_temp_pending.erase(pgid); +} + +void OSDService::_sent_pg_temp() +{ +#ifdef HAVE_STDLIB_MAP_SPLICING + pg_temp_pending.merge(pg_temp_wanted); +#else + pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)), + make_move_iterator(end(pg_temp_wanted))); +#endif + pg_temp_wanted.clear(); +} + +void OSDService::requeue_pg_temp() +{ + std::lock_guard l(pg_temp_lock); + // wanted overrides pending. note that remove_want_pg_temp + // clears the item out of both. 
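+ // _sent_pg_temp() folds wanted into pending, and the swap below then makes
+ // the combined set the new 'wanted', so every outstanding pg_temp request is
+ // resent by the next send_pg_temp().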
+ unsigned old_wanted = pg_temp_wanted.size(); + unsigned old_pending = pg_temp_pending.size(); + _sent_pg_temp(); + pg_temp_wanted.swap(pg_temp_pending); + dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> " + << pg_temp_wanted.size() << dendl; +} + +std::ostream& operator<<(std::ostream& out, + const OSDService::pg_temp_t& pg_temp) +{ + out << pg_temp.acting; + if (pg_temp.forced) { + out << " (forced)"; + } + return out; +} + +void OSDService::send_pg_temp() +{ + std::lock_guard l(pg_temp_lock); + if (pg_temp_wanted.empty()) + return; + dout(10) << "send_pg_temp " << pg_temp_wanted << dendl; + MOSDPGTemp *ms[2] = {nullptr, nullptr}; + for (auto& [pgid, pg_temp] : pg_temp_wanted) { + auto& m = ms[pg_temp.forced]; + if (!m) { + m = new MOSDPGTemp(osdmap->get_epoch()); + m->forced = pg_temp.forced; + } + m->pg_temp.emplace(pgid, pg_temp.acting); + } + for (auto m : ms) { + if (m) { + monc->send_mon_message(m); + } + } + _sent_pg_temp(); +} + +void OSDService::send_pg_created(pg_t pgid) +{ + std::lock_guard l(pg_created_lock); + dout(20) << __func__ << dendl; + auto o = get_osdmap(); + if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) { + pg_created.insert(pgid); + monc->send_mon_message(new MOSDPGCreated(pgid)); + } +} + +void OSDService::send_pg_created() +{ + std::lock_guard l(pg_created_lock); + dout(20) << __func__ << dendl; + auto o = get_osdmap(); + if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) { + for (auto pgid : pg_created) { + monc->send_mon_message(new MOSDPGCreated(pgid)); + } + } +} + +void OSDService::prune_pg_created() +{ + std::lock_guard l(pg_created_lock); + dout(20) << __func__ << dendl; + auto o = get_osdmap(); + auto i = pg_created.begin(); + while (i != pg_created.end()) { + auto p = o->get_pg_pool(i->pool()); + if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) { + dout(20) << __func__ << " pruning " << *i << dendl; + i = pg_created.erase(i); + } else { + dout(20) << __func__ << " keeping " << *i << dendl; + ++i; + } + } +} + + +// -------------------------------------- +// dispatch + +epoch_t OSDService::get_peer_epoch(int peer) +{ + std::lock_guard l(peer_map_epoch_lock); + map<int,epoch_t>::iterator p = peer_map_epoch.find(peer); + if (p == peer_map_epoch.end()) + return 0; + return p->second; +} + +epoch_t OSDService::note_peer_epoch(int peer, epoch_t e) +{ + std::lock_guard l(peer_map_epoch_lock); + map<int,epoch_t>::iterator p = peer_map_epoch.find(peer); + if (p != peer_map_epoch.end()) { + if (p->second < e) { + dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl; + p->second = e; + } else { + dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl; + } + return p->second; + } else { + dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl; + peer_map_epoch[peer] = e; + return e; + } +} + +void OSDService::forget_peer_epoch(int peer, epoch_t as_of) +{ + std::lock_guard l(peer_map_epoch_lock); + map<int,epoch_t>::iterator p = peer_map_epoch.find(peer); + if (p != peer_map_epoch.end()) { + if (p->second <= as_of) { + dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of + << " had " << p->second << dendl; + peer_map_epoch.erase(p); + } else { + dout(10) << "forget_peer_epoch osd." 
<< peer << " as_of " << as_of + << " has " << p->second << " - not forgetting" << dendl; + } + } +} + +bool OSDService::should_share_map(entity_name_t name, Connection *con, + epoch_t epoch, const OSDMapRef& osdmap, + const epoch_t *sent_epoch_p) +{ + dout(20) << "should_share_map " + << name << " " << con->get_peer_addr() + << " " << epoch << dendl; + + // does client have old map? + if (name.is_client()) { + bool message_sendmap = epoch < osdmap->get_epoch(); + if (message_sendmap && sent_epoch_p) { + dout(20) << "client session last_sent_epoch: " + << *sent_epoch_p + << " versus osdmap epoch " << osdmap->get_epoch() << dendl; + if (*sent_epoch_p < osdmap->get_epoch()) { + return true; + } // else we don't need to send it out again + } + } + + if (con->get_messenger() == osd->cluster_messenger && + con != osd->cluster_messenger->get_loopback_connection() && + osdmap->is_up(name.num()) && + (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() || + osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) { + // remember + epoch_t has = std::max(get_peer_epoch(name.num()), epoch); + + // share? + if (has < osdmap->get_epoch()) { + dout(10) << name << " " << con->get_peer_addr() + << " has old map " << epoch << " < " + << osdmap->get_epoch() << dendl; + return true; + } + } + + return false; +} + +void OSDService::share_map( + entity_name_t name, + Connection *con, + epoch_t epoch, + OSDMapRef& osdmap, + epoch_t *sent_epoch_p) +{ + dout(20) << "share_map " + << name << " " << con->get_peer_addr() + << " " << epoch << dendl; + + if (!osd->is_active()) { + /*It is safe not to proceed as OSD is not in healthy state*/ + return; + } + + bool want_shared = should_share_map(name, con, epoch, + osdmap, sent_epoch_p); + + if (want_shared){ + if (name.is_client()) { + dout(10) << name << " has old map " << epoch + << " < " << osdmap->get_epoch() << dendl; + // we know the Session is valid or we wouldn't be sending + if (sent_epoch_p) { + *sent_epoch_p = osdmap->get_epoch(); + } + send_incremental_map(epoch, con, osdmap); + } else if (con->get_messenger() == osd->cluster_messenger && + osdmap->is_up(name.num()) && + (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() || + osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) { + dout(10) << name << " " << con->get_peer_addrs() + << " has old map " << epoch << " < " + << osdmap->get_epoch() << dendl; + note_peer_epoch(name.num(), osdmap->get_epoch()); + send_incremental_map(epoch, con, osdmap); + } + } +} + +void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map) +{ + if (!map) + map = get_osdmap(); + + // send map? + epoch_t pe = get_peer_epoch(peer); + if (pe) { + if (pe < map->get_epoch()) { + send_incremental_map(pe, con, map); + note_peer_epoch(peer, map->get_epoch()); + } else + dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl; + } else { + dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl; + // no idea about peer's epoch. + // ??? send recent ??? + // do nothing. 
+ } +} + +bool OSDService::can_inc_scrubs() +{ + bool can_inc = false; + std::lock_guard l(sched_scrub_lock); + + if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { + dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote + << " remote < max " << cct->_conf->osd_max_scrubs << dendl; + can_inc = true; + } else { + dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote + << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; + } + + return can_inc; +} + +bool OSDService::inc_scrubs_local() +{ + bool result = false; + std::lock_guard l{sched_scrub_lock}; + if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { + dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1) + << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl; + result = true; + ++scrubs_local; + } else { + dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; + } + return result; +} + +void OSDService::dec_scrubs_local() +{ + std::lock_guard l{sched_scrub_lock}; + dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1) + << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl; + --scrubs_local; + ceph_assert(scrubs_local >= 0); +} + +bool OSDService::inc_scrubs_remote() +{ + bool result = false; + std::lock_guard l{sched_scrub_lock}; + if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { + dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1) + << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl; + result = true; + ++scrubs_remote; + } else { + dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; + } + return result; +} + +void OSDService::dec_scrubs_remote() +{ + std::lock_guard l{sched_scrub_lock}; + dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1) + << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl; + --scrubs_remote; + ceph_assert(scrubs_remote >= 0); +} + +void OSDService::dump_scrub_reservations(Formatter *f) +{ + std::lock_guard l{sched_scrub_lock}; + f->dump_int("scrubs_local", scrubs_local); + f->dump_int("scrubs_remote", scrubs_remote); + f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs); +} + +void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch, + epoch_t *_bind_epoch) const +{ + std::lock_guard l(epoch_lock); + if (_boot_epoch) + *_boot_epoch = boot_epoch; + if (_up_epoch) + *_up_epoch = up_epoch; + if (_bind_epoch) + *_bind_epoch = bind_epoch; +} + +void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch, + const epoch_t *_bind_epoch) +{ + std::lock_guard l(epoch_lock); + if (_boot_epoch) { + ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch); + boot_epoch = *_boot_epoch; + } + if (_up_epoch) { + ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch); + up_epoch = *_up_epoch; + } + if (_bind_epoch) { + ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch); + bind_epoch = *_bind_epoch; + } +} + +bool OSDService::prepare_to_stop() +{ + std::lock_guard l(is_stopping_lock); + if (get_state() != NOT_STOPPING) + return false; + + OSDMapRef osdmap = get_osdmap(); + if (osdmap && osdmap->is_up(whoami)) { + dout(0) << __func__ << " telling mon we are shutting down" 
<< dendl; + set_state(PREPARING_TO_STOP); + monc->send_mon_message( + new MOSDMarkMeDown( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true // request ack + )); + utime_t now = ceph_clock_now(); + utime_t timeout; + timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout); + while ((ceph_clock_now() < timeout) && + (get_state() != STOPPING)) { + is_stopping_cond.WaitUntil(is_stopping_lock, timeout); + } + } + dout(0) << __func__ << " starting shutdown" << dendl; + set_state(STOPPING); + return true; +} + +void OSDService::got_stop_ack() +{ + std::lock_guard l(is_stopping_lock); + if (get_state() == PREPARING_TO_STOP) { + dout(0) << __func__ << " starting shutdown" << dendl; + set_state(STOPPING); + is_stopping_cond.Signal(); + } else { + dout(10) << __func__ << " ignoring msg" << dendl; + } +} + +MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to, + OSDSuperblock& sblock) +{ + MOSDMap *m = new MOSDMap(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = max_oldest_map; + m->newest_map = sblock.newest_map; + + int max = cct->_conf->osd_map_message_max; + ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes; + + if (since < m->oldest_map) { + // we don't have the next map the target wants, so start with a + // full map. + bufferlist bl; + dout(10) << __func__ << " oldest map " << max_oldest_map << " > since " + << since << ", starting with full map" << dendl; + since = m->oldest_map; + if (!get_map_bl(since, bl)) { + derr << __func__ << " missing full map " << since << dendl; + goto panic; + } + max--; + max_bytes -= bl.length(); + m->maps[since].claim(bl); + } + for (epoch_t e = since + 1; e <= to; ++e) { + bufferlist bl; + if (get_inc_map_bl(e, bl)) { + m->incremental_maps[e].claim(bl); + } else { + dout(10) << __func__ << " missing incremental map " << e << dendl; + if (!get_map_bl(e, bl)) { + derr << __func__ << " also missing full map " << e << dendl; + goto panic; + } + m->maps[e].claim(bl); + } + max--; + max_bytes -= bl.length(); + if (max <= 0 || max_bytes <= 0) { + break; + } + } + return m; + + panic: + if (!m->maps.empty() || + !m->incremental_maps.empty()) { + // send what we have so far + return m; + } + // send something + bufferlist bl; + if (get_inc_map_bl(m->newest_map, bl)) { + m->incremental_maps[m->newest_map].claim(bl); + } else { + derr << __func__ << " unable to load latest map " << m->newest_map << dendl; + if (!get_map_bl(m->newest_map, bl)) { + derr << __func__ << " unable to load latest full map " << m->newest_map + << dendl; + ceph_abort(); + } + m->maps[m->newest_map].claim(bl); + } + return m; +} + +void OSDService::send_map(MOSDMap *m, Connection *con) +{ + con->send_message(m); +} + +void OSDService::send_incremental_map(epoch_t since, Connection *con, + const OSDMapRef& osdmap) +{ + epoch_t to = osdmap->get_epoch(); + dout(10) << "send_incremental_map " << since << " -> " << to + << " to " << con << " " << con->get_peer_addr() << dendl; + + MOSDMap *m = NULL; + while (!m) { + OSDSuperblock sblock(get_superblock()); + if (since < sblock.oldest_map) { + // just send latest full map + MOSDMap *m = new MOSDMap(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = max_oldest_map; + m->newest_map = sblock.newest_map; + get_map_bl(to, m->maps[to]); + send_map(m, con); + return; + } + + if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) { + dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs + 
<< ", only sending most recent" << dendl; + since = to - cct->_conf->osd_map_share_max_epochs; + } + + m = build_incremental_map_msg(since, to, sblock); + } + send_map(m, con); +} + +bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl) +{ + bool found = map_bl_cache.lookup(e, &bl); + if (found) { + if (logger) + logger->inc(l_osd_map_bl_cache_hit); + return true; + } + if (logger) + logger->inc(l_osd_map_bl_cache_miss); + found = store->read(meta_ch, + OSD::get_osdmap_pobject_name(e), 0, 0, bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0; + if (found) { + _add_map_bl(e, bl); + } + return found; +} + +bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl) +{ + std::lock_guard l(map_cache_lock); + bool found = map_bl_inc_cache.lookup(e, &bl); + if (found) { + if (logger) + logger->inc(l_osd_map_bl_cache_hit); + return true; + } + if (logger) + logger->inc(l_osd_map_bl_cache_miss); + found = store->read(meta_ch, + OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0; + if (found) { + _add_map_inc_bl(e, bl); + } + return found; +} + +void OSDService::_add_map_bl(epoch_t e, bufferlist& bl) +{ + dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl; + // cache a contiguous buffer + if (bl.get_num_buffers() > 1) { + bl.rebuild(); + } + bl.try_assign_to_mempool(mempool::mempool_osd_mapbl); + map_bl_cache.add(e, bl); +} + +void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl) +{ + dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl; + // cache a contiguous buffer + if (bl.get_num_buffers() > 1) { + bl.rebuild(); + } + bl.try_assign_to_mempool(mempool::mempool_osd_mapbl); + map_bl_inc_cache.add(e, bl); +} + +int OSDService::get_deleted_pool_pg_num(int64_t pool) +{ + std::lock_guard l(map_cache_lock); + auto p = deleted_pool_pg_nums.find(pool); + if (p != deleted_pool_pg_nums.end()) { + return p->second; + } + dout(20) << __func__ << " " << pool << " loading" << dendl; + ghobject_t oid = OSD::make_final_pool_info_oid(pool); + bufferlist bl; + int r = store->read(meta_ch, oid, 0, 0, bl); + ceph_assert(r >= 0); + auto blp = bl.cbegin(); + pg_pool_t pi; + ::decode(pi, blp); + deleted_pool_pg_nums[pool] = pi.get_pg_num(); + dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl; + return pi.get_pg_num(); +} + +OSDMapRef OSDService::_add_map(OSDMap *o) +{ + epoch_t e = o->get_epoch(); + + if (cct->_conf->osd_map_dedup) { + // Dedup against an existing map at a nearby epoch + OSDMapRef for_dedup = map_cache.lower_bound(e); + if (for_dedup) { + OSDMap::dedup(for_dedup.get(), o); + } + } + bool existed; + OSDMapRef l = map_cache.add(e, o, &existed); + if (existed) { + delete o; + } + return l; +} + +OSDMapRef OSDService::try_get_map(epoch_t epoch) +{ + std::lock_guard l(map_cache_lock); + OSDMapRef retval = map_cache.lookup(epoch); + if (retval) { + dout(30) << "get_map " << epoch << " -cached" << dendl; + if (logger) { + logger->inc(l_osd_map_cache_hit); + } + return retval; + } + if (logger) { + logger->inc(l_osd_map_cache_miss); + epoch_t lb = map_cache.cached_key_lower_bound(); + if (epoch < lb) { + dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl; + logger->inc(l_osd_map_cache_miss_low); + logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch); + } + } + + OSDMap *map = new OSDMap; + if (epoch > 0) { + dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl; + bufferlist bl; + if (!_get_map_bl(epoch, bl) || bl.length() == 0) { + derr << "failed to 
load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl; + delete map; + return OSDMapRef(); + } + map->decode(bl); + } else { + dout(20) << "get_map " << epoch << " - return initial " << map << dendl; + } + return _add_map(map); +} + +// ops + + +void OSDService::reply_op_error(OpRequestRef op, int err) +{ + reply_op_error(op, err, eversion_t(), 0); +} + +void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v, + version_t uv) +{ + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + int flags; + flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); + + MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true); + reply->set_reply_versions(v, uv); + m->get_connection()->send_message(reply); +} + +void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) +{ + if (!cct->_conf->osd_debug_misdirected_ops) { + return; + } + + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + + ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since); + + if (pg->is_ec_pg()) { + /** + * OSD recomputes op target based on current OSDMap. With an EC pg, we + * can get this result: + * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping + * [CRUSH_ITEM_NONE, 2, 3]/3 + * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping + * [3, 2, 3]/3 + * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary + * -- misdirected op + * 4) client resends and this time PG 3.9s0 having caught up to 513 gets + * it and fulfils it + * + * We can't compute the op target based on the sending map epoch due to + * splitting. The simplest thing is to detect such cases here and drop + * them without an error (the client will resend anyway). + */ + ceph_assert(m->get_map_epoch() <= superblock.newest_map); + OSDMapRef opmap = try_get_map(m->get_map_epoch()); + if (!opmap) { + dout(7) << __func__ << ": " << *pg << " no longer have map for " + << m->get_map_epoch() << ", dropping" << dendl; + return; + } + pg_t _pgid = m->get_raw_pg(); + spg_t pgid; + if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) + _pgid = opmap->raw_pg_to_pg(_pgid); + if (opmap->get_primary_shard(_pgid, &pgid) && + pgid.shard != pg->pg_id.shard) { + dout(7) << __func__ << ": " << *pg << " primary changed since " + << m->get_map_epoch() << ", dropping" << dendl; + return; + } + } + + dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl; + clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid() + << " pg " << m->get_raw_pg() + << " to osd." 
<< whoami + << " not " << pg->get_acting() + << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch(); +} + +void OSDService::enqueue_back(OpQueueItem&& qi) +{ + osd->op_shardedwq.queue(std::move(qi)); +} + +void OSDService::enqueue_front(OpQueueItem&& qi) +{ + osd->op_shardedwq.queue_front(std::move(qi)); +} + +void OSDService::queue_recovery_context( + PG *pg, + GenContext<ThreadPool::TPHandle&> *c) +{ + epoch_t e = get_osdmap_epoch(); + enqueue_back( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>( + new PGRecoveryContext(pg->get_pgid(), c, e)), + cct->_conf->osd_recovery_cost, + cct->_conf->osd_recovery_priority, + ceph_clock_now(), + 0, + e)); +} + +void OSDService::queue_for_snap_trim(PG *pg) +{ + dout(10) << "queueing " << *pg << " for snaptrim" << dendl; + enqueue_back( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>( + new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())), + cct->_conf->osd_snap_trim_cost, + cct->_conf->osd_snap_trim_priority, + ceph_clock_now(), + 0, + pg->get_osdmap_epoch())); +} + +void OSDService::queue_for_scrub(PG *pg, bool with_high_priority) +{ + unsigned scrub_queue_priority = pg->scrubber.priority; + if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) { + scrub_queue_priority = cct->_conf->osd_client_op_priority; + } + const auto epoch = pg->get_osdmap_epoch(); + enqueue_back( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)), + cct->_conf->osd_scrub_cost, + scrub_queue_priority, + ceph_clock_now(), + 0, + epoch)); +} + +void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e) +{ + dout(10) << __func__ << " on " << pgid << " e " << e << dendl; + enqueue_back( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>( + new PGDelete(pgid, e)), + cct->_conf->osd_pg_delete_cost, + cct->_conf->osd_pg_delete_priority, + ceph_clock_now(), + 0, + e)); +} + +bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num) +{ + return osd->try_finish_pg_delete(pg, old_pg_num); +} + +// --- + +void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << pg->pg_id << dendl; + ready_to_merge_source[pg->pg_id.pgid] = version; + assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0); + _send_ready_to_merge(); +} + +void OSDService::set_ready_to_merge_target(PG *pg, + eversion_t version, + epoch_t last_epoch_started, + epoch_t last_epoch_clean) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << pg->pg_id << dendl; + ready_to_merge_target.insert(make_pair(pg->pg_id.pgid, + make_tuple(version, + last_epoch_started, + last_epoch_clean))); + assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0); + _send_ready_to_merge(); +} + +void OSDService::set_not_ready_to_merge_source(pg_t source) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << source << dendl; + not_ready_to_merge_source.insert(source); + assert(ready_to_merge_source.count(source) == 0); + _send_ready_to_merge(); +} + +void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << target << " source " << source << dendl; + not_ready_to_merge_target[target] = source; + assert(ready_to_merge_target.count(target) == 0); + _send_ready_to_merge(); +} + +void OSDService::send_ready_to_merge() +{ + std::lock_guard l(merge_lock); + _send_ready_to_merge(); +} + +void OSDService::_send_ready_to_merge() +{ + dout(20) << __func__ 
+ << " ready_to_merge_source " << ready_to_merge_source + << " not_ready_to_merge_source " << not_ready_to_merge_source + << " ready_to_merge_target " << ready_to_merge_target + << " not_ready_to_merge_target " << not_ready_to_merge_target + << " sent_ready_to_merge_source " << sent_ready_to_merge_source + << dendl; + for (auto src : not_ready_to_merge_source) { + if (sent_ready_to_merge_source.count(src) == 0) { + monc->send_mon_message(new MOSDPGReadyToMerge( + src, + {}, {}, 0, 0, + false, + osdmap->get_epoch())); + sent_ready_to_merge_source.insert(src); + } + } + for (auto p : not_ready_to_merge_target) { + if (sent_ready_to_merge_source.count(p.second) == 0) { + monc->send_mon_message(new MOSDPGReadyToMerge( + p.second, + {}, {}, 0, 0, + false, + osdmap->get_epoch())); + sent_ready_to_merge_source.insert(p.second); + } + } + for (auto src : ready_to_merge_source) { + if (not_ready_to_merge_source.count(src.first) || + not_ready_to_merge_target.count(src.first.get_parent())) { + continue; + } + auto p = ready_to_merge_target.find(src.first.get_parent()); + if (p != ready_to_merge_target.end() && + sent_ready_to_merge_source.count(src.first) == 0) { + monc->send_mon_message(new MOSDPGReadyToMerge( + src.first, // source pgid + src.second, // src version + std::get<0>(p->second), // target version + std::get<1>(p->second), // PG's last_epoch_started + std::get<2>(p->second), // PG's last_epoch_clean + true, + osdmap->get_epoch())); + sent_ready_to_merge_source.insert(src.first); + } + } +} + +void OSDService::clear_ready_to_merge(PG *pg) +{ + std::lock_guard l(merge_lock); + dout(10) << __func__ << " " << pg->pg_id << dendl; + ready_to_merge_source.erase(pg->pg_id.pgid); + ready_to_merge_target.erase(pg->pg_id.pgid); + not_ready_to_merge_source.erase(pg->pg_id.pgid); + not_ready_to_merge_target.erase(pg->pg_id.pgid); + sent_ready_to_merge_source.erase(pg->pg_id.pgid); +} + +void OSDService::clear_sent_ready_to_merge() +{ + std::lock_guard l(merge_lock); + sent_ready_to_merge_source.clear(); +} + +void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap) +{ + std::lock_guard l(merge_lock); + auto i = sent_ready_to_merge_source.begin(); + while (i != sent_ready_to_merge_source.end()) { + if (!osdmap->pg_exists(*i)) { + dout(10) << __func__ << " " << *i << dendl; + i = sent_ready_to_merge_source.erase(i); + } else { + ++i; + } + } +} + +// --- + +void OSDService::_queue_for_recovery( + std::pair<epoch_t, PGRef> p, + uint64_t reserved_pushes) +{ + ceph_assert(recovery_lock.is_locked_by_me()); + enqueue_back( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>( + new PGRecovery( + p.second->get_pgid(), p.first, reserved_pushes)), + cct->_conf->osd_recovery_cost, + cct->_conf->osd_recovery_priority, + ceph_clock_now(), + 0, + p.first)); +} + +// ==================================================================== +// OSD + +#undef dout_prefix +#define dout_prefix *_dout + +// Commands shared between OSD's console and admin console: +namespace ceph { +namespace osd_cmds { + +int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os); + +}} // namespace ceph::osd_cmds + +int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami) +{ + int ret; + + OSDSuperblock sb; + bufferlist sbbl; + ObjectStore::CollectionHandle ch; + + // if we are fed a uuid for this osd, use it. 
+ store->set_fsid(cct->_conf->osd_uuid); + + ret = store->mkfs(); + if (ret) { + derr << "OSD::mkfs: ObjectStore::mkfs failed with error " + << cpp_strerror(ret) << dendl; + goto free_store; + } + + store->set_cache_shards(1); // doesn't matter for mkfs! + + ret = store->mount(); + if (ret) { + derr << "OSD::mkfs: couldn't mount ObjectStore: error " + << cpp_strerror(ret) << dendl; + goto free_store; + } + + ch = store->open_collection(coll_t::meta()); + if (ch) { + ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl); + if (ret < 0) { + derr << "OSD::mkfs: have meta collection but no superblock" << dendl; + goto free_store; + } + /* if we already have superblock, check content of superblock */ + dout(0) << " have superblock" << dendl; + auto p = sbbl.cbegin(); + decode(sb, p); + if (whoami != sb.whoami) { + derr << "provided osd id " << whoami << " != superblock's " << sb.whoami + << dendl; + ret = -EINVAL; + goto umount_store; + } + if (fsid != sb.cluster_fsid) { + derr << "provided cluster fsid " << fsid + << " != superblock's " << sb.cluster_fsid << dendl; + ret = -EINVAL; + goto umount_store; + } + } else { + // create superblock + sb.cluster_fsid = fsid; + sb.osd_fsid = store->get_fsid(); + sb.whoami = whoami; + sb.compat_features = get_osd_initial_compat_set(); + + bufferlist bl; + encode(sb, bl); + + ObjectStore::CollectionHandle ch = store->create_new_collection( + coll_t::meta()); + ObjectStore::Transaction t; + t.create_collection(coll_t::meta(), 0); + t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); + ret = store->queue_transaction(ch, std::move(t)); + if (ret) { + derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: " + << "queue_transaction returned " << cpp_strerror(ret) << dendl; + goto umount_store; + } + } + + ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami); + if (ret) { + derr << "OSD::mkfs: failed to write fsid file: error " + << cpp_strerror(ret) << dendl; + goto umount_store; + } + +umount_store: + if (ch) { + ch.reset(); + } + store->umount(); +free_store: + delete store; + return ret; +} + +int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami) +{ + char val[80]; + int r; + + snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC); + r = store->write_meta("magic", val); + if (r < 0) + return r; + + snprintf(val, sizeof(val), "%d", whoami); + r = store->write_meta("whoami", val); + if (r < 0) + return r; + + cluster_fsid.print(val); + r = store->write_meta("ceph_fsid", val); + if (r < 0) + return r; + + string key = cct->_conf.get_val<string>("key"); + if (key.size()) { + r = store->write_meta("osd_key", key); + if (r < 0) + return r; + } else { + string keyfile = cct->_conf.get_val<string>("keyfile"); + if (!keyfile.empty()) { + bufferlist keybl; + string err; + r = keybl.read_file(keyfile.c_str(), &err); + if (r < 0) { + derr << __func__ << " failed to read keyfile " << keyfile << ": " + << err << ": " << cpp_strerror(r) << dendl; + return r; + } + r = store->write_meta("osd_key", keybl.to_str()); + if (r < 0) + return r; + } + } + + r = store->write_meta("ready", "ready"); + if (r < 0) + return r; + + return 0; +} + +int OSD::peek_meta(ObjectStore *store, + std::string *magic, + uuid_d *cluster_fsid, + uuid_d *osd_fsid, + int *whoami, + int *require_osd_release) +{ + string val; + + int r = store->read_meta("magic", &val); + if (r < 0) + return r; + *magic = val; + + r = store->read_meta("whoami", &val); + if (r < 0) + return r; + *whoami = 
atoi(val.c_str()); + + r = store->read_meta("ceph_fsid", &val); + if (r < 0) + return r; + r = cluster_fsid->parse(val.c_str()); + if (!r) + return -EINVAL; + + r = store->read_meta("fsid", &val); + if (r < 0) { + *osd_fsid = uuid_d(); + } else { + r = osd_fsid->parse(val.c_str()); + if (!r) + return -EINVAL; + } + + r = store->read_meta("require_osd_release", &val); + if (r >= 0) { + *require_osd_release = atoi(val.c_str()); + } + + return 0; +} + + +#undef dout_prefix +#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch()) + +// cons/des + +OSD::OSD(CephContext *cct_, ObjectStore *store_, + int id, + Messenger *internal_messenger, + Messenger *external_messenger, + Messenger *hb_client_front, + Messenger *hb_client_back, + Messenger *hb_front_serverm, + Messenger *hb_back_serverm, + Messenger *osdc_messenger, + MonClient *mc, + const std::string &dev, const std::string &jdev) : + Dispatcher(cct_), + osd_lock("OSD::osd_lock"), + tick_timer(cct, osd_lock), + tick_timer_lock("OSD::tick_timer_lock"), + tick_timer_without_osd_lock(cct, tick_timer_lock), + gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")), + cluster_messenger(internal_messenger), + client_messenger(external_messenger), + objecter_messenger(osdc_messenger), + monc(mc), + mgrc(cct_, client_messenger), + logger(NULL), + recoverystate_perf(NULL), + store(store_), + log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS), + clog(log_client.create_channel()), + whoami(id), + dev_path(dev), journal_path(jdev), + store_is_rotational(store->is_rotational()), + trace_endpoint("0.0.0.0", 0, "osd"), + asok_hook(NULL), + m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>( + "osd_pg_epoch_max_lag_factor")), + osd_compat(get_osd_compat_set()), + osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp", + get_num_op_threads()), + command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1), + session_waiting_lock("OSD::session_waiting_lock"), + osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"), + heartbeat_lock("OSD::heartbeat_lock"), + heartbeat_stop(false), + heartbeat_need_update(true), + hb_front_client_messenger(hb_client_front), + hb_back_client_messenger(hb_client_back), + hb_front_server_messenger(hb_front_serverm), + hb_back_server_messenger(hb_back_serverm), + daily_loadavg(0.0), + heartbeat_thread(this), + heartbeat_dispatcher(this), + op_tracker(cct, cct->_conf->osd_enable_op_tracker, + cct->_conf->osd_num_op_tracker_shard), + test_ops_hook(NULL), + op_queue(get_io_queue()), + op_prio_cutoff(get_io_prio_cut()), + op_shardedwq( + this, + cct->_conf->osd_op_thread_timeout, + cct->_conf->osd_op_thread_suicide_timeout, + &osd_op_tp), + map_lock("OSD::map_lock"), + last_pg_create_epoch(0), + mon_report_lock("OSD::mon_report_lock"), + boot_finisher(cct), + up_thru_wanted(0), + requested_full_first(0), + requested_full_last(0), + command_wq( + this, + cct->_conf->osd_command_thread_timeout, + cct->_conf->osd_command_thread_suicide_timeout, + &command_tp), + service(this) +{ + + if (!gss_ktfile_client.empty()) { + // Assert we can export environment variable + /* + The default client keytab is used, if it is present and readable, + to automatically obtain initial credentials for GSSAPI client + applications. The principal name of the first entry in the client + keytab is used by default when obtaining initial credentials. + 1. The KRB5_CLIENT_KTNAME environment variable. + 2. The default_client_keytab_name profile variable in [libdefaults]. + 3. The hardcoded default, DEFCKTNAME. 
+ */ + const int32_t set_result(setenv("KRB5_CLIENT_KTNAME", + gss_ktfile_client.c_str(), 1)); + ceph_assert(set_result == 0); + } + + monc->set_messenger(client_messenger); + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); + op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size, + cct->_conf->osd_op_history_slow_op_threshold); +#ifdef WITH_BLKIN + std::stringstream ss; + ss << "osd." << whoami; + trace_endpoint.copy_name(ss.str()); +#endif + + // initialize shards + num_shards = get_num_op_shards(); + for (uint32_t i = 0; i < num_shards; i++) { + OSDShard *one_shard = new OSDShard( + i, + cct, + this, + cct->_conf->osd_op_pq_max_tokens_per_priority, + cct->_conf->osd_op_pq_min_cost, + op_queue); + shards.push_back(one_shard); + } +} + +OSD::~OSD() +{ + while (!shards.empty()) { + delete shards.back(); + shards.pop_back(); + } + delete class_handler; + cct->get_perfcounters_collection()->remove(recoverystate_perf); + cct->get_perfcounters_collection()->remove(logger); + delete recoverystate_perf; + delete logger; + delete store; +} + +double OSD::get_tick_interval() const +{ + // vary +/- 5% to avoid scrub scheduling livelocks + constexpr auto delta = 0.05; + return (OSD_TICK_INTERVAL * + ceph::util::generate_random_number(1.0 - delta, 1.0 + delta)); +} + +void cls_initialize(ClassHandler *ch); + +void OSD::handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; + shutdown(); +} + +int OSD::pre_init() +{ + std::lock_guard lock(osd_lock); + if (is_stopping()) + return 0; + + if (store->test_mount_in_use()) { + derr << "OSD::pre_init: object store '" << dev_path << "' is " + << "currently in use. 
(Is ceph-osd already running?)" << dendl; + return -EBUSY; + } + + cct->_conf.add_observer(this); + return 0; +} + +int OSD::set_numa_affinity() +{ + // storage numa node + int store_node = -1; + store->get_numa_node(&store_node, nullptr, nullptr); + if (store_node >= 0) { + dout(1) << __func__ << " storage numa node " << store_node << dendl; + } + + // check network numa node(s) + int front_node = -1, back_node = -1; + string front_iface = pick_iface( + cct, + client_messenger->get_myaddrs().front().get_sockaddr_storage()); + string back_iface = pick_iface( + cct, + cluster_messenger->get_myaddrs().front().get_sockaddr_storage()); + int r = get_iface_numa_node(front_iface, &front_node); + if (r >= 0 && front_node >= 0) { + dout(1) << __func__ << " public network " << front_iface << " numa node " + << front_node << dendl; + r = get_iface_numa_node(back_iface, &back_node); + if (r >= 0 && back_node >= 0) { + dout(1) << __func__ << " cluster network " << back_iface << " numa node " + << back_node << dendl; + if (front_node == back_node && + front_node == store_node) { + dout(1) << " objectstore and network numa nodes all match" << dendl; + if (g_conf().get_val<bool>("osd_numa_auto_affinity")) { + numa_node = front_node; + } + } else if (front_node != back_node) { + dout(1) << __func__ << " public and cluster network numa nodes do not match" + << dendl; + } else { + dout(1) << __func__ << " objectstore and network numa nodes do not match" + << dendl; + } + } else if (back_node == -2) { + dout(1) << __func__ << " cluster network " << back_iface + << " ports numa nodes do not match" << dendl; + } else { + derr << __func__ << " unable to identify cluster interface '" << back_iface + << "' numa node: " << cpp_strerror(r) << dendl; + } + } else if (front_node == -2) { + dout(1) << __func__ << " public network " << front_iface + << " ports numa nodes do not match" << dendl; + } else { + derr << __func__ << " unable to identify public interface '" << front_iface + << "' numa node: " << cpp_strerror(r) << dendl; + } + if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) { + // this takes precedence over the automagic logic above + numa_node = node; + } + if (numa_node >= 0) { + int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set); + if (r < 0) { + dout(1) << __func__ << " unable to determine numa node " << numa_node + << " CPUs" << dendl; + numa_node = -1; + } else { + dout(1) << __func__ << " setting numa affinity to node " << numa_node + << " cpus " + << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set) + << dendl; + r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r) + << dendl; + numa_node = -1; + } + } + } else { + dout(1) << __func__ << " not setting numa affinity" << dendl; + } + return 0; +} + +// asok + +class OSDSocketHook : public AdminSocketHook { + OSD *osd; +public: + explicit OSDSocketHook(OSD *o) : osd(o) {} + bool call(std::string_view admin_command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + stringstream ss; + bool r = true; + try { + r = osd->asok_command(admin_command, cmdmap, format, ss); + } catch (const bad_cmd_get& e) { + ss << e.what(); + r = true; + } + out.append(ss); + return r; + } +}; + +std::set<int64_t> OSD::get_mapped_pools() +{ + std::set<int64_t> pools; + std::vector<spg_t> pgids; + _get_pgids(&pgids); + for (const auto &pgid : pgids) { + pools.insert(pgid.pool()); + } 
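+ // 'pools' now holds each distinct pool id with at least one PG instantiated
+ // on this OSD; it backs the 'get_mapped_pools' admin socket command below.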
+ return pools; +} + +bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap, + std::string_view format, ostream& ss) +{ + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); + if (admin_command == "status") { + f->open_object_section("status"); + f->dump_stream("cluster_fsid") << superblock.cluster_fsid; + f->dump_stream("osd_fsid") << superblock.osd_fsid; + f->dump_unsigned("whoami", superblock.whoami); + f->dump_string("state", get_state_name(get_state())); + f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_unsigned("newest_map", superblock.newest_map); + f->dump_unsigned("num_pgs", num_pgs); + f->close_section(); + } else if (admin_command == "flush_journal") { + store->flush_journal(); + } else if (admin_command == "dump_ops_in_flight" || + admin_command == "ops" || + admin_command == "dump_blocked_ops" || + admin_command == "dump_historic_ops" || + admin_command == "dump_historic_ops_by_duration" || + admin_command == "dump_historic_slow_ops") { + + const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \ +even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \ +will start to track new ops received afterwards."; + + set<string> filters; + vector<string> filter_str; + if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) { + copy(filter_str.begin(), filter_str.end(), + inserter(filters, filters.end())); + } + + if (admin_command == "dump_ops_in_flight" || + admin_command == "ops") { + if (!op_tracker.dump_ops_in_flight(f, false, filters)) { + ss << error_str; + } + } + if (admin_command == "dump_blocked_ops") { + if (!op_tracker.dump_ops_in_flight(f, true, filters)) { + ss << error_str; + } + } + if (admin_command == "dump_historic_ops") { + if (!op_tracker.dump_historic_ops(f, false, filters)) { + ss << error_str; + } + } + if (admin_command == "dump_historic_ops_by_duration") { + if (!op_tracker.dump_historic_ops(f, true, filters)) { + ss << error_str; + } + } + if (admin_command == "dump_historic_slow_ops") { + if (!op_tracker.dump_historic_slow_ops(f, filters)) { + ss << error_str; + } + } + } else if (admin_command == "dump_op_pq_state") { + f->open_object_section("pq"); + op_shardedwq.dump(f); + f->close_section(); + } else if (admin_command == "dump_blacklist") { + list<pair<entity_addr_t,utime_t> > bl; + OSDMapRef curmap = service.get_osdmap(); + + f->open_array_section("blacklist"); + curmap->get_blacklist(&bl); + for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin(); + it != bl.end(); ++it) { + f->open_object_section("entry"); + f->open_object_section("entity_addr_t"); + it->first.dump(f); + f->close_section(); //entity_addr_t + it->second.localtime(f->dump_stream("expire_time")); + f->close_section(); //entry + } + f->close_section(); //blacklist + } else if (admin_command == "dump_watchers") { + list<obj_watch_item_t> watchers; + // scan pg's + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + list<obj_watch_item_t> pg_watchers; + pg->get_watchers(&pg_watchers); + watchers.splice(watchers.end(), pg_watchers); + } + + f->open_array_section("watchers"); + for (list<obj_watch_item_t>::iterator it = watchers.begin(); + it != watchers.end(); ++it) { + + f->open_object_section("watch"); + + f->dump_string("namespace", it->obj.nspace); + f->dump_string("object", it->obj.oid.name); + + f->open_object_section("entity_name"); + it->wi.name.dump(f); + f->close_section(); //entity_name_t + + f->dump_unsigned("cookie", it->wi.cookie); 
+ f->dump_unsigned("timeout", it->wi.timeout_seconds); + + f->open_object_section("entity_addr_t"); + it->wi.addr.dump(f); + f->close_section(); //entity_addr_t + + f->close_section(); //watch + } + + f->close_section(); //watchers + } else if (admin_command == "dump_recovery_reservations") { + f->open_object_section("reservations"); + f->open_object_section("local_reservations"); + service.local_reserver.dump(f); + f->close_section(); + f->open_object_section("remote_reservations"); + service.remote_reserver.dump(f); + f->close_section(); + f->close_section(); + } else if (admin_command == "dump_scrub_reservations") { + f->open_object_section("scrub_reservations"); + service.dump_scrub_reservations(f); + f->close_section(); + } else if (admin_command == "get_latest_osdmap") { + get_latest_osdmap(); + } else if (admin_command == "heap") { + auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss); + + // Note: Failed heap profile commands won't necessarily trigger an error: + f->open_object_section("result"); + f->dump_string("error", cpp_strerror(result)); + f->dump_bool("success", result >= 0); + f->close_section(); + } else if (admin_command == "set_heap_property") { + string property; + int64_t value = 0; + string error; + bool success = false; + if (!cmd_getval(cct, cmdmap, "property", property)) { + error = "unable to get property"; + success = false; + } else if (!cmd_getval(cct, cmdmap, "value", value)) { + error = "unable to get value"; + success = false; + } else if (value < 0) { + error = "negative value not allowed"; + success = false; + } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) { + error = "invalid property"; + success = false; + } else { + success = true; + } + f->open_object_section("result"); + f->dump_string("error", error); + f->dump_bool("success", success); + f->close_section(); + } else if (admin_command == "get_heap_property") { + string property; + size_t value = 0; + string error; + bool success = false; + if (!cmd_getval(cct, cmdmap, "property", property)) { + error = "unable to get property"; + success = false; + } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) { + error = "invalid property"; + success = false; + } else { + success = true; + } + f->open_object_section("result"); + f->dump_string("error", error); + f->dump_bool("success", success); + f->dump_int("value", value); + f->close_section(); + } else if (admin_command == "dump_objectstore_kv_stats") { + store->get_db_statistics(f); + } else if (admin_command == "dump_scrubs") { + service.dumps_scrub(f); + } else if (admin_command == "calc_objectstore_db_histogram") { + store->generate_db_histogram(f); + } else if (admin_command == "flush_store_cache") { + store->flush_cache(&ss); + } else if (admin_command == "dump_pgstate_history") { + f->open_object_section("pgstate_history"); + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + f->dump_stream("pg") << pg->pg_id; + pg->dump_pgstate_history(f); + } + f->close_section(); + } else if (admin_command == "compact") { + dout(1) << "triggering manual compaction" << dendl; + auto start = ceph::coarse_mono_clock::now(); + store->compact(); + auto end = ceph::coarse_mono_clock::now(); + double duration = std::chrono::duration<double>(end-start).count(); + dout(1) << "finished manual compaction in " + << duration + << " seconds" << dendl; + f->open_object_section("compact_result"); + f->dump_float("elapsed_time", duration); + f->close_section(); + } else if (admin_command == "get_mapped_pools") { + 
f->open_array_section("mapped_pools"); + set<int64_t> poollist = get_mapped_pools(); + for (auto pool : poollist) { + f->dump_int("pool_id", pool); + } + f->close_section(); + } else if (admin_command == "smart") { + string devid; + cmd_getval(cct, cmdmap, "devid", devid); + probe_smart(devid, ss); + } else if (admin_command == "list_devices") { + set<string> devnames; + store->get_devices(&devnames); + f->open_object_section("list_devices"); + for (auto dev : devnames) { + if (dev.find("dm-") == 0) { + continue; + } + f->dump_string("device", "/dev/" + dev); + } + f->close_section(); + } else if (admin_command == "send_beacon") { + if (is_active()) { + send_beacon(ceph::coarse_mono_clock::now()); + } + } else if (admin_command == "dump_osd_network") { + int64_t value = 0; + if (!(cmd_getval(cct, cmdmap, "value", value))) { + // Convert milliseconds to microseconds + value = static_cast<int64_t>(g_conf().get_val<double>("mon_warn_on_slow_ping_time")) * 1000; + if (value == 0) { + double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio"); + value = g_conf().get_val<int64_t>("osd_heartbeat_grace"); + value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio + } + } else { + // Convert user input to microseconds + value *= 1000; + } + if (value < 0) value = 0; + + struct osd_ping_time_t { + uint32_t pingtime; + int to; + bool back; + std::array<uint32_t,3> times; + std::array<uint32_t,3> min; + std::array<uint32_t,3> max; + uint32_t last; + uint32_t last_update; + + bool operator<(const osd_ping_time_t& rhs) const { + if (pingtime < rhs.pingtime) + return true; + if (pingtime > rhs.pingtime) + return false; + if (to < rhs.to) + return true; + if (to > rhs.to) + return false; + return back; + } + }; + + set<osd_ping_time_t> sorted; + // Get pingtimes under lock and not on the stack + map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>; + service.get_hb_pingtime(pingtimes); + for (auto j : *pingtimes) { + if (j.second.last_update == 0) + continue; + osd_ping_time_t item; + item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]); + if (item.pingtime >= value) { + item.to = j.first; + item.times[0] = j.second.back_pingtime[0]; + item.times[1] = j.second.back_pingtime[1]; + item.times[2] = j.second.back_pingtime[2]; + item.min[0] = j.second.back_min[0]; + item.min[1] = j.second.back_min[1]; + item.min[2] = j.second.back_min[2]; + item.max[0] = j.second.back_max[0]; + item.max[1] = j.second.back_max[1]; + item.max[2] = j.second.back_max[2]; + item.last = j.second.back_last; + item.back = true; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + if (j.second.front_last == 0) + continue; + item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]); + if (item.pingtime >= value) { + item.to = j.first; + item.times[0] = j.second.front_pingtime[0]; + item.times[1] = j.second.front_pingtime[1]; + item.times[2] = j.second.front_pingtime[2]; + item.min[0] = j.second.front_min[0]; + item.min[1] = j.second.front_min[1]; + item.min[2] = j.second.front_min[2]; + item.max[0] = j.second.front_max[0]; + item.max[1] = j.second.front_max[1]; + item.max[2] = j.second.front_max[2]; + item.last = j.second.front_last; + item.last_update = j.second.last_update; + item.back = false; + sorted.emplace(item); + } + } + delete pingtimes; + // + // Network ping times 
(1min 5min 15min) + f->open_object_section("network_ping_times"); + f->dump_int("threshold", value / 1000); + f->open_array_section("entries"); + for (auto &sitem : boost::adaptors::reverse(sorted)) { + ceph_assert(sitem.pingtime >= value); + f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); + f->dump_int("from osd", whoami); + f->dump_int("to osd", sitem.to); + f->dump_string("interface", (sitem.back ? "back" : "front")); + f->open_object_section("average"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str()); + f->close_section(); // average + f->open_object_section("min"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); + f->close_section(); // min + f->open_object_section("max"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); + f->close_section(); // max + f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str()); + f->close_section(); // entry + } + f->close_section(); // entries + f->close_section(); // network_ping_times + } else { + ceph_abort_msg("broken asok registration"); + } + f->flush(ss); + delete f; + return true; +} + +class TestOpsSocketHook : public AdminSocketHook { + OSDService *service; + ObjectStore *store; +public: + TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {} + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + stringstream ss; + try { + test_ops(service, store, command, cmdmap, ss); + } catch (const bad_cmd_get& e) { + ss << e.what(); + } + out.append(ss); + return true; + } + void test_ops(OSDService *service, ObjectStore *store, + std::string_view command, const cmdmap_t& cmdmap, ostream &ss); + +}; + +class OSD::C_Tick : public Context { + OSD *osd; + public: + explicit C_Tick(OSD *o) : osd(o) {} + void finish(int r) override { + osd->tick(); + } +}; + +class OSD::C_Tick_WithoutOSDLock : public Context { + OSD *osd; + public: + explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {} + void finish(int r) override { + osd->tick_without_osd_lock(); + } +}; + +int OSD::enable_disable_fuse(bool stop) +{ +#ifdef HAVE_LIBFUSE + int r; + string mntpath = cct->_conf->osd_data + "/fuse"; + if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) { + dout(1) << __func__ << " disabling" << dendl; + fuse_store->stop(); + delete fuse_store; + fuse_store = NULL; + r = ::rmdir(mntpath.c_str()); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to rmdir " << mntpath << ": " + << cpp_strerror(r) << dendl; + return r; + } + return 0; + } + if (!fuse_store && cct->_conf->osd_objectstore_fuse) { + dout(1) << __func__ << " enabling" << dendl; + r = 
::mkdir(mntpath.c_str(), 0700); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " unable to create " << mntpath << ": " + << cpp_strerror(r) << dendl; + return r; + } + fuse_store = new FuseStore(store, mntpath); + r = fuse_store->start(); + if (r < 0) { + derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl; + delete fuse_store; + fuse_store = NULL; + return r; + } + } +#endif // HAVE_LIBFUSE + return 0; +} + +int OSD::get_num_op_shards() +{ + if (cct->_conf->osd_op_num_shards) + return cct->_conf->osd_op_num_shards; + if (store_is_rotational) + return cct->_conf->osd_op_num_shards_hdd; + else + return cct->_conf->osd_op_num_shards_ssd; +} + +int OSD::get_num_op_threads() +{ + if (cct->_conf->osd_op_num_threads_per_shard) + return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard; + if (store_is_rotational) + return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd; + else + return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd; +} + +float OSD::get_osd_recovery_sleep() +{ + if (cct->_conf->osd_recovery_sleep) + return cct->_conf->osd_recovery_sleep; + if (!store_is_rotational && !journal_is_rotational) + return cct->_conf->osd_recovery_sleep_ssd; + else if (store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid"); + else + return cct->_conf->osd_recovery_sleep_hdd; +} + +float OSD::get_osd_delete_sleep() +{ + float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep"); + if (osd_delete_sleep > 0) + return osd_delete_sleep; + if (!store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val<double>("osd_delete_sleep_ssd"); + if (store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val<double>("osd_delete_sleep_hybrid"); + return cct->_conf.get_val<double>("osd_delete_sleep_hdd"); +} + +float OSD::get_osd_snap_trim_sleep() +{ + float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep"); + if (osd_snap_trim_sleep > 0) + return osd_snap_trim_sleep; + if (!store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd"); + if (store_is_rotational && !journal_is_rotational) + return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid"); + return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd"); +} + +int OSD::init() +{ + OSDMapRef osdmap; + CompatSet initial, diff; + std::lock_guard lock(osd_lock); + if (is_stopping()) + return 0; + + tick_timer.init(); + tick_timer_without_osd_lock.init(); + service.recovery_request_timer.init(); + service.sleep_timer.init(); + + boot_finisher.start(); + + { + string val; + store->read_meta("require_osd_release", &val); + last_require_osd_release = atoi(val.c_str()); + } + + // mount. + dout(2) << "init " << dev_path + << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")" + << dendl; + dout(2) << "journal " << journal_path << dendl; + ceph_assert(store); // call pre_init() first! + + store->set_cache_shards(get_num_op_shards()); + + int r = store->mount(); + if (r < 0) { + derr << "OSD:init: unable to mount object store" << dendl; + return r; + } + journal_is_rotational = store->is_journal_rotational(); + dout(2) << "journal looks like " << (journal_is_rotational ? 
"hdd" : "ssd") + << dendl; + + enable_disable_fuse(false); + + dout(2) << "boot" << dendl; + + service.meta_ch = store->open_collection(coll_t::meta()); + + // initialize the daily loadavg with current 15min loadavg + double loadavgs[3]; + if (getloadavg(loadavgs, 3) == 3) { + daily_loadavg = loadavgs[2]; + } else { + derr << "OSD::init() : couldn't read loadavgs\n" << dendl; + daily_loadavg = 1.0; + } + + int rotating_auth_attempts = 0; + auto rotating_auth_timeout = + g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout"); + + // sanity check long object name handling + { + hobject_t l; + l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n'); + l.set_key(string(cct->_conf->osd_max_object_name_len, 'k')); + l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's'); + r = store->validate_hobject_key(l); + if (r < 0) { + derr << "backend (" << store->get_type() << ") is unable to support max " + << "object name[space] len" << dendl; + derr << " osd max object name len = " + << cct->_conf->osd_max_object_name_len << dendl; + derr << " osd max object namespace len = " + << cct->_conf->osd_max_object_namespace_len << dendl; + derr << cpp_strerror(r) << dendl; + if (cct->_conf->osd_check_max_object_name_len_on_startup) { + goto out; + } + derr << "osd_check_max_object_name_len_on_startup = false, starting anyway" + << dendl; + } else { + dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl; + } + } + + // read superblock + r = read_superblock(); + if (r < 0) { + derr << "OSD::init() : unable to read osd superblock" << dendl; + r = -EINVAL; + goto out; + } + + if (osd_compat.compare(superblock.compat_features) < 0) { + derr << "The disk uses features unsupported by the executable." << dendl; + derr << " ondisk features " << superblock.compat_features << dendl; + derr << " daemon features " << osd_compat << dendl; + + if (osd_compat.writeable(superblock.compat_features)) { + CompatSet diff = osd_compat.unsupported(superblock.compat_features); + derr << "it is still writeable, though. Missing features: " << diff << dendl; + r = -EOPNOTSUPP; + goto out; + } + else { + CompatSet diff = osd_compat.unsupported(superblock.compat_features); + derr << "Cannot write to disk! Missing features: " << diff << dendl; + r = -EOPNOTSUPP; + goto out; + } + } + + assert_warn(whoami == superblock.whoami); + if (whoami != superblock.whoami) { + derr << "OSD::init: superblock says osd" + << superblock.whoami << " but I am osd." 
<< whoami << dendl; + r = -EINVAL; + goto out; + } + + // load up "current" osdmap + assert_warn(!get_osdmap()); + if (get_osdmap()) { + derr << "OSD::init: unable to read current osdmap" << dendl; + r = -EINVAL; + goto out; + } + osdmap = get_map(superblock.current_epoch); + set_osdmap(osdmap); + + // make sure we don't have legacy pgs deleting + { + vector<coll_t> ls; + int r = store->list_collections(ls); + ceph_assert(r >= 0); + for (auto c : ls) { + spg_t pgid; + if (c.is_pg(&pgid) && + !osdmap->have_pg_pool(pgid.pool())) { + ghobject_t oid = make_final_pool_info_oid(pgid.pool()); + if (!store->exists(service.meta_ch, oid)) { + derr << __func__ << " missing pg_pool_t for deleted pool " + << pgid.pool() << " for pg " << pgid + << "; please downgrade to luminous and allow " + << "pg deletion to complete before upgrading" << dendl; + ceph_abort(); + } + } + } + } + + initial = get_osd_initial_compat_set(); + diff = superblock.compat_features.unsupported(initial); + if (superblock.compat_features.merge(initial)) { + // We need to persist the new compat_set before we + // do anything else + dout(5) << "Upgrading superblock adding: " << diff << dendl; + ObjectStore::Transaction t; + write_superblock(t); + r = store->queue_transaction(service.meta_ch, std::move(t)); + if (r < 0) + goto out; + } + + // make sure snap mapper object exists + if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) { + dout(10) << "init creating/touching snapmapper object" << dendl; + ObjectStore::Transaction t; + t.touch(coll_t::meta(), OSD::make_snapmapper_oid()); + r = store->queue_transaction(service.meta_ch, std::move(t)); + if (r < 0) + goto out; + } + + class_handler = new ClassHandler(cct); + cls_initialize(class_handler); + + if (cct->_conf->osd_open_classes_on_start) { + int r = class_handler->open_all_classes(); + if (r) + dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl; + } + + check_osdmap_features(); + + create_recoverystate_perf(); + + { + epoch_t bind_epoch = osdmap->get_epoch(); + service.set_epochs(NULL, NULL, &bind_epoch); + } + + clear_temp_objects(); + + // initialize osdmap references in sharded wq + for (auto& shard : shards) { + std::lock_guard l(shard->osdmap_lock); + shard->shard_osdmap = osdmap; + } + + // load up pgs (as they previously existed) + load_pgs(); + + dout(2) << "superblock: I am osd." << superblock.whoami << dendl; + dout(0) << "using " << op_queue << " op queue with priority op cut off at " << + op_prio_cutoff << "." << dendl; + + create_logger(); + + // prime osd stats + { + struct store_statfs_t stbuf; + osd_alert_list_t alerts; + int r = store->statfs(&stbuf, &alerts); + ceph_assert(r == 0); + service.set_statfs(stbuf, alerts); + } + + // client_messenger auth_client is already set up by monc. 
+ for (auto m : { cluster_messenger, + objecter_messenger, + hb_front_client_messenger, + hb_back_client_messenger, + hb_front_server_messenger, + hb_back_server_messenger } ) { + m->set_auth_client(monc); + } + for (auto m : { client_messenger, + cluster_messenger, + hb_front_server_messenger, + hb_back_server_messenger }) { + m->set_auth_server(monc); + } + monc->set_handle_authentication_dispatcher(this); + + monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD + | CEPH_ENTITY_TYPE_MGR); + r = monc->init(); + if (r < 0) + goto out; + + mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); }); + mgrc.set_perf_metric_query_cb( + [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) { + set_perf_queries(queries); + }, + [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) { + get_perf_reports(reports); + }); + mgrc.init(); + + // tell monc about log_client so it will know about mon session resets + monc->set_log_client(&log_client); + update_log_config(); + + // i'm ready! + client_messenger->add_dispatcher_tail(&mgrc); + client_messenger->add_dispatcher_tail(this); + cluster_messenger->add_dispatcher_head(this); + + hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); + + objecter_messenger->add_dispatcher_head(service.objecter); + + service.init(); + service.publish_map(osdmap); + service.publish_superblock(superblock); + service.max_oldest_map = superblock.oldest_map; + + for (auto& shard : shards) { + // put PGs in a temporary set because we may modify pg_slots + // unordered_map below. 
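The comment above flags a classic iterator-invalidation hazard: the loop that follows may add entries to the shard's pg_slots while it runs, so the PGs are first copied into a separate container and only then processed. A minimal, self-contained sketch of that snapshot-before-mutate pattern (Slot and the function name are illustrative stand-ins, not Ceph types):

#include <memory>
#include <set>
#include <unordered_map>

struct Slot { int pg = 0; };  // stand-in for OSDShardPGSlot

void process_all(std::unordered_map<int, std::unique_ptr<Slot>>& slots)
{
  // Snapshot the values first: the work loop below may insert new slots,
  // which can rehash the map and invalidate the iterators being walked.
  std::set<Slot*> pgs;
  for (auto& [id, slot] : slots) {
    if (slot->pg)
      pgs.insert(slot.get());
  }
  for (Slot* s : pgs) {
    (void)s;  // ... work that is free to call slots.emplace(...) ...
  }
}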
+ set<PGRef> pgs; + for (auto& i : shard->pg_slots) { + PGRef pg = i.second->pg; + if (!pg) { + continue; + } + pgs.insert(pg); + } + for (auto pg : pgs) { + pg->lock(); + set<pair<spg_t,epoch_t>> new_children; + set<pair<spg_t,epoch_t>> merge_pgs; + service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id, + &new_children, &merge_pgs); + if (!new_children.empty()) { + for (auto shard : shards) { + shard->prime_splits(osdmap, &new_children); + } + assert(new_children.empty()); + } + if (!merge_pgs.empty()) { + for (auto shard : shards) { + shard->prime_merges(osdmap, &merge_pgs); + } + assert(merge_pgs.empty()); + } + pg->unlock(); + } + } + + osd_op_tp.start(); + command_tp.start(); + + // start the heartbeat + heartbeat_thread.create("osd_srv_heartbt"); + + // tick + tick_timer.add_event_after(get_tick_interval(), + new C_Tick(this)); + { + std::lock_guard l(tick_timer_lock); + tick_timer_without_osd_lock.add_event_after(get_tick_interval(), + new C_Tick_WithoutOSDLock(this)); + } + + osd_lock.Unlock(); + + r = monc->authenticate(); + if (r < 0) { + derr << __func__ << " authentication failed: " << cpp_strerror(r) + << dendl; + exit(1); + } + + while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) { + derr << "unable to obtain rotating service keys; retrying" << dendl; + ++rotating_auth_attempts; + if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) { + derr << __func__ << " wait_auth_rotating timed out" << dendl; + exit(1); + } + } + + r = update_crush_device_class(); + if (r < 0) { + derr << __func__ << " unable to update_crush_device_class: " + << cpp_strerror(r) << dendl; + exit(1); + } + + r = update_crush_location(); + if (r < 0) { + derr << __func__ << " unable to update_crush_location: " + << cpp_strerror(r) << dendl; + exit(1); + } + + osd_lock.Lock(); + if (is_stopping()) + return 0; + + // start objecter *after* we have authenticated, so that we don't ignore + // the OSDMaps it requests. 
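Just above, init() drops osd_lock, authenticates with the monitors, and then waits for rotating service keys in a bounded retry loop, exiting the daemon once max_rotating_auth_attempts is exceeded. The shape of that loop in isolation (wait_for_keys and the limit are placeholders, not the MonClient API):

#include <functional>
#include <iostream>

// Returns true once keys are available, false after too many failed waits.
bool wait_with_retries(const std::function<int()>& wait_for_keys,
                       int max_attempts)
{
  int attempts = 0;
  while (wait_for_keys() < 0) {
    std::cerr << "unable to obtain rotating service keys; retrying\n";
    if (++attempts > max_attempts)
      return false;  // caller decides how to fail; the OSD calls exit(1)
  }
  return true;
}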
+ service.final_init(); + + check_config(); + + dout(10) << "ensuring pgs have consumed prior maps" << dendl; + consume_map(); + + dout(0) << "done with init, starting boot process" << dendl; + + // subscribe to any pg creations + monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); + + // MgrClient needs this (it doesn't have MonClient reference itself) + monc->sub_want("mgrmap", 0, 0); + + // we don't need to ask for an osdmap here; objecter will + //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME); + + monc->renew_subs(); + + start_boot(); + + return 0; + +out: + enable_disable_fuse(true); + store->umount(); + delete store; + store = NULL; + return r; +} + +void OSD::final_init() +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + asok_hook = new OSDSocketHook(this); + int r = admin_socket->register_command("status", "status", asok_hook, + "high-level status of OSD"); + ceph_assert(r == 0); + r = admin_socket->register_command("flush_journal", "flush_journal", + asok_hook, + "flush the journal to permanent store"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_ops_in_flight", + "dump_ops_in_flight " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show the ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("ops", + "ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show the ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_blocked_ops", + "dump_blocked_ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show the blocked ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_ops", + "dump_historic_ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show recent ops"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_slow_ops", + "dump_historic_slow_ops " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show slowest recent ops"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_ops_by_duration", + "dump_historic_ops_by_duration " \ + "name=filterstr,type=CephString,n=N,req=false", + asok_hook, + "show slowest recent ops, sorted by duration"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state", + asok_hook, + "dump op priority queue state"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_blacklist", "dump_blacklist", + asok_hook, + "dump blacklisted clients and times"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_watchers", "dump_watchers", + asok_hook, + "show clients which have active watches," + " and on which objects"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_recovery_reservations", "dump_recovery_reservations", + asok_hook, + "show recovery reservations"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_scrub_reservations", "dump_scrub_reservations", + asok_hook, + "show scrub reservations"); + ceph_assert(r == 0); + r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap", + asok_hook, + "force osd to update the latest map from " + "the mon"); + ceph_assert(r == 0); + + r = admin_socket->register_command( "heap", + "heap " \ + "name=heapcmd,type=CephString " \ + "name=value,type=CephString,req=false", + asok_hook, + "show heap usage info (available only if " + "compiled with tcmalloc)"); + ceph_assert(r 
== 0); + + r = admin_socket->register_command("set_heap_property", + "set_heap_property " \ + "name=property,type=CephString " \ + "name=value,type=CephInt", + asok_hook, + "update malloc extension heap property"); + ceph_assert(r == 0); + + r = admin_socket->register_command("get_heap_property", + "get_heap_property " \ + "name=property,type=CephString", + asok_hook, + "get malloc extension heap property"); + ceph_assert(r == 0); + + r = admin_socket->register_command("dump_objectstore_kv_stats", + "dump_objectstore_kv_stats", + asok_hook, + "print statistics of kvdb which used by bluestore"); + ceph_assert(r == 0); + + r = admin_socket->register_command("dump_scrubs", + "dump_scrubs", + asok_hook, + "print scheduled scrubs"); + ceph_assert(r == 0); + + r = admin_socket->register_command("calc_objectstore_db_histogram", + "calc_objectstore_db_histogram", + asok_hook, + "Generate key value histogram of kvdb(rocksdb) which used by bluestore"); + ceph_assert(r == 0); + + r = admin_socket->register_command("flush_store_cache", + "flush_store_cache", + asok_hook, + "Flush bluestore internal cache"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history", + asok_hook, + "show recent state history"); + ceph_assert(r == 0); + + r = admin_socket->register_command("compact", "compact", + asok_hook, + "Compact object store's omap." + " WARNING: Compaction probably slows your requests"); + ceph_assert(r == 0); + + r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools", + asok_hook, + "dump pools whose PG(s) are mapped to this OSD."); + + ceph_assert(r == 0); + + r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False", + asok_hook, + "probe OSD devices for SMART data."); + + ceph_assert(r == 0); + + r = admin_socket->register_command("list_devices", "list_devices", + asok_hook, + "list OSD devices."); + r = admin_socket->register_command("send_beacon", "send_beacon", + asok_hook, + "send OSD beacon to mon immediately"); + + r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook, + "Dump osd heartbeat network ping times"); + ceph_assert(r == 0); + + test_ops_hook = new TestOpsSocketHook(&(this->service), this->store); + // Note: pools are CephString instead of CephPoolname because + // these commands traditionally support both pool names and numbers + r = admin_socket->register_command( + "setomapval", + "setomapval " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=key,type=CephString "\ + "name=val,type=CephString", + test_ops_hook, + "set omap key"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "rmomapkey", + "rmomapkey " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=key,type=CephString", + test_ops_hook, + "remove omap key"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "setomapheader", + "setomapheader " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=header,type=CephString", + test_ops_hook, + "set omap header"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "getomap", + "getomap " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname", + test_ops_hook, + "output entire object map"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "truncobj", + "truncobj " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + 
"name=len,type=CephInt", + test_ops_hook, + "truncate object to length"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "injectdataerr", + "injectdataerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=false,range=0|255", + test_ops_hook, + "inject data error to an object"); + ceph_assert(r == 0); + + r = admin_socket->register_command( + "injectmdataerr", + "injectmdataerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=false,range=0|255", + test_ops_hook, + "inject metadata error to an object"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "set_recovery_delay", + "set_recovery_delay " \ + "name=utime,type=CephInt,req=false", + test_ops_hook, + "Delay osd recovery by specified seconds"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "trigger_scrub", + "trigger_scrub " \ + "name=pgid,type=CephString " \ + "name=time,type=CephInt,req=false", + test_ops_hook, + "Trigger a scheduled scrub "); + ceph_assert(r == 0); + r = admin_socket->register_command( + "trigger_deep_scrub", + "trigger_deep_scrub " \ + "name=pgid,type=CephString " \ + "name=time,type=CephInt,req=false", + test_ops_hook, + "Trigger a scheduled deep scrub "); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectfull", + "injectfull " \ + "name=type,type=CephString,req=false " \ + "name=count,type=CephInt,req=false ", + test_ops_hook, + "Inject a full disk (optional count times)"); + ceph_assert(r == 0); +} + +void OSD::create_logger() +{ + dout(10) << "create_logger" << dendl; + + PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last); + + // Latency axis configuration for op histograms, values are in nanoseconds + PerfHistogramCommon::axis_config_d op_hist_x_axis_config{ + "Latency (usec)", + PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale + 0, ///< Start at 0 + 100000, ///< Quantization unit is 100usec + 32, ///< Enough to cover much longer than slow requests + }; + + // Op size axis configuration for op histograms, values are in bytes + PerfHistogramCommon::axis_config_d op_hist_y_axis_config{ + "Request size (bytes)", + PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale + 0, ///< Start at 0 + 512, ///< Quantization unit is 512 bytes + 32, ///< Enough to cover requests larger than GB + }; + + + // All the basic OSD operation stats are to be considered useful + osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + osd_plb.add_u64( + l_osd_op_wip, "op_wip", + "Replication operations currently being processed (primary)"); + osd_plb.add_u64_counter( + l_osd_op, "op", + "Client operations", + "ops", PerfCountersBuilder::PRIO_CRITICAL); + osd_plb.add_u64_counter( + l_osd_op_inb, "op_in_bytes", + "Client operations total write size", + "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); + osd_plb.add_u64_counter( + l_osd_op_outb, "op_out_bytes", + "Client operations total read size", + "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_op_lat, "op_latency", + "Latency of client operations (including queue time)", + "l", 9); + osd_plb.add_time_avg( + l_osd_op_process_lat, "op_process_latency", + "Latency of client operations (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_prepare_lat, "op_prepare_latency", + "Latency of client operations (excluding queue time and wait for finished)"); + + 
osd_plb.add_u64_counter( + l_osd_op_r, "op_r", "Client read operations"); + osd_plb.add_u64_counter( + l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_op_r_lat, "op_r_latency", + "Latency of read operation (including queue time)"); + osd_plb.add_u64_counter_histogram( + l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of operation latency (including queue time) + data read"); + osd_plb.add_time_avg( + l_osd_op_r_process_lat, "op_r_process_latency", + "Latency of read operation (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_r_prepare_lat, "op_r_prepare_latency", + "Latency of read operations (excluding queue time and wait for finished)"); + osd_plb.add_u64_counter( + l_osd_op_w, "op_w", "Client write operations"); + osd_plb.add_u64_counter( + l_osd_op_w_inb, "op_w_in_bytes", "Client data written"); + osd_plb.add_time_avg( + l_osd_op_w_lat, "op_w_latency", + "Latency of write operation (including queue time)"); + osd_plb.add_u64_counter_histogram( + l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of operation latency (including queue time) + data written"); + osd_plb.add_time_avg( + l_osd_op_w_process_lat, "op_w_process_latency", + "Latency of write operation (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_w_prepare_lat, "op_w_prepare_latency", + "Latency of write operations (excluding queue time and wait for finished)"); + osd_plb.add_u64_counter( + l_osd_op_rw, "op_rw", + "Client read-modify-write operations"); + osd_plb.add_u64_counter( + l_osd_op_rw_inb, "op_rw_in_bytes", + "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_u64_counter( + l_osd_op_rw_outb,"op_rw_out_bytes", + "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_op_rw_lat, "op_rw_latency", + "Latency of read-modify-write operation (including queue time)"); + osd_plb.add_u64_counter_histogram( + l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of rw operation latency (including queue time) + data written"); + osd_plb.add_u64_counter_histogram( + l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of rw operation latency (including queue time) + data read"); + osd_plb.add_time_avg( + l_osd_op_rw_process_lat, "op_rw_process_latency", + "Latency of read-modify-write operation (excluding queue time)"); + osd_plb.add_time_avg( + l_osd_op_rw_prepare_lat, "op_rw_prepare_latency", + "Latency of read-modify-write operations (excluding queue time and wait for finished)"); + + // Now we move on to some more obscure stats, revert to assuming things + // are low priority unless otherwise specified. 
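The two axis_config_d blocks at the top of create_logger() describe the histogram axes used by the op_*_latency_*_bytes_histogram counters above: a SCALE_LOG2 axis starts at 0, uses a fixed quantization unit (100 usec for latency, 512 bytes for request size), and doubles the range covered by each successive bucket. As a rough illustration of that bucketing rule only (this is not the PerfHistogram implementation):

#include <algorithm>
#include <cstdint>

uint32_t log2_bucket(uint64_t value, uint64_t unit, uint32_t buckets)
{
  if (buckets == 0 || value < unit)
    return 0;                        // bucket 0 covers [0, unit)
  uint32_t b = 1;                    // bucket 1 covers [unit, 2*unit)
  for (uint64_t upper = unit * 2; value >= upper && b + 1 < buckets; upper *= 2)
    ++b;                             // bucket k covers [unit*2^(k-1), unit*2^k)
  return std::min(b, buckets - 1);   // everything larger lands in the last bucket
}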
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); + + osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat", + "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency + osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat", + "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency + + osd_plb.add_u64_counter( + l_osd_sop, "subop", "Suboperations"); + osd_plb.add_u64_counter( + l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency"); + + osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes"); + osd_plb.add_u64_counter( + l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency"); + osd_plb.add_u64_counter( + l_osd_sop_pull, "subop_pull", "Suboperations pull requests"); + osd_plb.add_time_avg( + l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency"); + osd_plb.add_u64_counter( + l_osd_sop_push, "subop_push", "Suboperations push messages"); + osd_plb.add_u64_counter( + l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES)); + osd_plb.add_time_avg( + l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency"); + + osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent"); + osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent"); + osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES)); + + osd_plb.add_u64_counter( + l_osd_rop, "recovery_ops", + "Started recovery operations", + "rop", PerfCountersBuilder::PRIO_INTERESTING); + + osd_plb.add_u64_counter( + l_osd_rbytes, "recovery_bytes", + "recovery bytes", + "rbt", PerfCountersBuilder::PRIO_INTERESTING); + + osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load"); + osd_plb.add_u64( + l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache"); + osd_plb.add_u64( + l_osd_cached_crc_adjusted, "cached_crc_adjusted", + "Total number getting crc from crc_cache with adjusting"); + osd_plb.add_u64(l_osd_missed_crc, "missed_crc", + "Total number of crc cache misses"); + + osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups", + "pgs", PerfCountersBuilder::PRIO_USEFUL); + osd_plb.add_u64( + l_osd_pg_primary, "numpg_primary", + "Placement groups for which this osd is primary"); + osd_plb.add_u64( + l_osd_pg_replica, "numpg_replica", + "Placement groups for which this osd is replica"); + osd_plb.add_u64( + l_osd_pg_stray, "numpg_stray", + "Placement groups ready to be deleted from this osd"); + osd_plb.add_u64( + l_osd_pg_removing, "numpg_removing", + "Placement groups queued for local deletion", "pgsr", + PerfCountersBuilder::PRIO_USEFUL); + osd_plb.add_u64( + l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to"); + osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages"); + osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs"); + osd_plb.add_u64_counter( + l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); + osd_plb.add_u64_counter( + l_osd_waiting_for_map, "messages_delayed_for_map", + "Operations waiting for OSD map"); + + osd_plb.add_u64_counter( + l_osd_map_cache_hit, "osd_map_cache_hit", 
"osdmap cache hit"); + osd_plb.add_u64_counter( + l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss"); + osd_plb.add_u64_counter( + l_osd_map_cache_miss_low, "osd_map_cache_miss_low", + "osdmap cache miss below cache lower bound"); + osd_plb.add_u64_avg( + l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg", + "osdmap cache miss, avg distance below cache lower bound"); + osd_plb.add_u64_counter( + l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit", + "OSDMap buffer cache hits"); + osd_plb.add_u64_counter( + l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss", + "OSDMap buffer cache misses"); + + osd_plb.add_u64( + l_osd_stat_bytes, "stat_bytes", "OSD size", "size", + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_u64( + l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used", + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES)); + + osd_plb.add_u64_counter( + l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations"); + + osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions"); + osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes"); + osd_plb.add_u64_counter( + l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes"); + osd_plb.add_u64_counter( + l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts"); + osd_plb.add_u64_counter( + l_osd_tier_try_flush_fail, "tier_try_flush_fail", + "Failed tier flush attempts"); + osd_plb.add_u64_counter( + l_osd_tier_evict, "tier_evict", "Tier evictions"); + osd_plb.add_u64_counter( + l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts"); + osd_plb.add_u64_counter( + l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set"); + osd_plb.add_u64_counter( + l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned"); + osd_plb.add_u64_counter( + l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)"); + osd_plb.add_u64_counter( + l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads"); + osd_plb.add_u64_counter( + l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes"); + + osd_plb.add_u64_counter( + l_osd_agent_wake, "agent_wake", "Tiering agent wake up"); + osd_plb.add_u64_counter( + l_osd_agent_skip, "agent_skip", "Objects skipped by agent"); + osd_plb.add_u64_counter( + l_osd_agent_flush, "agent_flush", "Tiering agent flushes"); + osd_plb.add_u64_counter( + l_osd_agent_evict, "agent_evict", "Tiering agent evictions"); + + osd_plb.add_u64_counter( + l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits"); + osd_plb.add_u64_counter( + l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups"); + + osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit"); + osd_plb.add_time_avg( + l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency"); + osd_plb.add_time_avg( + l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency"); + osd_plb.add_time_avg( + l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency"); + + osd_plb.add_u64_counter( + l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)"); + osd_plb.add_u64_counter( + l_osd_pg_fastinfo, "osd_pg_fastinfo", + "PG updated its info using fastinfo attr"); + osd_plb.add_u64_counter( + l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr"); + + logger = osd_plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); +} + +void 
OSD::create_recoverystate_perf() +{ + dout(10) << "create_recoverystate_perf" << dendl; + + PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last); + + rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency"); + rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency"); + rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency"); + rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency"); + rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency"); + rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency"); + rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency"); + rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency"); + rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency"); + rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency"); + rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency"); + rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency"); + rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency"); + rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency"); + rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency"); + rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency"); + rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency"); + rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency"); + rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency"); + rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency"); + rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency"); + rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency"); + rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency"); + rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency"); + 
rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency"); + + recoverystate_perf = rs_perf.create_perf_counters(); + cct->get_perfcounters_collection()->add(recoverystate_perf); +} + +int OSD::shutdown() +{ + if (cct->_conf->osd_fast_shutdown) { + derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl; + cct->_log->flush(); + _exit(0); + } + + if (!service.prepare_to_stop()) + return 0; // already shutting down + osd_lock.Lock(); + if (is_stopping()) { + osd_lock.Unlock(); + return 0; + } + dout(0) << "shutdown" << dendl; + + set_state(STATE_STOPPING); + + // Debugging + if (cct->_conf.get_val<bool>("osd_debug_shutdown")) { + cct->_conf.set_val("debug_osd", "100"); + cct->_conf.set_val("debug_journal", "100"); + cct->_conf.set_val("debug_filestore", "100"); + cct->_conf.set_val("debug_bluestore", "100"); + cct->_conf.set_val("debug_ms", "100"); + cct->_conf.apply_changes(nullptr); + } + + // stop MgrClient earlier as it's more like an internal consumer of OSD + mgrc.shutdown(); + + service.start_shutdown(); + + // stop sending work to pgs. this just prevents any new work in _process + // from racing with on_shutdown and potentially entering the pg after. + op_shardedwq.drain(); + + // Shutdown PGs + { + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto pg : pgs) { + pg->shutdown(); + } + } + + // drain op queue again (in case PGs requeued something) + op_shardedwq.drain(); + { + finished.clear(); // zap waiters (bleh, this is messy) + waiting_for_osdmap.clear(); + } + + // unregister commands + cct->get_admin_socket()->unregister_commands(asok_hook); + delete asok_hook; + asok_hook = NULL; + + cct->get_admin_socket()->unregister_commands(test_ops_hook); + delete test_ops_hook; + test_ops_hook = NULL; + + osd_lock.Unlock(); + + heartbeat_lock.Lock(); + heartbeat_stop = true; + heartbeat_cond.Signal(); + heartbeat_lock.Unlock(); + heartbeat_thread.join(); + + osd_op_tp.drain(); + osd_op_tp.stop(); + dout(10) << "op sharded tp stopped" << dendl; + + command_tp.drain(); + command_tp.stop(); + dout(10) << "command tp stopped" << dendl; + + dout(10) << "stopping agent" << dendl; + service.agent_stop(); + + boot_finisher.wait_for_empty(); + + osd_lock.Lock(); + + boot_finisher.stop(); + reset_heartbeat_peers(true); + + tick_timer.shutdown(); + + { + std::lock_guard l(tick_timer_lock); + tick_timer_without_osd_lock.shutdown(); + } + + // note unmount epoch + dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl; + superblock.mounted = service.get_boot_epoch(); + superblock.clean_thru = get_osdmap_epoch(); + ObjectStore::Transaction t; + write_superblock(t); + int r = store->queue_transaction(service.meta_ch, std::move(t)); + if (r) { + derr << "OSD::shutdown: error writing superblock: " + << cpp_strerror(r) << dendl; + } + + + service.shutdown_reserver(); + + // Remove PGs +#ifdef PG_DEBUG_REFS + service.dump_live_pgids(); +#endif + while (true) { + vector<PGRef> pgs; + _get_pgs(&pgs, true); + if (pgs.empty()) { + break; + } + for (auto& pg : pgs) { + if (pg->is_deleted()) { + continue; + } + dout(20) << " kicking pg " << pg << dendl; + pg->lock(); + if (pg->get_num_ref() != 1) { + derr << "pgid " << pg->get_pgid() << " has ref count of " + << pg->get_num_ref() << dendl; +#ifdef PG_DEBUG_REFS + pg->dump_live_ids(); +#endif + if (cct->_conf->osd_shutdown_pgref_assert) { + ceph_abort(); + } + } + pg->ch.reset(); + pg->unlock(); + } + } +#ifdef PG_DEBUG_REFS + service.dump_live_pgids(); +#endif + + 
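shutdown() above tears the daemon down in dependency order: stop accepting new work, drain the sharded op queue, shut the PGs down, drain again, stop the thread pools, and finally loop until every remaining PG reference has been dropped (the while (true) block just before this point, which also complains about stray references). A stripped-down sketch of that drain-until-empty idiom (Pg and the callbacks are placeholders, not Ceph types):

#include <vector>

struct Pg { int refs = 1; };  // illustrative stand-in

template <typename Collect, typename Release>
void drain_all(Collect collect_remaining, Release release)
{
  while (true) {
    std::vector<Pg*> pgs = collect_remaining();  // snapshot what is left
    if (pgs.empty())
      break;                                     // fully drained
    for (Pg* pg : pgs)
      release(pg);  // may warn or assert if pg->refs != 1, as the OSD does
  }
}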
osd_lock.Unlock(); + cct->_conf.remove_observer(this); + osd_lock.Lock(); + + service.meta_ch.reset(); + + dout(10) << "syncing store" << dendl; + enable_disable_fuse(true); + + if (cct->_conf->osd_journal_flush_on_shutdown) { + dout(10) << "flushing journal" << dendl; + store->flush_journal(); + } + + monc->shutdown(); + osd_lock.Unlock(); + + map_lock.get_write(); + set_osdmap(OSDMapRef()); + map_lock.put_write(); + + for (auto s : shards) { + std::lock_guard l(s->osdmap_lock); + s->shard_osdmap = OSDMapRef(); + } + service.shutdown(); + + std::lock_guard lock(osd_lock); + store->umount(); + delete store; + store = nullptr; + dout(10) << "Store synced" << dendl; + + op_tracker.on_shutdown(); + + class_handler->shutdown(); + client_messenger->shutdown(); + cluster_messenger->shutdown(); + hb_front_client_messenger->shutdown(); + hb_back_client_messenger->shutdown(); + objecter_messenger->shutdown(); + hb_front_server_messenger->shutdown(); + hb_back_server_messenger->shutdown(); + + return r; +} + +int OSD::mon_cmd_maybe_osd_create(string &cmd) +{ + bool created = false; + while (true) { + dout(10) << __func__ << " cmd: " << cmd << dendl; + vector<string> vcmd{cmd}; + bufferlist inbl; + C_SaferCond w; + string outs; + monc->start_mon_command(vcmd, inbl, NULL, &outs, &w); + int r = w.wait(); + if (r < 0) { + if (r == -ENOENT && !created) { + string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami) + + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}"; + vector<string> vnewcmd{newcmd}; + bufferlist inbl; + C_SaferCond w; + string outs; + monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w); + int r = w.wait(); + if (r < 0) { + derr << __func__ << " fail: osd does not exist and created failed: " + << cpp_strerror(r) << dendl; + return r; + } + created = true; + continue; + } + derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl; + return r; + } + break; + } + + return 0; +} + +int OSD::update_crush_location() +{ + if (!cct->_conf->osd_crush_update_on_start) { + dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl; + return 0; + } + + char weight[32]; + if (cct->_conf->osd_crush_initial_weight >= 0) { + snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight); + } else { + struct store_statfs_t st; + osd_alert_list_t alerts; + int r = store->statfs(&st, &alerts); + if (r < 0) { + derr << "statfs: " << cpp_strerror(r) << dendl; + return r; + } + snprintf(weight, sizeof(weight), "%.4lf", + std::max(.00001, + double(st.total) / + double(1ull << 40 /* TB */))); + } + + std::multimap<string,string> loc = cct->crush_location.get_location(); + dout(10) << __func__ << " crush location is " << loc << dendl; + + string cmd = + string("{\"prefix\": \"osd crush create-or-move\", ") + + string("\"id\": ") + stringify(whoami) + string(", ") + + string("\"weight\":") + weight + string(", ") + + string("\"args\": ["); + for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) { + if (p != loc.begin()) + cmd += ", "; + cmd += "\"" + p->first + "=" + p->second + "\""; + } + cmd += "]}"; + + return mon_cmd_maybe_osd_create(cmd); +} + +int OSD::update_crush_device_class() +{ + if (!cct->_conf->osd_class_update_on_start) { + dout(10) << __func__ << " osd_class_update_on_start = false" << dendl; + return 0; + } + + string device_class; + int r = store->read_meta("crush_device_class", &device_class); + if (r < 0 || device_class.empty()) { + device_class = store->get_default_device_class(); + } + + if 
(device_class.empty()) { + dout(20) << __func__ << " no device class stored locally" << dendl; + return 0; + } + + string cmd = + string("{\"prefix\": \"osd crush set-device-class\", ") + + string("\"class\": \"") + device_class + string("\", ") + + string("\"ids\": [\"") + stringify(whoami) + string("\"]}"); + + r = mon_cmd_maybe_osd_create(cmd); + if (r == -EBUSY) { + // good, already bound to a device-class + return 0; + } else { + return r; + } +} + +void OSD::write_superblock(ObjectStore::Transaction& t) +{ + dout(10) << "write_superblock " << superblock << dendl; + + //hack: at minimum it's using the baseline feature set + if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE)) + superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + + bufferlist bl; + encode(superblock, bl); + t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); +} + +int OSD::read_superblock() +{ + bufferlist bl; + int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl); + if (r < 0) + return r; + + auto p = bl.cbegin(); + decode(superblock, p); + + dout(10) << "read_superblock " << superblock << dendl; + + return 0; +} + +void OSD::clear_temp_objects() +{ + dout(10) << __func__ << dendl; + vector<coll_t> ls; + store->list_collections(ls); + for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) { + spg_t pgid; + if (!p->is_pg(&pgid)) + continue; + + // list temp objects + dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl; + + vector<ghobject_t> temps; + ghobject_t next; + while (1) { + vector<ghobject_t> objects; + auto ch = store->open_collection(*p); + ceph_assert(ch); + store->collection_list(ch, next, ghobject_t::get_max(), + store->get_ideal_list_max(), + &objects, &next); + if (objects.empty()) + break; + vector<ghobject_t>::iterator q; + for (q = objects.begin(); q != objects.end(); ++q) { + // Hammer set pool for temps to -1, so check for clean-up + if (q->hobj.is_temp() || (q->hobj.pool == -1)) { + temps.push_back(*q); + } else { + break; + } + } + // If we saw a non-temp object and hit the break above we can + // break out of the while loop too. 
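clear_temp_objects() above pages through each PG collection with a cursor ('next'), collecting temp objects until it meets the first entry that is not a temp; since the listing is sorted with temp entries first, that first miss ends the whole scan, which is what the comment above and the break just below implement. A self-contained sketch of the same scan over a plain sorted listing (the name-prefix test is a made-up stand-in for the hobject_t temp check):

#include <algorithm>
#include <string>
#include <vector>

std::vector<std::string> collect_temps(const std::vector<std::string>& listing,
                                       std::size_t page_size)
{
  std::vector<std::string> temps;
  if (page_size == 0)
    return temps;
  for (std::size_t pos = 0; pos < listing.size(); pos += page_size) {
    auto first = listing.begin() + pos;
    auto last = listing.begin() + std::min(pos + page_size, listing.size());
    auto q = first;
    for (; q != last; ++q) {
      if (q->rfind("temp_", 0) != 0)   // hypothetical "is this a temp?" test
        break;                         // first non-temp entry in this page
      temps.push_back(*q);
    }
    if (q != last)
      break;  // saw a non-temp entry, so later pages cannot contain temps
  }
  return temps;
}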
+ if (q != objects.end()) + break; + } + if (!temps.empty()) { + ObjectStore::Transaction t; + int removed = 0; + for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) { + dout(20) << " removing " << *p << " object " << *q << dendl; + t.remove(*p, *q); + if (++removed > cct->_conf->osd_target_transaction_size) { + store->queue_transaction(service.meta_ch, std::move(t)); + t = ObjectStore::Transaction(); + removed = 0; + } + } + if (removed) { + store->queue_transaction(service.meta_ch, std::move(t)); + } + } + } +} + +void OSD::recursive_remove_collection(CephContext* cct, + ObjectStore *store, spg_t pgid, + coll_t tmp) +{ + OSDriver driver( + store, + coll_t(), + make_snapmapper_oid()); + + ObjectStore::CollectionHandle ch = store->open_collection(tmp); + ObjectStore::Transaction t; + SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard); + + ghobject_t next; + int max = cct->_conf->osd_target_transaction_size; + vector<ghobject_t> objects; + objects.reserve(max); + while (true) { + objects.clear(); + store->collection_list(ch, next, ghobject_t::get_max(), + max, &objects, &next); + generic_dout(10) << __func__ << " " << objects << dendl; + if (objects.empty()) + break; + for (auto& p: objects) { + OSDriver::OSTransaction _t(driver.get_transaction(&t)); + int r = mapper.remove_oid(p.hobj, &_t); + if (r != 0 && r != -ENOENT) + ceph_abort(); + t.remove(tmp, p); + } + int r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + t = ObjectStore::Transaction(); + } + t.remove_collection(tmp); + int r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + + C_SaferCond waiter; + if (!ch->flush_commit(&waiter)) { + waiter.wait(); + } +} + + +// ====================================================== +// PG's + +PG* OSD::_make_pg( + OSDMapRef createmap, + spg_t pgid) +{ + dout(10) << __func__ << " " << pgid << dendl; + pg_pool_t pi; + map<string,string> ec_profile; + string name; + if (createmap->have_pg_pool(pgid.pool())) { + pi = *createmap->get_pg_pool(pgid.pool()); + name = createmap->get_pool_name(pgid.pool()); + if (pi.is_erasure()) { + ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile); + } + } else { + // pool was deleted; grab final pg_pool_t off disk. 
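Both clear_temp_objects() and recursive_remove_collection() above cap the size of any single transaction, flushing a batch roughly every osd_target_transaction_size deletions rather than queueing one huge transaction. The same batching pattern in isolation (submit is a stand-in for queue_transaction()):

#include <cstddef>
#include <functional>
#include <string>
#include <vector>

void remove_in_batches(
    const std::vector<std::string>& doomed,
    std::size_t batch,
    const std::function<void(const std::vector<std::string>&)>& submit)
{
  std::vector<std::string> txn;
  for (const auto& name : doomed) {
    txn.push_back(name);
    if (txn.size() >= batch) {
      submit(txn);   // flush a full batch
      txn.clear();
    }
  }
  if (!txn.empty())
    submit(txn);     // flush the remainder
}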
+ ghobject_t oid = make_final_pool_info_oid(pgid.pool()); + bufferlist bl; + int r = store->read(service.meta_ch, oid, 0, 0, bl); + if (r < 0) { + derr << __func__ << " missing pool " << pgid.pool() << " tombstone" + << dendl; + return nullptr; + } + ceph_assert(r >= 0); + auto p = bl.cbegin(); + decode(pi, p); + decode(name, p); + if (p.end()) { // dev release v13.0.2 did not include ec_profile + derr << __func__ << " missing ec_profile from pool " << pgid.pool() + << " tombstone" << dendl; + return nullptr; + } + decode(ec_profile, p); + } + PGPool pool(cct, createmap, pgid.pool(), pi, name); + PG *pg; + if (pi.type == pg_pool_t::TYPE_REPLICATED || + pi.type == pg_pool_t::TYPE_ERASURE) + pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid); + else + ceph_abort(); + return pg; +} + +void OSD::_get_pgs(vector<PGRef> *v, bool clear_too) +{ + v->clear(); + v->reserve(get_num_pgs()); + for (auto& s : shards) { + std::lock_guard l(s->shard_lock); + for (auto& j : s->pg_slots) { + if (j.second->pg && + !j.second->pg->is_deleted()) { + v->push_back(j.second->pg); + if (clear_too) { + s->_detach_pg(j.second.get()); + } + } + } + } +} + +void OSD::_get_pgids(vector<spg_t> *v) +{ + v->clear(); + v->reserve(get_num_pgs()); + for (auto& s : shards) { + std::lock_guard l(s->shard_lock); + for (auto& j : s->pg_slots) { + if (j.second->pg && + !j.second->pg->is_deleted()) { + v->push_back(j.first); + } + } + } +} + +void OSD::register_pg(PGRef pg) +{ + spg_t pgid = pg->get_pgid(); + uint32_t shard_index = pgid.hash_to_shard(num_shards); + auto sdata = shards[shard_index]; + std::lock_guard l(sdata->shard_lock); + auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>()); + ceph_assert(r.second); + auto *slot = r.first->second.get(); + dout(20) << __func__ << " " << pgid << " " << pg << dendl; + sdata->_attach_pg(slot, pg.get()); +} + +bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num) +{ + auto sdata = pg->osd_shard; + ceph_assert(sdata); + { + std::lock_guard l(sdata->shard_lock); + auto p = sdata->pg_slots.find(pg->pg_id); + if (p == sdata->pg_slots.end() || + !p->second->pg) { + dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl; + return false; + } + if (p->second->waiting_for_merge_epoch) { + dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl; + return false; + } + dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl; + sdata->_detach_pg(p->second.get()); + } + + for (auto shard : shards) { + shard->unprime_split_children(pg->pg_id, old_pg_num); + } + + // update pg count now since we might not get an osdmap any time soon. 
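register_pg() above pins every PG to one of the sharded op-queue shards with pgid.hash_to_shard(num_shards), and _lookup_pg() just below recomputes the same index to find it again, so all slot updates for a given PG funnel through one shard_lock. As a rough stand-in for that mapping (std::hash over a string id here, not the actual spg_t hash):

#include <cstdint>
#include <functional>
#include <string>

uint32_t shard_for(const std::string& pgid, uint32_t num_shards)
{
  if (num_shards == 0)
    return 0;
  // Same PG, same shard, every time; distinct PGs spread across shards.
  return static_cast<uint32_t>(std::hash<std::string>{}(pgid) % num_shards);
}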
+ if (pg->is_primary()) + service.logger->dec(l_osd_pg_primary); + else if (pg->is_replica()) + service.logger->dec(l_osd_pg_replica); + else + service.logger->dec(l_osd_pg_stray); + + return true; +} + +PGRef OSD::_lookup_pg(spg_t pgid) +{ + uint32_t shard_index = pgid.hash_to_shard(num_shards); + auto sdata = shards[shard_index]; + std::lock_guard l(sdata->shard_lock); + auto p = sdata->pg_slots.find(pgid); + if (p == sdata->pg_slots.end()) { + return nullptr; + } + return p->second->pg; +} + +PGRef OSD::_lookup_lock_pg(spg_t pgid) +{ + PGRef pg = _lookup_pg(pgid); + if (!pg) { + return nullptr; + } + pg->lock(); + if (!pg->is_deleted()) { + return pg; + } + pg->unlock(); + return nullptr; +} + +PGRef OSD::lookup_lock_pg(spg_t pgid) +{ + return _lookup_lock_pg(pgid); +} + +void OSD::load_pgs() +{ + ceph_assert(osd_lock.is_locked()); + dout(0) << "load_pgs" << dendl; + + { + auto pghist = make_pg_num_history_oid(); + bufferlist bl; + int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0); + if (r >= 0 && bl.length() > 0) { + auto p = bl.cbegin(); + decode(pg_num_history, p); + } + dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl; + } + + vector<coll_t> ls; + int r = store->list_collections(ls); + if (r < 0) { + derr << "failed to list pgs: " << cpp_strerror(-r) << dendl; + } + + int num = 0; + for (vector<coll_t>::iterator it = ls.begin(); + it != ls.end(); + ++it) { + spg_t pgid; + if (it->is_temp(&pgid) || + (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) { + dout(10) << "load_pgs " << *it + << " removing, legacy or flagged for removal pg" << dendl; + recursive_remove_collection(cct, store, pgid, *it); + continue; + } + + if (!it->is_pg(&pgid)) { + dout(10) << "load_pgs ignoring unrecognized " << *it << dendl; + continue; + } + + dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl; + epoch_t map_epoch = 0; + int r = PG::peek_map_epoch(store, pgid, &map_epoch); + if (r < 0) { + derr << __func__ << " unable to peek at " << pgid << " metadata, skipping" + << dendl; + continue; + } + + PGRef pg; + if (map_epoch > 0) { + OSDMapRef pgosdmap = service.try_get_map(map_epoch); + if (!pgosdmap) { + if (!get_osdmap()->have_pg_pool(pgid.pool())) { + derr << __func__ << ": could not find map for epoch " << map_epoch + << " on pg " << pgid << ", but the pool is not present in the " + << "current map, so this is probably a result of bug 10617. " + << "Skipping the pg for now, you can use ceph-objectstore-tool " + << "to clean it up later." << dendl; + continue; + } else { + derr << __func__ << ": have pgid " << pgid << " at epoch " + << map_epoch << ", but missing map. Crashing." 
+ << dendl; + ceph_abort_msg("Missing map in load_pgs"); + } + } + pg = _make_pg(pgosdmap, pgid); + } else { + pg = _make_pg(get_osdmap(), pgid); + } + if (!pg) { + recursive_remove_collection(cct, store, pgid, *it); + continue; + } + + // there can be no waiters here, so we don't call _wake_pg_slot + + pg->lock(); + pg->ch = store->open_collection(pg->coll); + + // read pg state, log + pg->read_state(store); + + if (pg->dne()) { + dout(10) << "load_pgs " << *it << " deleting dne" << dendl; + pg->ch = nullptr; + pg->unlock(); + recursive_remove_collection(cct, store, pgid, *it); + continue; + } + { + uint32_t shard_index = pgid.hash_to_shard(shards.size()); + assert(NULL != shards[shard_index]); + store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue)); + } + + pg->reg_next_scrub(); + + dout(10) << __func__ << " loaded " << *pg << dendl; + pg->unlock(); + + register_pg(pg); + ++num; + } + dout(0) << __func__ << " opened " << num << " pgs" << dendl; +} + + +PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap, + const PGCreateInfo *info) +{ + spg_t pgid = info->pgid; + + if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) { + dout(10) << __func__ << " hit max pg, dropping" << dendl; + return nullptr; + } + + PG::RecoveryCtx rctx = create_context(); + + OSDMapRef startmap = get_map(info->epoch); + + if (info->by_mon) { + int64_t pool_id = pgid.pgid.pool(); + const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); + if (!pool) { + dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl; + return nullptr; + } + if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS && + !pool->has_flag(pg_pool_t::FLAG_CREATING)) { + // this ensures we do not process old creating messages after the + // pool's initial pgs have been created (and pg are subsequently + // allowed to split or merge). 
+ dout(20) << __func__ << " dropping " << pgid + << "create, pool does not have CREATING flag set" << dendl; + return nullptr; + } + } + + int up_primary, acting_primary; + vector<int> up, acting; + startmap->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool()); + if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) && + store->get_type() != "bluestore") { + clog->warn() << "pg " << pgid + << " is at risk of silent data corruption: " + << "the pool allows ec overwrites but is not stored in " + << "bluestore, so deep scrubbing will not detect bitrot"; + } + PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num())); + PG::_init(*rctx.transaction, pgid, pp); + + int role = startmap->calc_pg_role(whoami, acting, acting.size()); + if (!pp->is_replicated() && role != pgid.shard) { + role = -1; + } + + PGRef pg = _make_pg(startmap, pgid); + pg->ch = store->create_new_collection(pg->coll); + + { + uint32_t shard_index = pgid.hash_to_shard(shards.size()); + assert(NULL != shards[shard_index]); + store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue)); + } + + pg->lock(true); + + // we are holding the shard lock + ceph_assert(!pg->is_deleted()); + + pg->init( + role, + up, + up_primary, + acting, + acting_primary, + info->history, + info->past_intervals, + false, + rctx.transaction); + + pg->init_collection_pool_opts(); + + if (pg->is_primary()) { + Mutex::Locker locker(m_perf_queries_lock); + pg->set_dynamic_perf_stats_queries(m_perf_queries); + } + + pg->handle_initialize(&rctx); + pg->handle_activate_map(&rctx); + + dispatch_context(rctx, pg.get(), osdmap, nullptr); + + dout(10) << __func__ << " new pg " << *pg << dendl; + return pg; +} + +bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap, + spg_t pgid, + bool is_mon_create) +{ + const auto max_pgs_per_osd = + (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") * + cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio")); + + if (num_pgs < max_pgs_per_osd) { + return false; + } + + std::lock_guard l(pending_creates_lock); + if (is_mon_create) { + pending_creates_from_mon++; + } else { + bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0; + pending_creates_from_osd.emplace(pgid.pgid, is_primary); + } + dout(1) << __func__ << " withhold creation of pg " << pgid + << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl; + return true; +} + +// to re-trigger a peering, we have to twiddle the pg mapping a little bit, +// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn +// to up set if pg_temp is empty. so an empty pg_temp won't work. 
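Concretely, with the helper that follows: an acting set like {3, 1, 2} is primed as pg_temp {3}, while a single-OSD acting set {5} becomes {5, -1}. Either way the proposed pg_temp differs from the mapping the OSDMap would otherwise produce, which is what forces a new interval and re-triggers peering; simply clearing pg_temp would not work because, as the comment above notes, an empty pg_temp falls back to the up set.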
+static vector<int32_t> twiddle(const vector<int>& acting) { + if (acting.size() > 1) { + return {acting[0]}; + } else { + vector<int32_t> twiddled(acting.begin(), acting.end()); + twiddled.push_back(-1); + return twiddled; + } +} + +void OSD::resume_creating_pg() +{ + bool do_sub_pg_creates = false; + bool have_pending_creates = false; + { + const auto max_pgs_per_osd = + (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") * + cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio")); + if (max_pgs_per_osd <= num_pgs) { + // this could happen if admin decreases this setting before a PG is removed + return; + } + unsigned spare_pgs = max_pgs_per_osd - num_pgs; + std::lock_guard l(pending_creates_lock); + if (pending_creates_from_mon > 0) { + dout(20) << __func__ << " pending_creates_from_mon " + << pending_creates_from_mon << dendl; + do_sub_pg_creates = true; + if (pending_creates_from_mon >= spare_pgs) { + spare_pgs = pending_creates_from_mon = 0; + } else { + spare_pgs -= pending_creates_from_mon; + pending_creates_from_mon = 0; + } + } + auto pg = pending_creates_from_osd.cbegin(); + while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) { + dout(20) << __func__ << " pg " << pg->first << dendl; + vector<int> acting; + get_osdmap()->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr); + service.queue_want_pg_temp(pg->first, twiddle(acting), true); + pg = pending_creates_from_osd.erase(pg); + do_sub_pg_creates = true; + spare_pgs--; + } + have_pending_creates = (pending_creates_from_mon > 0 || + !pending_creates_from_osd.empty()); + } + + bool do_renew_subs = false; + if (do_sub_pg_creates) { + if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) { + dout(4) << __func__ << ": resolicit pg creates from mon since " + << last_pg_create_epoch << dendl; + do_renew_subs = true; + } + } + version_t start = get_osdmap_epoch() + 1; + if (have_pending_creates) { + // don't miss any new osdmap deleting PGs + if (monc->sub_want("osdmap", start, 0)) { + dout(4) << __func__ << ": resolicit osdmap from mon since " + << start << dendl; + do_renew_subs = true; + } + } else if (do_sub_pg_creates) { + // no need to subscribe the osdmap continuously anymore + // once the pgtemp and/or mon_subscribe(pg_creates) is sent + if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) { + dout(4) << __func__ << ": re-subscribe osdmap(onetime) since " + << start << dendl; + do_renew_subs = true; + } + } + + if (do_renew_subs) { + monc->renew_subs(); + } + + service.send_pg_temp(); +} + +void OSD::build_initial_pg_history( + spg_t pgid, + epoch_t created, + utime_t created_stamp, + pg_history_t *h, + PastIntervals *pi) +{ + dout(10) << __func__ << " " << pgid << " created " << created << dendl; + h->epoch_created = created; + h->epoch_pool_created = created; + h->same_interval_since = created; + h->same_up_since = created; + h->same_primary_since = created; + h->last_scrub_stamp = created_stamp; + h->last_deep_scrub_stamp = created_stamp; + h->last_clean_scrub_stamp = created_stamp; + + OSDMapRef lastmap = service.get_map(created); + int up_primary, acting_primary; + vector<int> up, acting; + lastmap->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + ostringstream debug; + for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) { + OSDMapRef osdmap = service.get_map(e); + int new_up_primary, new_acting_primary; + vector<int> new_up, new_acting; + osdmap->pg_to_up_acting_osds( + pgid.pgid, &new_up, &new_up_primary, &new_acting, 
&new_acting_primary); + + // this is a bit imprecise, but sufficient? + struct min_size_predicate_t : public IsPGRecoverablePredicate { + const pg_pool_t *pi; + bool operator()(const set<pg_shard_t> &have) const { + return have.size() >= pi->min_size; + } + explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {} + } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool())); + + bool new_interval = PastIntervals::check_new_interval( + acting_primary, + new_acting_primary, + acting, new_acting, + up_primary, + new_up_primary, + up, new_up, + h->same_interval_since, + h->last_epoch_clean, + osdmap, + lastmap, + pgid.pgid, + &min_size_predicate, + pi, + &debug); + if (new_interval) { + h->same_interval_since = e; + if (up != new_up) { + h->same_up_since = e; + } + if (acting_primary != new_acting_primary) { + h->same_primary_since = e; + } + if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()), + osdmap->get_pg_num(pgid.pgid.pool()), + nullptr)) { + h->last_epoch_split = e; + } + up = new_up; + acting = new_acting; + up_primary = new_up_primary; + acting_primary = new_acting_primary; + } + lastmap = osdmap; + } + dout(20) << __func__ << " " << debug.str() << dendl; + dout(10) << __func__ << " " << *h << " " << *pi + << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) : + pi->get_bounds()) << ")" + << dendl; +} + +void OSD::_add_heartbeat_peer(int p) +{ + if (p == whoami) + return; + HeartbeatInfo *hi; + + map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p); + if (i == heartbeat_peers.end()) { + pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch()); + if (!cons.first) + return; + hi = &heartbeat_peers[p]; + hi->peer = p; + RefCountedPtr s{new HeartbeatSession{p}, false}; + hi->hb_interval_start = ceph_clock_now(); + hi->con_back = cons.first.get(); + hi->con_back->set_priv(s); + if (cons.second) { + hi->con_front = cons.second.get(); + hi->con_front->set_priv(s); + dout(10) << "_add_heartbeat_peer: new peer osd." << p + << " " << hi->con_back->get_peer_addr() + << " " << hi->con_front->get_peer_addr() + << dendl; + } else { + hi->con_front.reset(NULL); + dout(10) << "_add_heartbeat_peer: new peer osd." << p + << " " << hi->con_back->get_peer_addr() + << dendl; + } + } else { + hi = &i->second; + } + hi->epoch = get_osdmap_epoch(); +} + +void OSD::_remove_heartbeat_peer(int n) +{ + map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n); + ceph_assert(q != heartbeat_peers.end()); + dout(20) << " removing heartbeat peer osd." << n + << " " << q->second.con_back->get_peer_addr() + << " " << (q->second.con_front ? 
q->second.con_front->get_peer_addr() : entity_addr_t()) + << dendl; + q->second.con_back->mark_down(); + if (q->second.con_front) { + q->second.con_front->mark_down(); + } + heartbeat_peers.erase(q); +} + +void OSD::need_heartbeat_peer_update() +{ + if (is_stopping()) + return; + dout(20) << "need_heartbeat_peer_update" << dendl; + heartbeat_set_peers_need_update(); +} + +void OSD::maybe_update_heartbeat_peers() +{ + ceph_assert(osd_lock.is_locked()); + + if (is_waiting_for_healthy() || is_active()) { + utime_t now = ceph_clock_now(); + if (last_heartbeat_resample == utime_t()) { + last_heartbeat_resample = now; + heartbeat_set_peers_need_update(); + } else if (!heartbeat_peers_need_update()) { + utime_t dur = now - last_heartbeat_resample; + if (dur > cct->_conf->osd_heartbeat_grace) { + dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl; + heartbeat_set_peers_need_update(); + last_heartbeat_resample = now; + // automatically clean up any stale heartbeat peers + // if we are unhealthy, then clean all + reset_heartbeat_peers(is_waiting_for_healthy()); + } + } + } + + if (!heartbeat_peers_need_update()) + return; + heartbeat_clear_peers_need_update(); + + std::lock_guard l(heartbeat_lock); + + dout(10) << "maybe_update_heartbeat_peers updating" << dendl; + + + // build heartbeat from set + if (is_active()) { + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + pg->with_heartbeat_peers([&](int peer) { + if (get_osdmap()->is_up(peer)) { + _add_heartbeat_peer(peer); + } + }); + } + } + + // include next and previous up osds to ensure we have a fully-connected set + set<int> want, extras; + const int next = get_osdmap()->get_next_up_osd_after(whoami); + if (next >= 0) + want.insert(next); + int prev = get_osdmap()->get_previous_up_osd_before(whoami); + if (prev >= 0 && prev != next) + want.insert(prev); + + // make sure we have at least **min_down** osds coming from different + // subtree level (e.g., hosts) for fast failure detection. + auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters"); + auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level"); + get_osdmap()->get_random_up_osds_by_subtree( + whoami, subtree, min_down, want, &want); + + for (set<int>::iterator p = want.begin(); p != want.end(); ++p) { + dout(10) << " adding neighbor peer osd." << *p << dendl; + extras.insert(*p); + _add_heartbeat_peer(*p); + } + + // remove down peers; enumerate extras + map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin(); + while (p != heartbeat_peers.end()) { + if (!get_osdmap()->is_up(p->first)) { + int o = p->first; + ++p; + _remove_heartbeat_peer(o); + continue; + } + if (p->second.epoch < get_osdmap_epoch()) { + extras.insert(p->first); + } + ++p; + } + + // too few? + for (int n = next; n >= 0; ) { + if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers) + break; + if (!extras.count(n) && !want.count(n) && n != whoami) { + dout(10) << " adding random peer osd." << n << dendl; + extras.insert(n); + _add_heartbeat_peer(n); + } + n = get_osdmap()->get_next_up_osd_after(n); + if (n == next) + break; // came full circle; stop + } + + // too many? 
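+  // trim extras (stale-epoch or filler peers) while we are still above
+  // osd_heartbeat_min_peers, but never drop anything in the want set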
+ for (set<int>::iterator p = extras.begin(); + (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end(); + ++p) { + if (want.count(*p)) + continue; + _remove_heartbeat_peer(*p); + } + + dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl; +} + +void OSD::reset_heartbeat_peers(bool all) +{ + ceph_assert(osd_lock.is_locked()); + dout(10) << "reset_heartbeat_peers" << dendl; + utime_t stale = ceph_clock_now(); + stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale"); + std::lock_guard l(heartbeat_lock); + for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) { + HeartbeatInfo& hi = it->second; + if (all || hi.is_stale(stale)) { + hi.con_back->mark_down(); + if (hi.con_front) { + hi.con_front->mark_down(); + } + // stop sending failure_report to mon too + failure_queue.erase(it->first); + heartbeat_peers.erase(it++); + } else { + it++; + } + } +} + +void OSD::handle_osd_ping(MOSDPing *m) +{ + if (superblock.cluster_fsid != m->fsid) { + dout(20) << "handle_osd_ping from " << m->get_source_inst() + << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl; + m->put(); + return; + } + + int from = m->get_source().num(); + + heartbeat_lock.Lock(); + if (is_stopping()) { + heartbeat_lock.Unlock(); + m->put(); + return; + } + + OSDMapRef curmap = service.get_osdmap(); + if (!curmap) { + heartbeat_lock.Unlock(); + m->put(); + return; + } + + switch (m->op) { + + case MOSDPing::PING: + { + if (cct->_conf->osd_debug_drop_ping_probability > 0) { + auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from); + if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) { + if (heartbeat_drop->second == 0) { + debug_heartbeat_drops_remaining.erase(heartbeat_drop); + } else { + --heartbeat_drop->second; + dout(5) << "Dropping heartbeat from " << from + << ", " << heartbeat_drop->second + << " remaining to drop" << dendl; + break; + } + } else if (cct->_conf->osd_debug_drop_ping_probability > + ((((double)(rand()%100))/100.0))) { + heartbeat_drop = + debug_heartbeat_drops_remaining.insert(std::make_pair(from, + cct->_conf->osd_debug_drop_ping_duration)).first; + dout(5) << "Dropping heartbeat from " << from + << ", " << heartbeat_drop->second + << " remaining to drop" << dendl; + break; + } + } + + if (!cct->get_heartbeat_map()->is_healthy()) { + dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl; + break; + } + + Message *r = new MOSDPing(monc->get_fsid(), + curmap->get_epoch(), + MOSDPing::PING_REPLY, m->stamp, + cct->_conf->osd_heartbeat_min_size); + m->get_connection()->send_message(r); + + if (curmap->is_up(from)) { + service.note_peer_epoch(from, m->map_epoch); + if (is_active()) { + ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch()); + if (con) { + service.share_map_peer(from, con.get()); + } + } + } else if (!curmap->exists(from) || + curmap->get_down_at(from) > m->map_epoch) { + // tell them they have died + Message *r = new MOSDPing(monc->get_fsid(), + curmap->get_epoch(), + MOSDPing::YOU_DIED, + m->stamp, + cct->_conf->osd_heartbeat_min_size); + m->get_connection()->send_message(r); + } + } + break; + + case MOSDPing::PING_REPLY: + { + map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from); + if (i != heartbeat_peers.end()) { + auto acked = i->second.ping_history.find(m->stamp); + if (acked != i->second.ping_history.end()) { + utime_t now = ceph_clock_now(); + int &unacknowledged = acked->second.second; + if 
(m->get_connection() == i->second.con_back) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_tx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back << " -> " << now + << " last_rx_front " << i->second.last_rx_front + << dendl; + i->second.last_rx_back = now; + ceph_assert(unacknowledged > 0); + --unacknowledged; + // if there is no front con, set both stamps. + if (i->second.con_front == NULL) { + i->second.last_rx_front = now; + ceph_assert(unacknowledged > 0); + --unacknowledged; + } + } else if (m->get_connection() == i->second.con_front) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_tx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back + << " last_rx_front " << i->second.last_rx_front << " -> " << now + << dendl; + i->second.last_rx_front = now; + ceph_assert(unacknowledged > 0); + --unacknowledged; + } + + if (unacknowledged == 0) { + // succeeded in getting all replies + dout(25) << "handle_osd_ping got all replies from osd." << from + << " , erase pending ping(sent at " << m->stamp << ")" + << " and older pending ping(s)" + << dendl; + +#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5) + ++i->second.hb_average_count; + uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp); + i->second.hb_total_back += back_pingtime; + if (back_pingtime < i->second.hb_min_back) + i->second.hb_min_back = back_pingtime; + if (back_pingtime > i->second.hb_max_back) + i->second.hb_max_back = back_pingtime; + uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp); + i->second.hb_total_front += front_pingtime; + if (front_pingtime < i->second.hb_min_front) + i->second.hb_min_front = front_pingtime; + if (front_pingtime > i->second.hb_max_front) + i->second.hb_max_front = front_pingtime; + + ceph_assert(i->second.hb_interval_start != utime_t()); + if (i->second.hb_interval_start == utime_t()) + i->second.hb_interval_start = now; + int64_t hb_avg_time_period = 60; + if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) { + hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span"); + } + if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) { + uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count; + uint32_t back_min = i->second.hb_min_back; + uint32_t back_max = i->second.hb_max_back; + uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count; + uint32_t front_min = i->second.hb_min_front; + uint32_t front_max = i->second.hb_max_front; + + // Reset for new interval + i->second.hb_average_count = 0; + i->second.hb_interval_start = now; + i->second.hb_total_back = i->second.hb_max_back = 0; + i->second.hb_min_back = UINT_MAX; + i->second.hb_total_front = i->second.hb_max_front = 0; + i->second.hb_min_front = UINT_MAX; + + // Record per osd interace ping times + // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval + if (i->second.hb_back_pingtime.size() == 0) { + ceph_assert(i->second.hb_front_pingtime.size() == 0); + for (unsigned k = 0 ; k < hb_vector_size; ++k) { + i->second.hb_back_pingtime.push_back(back_avg); + i->second.hb_back_min.push_back(back_min); + i->second.hb_back_max.push_back(back_max); + i->second.hb_front_pingtime.push_back(front_avg); + i->second.hb_front_min.push_back(front_min); + i->second.hb_front_max.push_back(front_max); + 
++i->second.hb_index; + } + } else { + int index = i->second.hb_index & (hb_vector_size - 1); + i->second.hb_back_pingtime[index] = back_avg; + i->second.hb_back_min[index] = back_min; + i->second.hb_back_max[index] = back_max; + i->second.hb_front_pingtime[index] = front_avg; + i->second.hb_front_min[index] = front_min; + i->second.hb_front_max[index] = front_max; + ++i->second.hb_index; + } + + { + std::lock_guard l(service.stat_lock); + service.osd_stat.hb_pingtime[from].last_update = now.sec(); + service.osd_stat.hb_pingtime[from].back_last = back_pingtime; + + uint32_t total = 0; + uint32_t min = UINT_MAX; + uint32_t max = 0; + uint32_t count = 0; + uint32_t which = 0; + uint32_t size = (uint32_t)i->second.hb_back_pingtime.size(); + for (int32_t k = size - 1 ; k >= 0; --k) { + ++count; + int index = (i->second.hb_index + k) % size; + total += i->second.hb_back_pingtime[index]; + if (i->second.hb_back_min[index] < min) + min = i->second.hb_back_min[index]; + if (i->second.hb_back_max[index] > max) + max = i->second.hb_back_max[index]; + if (count == 1 || count == 5 || count == 15) { + service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count; + service.osd_stat.hb_pingtime[from].back_min[which] = min; + service.osd_stat.hb_pingtime[from].back_max[which] = max; + which++; + if (count == 15) + break; + } + } + + if (i->second.con_front != NULL) { + service.osd_stat.hb_pingtime[from].front_last = front_pingtime; + + total = 0; + min = UINT_MAX; + max = 0; + count = 0; + which = 0; + for (int32_t k = size - 1 ; k >= 0; --k) { + ++count; + int index = (i->second.hb_index + k) % size; + total += i->second.hb_front_pingtime[index]; + if (i->second.hb_front_min[index] < min) + min = i->second.hb_front_min[index]; + if (i->second.hb_front_max[index] > max) + max = i->second.hb_front_max[index]; + if (count == 1 || count == 5 || count == 15) { + service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count; + service.osd_stat.hb_pingtime[from].front_min[which] = min; + service.osd_stat.hb_pingtime[from].front_max[which] = max; + which++; + if (count == 15) + break; + } + } + } + } + } else { + std::lock_guard l(service.stat_lock); + service.osd_stat.hb_pingtime[from].back_last = back_pingtime; + if (i->second.con_front != NULL) + service.osd_stat.hb_pingtime[from].front_last = front_pingtime; + } + i->second.ping_history.erase(i->second.ping_history.begin(), ++acked); + } + + if (i->second.is_healthy(now)) { + // Cancel false reports + auto failure_queue_entry = failure_queue.find(from); + if (failure_queue_entry != failure_queue.end()) { + dout(10) << "handle_osd_ping canceling queued " + << "failure report for osd." << from << dendl; + failure_queue.erase(failure_queue_entry); + } + + auto failure_pending_entry = failure_pending.find(from); + if (failure_pending_entry != failure_pending.end()) { + dout(10) << "handle_osd_ping canceling in-flight " + << "failure report for osd." << from << dendl; + send_still_alive(curmap->get_epoch(), + from, + failure_pending_entry->second.second); + failure_pending.erase(failure_pending_entry); + } + } + } else { + // old replies, deprecated by newly sent pings. 
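+	// (ping_history entries up to and including an acked stamp were
+	// erased above, so there is nothing left to account this reply
+	// against.)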
+ dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp + << ") is found, treat as covered by newly sent pings " + << "and ignore" + << dendl; + } + } + + if (m->map_epoch && + curmap->is_up(from)) { + service.note_peer_epoch(from, m->map_epoch); + if (is_active()) { + ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch()); + if (con) { + service.share_map_peer(from, con.get()); + } + } + } + } + break; + + case MOSDPing::YOU_DIED: + dout(10) << "handle_osd_ping " << m->get_source_inst() + << " says i am down in " << m->map_epoch << dendl; + osdmap_subscribe(curmap->get_epoch()+1, false); + break; + } + + heartbeat_lock.Unlock(); + m->put(); +} + +void OSD::heartbeat_entry() +{ + std::lock_guard l(heartbeat_lock); + if (is_stopping()) + return; + while (!heartbeat_stop) { + heartbeat(); + + double wait; + if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) { + wait = (float)cct->_conf->osd_heartbeat_interval; + } else { + wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval; + } + utime_t w; + w.set_from_double(wait); + dout(30) << "heartbeat_entry sleeping for " << wait << dendl; + heartbeat_cond.WaitInterval(heartbeat_lock, w); + if (is_stopping()) + return; + dout(30) << "heartbeat_entry woke up" << dendl; + } +} + +void OSD::heartbeat_check() +{ + ceph_assert(heartbeat_lock.is_locked()); + utime_t now = ceph_clock_now(); + + // check for incoming heartbeats (move me elsewhere?) + for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin(); + p != heartbeat_peers.end(); + ++p) { + + if (p->second.first_tx == utime_t()) { + dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first + << " yet, skipping" << dendl; + continue; + } + + dout(25) << "heartbeat_check osd." << p->first + << " first_tx " << p->second.first_tx + << " last_tx " << p->second.last_tx + << " last_rx_back " << p->second.last_rx_back + << " last_rx_front " << p->second.last_rx_front + << dendl; + if (p->second.is_unhealthy(now)) { + utime_t oldest_deadline = p->second.ping_history.begin()->second.first; + if (p->second.last_rx_back == utime_t() || + p->second.last_rx_front == utime_t()) { + derr << "heartbeat_check: no reply from " + << p->second.con_front->get_peer_addr().get_sockaddr() + << " osd." << p->first + << " ever on either front or back, first ping sent " + << p->second.first_tx + << " (oldest deadline " << oldest_deadline << ")" + << dendl; + // fail + failure_queue[p->first] = p->second.first_tx; + } else { + derr << "heartbeat_check: no reply from " + << p->second.con_front->get_peer_addr().get_sockaddr() + << " osd." 
<< p->first << " since back " << p->second.last_rx_back + << " front " << p->second.last_rx_front + << " (oldest deadline " << oldest_deadline << ")" + << dendl; + // fail + failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front); + } + } + } +} + +void OSD::heartbeat() +{ + ceph_assert(heartbeat_lock.is_locked_by_me()); + dout(30) << "heartbeat" << dendl; + + // get CPU load avg + double loadavgs[1]; + int hb_interval = cct->_conf->osd_heartbeat_interval; + int n_samples = 86400; + if (hb_interval > 1) { + n_samples /= hb_interval; + if (n_samples < 1) + n_samples = 1; + } + + if (getloadavg(loadavgs, 1) == 1) { + logger->set(l_osd_loadavg, 100 * loadavgs[0]); + daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples; + dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl; + } + + dout(30) << "heartbeat checking stats" << dendl; + + // refresh peer list and osd stats + vector<int> hb_peers; + for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin(); + p != heartbeat_peers.end(); + ++p) + hb_peers.push_back(p->first); + + auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs()); + dout(5) << __func__ << " " << new_stat << dendl; + ceph_assert(new_stat.statfs.total); + + float pratio; + float ratio = service.compute_adjusted_ratio(new_stat, &pratio); + + service.check_full_status(ratio, pratio); + + utime_t now = ceph_clock_now(); + utime_t deadline = now; + deadline += cct->_conf->osd_heartbeat_grace; + + // send heartbeats + for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin(); + i != heartbeat_peers.end(); + ++i) { + int peer = i->first; + i->second.last_tx = now; + if (i->second.first_tx == utime_t()) + i->second.first_tx = now; + i->second.ping_history[now] = make_pair(deadline, + HeartbeatInfo::HEARTBEAT_MAX_CONN); + if (i->second.hb_interval_start == utime_t()) + i->second.hb_interval_start = now; + dout(30) << "heartbeat sending ping to osd." << peer << dendl; + i->second.con_back->send_message(new MOSDPing(monc->get_fsid(), + service.get_osdmap_epoch(), + MOSDPing::PING, now, + cct->_conf->osd_heartbeat_min_size)); + + if (i->second.con_front) + i->second.con_front->send_message(new MOSDPing(monc->get_fsid(), + service.get_osdmap_epoch(), + MOSDPing::PING, now, + cct->_conf->osd_heartbeat_min_size)); + } + + logger->set(l_osd_hb_to, heartbeat_peers.size()); + + // hmm.. am i all alone? + dout(30) << "heartbeat lonely?" << dendl; + if (heartbeat_peers.empty()) { + if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) { + last_mon_heartbeat = now; + dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl; + osdmap_subscribe(get_osdmap_epoch() + 1, false); + } + } + + dout(30) << "heartbeat done" << dendl; +} + +bool OSD::heartbeat_reset(Connection *con) +{ + std::lock_guard l(heartbeat_lock); + auto s = con->get_priv(); + con->set_priv(nullptr); + if (s) { + if (is_stopping()) { + return true; + } + auto heartbeat_session = static_cast<HeartbeatSession*>(s.get()); + auto p = heartbeat_peers.find(heartbeat_session->peer); + if (p != heartbeat_peers.end() && + (p->second.con_back == con || + p->second.con_front == con)) { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." 
<< p->second.peer + << ", reopening" << dendl; + if (con != p->second.con_back) { + p->second.con_back->mark_down(); + } + p->second.con_back.reset(NULL); + if (p->second.con_front && con != p->second.con_front) { + p->second.con_front->mark_down(); + } + p->second.con_front.reset(NULL); + pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); + if (newcon.first) { + p->second.con_back = newcon.first.get(); + p->second.con_back->set_priv(s); + if (newcon.second) { + p->second.con_front = newcon.second.get(); + p->second.con_front->set_priv(s); + } + p->second.ping_history.clear(); + } else { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer + << ", raced with osdmap update, closing out peer" << dendl; + heartbeat_peers.erase(p); + } + } else { + dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl; + } + } + return true; +} + + + +// ========================================= + +void OSD::tick() +{ + ceph_assert(osd_lock.is_locked()); + dout(10) << "tick" << dendl; + + if (is_active() || is_waiting_for_healthy()) { + maybe_update_heartbeat_peers(); + } + + if (is_waiting_for_healthy()) { + start_boot(); + } + + if (is_waiting_for_healthy() || is_booting()) { + std::lock_guard l(heartbeat_lock); + utime_t now = ceph_clock_now(); + if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) { + last_mon_heartbeat = now; + dout(1) << __func__ << " checking mon for new map" << dendl; + osdmap_subscribe(get_osdmap_epoch() + 1, false); + } + } + + do_waiters(); + + tick_timer.add_event_after(get_tick_interval(), new C_Tick(this)); +} + +void OSD::tick_without_osd_lock() +{ + ceph_assert(tick_timer_lock.is_locked()); + dout(10) << "tick_without_osd_lock" << dendl; + + logger->set(l_osd_cached_crc, buffer::get_cached_crc()); + logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted()); + logger->set(l_osd_missed_crc, buffer::get_missed_crc()); + + // refresh osd stats + struct store_statfs_t stbuf; + osd_alert_list_t alerts; + int r = store->statfs(&stbuf, &alerts); + ceph_assert(r == 0); + service.set_statfs(stbuf, alerts); + + // osd_lock is not being held, which means the OSD state + // might change when doing the monitor report + if (is_active() || is_waiting_for_healthy()) { + heartbeat_lock.Lock(); + heartbeat_check(); + heartbeat_lock.Unlock(); + + map_lock.get_read(); + std::lock_guard l(mon_report_lock); + + // mon report? 
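+    // report to the mon when our fullness state changes or the regular
+    // osd_mon_report_interval has elapsed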
+ utime_t now = ceph_clock_now(); + if (service.need_fullness_update() || + now - last_mon_report > cct->_conf->osd_mon_report_interval) { + last_mon_report = now; + send_full_update(); + send_failures(); + } + map_lock.put_read(); + + epoch_t max_waiting_epoch = 0; + for (auto s : shards) { + max_waiting_epoch = std::max(max_waiting_epoch, + s->get_max_waiting_epoch()); + } + if (max_waiting_epoch > get_osdmap()->get_epoch()) { + dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch + << ", requesting new map" << dendl; + osdmap_subscribe(superblock.newest_map + 1, false); + } + } + + if (is_active()) { + if (!scrub_random_backoff()) { + sched_scrub(); + } + service.promote_throttle_recalibrate(); + resume_creating_pg(); + bool need_send_beacon = false; + const auto now = ceph::coarse_mono_clock::now(); + { + // borrow lec lock to pretect last_sent_beacon from changing + std::lock_guard l{min_last_epoch_clean_lock}; + const auto elapsed = now - last_sent_beacon; + if (chrono::duration_cast<chrono::seconds>(elapsed).count() > + cct->_conf->osd_beacon_report_interval) { + need_send_beacon = true; + } + } + if (need_send_beacon) { + send_beacon(now); + } + } + + mgrc.update_daemon_health(get_health_metrics()); + service.kick_recovery_queue(); + tick_timer_without_osd_lock.add_event_after(get_tick_interval(), + new C_Tick_WithoutOSDLock(this)); +} + +// Usage: +// setomapval <pool-id> [namespace/]<obj-name> <key> <val> +// rmomapkey <pool-id> [namespace/]<obj-name> <key> +// setomapheader <pool-id> [namespace/]<obj-name> <header> +// getomap <pool> [namespace/]<obj-name> +// truncobj <pool-id> [namespace/]<obj-name> <newlen> +// injectmdataerr [namespace/]<obj-name> [shardid] +// injectdataerr [namespace/]<obj-name> [shardid] +// +// set_recovery_delay [utime] +void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, + std::string_view command, + const cmdmap_t& cmdmap, ostream &ss) +{ + //Test support + //Support changing the omap on a single osd by using the Admin Socket to + //directly request the osd make a change. 
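+  // These hooks are reached through the osd admin socket; an illustrative
+  // invocation (daemon name, pool id and object names assumed) looks
+  // roughly like:
+  //   ceph daemon osd.0 setomapval 1 myobject mykey myvalue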
+ if (command == "setomapval" || command == "rmomapkey" || + command == "setomapheader" || command == "getomap" || + command == "truncobj" || command == "injectmdataerr" || + command == "injectdataerr" + ) { + pg_t rawpg; + int64_t pool; + OSDMapRef curmap = service->get_osdmap(); + int r = -1; + + string poolstr; + + cmd_getval(service->cct, cmdmap, "pool", poolstr); + pool = curmap->lookup_pg_pool_name(poolstr); + //If we can't find it by name then maybe id specified + if (pool < 0 && isdigit(poolstr[0])) + pool = atoll(poolstr.c_str()); + if (pool < 0) { + ss << "Invalid pool '" << poolstr << "''"; + return; + } + + string objname, nspace; + cmd_getval(service->cct, cmdmap, "objname", objname); + std::size_t found = objname.find_first_of('/'); + if (found != string::npos) { + nspace = objname.substr(0, found); + objname = objname.substr(found+1); + } + object_locator_t oloc(pool, nspace); + r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg); + + if (r < 0) { + ss << "Invalid namespace/objname"; + return; + } + + int64_t shardid; + cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD)); + hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace); + ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid))); + spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid)); + if (curmap->pg_is_ec(rawpg)) { + if ((command != "injectdataerr") && (command != "injectmdataerr")) { + ss << "Must not call on ec pool, except injectdataerr or injectmdataerr"; + return; + } + } + + ObjectStore::Transaction t; + + if (command == "setomapval") { + map<string, bufferlist> newattrs; + bufferlist val; + string key, valstr; + cmd_getval(service->cct, cmdmap, "key", key); + cmd_getval(service->cct, cmdmap, "val", valstr); + + val.append(valstr); + newattrs[key] = val; + t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs); + r = store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "rmomapkey") { + string key; + set<string> keys; + cmd_getval(service->cct, cmdmap, "key", key); + + keys.insert(key); + t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys); + r = store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "setomapheader") { + bufferlist newheader; + string headerstr; + + cmd_getval(service->cct, cmdmap, "header", headerstr); + newheader.append(headerstr); + t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader); + r = store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "getomap") { + //Debug: Output entire omap + bufferlist hdrbl; + map<string, bufferlist> keyvals; + auto ch = store->open_collection(coll_t(pgid)); + if (!ch) { + ss << "unable to open collection for " << pgid; + r = -ENOENT; + } else { + r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals); + if (r >= 0) { + ss << "header=" << string(hdrbl.c_str(), hdrbl.length()); + for (map<string, bufferlist>::iterator it = keyvals.begin(); + it != keyvals.end(); ++it) + ss << " key=" << (*it).first << " val=" + << string((*it).second.c_str(), (*it).second.length()); + } else { + ss << "error=" << r; + } + } + } else if (command == "truncobj") { + int64_t trunclen; + cmd_getval(service->cct, cmdmap, "len", trunclen); + t.truncate(coll_t(pgid), ghobject_t(obj), trunclen); + r = 
store->queue_transaction(service->meta_ch, std::move(t)); + if (r < 0) + ss << "error=" << r; + else + ss << "ok"; + } else if (command == "injectdataerr") { + store->inject_data_error(gobj); + ss << "ok"; + } else if (command == "injectmdataerr") { + store->inject_mdata_error(gobj); + ss << "ok"; + } + return; + } + if (command == "set_recovery_delay") { + int64_t delay; + cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0); + ostringstream oss; + oss << delay; + int r = service->cct->_conf.set_val("osd_recovery_delay_start", + oss.str().c_str()); + if (r != 0) { + ss << "set_recovery_delay: error setting " + << "osd_recovery_delay_start to '" << delay << "': error " + << r; + return; + } + service->cct->_conf.apply_changes(nullptr); + ss << "set_recovery_delay: set osd_recovery_delay_start " + << "to " << service->cct->_conf->osd_recovery_delay_start; + return; + } + if (command == "trigger_scrub" || command == "trigger_deep_scrub") { + spg_t pgid; + bool deep = (command == "trigger_deep_scrub"); + OSDMapRef curmap = service->get_osdmap(); + + string pgidstr; + + cmd_getval(service->cct, cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "Invalid pgid specified"; + return; + } + + int64_t time; + cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0); + + PGRef pg = service->osd->_lookup_lock_pg(pgid); + if (pg == nullptr) { + ss << "Can't find pg " << pgid; + return; + } + + if (pg->is_primary()) { + pg->unreg_next_scrub(); + const pg_pool_t *p = curmap->get_pg_pool(pgid.pool()); + double pool_scrub_max_interval = 0; + double scrub_max_interval; + if (deep) { + p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval; + } else { + p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf()->osd_scrub_max_interval; + } + // Instead of marking must_scrub force a schedule scrub + utime_t stamp = ceph_clock_now(); + if (time == 0) + stamp -= scrub_max_interval; + else + stamp -= (float)time; + stamp -= 100.0; // push back last scrub more for good measure + if (deep) { + pg->set_last_deep_scrub_stamp(stamp); + } else { + pg->set_last_scrub_stamp(stamp); + } + pg->reg_next_scrub(); + pg->publish_stats_to_osd(); + ss << "ok - set" << (deep ? 
" deep" : "" ) << " stamp " << stamp; + } else { + ss << "Not primary"; + } + pg->unlock(); + return; + } + if (command == "injectfull") { + int64_t count; + string type; + OSDService::s_names state; + cmd_getval(service->cct, cmdmap, "type", type, string("full")); + cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1); + if (type == "none" || count == 0) { + type = "none"; + count = 0; + } + state = service->get_full_state(type); + if (state == OSDService::s_names::INVALID) { + ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)"; + return; + } + service->set_injectfull(state, count); + return; + } + ss << "Internal error - command=" << command; +} + +// ========================================= + +void OSD::ms_handle_connect(Connection *con) +{ + dout(10) << __func__ << " con " << con << dendl; + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + std::lock_guard l(osd_lock); + if (is_stopping()) + return; + dout(10) << __func__ << " on mon" << dendl; + + if (is_preboot()) { + start_boot(); + } else if (is_booting()) { + _send_boot(); // resend boot message + } else { + map_lock.get_read(); + std::lock_guard l2(mon_report_lock); + + utime_t now = ceph_clock_now(); + last_mon_report = now; + + // resend everything, it's a new session + send_full_update(); + send_alive(); + service.requeue_pg_temp(); + service.clear_sent_ready_to_merge(); + service.send_pg_temp(); + service.send_ready_to_merge(); + service.send_pg_created(); + requeue_failures(); + send_failures(); + + map_lock.put_read(); + if (is_active()) { + send_beacon(ceph::coarse_mono_clock::now()); + } + } + + // full map requests may happen while active or pre-boot + if (requested_full_first) { + rerequest_full_maps(); + } + } +} + +void OSD::ms_handle_fast_connect(Connection *con) +{ + if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON && + con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) { + auto priv = con->get_priv(); + auto s = static_cast<Session*>(priv.get()); + if (!s) { + s = new Session{cct, con}; + con->set_priv(RefCountedPtr{s, false}); + dout(10) << " new session (outgoing) " << s << " con=" << s->con + << " addr=" << s->con->get_peer_addr() << dendl; + // we don't connect to clients + ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD); + s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD); + } + } +} + +void OSD::ms_handle_fast_accept(Connection *con) +{ + if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON && + con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) { + auto priv = con->get_priv(); + auto s = static_cast<Session*>(priv.get()); + if (!s) { + s = new Session{cct, con}; + con->set_priv(RefCountedPtr{s, false}); + dout(10) << "new session (incoming)" << s << " con=" << con + << " addr=" << con->get_peer_addr() + << " must have raced with connect" << dendl; + ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD); + s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD); + } + } +} + +bool OSD::ms_handle_reset(Connection *con) +{ + auto s = con->get_priv(); + auto session = static_cast<Session*>(s.get()); + dout(2) << "ms_handle_reset con " << con << " session " << session << dendl; + if (!session) + return false; + session->wstate.reset(con); + session->con->set_priv(nullptr); + session->con.reset(); // break con <-> session ref cycle + // note that we break session->con *before* the session_handle_reset + // cleanup below. this avoids a race between us and + // PG::add_backoff, Session::check_backoff, etc. 
+ session_handle_reset(SessionRef{session}); + return true; +} + +bool OSD::ms_handle_refused(Connection *con) +{ + if (!cct->_conf->osd_fast_fail_on_connection_refused) + return false; + + auto priv = con->get_priv(); + auto session = static_cast<Session*>(priv.get()); + dout(2) << "ms_handle_refused con " << con << " session " << session << dendl; + if (!session) + return false; + int type = con->get_peer_type(); + // handle only OSD failures here + if (monc && (type == CEPH_ENTITY_TYPE_OSD)) { + OSDMapRef osdmap = get_osdmap(); + if (osdmap) { + int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr()); + if (id >= 0 && osdmap->is_up(id)) { + // I'm cheating mon heartbeat grace logic, because we know it's not going + // to respawn alone. +1 so we won't hit any boundary case. + monc->send_mon_message( + new MOSDFailure( + monc->get_fsid(), + id, + osdmap->get_addrs(id), + cct->_conf->osd_heartbeat_grace + 1, + osdmap->get_epoch(), + MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED + )); + } + } + } + return true; +} + +struct C_OSD_GetVersion : public Context { + OSD *osd; + uint64_t oldest, newest; + explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {} + void finish(int r) override { + if (r >= 0) + osd->_got_mon_epochs(oldest, newest); + } +}; + +void OSD::start_boot() +{ + if (!_is_healthy()) { + // if we are not healthy, do not mark ourselves up (yet) + dout(1) << "not healthy; waiting to boot" << dendl; + if (!is_waiting_for_healthy()) + start_waiting_for_healthy(); + // send pings sooner rather than later + heartbeat_kick(); + return; + } + dout(1) << __func__ << dendl; + set_state(STATE_PREBOOT); + dout(10) << "start_boot - have maps " << superblock.oldest_map + << ".." << superblock.newest_map << dendl; + C_OSD_GetVersion *c = new C_OSD_GetVersion(this); + monc->get_version("osdmap", &c->newest, &c->oldest, c); +} + +void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest) +{ + std::lock_guard l(osd_lock); + if (is_preboot()) { + _preboot(oldest, newest); + } +} + +void OSD::_preboot(epoch_t oldest, epoch_t newest) +{ + ceph_assert(is_preboot()); + dout(10) << __func__ << " _preboot mon has osdmaps " + << oldest << ".." << newest << dendl; + + // ensure our local fullness awareness is accurate + { + std::lock_guard l(heartbeat_lock); + heartbeat(); + } + + const auto osdmap = get_osdmap(); + // if our map within recent history, try to add ourselves to the osdmap. + if (osdmap->get_epoch() == 0) { + derr << "waiting for initial osdmap" << dendl; + } else if (osdmap->is_destroyed(whoami)) { + derr << "osdmap says I am destroyed" << dendl; + // provide a small margin so we don't livelock seeing if we + // un-destroyed ourselves. 
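+    // i.e. only give up once our local map has caught up to the mon's
+    // newest epoch; until then keep fetching maps below, in case a later
+    // epoch shows this osd id re-created.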
+ if (osdmap->get_epoch() > newest - 1) { + exit(0); + } + } else if (osdmap->is_noup(whoami)) { + derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl; + } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { + derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it" + << dendl; + } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) { + derr << "osdmap require_osd_release < luminous; please upgrade to luminous" + << dendl; + } else if (service.need_fullness_update()) { + derr << "osdmap fullness state needs update" << dendl; + send_full_update(); + } else if (osdmap->get_epoch() >= oldest - 1 && + osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) { + + // wait for pgs to fully catch up in a different thread, since + // this thread might be required for splitting and merging PGs to + // make progress. + boot_finisher.queue( + new FunctionContext( + [this](int r) { + std::lock_guard l(osd_lock); + if (is_preboot()) { + dout(10) << __func__ << " waiting for peering work to drain" + << dendl; + osd_lock.Unlock(); + for (auto shard : shards) { + shard->wait_min_pg_epoch(get_osdmap_epoch()); + } + osd_lock.Lock(); + } + if (is_preboot()) { + _send_boot(); + } + })); + return; + } + + // get all the latest maps + if (osdmap->get_epoch() + 1 >= oldest) + osdmap_subscribe(osdmap->get_epoch() + 1, false); + else + osdmap_subscribe(oldest - 1, true); +} + +void OSD::send_full_update() +{ + if (!service.need_fullness_update()) + return; + unsigned state = 0; + if (service.is_full()) { + state = CEPH_OSD_FULL; + } else if (service.is_backfillfull()) { + state = CEPH_OSD_BACKFILLFULL; + } else if (service.is_nearfull()) { + state = CEPH_OSD_NEARFULL; + } + set<string> s; + OSDMap::calc_state_set(state, s); + dout(10) << __func__ << " want state " << s << dendl; + monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state)); +} + +void OSD::start_waiting_for_healthy() +{ + dout(1) << "start_waiting_for_healthy" << dendl; + set_state(STATE_WAITING_FOR_HEALTHY); + last_heartbeat_resample = utime_t(); + + // subscribe to osdmap updates, in case our peers really are known to be dead + osdmap_subscribe(get_osdmap_epoch() + 1, false); +} + +bool OSD::_is_healthy() +{ + if (!cct->get_heartbeat_map()->is_healthy()) { + dout(1) << "is_healthy false -- internal heartbeat failed" << dendl; + return false; + } + + if (is_waiting_for_healthy()) { + utime_t now = ceph_clock_now(); + utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0); + while (!osd_markdown_log.empty() && + osd_markdown_log.front() + grace < now) + osd_markdown_log.pop_front(); + if (osd_markdown_log.size() <= 1) { + dout(5) << __func__ << " first time marked as down," + << " try reboot unconditionally" << dendl; + return true; + } + std::lock_guard l(heartbeat_lock); + int num = 0, up = 0; + for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin(); + p != heartbeat_peers.end(); + ++p) { + if (p->second.is_healthy(now)) + ++up; + ++num; + } + if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) { + dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than " + << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl; + return false; + } + } + + return true; +} + +void OSD::_send_boot() +{ + dout(10) << "_send_boot" << dendl; + Connection *local_connection = + cluster_messenger->get_loopback_connection().get(); + entity_addrvec_t client_addrs = client_messenger->get_myaddrs(); + entity_addrvec_t cluster_addrs = 
cluster_messenger->get_myaddrs(); + entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs(); + entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs(); + + dout(20) << " initial client_addrs " << client_addrs + << ", cluster_addrs " << cluster_addrs + << ", hb_back_addrs " << hb_back_addrs + << ", hb_front_addrs " << hb_front_addrs + << dendl; + if (cluster_messenger->set_addr_unknowns(client_addrs)) { + dout(10) << " assuming cluster_addrs match client_addrs " + << client_addrs << dendl; + cluster_addrs = cluster_messenger->get_myaddrs(); + } + if (auto session = local_connection->get_priv(); !session) { + cluster_messenger->ms_deliver_handle_fast_connect(local_connection); + } + + local_connection = hb_back_server_messenger->get_loopback_connection().get(); + if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) { + dout(10) << " assuming hb_back_addrs match cluster_addrs " + << cluster_addrs << dendl; + hb_back_addrs = hb_back_server_messenger->get_myaddrs(); + } + if (auto session = local_connection->get_priv(); !session) { + hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection); + } + + local_connection = hb_front_server_messenger->get_loopback_connection().get(); + if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) { + dout(10) << " assuming hb_front_addrs match client_addrs " + << client_addrs << dendl; + hb_front_addrs = hb_front_server_messenger->get_myaddrs(); + } + if (auto session = local_connection->get_priv(); !session) { + hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection); + } + + // we now know what our front and back addrs will be, and we are + // about to tell the mon what our metadata (including numa bindings) + // are, so now is a good time! + set_numa_affinity(); + + MOSDBoot *mboot = new MOSDBoot( + superblock, get_osdmap_epoch(), service.get_boot_epoch(), + hb_back_addrs, hb_front_addrs, cluster_addrs, + CEPH_FEATURES_ALL); + dout(10) << " final client_addrs " << client_addrs + << ", cluster_addrs " << cluster_addrs + << ", hb_back_addrs " << hb_back_addrs + << ", hb_front_addrs " << hb_front_addrs + << dendl; + _collect_metadata(&mboot->metadata); + monc->send_mon_message(mboot); + set_state(STATE_BOOTING); +} + +void OSD::_collect_metadata(map<string,string> *pm) +{ + // config info + (*pm)["osd_data"] = dev_path; + if (store->get_type() == "filestore") { + // not applicable for bluestore + (*pm)["osd_journal"] = journal_path; + } + (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs()); + (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs()); + (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs()); + (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs()); + + // backend + (*pm)["osd_objectstore"] = store->get_type(); + (*pm)["rotational"] = store_is_rotational ? "1" : "0"; + (*pm)["journal_rotational"] = journal_is_rotational ? 
"1" : "0"; + (*pm)["default_device_class"] = store->get_default_device_class(); + store->collect_metadata(pm); + + collect_sys_info(pm, cct); + + (*pm)["front_iface"] = pick_iface( + cct, + client_messenger->get_myaddrs().front().get_sockaddr_storage()); + (*pm)["back_iface"] = pick_iface( + cct, + cluster_messenger->get_myaddrs().front().get_sockaddr_storage()); + + // network numa + { + int node = -1; + set<int> nodes; + set<string> unknown; + for (auto nm : { "front_iface", "back_iface" }) { + if (!(*pm)[nm].size()) { + unknown.insert(nm); + continue; + } + int n = -1; + int r = get_iface_numa_node((*pm)[nm], &n); + if (r < 0) { + unknown.insert((*pm)[nm]); + continue; + } + nodes.insert(n); + if (node < 0) { + node = n; + } + } + if (unknown.size()) { + (*pm)["network_numa_unknown_ifaces"] = stringify(unknown); + } + if (!nodes.empty()) { + (*pm)["network_numa_nodes"] = stringify(nodes); + } + if (node >= 0 && nodes.size() == 1 && unknown.empty()) { + (*pm)["network_numa_node"] = stringify(node); + } + } + + if (numa_node >= 0) { + (*pm)["numa_node"] = stringify(numa_node); + (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size, + &numa_cpu_set); + } + + set<string> devnames; + store->get_devices(&devnames); + map<string,string> errs; + get_device_metadata(devnames, pm, &errs); + for (auto& i : errs) { + dout(1) << __func__ << " " << i.first << ": " << i.second << dendl; + } + dout(10) << __func__ << " " << *pm << dendl; +} + +void OSD::queue_want_up_thru(epoch_t want) +{ + map_lock.get_read(); + epoch_t cur = get_osdmap()->get_up_thru(whoami); + std::lock_guard l(mon_report_lock); + if (want > up_thru_wanted) { + dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")" + << ", currently " << cur + << dendl; + up_thru_wanted = want; + send_alive(); + } else { + dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted + << ", currently " << cur + << dendl; + } + map_lock.put_read(); +} + +void OSD::send_alive() +{ + ceph_assert(mon_report_lock.is_locked()); + const auto osdmap = get_osdmap(); + if (!osdmap->exists(whoami)) + return; + epoch_t up_thru = osdmap->get_up_thru(whoami); + dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl; + if (up_thru_wanted > up_thru) { + dout(10) << "send_alive want " << up_thru_wanted << dendl; + monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted)); + } +} + +void OSD::request_full_map(epoch_t first, epoch_t last) +{ + dout(10) << __func__ << " " << first << ".." << last + << ", previously requested " + << requested_full_first << ".." 
<< requested_full_last << dendl; + ceph_assert(osd_lock.is_locked()); + ceph_assert(first > 0 && last > 0); + ceph_assert(first <= last); + ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps + if (requested_full_first == 0) { + // first request + requested_full_first = first; + requested_full_last = last; + } else if (last <= requested_full_last) { + // dup + return; + } else { + // additional request + first = requested_full_last + 1; + requested_full_last = last; + } + MMonGetOSDMap *req = new MMonGetOSDMap; + req->request_full(first, last); + monc->send_mon_message(req); +} + +void OSD::got_full_map(epoch_t e) +{ + ceph_assert(requested_full_first <= requested_full_last); + ceph_assert(osd_lock.is_locked()); + if (requested_full_first == 0) { + dout(20) << __func__ << " " << e << ", nothing requested" << dendl; + return; + } + if (e < requested_full_first) { + dout(10) << __func__ << " " << e << ", requested " << requested_full_first + << ".." << requested_full_last + << ", ignoring" << dendl; + return; + } + if (e >= requested_full_last) { + dout(10) << __func__ << " " << e << ", requested " << requested_full_first + << ".." << requested_full_last << ", resetting" << dendl; + requested_full_first = requested_full_last = 0; + return; + } + + requested_full_first = e + 1; + + dout(10) << __func__ << " " << e << ", requested " << requested_full_first + << ".." << requested_full_last + << ", still need more" << dendl; +} + +void OSD::requeue_failures() +{ + std::lock_guard l(heartbeat_lock); + unsigned old_queue = failure_queue.size(); + unsigned old_pending = failure_pending.size(); + for (auto p = failure_pending.begin(); p != failure_pending.end(); ) { + failure_queue[p->first] = p->second.first; + failure_pending.erase(p++); + } + dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> " + << failure_queue.size() << dendl; +} + +void OSD::send_failures() +{ + ceph_assert(map_lock.is_locked()); + ceph_assert(mon_report_lock.is_locked()); + std::lock_guard l(heartbeat_lock); + utime_t now = ceph_clock_now(); + const auto osdmap = get_osdmap(); + while (!failure_queue.empty()) { + int osd = failure_queue.begin()->first; + if (!failure_pending.count(osd)) { + int failed_for = (int)(double)(now - failure_queue.begin()->second); + monc->send_mon_message( + new MOSDFailure( + monc->get_fsid(), + osd, + osdmap->get_addrs(osd), + failed_for, + osdmap->get_epoch())); + failure_pending[osd] = make_pair(failure_queue.begin()->second, + osdmap->get_addrs(osd)); + } + failure_queue.erase(osd); + } +} + +void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs) +{ + MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch, + MOSDFailure::FLAG_ALIVE); + monc->send_mon_message(m); +} + +void OSD::cancel_pending_failures() +{ + std::lock_guard l(heartbeat_lock); + auto it = failure_pending.begin(); + while (it != failure_pending.end()) { + dout(10) << __func__ << " canceling in-flight failure report for osd." + << it->first << dendl; + send_still_alive(get_osdmap_epoch(), it->first, it->second.second); + failure_pending.erase(it++); + } +} + +void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now) +{ + const auto& monmap = monc->monmap; + // send beacon to mon even if we are just connected, and the monmap is not + // initialized yet by then. 
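+  // MOSDBeacon is only understood by luminous (or newer) mons, hence the
+  // feature check; with a still-uninitialized monmap (epoch 0) we just skip
+  // this round and try again later.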
+ if (monmap.epoch > 0 && + monmap.get_required_features().contains_all( + ceph::features::mon::FEATURE_LUMINOUS)) { + dout(20) << __func__ << " sending" << dendl; + MOSDBeacon* beacon = nullptr; + { + std::lock_guard l{min_last_epoch_clean_lock}; + beacon = new MOSDBeacon(get_osdmap_epoch(), min_last_epoch_clean); + beacon->pgs = min_last_epoch_clean_pgs; + last_sent_beacon = now; + } + monc->send_mon_message(beacon); + } else { + dout(20) << __func__ << " not sending" << dendl; + } +} + +void OSD::handle_command(MMonCommand *m) +{ + if (!require_mon_peer(m)) { + m->put(); + return; + } + + Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL); + command_wq.queue(c); + m->put(); +} + +void OSD::handle_command(MCommand *m) +{ + ConnectionRef con = m->get_connection(); + auto priv = con->get_priv(); + auto session = static_cast<Session *>(priv.get()); + if (!session) { + con->send_message(new MCommandReply(m, -EPERM)); + m->put(); + return; + } + + OSDCap& caps = session->caps; + priv.reset(); + + if (!caps.allow_all() || m->get_source().is_mon()) { + con->send_message(new MCommandReply(m, -EPERM)); + m->put(); + return; + } + + Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get()); + command_wq.queue(c); + + m->put(); +} + +struct OSDCommand { + string cmdstring; + string helpstring; + string module; + string perm; +} osd_commands[] = { + +#define COMMAND(parsesig, helptext, module, perm) \ + {parsesig, helptext, module, perm}, + +// yes, these are really pg commands, but there's a limit to how +// much work it's worth. The OSD returns all of them. Make this +// form (pg <pgid> <cmd>) valid only for the cli. +// Rest uses "tell <pgid> <cmd>" + +COMMAND("pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=query", \ + "show details of a specific pg", "osd", "r") +COMMAND("pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=mark_unfound_lost " \ + "name=mulcmd,type=CephChoices,strings=revert|delete", \ + "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available", + "osd", "rw") +COMMAND("pg " \ + "name=pgid,type=CephPgid " \ + "name=cmd,type=CephChoices,strings=list_unfound " \ + "name=offset,type=CephString,req=false", + "list unfound objects on this pg, perhaps starting at an offset given in JSON", + "osd", "r") + +// new form: tell <pgid> <cmd> for both cli and rest + +COMMAND("query", + "show details of a specific pg", "osd", "r") +COMMAND("mark_unfound_lost " \ + "name=mulcmd,type=CephChoices,strings=revert|delete", \ + "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available", + "osd", "rw") +COMMAND("list_unfound " \ + "name=offset,type=CephString,req=false", + "list unfound objects on this pg, perhaps starting at an offset given in JSON", + "osd", "r") +COMMAND("perf histogram dump " + "name=logger,type=CephString,req=false " + "name=counter,type=CephString,req=false", + "Get histogram data", + "osd", "r") + +// tell <osd.n> commands. 
Validation of osd.n must be special-cased in client +COMMAND("version", "report version of OSD", "osd", "r") +COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r") +COMMAND("injectargs " \ + "name=injected_args,type=CephString,n=N", + "inject configuration arguments into running OSD", + "osd", "rw") +COMMAND("config set " \ + "name=key,type=CephString name=value,type=CephString", + "Set a configuration option at runtime (not persistent)", + "osd", "rw") +COMMAND("config get " \ + "name=key,type=CephString", + "Get a configuration option at runtime", + "osd", "r") +COMMAND("config unset " \ + "name=key,type=CephString", + "Unset a configuration option at runtime (not persistent)", + "osd", "rw") +COMMAND("cluster_log " \ + "name=level,type=CephChoices,strings=error,warning,info,debug " \ + "name=message,type=CephString,n=N", + "log a message to the cluster log", + "osd", "rw") +COMMAND("clear_shards_repaired " \ + "name=count,type=CephInt,req=false", + "clear num_shards_repaired to clear health warning", + "osd", "rw") +COMMAND("bench " \ + "name=count,type=CephInt,req=false " \ + "name=size,type=CephInt,req=false " \ + "name=object_size,type=CephInt,req=false " \ + "name=object_num,type=CephInt,req=false ", \ + "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \ + "(default count=1G default size=4MB). Results in log.", + "osd", "rw") +COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw") +COMMAND("heap " \ + "name=heapcmd,type=CephChoices,strings="\ + "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \ + "name=value,type=CephString,req=false", + "show heap usage info (available only if compiled with tcmalloc)", + "osd", "rw") +COMMAND("debug dump_missing " \ + "name=filename,type=CephFilepath", + "dump missing objects to a named file", "osd", "r") +COMMAND("debug kick_recovery_wq " \ + "name=delay,type=CephInt,range=0", + "set osd_recovery_delay_start to <val>", "osd", "rw") +COMMAND("cpu_profiler " \ + "name=arg,type=CephChoices,strings=status|flush", + "run cpu profiling on daemon", "osd", "rw") +COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics", + "osd", "r") +COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics", + "osd", "rw") +COMMAND("compact", + "compact object store's omap. " + "WARNING: Compaction probably slows your requests", + "osd", "rw") +COMMAND("smart name=devid,type=CephString,req=False", + "runs smartctl on this osd devices. 
", + "osd", "rw") +COMMAND("cache drop", + "Drop all OSD caches", + "osd", "rwx") +COMMAND("cache status", + "Get OSD caches statistics", + "osd", "r") +COMMAND("send_beacon", + "Send OSD beacon to mon immediately", + "osd", "r") +}; + +void OSD::do_command( + Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data) +{ + dout(20) << "do_command tid " << tid << " " << cmd << dendl; + + int r = 0; + stringstream ss, ds; + bufferlist odata; + cmdmap_t cmdmap; + if (cmd.empty()) { + ss << "no command given"; + goto out; + } + if (!cmdmap_from_json(cmd, &cmdmap, ss)) { + r = -EINVAL; + goto out; + } + + try { + r = _do_command(con, cmdmap, tid, data, odata, ss, ds); + } catch (const bad_cmd_get& e) { + r = -EINVAL; + ss << e.what(); + } + if (r == -EAGAIN) { + return; + } + out: + string rs = ss.str(); + odata.append(ds); + dout(0) << "do_command r=" << r << " " << rs << dendl; + clog->info() << rs; + if (con) { + MCommandReply *reply = new MCommandReply(r, rs); + reply->set_tid(tid); + reply->set_data(odata); + con->send_message(reply); + } +} + +namespace { + class unlock_guard { + Mutex& m; + public: + explicit unlock_guard(Mutex& mutex) + : m(mutex) + { + m.unlock(); + } + unlock_guard(unlock_guard&) = delete; + ~unlock_guard() { + m.lock(); + } + }; +} + +int OSD::_do_command( + Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data, + bufferlist& odata, stringstream& ss, stringstream& ds) +{ + int r = 0; + string prefix; + string format; + string pgidstr; + boost::scoped_ptr<Formatter> f; + + cmd_getval(cct, cmdmap, "prefix", prefix); + + if (prefix == "get_command_descriptions") { + int cmdnum = 0; + JSONFormatter *f = new JSONFormatter(); + f->open_object_section("command_descriptions"); + for (OSDCommand *cp = osd_commands; + cp < &osd_commands[std::size(osd_commands)]; cp++) { + + ostringstream secname; + secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(f, con->get_features(), + secname.str(), cp->cmdstring, cp->helpstring, + cp->module, cp->perm, 0); + cmdnum++; + } + f->close_section(); // command_descriptions + + f->flush(ds); + delete f; + goto out; + } + + cmd_getval(cct, cmdmap, "format", format); + f.reset(Formatter::create(format)); + + if (prefix == "version") { + if (f) { + f->open_object_section("version"); + f->dump_string("version", pretty_version_to_str()); + f->close_section(); + f->flush(ds); + } else { + ds << pretty_version_to_str(); + } + goto out; + } + else if (prefix == "injectargs") { + vector<string> argsvec; + cmd_getval(cct, cmdmap, "injected_args", argsvec); + + if (argsvec.empty()) { + r = -EINVAL; + ss << "ignoring empty injectargs"; + goto out; + } + string args = argsvec.front(); + for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a) + args += " " + *a; + unlock_guard unlock{osd_lock}; + r = cct->_conf.injectargs(args, &ss); + } + else if (prefix == "config set") { + std::string key; + std::string val; + cmd_getval(cct, cmdmap, "key", key); + cmd_getval(cct, cmdmap, "value", val); + unlock_guard unlock{osd_lock}; + r = cct->_conf.set_val(key, val, &ss); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + } + else if (prefix == "config get") { + std::string key; + cmd_getval(cct, cmdmap, "key", key); + unlock_guard unlock{osd_lock}; + std::string val; + r = cct->_conf.get_val(key, &val); + if (r == 0) { + ds << val; + } + } + else if (prefix == "config unset") { + std::string key; + cmd_getval(cct, cmdmap, "key", key); + unlock_guard unlock{osd_lock}; + r = 
cct->_conf.rm_val(key); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + if (r == -ENOENT) { + r = 0; // make command idempotent + } + } + else if (prefix == "cluster_log") { + vector<string> msg; + cmd_getval(cct, cmdmap, "message", msg); + if (msg.empty()) { + r = -EINVAL; + ss << "ignoring empty log message"; + goto out; + } + string message = msg.front(); + for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a) + message += " " + *a; + string lvl; + cmd_getval(cct, cmdmap, "level", lvl); + clog_type level = string_to_clog_type(lvl); + if (level < 0) { + r = -EINVAL; + ss << "unknown level '" << lvl << "'"; + goto out; + } + clog->do_log(level, message); + } + else if (prefix == "clear_shards_repaired") { + int64_t count; + cmd_getval(cct, cmdmap, "count", count, (int64_t) 0); + service.set_osd_stat_repaired(count); + } + + // either 'pg <pgid> <command>' or + // 'tell <pgid>' (which comes in without any of that prefix)? + + else if (prefix == "pg" || + prefix == "query" || + prefix == "mark_unfound_lost" || + prefix == "list_unfound" + ) { + pg_t pgid; + + if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) { + ss << "no pgid specified"; + r = -EINVAL; + } else if (!pgid.parse(pgidstr.c_str())) { + ss << "couldn't parse pgid '" << pgidstr << "'"; + r = -EINVAL; + } else { + spg_t pcand; + PGRef pg; + if (get_osdmap()->get_primary_shard(pgid, &pcand) && + (pg = _lookup_lock_pg(pcand))) { + if (pg->is_primary()) { + // simulate pg <pgid> cmd= for pg->do-command + if (prefix != "pg") + cmd_putval(cct, cmdmap, "cmd", prefix); + try { + r = pg->do_command(cmdmap, ss, data, odata, con, tid); + } catch (const bad_cmd_get& e) { + pg->unlock(); + ss << e.what(); + return -EINVAL; + } + if (r == -EAGAIN) { + pg->unlock(); + // don't reply, pg will do so async + return -EAGAIN; + } + } else { + ss << "not primary for pgid " << pgid; + + // send them the latest diff to ensure they realize the mapping + // has changed. + service.send_incremental_map(get_osdmap_epoch() - 1, con, get_osdmap()); + + // do not reply; they will get newer maps and realize they + // need to resend. + pg->unlock(); + return -EAGAIN; + } + pg->unlock(); + } else { + ss << "i don't have pgid " << pgid; + r = -ENOENT; + } + } + } + + else if (prefix == "bench") { + int64_t count; + int64_t bsize; + int64_t osize, onum; + // default count 1G, size 4MB + cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30); + cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20); + cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0); + cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0); + + uint32_t duration = cct->_conf->osd_bench_duration; + + if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) { + // let us limit the block size because the next checks rely on it + // having a sane value. If we allow any block size to be set things + // can still go sideways. + ss << "block 'size' values are capped at " + << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use" + << " a higher value, please adjust 'osd_bench_max_block_size'"; + r = -EINVAL; + goto out; + } else if (bsize < (int64_t) (1 << 20)) { + // entering the realm of small block sizes. 
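// Illustrative note (editorial sketch, not part of the upstream file): as a
// rough sense of scale for the small-block cap computed below, using
// hypothetical config values (not necessarily the shipped defaults), with
// bsize = 64 KiB, osd_bench_duration = 30 and
// osd_bench_small_size_max_iops = 100:
//   max_count = 65536 * 30 * 100 = 196,608,000 bytes (~187 MiB)
// so any larger 'count' is rejected with -EINVAL before the benchmark runs.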
+ // limit the count to a sane value, assuming a configurable amount of + // IOPS and duration, so that the OSD doesn't get hung up on this, + // preventing timeouts from going off + int64_t max_count = + bsize * duration * cct->_conf->osd_bench_small_size_max_iops; + if (count > max_count) { + ss << "'count' values greater than " << max_count + << " for a block size of " << byte_u_t(bsize) << ", assuming " + << cct->_conf->osd_bench_small_size_max_iops << " IOPS," + << " for " << duration << " seconds," + << " can cause ill effects on osd. " + << " Please adjust 'osd_bench_small_size_max_iops' with a higher" + << " value if you wish to use a higher 'count'."; + r = -EINVAL; + goto out; + } + } else { + // 1MB block sizes are big enough so that we get more stuff done. + // However, to avoid the osd from getting hung on this and having + // timers being triggered, we are going to limit the count assuming + // a configurable throughput and duration. + // NOTE: max_count is the total amount of bytes that we believe we + // will be able to write during 'duration' for the given + // throughput. The block size hardly impacts this unless it's + // way too big. Given we already check how big the block size + // is, it's safe to assume everything will check out. + int64_t max_count = + cct->_conf->osd_bench_large_size_max_throughput * duration; + if (count > max_count) { + ss << "'count' values greater than " << max_count + << " for a block size of " << byte_u_t(bsize) << ", assuming " + << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s," + << " for " << duration << " seconds," + << " can cause ill effects on osd. " + << " Please adjust 'osd_bench_large_size_max_throughput'" + << " with a higher value if you wish to use a higher 'count'."; + r = -EINVAL; + goto out; + } + } + + if (osize && bsize > osize) + bsize = osize; + + dout(1) << " bench count " << count + << " bsize " << byte_u_t(bsize) << dendl; + + ObjectStore::Transaction cleanupt; + + if (osize && onum) { + bufferlist bl; + bufferptr bp(osize); + bp.zero(); + bl.push_back(std::move(bp)); + bl.rebuild_page_aligned(); + for (int i=0; i<onum; ++i) { + char nm[30]; + snprintf(nm, sizeof(nm), "disk_bw_test_%d", i); + object_t oid(nm); + hobject_t soid(sobject_t(oid, 0)); + ObjectStore::Transaction t; + t.write(coll_t(), ghobject_t(soid), 0, osize, bl); + store->queue_transaction(service.meta_ch, std::move(t), NULL); + cleanupt.remove(coll_t(), ghobject_t(soid)); + } + } + + bufferlist bl; + bufferptr bp(bsize); + bp.zero(); + bl.push_back(std::move(bp)); + bl.rebuild_page_aligned(); + + { + C_SaferCond waiter; + if (!service.meta_ch->flush_commit(&waiter)) { + waiter.wait(); + } + } + + utime_t start = ceph_clock_now(); + for (int64_t pos = 0; pos < count; pos += bsize) { + char nm[30]; + unsigned offset = 0; + if (onum && osize) { + snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum)); + offset = rand() % (osize / bsize) * bsize; + } else { + snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos); + } + object_t oid(nm); + hobject_t soid(sobject_t(oid, 0)); + ObjectStore::Transaction t; + t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl); + store->queue_transaction(service.meta_ch, std::move(t), NULL); + if (!onum || !osize) + cleanupt.remove(coll_t::meta(), ghobject_t(soid)); + } + + { + C_SaferCond waiter; + if (!service.meta_ch->flush_commit(&waiter)) { + waiter.wait(); + } + } + utime_t end = ceph_clock_now(); + + // clean up + store->queue_transaction(service.meta_ch, 
std::move(cleanupt), NULL); + { + C_SaferCond waiter; + if (!service.meta_ch->flush_commit(&waiter)) { + waiter.wait(); + } + } + + double elapsed = end - start; + double rate = count / elapsed; + double iops = rate / bsize; + if (f) { + f->open_object_section("osd_bench_results"); + f->dump_int("bytes_written", count); + f->dump_int("blocksize", bsize); + f->dump_float("elapsed_sec", elapsed); + f->dump_float("bytes_per_sec", rate); + f->dump_float("iops", iops); + f->close_section(); + f->flush(ds); + } else { + ds << "bench: wrote " << byte_u_t(count) + << " in blocks of " << byte_u_t(bsize) << " in " + << elapsed << " sec at " << byte_u_t(rate) << "/sec " + << si_u_t(iops) << " IOPS"; + } + } + + else if (prefix == "flush_pg_stats") { + mgrc.send_pgstats(); + ds << service.get_osd_stat_seq() << "\n"; + } + + else if (prefix == "heap") { + r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds); + } + + else if (prefix == "debug dump_missing") { + if (!f) { + f.reset(new JSONFormatter(true)); + } + f->open_array_section("pgs"); + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + string s = stringify(pg->pg_id); + f->open_array_section(s.c_str()); + pg->lock(); + pg->dump_missing(f.get()); + pg->unlock(); + f->close_section(); + } + f->close_section(); + f->flush(ds); + } + else if (prefix == "debug kick_recovery_wq") { + int64_t delay; + cmd_getval(cct, cmdmap, "delay", delay); + ostringstream oss; + oss << delay; + unlock_guard unlock{osd_lock}; + r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str()); + if (r != 0) { + ss << "kick_recovery_wq: error setting " + << "osd_recovery_delay_start to '" << delay << "': error " + << r; + goto out; + } + cct->_conf.apply_changes(nullptr); + ss << "kicking recovery queue. set osd_recovery_delay_start " + << "to " << cct->_conf->osd_recovery_delay_start; + } + + else if (prefix == "cpu_profiler") { + string arg; + cmd_getval(cct, cmdmap, "arg", arg); + vector<string> argvec; + get_str_vec(arg, argvec); + cpu_profiler_handle_command(argvec, ds); + } + + else if (prefix == "dump_pg_recovery_stats") { + stringstream s; + if (f) { + pg_recovery_stats.dump_formatted(f.get()); + f->flush(ds); + } else { + pg_recovery_stats.dump(s); + ds << "dump pg recovery stats: " << s.str(); + } + } + + else if (prefix == "reset_pg_recovery_stats") { + ss << "reset pg recovery stats"; + pg_recovery_stats.reset(); + } + + else if (prefix == "perf histogram dump") { + std::string logger; + std::string counter; + cmd_getval(cct, cmdmap, "logger", logger); + cmd_getval(cct, cmdmap, "counter", counter); + if (f) { + cct->get_perfcounters_collection()->dump_formatted_histograms( + f.get(), false, logger, counter); + f->flush(ds); + } + } + + else if (prefix == "compact") { + dout(1) << "triggering manual compaction" << dendl; + auto start = ceph::coarse_mono_clock::now(); + store->compact(); + auto end = ceph::coarse_mono_clock::now(); + double duration = std::chrono::duration<double>(end-start).count(); + dout(1) << "finished manual compaction in " + << duration + << " seconds" << dendl; + ss << "compacted omap in " << duration << " seconds"; + } + + else if (prefix == "smart") { + string devid; + cmd_getval(cct, cmdmap, "devid", devid); + probe_smart(devid, ds); + } + + else if (prefix == "cache drop") { + dout(20) << "clearing all caches" << dendl; + // Clear the objectstore's cache - onode and buffer for Bluestore, + // system's pagecache for Filestore + r = store->flush_cache(&ss); + if (r < 0) { + ds << "Error flushing objectstore cache: " << 
cpp_strerror(r); + goto out; + } + // Clear the objectcontext cache (per PG) + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg: pgs) { + pg->clear_cache(); + } + } + + else if (prefix == "cache status") { + int obj_ctx_count = 0; + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg: pgs) { + obj_ctx_count += pg->get_cache_obj_count(); + } + if (f) { + f->open_object_section("cache_status"); + f->dump_int("object_ctx", obj_ctx_count); + store->dump_cache_stats(f.get()); + f->close_section(); + f->flush(ds); + } else { + ds << "object_ctx: " << obj_ctx_count; + store->dump_cache_stats(ds); + } + } + else if (prefix == "send_beacon") { + if (is_active()) { + send_beacon(ceph::coarse_mono_clock::now()); + } + } else { + ss << "unrecognized command '" << prefix << "'"; + r = -EINVAL; + } + + out: + return r; +} + +void OSD::probe_smart(const string& only_devid, ostream& ss) +{ + set<string> devnames; + store->get_devices(&devnames); + uint64_t smart_timeout = cct->_conf.get_val<uint64_t>( + "osd_smart_report_timeout"); + + // == typedef std::map<std::string, mValue> mObject; + json_spirit::mObject json_map; + + for (auto dev : devnames) { + // smartctl works only on physical devices; filter out any logical device + if (dev.find("dm-") == 0) { + continue; + } + + string err; + string devid = get_device_id(dev, &err); + if (devid.size() == 0) { + dout(10) << __func__ << " no unique id for dev " << dev << " (" + << err << "), skipping" << dendl; + continue; + } + if (only_devid.size() && devid != only_devid) { + continue; + } + + json_spirit::mValue smart_json; + if (block_device_get_metrics(dev, smart_timeout, + &smart_json)) { + dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl; + continue; + } + json_map[devid] = smart_json; + } + json_spirit::write(json_map, ss, json_spirit::pretty_print); +} + +bool OSD::heartbeat_dispatch(Message *m) +{ + dout(30) << "heartbeat_dispatch " << m << dendl; + switch (m->get_type()) { + + case CEPH_MSG_PING: + dout(10) << "ping from " << m->get_source_inst() << dendl; + m->put(); + break; + + case MSG_OSD_PING: + handle_osd_ping(static_cast<MOSDPing*>(m)); + break; + + default: + dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl; + m->put(); + } + + return true; +} + +bool OSD::ms_dispatch(Message *m) +{ + dout(20) << "OSD::ms_dispatch: " << *m << dendl; + if (m->get_type() == MSG_OSD_MARK_ME_DOWN) { + service.got_stop_ack(); + m->put(); + return true; + } + + // lock! + + osd_lock.Lock(); + if (is_stopping()) { + osd_lock.Unlock(); + m->put(); + return true; + } + + do_waiters(); + _dispatch(m); + + osd_lock.Unlock(); + + return true; +} + +void OSD::maybe_share_map( + Session *session, + OpRequestRef op, + OSDMapRef osdmap) +{ + if (!op->check_send_map) { + return; + } + epoch_t last_sent_epoch = 0; + + session->sent_epoch_lock.lock(); + last_sent_epoch = session->last_sent_epoch; + session->sent_epoch_lock.unlock(); + + // assume the peer has the newer of the op's sent_epoch and what + // we think we sent them. + epoch_t from = std::max(last_sent_epoch, op->sent_epoch); + + const Message *m = op->get_req(); + service.share_map( + m->get_source(), + m->get_connection().get(), + from, + osdmap, + session ? 
&last_sent_epoch : NULL); + + session->sent_epoch_lock.lock(); + if (session->last_sent_epoch < last_sent_epoch) { + session->last_sent_epoch = last_sent_epoch; + } + session->sent_epoch_lock.unlock(); + + op->check_send_map = false; +} + +void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap) +{ + ceph_assert(session->session_dispatch_lock.is_locked()); + + auto i = session->waiting_on_map.begin(); + while (i != session->waiting_on_map.end()) { + OpRequestRef op = &(*i); + ceph_assert(ms_can_fast_dispatch(op->get_req())); + const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>( + op->get_req()); + if (m->get_min_epoch() > osdmap->get_epoch()) { + break; + } + session->waiting_on_map.erase(i++); + op->put(); + + spg_t pgid; + if (m->get_type() == CEPH_MSG_OSD_OP) { + pg_t actual_pgid = osdmap->raw_pg_to_pg( + static_cast<const MOSDOp*>(m)->get_pg()); + if (!osdmap->get_primary_shard(actual_pgid, &pgid)) { + continue; + } + } else { + pgid = m->get_spg(); + } + enqueue_op(pgid, std::move(op), m->get_map_epoch()); + } + + if (session->waiting_on_map.empty()) { + clear_session_waiting_on_map(session); + } else { + register_session_waiting_on_map(session); + } +} + +void OSD::ms_fast_dispatch(Message *m) +{ + FUNCTRACE(cct); + if (service.is_stopping()) { + m->put(); + return; + } + + // peering event? + switch (m->get_type()) { + case CEPH_MSG_PING: + dout(10) << "ping from " << m->get_source() << dendl; + m->put(); + return; + case MSG_MON_COMMAND: + handle_command(static_cast<MMonCommand*>(m)); + return; + case MSG_OSD_FORCE_RECOVERY: + handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m)); + return; + case MSG_OSD_SCRUB2: + handle_fast_scrub(static_cast<MOSDScrub2*>(m)); + return; + + case MSG_OSD_PG_CREATE2: + return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m)); + case MSG_OSD_PG_QUERY: + return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m)); + case MSG_OSD_PG_NOTIFY: + return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m)); + case MSG_OSD_PG_INFO: + return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m)); + case MSG_OSD_PG_REMOVE: + return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m)); + + // these are single-pg messages that handle themselves + case MSG_OSD_PG_LOG: + case MSG_OSD_PG_TRIM: + case MSG_OSD_BACKFILL_RESERVE: + case MSG_OSD_RECOVERY_RESERVE: + { + MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m); + if (require_osd_peer(pm)) { + enqueue_peering_evt( + pm->get_spg(), + PGPeeringEventRef(pm->get_event())); + } + pm->put(); + return; + } + } + + OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m); + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = op->get_reqid(); +#endif + tracepoint(osd, ms_fast_dispatch, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + if (m->trace) + op->osd_trace.init("osd op", &trace_endpoint, &m->trace); + + // note sender epoch, min req's epoch + op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch(); + op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch(); + ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check! 
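// Illustrative note (editorial sketch, not part of the upstream file):
// dispatch_session_waiting() above drains a session's queue in FIFO order
// but stops at the first op whose min_epoch is newer than the osdmap the
// OSD currently has.  For example, with the OSD at epoch 103 and queued ops
// whose min_epochs are 101, 103 and 105, the first two are enqueued to
// their PGs and the op needing 105 stays parked on the session until a
// newer map is consumed.  The epoch values are invented for illustration.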
+ + service.maybe_inject_dispatch_delay(); + + if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) || + m->get_type() != CEPH_MSG_OSD_OP) { + // queue it directly + enqueue_op( + static_cast<MOSDFastDispatchOp*>(m)->get_spg(), + std::move(op), + static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch()); + } else { + // legacy client, and this is an MOSDOp (the *only* fast dispatch + // message that didn't have an explicit spg_t); we need to map + // them to an spg_t while preserving delivery order. + auto priv = m->get_connection()->get_priv(); + if (auto session = static_cast<Session*>(priv.get()); session) { + std::lock_guard l{session->session_dispatch_lock}; + op->get(); + session->waiting_on_map.push_back(*op); + OSDMapRef nextmap = service.get_nextmap_reserved(); + dispatch_session_waiting(session, nextmap); + service.release_map(nextmap); + } + } + OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false); +} + +bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) +{ + dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl; + + if (is_stopping()) { + dout(10) << __func__ << " bailing, we are shutting down" << dendl; + return false; + } + + if (dest_type == CEPH_ENTITY_TYPE_MON) + return true; + + *authorizer = monc->build_authorizer(dest_type); + return *authorizer != NULL; +} + +KeyStore *OSD::ms_get_auth1_authorizer_keystore() +{ + return monc->rotating_secrets.get(); +} + +int OSD::ms_handle_authentication(Connection *con) +{ + int ret = 0; + auto priv = con->get_priv(); + Session *s = static_cast<Session*>(priv.get()); + if (!s) { + s = new Session(cct, con); + con->set_priv(RefCountedPtr{s, false}); + s->entity_name = con->get_peer_entity_name(); + dout(10) << __func__ << " new session " << s << " con " << s->con + << " entity " << s->entity_name + << " addr " << con->get_peer_addrs() << dendl; + } else { + dout(10) << __func__ << " existing session " << s << " con " << s->con + << " entity " << s->entity_name + << " addr " << con->get_peer_addrs() << dendl; + } + + AuthCapsInfo &caps_info = con->get_peer_caps_info(); + if (caps_info.allow_all) + s->caps.set_allow_all(); + + if (caps_info.caps.length() > 0) { + bufferlist::const_iterator p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } + catch (buffer::error& e) { + dout(10) << __func__ << " session " << s << " " << s->entity_name + << " failed to decode caps string" << dendl; + ret = -EPERM; + } + if (!ret) { + bool success = s->caps.parse(str); + if (success) { + dout(10) << __func__ << " session " << s + << " " << s->entity_name + << " has caps " << s->caps << " '" << str << "'" << dendl; + ret = 1; + } else { + dout(10) << __func__ << " session " << s << " " << s->entity_name + << " failed to parse caps '" << str << "'" << dendl; + ret = -EPERM; + } + } + } + return ret; +} + +void OSD::do_waiters() +{ + ceph_assert(osd_lock.is_locked()); + + dout(10) << "do_waiters -- start" << dendl; + while (!finished.empty()) { + OpRequestRef next = finished.front(); + finished.pop_front(); + dispatch_op(next); + } + dout(10) << "do_waiters -- finish" << dendl; +} + +void OSD::dispatch_op(OpRequestRef op) +{ + switch (op->get_req()->get_type()) { + + case MSG_OSD_PG_CREATE: + handle_pg_create(op); + break; + } +} + +void OSD::_dispatch(Message *m) +{ + ceph_assert(osd_lock.is_locked()); + dout(20) << "_dispatch " << m << " " << *m << dendl; + + switch (m->get_type()) { + // -- don't need OSDMap -- + + // map and replication + case 
CEPH_MSG_OSD_MAP: + handle_osd_map(static_cast<MOSDMap*>(m)); + break; + + // osd + case MSG_OSD_SCRUB: + handle_scrub(static_cast<MOSDScrub*>(m)); + break; + + case MSG_COMMAND: + handle_command(static_cast<MCommand*>(m)); + return; + + // -- need OSDMap -- + + case MSG_OSD_PG_CREATE: + { + OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m); + if (m->trace) + op->osd_trace.init("osd op", &trace_endpoint, &m->trace); + // no map? starting up? + if (!get_osdmap()) { + dout(7) << "no OSDMap, not booted" << dendl; + logger->inc(l_osd_waiting_for_map); + waiting_for_osdmap.push_back(op); + op->mark_delayed("no osdmap"); + break; + } + + // need OSDMap + dispatch_op(op); + } + } +} + +// remove me post-nautilus +void OSD::handle_scrub(MOSDScrub *m) +{ + dout(10) << "handle_scrub " << *m << dendl; + if (!require_mon_or_mgr_peer(m)) { + m->put(); + return; + } + if (m->fsid != monc->get_fsid()) { + dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() + << dendl; + m->put(); + return; + } + + vector<spg_t> spgs; + _get_pgids(&spgs); + + if (!m->scrub_pgs.empty()) { + vector<spg_t> v; + for (auto pgid : m->scrub_pgs) { + spg_t pcand; + if (get_osdmap()->get_primary_shard(pgid, &pcand) && + std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) { + v.push_back(pcand); + } + } + spgs.swap(v); + } + + for (auto pgid : spgs) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PG::RequestScrub(m->deep, m->repair)))); + } + + m->put(); +} + +void OSD::handle_fast_scrub(MOSDScrub2 *m) +{ + dout(10) << __func__ << " " << *m << dendl; + if (!require_mon_or_mgr_peer(m)) { + m->put(); + return; + } + if (m->fsid != monc->get_fsid()) { + dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid() + << dendl; + m->put(); + return; + } + for (auto pgid : m->scrub_pgs) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + m->epoch, + m->epoch, + PG::RequestScrub(m->deep, m->repair)))); + } + m->put(); +} + +bool OSD::scrub_random_backoff() +{ + bool coin_flip = (rand() / (double)RAND_MAX >= + cct->_conf->osd_scrub_backoff_ratio); + if (!coin_flip) { + dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl; + return true; + } + return false; +} + +OSDService::ScrubJob::ScrubJob(CephContext* cct, + const spg_t& pg, const utime_t& timestamp, + double pool_scrub_min_interval, + double pool_scrub_max_interval, bool must) + : cct(cct), + pgid(pg), + sched_time(timestamp), + deadline(timestamp) +{ + // if not explicitly requested, postpone the scrub with a random delay + if (!must) { + double scrub_min_interval = pool_scrub_min_interval > 0 ? + pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval; + double scrub_max_interval = pool_scrub_max_interval > 0 ? 
+ pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval; + + sched_time += scrub_min_interval; + double r = rand() / (double)RAND_MAX; + sched_time += + scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r; + if (scrub_max_interval == 0) { + deadline = utime_t(); + } else { + deadline += scrub_max_interval; + } + + } +} + +bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const { + if (sched_time < rhs.sched_time) + return true; + if (sched_time > rhs.sched_time) + return false; + return pgid < rhs.pgid; +} + +bool OSD::scrub_time_permit(utime_t now) +{ + struct tm bdt; + time_t tt = now.sec(); + localtime_r(&tt, &bdt); + + bool day_permit = false; + if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) { + if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) { + day_permit = true; + } + } else { + if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) { + day_permit = true; + } + } + + if (!day_permit) { + dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day + << " - " << cct->_conf->osd_scrub_end_week_day + << " now " << bdt.tm_wday << " = no" << dendl; + return false; + } + + bool time_permit = false; + if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) { + if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) { + time_permit = true; + } + } else { + if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) { + time_permit = true; + } + } + if (!time_permit) { + dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour + << " - " << cct->_conf->osd_scrub_end_hour + << " now " << bdt.tm_hour << " = no" << dendl; + } else { + dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour + << " - " << cct->_conf->osd_scrub_end_hour + << " now " << bdt.tm_hour << " = yes" << dendl; + } + return time_permit; +} + +bool OSD::scrub_load_below_threshold() +{ + double loadavgs[3]; + if (getloadavg(loadavgs, 3) != 3) { + dout(10) << __func__ << " couldn't read loadavgs\n" << dendl; + return false; + } + + // allow scrub if below configured threshold + long cpus = sysconf(_SC_NPROCESSORS_ONLN); + double loadavg_per_cpu = cpus > 0 ? 
loadavgs[0] / cpus : loadavgs[0]; + if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) { + dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu + << " < max " << cct->_conf->osd_scrub_load_threshold + << " = yes" << dendl; + return true; + } + + // allow scrub if below daily avg and currently decreasing + if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) { + dout(20) << __func__ << " loadavg " << loadavgs[0] + << " < daily_loadavg " << daily_loadavg + << " and < 15m avg " << loadavgs[2] + << " = yes" << dendl; + return true; + } + + dout(20) << __func__ << " loadavg " << loadavgs[0] + << " >= max " << cct->_conf->osd_scrub_load_threshold + << " and ( >= daily_loadavg " << daily_loadavg + << " or >= 15m avg " << loadavgs[2] + << ") = no" << dendl; + return false; +} + +void OSD::sched_scrub() +{ + // if not permitted, fail fast + if (!service.can_inc_scrubs()) { + return; + } + bool allow_requested_repair_only = false; + if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) { + if (!cct->_conf->osd_repair_during_recovery) { + dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl; + return; + } + dout(10) << __func__ + << " will only schedule explicitly requested repair due to active recovery" + << dendl; + allow_requested_repair_only = true; + } + + utime_t now = ceph_clock_now(); + bool time_permit = scrub_time_permit(now); + bool load_is_low = scrub_load_below_threshold(); + dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl; + + OSDService::ScrubJob scrub; + if (service.first_scrub_stamp(&scrub)) { + do { + dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl; + + if (scrub.sched_time > now) { + // save ourselves some effort + dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time + << " > " << now << dendl; + break; + } + + if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) { + dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to " + << (!time_permit ? "time not permit" : "high load") << dendl; + continue; + } + + PGRef pg = _lookup_lock_pg(scrub.pgid); + if (!pg) + continue; + // This has already started, so go on to the next scrub job + if (pg->scrubber.active) { + pg->unlock(); + dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl; + continue; + } + // Skip other kinds of scrubing if only explicitly requested repairing is allowed + if (allow_requested_repair_only && !pg->scrubber.must_repair) { + pg->unlock(); + dout(10) << __func__ << " skip " << scrub.pgid + << " because repairing is not explicitly requested on it" + << dendl; + continue; + } + // If it is reserving, let it resolve before going to the next scrub job + if (pg->scrubber.local_reserved && !pg->scrubber.active) { + pg->unlock(); + dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl; + break; + } + dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time + << (pg->get_must_scrub() ? ", explicitly requested" : + (load_is_low ? 
", load_is_low" : " deadline < now")) + << dendl; + if (pg->sched_scrub()) { + pg->unlock(); + break; + } + pg->unlock(); + } while (service.next_scrub_stamp(scrub, &scrub)); + } + dout(20) << "sched_scrub done" << dendl; +} + +void OSD::resched_all_scrubs() +{ + dout(10) << __func__ << ": start" << dendl; + OSDService::ScrubJob scrub; + if (service.first_scrub_stamp(&scrub)) { + do { + dout(20) << __func__ << ": examine " << scrub.pgid << dendl; + + PGRef pg = _lookup_lock_pg(scrub.pgid); + if (!pg) + continue; + if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) { + dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl; + pg->on_info_history_change(); + } + pg->unlock(); + } while (service.next_scrub_stamp(scrub, &scrub)); + } + dout(10) << __func__ << ": done" << dendl; +} + +MPGStats* OSD::collect_pg_stats() +{ + // This implementation unconditionally sends every is_primary PG's + // stats every time we're called. This has equivalent cost to the + // previous implementation's worst case where all PGs are busy and + // their stats are always enqueued for sending. + RWLock::RLocker l(map_lock); + + utime_t had_for = ceph_clock_now() - had_map_since; + osd_stat_t cur_stat = service.get_osd_stat(); + cur_stat.os_perf_stat = store->get_cur_stats(); + + auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch(), had_for); + m->osd_stat = cur_stat; + + std::lock_guard lec{min_last_epoch_clean_lock}; + min_last_epoch_clean = get_osdmap_epoch(); + min_last_epoch_clean_pgs.clear(); + + std::set<int64_t> pool_set; + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + auto pool = pg->pg_id.pgid.pool(); + pool_set.emplace((int64_t)pool); + if (!pg->is_primary()) { + continue; + } + pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) { + m->pg_stat[pg->pg_id.pgid] = s; + min_last_epoch_clean = min(min_last_epoch_clean, lec); + min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid); + }); + } + store_statfs_t st; + bool per_pool_stats = false; + for (auto p : pool_set) { + int r = store->pool_statfs(p, &st); + if (r == -ENOTSUP) { + break; + } else { + assert(r >= 0); + m->pool_stat[p] = st; + per_pool_stats = true; + } + } + + // indicate whether we are reporting per-pool stats + m->osd_stat.num_osds = 1; + m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0; + + return m; +} + +vector<DaemonHealthMetric> OSD::get_health_metrics() +{ + vector<DaemonHealthMetric> metrics; + { + utime_t oldest_secs; + const utime_t now = ceph_clock_now(); + auto too_old = now; + too_old -= cct->_conf.get_val<double>("osd_op_complaint_time"); + int slow = 0; + TrackedOpRef oldest_op; + auto count_slow_ops = [&](TrackedOp& op) { + if (op.get_initiated() < too_old) { + stringstream ss; + ss << "slow request " << op.get_desc() + << " initiated " + << op.get_initiated() + << " currently " + << op.state_string(); + lgeneric_subdout(cct,osd,20) << ss.str() << dendl; + clog->warn() << ss.str(); + slow++; + if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) { + oldest_op = &op; + } + return true; + } else { + return false; + } + }; + if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) { + if (slow) { + derr << __func__ << " reporting " << slow << " slow ops, oldest is " + << oldest_op->get_desc() << dendl; + } + metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs); + } else { + // no news is not good news. 
+ metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0); + } + } + { + std::lock_guard l(pending_creates_lock); + auto n_primaries = pending_creates_from_mon; + for (const auto& create : pending_creates_from_osd) { + if (create.second) { + n_primaries++; + } + } + metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries); + } + return metrics; +} + +// ===================================================== +// MAP + +void OSD::wait_for_new_map(OpRequestRef op) +{ + // ask? + if (waiting_for_osdmap.empty()) { + osdmap_subscribe(get_osdmap_epoch() + 1, false); + } + + logger->inc(l_osd_waiting_for_map); + waiting_for_osdmap.push_back(op); + op->mark_delayed("wait for new map"); +} + + +/** update_map + * assimilate new OSDMap(s). scan pgs, etc. + */ + +void OSD::note_down_osd(int peer) +{ + ceph_assert(osd_lock.is_locked()); + cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer)); + + heartbeat_lock.Lock(); + failure_queue.erase(peer); + failure_pending.erase(peer); + map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer); + if (p != heartbeat_peers.end()) { + p->second.con_back->mark_down(); + if (p->second.con_front) { + p->second.con_front->mark_down(); + } + heartbeat_peers.erase(p); + } + heartbeat_lock.Unlock(); +} + +void OSD::note_up_osd(int peer) +{ + service.forget_peer_epoch(peer, get_osdmap_epoch() - 1); + heartbeat_set_peers_need_update(); +} + +struct C_OnMapCommit : public Context { + OSD *osd; + epoch_t first, last; + MOSDMap *msg; + C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m) + : osd(o), first(f), last(l), msg(m) {} + void finish(int r) override { + osd->_committed_osd_maps(first, last, msg); + msg->put(); + } +}; + +void OSD::osdmap_subscribe(version_t epoch, bool force_request) +{ + std::lock_guard l(osdmap_subscribe_lock); + if (latest_subscribed_epoch >= epoch && !force_request) + return; + + latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch); + + if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) || + force_request) { + monc->renew_subs(); + } +} + +void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps) +{ + epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); + if (min <= superblock.oldest_map) + return; + + int num = 0; + ObjectStore::Transaction t; + for (epoch_t e = superblock.oldest_map; e < min; ++e) { + dout(20) << " removing old osdmap epoch " << e << dendl; + t.remove(coll_t::meta(), get_osdmap_pobject_name(e)); + t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e)); + superblock.oldest_map = e + 1; + num++; + if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) { + service.publish_superblock(superblock); + write_superblock(t); + int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); + ceph_assert(tr == 0); + num = 0; + if (!skip_maps) { + // skip_maps leaves us with a range of old maps if we fail to remove all + // of them before moving superblock.oldest_map forward to the first map + // in the incoming MOSDMap msg. so we should continue removing them in + // this case, even we could do huge series of delete transactions all at + // once. 
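// Illustrative note (editorial sketch, not part of the upstream file): in
// the common (!skip_maps) case trimming is deliberately bounded; roughly
// osd_target_transaction_size stale epochs are removed per incoming MOSDMap,
// and superblock.oldest_map is persisted with each batch so progress
// survives a restart.  For example, with a backlog of 500 stale epochs and a
// target of 30, each new map message would retire on the order of 30 of
// them.  The numbers are invented for illustration.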
+ break; + } + } + } + if (num > 0) { + service.publish_superblock(superblock); + write_superblock(t); + int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); + ceph_assert(tr == 0); + } + // we should not remove the cached maps + ceph_assert(min <= service.map_cache.cached_key_lower_bound()); +} + +void OSD::handle_osd_map(MOSDMap *m) +{ + // wait for pgs to catch up + { + // we extend the map cache pins to accomodate pgs slow to consume maps + // for some period, until we hit the max_lag_factor bound, at which point + // we block here to stop injesting more maps than they are able to keep + // up with. + epoch_t max_lag = cct->_conf->osd_map_cache_size * + m_osd_pg_epoch_max_lag_factor; + ceph_assert(max_lag > 0); + epoch_t osd_min = 0; + for (auto shard : shards) { + epoch_t min = shard->get_min_pg_epoch(); + if (osd_min == 0 || min < osd_min) { + osd_min = min; + } + } + epoch_t osdmap_epoch = get_osdmap_epoch(); + if (osd_min > 0 && + osdmap_epoch > max_lag && + osdmap_epoch - max_lag > osd_min) { + epoch_t need = osdmap_epoch - max_lag; + dout(10) << __func__ << " waiting for pgs to catch up (need " << need + << " max_lag " << max_lag << ")" << dendl; + for (auto shard : shards) { + epoch_t min = shard->get_min_pg_epoch(); + if (need > min) { + dout(10) << __func__ << " waiting for pgs to consume " << need + << " (shard " << shard->shard_id << " min " << min + << ", map cache is " << cct->_conf->osd_map_cache_size + << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor + << ")" << dendl; + unlock_guard unlock{osd_lock}; + shard->wait_min_pg_epoch(need); + } + } + } + } + + ceph_assert(osd_lock.is_locked()); + map<epoch_t,OSDMapRef> added_maps; + map<epoch_t,bufferlist> added_maps_bl; + if (m->fsid != monc->get_fsid()) { + dout(0) << "handle_osd_map fsid " << m->fsid << " != " + << monc->get_fsid() << dendl; + m->put(); + return; + } + if (is_initializing()) { + dout(0) << "ignoring osdmap until we have initialized" << dendl; + m->put(); + return; + } + + auto priv = m->get_connection()->get_priv(); + if (auto session = static_cast<Session *>(priv.get()); + session && !(session->entity_name.is_mon() || + session->entity_name.is_osd())) { + //not enough perms! + dout(10) << "got osd map from Session " << session + << " which we can't take maps from (not a mon or osd)" << dendl; + m->put(); + return; + } + + // share with the objecter + if (!is_preboot()) + service.objecter->handle_osd_map(m); + + epoch_t first = m->get_first(); + epoch_t last = m->get_last(); + dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have " + << superblock.newest_map + << ", src has [" << m->oldest_map << "," << m->newest_map << "]" + << dendl; + + logger->inc(l_osd_map); + logger->inc(l_osd_mape, last - first + 1); + if (first <= superblock.newest_map) + logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1); + if (service.max_oldest_map < m->oldest_map) { + service.max_oldest_map = m->oldest_map; + ceph_assert(service.max_oldest_map >= superblock.oldest_map); + } + + // make sure there is something new, here, before we bother flushing + // the queues and such + if (last <= superblock.newest_map) { + dout(10) << " no new maps here, dropping" << dendl; + m->put(); + return; + } + + // missing some? + bool skip_maps = false; + if (first > superblock.newest_map + 1) { + dout(10) << "handle_osd_map message skips epochs " + << superblock.newest_map + 1 << ".." 
<< (first-1) << dendl; + if (m->oldest_map <= superblock.newest_map + 1) { + osdmap_subscribe(superblock.newest_map + 1, false); + m->put(); + return; + } + // always try to get the full range of maps--as many as we can. this + // 1- is good to have + // 2- is at present the only way to ensure that we get a *full* map as + // the first map! + if (m->oldest_map < first) { + osdmap_subscribe(m->oldest_map - 1, true); + m->put(); + return; + } + skip_maps = true; + } + + ObjectStore::Transaction t; + uint64_t txn_size = 0; + + // store new maps: queue for disk and put in the osdmap cache + epoch_t start = std::max(superblock.newest_map + 1, first); + for (epoch_t e = start; e <= last; e++) { + if (txn_size >= t.get_num_bytes()) { + derr << __func__ << " transaction size overflowed" << dendl; + ceph_assert(txn_size < t.get_num_bytes()); + } + txn_size = t.get_num_bytes(); + map<epoch_t,bufferlist>::iterator p; + p = m->maps.find(e); + if (p != m->maps.end()) { + dout(10) << "handle_osd_map got full map for epoch " << e << dendl; + OSDMap *o = new OSDMap; + bufferlist& bl = p->second; + + o->decode(bl); + + ghobject_t fulloid = get_osdmap_pobject_name(e); + t.write(coll_t::meta(), fulloid, 0, bl.length(), bl); + added_maps[e] = add_map(o); + added_maps_bl[e] = bl; + got_full_map(e); + continue; + } + + p = m->incremental_maps.find(e); + if (p != m->incremental_maps.end()) { + dout(10) << "handle_osd_map got inc map for epoch " << e << dendl; + bufferlist& bl = p->second; + ghobject_t oid = get_inc_osdmap_pobject_name(e); + t.write(coll_t::meta(), oid, 0, bl.length(), bl); + + OSDMap *o = new OSDMap; + if (e > 1) { + bufferlist obl; + bool got = get_map_bl(e - 1, obl); + if (!got) { + auto p = added_maps_bl.find(e - 1); + ceph_assert(p != added_maps_bl.end()); + obl = p->second; + } + o->decode(obl); + } + + OSDMap::Incremental inc; + auto p = bl.cbegin(); + inc.decode(p); + + if (o->apply_incremental(inc) < 0) { + derr << "ERROR: bad fsid? 
i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl; + ceph_abort_msg("bad fsid"); + } + + bufferlist fbl; + o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); + + bool injected_failure = false; + if (cct->_conf->osd_inject_bad_map_crc_probability > 0 && + (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) { + derr << __func__ << " injecting map crc failure" << dendl; + injected_failure = true; + } + + if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) { + dout(2) << "got incremental " << e + << " but failed to encode full with correct crc; requesting" + << dendl; + clog->warn() << "failed to encode map e" << e << " with expected crc"; + dout(20) << "my encoded map was:\n"; + fbl.hexdump(*_dout); + *_dout << dendl; + delete o; + request_full_map(e, last); + last = e - 1; + + // don't continue committing if we failed to enc the first inc map + if (last < start) { + dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl; + m->put(); + return; + } + break; + } + got_full_map(e); + + ghobject_t fulloid = get_osdmap_pobject_name(e); + t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl); + added_maps[e] = add_map(o); + added_maps_bl[e] = fbl; + continue; + } + + ceph_abort_msg("MOSDMap lied about what maps it had?"); + } + + // even if this map isn't from a mon, we may have satisfied our subscription + monc->sub_got("osdmap", last); + + if (!m->maps.empty() && requested_full_first) { + dout(10) << __func__ << " still missing full maps " << requested_full_first + << ".." << requested_full_last << dendl; + rerequest_full_maps(); + } + + if (superblock.oldest_map) { + // make sure we at least keep pace with incoming maps + trim_maps(m->oldest_map, last - first + 1, skip_maps); + pg_num_history.prune(superblock.oldest_map); + } + + if (!superblock.oldest_map || skip_maps) + superblock.oldest_map = first; + superblock.newest_map = last; + superblock.current_epoch = last; + + // note in the superblock that we were clean thru the prior epoch + epoch_t boot_epoch = service.get_boot_epoch(); + if (boot_epoch && boot_epoch >= superblock.mounted) { + superblock.mounted = boot_epoch; + superblock.clean_thru = last; + } + + // check for pg_num changes and deleted pools + OSDMapRef lastmap; + for (auto& i : added_maps) { + if (!lastmap) { + if (!(lastmap = service.try_get_map(i.first - 1))) { + dout(10) << __func__ << " can't get previous map " << i.first - 1 + << " probably first start of this osd" << dendl; + continue; + } + } + ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch()); + for (auto& j : lastmap->get_pools()) { + if (!i.second->have_pg_pool(j.first)) { + pg_num_history.log_pool_delete(i.first, j.first); + dout(10) << __func__ << " recording final pg_pool_t for pool " + << j.first << dendl; + // this information is needed by _make_pg() if have to restart before + // the pool is deleted and need to instantiate a new (zombie) PG[Pool]. 
+ ghobject_t obj = make_final_pool_info_oid(j.first); + bufferlist bl; + encode(j.second, bl, CEPH_FEATURES_ALL); + string name = lastmap->get_pool_name(j.first); + encode(name, bl); + map<string,string> profile; + if (lastmap->get_pg_pool(j.first)->is_erasure()) { + profile = lastmap->get_erasure_code_profile( + lastmap->get_pg_pool(j.first)->erasure_code_profile); + } + encode(profile, bl); + t.write(coll_t::meta(), obj, 0, bl.length(), bl); + service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num()); + } else if (unsigned new_pg_num = i.second->get_pg_num(j.first); + new_pg_num != j.second.get_pg_num()) { + dout(10) << __func__ << " recording pool " << j.first << " pg_num " + << j.second.get_pg_num() << " -> " << new_pg_num << dendl; + pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num); + } + } + for (auto& j : i.second->get_pools()) { + if (!lastmap->have_pg_pool(j.first)) { + dout(10) << __func__ << " recording new pool " << j.first << " pg_num " + << j.second.get_pg_num() << dendl; + pg_num_history.log_pg_num_change(i.first, j.first, + j.second.get_pg_num()); + } + } + lastmap = i.second; + } + pg_num_history.epoch = last; + { + bufferlist bl; + ::encode(pg_num_history, bl); + t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl); + dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl; + } + + // superblock and commit + write_superblock(t); + t.register_on_commit(new C_OnMapCommit(this, start, last, m)); + store->queue_transaction( + service.meta_ch, + std::move(t)); + service.publish_superblock(superblock); +} + +void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) +{ + dout(10) << __func__ << " " << first << ".." << last << dendl; + if (is_stopping()) { + dout(10) << __func__ << " bailing, we are shutting down" << dendl; + return; + } + std::lock_guard l(osd_lock); + if (is_stopping()) { + dout(10) << __func__ << " bailing, we are shutting down" << dendl; + return; + } + map_lock.get_write(); + + ceph_assert(first <= last); + + bool do_shutdown = false; + bool do_restart = false; + bool network_error = false; + OSDMapRef osdmap = get_osdmap(); + + // advance through the new maps + for (epoch_t cur = first; cur <= last; cur++) { + dout(10) << " advance to epoch " << cur + << " (<= last " << last + << " <= newest_map " << superblock.newest_map + << ")" << dendl; + + OSDMapRef newmap = get_map(cur); + ceph_assert(newmap); // we just cached it above! + + // start blacklisting messages sent to peers that go down. + service.pre_publish_map(newmap); + + // kill connections to newly down osds + bool waited_for_reservations = false; + set<int> old; + osdmap = get_osdmap(); + osdmap->get_all_osds(old); + for (set<int>::iterator p = old.begin(); p != old.end(); ++p) { + if (*p != whoami && + osdmap->is_up(*p) && // in old map + newmap->is_down(*p)) { // but not the new one + if (!waited_for_reservations) { + service.await_reserved_maps(); + waited_for_reservations = true; + } + note_down_osd(*p); + } else if (*p != whoami && + osdmap->is_down(*p) && + newmap->is_up(*p)) { + note_up_osd(*p); + } + } + + if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) { + dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch() + << dendl; + if (is_booting()) { + // this captures the case where we sent the boot message while + // NOUP was being set on the mon and our boot request was + // dropped, and then later it is cleared. 
it imperfectly + // handles the case where our original boot message was not + // dropped and we restart even though we might have booted, but + // that is harmless (boot will just take slightly longer). + do_restart = true; + } + } + + osdmap = std::move(newmap); + set_osdmap(osdmap); + epoch_t up_epoch; + epoch_t boot_epoch; + service.retrieve_epochs(&boot_epoch, &up_epoch, NULL); + if (!up_epoch && + osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) { + up_epoch = osdmap->get_epoch(); + dout(10) << "up_epoch is " << up_epoch << dendl; + if (!boot_epoch) { + boot_epoch = osdmap->get_epoch(); + dout(10) << "boot_epoch is " << boot_epoch << dendl; + } + service.set_epochs(&boot_epoch, &up_epoch, NULL); + } + } + + had_map_since = ceph_clock_now(); + + epoch_t _bind_epoch = service.get_bind_epoch(); + if (osdmap->is_up(whoami) && + osdmap->get_addrs(whoami).legacy_equals( + client_messenger->get_myaddrs()) && + _bind_epoch < osdmap->get_up_from(whoami)) { + + if (is_booting()) { + dout(1) << "state: booting -> active" << dendl; + set_state(STATE_ACTIVE); + do_restart = false; + + // set incarnation so that osd_reqid_t's we generate for our + // objecter requests are unique across restarts. + service.objecter->set_client_incarnation(osdmap->get_epoch()); + cancel_pending_failures(); + } + } + + if (osdmap->get_epoch() > 0 && + is_active()) { + if (!osdmap->exists(whoami)) { + dout(0) << "map says i do not exist. shutting down." << dendl; + do_shutdown = true; // don't call shutdown() while we have + // everything paused + } else if (!osdmap->is_up(whoami) || + !osdmap->get_addrs(whoami).legacy_equals( + client_messenger->get_myaddrs()) || + !osdmap->get_cluster_addrs(whoami).legacy_equals( + cluster_messenger->get_myaddrs()) || + !osdmap->get_hb_back_addrs(whoami).legacy_equals( + hb_back_server_messenger->get_myaddrs()) || + !osdmap->get_hb_front_addrs(whoami).legacy_equals( + hb_front_server_messenger->get_myaddrs())) { + if (!osdmap->is_up(whoami)) { + if (service.is_preparing_to_stop() || service.is_stopping()) { + service.got_stop_ack(); + } else { + clog->warn() << "Monitor daemon marked osd." 
<< whoami << " down, " + "but it is still running"; + clog->debug() << "map e" << osdmap->get_epoch() + << " wrongly marked me down at e" + << osdmap->get_down_at(whoami); + } + } else if (!osdmap->get_addrs(whoami).legacy_equals( + client_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong client addr (" << osdmap->get_addrs(whoami) + << " != my " << client_messenger->get_myaddrs() << ")"; + } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals( + cluster_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong cluster addr (" + << osdmap->get_cluster_addrs(whoami) + << " != my " << cluster_messenger->get_myaddrs() << ")"; + } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals( + hb_back_server_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong heartbeat back addr (" + << osdmap->get_hb_back_addrs(whoami) + << " != my " << hb_back_server_messenger->get_myaddrs() + << ")"; + } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals( + hb_front_server_messenger->get_myaddrs())) { + clog->error() << "map e" << osdmap->get_epoch() + << " had wrong heartbeat front addr (" + << osdmap->get_hb_front_addrs(whoami) + << " != my " << hb_front_server_messenger->get_myaddrs() + << ")"; + } + + if (!service.is_stopping()) { + epoch_t up_epoch = 0; + epoch_t bind_epoch = osdmap->get_epoch(); + service.set_epochs(NULL,&up_epoch, &bind_epoch); + do_restart = true; + + //add markdown log + utime_t now = ceph_clock_now(); + utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0); + osd_markdown_log.push_back(now); + //clear all out-of-date log + while (!osd_markdown_log.empty() && + osd_markdown_log.front() + grace < now) + osd_markdown_log.pop_front(); + if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) { + dout(0) << __func__ << " marked down " + << osd_markdown_log.size() + << " > osd_max_markdown_count " + << cct->_conf->osd_max_markdown_count + << " in last " << grace << " seconds, shutting down" + << dendl; + do_restart = false; + do_shutdown = true; + } + + start_waiting_for_healthy(); + + set<int> avoid_ports; +#if defined(__FreeBSD__) + // prevent FreeBSD from grabbing the client_messenger port during + // rebinding. In which case a cluster_meesneger will connect also + // to the same port + client_messenger->get_myaddrs().get_ports(&avoid_ports); +#endif + cluster_messenger->get_myaddrs().get_ports(&avoid_ports); + hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports); + hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports); + + int r = cluster_messenger->rebind(avoid_ports); + if (r != 0) { + do_shutdown = true; // FIXME: do_restart? + network_error = true; + dout(0) << __func__ << " marked down:" + << " rebind cluster_messenger failed" << dendl; + } + + r = hb_back_server_messenger->rebind(avoid_ports); + if (r != 0) { + do_shutdown = true; // FIXME: do_restart? + network_error = true; + dout(0) << __func__ << " marked down:" + << " rebind hb_back_server_messenger failed" << dendl; + } + + r = hb_front_server_messenger->rebind(avoid_ports); + if (r != 0) { + do_shutdown = true; // FIXME: do_restart? 
+ network_error = true; + dout(0) << __func__ << " marked down:" + << " rebind hb_front_server_messenger failed" << dendl; + } + + hb_front_client_messenger->mark_down_all(); + hb_back_client_messenger->mark_down_all(); + + reset_heartbeat_peers(true); + } + } + } + + map_lock.put_write(); + + check_osdmap_features(); + + // yay! + consume_map(); + + if (is_active() || is_waiting_for_healthy()) + maybe_update_heartbeat_peers(); + + if (is_active()) { + activate_map(); + } + + if (do_shutdown) { + if (network_error) { + cancel_pending_failures(); + } + // trigger shutdown in a different thread + dout(0) << __func__ << " shutdown OSD via async signal" << dendl; + queue_async_signal(SIGINT); + } + else if (m->newest_map && m->newest_map > last) { + dout(10) << " msg say newest map is " << m->newest_map + << ", requesting more" << dendl; + osdmap_subscribe(osdmap->get_epoch()+1, false); + } + else if (is_preboot()) { + if (m->get_source().is_mon()) + _preboot(m->oldest_map, m->newest_map); + else + start_boot(); + } + else if (do_restart) + start_boot(); + +} + +void OSD::check_osdmap_features() +{ + // adjust required feature bits? + + // we have to be a bit careful here, because we are accessing the + // Policy structures without taking any lock. in particular, only + // modify integer values that can safely be read by a racing CPU. + // since we are only accessing existing Policy structures a their + // current memory location, and setting or clearing bits in integer + // fields, and we are the only writer, this is not a problem. + + const auto osdmap = get_osdmap(); + { + Messenger::Policy p = client_messenger->get_default_policy(); + uint64_t mask; + uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask); + if ((p.features_required & mask) != features) { + dout(0) << "crush map has features " << features + << ", adjusting msgr requires for clients" << dendl; + p.features_required = (p.features_required & ~mask) | features; + client_messenger->set_default_policy(p); + } + } + { + Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON); + uint64_t mask; + uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask); + if ((p.features_required & mask) != features) { + dout(0) << "crush map has features " << features + << " was " << p.features_required + << ", adjusting msgr requires for mons" << dendl; + p.features_required = (p.features_required & ~mask) | features; + client_messenger->set_policy(entity_name_t::TYPE_MON, p); + } + } + { + Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD); + uint64_t mask; + uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask); + + if ((p.features_required & mask) != features) { + dout(0) << "crush map has features " << features + << ", adjusting msgr requires for osds" << dendl; + p.features_required = (p.features_required & ~mask) | features; + cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p); + } + + if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) { + dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl; + superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + ObjectStore::Transaction t; + write_superblock(t); + int err = store->queue_transaction(service.meta_ch, std::move(t), NULL); + ceph_assert(err == 0); + } + } + + if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) { + heartbeat_dispatcher.ms_set_require_authorizer(false); + } + + if 
(osdmap->require_osd_release != last_require_osd_release) { + dout(1) << __func__ << " require_osd_release " << last_require_osd_release + << " -> " << to_string(osdmap->require_osd_release) << dendl; + store->write_meta("require_osd_release", + stringify((int)osdmap->require_osd_release)); + last_require_osd_release = osdmap->require_osd_release; + } +} + +struct C_FinishSplits : public Context { + OSD *osd; + set<PGRef> pgs; + C_FinishSplits(OSD *osd, const set<PGRef> &in) + : osd(osd), pgs(in) {} + void finish(int r) override { + osd->_finish_splits(pgs); + } +}; + +void OSD::_finish_splits(set<PGRef>& pgs) +{ + dout(10) << __func__ << " " << pgs << dendl; + if (is_stopping()) + return; + PG::RecoveryCtx rctx = create_context(); + for (set<PGRef>::iterator i = pgs.begin(); + i != pgs.end(); + ++i) { + PG *pg = i->get(); + + pg->lock(); + dout(10) << __func__ << " " << *pg << dendl; + epoch_t e = pg->get_osdmap_epoch(); + pg->handle_initialize(&rctx); + pg->queue_null(e, e); + dispatch_context_transaction(rctx, pg); + pg->unlock(); + + unsigned shard_index = pg->pg_id.hash_to_shard(num_shards); + shards[shard_index]->register_and_wake_split_child(pg); + } + + dispatch_context(rctx, 0, service.get_osdmap()); +}; + +bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src, + unsigned need) +{ + std::lock_guard l(merge_lock); + auto& p = merge_waiters[nextmap->get_epoch()][target]; + p[src->pg_id] = src; + dout(10) << __func__ << " added merge_waiter " << src->pg_id + << " for " << target << ", have " << p.size() << "/" << need + << dendl; + return p.size() == need; +} + +bool OSD::advance_pg( + epoch_t osd_epoch, + PG *pg, + ThreadPool::TPHandle &handle, + PG::RecoveryCtx *rctx) +{ + if (osd_epoch <= pg->get_osdmap_epoch()) { + return true; + } + ceph_assert(pg->is_locked()); + OSDMapRef lastmap = pg->get_osdmap(); + ceph_assert(lastmap->get_epoch() < osd_epoch); + set<PGRef> new_pgs; // any split children + bool ret = true; + + unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ? + lastmap->get_pg_num(pg->pg_id.pool()) : 0; + for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1; + next_epoch <= osd_epoch; + ++next_epoch) { + OSDMapRef nextmap = service.try_get_map(next_epoch); + if (!nextmap) { + dout(20) << __func__ << " missing map " << next_epoch << dendl; + continue; + } + + unsigned new_pg_num = + (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ? + nextmap->get_pg_num(pg->pg_id.pool()) : 0; + if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) { + // check for merge + if (nextmap->have_pg_pool(pg->pg_id.pool())) { + spg_t parent; + if (pg->pg_id.is_merge_source( + old_pg_num, + new_pg_num, + &parent)) { + // we are merge source + PGRef spg = pg; // carry a ref + dout(1) << __func__ << " " << pg->pg_id + << " is merge source, target is " << parent + << dendl; + pg->write_if_dirty(rctx); + if (!new_pgs.empty()) { + rctx->transaction->register_on_applied(new C_FinishSplits(this, + new_pgs)); + new_pgs.clear(); + } + dispatch_context_transaction(*rctx, pg, &handle); + pg->ch->flush(); + // release backoffs explicitly, since the on_shutdown path + // aggressively tears down backoff state. + if (pg->is_primary()) { + pg->release_pg_backoffs(); + } + pg->on_shutdown(); + OSDShard *sdata = pg->osd_shard; + { + std::lock_guard l(sdata->shard_lock); + if (pg->pg_slot) { + sdata->_detach_pg(pg->pg_slot); + // update pg count now since we might not get an osdmap + // any time soon. 
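+	  // (note: consume_map() later recomputes the l_osd_pg_* gauges from a
+	  // full scan of the pg list, so this decrement only keeps the counters
+	  // roughly current until then.)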
+ if (pg->is_primary()) + logger->dec(l_osd_pg_primary); + else if (pg->is_replica()) + logger->dec(l_osd_pg_replica); + else + logger->dec(l_osd_pg_stray); + } + } + pg->unlock(); + + set<spg_t> children; + parent.is_split(new_pg_num, old_pg_num, &children); + if (add_merge_waiter(nextmap, parent, pg, children.size())) { + enqueue_peering_evt( + parent, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + nextmap->get_epoch(), + nextmap->get_epoch(), + NullEvt()))); + } + ret = false; + goto out; + } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) { + // we are merge target + set<spg_t> children; + pg->pg_id.is_split(new_pg_num, old_pg_num, &children); + dout(20) << __func__ << " " << pg->pg_id + << " is merge target, sources are " << children + << dendl; + map<spg_t,PGRef> sources; + { + std::lock_guard l(merge_lock); + auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id]; + unsigned need = children.size(); + dout(20) << __func__ << " have " << s.size() << "/" + << need << dendl; + if (s.size() == need) { + sources.swap(s); + merge_waiters[nextmap->get_epoch()].erase(pg->pg_id); + if (merge_waiters[nextmap->get_epoch()].empty()) { + merge_waiters.erase(nextmap->get_epoch()); + } + } + } + if (!sources.empty()) { + unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool()); + unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num); + dout(1) << __func__ << " merging " << pg->pg_id << dendl; + pg->merge_from( + sources, rctx, split_bits, + nextmap->get_pg_pool( + pg->pg_id.pool())->last_pg_merge_meta); + pg->pg_slot->waiting_for_merge_epoch = 0; + } else { + dout(20) << __func__ << " not ready to merge yet" << dendl; + pg->write_if_dirty(rctx); + if (!new_pgs.empty()) { + rctx->transaction->register_on_applied(new C_FinishSplits(this, + new_pgs)); + new_pgs.clear(); + } + dispatch_context_transaction(*rctx, pg, &handle); + pg->unlock(); + // kick source(s) to get them ready + for (auto& i : children) { + dout(20) << __func__ << " kicking source " << i << dendl; + enqueue_peering_evt( + i, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + nextmap->get_epoch(), + nextmap->get_epoch(), + NullEvt()))); + } + ret = false; + goto out; + } + } + } + } + + vector<int> newup, newacting; + int up_primary, acting_primary; + nextmap->pg_to_up_acting_osds( + pg->pg_id.pgid, + &newup, &up_primary, + &newacting, &acting_primary); + pg->handle_advance_map( + nextmap, lastmap, newup, up_primary, + newacting, acting_primary, rctx); + + auto oldpool = lastmap->get_pools().find(pg->pg_id.pool()); + auto newpool = nextmap->get_pools().find(pg->pg_id.pool()); + if (oldpool != lastmap->get_pools().end() + && newpool != nextmap->get_pools().end()) { + dout(20) << __func__ + << " new pool opts " << newpool->second.opts + << " old pool opts " << oldpool->second.opts + << dendl; + + double old_min_interval = 0, new_min_interval = 0; + oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval); + newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval); + + double old_max_interval = 0, new_max_interval = 0; + oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval); + newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval); + + // Assume if an interval is change from set to unset or vice versa the actual config + // is different. Keep it simple even if it is possible to call resched_all_scrub() + // unnecessarily. 
+ if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) { + pg->on_info_history_change(); + } + } + + if (new_pg_num && old_pg_num != new_pg_num) { + // check for split + set<spg_t> children; + if (pg->pg_id.is_split( + old_pg_num, + new_pg_num, + &children)) { + split_pgs( + pg, children, &new_pgs, lastmap, nextmap, + rctx); + } + } + + lastmap = nextmap; + old_pg_num = new_pg_num; + handle.reset_tp_timeout(); + } + pg->handle_activate_map(rctx); + + ret = true; + out: + if (!new_pgs.empty()) { + rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs)); + } + return ret; +} + +void OSD::consume_map() +{ + ceph_assert(osd_lock.is_locked()); + auto osdmap = get_osdmap(); + dout(7) << "consume_map version " << osdmap->get_epoch() << dendl; + + /** make sure the cluster is speaking in SORTBITWISE, because we don't + * speak the older sorting version any more. Be careful not to force + * a shutdown if we are merely processing old maps, though. + */ + if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) { + derr << __func__ << " SORTBITWISE flag is not set" << dendl; + ceph_abort(); + } + + service.pre_publish_map(osdmap); + service.await_reserved_maps(); + service.publish_map(osdmap); + + // prime splits and merges + set<pair<spg_t,epoch_t>> newly_split; // splits, and when + set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when + for (auto& shard : shards) { + shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs); + } + if (!newly_split.empty()) { + for (auto& shard : shards) { + shard->prime_splits(osdmap, &newly_split); + } + ceph_assert(newly_split.empty()); + } + + // prune sent_ready_to_merge + service.prune_sent_ready_to_merge(osdmap); + + // FIXME, maybe: We could race against an incoming peering message + // that instantiates a merge PG after identify_merges() below and + // never set up its peer to complete the merge. An OSD restart + // would clear it up. This is a hard race to resolve, + // extraordinarily rare (we only merge PGs that are stable and + // clean, so it'd have to be an imported PG to an OSD with a + // slightly stale OSDMap...), so I'm ignoring it for now. We plan to + // replace all of this with a seastar-based code soon anyway. + if (!merge_pgs.empty()) { + // mark the pgs we already have, or create new and empty merge + // participants for those we are missing. do this all under the + // shard lock so we don't have to worry about racing pg creates + // via _process. + for (auto& shard : shards) { + shard->prime_merges(osdmap, &merge_pgs); + } + ceph_assert(merge_pgs.empty()); + } + + service.prune_pg_created(); + + unsigned pushes_to_free = 0; + for (auto& shard : shards) { + shard->consume_map(osdmap, &pushes_to_free); + } + + vector<spg_t> pgids; + _get_pgids(&pgids); + + // count (FIXME, probably during seastar rewrite) + int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0; + vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + // FIXME (probably during seastar rewrite): this is lockless and + // racy, but we don't want to take pg lock here. 
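+    // A stale role read here only skews the l_osd_pg_{primary,replica,stray}
+    // gauges published below; it has no correctness impact.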
+ if (pg->is_primary()) + num_pg_primary++; + else if (pg->is_replica()) + num_pg_replica++; + else + num_pg_stray++; + } + + { + // FIXME (as part of seastar rewrite): move to OSDShard + std::lock_guard l(pending_creates_lock); + for (auto pg = pending_creates_from_osd.begin(); + pg != pending_creates_from_osd.end();) { + if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) { + dout(10) << __func__ << " pg " << pg->first << " doesn't map here, " + << "discarding pending_create_from_osd" << dendl; + pg = pending_creates_from_osd.erase(pg); + } else { + ++pg; + } + } + } + + service.maybe_inject_dispatch_delay(); + + dispatch_sessions_waiting_on_map(); + + service.maybe_inject_dispatch_delay(); + + service.release_reserved_pushes(pushes_to_free); + + // queue null events to push maps down to individual PGs + for (auto pgid : pgids) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + osdmap->get_epoch(), + osdmap->get_epoch(), + NullEvt()))); + } + logger->set(l_osd_pg, pgids.size()); + logger->set(l_osd_pg_primary, num_pg_primary); + logger->set(l_osd_pg_replica, num_pg_replica); + logger->set(l_osd_pg_stray, num_pg_stray); +} + +void OSD::activate_map() +{ + ceph_assert(osd_lock.is_locked()); + auto osdmap = get_osdmap(); + + dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; + + if (osdmap->test_flag(CEPH_OSDMAP_FULL)) { + dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl; + osdmap_subscribe(osdmap->get_epoch() + 1, false); + } + + // norecover? + if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) { + if (!service.recovery_is_paused()) { + dout(1) << "pausing recovery (NORECOVER flag set)" << dendl; + service.pause_recovery(); + } + } else { + if (service.recovery_is_paused()) { + dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl; + service.unpause_recovery(); + } + } + + service.activate_map(); + + // process waiters + take_waiters(waiting_for_osdmap); +} + +bool OSD::require_mon_peer(const Message *m) +{ + if (!m->get_connection()->peer_is_mon()) { + dout(0) << "require_mon_peer received from non-mon " + << m->get_connection()->get_peer_addr() + << " " << *m << dendl; + return false; + } + return true; +} + +bool OSD::require_mon_or_mgr_peer(const Message *m) +{ + if (!m->get_connection()->peer_is_mon() && + !m->get_connection()->peer_is_mgr()) { + dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr " + << m->get_connection()->get_peer_addr() + << " " << *m << dendl; + return false; + } + return true; +} + +bool OSD::require_osd_peer(const Message *m) +{ + if (!m->get_connection()->peer_is_osd()) { + dout(0) << "require_osd_peer received from non-osd " + << m->get_connection()->get_peer_addr() + << " " << *m << dendl; + return false; + } + return true; +} + +bool OSD::require_self_aliveness(const Message *m, epoch_t epoch) +{ + epoch_t up_epoch = service.get_up_epoch(); + if (epoch < up_epoch) { + dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl; + return false; + } + + if (!is_active()) { + dout(7) << "still in boot state, dropping message " << *m << dendl; + return false; + } + + return true; +} + +bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map, + bool is_fast_dispatch) +{ + int from = m->get_source().num(); + + if (map->is_down(from) || + (map->get_cluster_addrs(from) != m->get_source_addrs())) { + dout(5) << "from dead osd." 
<< from << ", marking down, " + << " msg was " << m->get_source_inst().addr + << " expected " + << (map->is_up(from) ? + map->get_cluster_addrs(from) : entity_addrvec_t()) + << dendl; + ConnectionRef con = m->get_connection(); + con->mark_down(); + auto priv = con->get_priv(); + if (auto s = static_cast<Session*>(priv.get()); s) { + if (!is_fast_dispatch) + s->session_dispatch_lock.Lock(); + clear_session_waiting_on_map(s); + con->set_priv(nullptr); // break ref <-> session cycle, if any + s->con.reset(); + if (!is_fast_dispatch) + s->session_dispatch_lock.Unlock(); + } + return false; + } + return true; +} + + +/* + * require that we have same (or newer) map, and that + * the source is the pg primary. + */ +bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch, + bool is_fast_dispatch) +{ + const Message *m = op->get_req(); + const auto osdmap = get_osdmap(); + dout(15) << "require_same_or_newer_map " << epoch + << " (i am " << osdmap->get_epoch() << ") " << m << dendl; + + ceph_assert(osd_lock.is_locked()); + + // do they have a newer map? + if (epoch > osdmap->get_epoch()) { + dout(7) << "waiting for newer map epoch " << epoch + << " > my " << osdmap->get_epoch() << " with " << m << dendl; + wait_for_new_map(op); + return false; + } + + if (!require_self_aliveness(op->get_req(), epoch)) { + return false; + } + + // ok, our map is same or newer.. do they still exist? + if (m->get_connection()->get_messenger() == cluster_messenger && + !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) { + return false; + } + + return true; +} + + + + + +// ---------------------------------------- +// pg creation + +void OSD::split_pgs( + PG *parent, + const set<spg_t> &childpgids, set<PGRef> *out_pgs, + OSDMapRef curmap, + OSDMapRef nextmap, + PG::RecoveryCtx *rctx) +{ + unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool()); + parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num)); + + vector<object_stat_sum_t> updated_stats; + parent->start_split_stats(childpgids, &updated_stats); + + vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin(); + for (set<spg_t>::const_iterator i = childpgids.begin(); + i != childpgids.end(); + ++i, ++stat_iter) { + ceph_assert(stat_iter != updated_stats.end()); + dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl; + PG* child = _make_pg(nextmap, *i); + child->lock(true); + out_pgs->insert(child); + child->ch = store->create_new_collection(child->coll); + + { + uint32_t shard_index = i->hash_to_shard(shards.size()); + assert(NULL != shards[shard_index]); + store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue)); + } + + unsigned split_bits = i->get_split_bits(pg_num); + dout(10) << " pg_num is " << pg_num + << ", m_seed " << i->ps() + << ", split_bits is " << split_bits << dendl; + parent->split_colls( + *i, + split_bits, + i->ps(), + &child->get_pool().info, + rctx->transaction); + parent->split_into( + i->pgid, + child, + split_bits); + + child->init_collection_pool_opts(); + + child->finish_split_stats(*stat_iter, rctx->transaction); + child->unlock(); + } + ceph_assert(stat_iter != updated_stats.end()); + parent->finish_split_stats(*stat_iter, rctx->transaction); +} + +/* + * holding osd_lock + */ +void OSD::handle_pg_create(OpRequestRef op) +{ + const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_CREATE); + + dout(10) << "handle_pg_create " << *m << dendl; + + if 
(!require_mon_peer(op->get_req())) { + return; + } + + if (!require_same_or_newer_map(op, m->epoch, false)) + return; + + op->mark_started(); + + const auto osdmap = get_osdmap(); + map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin(); + for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin(); + p != m->mkpg.end(); + ++p, ++ci) { + ceph_assert(ci != m->ctimes.end() && ci->first == p->first); + epoch_t created = p->second.created; + if (p->second.split_bits) // Skip split pgs + continue; + pg_t on = p->first; + + if (!osdmap->have_pg_pool(on.pool())) { + dout(20) << "ignoring pg on deleted pool " << on << dendl; + continue; + } + + dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl; + + // is it still ours? + vector<int> up, acting; + int up_primary = -1; + int acting_primary = -1; + osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary); + int role = osdmap->calc_pg_role(whoami, acting, acting.size()); + + if (acting_primary != whoami) { + dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary + << "), my role=" << role << ", skipping" << dendl; + continue; + } + + spg_t pgid; + bool mapped = osdmap->get_primary_shard(on, &pgid); + ceph_assert(mapped); + + PastIntervals pi; + pg_history_t history; + build_initial_pg_history(pgid, created, ci->second, &history, &pi); + + // The mon won't resend unless the primary changed, so we ignore + // same_interval_since. We'll pass this history with the current + // epoch as the event. + if (history.same_primary_since > m->epoch) { + dout(10) << __func__ << ": got obsolete pg create on pgid " + << pgid << " from epoch " << m->epoch + << ", primary changed in " << history.same_primary_since + << dendl; + continue; + } + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + osdmap->get_epoch(), + osdmap->get_epoch(), + NullEvt(), + true, + new PGCreateInfo( + pgid, + osdmap->get_epoch(), + history, + pi, + true) + ))); + } + + { + std::lock_guard l(pending_creates_lock); + if (pending_creates_from_mon == 0) { + last_pg_create_epoch = m->epoch; + } + } + + maybe_update_heartbeat_peers(); +} + + +// ---------------------------------------- +// peering and recovery + +PG::RecoveryCtx OSD::create_context() +{ + ObjectStore::Transaction *t = new ObjectStore::Transaction; + map<int, map<spg_t,pg_query_t> > *query_map = + new map<int, map<spg_t, pg_query_t> >; + map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list = + new map<int, vector<pair<pg_notify_t, PastIntervals> > >; + map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map = + new map<int,vector<pair<pg_notify_t, PastIntervals> > >; + PG::RecoveryCtx rctx(query_map, info_map, notify_list, t); + return rctx; +} + +void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg, + ThreadPool::TPHandle *handle) +{ + if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) { + int tr = store->queue_transaction( + pg->ch, + std::move(*ctx.transaction), TrackedOpRef(), handle); + ceph_assert(tr == 0); + delete (ctx.transaction); + ctx.transaction = new ObjectStore::Transaction; + } +} + +void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap, + ThreadPool::TPHandle *handle) +{ + if (!service.get_osdmap()->is_up(whoami)) { + dout(20) << __func__ << " not up in osdmap" << dendl; + } else if (!is_active()) { + dout(20) << __func__ << " not active" << dendl; + } else { + do_notifies(*ctx.notify_list, curmap); + do_queries(*ctx.query_map, curmap); + 
do_infos(*ctx.info_map, curmap); + } + if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) { + int tr = store->queue_transaction( + pg->ch, + std::move(*ctx.transaction), TrackedOpRef(), + handle); + ceph_assert(tr == 0); + } + delete ctx.notify_list; + delete ctx.query_map; + delete ctx.info_map; + delete ctx.transaction; +} + +void OSD::discard_context(PG::RecoveryCtx& ctx) +{ + delete ctx.notify_list; + delete ctx.query_map; + delete ctx.info_map; + delete ctx.transaction; +} + + +/** do_notifies + * Send an MOSDPGNotify to a primary, with a list of PGs that I have + * content for, and they are primary for. + */ + +void OSD::do_notifies( + map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list, + OSDMapRef curmap) +{ + for (map<int, + vector<pair<pg_notify_t,PastIntervals> > >::iterator it = + notify_list.begin(); + it != notify_list.end(); + ++it) { + if (!curmap->is_up(it->first)) { + dout(20) << __func__ << " skipping down osd." << it->first << dendl; + continue; + } + ConnectionRef con = service.get_con_osd_cluster( + it->first, curmap->get_epoch()); + if (!con) { + dout(20) << __func__ << " skipping osd." << it->first + << " (NULL con)" << dendl; + continue; + } + service.share_map_peer(it->first, con.get(), curmap); + dout(7) << __func__ << " osd." << it->first + << " on " << it->second.size() << " PGs" << dendl; + MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(), + it->second); + con->send_message(m); + } +} + + +/** do_queries + * send out pending queries for info | summaries + */ +void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map, + OSDMapRef curmap) +{ + for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin(); + pit != query_map.end(); + ++pit) { + if (!curmap->is_up(pit->first)) { + dout(20) << __func__ << " skipping down osd." << pit->first << dendl; + continue; + } + int who = pit->first; + ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch()); + if (!con) { + dout(20) << __func__ << " skipping osd." << who + << " (NULL con)" << dendl; + continue; + } + service.share_map_peer(who, con.get(), curmap); + dout(7) << __func__ << " querying osd." << who + << " on " << pit->second.size() << " PGs" << dendl; + MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second); + con->send_message(m); + } +} + + +void OSD::do_infos(map<int, + vector<pair<pg_notify_t, PastIntervals> > >& info_map, + OSDMapRef curmap) +{ + for (map<int, + vector<pair<pg_notify_t, PastIntervals> > >::iterator p = + info_map.begin(); + p != info_map.end(); + ++p) { + if (!curmap->is_up(p->first)) { + dout(20) << __func__ << " skipping down osd." << p->first << dendl; + continue; + } + for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin(); + i != p->second.end(); + ++i) { + dout(20) << __func__ << " sending info " << i->first.info + << " to shard " << p->first << dendl; + } + ConnectionRef con = service.get_con_osd_cluster( + p->first, curmap->get_epoch()); + if (!con) { + dout(20) << __func__ << " skipping osd." 
<< p->first + << " (NULL con)" << dendl; + continue; + } + service.share_map_peer(p->first, con.get(), curmap); + MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch()); + m->pg_list = p->second; + con->send_message(m); + } + info_map.clear(); +} + +void OSD::handle_fast_pg_create(MOSDPGCreate2 *m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_mon_peer(m)) { + m->put(); + return; + } + for (auto& p : m->pgs) { + spg_t pgid = p.first; + epoch_t created = p.second.first; + utime_t created_stamp = p.second.second; + dout(20) << __func__ << " " << pgid << " e" << created + << "@" << created_stamp << dendl; + pg_history_t h; + h.epoch_created = created; + h.epoch_pool_created = created; + h.same_up_since = created; + h.same_interval_since = created; + h.same_primary_since = created; + h.last_scrub_stamp = created_stamp; + h.last_deep_scrub_stamp = created_stamp; + h.last_clean_scrub_stamp = created_stamp; + + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + m->epoch, + m->epoch, + NullEvt(), + true, + new PGCreateInfo( + pgid, + created, + h, + PastIntervals(), + true) + ))); + } + + { + std::lock_guard l(pending_creates_lock); + if (pending_creates_from_mon == 0) { + last_pg_create_epoch = m->epoch; + } + } + + m->put(); +} + +void OSD::handle_fast_pg_query(MOSDPGQuery *m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + int from = m->get_source().num(); + for (auto& p : m->pg_list) { + enqueue_peering_evt( + p.first, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + p.second.epoch_sent, p.second.epoch_sent, + MQuery( + p.first, + pg_shard_t(from, p.second.from), + p.second, + p.second.epoch_sent), + false)) + ); + } + m->put(); +} + +void OSD::handle_fast_pg_notify(MOSDPGNotify* m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + int from = m->get_source().num(); + for (auto& p : m->get_pg_list()) { + spg_t pgid(p.first.info.pgid.pgid, p.first.to); + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + p.first.epoch_sent, + p.first.query_epoch, + MNotifyRec( + pgid, pg_shard_t(from, p.first.from), + p.first, + m->get_connection()->get_features(), + p.second), + true, + new PGCreateInfo( + pgid, + p.first.query_epoch, + p.first.info.history, + p.second, + false) + ))); + } + m->put(); +} + +void OSD::handle_fast_pg_info(MOSDPGInfo* m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + int from = m->get_source().num(); + for (auto& p : m->pg_list) { + enqueue_peering_evt( + spg_t(p.first.info.pgid.pgid, p.first.to), + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + p.first.epoch_sent, p.first.query_epoch, + MInfoRec( + pg_shard_t(from, p.first.from), + p.first.info, + p.first.epoch_sent))) + ); + } + m->put(); +} + +void OSD::handle_fast_pg_remove(MOSDPGRemove *m) +{ + dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; + if (!require_osd_peer(m)) { + m->put(); + return; + } + for (auto& pgid : m->pg_list) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + m->get_epoch(), m->get_epoch(), + PG::DeleteStart()))); + } + m->put(); +} + +void OSD::handle_fast_force_recovery(MOSDForceRecovery *m) +{ + dout(10) << __func__ << " " << *m << dendl; + 
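+  // Translate the mon/mgr force-recovery request into per-pg peering events:
+  // the OFR_* option bits below select Set/Unset ForceBackfill or
+  // Set/Unset ForceRecovery for each pg listed in the message.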
if (!require_mon_or_mgr_peer(m)) { + m->put(); + return; + } + epoch_t epoch = get_osdmap_epoch(); + for (auto pgid : m->forced_pgs) { + if (m->options & OFR_BACKFILL) { + if (m->options & OFR_CANCEL) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + epoch, epoch, + PG::UnsetForceBackfill()))); + } else { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + epoch, epoch, + PG::SetForceBackfill()))); + } + } else if (m->options & OFR_RECOVERY) { + if (m->options & OFR_CANCEL) { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + epoch, epoch, + PG::UnsetForceRecovery()))); + } else { + enqueue_peering_evt( + pgid, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + epoch, epoch, + PG::SetForceRecovery()))); + } + } + } + m->put(); +} + +void OSD::handle_pg_query_nopg(const MQuery& q) +{ + spg_t pgid = q.pgid; + dout(10) << __func__ << " " << pgid << dendl; + + OSDMapRef osdmap = get_osdmap(); + if (!osdmap->have_pg_pool(pgid.pool())) + return; + + dout(10) << " pg " << pgid << " dne" << dendl; + pg_info_t empty(spg_t(pgid.pgid, q.query.to)); + ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch()); + if (con) { + Message *m; + if (q.query.type == pg_query_t::LOG || + q.query.type == pg_query_t::FULLLOG) { + m = new MOSDPGLog( + q.query.from, q.query.to, + osdmap->get_epoch(), empty, + q.query.epoch_sent); + } else { + vector<pair<pg_notify_t,PastIntervals>> ls; + ls.push_back( + make_pair( + pg_notify_t( + q.query.from, q.query.to, + q.query.epoch_sent, + osdmap->get_epoch(), + empty), + PastIntervals())); + m = new MOSDPGNotify(osdmap->get_epoch(), ls); + } + service.share_map_peer(q.from.osd, con.get(), osdmap); + con->send_message(m); + } +} + + +// ========================================================= +// RECOVERY + +void OSDService::_maybe_queue_recovery() { + ceph_assert(recovery_lock.is_locked_by_me()); + uint64_t available_pushes; + while (!awaiting_throttle.empty() && + _recover_now(&available_pushes)) { + uint64_t to_start = std::min( + available_pushes, + cct->_conf->osd_recovery_max_single_start); + _queue_for_recovery(awaiting_throttle.front(), to_start); + awaiting_throttle.pop_front(); + dout(10) << __func__ << " starting " << to_start + << ", recovery_ops_reserved " << recovery_ops_reserved + << " -> " << (recovery_ops_reserved + to_start) << dendl; + recovery_ops_reserved += to_start; + } +} + +bool OSDService::_recover_now(uint64_t *available_pushes) +{ + if (available_pushes) + *available_pushes = 0; + + if (ceph_clock_now() < defer_recovery_until) { + dout(15) << __func__ << " defer until " << defer_recovery_until << dendl; + return false; + } + + if (recovery_paused) { + dout(15) << __func__ << " paused" << dendl; + return false; + } + + uint64_t max = cct->_conf->osd_recovery_max_active; + if (max <= recovery_ops_active + recovery_ops_reserved) { + dout(15) << __func__ << " active " << recovery_ops_active + << " + reserved " << recovery_ops_reserved + << " >= max " << max << dendl; + return false; + } + + if (available_pushes) + *available_pushes = max - recovery_ops_active - recovery_ops_reserved; + + return true; +} + +void OSD::do_recovery( + PG *pg, epoch_t queued, uint64_t reserved_pushes, + ThreadPool::TPHandle &handle) +{ + uint64_t started = 0; + + /* + * When the value of osd_recovery_sleep is set greater than zero, recovery + * ops are scheduled after osd_recovery_sleep amount of time from the previous + * 
recovery event's schedule time. This is done by adding a + * recovery_requeue_callback event, which re-queues the recovery op using + * queue_recovery_after_sleep. + */ + float recovery_sleep = get_osd_recovery_sleep(); + { + std::lock_guard l(service.sleep_lock); + if (recovery_sleep > 0 && service.recovery_needs_sleep) { + PGRef pgref(pg); + auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) { + dout(20) << "do_recovery wake up at " + << ceph_clock_now() + << ", re-queuing recovery" << dendl; + std::lock_guard l(service.sleep_lock); + service.recovery_needs_sleep = false; + service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes); + }); + + // This is true for the first recovery op and when the previous recovery op + // has been scheduled in the past. The next recovery op is scheduled after + // completing the sleep from now. + if (service.recovery_schedule_time < ceph_clock_now()) { + service.recovery_schedule_time = ceph_clock_now(); + } + service.recovery_schedule_time += recovery_sleep; + service.sleep_timer.add_event_at(service.recovery_schedule_time, + recovery_requeue_callback); + dout(20) << "Recovery event scheduled at " + << service.recovery_schedule_time << dendl; + return; + } + } + + { + { + std::lock_guard l(service.sleep_lock); + service.recovery_needs_sleep = true; + } + + if (pg->pg_has_reset_since(queued)) { + goto out; + } + + dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl; +#ifdef DEBUG_RECOVERY_OIDS + dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl; +#endif + + bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started); + dout(10) << "do_recovery started " << started << "/" << reserved_pushes + << " on " << *pg << dendl; + + if (do_unfound) { + PG::RecoveryCtx rctx = create_context(); + rctx.handle = &handle; + pg->find_unfound(queued, &rctx); + dispatch_context(rctx, pg, pg->get_osdmap()); + } + } + + out: + ceph_assert(started <= reserved_pushes); + service.release_reserved_pushes(reserved_pushes); +} + +void OSDService::start_recovery_op(PG *pg, const hobject_t& soid) +{ + std::lock_guard l(recovery_lock); + dout(10) << "start_recovery_op " << *pg << " " << soid + << " (" << recovery_ops_active << "/" + << cct->_conf->osd_recovery_max_active << " rops)" + << dendl; + recovery_ops_active++; + +#ifdef DEBUG_RECOVERY_OIDS + dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl; + ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0); + recovery_oids[pg->pg_id].insert(soid); +#endif +} + +void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue) +{ + std::lock_guard l(recovery_lock); + dout(10) << "finish_recovery_op " << *pg << " " << soid + << " dequeue=" << dequeue + << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)" + << dendl; + + // adjust count + ceph_assert(recovery_ops_active > 0); + recovery_ops_active--; + +#ifdef DEBUG_RECOVERY_OIDS + dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl; + ceph_assert(recovery_oids[pg->pg_id].count(soid)); + recovery_oids[pg->pg_id].erase(soid); +#endif + + _maybe_queue_recovery(); +} + +bool OSDService::is_recovery_active() +{ + if (cct->_conf->osd_debug_pretend_recovery_active) { + return true; + } + return local_reserver.has_reservation() || remote_reserver.has_reservation(); +} + +void OSDService::release_reserved_pushes(uint64_t pushes) +{ + std::lock_guard l(recovery_lock); + dout(10) << __func__ 
<< "(" << pushes << "), recovery_ops_reserved " + << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes) + << dendl; + ceph_assert(recovery_ops_reserved >= pushes); + recovery_ops_reserved -= pushes; + _maybe_queue_recovery(); +} + +// ========================================================= +// OPS + +bool OSD::op_is_discardable(const MOSDOp *op) +{ + // drop client request if they are not connected and can't get the + // reply anyway. + if (!op->get_connection()->is_connected()) { + return true; + } + return false; +} + +void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch) +{ + const utime_t stamp = op->get_req()->get_recv_stamp(); + const utime_t latency = ceph_clock_now() - stamp; + const unsigned priority = op->get_req()->get_priority(); + const int cost = op->get_req()->get_cost(); + const uint64_t owner = op->get_req()->get_source().num(); + + dout(15) << "enqueue_op " << op << " prio " << priority + << " cost " << cost + << " latency " << latency + << " epoch " << epoch + << " " << *(op->get_req()) << dendl; + op->osd_trace.event("enqueue op"); + op->osd_trace.keyval("priority", priority); + op->osd_trace.keyval("cost", cost); + op->mark_queued_for_pg(); + logger->tinc(l_osd_op_before_queue_op_lat, latency); + op_shardedwq.queue( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))), + cost, priority, stamp, owner, epoch)); +} + +void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt) +{ + dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl; + op_shardedwq.queue( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)), + 10, + cct->_conf->osd_peering_op_priority, + utime_t(), + 0, + evt->get_epoch_sent())); +} + +void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt) +{ + dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl; + op_shardedwq.queue_front( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)), + 10, + cct->_conf->osd_peering_op_priority, + utime_t(), + 0, + evt->get_epoch_sent())); +} + +/* + * NOTE: dequeue called in worker thread, with pg lock + */ +void OSD::dequeue_op( + PGRef pg, OpRequestRef op, + ThreadPool::TPHandle &handle) +{ + FUNCTRACE(cct); + OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false); + + utime_t now = ceph_clock_now(); + op->set_dequeued_time(now); + utime_t latency = now - op->get_req()->get_recv_stamp(); + dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority() + << " cost " << op->get_req()->get_cost() + << " latency " << latency + << " " << *(op->get_req()) + << " pg " << *pg << dendl; + + logger->tinc(l_osd_op_before_dequeue_op_lat, latency); + + auto priv = op->get_req()->get_connection()->get_priv(); + if (auto session = static_cast<Session *>(priv.get()); session) { + maybe_share_map(session, op, pg->get_osdmap()); + } + + if (pg->is_deleting()) + return; + + op->mark_reached_pg(); + op->osd_trace.event("dequeue_op"); + + pg->do_request(op, handle); + + // finish + dout(10) << "dequeue_op " << op << " finish" << dendl; + OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false); +} + + +void OSD::dequeue_peering_evt( + OSDShard *sdata, + PG *pg, + PGPeeringEventRef evt, + ThreadPool::TPHandle& handle) +{ + PG::RecoveryCtx rctx = create_context(); + auto curmap = sdata->get_osdmap(); + epoch_t need_up_thru = 0, same_interval_since = 0; + if (!pg) { + if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) { + 
handle_pg_query_nopg(*q); + } else { + derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl; + ceph_abort(); + } + } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) { + pg->do_peering_event(evt, &rctx); + if (pg->is_deleted()) { + // do not dispatch rctx; the final _delete_some already did it. + discard_context(rctx); + pg->unlock(); + return; + } + dispatch_context_transaction(rctx, pg, &handle); + need_up_thru = pg->get_need_up_thru(); + same_interval_since = pg->get_same_interval_since(); + pg->unlock(); + } + + if (need_up_thru) { + queue_want_up_thru(same_interval_since); + } + dispatch_context(rctx, pg, curmap, &handle); + + service.send_pg_temp(); +} + +void OSD::dequeue_delete( + OSDShard *sdata, + PG *pg, + epoch_t e, + ThreadPool::TPHandle& handle) +{ + dequeue_peering_evt( + sdata, + pg, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + e, e, + PG::DeleteSome())), + handle); +} + + + +// -------------------------------- + +const char** OSD::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_max_backfills", + "osd_min_recovery_priority", + "osd_max_trimming_pgs", + "osd_op_complaint_time", + "osd_op_log_threshold", + "osd_op_history_size", + "osd_op_history_duration", + "osd_op_history_slow_op_size", + "osd_op_history_slow_op_threshold", + "osd_enable_op_tracker", + "osd_map_cache_size", + "osd_pg_epoch_max_lag_factor", + "osd_pg_epoch_persisted_max_stale", + // clog & admin clog + "clog_to_monitors", + "clog_to_syslog", + "clog_to_syslog_facility", + "clog_to_syslog_level", + "osd_objectstore_fuse", + "clog_to_graylog", + "clog_to_graylog_host", + "clog_to_graylog_port", + "host", + "fsid", + "osd_recovery_delay_start", + "osd_client_message_size_cap", + "osd_client_message_cap", + "osd_heartbeat_min_size", + "osd_heartbeat_interval", + "osd_scrub_min_interval", + "osd_scrub_max_interval", + NULL + }; + return KEYS; +} + +void OSD::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + Mutex::Locker l(osd_lock); + if (changed.count("osd_max_backfills")) { + service.local_reserver.set_max(cct->_conf->osd_max_backfills); + service.remote_reserver.set_max(cct->_conf->osd_max_backfills); + } + if (changed.count("osd_min_recovery_priority")) { + service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority); + service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority); + } + if (changed.count("osd_max_trimming_pgs")) { + service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs); + } + if (changed.count("osd_op_complaint_time") || + changed.count("osd_op_log_threshold")) { + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + } + if (changed.count("osd_op_history_size") || + changed.count("osd_op_history_duration")) { + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); + } + if (changed.count("osd_op_history_slow_op_size") || + changed.count("osd_op_history_slow_op_threshold")) { + op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size, + cct->_conf->osd_op_history_slow_op_threshold); + } + if (changed.count("osd_enable_op_tracker")) { + op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker); + } + if (changed.count("osd_map_cache_size")) { + service.map_cache.set_size(cct->_conf->osd_map_cache_size); + service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size); + 
service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size); + } + if (changed.count("clog_to_monitors") || + changed.count("clog_to_syslog") || + changed.count("clog_to_syslog_level") || + changed.count("clog_to_syslog_facility") || + changed.count("clog_to_graylog") || + changed.count("clog_to_graylog_host") || + changed.count("clog_to_graylog_port") || + changed.count("host") || + changed.count("fsid")) { + update_log_config(); + } + if (changed.count("osd_pg_epoch_max_lag_factor")) { + m_osd_pg_epoch_max_lag_factor = conf.get_val<double>( + "osd_pg_epoch_max_lag_factor"); + } + +#ifdef HAVE_LIBFUSE + if (changed.count("osd_objectstore_fuse")) { + if (store) { + enable_disable_fuse(false); + } + } +#endif + + if (changed.count("osd_recovery_delay_start")) { + service.defer_recovery(cct->_conf->osd_recovery_delay_start); + service.kick_recovery_queue(); + } + + if (changed.count("osd_client_message_cap")) { + uint64_t newval = cct->_conf->osd_client_message_cap; + Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT); + if (pol.throttler_messages && newval > 0) { + pol.throttler_messages->reset_max(newval); + } + } + if (changed.count("osd_client_message_size_cap")) { + uint64_t newval = cct->_conf->osd_client_message_size_cap; + Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT); + if (pol.throttler_bytes && newval > 0) { + pol.throttler_bytes->reset_max(newval); + } + } + + if (changed.count("osd_scrub_min_interval") || + changed.count("osd_scrub_max_interval")) { + resched_all_scrubs(); + dout(0) << __func__ << ": scrub interval change" << dendl; + } + check_config(); +} + +void OSD::update_log_config() +{ + map<string,string> log_to_monitors; + map<string,string> log_to_syslog; + map<string,string> log_channel; + map<string,string> log_prio; + map<string,string> log_to_graylog; + map<string,string> log_to_graylog_host; + map<string,string> log_to_graylog_port; + uuid_d fsid; + string host; + + if (parse_log_client_options(cct, log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host) == 0) + clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + derr << "log_to_monitors " << log_to_monitors << dendl; +} + +void OSD::check_config() +{ + // some sanity checks + if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) { + clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")" + << " is not > osd_pg_epoch_persisted_max_stale (" + << cct->_conf->osd_pg_epoch_persisted_max_stale << ")"; + } +} + +// -------------------------------- + +void OSD::get_latest_osdmap() +{ + dout(10) << __func__ << " -- start" << dendl; + + C_SaferCond cond; + service.objecter->wait_for_latest_osdmap(&cond); + cond.wait(); + + dout(10) << __func__ << " -- finish" << dendl; +} + +// -------------------------------- + +int OSD::init_op_flags(OpRequestRef& op) +{ + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + vector<OSDOp>::const_iterator iter; + + // client flags have no bearing on whether an op is a read, write, etc. + op->rmw_flags = 0; + + if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) { + op->set_force_rwordered(); + } + + // set bits based on op codes, called methods. 
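+  // Each sub-op can independently mark the request as a read, write, cache
+  // op, pg op, or promote candidate; the union of those bits describes the
+  // whole MOSDOp (a request that sets none of them is rejected with -EINVAL
+  // at the end of this function).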
+ for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) { + if ((iter->op.op == CEPH_OSD_OP_WATCH && + iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) { + /* This a bit odd. PING isn't actually a write. It can't + * result in an update to the object_info. PINGs also aren't + * resent, so there's no reason to write out a log entry. + * + * However, we pipeline them behind writes, so let's force + * the write_ordered flag. + */ + op->set_force_rwordered(); + } else { + if (ceph_osd_op_mode_modify(iter->op.op)) + op->set_write(); + } + if (ceph_osd_op_mode_read(iter->op.op)) + op->set_read(); + + // set READ flag if there are src_oids + if (iter->soid.oid.name.length()) + op->set_read(); + + // set PGOP flag if there are PG ops + if (ceph_osd_op_type_pg(iter->op.op)) + op->set_pg_op(); + + if (ceph_osd_op_mode_cache(iter->op.op)) + op->set_cache(); + + // check for ec base pool + int64_t poolid = m->get_pg().pool(); + const pg_pool_t *pool = get_osdmap()->get_pg_pool(poolid); + if (pool && pool->is_tier()) { + const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool->tier_of); + if (base_pool && base_pool->require_rollback()) { + if ((iter->op.op != CEPH_OSD_OP_READ) && + (iter->op.op != CEPH_OSD_OP_CHECKSUM) && + (iter->op.op != CEPH_OSD_OP_CMPEXT) && + (iter->op.op != CEPH_OSD_OP_STAT) && + (iter->op.op != CEPH_OSD_OP_ISDIRTY) && + (iter->op.op != CEPH_OSD_OP_UNDIRTY) && + (iter->op.op != CEPH_OSD_OP_GETXATTR) && + (iter->op.op != CEPH_OSD_OP_GETXATTRS) && + (iter->op.op != CEPH_OSD_OP_CMPXATTR) && + (iter->op.op != CEPH_OSD_OP_ASSERT_VER) && + (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) && + (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) && + (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) && + (iter->op.op != CEPH_OSD_OP_WRITEFULL) && + (iter->op.op != CEPH_OSD_OP_ROLLBACK) && + (iter->op.op != CEPH_OSD_OP_CREATE) && + (iter->op.op != CEPH_OSD_OP_DELETE) && + (iter->op.op != CEPH_OSD_OP_SETXATTR) && + (iter->op.op != CEPH_OSD_OP_RMXATTR) && + (iter->op.op != CEPH_OSD_OP_STARTSYNC) && + (iter->op.op != CEPH_OSD_OP_COPY_GET) && + (iter->op.op != CEPH_OSD_OP_COPY_FROM)) { + op->set_promote(); + } + } + } + + switch (iter->op.op) { + case CEPH_OSD_OP_CALL: + { + bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin(); + int is_write, is_read; + string cname, mname; + bp.copy(iter->op.cls.class_len, cname); + bp.copy(iter->op.cls.method_len, mname); + + ClassHandler::ClassData *cls; + int r = class_handler->open_class(cname, &cls); + if (r) { + derr << "class " << cname << " open got " << cpp_strerror(r) << dendl; + if (r == -ENOENT) + r = -EOPNOTSUPP; + else if (r != -EPERM) // propagate permission errors + r = -EIO; + return r; + } + int flags = cls->get_method_flags(mname.c_str()); + if (flags < 0) { + if (flags == -ENOENT) + r = -EOPNOTSUPP; + else + r = flags; + return r; + } + is_read = flags & CLS_METHOD_RD; + is_write = flags & CLS_METHOD_WR; + bool is_promote = flags & CLS_METHOD_PROMOTE; + + dout(10) << "class " << cname << " method " << mname << " " + << "flags=" << (is_read ? "r" : "") + << (is_write ? "w" : "") + << (is_promote ? "p" : "") + << dendl; + if (is_read) + op->set_class_read(); + if (is_write) + op->set_class_write(); + if (is_promote) + op->set_promote(); + op->add_class(std::move(cname), std::move(mname), is_read, is_write, + cls->whitelisted); + break; + } + + case CEPH_OSD_OP_WATCH: + // force the read bit for watch since it is depends on previous + // watch state (and may return early if the watch exists) or, in + // the case of ping, is simply a read op. 
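+      // (WATCH also falls through to the promote handling shared with
+      // NOTIFY and NOTIFY_ACK below.)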
+ op->set_read(); + // fall through + case CEPH_OSD_OP_NOTIFY: + case CEPH_OSD_OP_NOTIFY_ACK: + { + op->set_promote(); + break; + } + + case CEPH_OSD_OP_DELETE: + // if we get a delete with FAILOK we can skip handle cache. without + // FAILOK we still need to promote (or do something smarter) to + // determine whether to return ENOENT or 0. + if (iter == m->ops.begin() && + iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) { + op->set_skip_handle_cache(); + } + // skip promotion when proxying a delete op + if (m->ops.size() == 1) { + op->set_skip_promote(); + } + break; + + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + case CEPH_OSD_OP_CACHE_FLUSH: + case CEPH_OSD_OP_CACHE_EVICT: + // If try_flush/flush/evict is the only op, can skip handle cache. + if (m->ops.size() == 1) { + op->set_skip_handle_cache(); + } + break; + + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_CHECKSUM: + case CEPH_OSD_OP_WRITEFULL: + if (m->ops.size() == 1 && + (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE || + iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) { + op->set_skip_promote(); + } + break; + + // force promotion when pin an object in cache tier + case CEPH_OSD_OP_CACHE_PIN: + op->set_promote(); + break; + + default: + break; + } + } + + if (op->rmw_flags == 0) + return -EINVAL; + + return 0; +} + +void OSD::set_perf_queries( + const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) { + dout(10) << "setting " << queries.size() << " queries" << dendl; + + std::list<OSDPerfMetricQuery> supported_queries; + for (auto &it : queries) { + auto &query = it.first; + if (!query.key_descriptor.empty()) { + supported_queries.push_back(query); + } + } + if (supported_queries.size() < queries.size()) { + dout(1) << queries.size() - supported_queries.size() + << " unsupported queries" << dendl; + } + + { + Mutex::Locker locker(m_perf_queries_lock); + m_perf_queries = supported_queries; + m_perf_limits = queries; + } + + std::vector<PGRef> pgs; + _get_pgs(&pgs); + for (auto& pg : pgs) { + pg->lock(); + pg->set_dynamic_perf_stats_queries(supported_queries); + pg->unlock(); + } +} + +void OSD::get_perf_reports( + std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) { + std::vector<PGRef> pgs; + _get_pgs(&pgs); + DynamicPerfStats dps; + for (auto& pg : pgs) { + // m_perf_queries can be modified only in set_perf_queries by mgr client + // request, and it is protected by by mgr client's lock, which is held + // when set_perf_queries/get_perf_reports are called, so we may not hold + // m_perf_queries_lock here. + DynamicPerfStats pg_dps(m_perf_queries); + pg->lock(); + pg->get_dynamic_perf_stats(&pg_dps); + pg->unlock(); + dps.merge(pg_dps); + } + dps.add_to_reports(m_perf_limits, reports); + dout(20) << "reports for " << reports->size() << " queries" << dendl; +} + +// ============================================================= + +#undef dout_context +#define dout_context cct +#undef dout_prefix +#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." 
<< __func__ << " " + +void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg) +{ + dout(10) << pg->pg_id << " " << pg << dendl; + slot->pg = pg; + pg->osd_shard = this; + pg->pg_slot = slot; + osd->inc_num_pgs(); + + slot->epoch = pg->get_osdmap_epoch(); + pg_slots_by_epoch.insert(*slot); +} + +void OSDShard::_detach_pg(OSDShardPGSlot *slot) +{ + dout(10) << slot->pg->pg_id << " " << slot->pg << dendl; + slot->pg->osd_shard = nullptr; + slot->pg->pg_slot = nullptr; + slot->pg = nullptr; + osd->dec_num_pgs(); + + pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot)); + slot->epoch = 0; + if (waiting_for_min_pg_epoch) { + min_pg_epoch_cond.notify_all(); + } +} + +void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e) +{ + std::lock_guard l(shard_lock); + dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch + << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl; + pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot)); + dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl; + slot->epoch = e; + pg_slots_by_epoch.insert(*slot); + dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch + << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl; + if (waiting_for_min_pg_epoch) { + min_pg_epoch_cond.notify_all(); + } +} + +epoch_t OSDShard::get_min_pg_epoch() +{ + std::lock_guard l(shard_lock); + auto p = pg_slots_by_epoch.begin(); + if (p == pg_slots_by_epoch.end()) { + return 0; + } + return p->epoch; +} + +void OSDShard::wait_min_pg_epoch(epoch_t need) +{ + std::unique_lock l{shard_lock}; + ++waiting_for_min_pg_epoch; + min_pg_epoch_cond.wait(l, [need, this] { + if (pg_slots_by_epoch.empty()) { + return true; + } else if (pg_slots_by_epoch.begin()->epoch >= need) { + return true; + } else { + dout(10) << need << " waiting on " + << pg_slots_by_epoch.begin()->epoch << dendl; + return false; + } + }); + --waiting_for_min_pg_epoch; +} + +epoch_t OSDShard::get_max_waiting_epoch() +{ + std::lock_guard l(shard_lock); + epoch_t r = 0; + for (auto& i : pg_slots) { + if (!i.second->waiting_peering.empty()) { + r = std::max(r, i.second->waiting_peering.rbegin()->first); + } + } + return r; +} + +void OSDShard::consume_map( + const OSDMapRef& new_osdmap, + unsigned *pushes_to_free) +{ + std::lock_guard l(shard_lock); + OSDMapRef old_osdmap; + { + std::lock_guard l(osdmap_lock); + old_osdmap = std::move(shard_osdmap); + shard_osdmap = new_osdmap; + } + dout(10) << new_osdmap->get_epoch() + << " (was " << (old_osdmap ? 
old_osdmap->get_epoch() : 0) << ")" + << dendl; + bool queued = false; + + // check slots + auto p = pg_slots.begin(); + while (p != pg_slots.end()) { + OSDShardPGSlot *slot = p->second.get(); + const spg_t& pgid = p->first; + dout(20) << __func__ << " " << pgid << dendl; + if (!slot->waiting_for_split.empty()) { + dout(20) << __func__ << " " << pgid + << " waiting for split " << slot->waiting_for_split << dendl; + ++p; + continue; + } + if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) { + dout(20) << __func__ << " " << pgid + << " waiting for merge by epoch " << slot->waiting_for_merge_epoch + << dendl; + ++p; + continue; + } + if (!slot->waiting_peering.empty()) { + epoch_t first = slot->waiting_peering.begin()->first; + if (first <= new_osdmap->get_epoch()) { + dout(20) << __func__ << " " << pgid + << " pending_peering first epoch " << first + << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl; + _wake_pg_slot(pgid, slot); + queued = true; + } + ++p; + continue; + } + if (!slot->waiting.empty()) { + if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) { + dout(20) << __func__ << " " << pgid << " maps to us, keeping" + << dendl; + ++p; + continue; + } + while (!slot->waiting.empty() && + slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) { + auto& qi = slot->waiting.front(); + dout(20) << __func__ << " " << pgid + << " waiting item " << qi + << " epoch " << qi.get_map_epoch() + << " <= " << new_osdmap->get_epoch() + << ", " + << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" : + "misdirected") + << ", dropping" << dendl; + *pushes_to_free += qi.get_reserved_pushes(); + slot->waiting.pop_front(); + } + } + if (slot->waiting.empty() && + slot->num_running == 0 && + slot->waiting_for_split.empty() && + !slot->pg) { + dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl; + p = pg_slots.erase(p); + continue; + } + + ++p; + } + if (queued) { + std::lock_guard l{sdata_wait_lock}; + sdata_cond.notify_one(); + } +} + +void OSDShard::_wake_pg_slot( + spg_t pgid, + OSDShardPGSlot *slot) +{ + dout(20) << __func__ << " " << pgid + << " to_process " << slot->to_process + << " waiting " << slot->waiting + << " waiting_peering " << slot->waiting_peering << dendl; + for (auto i = slot->to_process.rbegin(); + i != slot->to_process.rend(); + ++i) { + _enqueue_front(std::move(*i), osd->op_prio_cutoff); + } + slot->to_process.clear(); + for (auto i = slot->waiting.rbegin(); + i != slot->waiting.rend(); + ++i) { + _enqueue_front(std::move(*i), osd->op_prio_cutoff); + } + slot->waiting.clear(); + for (auto i = slot->waiting_peering.rbegin(); + i != slot->waiting_peering.rend(); + ++i) { + // this is overkill; we requeue everything, even if some of these + // items are waiting for maps we don't have yet. 
FIXME, maybe, + // someday, if we decide this inefficiency matters + for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) { + _enqueue_front(std::move(*j), osd->op_prio_cutoff); + } + } + slot->waiting_peering.clear(); + ++slot->requeue_seq; +} + +void OSDShard::identify_splits_and_merges( + const OSDMapRef& as_of_osdmap, + set<pair<spg_t,epoch_t>> *split_pgs, + set<pair<spg_t,epoch_t>> *merge_pgs) +{ + std::lock_guard l(shard_lock); + if (shard_osdmap) { + for (auto& i : pg_slots) { + const spg_t& pgid = i.first; + auto *slot = i.second.get(); + if (slot->pg) { + osd->service.identify_splits_and_merges( + shard_osdmap, as_of_osdmap, pgid, + split_pgs, merge_pgs); + } else if (!slot->waiting_for_split.empty()) { + osd->service.identify_splits_and_merges( + shard_osdmap, as_of_osdmap, pgid, + split_pgs, nullptr); + } else { + dout(20) << __func__ << " slot " << pgid + << " has no pg and waiting_for_split " + << slot->waiting_for_split << dendl; + } + } + } +} + +void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap, + set<pair<spg_t,epoch_t>> *pgids) +{ + std::lock_guard l(shard_lock); + _prime_splits(pgids); + if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) { + set<pair<spg_t,epoch_t>> newer_children; + for (auto i : *pgids) { + osd->service.identify_splits_and_merges( + as_of_osdmap, shard_osdmap, i.first, + &newer_children, nullptr); + } + newer_children.insert(pgids->begin(), pgids->end()); + dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard " + << shard_osdmap->get_epoch() << ", new children " << newer_children + << dendl; + _prime_splits(&newer_children); + // note: we don't care what is left over here for other shards. + // if this shard is ahead of us and one isn't, e.g., one thread is + // calling into prime_splits via _process (due to a newly created + // pg) and this shard has a newer map due to a racing consume_map, + // then any grandchildren left here will be identified (or were + // identified) when the slower shard's osdmap is advanced. + // _prime_splits() will tolerate the case where the pgid is + // already primed. 
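+    // Whatever is still left in newer_children (or in *pgids) at this point
+    // simply belongs to other shards: _prime_splits() only claims, and
+    // erases, entries that hash to this shard's id.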
+ } +} + +void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids) +{ + dout(10) << *pgids << dendl; + auto p = pgids->begin(); + while (p != pgids->end()) { + unsigned shard_index = p->first.hash_to_shard(osd->num_shards); + if (shard_index == shard_id) { + auto r = pg_slots.emplace(p->first, nullptr); + if (r.second) { + dout(10) << "priming slot " << p->first << " e" << p->second << dendl; + r.first->second = make_unique<OSDShardPGSlot>(); + r.first->second->waiting_for_split.insert(p->second); + } else { + auto q = r.first; + ceph_assert(q != pg_slots.end()); + dout(10) << "priming (existing) slot " << p->first << " e" << p->second + << dendl; + q->second->waiting_for_split.insert(p->second); + } + p = pgids->erase(p); + } else { + ++p; + } + } +} + +void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap, + set<pair<spg_t,epoch_t>> *merge_pgs) +{ + std::lock_guard l(shard_lock); + dout(20) << __func__ << " checking shard " << shard_id + << " for remaining merge pgs " << merge_pgs << dendl; + auto p = merge_pgs->begin(); + while (p != merge_pgs->end()) { + spg_t pgid = p->first; + epoch_t epoch = p->second; + unsigned shard_index = pgid.hash_to_shard(osd->num_shards); + if (shard_index != shard_id) { + ++p; + continue; + } + OSDShardPGSlot *slot; + auto r = pg_slots.emplace(pgid, nullptr); + if (r.second) { + r.first->second = make_unique<OSDShardPGSlot>(); + } + slot = r.first->second.get(); + if (slot->pg) { + // already have pg + dout(20) << __func__ << " have merge participant pg " << pgid + << " " << slot->pg << dendl; + } else if (!slot->waiting_for_split.empty() && + *slot->waiting_for_split.begin() < epoch) { + dout(20) << __func__ << " pending split on merge participant pg " << pgid + << " " << slot->waiting_for_split << dendl; + } else { + dout(20) << __func__ << " creating empty merge participant " << pgid + << " for merge in " << epoch << dendl; + // leave history zeroed; PG::merge_from() will fill it in. 
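+      // Instantiate an empty placeholder PG at (epoch - 1) via
+      // handle_pg_create_info(), attach it to this slot, and wake the slot
+      // so anything already queued against this pgid is requeued and can
+      // now find the pg.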
+ pg_history_t history; + PGCreateInfo cinfo(pgid, epoch - 1, + history, PastIntervals(), false); + PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo); + _attach_pg(r.first->second.get(), pg.get()); + _wake_pg_slot(pgid, slot); + pg->unlock(); + } + // mark slot for merge + dout(20) << __func__ << " marking merge participant " << pgid << dendl; + slot->waiting_for_merge_epoch = epoch; + p = merge_pgs->erase(p); + } +} + +void OSDShard::register_and_wake_split_child(PG *pg) +{ + epoch_t epoch; + { + std::lock_guard l(shard_lock); + dout(10) << pg->pg_id << " " << pg << dendl; + auto p = pg_slots.find(pg->pg_id); + ceph_assert(p != pg_slots.end()); + auto *slot = p->second.get(); + dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split + << dendl; + ceph_assert(!slot->pg); + ceph_assert(!slot->waiting_for_split.empty()); + _attach_pg(slot, pg); + + epoch = pg->get_osdmap_epoch(); + ceph_assert(slot->waiting_for_split.count(epoch)); + slot->waiting_for_split.erase(epoch); + if (slot->waiting_for_split.empty()) { + _wake_pg_slot(pg->pg_id, slot); + } else { + dout(10) << __func__ << " still waiting for split on " + << slot->waiting_for_split << dendl; + } + } + + // kick child to ensure it pulls up to the latest osdmap + osd->enqueue_peering_evt( + pg->pg_id, + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + epoch, + epoch, + NullEvt()))); + + std::lock_guard l{sdata_wait_lock}; + sdata_cond.notify_one(); +} + +void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num) +{ + std::lock_guard l(shard_lock); + vector<spg_t> to_delete; + for (auto& i : pg_slots) { + if (i.first != parent && + i.first.get_ancestor(old_pg_num) == parent) { + dout(10) << __func__ << " parent " << parent << " clearing " << i.first + << dendl; + _wake_pg_slot(i.first, i.second.get()); + to_delete.push_back(i.first); + } + } + for (auto pgid : to_delete) { + pg_slots.erase(pgid); + } +} + + +// ============================================================= + +#undef dout_context +#define dout_context osd->cct +#undef dout_prefix +#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq " + +void OSD::ShardedOpWQ::_add_slot_waiter( + spg_t pgid, + OSDShardPGSlot *slot, + OpQueueItem&& qi) +{ + if (qi.is_peering()) { + dout(20) << __func__ << " " << pgid + << " peering, item epoch is " + << qi.get_map_epoch() + << ", will wait on " << qi << dendl; + slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi)); + } else { + dout(20) << __func__ << " " << pgid + << " item epoch is " + << qi.get_map_epoch() + << ", will wait on " << qi << dendl; + slot->waiting.push_back(std::move(qi)); + } +} + +#undef dout_prefix +#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") " + +void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb) +{ + uint32_t shard_index = thread_index % osd->num_shards; + auto& sdata = osd->shards[shard_index]; + ceph_assert(sdata); + + // If all threads of shards do oncommits, there is a out-of-order + // problem. So we choose the thread which has the smallest + // thread_index(thread_index < num_shards) of shard to do oncommit + // callback. 
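+  // In other words, each shard's context_queue is drained by exactly one
+  // worker thread (the one with thread_index < num_shards for that shard),
+  // so oncommit callbacks for a shard run in the order they were queued.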
+ bool is_smallest_thread_index = thread_index < osd->num_shards; + + // peek at spg_t + sdata->shard_lock.lock(); + if (sdata->pqueue->empty() && + (!is_smallest_thread_index || sdata->context_queue.empty())) { + std::unique_lock wait_lock{sdata->sdata_wait_lock}; + if (is_smallest_thread_index && !sdata->context_queue.empty()) { + // we raced with a context_queue addition, don't wait + wait_lock.unlock(); + } else if (!sdata->stop_waiting) { + dout(20) << __func__ << " empty q, waiting" << dendl; + osd->cct->get_heartbeat_map()->clear_timeout(hb); + sdata->shard_lock.unlock(); + sdata->sdata_cond.wait(wait_lock); + wait_lock.unlock(); + sdata->shard_lock.lock(); + if (sdata->pqueue->empty() && + !(is_smallest_thread_index && !sdata->context_queue.empty())) { + sdata->shard_lock.unlock(); + return; + } + // found a work item; reapply default wq timeouts + osd->cct->get_heartbeat_map()->reset_timeout(hb, + timeout_interval, suicide_interval); + } else { + dout(20) << __func__ << " need return immediately" << dendl; + wait_lock.unlock(); + sdata->shard_lock.unlock(); + return; + } + } + + list<Context *> oncommits; + if (is_smallest_thread_index && !sdata->context_queue.empty()) { + sdata->context_queue.swap(oncommits); + } + + if (sdata->pqueue->empty()) { + if (osd->is_stopping()) { + sdata->shard_lock.unlock(); + for (auto c : oncommits) { + dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl; + delete c; + } + return; // OSD shutdown, discard. + } + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + + OpQueueItem item = sdata->pqueue->dequeue(); + if (osd->is_stopping()) { + sdata->shard_lock.unlock(); + for (auto c : oncommits) { + dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl; + delete c; + } + return; // OSD shutdown, discard. + } + + const auto token = item.get_ordering_token(); + auto r = sdata->pg_slots.emplace(token, nullptr); + if (r.second) { + r.first->second = make_unique<OSDShardPGSlot>(); + } + OSDShardPGSlot *slot = r.first->second.get(); + dout(20) << __func__ << " " << token + << (r.second ? " (new)" : "") + << " to_process " << slot->to_process + << " waiting " << slot->waiting + << " waiting_peering " << slot->waiting_peering + << dendl; + slot->to_process.push_back(std::move(item)); + dout(20) << __func__ << " " << slot->to_process.back() + << " queued" << dendl; + + retry_pg: + PGRef pg = slot->pg; + + // lock pg (if we have it) + if (pg) { + // note the requeue seq now... + uint64_t requeue_seq = slot->requeue_seq; + ++slot->num_running; + + sdata->shard_lock.unlock(); + osd->service.maybe_inject_dispatch_delay(); + pg->lock(); + osd->service.maybe_inject_dispatch_delay(); + sdata->shard_lock.lock(); + + auto q = sdata->pg_slots.find(token); + if (q == sdata->pg_slots.end()) { + // this can happen if we race with pg removal. 
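+    // We dropped shard_lock to take the pg lock, so the slot must be
+    // revalidated: it may be gone (handled here), drained, requeued
+    // (requeue_seq bumped), or attached to a different pg (checked below).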
+ dout(20) << __func__ << " slot " << token << " no longer there" << dendl; + pg->unlock(); + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + slot = q->second.get(); + --slot->num_running; + + if (slot->to_process.empty()) { + // raced with _wake_pg_slot or consume_map + dout(20) << __func__ << " " << token + << " nothing queued" << dendl; + pg->unlock(); + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + if (requeue_seq != slot->requeue_seq) { + dout(20) << __func__ << " " << token + << " requeue_seq " << slot->requeue_seq << " > our " + << requeue_seq << ", we raced with _wake_pg_slot" + << dendl; + pg->unlock(); + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + if (slot->pg != pg) { + // this can happen if we race with pg removal. + dout(20) << __func__ << " slot " << token << " no longer attached to " + << pg << dendl; + pg->unlock(); + goto retry_pg; + } + } + + dout(20) << __func__ << " " << token + << " to_process " << slot->to_process + << " waiting " << slot->waiting + << " waiting_peering " << slot->waiting_peering << dendl; + + ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval, + suicide_interval); + + // take next item + auto qi = std::move(slot->to_process.front()); + slot->to_process.pop_front(); + dout(20) << __func__ << " " << qi << " pg " << pg << dendl; + set<pair<spg_t,epoch_t>> new_children; + OSDMapRef osdmap; + + while (!pg) { + // should this pg shard exist on this osd in this (or a later) epoch? + osdmap = sdata->shard_osdmap; + const PGCreateInfo *create_info = qi.creates_pg(); + if (!slot->waiting_for_split.empty()) { + dout(20) << __func__ << " " << token + << " splitting " << slot->waiting_for_split << dendl; + _add_slot_waiter(token, slot, std::move(qi)); + } else if (qi.get_map_epoch() > osdmap->get_epoch()) { + dout(20) << __func__ << " " << token + << " map " << qi.get_map_epoch() << " > " + << osdmap->get_epoch() << dendl; + _add_slot_waiter(token, slot, std::move(qi)); + } else if (qi.is_peering()) { + if (!qi.peering_requires_pg()) { + // for pg-less events, we run them under the ordering lock, since + // we don't have the pg lock to keep them ordered. + qi.run(osd, sdata, pg, tp_handle); + } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) { + if (create_info) { + if (create_info->by_mon && + osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) { + dout(20) << __func__ << " " << token + << " no pg, no longer primary, ignoring mon create on " + << qi << dendl; + } else { + dout(20) << __func__ << " " << token + << " no pg, should create on " << qi << dendl; + pg = osd->handle_pg_create_info(osdmap, create_info); + if (pg) { + // we created the pg! drop out and continue "normally"! + sdata->_attach_pg(slot, pg.get()); + sdata->_wake_pg_slot(token, slot); + + // identify split children between create epoch and shard epoch. + osd->service.identify_splits_and_merges( + pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr); + sdata->_prime_splits(&new_children); + // distribute remaining split children to other shards below! 
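+            // leave the pg-less loop and process qi normally with the new
+            // pg; new_children is handed to the other shards once we drop
+            // shard_lock (further below).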
+ break; + } + dout(20) << __func__ << " ignored create on " << qi << dendl; + } + } else { + dout(20) << __func__ << " " << token + << " no pg, peering, !create, discarding " << qi << dendl; + } + } else { + dout(20) << __func__ << " " << token + << " no pg, peering, doesn't map here e" << osdmap->get_epoch() + << ", discarding " << qi + << dendl; + } + } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) { + dout(20) << __func__ << " " << token + << " no pg, should exist e" << osdmap->get_epoch() + << ", will wait on " << qi << dendl; + _add_slot_waiter(token, slot, std::move(qi)); + } else { + dout(20) << __func__ << " " << token + << " no pg, shouldn't exist e" << osdmap->get_epoch() + << ", dropping " << qi << dendl; + // share map with client? + if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) { + auto priv = (*_op)->get_req()->get_connection()->get_priv(); + if (auto session = static_cast<Session *>(priv.get()); session) { + osd->maybe_share_map(session, *_op, sdata->shard_osdmap); + } + } + unsigned pushes_to_free = qi.get_reserved_pushes(); + if (pushes_to_free > 0) { + sdata->shard_lock.unlock(); + osd->service.release_reserved_pushes(pushes_to_free); + handle_oncommits(oncommits); + return; + } + } + sdata->shard_lock.unlock(); + handle_oncommits(oncommits); + return; + } + if (qi.is_peering()) { + OSDMapRef osdmap = sdata->shard_osdmap; + if (qi.get_map_epoch() > osdmap->get_epoch()) { + _add_slot_waiter(token, slot, std::move(qi)); + sdata->shard_lock.unlock(); + pg->unlock(); + handle_oncommits(oncommits); + return; + } + } + sdata->shard_lock.unlock(); + + if (!new_children.empty()) { + for (auto shard : osd->shards) { + shard->prime_splits(osdmap, &new_children); + } + ceph_assert(new_children.empty()); + } + + // osd_opwq_process marks the point at which an operation has been dequeued + // and will begin to be handled by a worker thread. 
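+  // When built WITH_LTTNG, the opwq_process_start and opwq_process_finish
+  // tracepoints below bracket qi.run(), so traces can attribute execution
+  // time for each dequeued item to its originating request.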
+ { +#ifdef WITH_LTTNG + osd_reqid_t reqid; + if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) { + reqid = (*_op)->get_reqid(); + } +#endif + tracepoint(osd, opwq_process_start, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: "; + Formatter *f = Formatter::create("json"); + f->open_object_section("q"); + dump(f); + f->close_section(); + f->flush(*_dout); + delete f; + *_dout << dendl; + + qi.run(osd, sdata, pg, tp_handle); + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid; + if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) { + reqid = (*_op)->get_reqid(); + } +#endif + tracepoint(osd, opwq_process_finish, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + handle_oncommits(oncommits); +} + +void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) { + uint32_t shard_index = + item.get_ordering_token().hash_to_shard(osd->shards.size()); + + OSDShard* sdata = osd->shards[shard_index]; + assert (NULL != sdata); + unsigned priority = item.get_priority(); + unsigned cost = item.get_cost(); + sdata->shard_lock.lock(); + + dout(20) << __func__ << " " << item << dendl; + if (priority >= osd->op_prio_cutoff) + sdata->pqueue->enqueue_strict( + item.get_owner(), priority, std::move(item)); + else + sdata->pqueue->enqueue( + item.get_owner(), priority, cost, std::move(item)); + sdata->shard_lock.unlock(); + + std::lock_guard l{sdata->sdata_wait_lock}; + sdata->sdata_cond.notify_one(); +} + +void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item) +{ + auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); + auto& sdata = osd->shards[shard_index]; + ceph_assert(sdata); + sdata->shard_lock.lock(); + auto p = sdata->pg_slots.find(item.get_ordering_token()); + if (p != sdata->pg_slots.end() && + !p->second->to_process.empty()) { + // we may be racing with _process, which has dequeued a new item + // from pqueue, put it on to_process, and is now busy taking the + // pg lock. ensure this old requeued item is ordered before any + // such newer item in to_process. 
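+    // Swap: park the requeued (older) item at the head of to_process and
+    // pull the newest to_process entry back out, so that newer item is the
+    // one pushed onto the front of the pqueue below.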
+ p->second->to_process.push_front(std::move(item)); + item = std::move(p->second->to_process.back()); + p->second->to_process.pop_back(); + dout(20) << __func__ + << " " << p->second->to_process.front() + << " shuffled w/ " << item << dendl; + } else { + dout(20) << __func__ << " " << item << dendl; + } + sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff); + sdata->shard_lock.unlock(); + std::lock_guard l{sdata->sdata_wait_lock}; + sdata->sdata_cond.notify_one(); +} + +namespace ceph { +namespace osd_cmds { + +int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, + std::ostream& os) +{ + if (!ceph_using_tcmalloc()) { + os << "could not issue heap profiler command -- not using tcmalloc!"; + return -EOPNOTSUPP; + } + + string cmd; + if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) { + os << "unable to get value for command \"" << cmd << "\""; + return -EINVAL; + } + + std::vector<std::string> cmd_vec; + get_str_vec(cmd, cmd_vec); + + string val; + if (cmd_getval(&cct, cmdmap, "value", val)) { + cmd_vec.push_back(val); + } + + ceph_heap_profiler_handle_command(cmd_vec, os); + + return 0; +} + +}} // namespace ceph::osd_cmds + + +std::ostream& operator<<(std::ostream& out, const io_queue& q) { + switch(q) { + case io_queue::prioritized: + out << "prioritized"; + break; + case io_queue::weightedpriority: + out << "weightedpriority"; + break; + case io_queue::mclock_opclass: + out << "mclock_opclass"; + break; + case io_queue::mclock_client: + out << "mclock_client"; + break; + } + return out; +} diff --git a/src/osd/OSD.h b/src/osd/OSD.h new file mode 100644 index 00000000..8c87823d --- /dev/null +++ b/src/osd/OSD.h @@ -0,0 +1,2395 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OSD_H +#define CEPH_OSD_H + +#include "PG.h" + +#include "msg/Dispatcher.h" + +#include "common/Mutex.h" +#include "common/RWLock.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "common/AsyncReserver.h" +#include "common/ceph_context.h" +#include "common/config_cacher.h" +#include "common/zipkin_trace.h" + +#include "mgr/MgrClient.h" + +#include "os/ObjectStore.h" +#include "OSDCap.h" + +#include "auth/KeyRing.h" + +#include "osd/ClassHandler.h" + +#include "include/CompatSet.h" + +#include "OpRequest.h" +#include "Session.h" + +#include "osd/OpQueueItem.h" + +#include <atomic> +#include <map> +#include <memory> +#include <string> + +#include "include/unordered_map.h" + +#include "common/shared_cache.hpp" +#include "common/simple_cache.hpp" +#include "common/sharedptr_registry.hpp" +#include "common/WeightedPriorityQueue.h" +#include "common/PrioritizedQueue.h" +#include "osd/mClockOpClassQueue.h" +#include "osd/mClockClientQueue.h" +#include "messages/MOSDOp.h" +#include "common/EventTrace.h" + +#define CEPH_OSD_PROTOCOL 10 /* cluster internal */ + +/* + + lock ordering for pg map + + PG::lock + ShardData::lock + OSD::pg_map_lock + + */ + +enum { + l_osd_first = 10000, + l_osd_op_wip, + l_osd_op, + l_osd_op_inb, + l_osd_op_outb, + l_osd_op_lat, + l_osd_op_process_lat, + l_osd_op_prepare_lat, + l_osd_op_r, + l_osd_op_r_outb, + l_osd_op_r_lat, + l_osd_op_r_lat_outb_hist, + l_osd_op_r_process_lat, + l_osd_op_r_prepare_lat, + l_osd_op_w, + l_osd_op_w_inb, + l_osd_op_w_lat, + l_osd_op_w_lat_inb_hist, + l_osd_op_w_process_lat, + l_osd_op_w_prepare_lat, + l_osd_op_rw, + l_osd_op_rw_inb, + l_osd_op_rw_outb, + l_osd_op_rw_lat, + l_osd_op_rw_lat_inb_hist, + l_osd_op_rw_lat_outb_hist, + l_osd_op_rw_process_lat, + l_osd_op_rw_prepare_lat, + + l_osd_op_before_queue_op_lat, + l_osd_op_before_dequeue_op_lat, + + l_osd_sop, + l_osd_sop_inb, + l_osd_sop_lat, + l_osd_sop_w, + l_osd_sop_w_inb, + l_osd_sop_w_lat, + l_osd_sop_pull, + l_osd_sop_pull_lat, + l_osd_sop_push, + l_osd_sop_push_inb, + l_osd_sop_push_lat, + + l_osd_pull, + l_osd_push, + l_osd_push_outb, + + l_osd_rop, + l_osd_rbytes, + + l_osd_loadavg, + l_osd_cached_crc, + l_osd_cached_crc_adjusted, + l_osd_missed_crc, + + l_osd_pg, + l_osd_pg_primary, + l_osd_pg_replica, + l_osd_pg_stray, + l_osd_pg_removing, + l_osd_hb_to, + l_osd_map, + l_osd_mape, + l_osd_mape_dup, + + l_osd_waiting_for_map, + + l_osd_map_cache_hit, + l_osd_map_cache_miss, + l_osd_map_cache_miss_low, + l_osd_map_cache_miss_low_avg, + l_osd_map_bl_cache_hit, + l_osd_map_bl_cache_miss, + + l_osd_stat_bytes, + l_osd_stat_bytes_used, + l_osd_stat_bytes_avail, + + l_osd_copyfrom, + + l_osd_tier_promote, + l_osd_tier_flush, + l_osd_tier_flush_fail, + l_osd_tier_try_flush, + l_osd_tier_try_flush_fail, + l_osd_tier_evict, + l_osd_tier_whiteout, + l_osd_tier_dirty, + l_osd_tier_clean, + l_osd_tier_delay, + l_osd_tier_proxy_read, + l_osd_tier_proxy_write, + + l_osd_agent_wake, + l_osd_agent_skip, + l_osd_agent_flush, + l_osd_agent_evict, + + l_osd_object_ctx_cache_hit, + l_osd_object_ctx_cache_total, + + l_osd_op_cache_hit, + l_osd_tier_flush_lat, + l_osd_tier_promote_lat, + l_osd_tier_r_lat, + + l_osd_pg_info, + l_osd_pg_fastinfo, + l_osd_pg_biginfo, + + l_osd_last, +}; + +// RecoveryState perf counters +enum { + rs_first = 20000, + rs_initial_latency, + rs_started_latency, + rs_reset_latency, + rs_start_latency, + rs_primary_latency, + rs_peering_latency, + rs_backfilling_latency, + rs_waitremotebackfillreserved_latency, + 
rs_waitlocalbackfillreserved_latency, + rs_notbackfilling_latency, + rs_repnotrecovering_latency, + rs_repwaitrecoveryreserved_latency, + rs_repwaitbackfillreserved_latency, + rs_reprecovering_latency, + rs_activating_latency, + rs_waitlocalrecoveryreserved_latency, + rs_waitremoterecoveryreserved_latency, + rs_recovering_latency, + rs_recovered_latency, + rs_clean_latency, + rs_active_latency, + rs_replicaactive_latency, + rs_stray_latency, + rs_getinfo_latency, + rs_getlog_latency, + rs_waitactingchange_latency, + rs_incomplete_latency, + rs_down_latency, + rs_getmissing_latency, + rs_waitupthru_latency, + rs_notrecovering_latency, + rs_last, +}; + +class Messenger; +class Message; +class MonClient; +class PerfCounters; +class ObjectStore; +class FuseStore; +class OSDMap; +class MLog; +class Objecter; +class KeyStore; + +class Watch; +class PrimaryLogPG; + +class TestOpsSocketHook; +struct C_FinishSplits; +struct C_OpenPGs; +class LogChannel; +class CephContext; +class MOSDOp; + +class MOSDPGCreate2; +class MOSDPGQuery; +class MOSDPGNotify; +class MOSDPGInfo; +class MOSDPGRemove; +class MOSDForceRecovery; + +class OSD; + +class OSDService { +public: + OSD *osd; + CephContext *cct; + ObjectStore::CollectionHandle meta_ch; + const int whoami; + ObjectStore *&store; + LogClient &log_client; + LogChannelRef clog; + PGRecoveryStats &pg_recovery_stats; +private: + Messenger *&cluster_messenger; + Messenger *&client_messenger; +public: + PerfCounters *&logger; + PerfCounters *&recoverystate_perf; + MonClient *&monc; + ClassHandler *&class_handler; + + md_config_cacher_t<Option::size_t> osd_max_object_size; + md_config_cacher_t<bool> osd_skip_data_digest; + + void enqueue_back(OpQueueItem&& qi); + void enqueue_front(OpQueueItem&& qi); + + void maybe_inject_dispatch_delay() { + if (g_conf()->osd_debug_inject_dispatch_delay_probability > 0) { + if (rand() % 10000 < + g_conf()->osd_debug_inject_dispatch_delay_probability * 10000) { + utime_t t; + t.set_from_double(g_conf()->osd_debug_inject_dispatch_delay_duration); + t.sleep(); + } + } + } + +private: + // -- superblock -- + ceph::mutex publish_lock, pre_publish_lock; // pre-publish orders before publish + OSDSuperblock superblock; + +public: + OSDSuperblock get_superblock() { + std::lock_guard l(publish_lock); + return superblock; + } + void publish_superblock(const OSDSuperblock &block) { + std::lock_guard l(publish_lock); + superblock = block; + } + + int get_nodeid() const { return whoami; } + + std::atomic<epoch_t> max_oldest_map; +private: + OSDMapRef osdmap; + +public: + OSDMapRef get_osdmap() { + std::lock_guard l(publish_lock); + return osdmap; + } + epoch_t get_osdmap_epoch() { + std::lock_guard l(publish_lock); + return osdmap ? osdmap->get_epoch() : 0; + } + void publish_map(OSDMapRef map) { + std::lock_guard l(publish_lock); + osdmap = map; + } + + /* + * osdmap - current published map + * next_osdmap - pre_published map that is about to be published. + * + * We use the next_osdmap to send messages and initiate connections, + * but only if the target is the same instance as the one in the map + * epoch the current user is working from (i.e., the result is + * equivalent to what is in next_osdmap). + * + * This allows the helpers to start ignoring osds that are about to + * go down, and let OSD::handle_osd_map()/note_down_osd() mark them + * down, without worrying about reopening connections from threads + * working from old maps. 
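+ * Users take a reservation on next_osdmap via get_nextmap_reserved() and
+ * drop it with release_map(); await_reserved_maps() blocks until every
+ * reservation on an epoch older than next_osdmap has been released.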
+ */ +private: + OSDMapRef next_osdmap; + ceph::condition_variable pre_publish_cond; + +public: + void pre_publish_map(OSDMapRef map) { + std::lock_guard l(pre_publish_lock); + next_osdmap = std::move(map); + } + + void activate_map(); + /// map epochs reserved below + map<epoch_t, unsigned> map_reservations; + + /// gets ref to next_osdmap and registers the epoch as reserved + OSDMapRef get_nextmap_reserved() { + std::lock_guard l(pre_publish_lock); + if (!next_osdmap) + return OSDMapRef(); + epoch_t e = next_osdmap->get_epoch(); + map<epoch_t, unsigned>::iterator i = + map_reservations.insert(make_pair(e, 0)).first; + i->second++; + return next_osdmap; + } + /// releases reservation on map + void release_map(OSDMapRef osdmap) { + std::lock_guard l(pre_publish_lock); + map<epoch_t, unsigned>::iterator i = + map_reservations.find(osdmap->get_epoch()); + ceph_assert(i != map_reservations.end()); + ceph_assert(i->second > 0); + if (--(i->second) == 0) { + map_reservations.erase(i); + } + pre_publish_cond.notify_all(); + } + /// blocks until there are no reserved maps prior to next_osdmap + void await_reserved_maps() { + std::unique_lock l{pre_publish_lock}; + ceph_assert(next_osdmap); + pre_publish_cond.wait(l, [this] { + auto i = map_reservations.cbegin(); + return (i == map_reservations.cend() || + i->first >= next_osdmap->get_epoch()); + }); + } + OSDMapRef get_next_osdmap() { + std::lock_guard l(pre_publish_lock); + if (!next_osdmap) + return OSDMapRef(); + return next_osdmap; + } + +private: + Mutex peer_map_epoch_lock; + map<int, epoch_t> peer_map_epoch; +public: + epoch_t get_peer_epoch(int p); + epoch_t note_peer_epoch(int p, epoch_t e); + void forget_peer_epoch(int p, epoch_t e); + + void send_map(class MOSDMap *m, Connection *con); + void send_incremental_map(epoch_t since, Connection *con, const OSDMapRef& osdmap); + MOSDMap *build_incremental_map_msg(epoch_t from, epoch_t to, + OSDSuperblock& superblock); + bool should_share_map(entity_name_t name, Connection *con, epoch_t epoch, + const OSDMapRef& osdmap, const epoch_t *sent_epoch_p); + void share_map(entity_name_t name, Connection *con, epoch_t epoch, + OSDMapRef& osdmap, epoch_t *sent_epoch_p); + void share_map_peer(int peer, Connection *con, + OSDMapRef map = OSDMapRef()); + + ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch); + pair<ConnectionRef,ConnectionRef> get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front) + void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch); + void send_message_osd_cluster(Message *m, Connection *con) { + con->send_message(m); + } + void send_message_osd_cluster(Message *m, const ConnectionRef& con) { + con->send_message(m); + } + void send_message_osd_client(Message *m, Connection *con) { + con->send_message(m); + } + void send_message_osd_client(Message *m, const ConnectionRef& con) { + con->send_message(m); + } + entity_name_t get_cluster_msgr_name() const; + +private: + // -- scrub scheduling -- + Mutex sched_scrub_lock; + int scrubs_local; + int scrubs_remote; + +public: + struct ScrubJob { + CephContext* cct; + /// pg to be scrubbed + spg_t pgid; + /// a time scheduled for scrub. 
but the scrub could be delayed if system + /// load is too high or it fails to fall in the scrub hours + utime_t sched_time; + /// the hard upper bound of scrub time + utime_t deadline; + ScrubJob() : cct(nullptr) {} + explicit ScrubJob(CephContext* cct, const spg_t& pg, + const utime_t& timestamp, + double pool_scrub_min_interval = 0, + double pool_scrub_max_interval = 0, bool must = true); + /// order the jobs by sched_time + bool operator<(const ScrubJob& rhs) const; + }; + set<ScrubJob> sched_scrub_pg; + + /// @returns the scrub_reg_stamp used for unregister the scrub job + utime_t reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval, + double pool_scrub_max_interval, bool must) { + ScrubJob scrub(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval, + must); + std::lock_guard l(sched_scrub_lock); + sched_scrub_pg.insert(scrub); + return scrub.sched_time; + } + void unreg_pg_scrub(spg_t pgid, utime_t t) { + std::lock_guard l(sched_scrub_lock); + size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t)); + ceph_assert(removed); + } + bool first_scrub_stamp(ScrubJob *out) { + std::lock_guard l(sched_scrub_lock); + if (sched_scrub_pg.empty()) + return false; + set<ScrubJob>::iterator iter = sched_scrub_pg.begin(); + *out = *iter; + return true; + } + bool next_scrub_stamp(const ScrubJob& next, + ScrubJob *out) { + std::lock_guard l(sched_scrub_lock); + if (sched_scrub_pg.empty()) + return false; + set<ScrubJob>::const_iterator iter = sched_scrub_pg.lower_bound(next); + if (iter == sched_scrub_pg.cend()) + return false; + ++iter; + if (iter == sched_scrub_pg.cend()) + return false; + *out = *iter; + return true; + } + + void dumps_scrub(Formatter *f) { + ceph_assert(f != nullptr); + std::lock_guard l(sched_scrub_lock); + + f->open_array_section("scrubs"); + for (const auto &i: sched_scrub_pg) { + f->open_object_section("scrub"); + f->dump_stream("pgid") << i.pgid; + f->dump_stream("sched_time") << i.sched_time; + f->dump_stream("deadline") << i.deadline; + f->dump_bool("forced", i.sched_time == PG::Scrubber::scrub_must_stamp()); + f->close_section(); + } + f->close_section(); + } + + bool can_inc_scrubs(); + bool inc_scrubs_local(); + void dec_scrubs_local(); + bool inc_scrubs_remote(); + void dec_scrubs_remote(); + void dump_scrub_reservations(Formatter *f); + + void reply_op_error(OpRequestRef op, int err); + void reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv); + void handle_misdirected_op(PG *pg, OpRequestRef op); + + +private: + // -- agent shared state -- + Mutex agent_lock; + Cond agent_cond; + map<uint64_t, set<PGRef> > agent_queue; + set<PGRef>::iterator agent_queue_pos; + bool agent_valid_iterator; + int agent_ops; + int flush_mode_high_count; //once have one pg with FLUSH_MODE_HIGH then flush objects with high speed + set<hobject_t> agent_oids; + bool agent_active; + struct AgentThread : public Thread { + OSDService *osd; + explicit AgentThread(OSDService *o) : osd(o) {} + void *entry() override { + osd->agent_entry(); + return NULL; + } + } agent_thread; + bool agent_stop_flag; + Mutex agent_timer_lock; + SafeTimer agent_timer; + +public: + void agent_entry(); + void agent_stop(); + + void _enqueue(PG *pg, uint64_t priority) { + if (!agent_queue.empty() && + agent_queue.rbegin()->first < priority) + agent_valid_iterator = false; // inserting higher-priority queue + set<PGRef>& nq = agent_queue[priority]; + if (nq.empty()) + agent_cond.Signal(); + nq.insert(pg); + } + + void _dequeue(PG *pg, uint64_t old_priority) { + set<PGRef>& 
oq = agent_queue[old_priority]; + set<PGRef>::iterator p = oq.find(pg); + ceph_assert(p != oq.end()); + if (p == agent_queue_pos) + ++agent_queue_pos; + oq.erase(p); + if (oq.empty()) { + if (agent_queue.rbegin()->first == old_priority) + agent_valid_iterator = false; + agent_queue.erase(old_priority); + } + } + + /// enable agent for a pg + void agent_enable_pg(PG *pg, uint64_t priority) { + std::lock_guard l(agent_lock); + _enqueue(pg, priority); + } + + /// adjust priority for an enagled pg + void agent_adjust_pg(PG *pg, uint64_t old_priority, uint64_t new_priority) { + std::lock_guard l(agent_lock); + ceph_assert(new_priority != old_priority); + _enqueue(pg, new_priority); + _dequeue(pg, old_priority); + } + + /// disable agent for a pg + void agent_disable_pg(PG *pg, uint64_t old_priority) { + std::lock_guard l(agent_lock); + _dequeue(pg, old_priority); + } + + /// note start of an async (evict) op + void agent_start_evict_op() { + std::lock_guard l(agent_lock); + ++agent_ops; + } + + /// note finish or cancellation of an async (evict) op + void agent_finish_evict_op() { + std::lock_guard l(agent_lock); + ceph_assert(agent_ops > 0); + --agent_ops; + agent_cond.Signal(); + } + + /// note start of an async (flush) op + void agent_start_op(const hobject_t& oid) { + std::lock_guard l(agent_lock); + ++agent_ops; + ceph_assert(agent_oids.count(oid) == 0); + agent_oids.insert(oid); + } + + /// note finish or cancellation of an async (flush) op + void agent_finish_op(const hobject_t& oid) { + std::lock_guard l(agent_lock); + ceph_assert(agent_ops > 0); + --agent_ops; + ceph_assert(agent_oids.count(oid) == 1); + agent_oids.erase(oid); + agent_cond.Signal(); + } + + /// check if we are operating on an object + bool agent_is_active_oid(const hobject_t& oid) { + std::lock_guard l(agent_lock); + return agent_oids.count(oid); + } + + /// get count of active agent ops + int agent_get_num_ops() { + std::lock_guard l(agent_lock); + return agent_ops; + } + + void agent_inc_high_count() { + std::lock_guard l(agent_lock); + flush_mode_high_count ++; + } + + void agent_dec_high_count() { + std::lock_guard l(agent_lock); + flush_mode_high_count --; + } + +private: + /// throttle promotion attempts + std::atomic<unsigned int> promote_probability_millis{1000}; ///< probability thousands. one word. + PromoteCounter promote_counter; + utime_t last_recalibrate; + unsigned long promote_max_objects, promote_max_bytes; + +public: + bool promote_throttle() { + // NOTE: lockless! we rely on the probability being a single word. 
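+    // promote_probability_millis is an atomic word and the max_objects /
+    // max_bytes limits are read unlocked; a racing recalibration can only
+    // make an individual promote/no-promote decision slightly stale.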
+ promote_counter.attempt(); + if ((unsigned)rand() % 1000 > promote_probability_millis) + return true; // yes throttle (no promote) + if (promote_max_objects && + promote_counter.objects > promote_max_objects) + return true; // yes throttle + if (promote_max_bytes && + promote_counter.bytes > promote_max_bytes) + return true; // yes throttle + return false; // no throttle (promote) + } + void promote_finish(uint64_t bytes) { + promote_counter.finish(bytes); + } + void promote_throttle_recalibrate(); + + // -- Objecter, for tiering reads/writes from/to other OSDs -- + Objecter *objecter; + int m_objecter_finishers; + vector<Finisher*> objecter_finishers; + + // -- Watch -- + Mutex watch_lock; + SafeTimer watch_timer; + uint64_t next_notif_id; + uint64_t get_next_id(epoch_t cur_epoch) { + std::lock_guard l(watch_lock); + return (((uint64_t)cur_epoch) << 32) | ((uint64_t)(next_notif_id++)); + } + + // -- Recovery/Backfill Request Scheduling -- + Mutex recovery_request_lock; + SafeTimer recovery_request_timer; + + // For async recovery sleep + bool recovery_needs_sleep = true; + utime_t recovery_schedule_time = utime_t(); + + // For recovery & scrub & snap + Mutex sleep_lock; + SafeTimer sleep_timer; + + // -- tids -- + // for ops i issue + std::atomic<unsigned int> last_tid{0}; + ceph_tid_t get_tid() { + return (ceph_tid_t)last_tid++; + } + + // -- backfill_reservation -- + Finisher reserver_finisher; + AsyncReserver<spg_t> local_reserver; + AsyncReserver<spg_t> remote_reserver; + + // -- pg merge -- + Mutex merge_lock = {"OSD::merge_lock"}; + map<pg_t,eversion_t> ready_to_merge_source; // pg -> version + map<pg_t,std::tuple<eversion_t,epoch_t,epoch_t>> ready_to_merge_target; // pg -> (version,les,lec) + set<pg_t> not_ready_to_merge_source; + map<pg_t,pg_t> not_ready_to_merge_target; + set<pg_t> sent_ready_to_merge_source; + + void set_ready_to_merge_source(PG *pg, + eversion_t version); + void set_ready_to_merge_target(PG *pg, + eversion_t version, + epoch_t last_epoch_started, + epoch_t last_epoch_clean); + void set_not_ready_to_merge_source(pg_t source); + void set_not_ready_to_merge_target(pg_t target, pg_t source); + void clear_ready_to_merge(PG *pg); + void send_ready_to_merge(); + void _send_ready_to_merge(); + void clear_sent_ready_to_merge(); + void prune_sent_ready_to_merge(const OSDMapRef& osdmap); + + // -- pg_temp -- +private: + Mutex pg_temp_lock; + struct pg_temp_t { + vector<int> acting; + bool forced = false; + }; + map<pg_t, pg_temp_t> pg_temp_wanted; + map<pg_t, pg_temp_t> pg_temp_pending; + void _sent_pg_temp(); + friend std::ostream& operator<<(std::ostream&, const pg_temp_t&); +public: + void queue_want_pg_temp(pg_t pgid, const vector<int>& want, + bool forced = false); + void remove_want_pg_temp(pg_t pgid); + void requeue_pg_temp(); + void send_pg_temp(); + + ceph::mutex pg_created_lock = ceph::make_mutex("OSDService::pg_created_lock"); + set<pg_t> pg_created; + void send_pg_created(pg_t pgid); + void prune_pg_created(); + void send_pg_created(); + + AsyncReserver<spg_t> snap_reserver; + void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c); + void queue_for_snap_trim(PG *pg); + void queue_for_scrub(PG *pg, bool with_high_priority); + void queue_for_pg_delete(spg_t pgid, epoch_t e); + bool try_finish_pg_delete(PG *pg, unsigned old_pg_num); + +private: + // -- pg recovery and associated throttling -- + Mutex recovery_lock; + list<pair<epoch_t, PGRef> > awaiting_throttle; + + utime_t defer_recovery_until; + uint64_t recovery_ops_active; + uint64_t 
recovery_ops_reserved; + bool recovery_paused; +#ifdef DEBUG_RECOVERY_OIDS + map<spg_t, set<hobject_t> > recovery_oids; +#endif + bool _recover_now(uint64_t *available_pushes); + void _maybe_queue_recovery(); + void _queue_for_recovery( + pair<epoch_t, PGRef> p, uint64_t reserved_pushes); +public: + void start_recovery_op(PG *pg, const hobject_t& soid); + void finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue); + bool is_recovery_active(); + void release_reserved_pushes(uint64_t pushes); + void defer_recovery(float defer_for) { + defer_recovery_until = ceph_clock_now(); + defer_recovery_until += defer_for; + } + void pause_recovery() { + std::lock_guard l(recovery_lock); + recovery_paused = true; + } + bool recovery_is_paused() { + std::lock_guard l(recovery_lock); + return recovery_paused; + } + void unpause_recovery() { + std::lock_guard l(recovery_lock); + recovery_paused = false; + _maybe_queue_recovery(); + } + void kick_recovery_queue() { + std::lock_guard l(recovery_lock); + _maybe_queue_recovery(); + } + void clear_queued_recovery(PG *pg) { + std::lock_guard l(recovery_lock); + awaiting_throttle.remove_if( + [pg](decltype(awaiting_throttle)::const_reference awaiting ) { + return awaiting.second.get() == pg; + }); + } + // delayed pg activation + void queue_for_recovery(PG *pg) { + std::lock_guard l(recovery_lock); + + if (pg->is_forced_recovery_or_backfill()) { + awaiting_throttle.push_front(make_pair(pg->get_osdmap()->get_epoch(), pg)); + } else { + awaiting_throttle.push_back(make_pair(pg->get_osdmap()->get_epoch(), pg)); + } + _maybe_queue_recovery(); + } + void queue_recovery_after_sleep(PG *pg, epoch_t queued, uint64_t reserved_pushes) { + std::lock_guard l(recovery_lock); + _queue_for_recovery(make_pair(queued, pg), reserved_pushes); + } + + // osd map cache (past osd maps) + Mutex map_cache_lock; + SharedLRU<epoch_t, const OSDMap> map_cache; + SimpleLRU<epoch_t, bufferlist> map_bl_cache; + SimpleLRU<epoch_t, bufferlist> map_bl_inc_cache; + + /// final pg_num values for recently deleted pools + map<int64_t,int> deleted_pool_pg_nums; + + OSDMapRef try_get_map(epoch_t e); + OSDMapRef get_map(epoch_t e) { + OSDMapRef ret(try_get_map(e)); + ceph_assert(ret); + return ret; + } + OSDMapRef add_map(OSDMap *o) { + std::lock_guard l(map_cache_lock); + return _add_map(o); + } + OSDMapRef _add_map(OSDMap *o); + + void add_map_bl(epoch_t e, bufferlist& bl) { + std::lock_guard l(map_cache_lock); + return _add_map_bl(e, bl); + } + void _add_map_bl(epoch_t e, bufferlist& bl); + bool get_map_bl(epoch_t e, bufferlist& bl) { + std::lock_guard l(map_cache_lock); + return _get_map_bl(e, bl); + } + bool _get_map_bl(epoch_t e, bufferlist& bl); + + void add_map_inc_bl(epoch_t e, bufferlist& bl) { + std::lock_guard l(map_cache_lock); + return _add_map_inc_bl(e, bl); + } + void _add_map_inc_bl(epoch_t e, bufferlist& bl); + bool get_inc_map_bl(epoch_t e, bufferlist& bl); + + /// get last pg_num before a pool was deleted (if any) + int get_deleted_pool_pg_num(int64_t pool); + + void store_deleted_pool_pg_num(int64_t pool, int pg_num) { + std::lock_guard l(map_cache_lock); + deleted_pool_pg_nums[pool] = pg_num; + } + + /// get pgnum from newmap or, if pool was deleted, last map pool existed in + int get_possibly_deleted_pool_pg_num(OSDMapRef newmap, + int64_t pool) { + if (newmap->have_pg_pool(pool)) { + return newmap->get_pg_num(pool); + } + return get_deleted_pool_pg_num(pool); + } + + /// identify split child pgids over a osdmap interval + void identify_splits_and_merges( + OSDMapRef 
old_map, + OSDMapRef new_map, + spg_t pgid, + set<pair<spg_t,epoch_t>> *new_children, + set<pair<spg_t,epoch_t>> *merge_pgs); + + void need_heartbeat_peer_update(); + + void init(); + void final_init(); + void start_shutdown(); + void shutdown_reserver(); + void shutdown(); + + // -- stats -- + Mutex stat_lock; + osd_stat_t osd_stat; + uint32_t seq = 0; + + void set_statfs(const struct store_statfs_t &stbuf, + osd_alert_list_t& alerts); + osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs); + void inc_osd_stat_repaired(void); + void set_osd_stat_repaired(int64_t); + float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0); + osd_stat_t get_osd_stat() { + std::lock_guard l(stat_lock); + ++seq; + osd_stat.up_from = up_epoch; + osd_stat.seq = ((uint64_t)osd_stat.up_from << 32) + seq; + return osd_stat; + } + uint64_t get_osd_stat_seq() { + std::lock_guard l(stat_lock); + return osd_stat.seq; + } + void get_hb_pingtime(map<int, osd_stat_t::Interfaces> *pp) + { + std::lock_guard l(stat_lock); + *pp = osd_stat.hb_pingtime; + return; + } + + // -- OSD Full Status -- +private: + friend TestOpsSocketHook; + mutable Mutex full_status_lock; + enum s_names { INVALID = -1, NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending + const char *get_full_state_name(s_names s) const { + switch (s) { + case NONE: return "none"; + case NEARFULL: return "nearfull"; + case BACKFILLFULL: return "backfillfull"; + case FULL: return "full"; + case FAILSAFE: return "failsafe"; + default: return "???"; + } + } + s_names get_full_state(string type) const { + if (type == "none") + return NONE; + else if (type == "failsafe") + return FAILSAFE; + else if (type == "full") + return FULL; + else if (type == "backfillfull") + return BACKFILLFULL; + else if (type == "nearfull") + return NEARFULL; + else + return INVALID; + } + double cur_ratio, physical_ratio; ///< current utilization + mutable int64_t injectfull = 0; + s_names injectfull_state = NONE; + float get_failsafe_full_ratio(); + bool _check_inject_full(DoutPrefixProvider *dpp, s_names type) const; + bool _check_full(DoutPrefixProvider *dpp, s_names type) const; +public: + void check_full_status(float ratio, float pratio); + s_names recalc_full_state(float ratio, float pratio, string &inject); + bool _tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t); + bool check_failsafe_full(DoutPrefixProvider *dpp) const; + bool check_full(DoutPrefixProvider *dpp) const; + bool tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t); + bool check_backfill_full(DoutPrefixProvider *dpp) const; + bool check_nearfull(DoutPrefixProvider *dpp) const; + bool is_failsafe_full() const; + bool is_full() const; + bool is_backfillfull() const; + bool is_nearfull() const; + bool need_fullness_update(); ///< osdmap state needs update + void set_injectfull(s_names type, int64_t count); + bool check_osdmap_full(const set<pg_shard_t> &missing_on); + + + // -- epochs -- +private: + mutable Mutex epoch_lock; // protects access to boot_epoch, up_epoch, bind_epoch + epoch_t boot_epoch; // _first_ epoch we were marked up (after this process started) + epoch_t up_epoch; // _most_recent_ epoch we were marked up + epoch_t bind_epoch; // epoch we last did a bind to new ip:ports +public: + /** + * Retrieve the boot_, up_, and bind_ epochs the OSD has set. The params + * can be NULL if you don't care about them. 
+ */ + void retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch, + epoch_t *_bind_epoch) const; + /** + * Set the boot, up, and bind epochs. Any NULL params will not be set. + */ + void set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch, + const epoch_t *_bind_epoch); + epoch_t get_boot_epoch() const { + epoch_t ret; + retrieve_epochs(&ret, NULL, NULL); + return ret; + } + epoch_t get_up_epoch() const { + epoch_t ret; + retrieve_epochs(NULL, &ret, NULL); + return ret; + } + epoch_t get_bind_epoch() const { + epoch_t ret; + retrieve_epochs(NULL, NULL, &ret); + return ret; + } + + void request_osdmap_update(epoch_t e); + + // -- stopping -- + Mutex is_stopping_lock; + Cond is_stopping_cond; + enum { + NOT_STOPPING, + PREPARING_TO_STOP, + STOPPING }; + std::atomic<int> state{NOT_STOPPING}; + int get_state() const { + return state; + } + void set_state(int s) { + state = s; + } + bool is_stopping() const { + return state == STOPPING; + } + bool is_preparing_to_stop() const { + return state == PREPARING_TO_STOP; + } + bool prepare_to_stop(); + void got_stop_ack(); + + +#ifdef PG_DEBUG_REFS + Mutex pgid_lock; + map<spg_t, int> pgid_tracker; + map<spg_t, PG*> live_pgs; + void add_pgid(spg_t pgid, PG *pg); + void remove_pgid(spg_t pgid, PG *pg); + void dump_live_pgids(); +#endif + + explicit OSDService(OSD *osd); + ~OSDService(); +}; + + +enum class io_queue { + prioritized, + weightedpriority, + mclock_opclass, + mclock_client, +}; + + +/* + + Each PG slot includes queues for events that are processing and/or waiting + for a PG to be materialized in the slot. + + These are the constraints: + + - client ops must remained ordered by client, regardless of map epoch + - peering messages/events from peers must remain ordered by peer + - peering messages and client ops need not be ordered relative to each other + + - some peering events can create a pg (e.g., notify) + - the query peering event can proceed when a PG doesn't exist + + Implementation notes: + + - everybody waits for split. If the OSD has the parent PG it will instantiate + the PGSlot early and mark it waiting_for_split. Everything will wait until + the parent is able to commit the split operation and the child PG's are + materialized in the child slots. + + - every event has an epoch property and will wait for the OSDShard to catch + up to that epoch. For example, if we get a peering event from a future + epoch, the event will wait in the slot until the local OSD has caught up. + (We should be judicious in specifying the required epoch [by, e.g., setting + it to the same_interval_since epoch] so that we don't wait for epochs that + don't affect the given PG.) + + - we maintain two separate wait lists, *waiting* and *waiting_peering*. The + OpQueueItem has an is_peering() bool to determine which we use. Waiting + peering events are queued up by epoch required. + + - when we wake a PG slot (e.g., we finished split, or got a newer osdmap, or + materialized the PG), we wake *all* waiting items. (This could be optimized, + probably, but we don't bother.) We always requeue peering items ahead of + client ops. + + - some peering events are marked !peering_requires_pg (PGQuery). if we do + not have a PG these are processed immediately (under the shard lock). + + - we do not have a PG present, we check if the slot maps to the current host. + if so, we either queue the item and wait for the PG to materialize, or + (if the event is a pg creating event like PGNotify), we materialize the PG. 
+ + - when we advance the osdmap on the OSDShard, we scan pg slots and + discard any slots with no pg (and not waiting_for_split) that no + longer map to the current host. + + */ + +struct OSDShardPGSlot { + PGRef pg; ///< pg reference + deque<OpQueueItem> to_process; ///< order items for this slot + int num_running = 0; ///< _process threads doing pg lookup/lock + + deque<OpQueueItem> waiting; ///< waiting for pg (or map + pg) + + /// waiting for map (peering evt) + map<epoch_t,deque<OpQueueItem>> waiting_peering; + + /// incremented by wake_pg_waiters; indicates racing _process threads + /// should bail out (their op has been requeued) + uint64_t requeue_seq = 0; + + /// waiting for split child to materialize in these epoch(s) + set<epoch_t> waiting_for_split; + + epoch_t epoch = 0; + boost::intrusive::set_member_hook<> pg_epoch_item; + + /// waiting for a merge (source or target) by this epoch + epoch_t waiting_for_merge_epoch = 0; +}; + +struct OSDShard { + const unsigned shard_id; + CephContext *cct; + OSD *osd; + + string shard_name; + + string sdata_wait_lock_name; + ceph::mutex sdata_wait_lock; + ceph::condition_variable sdata_cond; + + string osdmap_lock_name; + ceph::mutex osdmap_lock; ///< protect shard_osdmap updates vs users w/o shard_lock + OSDMapRef shard_osdmap; + + OSDMapRef get_osdmap() { + std::lock_guard l(osdmap_lock); + return shard_osdmap; + } + + string shard_lock_name; + ceph::mutex shard_lock; ///< protects remaining members below + + /// map of slots for each spg_t. maintains ordering of items dequeued + /// from pqueue while _process thread drops shard lock to acquire the + /// pg lock. stale slots are removed by consume_map. + unordered_map<spg_t,unique_ptr<OSDShardPGSlot>> pg_slots; + + struct pg_slot_compare_by_epoch { + bool operator()(const OSDShardPGSlot& l, const OSDShardPGSlot& r) const { + return l.epoch < r.epoch; + } + }; + + /// maintain an ordering of pg slots by pg epoch + boost::intrusive::multiset< + OSDShardPGSlot, + boost::intrusive::member_hook< + OSDShardPGSlot, + boost::intrusive::set_member_hook<>, + &OSDShardPGSlot::pg_epoch_item>, + boost::intrusive::compare<pg_slot_compare_by_epoch>> pg_slots_by_epoch; + int waiting_for_min_pg_epoch = 0; + ceph::condition_variable min_pg_epoch_cond; + + /// priority queue + std::unique_ptr<OpQueue<OpQueueItem, uint64_t>> pqueue; + + bool stop_waiting = false; + + ContextQueue context_queue; + + void _enqueue_front(OpQueueItem&& item, unsigned cutoff) { + unsigned priority = item.get_priority(); + unsigned cost = item.get_cost(); + if (priority >= cutoff) + pqueue->enqueue_strict_front( + item.get_owner(), + priority, std::move(item)); + else + pqueue->enqueue_front( + item.get_owner(), + priority, cost, std::move(item)); + } + + void _attach_pg(OSDShardPGSlot *slot, PG *pg); + void _detach_pg(OSDShardPGSlot *slot); + + void update_pg_epoch(OSDShardPGSlot *slot, epoch_t epoch); + epoch_t get_min_pg_epoch(); + void wait_min_pg_epoch(epoch_t need); + + /// return newest epoch we are waiting for + epoch_t get_max_waiting_epoch(); + + /// push osdmap into shard + void consume_map( + const OSDMapRef& osdmap, + unsigned *pushes_to_free); + + void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot); + + void identify_splits_and_merges( + const OSDMapRef& as_of_osdmap, + set<pair<spg_t,epoch_t>> *split_children, + set<pair<spg_t,epoch_t>> *merge_pgs); + void _prime_splits(set<pair<spg_t,epoch_t>> *pgids); + void prime_splits(const OSDMapRef& as_of_osdmap, + set<pair<spg_t,epoch_t>> *pgids); + void prime_merges(const 
OSDMapRef& as_of_osdmap, + set<pair<spg_t,epoch_t>> *merge_pgs); + void register_and_wake_split_child(PG *pg); + void unprime_split_children(spg_t parent, unsigned old_pg_num); + + OSDShard( + int id, + CephContext *cct, + OSD *osd, + uint64_t max_tok_per_prio, uint64_t min_cost, + io_queue opqueue) + : shard_id(id), + cct(cct), + osd(osd), + shard_name(string("OSDShard.") + stringify(id)), + sdata_wait_lock_name(shard_name + "::sdata_wait_lock"), + sdata_wait_lock{make_mutex(sdata_wait_lock_name)}, + osdmap_lock_name(shard_name + "::osdmap_lock"), + osdmap_lock{make_mutex(osdmap_lock_name)}, + shard_lock_name(shard_name + "::shard_lock"), + shard_lock{make_mutex(shard_lock_name)}, + context_queue(sdata_wait_lock, sdata_cond) { + if (opqueue == io_queue::weightedpriority) { + pqueue = std::make_unique< + WeightedPriorityQueue<OpQueueItem,uint64_t>>( + max_tok_per_prio, min_cost); + } else if (opqueue == io_queue::prioritized) { + pqueue = std::make_unique< + PrioritizedQueue<OpQueueItem,uint64_t>>( + max_tok_per_prio, min_cost); + } else if (opqueue == io_queue::mclock_opclass) { + pqueue = std::make_unique<ceph::mClockOpClassQueue>(cct); + } else if (opqueue == io_queue::mclock_client) { + pqueue = std::make_unique<ceph::mClockClientQueue>(cct); + } + } +}; + +class OSD : public Dispatcher, + public md_config_obs_t { + /** OSD **/ + Mutex osd_lock; // global lock + SafeTimer tick_timer; // safe timer (osd_lock) + + // Tick timer for those stuff that do not need osd_lock + Mutex tick_timer_lock; + SafeTimer tick_timer_without_osd_lock; + std::string gss_ktfile_client{}; + +public: + // config observer bits + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + void update_log_config(); + void check_config(); + +protected: + + const double OSD_TICK_INTERVAL = { 1.0 }; + double get_tick_interval() const; + + Messenger *cluster_messenger; + Messenger *client_messenger; + Messenger *objecter_messenger; + MonClient *monc; // check the "monc helpers" list before accessing directly + MgrClient mgrc; + PerfCounters *logger; + PerfCounters *recoverystate_perf; + ObjectStore *store; +#ifdef HAVE_LIBFUSE + FuseStore *fuse_store = nullptr; +#endif + LogClient log_client; + LogChannelRef clog; + + int whoami; + std::string dev_path, journal_path; + + int last_require_osd_release = 0; + + int numa_node = -1; + size_t numa_cpu_set_size = 0; + cpu_set_t numa_cpu_set; + + bool store_is_rotational = true; + bool journal_is_rotational = true; + + ZTracer::Endpoint trace_endpoint; + void create_logger(); + void create_recoverystate_perf(); + void tick(); + void tick_without_osd_lock(); + void _dispatch(Message *m); + void dispatch_op(OpRequestRef op); + + void check_osdmap_features(); + + // asok + friend class OSDSocketHook; + class OSDSocketHook *asok_hook; + bool asok_command(std::string_view admin_command, const cmdmap_t& cmdmap, + std::string_view format, std::ostream& ss); + +public: + ClassHandler *class_handler = nullptr; + int get_nodeid() { return whoami; } + + static ghobject_t get_osdmap_pobject_name(epoch_t epoch) { + char foo[20]; + snprintf(foo, sizeof(foo), "osdmap.%d", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(foo), 0))); + } + static ghobject_t get_inc_osdmap_pobject_name(epoch_t epoch) { + char foo[22]; + snprintf(foo, sizeof(foo), "inc_osdmap.%d", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(foo), 0))); + } + + static ghobject_t make_snapmapper_oid() { + return 
ghobject_t(hobject_t( + sobject_t( + object_t("snapmapper"), + 0))); + } + + static ghobject_t make_pg_log_oid(spg_t pg) { + stringstream ss; + ss << "pglog_" << pg; + string s; + getline(ss, s); + return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0))); + } + + static ghobject_t make_pg_biginfo_oid(spg_t pg) { + stringstream ss; + ss << "pginfo_" << pg; + string s; + getline(ss, s); + return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0))); + } + static ghobject_t make_infos_oid() { + hobject_t oid(sobject_t("infos", CEPH_NOSNAP)); + return ghobject_t(oid); + } + + static ghobject_t make_final_pool_info_oid(int64_t pool) { + return ghobject_t( + hobject_t( + sobject_t( + object_t(string("final_pool_") + stringify(pool)), + CEPH_NOSNAP))); + } + + static ghobject_t make_pg_num_history_oid() { + return ghobject_t(hobject_t(sobject_t("pg_num_history", CEPH_NOSNAP))); + } + + static void recursive_remove_collection(CephContext* cct, + ObjectStore *store, + spg_t pgid, + coll_t tmp); + + /** + * get_osd_initial_compat_set() + * + * Get the initial feature set for this OSD. Features + * here are automatically upgraded. + * + * Return value: Initial osd CompatSet + */ + static CompatSet get_osd_initial_compat_set(); + + /** + * get_osd_compat_set() + * + * Get all features supported by this OSD + * + * Return value: CompatSet of all supported features + */ + static CompatSet get_osd_compat_set(); + + +private: + class C_Tick; + class C_Tick_WithoutOSDLock; + + // -- config settings -- + float m_osd_pg_epoch_max_lag_factor; + + // -- superblock -- + OSDSuperblock superblock; + + void write_superblock(); + void write_superblock(ObjectStore::Transaction& t); + int read_superblock(); + + void clear_temp_objects(); + + CompatSet osd_compat; + + // -- state -- +public: + typedef enum { + STATE_INITIALIZING = 1, + STATE_PREBOOT, + STATE_BOOTING, + STATE_ACTIVE, + STATE_STOPPING, + STATE_WAITING_FOR_HEALTHY + } osd_state_t; + + static const char *get_state_name(int s) { + switch (s) { + case STATE_INITIALIZING: return "initializing"; + case STATE_PREBOOT: return "preboot"; + case STATE_BOOTING: return "booting"; + case STATE_ACTIVE: return "active"; + case STATE_STOPPING: return "stopping"; + case STATE_WAITING_FOR_HEALTHY: return "waiting_for_healthy"; + default: return "???"; + } + } + +private: + std::atomic<int> state{STATE_INITIALIZING}; + +public: + int get_state() const { + return state; + } + void set_state(int s) { + state = s; + } + bool is_initializing() const { + return state == STATE_INITIALIZING; + } + bool is_preboot() const { + return state == STATE_PREBOOT; + } + bool is_booting() const { + return state == STATE_BOOTING; + } + bool is_active() const { + return state == STATE_ACTIVE; + } + bool is_stopping() const { + return state == STATE_STOPPING; + } + bool is_waiting_for_healthy() const { + return state == STATE_WAITING_FOR_HEALTHY; + } + +private: + + ShardedThreadPool osd_op_tp; + ThreadPool command_tp; + + void get_latest_osdmap(); + + // -- sessions -- +private: + void dispatch_session_waiting(SessionRef session, OSDMapRef osdmap); + void maybe_share_map(Session *session, OpRequestRef op, OSDMapRef osdmap); + + Mutex session_waiting_lock; + set<SessionRef> session_waiting_for_map; + + /// Caller assumes refs for included Sessions + void get_sessions_waiting_for_map(set<SessionRef> *out) { + std::lock_guard l(session_waiting_lock); + out->swap(session_waiting_for_map); + } + void register_session_waiting_on_map(SessionRef session) { + std::lock_guard 
l(session_waiting_lock); + session_waiting_for_map.insert(session); + } + void clear_session_waiting_on_map(SessionRef session) { + std::lock_guard l(session_waiting_lock); + session_waiting_for_map.erase(session); + } + void dispatch_sessions_waiting_on_map() { + set<SessionRef> sessions_to_check; + get_sessions_waiting_for_map(&sessions_to_check); + for (auto i = sessions_to_check.begin(); + i != sessions_to_check.end(); + sessions_to_check.erase(i++)) { + std::lock_guard l{(*i)->session_dispatch_lock}; + dispatch_session_waiting(*i, get_osdmap()); + } + } + void session_handle_reset(SessionRef session) { + std::lock_guard l(session->session_dispatch_lock); + clear_session_waiting_on_map(session); + + session->clear_backoffs(); + + /* Messages have connection refs, we need to clear the + * connection->session->message->connection + * cycles which result. + * Bug #12338 + */ + session->waiting_on_map.clear_and_dispose(TrackedOp::Putter()); + } + +private: + /** + * @defgroup monc helpers + * @{ + * Right now we only have the one + */ + + /** + * Ask the Monitors for a sequence of OSDMaps. + * + * @param epoch The epoch to start with when replying + * @param force_request True if this request forces a new subscription to + * the monitors; false if an outstanding request that encompasses it is + * sufficient. + */ + void osdmap_subscribe(version_t epoch, bool force_request); + /** @} monc helpers */ + + Mutex osdmap_subscribe_lock; + epoch_t latest_subscribed_epoch{0}; + + // -- heartbeat -- + /// information about a heartbeat peer + struct HeartbeatInfo { + int peer; ///< peer + ConnectionRef con_front; ///< peer connection (front) + ConnectionRef con_back; ///< peer connection (back) + utime_t first_tx; ///< time we sent our first ping request + utime_t last_tx; ///< last time we sent a ping request + utime_t last_rx_front; ///< last time we got a ping reply on the front side + utime_t last_rx_back; ///< last time we got a ping reply on the back side + epoch_t epoch; ///< most recent epoch we wanted this peer + /// number of connections we send and receive heartbeat pings/replies + static constexpr int HEARTBEAT_MAX_CONN = 2; + /// history of inflight pings, arranging by timestamp we sent + /// send time -> deadline -> remaining replies + map<utime_t, pair<utime_t, int>> ping_history; + + utime_t hb_interval_start; + uint32_t hb_average_count = 0; + uint32_t hb_index = 0; + + uint32_t hb_total_back = 0; + uint32_t hb_min_back = UINT_MAX; + uint32_t hb_max_back = 0; + vector<uint32_t> hb_back_pingtime; + vector<uint32_t> hb_back_min; + vector<uint32_t> hb_back_max; + + uint32_t hb_total_front = 0; + uint32_t hb_min_front = UINT_MAX; + uint32_t hb_max_front = 0; + vector<uint32_t> hb_front_pingtime; + vector<uint32_t> hb_front_min; + vector<uint32_t> hb_front_max; + + bool is_stale(utime_t stale) { + if (ping_history.empty()) { + return false; + } + utime_t oldest_deadline = ping_history.begin()->second.first; + return oldest_deadline <= stale; + } + + bool is_unhealthy(utime_t now) { + if (ping_history.empty()) { + /// we haven't sent a ping yet or we have got all replies, + /// in either way we are safe and healthy for now + return false; + } + + utime_t oldest_deadline = ping_history.begin()->second.first; + return now > oldest_deadline; + } + + bool is_healthy(utime_t now) { + if (last_rx_front == utime_t() || last_rx_back == utime_t()) { + // only declare to be healthy until we have received the first + // replies from both front/back connections + return false; + } + return 
!is_unhealthy(now); + } + }; + /// state attached to outgoing heartbeat connections + struct HeartbeatSession : public RefCountedObject { + int peer; + explicit HeartbeatSession(int p) : peer(p) {} + }; + Mutex heartbeat_lock; + map<int, int> debug_heartbeat_drops_remaining; + Cond heartbeat_cond; + bool heartbeat_stop; + std::atomic<bool> heartbeat_need_update; + map<int,HeartbeatInfo> heartbeat_peers; ///< map of osd id to HeartbeatInfo + utime_t last_mon_heartbeat; + Messenger *hb_front_client_messenger; + Messenger *hb_back_client_messenger; + Messenger *hb_front_server_messenger; + Messenger *hb_back_server_messenger; + utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state + double daily_loadavg; + + // Track ping repsonse times using vector as a circular buffer + // MUST BE A POWER OF 2 + const uint32_t hb_vector_size = 16; + + void _add_heartbeat_peer(int p); + void _remove_heartbeat_peer(int p); + bool heartbeat_reset(Connection *con); + void maybe_update_heartbeat_peers(); + void reset_heartbeat_peers(bool all); + bool heartbeat_peers_need_update() { + return heartbeat_need_update.load(); + } + void heartbeat_set_peers_need_update() { + heartbeat_need_update.store(true); + } + void heartbeat_clear_peers_need_update() { + heartbeat_need_update.store(false); + } + void heartbeat(); + void heartbeat_check(); + void heartbeat_entry(); + void need_heartbeat_peer_update(); + + void heartbeat_kick() { + std::lock_guard l(heartbeat_lock); + heartbeat_cond.Signal(); + } + + struct T_Heartbeat : public Thread { + OSD *osd; + explicit T_Heartbeat(OSD *o) : osd(o) {} + void *entry() override { + osd->heartbeat_entry(); + return 0; + } + } heartbeat_thread; + +public: + bool heartbeat_dispatch(Message *m); + + struct HeartbeatDispatcher : public Dispatcher { + OSD *osd; + explicit HeartbeatDispatcher(OSD *o) : Dispatcher(o->cct), osd(o) {} + + bool ms_can_fast_dispatch_any() const override { return true; } + bool ms_can_fast_dispatch(const Message *m) const override { + switch (m->get_type()) { + case CEPH_MSG_PING: + case MSG_OSD_PING: + return true; + default: + return false; + } + } + void ms_fast_dispatch(Message *m) override { + osd->heartbeat_dispatch(m); + } + bool ms_dispatch(Message *m) override { + return osd->heartbeat_dispatch(m); + } + bool ms_handle_reset(Connection *con) override { + return osd->heartbeat_reset(con); + } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override { + return osd->ms_handle_refused(con); + } + int ms_handle_authentication(Connection *con) override { + return true; + } + bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override { + // some pre-nautilus OSDs get confused if you include an + // authorizer but they are not expecting it. do not try to authorize + // heartbeat connections until all OSDs are nautilus. 
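
Looking back at the HeartbeatInfo bookkeeping above: ping_history maps each send time to a deadline plus the number of replies still outstanding, so is_unhealthy() only needs to examine the oldest entry. A minimal sketch of that idea, using hypothetical simplified types (FakeTime, FakePingHistory) in place of utime_t and the real struct:

    #include <map>
    #include <utility>

    // Hypothetical stand-ins; the real HeartbeatInfo keys ping_history by
    // send time and stores (deadline, replies still outstanding).
    using FakeTime = double;

    struct FakePingHistory {
      // send time -> (deadline, outstanding replies), as documented above
      std::map<FakeTime, std::pair<FakeTime, int>> ping_history;

      void note_ping_sent(FakeTime sent, FakeTime deadline, int expected_replies) {
        ping_history[sent] = {deadline, expected_replies};
      }

      void note_reply(FakeTime sent) {
        auto it = ping_history.find(sent);
        if (it == ping_history.end())
          return;
        if (--it->second.second <= 0)
          ping_history.erase(it);   // all replies in: nothing outstanding
      }

      // unhealthy once the oldest outstanding ping has blown its deadline,
      // mirroring HeartbeatInfo::is_unhealthy() above
      bool is_unhealthy(FakeTime now) const {
        if (ping_history.empty())
          return false;             // nothing outstanding, healthy for now
        return now > ping_history.begin()->second.first;
      }
    };
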
+ if (osd->get_osdmap()->require_osd_release >= CEPH_RELEASE_NAUTILUS) { + return osd->ms_get_authorizer(dest_type, authorizer); + } + return false; + } + KeyStore *ms_get_auth1_authorizer_keystore() override { + return osd->ms_get_auth1_authorizer_keystore(); + } + } heartbeat_dispatcher; + +private: + // -- waiters -- + list<OpRequestRef> finished; + + void take_waiters(list<OpRequestRef>& ls) { + ceph_assert(osd_lock.is_locked()); + finished.splice(finished.end(), ls); + } + void do_waiters(); + + // -- op tracking -- + OpTracker op_tracker; + void test_ops(std::string command, std::string args, ostream& ss); + friend class TestOpsSocketHook; + TestOpsSocketHook *test_ops_hook; + friend struct C_FinishSplits; + friend struct C_OpenPGs; + + // -- op queue -- + friend std::ostream& operator<<(std::ostream& out, const io_queue& q); + + const io_queue op_queue; +public: + const unsigned int op_prio_cutoff; +protected: + + /* + * The ordered op delivery chain is: + * + * fast dispatch -> pqueue back + * pqueue front <-> to_process back + * to_process front -> RunVis(item) + * <- queue_front() + * + * The pqueue is per-shard, and to_process is per pg_slot. Items can be + * pushed back up into to_process and/or pqueue while order is preserved. + * + * Multiple worker threads can operate on each shard. + * + * Under normal circumstances, num_running == to_process.size(). There are + * two times when that is not true: (1) when waiting_for_pg == true and + * to_process is accumulating requests that are waiting for the pg to be + * instantiated; in that case they will all get requeued together by + * wake_pg_waiters, and (2) when wake_pg_waiters just ran, waiting_for_pg + * and already requeued the items. + */ + friend class PGOpItem; + friend class PGPeeringItem; + friend class PGRecovery; + friend class PGDelete; + + class ShardedOpWQ + : public ShardedThreadPool::ShardedWQ<OpQueueItem> + { + OSD *osd; + + public: + ShardedOpWQ(OSD *o, + time_t ti, + time_t si, + ShardedThreadPool* tp) + : ShardedThreadPool::ShardedWQ<OpQueueItem>(ti, si, tp), + osd(o) { + } + + void _add_slot_waiter( + spg_t token, + OSDShardPGSlot *slot, + OpQueueItem&& qi); + + /// try to do some work + void _process(uint32_t thread_index, heartbeat_handle_d *hb) override; + + /// enqueue a new item + void _enqueue(OpQueueItem&& item) override; + + /// requeue an old item (at the front of the line) + void _enqueue_front(OpQueueItem&& item) override; + + void return_waiting_threads() override { + for(uint32_t i = 0; i < osd->num_shards; i++) { + OSDShard* sdata = osd->shards[i]; + assert (NULL != sdata); + std::scoped_lock l{sdata->sdata_wait_lock}; + sdata->stop_waiting = true; + sdata->sdata_cond.notify_all(); + } + } + + void stop_return_waiting_threads() override { + for(uint32_t i = 0; i < osd->num_shards; i++) { + OSDShard* sdata = osd->shards[i]; + assert (NULL != sdata); + std::scoped_lock l{sdata->sdata_wait_lock}; + sdata->stop_waiting = false; + } + } + + void dump(Formatter *f) { + for(uint32_t i = 0; i < osd->num_shards; i++) { + auto &&sdata = osd->shards[i]; + + char queue_name[32] = {0}; + snprintf(queue_name, sizeof(queue_name), "%s%" PRIu32, "OSD:ShardedOpWQ:", i); + ceph_assert(NULL != sdata); + + std::scoped_lock l{sdata->shard_lock}; + f->open_object_section(queue_name); + sdata->pqueue->dump(f); + f->close_section(); + } + } + + bool is_shard_empty(uint32_t thread_index) override { + uint32_t shard_index = thread_index % osd->num_shards; + auto &&sdata = osd->shards[shard_index]; + ceph_assert(sdata); + 
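
The "ordered op delivery chain" comment earlier in this class is the key invariant for ShardedOpWQ: an item for a given PG always lands on the same shard, and within a shard the pqueue and the per-slot to_process list preserve order. A rough, self-contained sketch of that routing, using hypothetical simplified types (FakeItem, FakeShard, FakeShardedWQ) rather than the real OpQueueItem/OSDShard classes:

    #include <cstdint>
    #include <deque>
    #include <mutex>
    #include <vector>

    // Hypothetical stand-ins, only to illustrate the routing described above.
    struct FakeItem { uint64_t pg_hash; };

    struct FakeShard {
      std::mutex lock;                  // plays the role of shard_lock
      std::deque<FakeItem> pqueue;      // per-shard queue (pqueue above)
      std::deque<FakeItem> to_process;  // items claimed by a worker, in order
    };

    class FakeShardedWQ {
      std::vector<FakeShard> shards;
    public:
      explicit FakeShardedWQ(unsigned n) : shards(n) {}

      void enqueue(const FakeItem& item) {
        // a given PG always hashes to the same shard, so per-PG order is kept
        FakeShard& s = shards[item.pg_hash % shards.size()];
        std::lock_guard<std::mutex> l(s.lock);
        s.pqueue.push_back(item);
      }

      bool process_one(unsigned shard_index) {
        FakeShard& s = shards[shard_index % shards.size()];
        std::lock_guard<std::mutex> l(s.lock);
        if (s.pqueue.empty())
          return false;
        // pqueue front -> to_process back, then run from to_process front,
        // mirroring the delivery chain in the comment above
        s.to_process.push_back(s.pqueue.front());
        s.pqueue.pop_front();
        s.to_process.pop_front();
        return true;
      }
    };
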
std::lock_guard l(sdata->shard_lock); + if (thread_index < osd->num_shards) { + return sdata->pqueue->empty() && sdata->context_queue.empty(); + } else { + return sdata->pqueue->empty(); + } + } + + void handle_oncommits(list<Context*>& oncommits) { + for (auto p : oncommits) { + p->complete(0); + } + } + } op_shardedwq; + + + void enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch); + void dequeue_op( + PGRef pg, OpRequestRef op, + ThreadPool::TPHandle &handle); + + void enqueue_peering_evt( + spg_t pgid, + PGPeeringEventRef ref); + void enqueue_peering_evt_front( + spg_t pgid, + PGPeeringEventRef ref); + void dequeue_peering_evt( + OSDShard *sdata, + PG *pg, + PGPeeringEventRef ref, + ThreadPool::TPHandle& handle); + + void dequeue_delete( + OSDShard *sdata, + PG *pg, + epoch_t epoch, + ThreadPool::TPHandle& handle); + + friend class PG; + friend class OSDShard; + friend class PrimaryLogPG; + + + protected: + + // -- osd map -- + // TODO: switch to std::atomic<OSDMapRef> when C++20 will be available. + OSDMapRef _osdmap; + void set_osdmap(OSDMapRef osdmap) { + std::atomic_store(&_osdmap, osdmap); + } + OSDMapRef get_osdmap() const { + return std::atomic_load(&_osdmap); + } + epoch_t get_osdmap_epoch() const { + // XXX: performance? + auto osdmap = get_osdmap(); + return osdmap ? osdmap->get_epoch() : 0; + } + + pool_pg_num_history_t pg_num_history; + + utime_t had_map_since; + RWLock map_lock; + list<OpRequestRef> waiting_for_osdmap; + deque<utime_t> osd_markdown_log; + + friend struct send_map_on_destruct; + + void wait_for_new_map(OpRequestRef op); + void handle_osd_map(class MOSDMap *m); + void _committed_osd_maps(epoch_t first, epoch_t last, class MOSDMap *m); + void trim_maps(epoch_t oldest, int nreceived, bool skip_maps); + void note_down_osd(int osd); + void note_up_osd(int osd); + friend class C_OnMapCommit; + + bool advance_pg( + epoch_t advance_to, + PG *pg, + ThreadPool::TPHandle &handle, + PG::RecoveryCtx *rctx); + void consume_map(); + void activate_map(); + + // osd map cache (past osd maps) + OSDMapRef get_map(epoch_t e) { + return service.get_map(e); + } + OSDMapRef add_map(OSDMap *o) { + return service.add_map(o); + } + void add_map_bl(epoch_t e, bufferlist& bl) { + return service.add_map_bl(e, bl); + } + bool get_map_bl(epoch_t e, bufferlist& bl) { + return service.get_map_bl(e, bl); + } + void add_map_inc_bl(epoch_t e, bufferlist& bl) { + return service.add_map_inc_bl(e, bl); + } + +public: + // -- shards -- + vector<OSDShard*> shards; + uint32_t num_shards = 0; + + void inc_num_pgs() { + ++num_pgs; + } + void dec_num_pgs() { + --num_pgs; + } + int get_num_pgs() const { + return num_pgs; + } + +protected: + Mutex merge_lock = {"OSD::merge_lock"}; + /// merge epoch -> target pgid -> source pgid -> pg + map<epoch_t,map<spg_t,map<spg_t,PGRef>>> merge_waiters; + + bool add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef source, + unsigned need); + + // -- placement groups -- + std::atomic<size_t> num_pgs = {0}; + + std::mutex pending_creates_lock; + using create_from_osd_t = std::pair<pg_t, bool /* is primary*/>; + std::set<create_from_osd_t> pending_creates_from_osd; + unsigned pending_creates_from_mon = 0; + + PGRecoveryStats pg_recovery_stats; + + PGRef _lookup_pg(spg_t pgid); + PGRef _lookup_lock_pg(spg_t pgid); + void register_pg(PGRef pg); + bool try_finish_pg_delete(PG *pg, unsigned old_pg_num); + + void _get_pgs(vector<PGRef> *v, bool clear_too=false); + void _get_pgids(vector<spg_t> *v); + +public: + PGRef lookup_lock_pg(spg_t pgid); + + std::set<int64_t> 
get_mapped_pools(); + +protected: + PG* _make_pg(OSDMapRef createmap, spg_t pgid); + + bool maybe_wait_for_max_pg(const OSDMapRef& osdmap, + spg_t pgid, bool is_mon_create); + void resume_creating_pg(); + + void load_pgs(); + + /// build initial pg history and intervals on create + void build_initial_pg_history( + spg_t pgid, + epoch_t created, + utime_t created_stamp, + pg_history_t *h, + PastIntervals *pi); + + epoch_t last_pg_create_epoch; + + void handle_pg_create(OpRequestRef op); + + void split_pgs( + PG *parent, + const set<spg_t> &childpgids, set<PGRef> *out_pgs, + OSDMapRef curmap, + OSDMapRef nextmap, + PG::RecoveryCtx *rctx); + void _finish_splits(set<PGRef>& pgs); + + // == monitor interaction == + Mutex mon_report_lock; + utime_t last_mon_report; + Finisher boot_finisher; + + // -- boot -- + void start_boot(); + void _got_mon_epochs(epoch_t oldest, epoch_t newest); + void _preboot(epoch_t oldest, epoch_t newest); + void _send_boot(); + void _collect_metadata(map<string,string> *pmeta); + + void start_waiting_for_healthy(); + bool _is_healthy(); + + void send_full_update(); + + friend struct C_OSD_GetVersion; + + // -- alive -- + epoch_t up_thru_wanted; + + void queue_want_up_thru(epoch_t want); + void send_alive(); + + // -- full map requests -- + epoch_t requested_full_first, requested_full_last; + + void request_full_map(epoch_t first, epoch_t last); + void rerequest_full_maps() { + epoch_t first = requested_full_first; + epoch_t last = requested_full_last; + requested_full_first = 0; + requested_full_last = 0; + request_full_map(first, last); + } + void got_full_map(epoch_t e); + + // -- failures -- + map<int,utime_t> failure_queue; + map<int,pair<utime_t,entity_addrvec_t> > failure_pending; + + void requeue_failures(); + void send_failures(); + void send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs); + void cancel_pending_failures(); + + ceph::coarse_mono_clock::time_point last_sent_beacon; + Mutex min_last_epoch_clean_lock{"OSD::min_last_epoch_clean_lock"}; + epoch_t min_last_epoch_clean = 0; + // which pgs were scanned for min_lec + std::vector<pg_t> min_last_epoch_clean_pgs; + void send_beacon(const ceph::coarse_mono_clock::time_point& now); + + ceph_tid_t get_tid() { + return service.get_tid(); + } + + // -- generic pg peering -- + PG::RecoveryCtx create_context(); + void dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap, + ThreadPool::TPHandle *handle = NULL); + void dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg, + ThreadPool::TPHandle *handle = NULL); + void discard_context(PG::RecoveryCtx &ctx); + void do_notifies(map<int, + vector<pair<pg_notify_t, PastIntervals> > >& + notify_list, + OSDMapRef map); + void do_queries(map<int, map<spg_t,pg_query_t> >& query_map, + OSDMapRef map); + void do_infos(map<int, + vector<pair<pg_notify_t, PastIntervals> > >& info_map, + OSDMapRef map); + + bool require_mon_peer(const Message *m); + bool require_mon_or_mgr_peer(const Message *m); + bool require_osd_peer(const Message *m); + /*** + * Verifies that we were alive in the given epoch, and that + * still are. + */ + bool require_self_aliveness(const Message *m, epoch_t alive_since); + /** + * Verifies that the OSD who sent the given op has the same + * address as in the given map. 
+ * @pre op was sent by an OSD using the cluster messenger + */ + bool require_same_peer_instance(const Message *m, const OSDMapRef& map, + bool is_fast_dispatch); + + bool require_same_or_newer_map(OpRequestRef& op, epoch_t e, + bool is_fast_dispatch); + + void handle_fast_pg_create(MOSDPGCreate2 *m); + void handle_fast_pg_query(MOSDPGQuery *m); + void handle_pg_query_nopg(const MQuery& q); + void handle_fast_pg_notify(MOSDPGNotify *m); + void handle_pg_notify_nopg(const MNotifyRec& q); + void handle_fast_pg_info(MOSDPGInfo *m); + void handle_fast_pg_remove(MOSDPGRemove *m); + +public: + // used by OSDShard + PGRef handle_pg_create_info(const OSDMapRef& osdmap, const PGCreateInfo *info); +protected: + + void handle_fast_force_recovery(MOSDForceRecovery *m); + + // -- commands -- + struct Command { + vector<string> cmd; + ceph_tid_t tid; + bufferlist indata; + ConnectionRef con; + + Command(vector<string>& c, ceph_tid_t t, bufferlist& bl, Connection *co) + : cmd(c), tid(t), indata(bl), con(co) {} + }; + list<Command*> command_queue; + struct CommandWQ : public ThreadPool::WorkQueue<Command> { + OSD *osd; + CommandWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp) + : ThreadPool::WorkQueue<Command>("OSD::CommandWQ", ti, si, tp), osd(o) {} + + bool _empty() override { + return osd->command_queue.empty(); + } + bool _enqueue(Command *c) override { + osd->command_queue.push_back(c); + return true; + } + void _dequeue(Command *pg) override { + ceph_abort(); + } + Command *_dequeue() override { + if (osd->command_queue.empty()) + return NULL; + Command *c = osd->command_queue.front(); + osd->command_queue.pop_front(); + return c; + } + void _process(Command *c, ThreadPool::TPHandle &) override { + osd->osd_lock.lock(); + if (osd->is_stopping()) { + osd->osd_lock.unlock(); + delete c; + return; + } + osd->do_command(c->con.get(), c->tid, c->cmd, c->indata); + osd->osd_lock.unlock(); + delete c; + } + void _clear() override { + while (!osd->command_queue.empty()) { + Command *c = osd->command_queue.front(); + osd->command_queue.pop_front(); + delete c; + } + } + } command_wq; + + void handle_command(class MMonCommand *m); + void handle_command(class MCommand *m); + void do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data); + int _do_command( + Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data, + bufferlist& odata, stringstream& ss, stringstream& ds); + + + // -- pg recovery -- + void do_recovery(PG *pg, epoch_t epoch_queued, uint64_t pushes_reserved, + ThreadPool::TPHandle &handle); + + + // -- scrubbing -- + void sched_scrub(); + void resched_all_scrubs(); + bool scrub_random_backoff(); + bool scrub_load_below_threshold(); + bool scrub_time_permit(utime_t now); + + // -- status reporting -- + MPGStats *collect_pg_stats(); + std::vector<DaemonHealthMetric> get_health_metrics(); + + +private: + bool ms_can_fast_dispatch_any() const override { return true; } + bool ms_can_fast_dispatch(const Message *m) const override { + switch (m->get_type()) { + case CEPH_MSG_PING: + case CEPH_MSG_OSD_OP: + case CEPH_MSG_OSD_BACKOFF: + case MSG_OSD_SCRUB2: + case MSG_OSD_FORCE_RECOVERY: + case MSG_MON_COMMAND: + case MSG_OSD_PG_CREATE2: + case MSG_OSD_PG_QUERY: + case MSG_OSD_PG_INFO: + case MSG_OSD_PG_NOTIFY: + case MSG_OSD_PG_LOG: + case MSG_OSD_PG_TRIM: + case MSG_OSD_PG_REMOVE: + case MSG_OSD_BACKFILL_RESERVE: + case MSG_OSD_RECOVERY_RESERVE: + case MSG_OSD_REPOP: + case MSG_OSD_REPOPREPLY: + case MSG_OSD_PG_PUSH: + case MSG_OSD_PG_PULL: + case 
MSG_OSD_PG_PUSH_REPLY: + case MSG_OSD_PG_SCAN: + case MSG_OSD_PG_BACKFILL: + case MSG_OSD_PG_BACKFILL_REMOVE: + case MSG_OSD_EC_WRITE: + case MSG_OSD_EC_WRITE_REPLY: + case MSG_OSD_EC_READ: + case MSG_OSD_EC_READ_REPLY: + case MSG_OSD_SCRUB_RESERVE: + case MSG_OSD_REP_SCRUB: + case MSG_OSD_REP_SCRUBMAP: + case MSG_OSD_PG_UPDATE_LOG_MISSING: + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + case MSG_OSD_PG_RECOVERY_DELETE: + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + return true; + default: + return false; + } + } + void ms_fast_dispatch(Message *m) override; + bool ms_dispatch(Message *m) override; + bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override; + void ms_handle_connect(Connection *con) override; + void ms_handle_fast_connect(Connection *con) override; + void ms_handle_fast_accept(Connection *con) override; + int ms_handle_authentication(Connection *con) override; + KeyStore *ms_get_auth1_authorizer_keystore() override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + io_queue get_io_queue() const { + if (cct->_conf->osd_op_queue == "debug_random") { + static io_queue index_lookup[] = { io_queue::prioritized, + io_queue::weightedpriority, + io_queue::mclock_opclass, + io_queue::mclock_client }; + srand(time(NULL)); + unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); + return index_lookup[which]; + } else if (cct->_conf->osd_op_queue == "prioritized") { + return io_queue::prioritized; + } else if (cct->_conf->osd_op_queue == "mclock_opclass") { + return io_queue::mclock_opclass; + } else if (cct->_conf->osd_op_queue == "mclock_client") { + return io_queue::mclock_client; + } else { + // default / catch-all is 'wpq' + return io_queue::weightedpriority; + } + } + + unsigned int get_io_prio_cut() const { + if (cct->_conf->osd_op_queue_cut_off == "debug_random") { + srand(time(NULL)); + return (rand() % 2 < 1) ? 
CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW; + } else if (cct->_conf->osd_op_queue_cut_off == "high") { + return CEPH_MSG_PRIO_HIGH; + } else { + // default / catch-all is 'low' + return CEPH_MSG_PRIO_LOW; + } + } + + public: + /* internal and external can point to the same messenger, they will still + * be cleaned up properly*/ + OSD(CephContext *cct_, + ObjectStore *store_, + int id, + Messenger *internal, + Messenger *external, + Messenger *hb_front_client, + Messenger *hb_back_client, + Messenger *hb_front_server, + Messenger *hb_back_server, + Messenger *osdc_messenger, + MonClient *mc, const std::string &dev, const std::string &jdev); + ~OSD() override; + + // static bits + static int mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami); + + /* remove any non-user xattrs from a map of them */ + void filter_xattrs(map<string, bufferptr>& attrs) { + for (map<string, bufferptr>::iterator iter = attrs.begin(); + iter != attrs.end(); + ) { + if (('_' != iter->first.at(0)) || (iter->first.size() == 1)) + attrs.erase(iter++); + else ++iter; + } + } + +private: + int mon_cmd_maybe_osd_create(string &cmd); + int update_crush_device_class(); + int update_crush_location(); + + static int write_meta(CephContext *cct, + ObjectStore *store, + uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami); + + void handle_scrub(struct MOSDScrub *m); + void handle_fast_scrub(struct MOSDScrub2 *m); + void handle_osd_ping(class MOSDPing *m); + + int init_op_flags(OpRequestRef& op); + + int get_num_op_shards(); + int get_num_op_threads(); + + float get_osd_recovery_sleep(); + float get_osd_delete_sleep(); + float get_osd_snap_trim_sleep(); + + void probe_smart(const string& devid, ostream& ss); + +public: + static int peek_meta(ObjectStore *store, + string *magic, + uuid_d *cluster_fsid, + uuid_d *osd_fsid, + int *whoami, + int *min_osd_release); + + + // startup/shutdown + int pre_init(); + int init(); + void final_init(); + + int enable_disable_fuse(bool stop); + int set_numa_affinity(); + + void suicide(int exitcode); + int shutdown(); + + void handle_signal(int signum); + + /// check if we can throw out op from a disconnected client + static bool op_is_discardable(const MOSDOp *m); + +public: + OSDService service; + friend class OSDService; + +private: + void set_perf_queries( + const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries); + void get_perf_reports( + std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports); + + Mutex m_perf_queries_lock = {"OSD::m_perf_queries_lock"}; + std::list<OSDPerfMetricQuery> m_perf_queries; + std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> m_perf_limits; +}; + + +std::ostream& operator<<(std::ostream& out, const io_queue& q); + + +//compatibility of the executable +extern const CompatSet::Feature ceph_osd_feature_compat[]; +extern const CompatSet::Feature ceph_osd_feature_ro_compat[]; +extern const CompatSet::Feature ceph_osd_feature_incompat[]; + +#endif // CEPH_OSD_H diff --git a/src/osd/OSDCap.cc b/src/osd/OSDCap.cc new file mode 100644 index 00000000..bd8d0b89 --- /dev/null +++ b/src/osd/OSDCap.cc @@ -0,0 +1,531 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2009-2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <boost/config/warning_disable.hpp> +#include <boost/spirit/include/qi.hpp> +#include <boost/spirit/include/phoenix_operator.hpp> +#include <boost/spirit/include/phoenix.hpp> +#include <boost/algorithm/string/predicate.hpp> + +#include "OSDCap.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/ipaddr.h" + +using std::ostream; +using std::vector; + +ostream& operator<<(ostream& out, const osd_rwxa_t& p) +{ + if (p == OSD_CAP_ANY) + return out << "*"; + + if (p & OSD_CAP_R) + out << "r"; + if (p & OSD_CAP_W) + out << "w"; + if ((p & OSD_CAP_X) == OSD_CAP_X) { + out << "x"; + } else { + if (p & OSD_CAP_CLS_R) + out << " class-read"; + if (p & OSD_CAP_CLS_W) + out << " class-write"; + } + return out; +} + +ostream& operator<<(ostream& out, const OSDCapSpec& s) +{ + if (s.allow) + return out << s.allow; + if (s.class_name.length()) { + out << "class '" << s.class_name << "'"; + if (!s.method_name.empty()) { + out << " '" << s.method_name << "'"; + } + } + return out; +} + +ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns) +{ + if (!pns.pool_name.empty()) { + out << "pool " << pns.pool_name << " "; + } + if (pns.nspace) { + out << "namespace "; + if (pns.nspace->empty()) { + out << "\"\""; + } else { + out << *pns.nspace; + } + out << " "; + } + return out; +} + +ostream& operator<<(ostream &out, const OSDCapPoolTag &pt) +{ + out << "app " << pt.application << " key " << pt.key << " val " << pt.value + << " "; + return out; +} + +ostream& operator<<(ostream& out, const OSDCapMatch& m) +{ + if (!m.pool_namespace.pool_name.empty() || m.pool_namespace.nspace) { + out << m.pool_namespace; + } + + if (!m.pool_tag.application.empty()) { + out << m.pool_tag; + } + + if (m.object_prefix.length()) { + out << "object_prefix " << m.object_prefix << " "; + } + return out; +} + +ostream& operator<<(ostream& out, const OSDCapProfile& m) +{ + out << "profile " << m.name; + out << m.pool_namespace; + return out; +} + +bool OSDCapPoolNamespace::is_match(const std::string& pn, + const std::string& ns) const +{ + if (!pool_name.empty()) { + if (pool_name != pn) { + return false; + } + } + if (nspace) { + if (!nspace->empty() && nspace->back() == '*' && + boost::starts_with(ns, nspace->substr(0, nspace->length() - 1))) { + return true; + } + + if (*nspace != ns) { + return false; + } + } + return true; +} + +bool OSDCapPoolNamespace::is_match_all() const +{ + if (!pool_name.empty()) + return false; + if (nspace) + return false; + return true; +} + +bool OSDCapPoolTag::is_match(const app_map_t& app_map) const +{ + if (application.empty()) { + return true; + } + auto kv_map = app_map.find(application); + if (kv_map == app_map.end()) { + return false; + } + if (!key.compare("*") && !value.compare("*")) { + return true; + } + if (!key.compare("*")) { + for (auto it : kv_map->second) { + if (it.second == value) { + return true; + } + } + return false; + } + auto kv_val = kv_map->second.find(key); + if (kv_val == kv_map->second.end()) { + return false; + } + if (!value.compare("*")) { + return true; + } + return kv_val->second == value; +} + +bool OSDCapPoolTag::is_match_all() const { + return application.empty(); +} + +bool OSDCapMatch::is_match(const string& pn, const string& ns, + const OSDCapPoolTag::app_map_t& app_map, + const string& object) const +{ + if (!pool_namespace.is_match(pn, ns)) { + return false; + } else if (!pool_tag.is_match(app_map)) { + return false; + } + + if (object_prefix.length()) { + if (object.find(object_prefix) != 0) + return 
false; + } + return true; +} + +bool OSDCapMatch::is_match_all() const +{ +if (!pool_namespace.is_match_all()) { + return false; + } else if (!pool_tag.is_match_all()) { + return false; + } + + if (object_prefix.length()) { + return false; + } + return true; +} + +ostream& operator<<(ostream& out, const OSDCapGrant& g) +{ + out << "grant("; + if (g.profile.is_valid()) { + out << g.profile << " ["; + for (auto it = g.profile_grants.cbegin(); + it != g.profile_grants.cend(); ++it) { + if (it != g.profile_grants.cbegin()) { + out << ","; + } + out << *it; + } + out << "]"; + } else { + out << g.match << g.spec; + } + if (g.network.size()) { + out << " network " << g.network; + } + out << ")"; + return out; +} + +void OSDCapGrant::set_network(const string& n) +{ + network = n; + network_valid = ::parse_network(n.c_str(), &network_parsed, &network_prefix); +} + +bool OSDCapGrant::allow_all() const +{ + if (profile.is_valid()) { + return std::any_of(profile_grants.cbegin(), profile_grants.cend(), + [](const OSDCapGrant& grant) { + return grant.allow_all(); + }); + } + + return (match.is_match_all() && spec.allow_all()); +} + +bool OSDCapGrant::is_capable( + const string& pool_name, + const string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const string& object, + bool op_may_read, + bool op_may_write, + const std::vector<OpRequest::ClassInfo>& classes, + const entity_addr_t& addr, + std::vector<bool>* class_allowed) const +{ + osd_rwxa_t allow = 0; + + if (network.size() && + (!network_valid || + !network_contains(network_parsed, + network_prefix, + addr))) { + return false; + } + + if (profile.is_valid()) { + return std::any_of(profile_grants.cbegin(), profile_grants.cend(), + [&](const OSDCapGrant& grant) { + return grant.is_capable(pool_name, ns, + application_metadata, + object, op_may_read, + op_may_write, classes, addr, + class_allowed); + }); + } else { + if (match.is_match(pool_name, ns, application_metadata, object)) { + allow = allow | spec.allow; + if ((op_may_read && !(allow & OSD_CAP_R)) || + (op_may_write && !(allow & OSD_CAP_W))) { + return false; + } + if (!classes.empty()) { + // check 'allow *' + if (spec.allow_all()) { + return true; + } + + // compare this grant to each class in the operation + for (size_t i = 0; i < classes.size(); ++i) { + // check 'allow class foo [method_name]' + if (!spec.class_name.empty() && + classes[i].class_name == spec.class_name && + (spec.method_name.empty() || + classes[i].method_name == spec.method_name)) { + (*class_allowed)[i] = true; + continue; + } + // check 'allow x | class-{rw}': must be on whitelist + if (!classes[i].whitelisted) { + continue; + } + if ((classes[i].read && !(allow & OSD_CAP_CLS_R)) || + (classes[i].write && !(allow & OSD_CAP_CLS_W))) { + continue; + } + (*class_allowed)[i] = true; + } + if (!std::all_of(class_allowed->cbegin(), class_allowed->cend(), + [](bool v) { return v; })) { + return false; + } + } + return true; + } + } + return false; +} + +void OSDCapGrant::expand_profile() +{ + if (profile.name == "read-only") { + // grants READ-ONLY caps to the OSD + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R))); + return; + } + if (profile.name == "read-write") { + // grants READ-WRITE caps to the OSD + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R | OSD_CAP_W))); + } + + if (profile.name == "rbd") { + // RBD read-write grant + profile_grants.emplace_back(OSDCapMatch(string(), "rbd_info"), + 
OSDCapSpec(osd_rwxa_t(OSD_CAP_R))); + profile_grants.emplace_back(OSDCapMatch(string(), "rbd_children"), + OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R))); + profile_grants.emplace_back(OSDCapMatch(string(), "rbd_mirroring"), + OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R))); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace.pool_name), + OSDCapSpec("rbd", "metadata_list")); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R | + OSD_CAP_W | + OSD_CAP_X))); + } + if (profile.name == "rbd-read-only") { + // RBD read-only grant + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace), + OSDCapSpec(osd_rwxa_t(OSD_CAP_R | + OSD_CAP_CLS_R))); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace, + "rbd_header."), + OSDCapSpec("rbd", "child_attach")); + profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace, + "rbd_header."), + OSDCapSpec("rbd", "child_detach")); + } +} + +bool OSDCap::allow_all() const +{ + for (auto &grant : grants) { + if (grant.allow_all()) { + return true; + } + } + return false; +} + +void OSDCap::set_allow_all() +{ + grants.clear(); + grants.push_back(OSDCapGrant(OSDCapMatch(), OSDCapSpec(OSD_CAP_ANY))); +} + +bool OSDCap::is_capable(const string& pool_name, const string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const string& object, + bool op_may_read, bool op_may_write, + const std::vector<OpRequest::ClassInfo>& classes, + const entity_addr_t& addr) const +{ + std::vector<bool> class_allowed(classes.size(), false); + for (auto &grant : grants) { + if (grant.is_capable(pool_name, ns, application_metadata, + object, op_may_read, op_may_write, classes, addr, + &class_allowed)) { + return true; + } + } + return false; +} + + +// grammar +namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + +template <typename Iterator> +struct OSDCapParser : qi::grammar<Iterator, OSDCap()> +{ + OSDCapParser() : OSDCapParser::base_type(osdcap) + { + using qi::char_; + using qi::int_; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + quoted_string %= + lexeme['"' >> +(char_ - '"') >> '"'] | + lexeme['\'' >> +(char_ - '\'') >> '\'']; + equoted_string %= + lexeme['"' >> *(char_ - '"') >> '"'] | + lexeme['\'' >> *(char_ - '\'') >> '\'']; + unquoted_word %= +char_("a-zA-Z0-9_./-"); + str %= quoted_string | unquoted_word; + estr %= equoted_string | unquoted_word; + network_str %= +char_("/.:a-fA-F0-9]["); + + spaces = +ascii::space; + + wildcard = (lit('*') | lit("all")) [_val = "*"]; + + pool_name %= -(spaces >> lit("pool") >> (lit('=') | spaces) >> str); + nspace %= (spaces >> lit("namespace") + >> (lit('=') | spaces) + >> estr >> -char_('*')); + + // match := [pool[=]<poolname> [namespace[=]<namespace>]] [object_prefix <prefix>] + object_prefix %= -(spaces >> lit("object_prefix") >> spaces >> str); + pooltag %= (spaces >> lit("tag") + >> spaces >> str // application + >> spaces >> (wildcard | str) // key + >> -spaces >> lit('=') >> -spaces >> (wildcard | str)); // value + + match = ( + pooltag [_val = phoenix::construct<OSDCapMatch>(_1)] | + (nspace >> pooltag) [_val = phoenix::construct<OSDCapMatch>(_1, _2)] | + (pool_name >> nspace >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2, _3)] | + (pool_name >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2)] + ); + + // rwxa := * | [r][w][x] [class-read] [class-write] + 
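
Concretely, the grammar being assembled here is meant to accept capability strings along the lines shown below. A small test sketch, assuming it is compiled inside a Ceph build tree where OSDCap.h and its dependencies resolve; the example strings are the usual documented cap forms, not output of this exact build:

    #include <iostream>
    #include <sstream>
    #include "OSDCap.h"

    int main() {
      const char* examples[] = {
        "allow *",                                    // full access
        "allow rwx pool=rbd",                         // rwx on one pool
        "allow r class-read pool foo namespace bar",  // read + class-read, one ns
        "allow rwx pool foo object_prefix rbd_header.",
        "profile rbd pool=rbd, profile rbd-read-only pool=images",
      };
      for (const char* s : examples) {
        OSDCap cap;
        std::ostringstream err;
        bool ok = cap.parse(s, &err);
        std::cout << (ok ? "parsed: " : "failed: ") << s << "\n";
        if (!ok)
          std::cout << "  " << err.str() << "\n";
      }
    }
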
rwxa = + (spaces >> wildcard[_val = OSD_CAP_ANY]) | + ( eps[_val = 0] >> + ( + spaces >> + ( lit('r')[_val |= OSD_CAP_R] || + lit('w')[_val |= OSD_CAP_W] || + lit('x')[_val |= OSD_CAP_X] )) || + ( (spaces >> lit("class-read")[_val |= OSD_CAP_CLS_R]) || + (spaces >> lit("class-write")[_val |= OSD_CAP_CLS_W]) )); + + // capspec := * | rwx | class <name> [<method name>] + class_name %= (spaces >> lit("class") >> spaces >> str); + method_name %= -(spaces >> str); + capspec = ( + (rwxa) [_val = phoenix::construct<OSDCapSpec>(_1)] | + (class_name >> method_name) [_val = phoenix::construct<OSDCapSpec>(_1, _2)]); + + // profile := profile <name> [pool[=]<pool> [namespace[=]<namespace>]] + profile_name %= (lit("profile") >> (lit('=') | spaces) >> str); + profile = ( + (profile_name >> pool_name >> nspace) [_val = phoenix::construct<OSDCapProfile>(_1, _2, _3)] | + (profile_name >> pool_name) [_val = phoenix::construct<OSDCapProfile>(_1, _2)]); + + // grant := allow match capspec + grant = (*ascii::blank >> + ((lit("allow") >> capspec >> match >> + -(spaces >> lit("network") >> spaces >> network_str)) + [_val = phoenix::construct<OSDCapGrant>(_2, _1, _3)] | + (lit("allow") >> match >> capspec >> + -(spaces >> lit("network") >> spaces >> network_str)) + [_val = phoenix::construct<OSDCapGrant>(_1, _2, _3)] | + (profile >> -(spaces >> lit("network") >> spaces >> network_str)) + [_val = phoenix::construct<OSDCapGrant>(_1, _2)] + ) >> *ascii::blank); + // osdcap := grant [grant ...] + grants %= (grant % (lit(';') | lit(','))); + osdcap = grants [_val = phoenix::construct<OSDCap>(_1)]; + } + qi::rule<Iterator> spaces; + qi::rule<Iterator, unsigned()> rwxa; + qi::rule<Iterator, string()> quoted_string, equoted_string; + qi::rule<Iterator, string()> unquoted_word; + qi::rule<Iterator, string()> str, estr, network_str; + qi::rule<Iterator, string()> wildcard; + qi::rule<Iterator, string()> class_name; + qi::rule<Iterator, string()> method_name; + qi::rule<Iterator, OSDCapSpec()> capspec; + qi::rule<Iterator, string()> pool_name; + qi::rule<Iterator, string()> nspace; + qi::rule<Iterator, string()> object_prefix; + qi::rule<Iterator, OSDCapPoolTag()> pooltag; + qi::rule<Iterator, OSDCapMatch()> match; + qi::rule<Iterator, string()> profile_name; + qi::rule<Iterator, OSDCapProfile()> profile; + qi::rule<Iterator, OSDCapGrant()> grant; + qi::rule<Iterator, std::vector<OSDCapGrant>()> grants; + qi::rule<Iterator, OSDCap()> osdcap; +}; + +bool OSDCap::parse(const string& str, ostream *err) +{ + OSDCapParser<string::const_iterator> g; + string::const_iterator iter = str.begin(); + string::const_iterator end = str.end(); + + bool r = qi::phrase_parse(iter, end, g, ascii::space, *this); + if (r && iter == end) + return true; + + // Make sure no grants are kept after parsing failed! + grants.clear(); + + if (err) + *err << "osd capability parse failed, stopped at '" << std::string(iter, end) + << "' of '" << str << "'"; + + return false; +} diff --git a/src/osd/OSDCap.h b/src/osd/OSDCap.h new file mode 100644 index 00000000..2bb4e21c --- /dev/null +++ b/src/osd/OSDCap.h @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + * OSDCaps: Hold the capabilities associated with a single authenticated + * user key. These are specified by text strings of the form + * "allow r" (which allows reading anything on the OSD) + * "allow rwx pool foo" (which allows full access to listed pools) + * "allow *" (which allows full access to EVERYTHING) + * + * The full grammar is documented in the parser in OSDCap.cc. + * + * The OSD assumes that anyone with * caps is an admin and has full + * message permissions. This means that only the monitor and the OSDs + * should get * + */ + +#ifndef CEPH_OSDCAP_H +#define CEPH_OSDCAP_H + +#include <ostream> +using std::ostream; + +#include "include/types.h" +#include "OpRequest.h" + +#include <list> +#include <vector> +#include <boost/optional.hpp> +#include <boost/fusion/include/adapt_struct.hpp> + +static const __u8 OSD_CAP_R = (1 << 1); // read +static const __u8 OSD_CAP_W = (1 << 2); // write +static const __u8 OSD_CAP_CLS_R = (1 << 3); // class read +static const __u8 OSD_CAP_CLS_W = (1 << 4); // class write +static const __u8 OSD_CAP_X = (OSD_CAP_CLS_R | OSD_CAP_CLS_W); // execute +static const __u8 OSD_CAP_ANY = 0xff; // * + +struct osd_rwxa_t { + __u8 val; + + // cppcheck-suppress noExplicitConstructor + osd_rwxa_t(__u8 v = 0) : val(v) {} + osd_rwxa_t& operator=(__u8 v) { + val = v; + return *this; + } + operator __u8() const { + return val; + } +}; + +ostream& operator<<(ostream& out, const osd_rwxa_t& p); + +struct OSDCapSpec { + osd_rwxa_t allow; + std::string class_name; + std::string method_name; + + OSDCapSpec() : allow(0) {} + explicit OSDCapSpec(osd_rwxa_t v) : allow(v) {} + OSDCapSpec(std::string class_name, std::string method_name) + : allow(0), class_name(std::move(class_name)), + method_name(std::move(method_name)) {} + + bool allow_all() const { + return allow == OSD_CAP_ANY; + } +}; + +ostream& operator<<(ostream& out, const OSDCapSpec& s); + +struct OSDCapPoolNamespace { + std::string pool_name; + boost::optional<std::string> nspace = boost::none; + + OSDCapPoolNamespace() { + } + OSDCapPoolNamespace(const std::string& pool_name, + const boost::optional<std::string>& nspace = boost::none) + : pool_name(pool_name), nspace(nspace) { + } + + bool is_match(const std::string& pn, const std::string& ns) const; + bool is_match_all() const; +}; + +ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns); + +struct OSDCapPoolTag { + typedef std::map<std::string, std::map<std::string, std::string> > app_map_t; + std::string application; + std::string key; + std::string value; + + OSDCapPoolTag () {} + OSDCapPoolTag(const std::string& application, const std::string& key, + const std::string& value) : + application(application), key(key), value(value) {} + + bool is_match(const app_map_t& app_map) const; + bool is_match_all() const; +}; +// adapt for parsing with boost::spirit::qi in OSDCapParser +BOOST_FUSION_ADAPT_STRUCT(OSDCapPoolTag, + (std::string, application) + (std::string, key) + (std::string, value)) + +ostream& operator<<(ostream& out, const OSDCapPoolTag& pt); + +struct OSDCapMatch { + typedef std::map<std::string, std::map<std::string, std::string> > app_map_t; + OSDCapPoolNamespace pool_namespace; + OSDCapPoolTag pool_tag; + std::string object_prefix; + + OSDCapMatch() {} + explicit OSDCapMatch(const OSDCapPoolTag& pt) : pool_tag(pt) {} + explicit OSDCapMatch(const OSDCapPoolNamespace& pns) : pool_namespace(pns) {} + OSDCapMatch(const OSDCapPoolNamespace& pns, const std::string& pre) + : pool_namespace(pns), object_prefix(pre) {} + 
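
One detail worth calling out from the flag definitions above: OSD_CAP_X is not its own bit but the union of the two class bits, so granting "x" implicitly grants class-read and class-write, while "*" covers everything. A tiny standalone sketch (the CAP_* names are hypothetical copies of the constants, so it compiles without the Ceph headers):

    #include <cassert>
    #include <cstdint>

    // values mirror the OSD_CAP_* constants declared in this header
    static const uint8_t CAP_R     = 1 << 1;
    static const uint8_t CAP_W     = 1 << 2;
    static const uint8_t CAP_CLS_R = 1 << 3;
    static const uint8_t CAP_CLS_W = 1 << 4;
    static const uint8_t CAP_X     = CAP_CLS_R | CAP_CLS_W;  // "x" == both class bits
    static const uint8_t CAP_ANY   = 0xff;                   // "*"

    int main() {
      uint8_t allow = CAP_R | CAP_X;      // e.g. parsed from "allow rx ..."
      assert(allow & CAP_CLS_R);          // rx grants class-read ...
      assert(allow & CAP_CLS_W);          // ... and class-write
      assert(!(allow & CAP_W));           // but not plain writes
      assert((CAP_ANY & CAP_W) && (CAP_ANY & CAP_CLS_W));  // "*" covers everything
      return 0;
    }
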
OSDCapMatch(const std::string& pl, const std::string& pre) + : pool_namespace(pl), object_prefix(pre) {} + OSDCapMatch(const std::string& pl, const std::string& ns, + const std::string& pre) + : pool_namespace(pl, ns), object_prefix(pre) {} + OSDCapMatch(const std::string& dummy, const std::string& app, + const std::string& key, const std::string& val) + : pool_tag(app, key, val) {} + OSDCapMatch(const std::string& ns, const OSDCapPoolTag& pt) + : pool_namespace("", ns), pool_tag(pt) {} + + /** + * check if given request parameters match our constraints + * + * @param pool_name pool name + * @param nspace_name namespace name + * @param object object name + * @return true if we match, false otherwise + */ + bool is_match(const std::string& pool_name, const std::string& nspace_name, + const app_map_t& app_map, + const std::string& object) const; + bool is_match_all() const; +}; + +ostream& operator<<(ostream& out, const OSDCapMatch& m); + + +struct OSDCapProfile { + std::string name; + OSDCapPoolNamespace pool_namespace; + + OSDCapProfile() { + } + OSDCapProfile(const std::string& name, + const std::string& pool_name, + const boost::optional<std::string>& nspace = boost::none) + : name(name), pool_namespace(pool_name, nspace) { + } + + inline bool is_valid() const { + return !name.empty(); + } +}; + +ostream& operator<<(ostream& out, const OSDCapProfile& m); + +struct OSDCapGrant { + OSDCapMatch match; + OSDCapSpec spec; + OSDCapProfile profile; + string network; + entity_addr_t network_parsed; + unsigned network_prefix = 0; + bool network_valid = true; + + // explicit grants that a profile grant expands to; populated as + // needed by expand_profile() and cached here. + std::list<OSDCapGrant> profile_grants; + + OSDCapGrant() {} + OSDCapGrant(const OSDCapMatch& m, const OSDCapSpec& s, + boost::optional<string> n = {}) + : match(m), spec(s) { + if (n) { + set_network(*n); + } + } + explicit OSDCapGrant(const OSDCapProfile& profile, + boost::optional<string> n = {}) + : profile(profile) { + if (n) { + set_network(*n); + } + expand_profile(); + } + + void set_network(const string& n); + + bool allow_all() const; + bool is_capable(const string& pool_name, const string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const string& object, bool op_may_read, bool op_may_write, + const std::vector<OpRequest::ClassInfo>& classes, + const entity_addr_t& addr, + std::vector<bool>* class_allowed) const; + + void expand_profile(); +}; + +ostream& operator<<(ostream& out, const OSDCapGrant& g); + + +struct OSDCap { + std::vector<OSDCapGrant> grants; + + OSDCap() {} + explicit OSDCap(std::vector<OSDCapGrant> g) : grants(std::move(g)) {} + + bool allow_all() const; + void set_allow_all(); + bool parse(const std::string& str, ostream *err=NULL); + + /** + * check if we are capable of something + * + * This method actually checks a description of a particular operation against + * what the capability has specified. Currently that is just rwx with matches + * against pool, and object name prefix. 
+ * + * @param pool_name name of the pool we are accessing + * @param ns name of the namespace we are accessing + * @param object name of the object we are accessing + * @param op_may_read whether the operation may need to read + * @param op_may_write whether the operation may need to write + * @param classes (class-name, rd, wr, whitelisted-flag) tuples + * @return true if the operation is allowed, false otherwise + */ + bool is_capable(const string& pool_name, const string& ns, + const OSDCapPoolTag::app_map_t& application_metadata, + const string& object, bool op_may_read, bool op_may_write, + const std::vector<OpRequest::ClassInfo>& classes, + const entity_addr_t& addr) const; +}; + +static inline ostream& operator<<(ostream& out, const OSDCap& cap) +{ + return out << "osdcap" << cap.grants; +} + +#endif diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc new file mode 100644 index 00000000..c588bae7 --- /dev/null +++ b/src/osd/OSDMap.cc @@ -0,0 +1,6012 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <algorithm> +#include <optional> +#include <random> + +#include <boost/algorithm/string.hpp> + +#include "OSDMap.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "include/ceph_features.h" +#include "include/str_map.h" + +#include "common/code_environment.h" +#include "mon/health_check.h" + +#include "crush/CrushTreeDumper.h" +#include "common/Clock.h" +#include "mon/PGMap.h" + +#define dout_subsys ceph_subsys_osd + +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap); +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap); + + +// ---------------------------------- +// osd_info_t + +void osd_info_t::dump(Formatter *f) const +{ + f->dump_int("last_clean_begin", last_clean_begin); + f->dump_int("last_clean_end", last_clean_end); + f->dump_int("up_from", up_from); + f->dump_int("up_thru", up_thru); + f->dump_int("down_at", down_at); + f->dump_int("lost_at", lost_at); +} + +void osd_info_t::encode(bufferlist& bl) const +{ + using ceph::encode; + __u8 struct_v = 1; + encode(struct_v, bl); + encode(last_clean_begin, bl); + encode(last_clean_end, bl); + encode(up_from, bl); + encode(up_thru, bl); + encode(down_at, bl); + encode(lost_at, bl); +} + +void osd_info_t::decode(bufferlist::const_iterator& bl) +{ + using ceph::decode; + __u8 struct_v; + decode(struct_v, bl); + decode(last_clean_begin, bl); + decode(last_clean_end, bl); + decode(up_from, bl); + decode(up_thru, bl); + decode(down_at, bl); + decode(lost_at, bl); +} + +void osd_info_t::generate_test_instances(list<osd_info_t*>& o) +{ + o.push_back(new osd_info_t); + o.push_back(new osd_info_t); + o.back()->last_clean_begin = 1; + o.back()->last_clean_end = 2; + o.back()->up_from = 30; + o.back()->up_thru = 40; + o.back()->down_at = 5; + o.back()->lost_at = 6; +} + +ostream& operator<<(ostream& out, const osd_info_t& info) +{ + out << "up_from " << info.up_from + << " up_thru " << info.up_thru + << " down_at " << info.down_at + << " 
last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")"; + if (info.lost_at) + out << " lost_at " << info.lost_at; + return out; +} + +// ---------------------------------- +// osd_xinfo_t + +void osd_xinfo_t::dump(Formatter *f) const +{ + f->dump_stream("down_stamp") << down_stamp; + f->dump_float("laggy_probability", laggy_probability); + f->dump_int("laggy_interval", laggy_interval); + f->dump_int("features", features); + f->dump_unsigned("old_weight", old_weight); +} + +void osd_xinfo_t::encode(bufferlist& bl) const +{ + ENCODE_START(3, 1, bl); + encode(down_stamp, bl); + __u32 lp = laggy_probability * 0xfffffffful; + encode(lp, bl); + encode(laggy_interval, bl); + encode(features, bl); + encode(old_weight, bl); + ENCODE_FINISH(bl); +} + +void osd_xinfo_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(3, bl); + decode(down_stamp, bl); + __u32 lp; + decode(lp, bl); + laggy_probability = (float)lp / (float)0xffffffff; + decode(laggy_interval, bl); + if (struct_v >= 2) + decode(features, bl); + else + features = 0; + if (struct_v >= 3) + decode(old_weight, bl); + else + old_weight = 0; + DECODE_FINISH(bl); +} + +void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o) +{ + o.push_back(new osd_xinfo_t); + o.push_back(new osd_xinfo_t); + o.back()->down_stamp = utime_t(2, 3); + o.back()->laggy_probability = .123; + o.back()->laggy_interval = 123456; + o.back()->old_weight = 0x7fff; +} + +ostream& operator<<(ostream& out, const osd_xinfo_t& xi) +{ + return out << "down_stamp " << xi.down_stamp + << " laggy_probability " << xi.laggy_probability + << " laggy_interval " << xi.laggy_interval + << " old_weight " << xi.old_weight; +} + +// ---------------------------------- +// OSDMap::Incremental + +int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const +{ + int n = 0; + for (auto &weight : new_weight) { + if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first)) + n++; // marked out + else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first)) + n--; // marked in + } + return n; +} + +int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const +{ + int n = 0; + for (auto &state : new_state) { // + if (state.second & CEPH_OSD_UP) { + if (previous->is_up(state.first)) + n++; // marked down + else + n--; // marked up + } + } + return n; +} + +int OSDMap::Incremental::identify_osd(uuid_d u) const +{ + for (auto &uuid : new_uuid) + if (uuid.second == u) + return uuid.first; + return -1; +} + +int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct, + const OSDMap& osdmap) +{ + ceph_assert(epoch == osdmap.get_epoch() + 1); + + for (auto &new_pool : new_pools) { + if (!new_pool.second.tiers.empty()) { + pg_pool_t& base = new_pool.second; + + auto new_rem_it = new_removed_snaps.find(new_pool.first); + + for (const auto &tier_pool : base.tiers) { + const auto &r = new_pools.find(tier_pool); + pg_pool_t *tier = 0; + if (r == new_pools.end()) { + const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool); + if (!orig) { + lderr(cct) << __func__ << " no pool " << tier_pool << dendl; + return -EIO; + } + tier = get_new_pool(tier_pool, orig); + } else { + tier = &r->second; + } + if (tier->tier_of != new_pool.first) { + lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl; + return -EIO; + } + + ldout(cct, 10) << __func__ << " from " << new_pool.first << " to " + << tier_pool << dendl; + tier->snap_seq = base.snap_seq; + tier->snap_epoch = base.snap_epoch; + 
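
The osd_xinfo_t encode/decode pair a little further up is the canonical Ceph compatibility pattern: the encoder stamps a struct version, new fields are only ever appended, and the decoder falls back to defaults when struct_v is too old to contain them. A macro-free sketch of the same idea, with hypothetical Blob/XInfo types standing in for bufferlist and osd_xinfo_t:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct Blob {                       // stands in for bufferlist
      std::vector<uint8_t> data;
      size_t off = 0;
      void put32(uint32_t v) { data.insert(data.end(), (uint8_t*)&v, (uint8_t*)&v + 4); }
      uint32_t get32() { uint32_t v; std::memcpy(&v, data.data() + off, 4); off += 4; return v; }
    };

    struct XInfo {
      uint32_t down_stamp = 0;
      uint32_t features = 0;     // added in v2
      uint32_t old_weight = 0;   // added in v3

      void encode(Blob& bl) const {
        bl.put32(3);             // current struct_v, like ENCODE_START(3, 1, bl)
        bl.put32(down_stamp);
        bl.put32(features);
        bl.put32(old_weight);
      }
      void decode(Blob& bl) {
        uint32_t struct_v = bl.get32();
        down_stamp = bl.get32();
        features   = (struct_v >= 2) ? bl.get32() : 0;  // older peer: default
        old_weight = (struct_v >= 3) ? bl.get32() : 0;
      }
    };
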
tier->snaps = base.snaps;
+ tier->removed_snaps = base.removed_snaps;
+ tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
+ pg_pool_t::FLAG_POOL_SNAPS);
+
+ if (new_rem_it != new_removed_snaps.end()) {
+ new_removed_snaps[tier_pool] = new_rem_it->second;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+// ----------------------------------
+// OSDMap
+
+bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
+{
+ if (id >= 0)
+ return is_down(id);
+
+ if (down_cache &&
+ down_cache->count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_is_down(child, down_cache)) {
+ return false;
+ }
+ }
+ if (down_cache) {
+ down_cache->insert(id);
+ }
+ return true;
+}
+
+bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+{
+ // use a stack-local down_cache if we didn't get one from the
+ // caller. then at least this particular call will avoid duplicated
+ // work.
+ set<int> local_down_cache;
+ if (!down_cache) {
+ down_cache = &local_down_cache;
+ }
+
+ int current = id;
+ while (true) {
+ int type;
+ if (current >= 0) {
+ type = 0;
+ } else {
+ type = crush->get_bucket_type(current);
+ }
+ ceph_assert(type >= 0);
+
+ if (!subtree_is_down(current, down_cache)) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
+ return false;
+ }
+
+ // is this a big enough subtree to be marked as down?
+ if (type >= subtree_type) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
+ return true;
+ }
+
+ int r = crush->get_immediate_parent_id(current, &current);
+ if (r < 0) {
+ return false;
+ }
+ }
+}
+
+bool OSDMap::subtree_type_is_down(
+ CephContext *cct,
+ int id,
+ int subtree_type,
+ set<int> *down_in_osds,
+ set<int> *up_in_osds,
+ set<int> *subtree_up,
+ unordered_map<int, set<int> > *subtree_type_down) const
+{
+ if (id >= 0) {
+ bool is_down_ret = is_down(id);
+ if (!is_out(id)) {
+ if (is_down_ret) {
+ down_in_osds->insert(id);
+ } else {
+ up_in_osds->insert(id);
+ }
+ }
+ return is_down_ret;
+ }
+
+ if (subtree_type_down &&
+ (*subtree_type_down)[subtree_type].count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_type_is_down(
+ cct, child, crush->get_bucket_type(child),
+ down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
+ subtree_up->insert(id);
+ return false;
+ }
+ }
+ if (subtree_type_down) {
+ (*subtree_type_down)[subtree_type].insert(id);
+ }
+ return true;
+}
+
+void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
+{
+ using ceph::encode;
+ __u16 v = 5;
+ encode(v, bl);
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(modified, bl);
+ int32_t new_t = new_pool_max;
+ encode(new_t, bl);
+ encode(new_flags, bl);
+ encode(fullmap, bl);
+ encode(crush, bl);
+
+ encode(new_max_osd, bl);
+ // for encode(new_pools, bl);
+ __u32 n = new_pools.size();
+ encode(n, bl);
+ for (const auto &new_pool : new_pools) {
+ n = new_pool.first;
+ encode(n, bl);
+ encode(new_pool.second, bl, 0);
+ }
+ // for encode(new_pool_names, bl);
+ n = new_pool_names.size();
+ encode(n, bl);
+
+ for (const auto &new_pool_name : new_pool_names) {
+ n = new_pool_name.first;
+ encode(n, bl);
+ encode(new_pool_name.second, bl);
+ }
+ // for encode(old_pools, bl);
+ n = old_pools.size();
+ encode(n, bl);
+ for (auto &old_pool : old_pools) {
+ n = old_pool;
+ 
encode(n, bl); + } + encode(new_up_client, bl, 0); + { + // legacy is map<int32_t,uint8_t> + uint32_t n = new_state.size(); + encode(n, bl); + for (auto p : new_state) { + encode(p.first, bl); + encode((uint8_t)p.second, bl); + } + } + encode(new_weight, bl); + // for encode(new_pg_temp, bl); + n = new_pg_temp.size(); + encode(n, bl); + + for (const auto &pg_temp : new_pg_temp) { + old_pg_t opg = pg_temp.first.get_old_pg(); + encode(opg, bl); + encode(pg_temp.second, bl); + } +} + +void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + encode_client_old(bl); + return; + } + + // base + __u16 v = 6; + encode(v, bl); + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + encode(new_pool_max, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + encode(new_pools, bl, features); + encode(new_pool_names, bl); + encode(old_pools, bl); + encode(new_up_client, bl, features); + { + uint32_t n = new_state.size(); + encode(n, bl); + for (auto p : new_state) { + encode(p.first, bl); + encode((uint8_t)p.second, bl); + } + } + encode(new_weight, bl); + encode(new_pg_temp, bl); + + // extended + __u16 ev = 10; + encode(ev, bl); + encode(new_hb_back_up, bl, features); + encode(new_up_thru, bl); + encode(new_last_clean_interval, bl); + encode(new_lost, bl); + encode(new_blacklist, bl, features); + encode(old_blacklist, bl, features); + encode(new_up_cluster, bl, features); + encode(cluster_snapshot, bl); + encode(new_uuid, bl); + encode(new_xinfo, bl); + encode(new_hb_front_up, bl, features); +} + +template<class T> +static void encode_addrvec_map_as_addr(const T& m, bufferlist& bl, uint64_t f) +{ + uint32_t n = m.size(); + encode(n, bl); + for (auto& i : m) { + encode(i.first, bl); + encode(i.second.legacy_addr(), bl, f); + } +} + +template<class T> +static void encode_addrvec_pvec_as_addr(const T& m, bufferlist& bl, uint64_t f) +{ + uint32_t n = m.size(); + encode(n, bl); + for (auto& i : m) { + if (i) { + encode(i->legacy_addr(), bl, f); + } else { + encode(entity_addr_t(), bl, f); + } + } +} + +/* for a description of osdmap incremental versions, and when they were + * introduced, please refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) { + encode_classic(bl, features); + return; + } + + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. 
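+ // (a caller that really does intend to produce the canonical encoding is
+ // expected to OR CEPH_FEATURE_RESERVED into the features it passes; the
+ // bit is asserted and then stripped again immediately below.)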
+ ceph_assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + size_t crc_offset; + std::optional<buffer::list::contiguous_filler> crc_filler; + + // meta-encoding: how we include client-used and osd-specific data + ENCODE_START(8, 7, bl); + + { + uint8_t v = 8; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 3; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 5; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 6; + } + ENCODE_START(v, 1, bl); // client-usable data + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + encode(new_pool_max, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + encode(new_pools, bl, features); + encode(new_pool_names, bl); + encode(old_pools, bl); + if (v >= 7) { + encode(new_up_client, bl, features); + } else { + encode_addrvec_map_as_addr(new_up_client, bl, features); + } + if (v >= 5) { + encode(new_state, bl); + } else { + uint32_t n = new_state.size(); + encode(n, bl); + for (auto p : new_state) { + encode(p.first, bl); + encode((uint8_t)p.second, bl); + } + } + encode(new_weight, bl); + encode(new_pg_temp, bl); + encode(new_primary_temp, bl); + encode(new_primary_affinity, bl); + encode(new_erasure_code_profiles, bl); + encode(old_erasure_code_profiles, bl); + if (v >= 4) { + encode(new_pg_upmap, bl); + encode(old_pg_upmap, bl); + encode(new_pg_upmap_items, bl); + encode(old_pg_upmap_items, bl); + } + if (v >= 6) { + encode(new_removed_snaps, bl); + encode(new_purged_snaps, bl); + } + if (v >= 8) { + encode(new_last_up_change, bl); + encode(new_last_in_change, bl); + } + ENCODE_FINISH(bl); // client-usable data + } + + { + uint8_t target_v = 9; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + target_v = 2; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + target_v = 6; + } + ENCODE_START(target_v, 1, bl); // extended, osd-only data + if (target_v < 7) { + encode_addrvec_map_as_addr(new_hb_back_up, bl, features); + } else { + encode(new_hb_back_up, bl, features); + } + encode(new_up_thru, bl); + encode(new_last_clean_interval, bl); + encode(new_lost, bl); + encode(new_blacklist, bl, features); + encode(old_blacklist, bl, features); + if (target_v < 7) { + encode_addrvec_map_as_addr(new_up_cluster, bl, features); + } else { + encode(new_up_cluster, bl, features); + } + encode(cluster_snapshot, bl); + encode(new_uuid, bl); + encode(new_xinfo, bl); + if (target_v < 7) { + encode_addrvec_map_as_addr(new_hb_front_up, bl, features); + } else { + encode(new_hb_front_up, bl, features); + } + encode(features, bl); // NOTE: features arg, not the member + if (target_v >= 3) { + encode(new_nearfull_ratio, bl); + encode(new_full_ratio, bl); + encode(new_backfillfull_ratio, bl); + } + // 5 was string-based new_require_min_compat_client + if (target_v >= 6) { + encode(new_require_min_compat_client, bl); + encode(new_require_osd_release, bl); + } + if (target_v >= 8) { + encode(new_crush_node_flags, bl); + } + if (target_v >= 9) { + encode(new_device_class_flags, bl); + } + ENCODE_FINISH(bl); // osd-only data + } + + crc_offset = bl.length(); + crc_filler = bl.append_hole(sizeof(uint32_t)); + tail_offset = bl.length(); + + encode(full_crc, bl); + + ENCODE_FINISH(bl); // meta-encoding wrapper + + // fill in crc + bufferlist front; + front.substr_of(bl, start_offset, crc_offset - start_offset); + inc_crc = front.crc32c(-1); + bufferlist tail; + tail.substr_of(bl, tail_offset, 
bl.length() - tail_offset); + inc_crc = tail.crc32c(inc_crc); + ceph_le32 crc_le; + crc_le = inc_crc; + crc_filler->copy_in(4u, (char*)&crc_le); + have_crc = true; +} + +void OSDMap::Incremental::decode_classic(bufferlist::const_iterator &p) +{ + using ceph::decode; + __u32 n, t; + // base + __u16 v; + decode(v, p); + decode(fsid, p); + decode(epoch, p); + decode(modified, p); + if (v == 4 || v == 5) { + decode(n, p); + new_pool_max = n; + } else if (v >= 6) + decode(new_pool_max, p); + decode(new_flags, p); + decode(fullmap, p); + decode(crush, p); + + decode(new_max_osd, p); + if (v < 6) { + new_pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(new_pools[t], p); + } + } else { + decode(new_pools, p); + } + if (v == 5) { + new_pool_names.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(new_pool_names[t], p); + } + } else if (v >= 6) { + decode(new_pool_names, p); + } + if (v < 6) { + old_pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + old_pools.insert(t); + } + } else { + decode(old_pools, p); + } + decode(new_up_client, p); + { + map<int32_t,uint8_t> ns; + decode(ns, p); + for (auto q : ns) { + new_state[q.first] = q.second; + } + } + decode(new_weight, p); + + if (v < 6) { + new_pg_temp.clear(); + decode(n, p); + while (n--) { + old_pg_t opg; + ::decode_raw(opg, p); + decode(new_pg_temp[pg_t(opg)], p); + } + } else { + decode(new_pg_temp, p); + } + + // decode short map, too. + if (v == 5 && p.end()) + return; + + // extended + __u16 ev = 0; + if (v >= 5) + decode(ev, p); + decode(new_hb_back_up, p); + if (v < 5) + decode(new_pool_names, p); + decode(new_up_thru, p); + decode(new_last_clean_interval, p); + decode(new_lost, p); + decode(new_blacklist, p); + decode(old_blacklist, p); + if (ev >= 6) + decode(new_up_cluster, p); + if (ev >= 7) + decode(cluster_snapshot, p); + if (ev >= 8) + decode(new_uuid, p); + if (ev >= 9) + decode(new_xinfo, p); + if (ev >= 10) + decode(new_hb_front_up, p); +} + +/* for a description of osdmap incremental versions, and when they were + * introduced, please refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::Incremental::decode(bufferlist::const_iterator& bl) +{ + using ceph::decode; + /** + * Older encodings of the Incremental had a single struct_v which + * covered the whole encoding, and was prior to our modern + * stuff which includes a compatv and a size. So if we see + * a struct_v < 7, we must rewind to the beginning and use our + * classic decoder. 
+ */ + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + bufferlist crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper + if (struct_v < 7) { + bl.seek(start_offset); + decode_classic(bl); + encode_features = 0; + if (struct_v >= 6) + encode_features = CEPH_FEATURE_PGID64; + else + encode_features = 0; + return; + } + { + DECODE_START(8, bl); // client-usable data + decode(fsid, bl); + decode(epoch, bl); + decode(modified, bl); + decode(new_pool_max, bl); + decode(new_flags, bl); + decode(fullmap, bl); + decode(crush, bl); + + decode(new_max_osd, bl); + decode(new_pools, bl); + decode(new_pool_names, bl); + decode(old_pools, bl); + decode(new_up_client, bl); + if (struct_v >= 5) { + decode(new_state, bl); + } else { + map<int32_t,uint8_t> ns; + decode(ns, bl); + for (auto q : ns) { + new_state[q.first] = q.second; + } + } + decode(new_weight, bl); + decode(new_pg_temp, bl); + decode(new_primary_temp, bl); + if (struct_v >= 2) + decode(new_primary_affinity, bl); + else + new_primary_affinity.clear(); + if (struct_v >= 3) { + decode(new_erasure_code_profiles, bl); + decode(old_erasure_code_profiles, bl); + } else { + new_erasure_code_profiles.clear(); + old_erasure_code_profiles.clear(); + } + if (struct_v >= 4) { + decode(new_pg_upmap, bl); + decode(old_pg_upmap, bl); + decode(new_pg_upmap_items, bl); + decode(old_pg_upmap_items, bl); + } + if (struct_v >= 6) { + decode(new_removed_snaps, bl); + decode(new_purged_snaps, bl); + } + if (struct_v >= 8) { + decode(new_last_up_change, bl); + decode(new_last_in_change, bl); + } + DECODE_FINISH(bl); // client-usable data + } + + { + DECODE_START(9, bl); // extended, osd-only data + decode(new_hb_back_up, bl); + decode(new_up_thru, bl); + decode(new_last_clean_interval, bl); + decode(new_lost, bl); + decode(new_blacklist, bl); + decode(old_blacklist, bl); + decode(new_up_cluster, bl); + decode(cluster_snapshot, bl); + decode(new_uuid, bl); + decode(new_xinfo, bl); + decode(new_hb_front_up, bl); + if (struct_v >= 2) + decode(encode_features, bl); + else + encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC; + if (struct_v >= 3) { + decode(new_nearfull_ratio, bl); + decode(new_full_ratio, bl); + } else { + new_nearfull_ratio = -1; + new_full_ratio = -1; + } + if (struct_v >= 4) { + decode(new_backfillfull_ratio, bl); + } else { + new_backfillfull_ratio = -1; + } + if (struct_v == 5) { + string r; + decode(r, bl); + if (r.length()) { + new_require_min_compat_client = ceph_release_from_name(r.c_str()); + } + } + if (struct_v >= 6) { + decode(new_require_min_compat_client, bl); + decode(new_require_osd_release, bl); + } else { + if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) { + // only for compat with post-kraken pre-luminous test clusters + new_require_osd_release = CEPH_RELEASE_LUMINOUS; + new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) { + new_require_osd_release = CEPH_RELEASE_KRAKEN; + } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) { + new_require_osd_release = CEPH_RELEASE_JEWEL; + } else { + new_require_osd_release = -1; + } + } + if (struct_v >= 8) { + decode(new_crush_node_flags, bl); + } + if (struct_v >= 9) { + decode(new_device_class_flags, bl); + } + DECODE_FINISH(bl); // osd-only data + } + + if (struct_v >= 8) { + have_crc = true; + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + decode(inc_crc, bl); + tail_offset = bl.get_off(); + 
decode(full_crc, bl); + } else { + have_crc = false; + full_crc = 0; + inc_crc = 0; + } + + DECODE_FINISH(bl); // wrapper + + if (have_crc) { + // verify crc + uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + bufferlist tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (inc_crc != actual) { + ostringstream ss; + ss << "bad crc, actual " << actual << " != expected " << inc_crc; + string s = ss.str(); + throw buffer::malformed_input(s.c_str()); + } + } +} + +void OSDMap::Incremental::dump(Formatter *f) const +{ + f->dump_int("epoch", epoch); + f->dump_stream("fsid") << fsid; + f->dump_stream("modified") << modified; + f->dump_stream("new_last_up_change") << new_last_up_change; + f->dump_stream("new_last_in_change") << new_last_in_change; + f->dump_int("new_pool_max", new_pool_max); + f->dump_int("new_flags", new_flags); + f->dump_float("new_full_ratio", new_full_ratio); + f->dump_float("new_nearfull_ratio", new_nearfull_ratio); + f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio); + f->dump_int("new_require_min_compat_client", new_require_min_compat_client); + f->dump_int("new_require_osd_release", new_require_osd_release); + + if (fullmap.length()) { + f->open_object_section("full_map"); + OSDMap full; + bufferlist fbl = fullmap; // kludge around constness. + auto p = fbl.cbegin(); + full.decode(p); + full.dump(f); + f->close_section(); + } + if (crush.length()) { + f->open_object_section("crush"); + CrushWrapper c; + bufferlist tbl = crush; // kludge around constness. + auto p = tbl.cbegin(); + c.decode(p); + c.dump(f); + f->close_section(); + } + + f->dump_int("new_max_osd", new_max_osd); + + f->open_array_section("new_pools"); + + for (const auto &new_pool : new_pools) { + f->open_object_section("pool"); + f->dump_int("pool", new_pool.first); + new_pool.second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_pool_names"); + + for (const auto &new_pool_name : new_pool_names) { + f->open_object_section("pool_name"); + f->dump_int("pool", new_pool_name.first); + f->dump_string("name", new_pool_name.second); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pools"); + + for (const auto &old_pool : old_pools) + f->dump_int("pool", old_pool); + f->close_section(); + + f->open_array_section("new_up_osds"); + + for (const auto &upclient : new_up_client) { + f->open_object_section("osd"); + f->dump_int("osd", upclient.first); + f->dump_stream("public_addr") << upclient.second.legacy_addr(); + f->dump_object("public_addrs", upclient.second); + if (auto p = new_up_cluster.find(upclient.first); + p != new_up_cluster.end()) { + f->dump_stream("cluster_addr") << p->second.legacy_addr(); + f->dump_object("cluster_addrs", p->second); + } + if (auto p = new_hb_back_up.find(upclient.first); + p != new_hb_back_up.end()) { + f->dump_object("heartbeat_back_addrs", p->second); + } + if (auto p = new_hb_front_up.find(upclient.first); + p != new_hb_front_up.end()) { + f->dump_object("heartbeat_front_addrs", p->second); + } + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_weight"); + + for (const auto &weight : new_weight) { + f->open_object_section("osd"); + f->dump_int("osd", weight.first); + f->dump_int("weight", weight.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_state_xor"); + for (const auto &ns : new_state) { + f->open_object_section("osd"); + f->dump_int("osd", 
ns.first); + set<string> st; + calc_state_set(new_state.find(ns.first)->second, st); + f->open_array_section("state_xor"); + for (auto &state : st) + f->dump_string("state", state); + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_pg_temp"); + + for (const auto &pg_temp : new_pg_temp) { + f->open_object_section("pg"); + f->dump_stream("pgid") << pg_temp.first; + f->open_array_section("osds"); + + for (const auto &osd : pg_temp.second) + f->dump_int("osd", osd); + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("primary_temp"); + + for (const auto &primary_temp : new_primary_temp) { + f->dump_stream("pgid") << primary_temp.first; + f->dump_int("osd", primary_temp.second); + } + f->close_section(); // primary_temp + + f->open_array_section("new_pg_upmap"); + for (auto& i : new_pg_upmap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("osds"); + for (auto osd : i.second) { + f->dump_int("osd", osd); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_upmap"); + for (auto& i : old_pg_upmap) { + f->dump_stream("pgid") << i; + } + f->close_section(); + + f->open_array_section("new_pg_upmap_items"); + for (auto& i : new_pg_upmap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("mappings"); + for (auto& p : i.second) { + f->open_object_section("mapping"); + f->dump_int("from", p.first); + f->dump_int("to", p.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_upmap_items"); + for (auto& i : old_pg_upmap_items) { + f->dump_stream("pgid") << i; + } + f->close_section(); + + f->open_array_section("new_up_thru"); + + for (const auto &up_thru : new_up_thru) { + f->open_object_section("osd"); + f->dump_int("osd", up_thru.first); + f->dump_int("up_thru", up_thru.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_lost"); + + for (const auto &lost : new_lost) { + f->open_object_section("osd"); + f->dump_int("osd", lost.first); + f->dump_int("epoch_lost", lost.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_last_clean_interval"); + + for (const auto &last_clean_interval : new_last_clean_interval) { + f->open_object_section("osd"); + f->dump_int("osd", last_clean_interval.first); + f->dump_int("first", last_clean_interval.second.first); + f->dump_int("last", last_clean_interval.second.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_blacklist"); + for (const auto &blist : new_blacklist) { + stringstream ss; + ss << blist.first; + f->dump_stream(ss.str().c_str()) << blist.second; + } + f->close_section(); + f->open_array_section("old_blacklist"); + for (const auto &blist : old_blacklist) + f->dump_stream("addr") << blist; + f->close_section(); + + f->open_array_section("new_xinfo"); + for (const auto &xinfo : new_xinfo) { + f->open_object_section("xinfo"); + f->dump_int("osd", xinfo.first); + xinfo.second.dump(f); + f->close_section(); + } + f->close_section(); + + if (cluster_snapshot.size()) + f->dump_string("cluster_snapshot", cluster_snapshot); + + f->open_array_section("new_uuid"); + for (const auto &uuid : new_uuid) { + f->open_object_section("osd"); + f->dump_int("osd", uuid.first); + f->dump_stream("uuid") << uuid.second; + f->close_section(); + } 
+ f->close_section(); + + OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f); + f->open_array_section("old_erasure_code_profiles"); + for (const auto &erasure_code_profile : old_erasure_code_profiles) { + f->dump_string("old", erasure_code_profile.c_str()); + } + f->close_section(); + + f->open_array_section("new_removed_snaps"); + for (auto& p : new_removed_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_purged_snaps"); + for (auto& p : new_purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->open_array_section("new_crush_node_flags"); + for (auto& i : new_crush_node_flags) { + f->open_object_section("node"); + f->dump_int("id", i.first); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_array_section("new_device_class_flags"); + for (auto& i : new_device_class_flags) { + f->open_object_section("device_class"); + f->dump_int("id", i.first); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->close_section(); +} + +void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o) +{ + o.push_back(new Incremental); +} + +// ---------------------------------- +// OSDMap + +void OSDMap::set_epoch(epoch_t e) +{ + epoch = e; + for (auto &pool : pools) + pool.second.last_change = e; +} + +bool OSDMap::is_blacklisted(const entity_addr_t& orig) const +{ + if (blacklist.empty()) { + return false; + } + + // all blacklist entries are type ANY for nautilus+ + // FIXME: avoid this copy! + entity_addr_t a = orig; + if (require_osd_release < CEPH_RELEASE_NAUTILUS) { + a.set_type(entity_addr_t::TYPE_LEGACY); + } else { + a.set_type(entity_addr_t::TYPE_ANY); + } + + // this specific instance? + if (blacklist.count(a)) { + return true; + } + + // is entire ip blacklisted? 
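+ // (a host-wide blacklist entry is presumably stored with port 0 and
+ // nonce 0, so zeroing those fields lets us look it up directly.)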
+ if (a.is_ip()) { + a.set_port(0); + a.set_nonce(0); + if (blacklist.count(a)) { + return true; + } + } + + return false; +} + +bool OSDMap::is_blacklisted(const entity_addrvec_t& av) const +{ + if (blacklist.empty()) + return false; + + for (auto& a : av.v) { + if (is_blacklisted(a)) { + return true; + } + } + + return false; +} + +void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const +{ + std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl)); +} + +void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const +{ + for (const auto &i : blacklist) { + bl->insert(i.first); + } +} + +void OSDMap::set_max_osd(int m) +{ + int o = max_osd; + max_osd = m; + osd_state.resize(m); + osd_weight.resize(m); + for (; o<max_osd; o++) { + osd_state[o] = 0; + osd_weight[o] = CEPH_OSD_OUT; + } + osd_info.resize(m); + osd_xinfo.resize(m); + osd_addrs->client_addrs.resize(m); + osd_addrs->cluster_addrs.resize(m); + osd_addrs->hb_back_addrs.resize(m); + osd_addrs->hb_front_addrs.resize(m); + osd_uuid->resize(m); + if (osd_primary_affinity) + osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + + calc_num_osds(); +} + +int OSDMap::calc_num_osds() +{ + num_osd = 0; + num_up_osd = 0; + num_in_osd = 0; + for (int i=0; i<max_osd; i++) { + if (osd_state[i] & CEPH_OSD_EXISTS) { + ++num_osd; + if (osd_state[i] & CEPH_OSD_UP) { + ++num_up_osd; + } + if (get_weight(i) != CEPH_OSD_OUT) { + ++num_in_osd; + } + } + } + return num_osd; +} + +void OSDMap::get_full_pools(CephContext *cct, + set<int64_t> *full, + set<int64_t> *backfillfull, + set<int64_t> *nearfull) const +{ + ceph_assert(full); + ceph_assert(backfillfull); + ceph_assert(nearfull); + full->clear(); + backfillfull->clear(); + nearfull->clear(); + + vector<int> full_osds; + vector<int> backfillfull_osds; + vector<int> nearfull_osds; + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + if (osd_state[i] & CEPH_OSD_FULL) + full_osds.push_back(i); + else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) + backfillfull_osds.push_back(i); + else if (osd_state[i] & CEPH_OSD_NEARFULL) + nearfull_osds.push_back(i); + } + } + + for (auto i: full_osds) { + get_pool_ids_by_osd(cct, i, full); + } + for (auto i: backfillfull_osds) { + get_pool_ids_by_osd(cct, i, backfillfull); + } + for (auto i: nearfull_osds) { + get_pool_ids_by_osd(cct, i, nearfull); + } +} + +void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill, + set<int> *nearfull) const +{ + full->clear(); + backfill->clear(); + nearfull->clear(); + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + if (osd_state[i] & CEPH_OSD_FULL) + full->emplace(i); + else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) + backfill->emplace(i); + else if (osd_state[i] & CEPH_OSD_NEARFULL) + nearfull->emplace(i); + } + } +} + +void OSDMap::get_all_osds(set<int32_t>& ls) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i)) + ls.insert(i); +} + +void OSDMap::get_up_osds(set<int32_t>& ls) const +{ + for (int i = 0; i < max_osd; i++) { + if (is_up(i)) + ls.insert(i); + } +} + +void OSDMap::get_out_existing_osds(set<int32_t>& ls) const +{ + for (int i = 0; i < max_osd; i++) { + if (exists(i) && get_weight(i) == CEPH_OSD_OUT) + ls.insert(i); + } +} + +void OSDMap::get_flag_set(set<string> *flagset) const +{ + for (unsigned i = 0; i < sizeof(flags) * 8; ++i) { + if (flags & (1<<i)) { + flagset->insert(get_flag_string(flags & (1<<i))); + } + } +} + +void OSDMap::calc_state_set(int state, set<string>& st) +{ + unsigned t = 
state; + for (unsigned s = 1; t; s <<= 1) { + if (t & s) { + t &= ~s; + st.insert(ceph_osd_state_name(s)); + } + } +} + +void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const +{ + float max = 0; + for (const auto &weight : weights) { + if (weight.second > max) + max = weight.second; + } + + for (const auto &weight : weights) { + inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN); + } +} + +int OSDMap::identify_osd(const entity_addr_t& addr) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && (get_addrs(i).contains(addr) || + get_cluster_addrs(i).contains(addr))) + return i; + return -1; +} + +int OSDMap::identify_osd(const uuid_d& u) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && get_uuid(i) == u) + return i; + return -1; +} + +int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && (get_addrs(i).contains(addr) || + get_cluster_addrs(i).contains(addr) || + get_hb_back_addrs(i).contains(addr) || + get_hb_front_addrs(i).contains(addr))) + return i; + return -1; +} + +int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && (get_addrs(i).is_same_host(ip) || + get_cluster_addrs(i).is_same_host(ip))) + return i; + return -1; +} + + +uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const +{ + uint64_t features = 0; // things we actually have + uint64_t mask = 0; // things we could have + + if (crush->has_nondefault_tunables()) + features |= CEPH_FEATURE_CRUSH_TUNABLES; + if (crush->has_nondefault_tunables2()) + features |= CEPH_FEATURE_CRUSH_TUNABLES2; + if (crush->has_nondefault_tunables3()) + features |= CEPH_FEATURE_CRUSH_TUNABLES3; + if (crush->has_v4_buckets()) + features |= CEPH_FEATURE_CRUSH_V4; + if (crush->has_nondefault_tunables5()) + features |= CEPH_FEATURE_CRUSH_TUNABLES5; + if (crush->has_incompat_choose_args()) { + features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS; + } + mask |= CEPH_FEATURES_CRUSH; + + if (!pg_upmap.empty() || !pg_upmap_items.empty()) + features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP; + mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP; + + for (auto &pool: pools) { + if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) { + features |= CEPH_FEATURE_OSDHASHPSPOOL; + } + if (!pool.second.tiers.empty() || + pool.second.is_tier()) { + features |= CEPH_FEATURE_OSD_CACHEPOOL; + } + int ruleid = crush->find_rule(pool.second.get_crush_rule(), + pool.second.get_type(), + pool.second.get_size()); + if (ruleid >= 0) { + if (crush->is_v2_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_V2; + if (crush->is_v3_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_TUNABLES3; + if (crush->is_v5_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_TUNABLES5; + } + } + mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL; + + if (osd_primary_affinity) { + for (int i = 0; i < max_osd; ++i) { + if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY; + break; + } + } + } + mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY; + + if (entity_type == CEPH_ENTITY_TYPE_OSD) { + const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL; + if (require_osd_release >= CEPH_RELEASE_JEWEL) { + features |= jewel_features; + } + mask |= jewel_features; + + const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN + | CEPH_FEATURE_MSG_ADDR2; + if (require_osd_release >= CEPH_RELEASE_KRAKEN) { + features |= kraken_features; + } + mask 
|= kraken_features; + } + + if (require_min_compat_client >= CEPH_RELEASE_NAUTILUS) { + // if min_compat_client is >= nautilus, require v2 cephx signatures + // from everyone + features |= CEPH_FEATUREMASK_CEPHX_V2; + } else if (require_osd_release >= CEPH_RELEASE_NAUTILUS && + entity_type == CEPH_ENTITY_TYPE_OSD) { + // if osds are >= nautilus, at least require the signatures from them + features |= CEPH_FEATUREMASK_CEPHX_V2; + } + mask |= CEPH_FEATUREMASK_CEPHX_V2; + + if (pmask) + *pmask = mask; + return features; +} + +uint8_t OSDMap::get_min_compat_client() const +{ + uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr); + + if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43 + HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28 + return CEPH_RELEASE_LUMINOUS; // v12.2.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737 + return CEPH_RELEASE_JEWEL; // v10.2.0 + } + if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56 + return CEPH_RELEASE_HAMMER; // v0.94.0 + } + if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624 + HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d + HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5 + return CEPH_RELEASE_FIREFLY; // v0.80.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff + HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f + return CEPH_RELEASE_DUMPLING; // v0.67.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af + return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af + } + return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af +} + +uint8_t OSDMap::get_require_min_compat_client() const +{ + return require_min_compat_client; +} + +void OSDMap::_calc_up_osd_features() +{ + bool first = true; + cached_up_osd_features = 0; + for (int osd = 0; osd < max_osd; ++osd) { + if (!is_up(osd)) + continue; + const osd_xinfo_t &xi = get_xinfo(osd); + if (xi.features == 0) + continue; // bogus xinfo, maybe #20751 or similar, skipping + if (first) { + cached_up_osd_features = xi.features; + first = false; + } else { + cached_up_osd_features &= xi.features; + } + } +} + +uint64_t OSDMap::get_up_osd_features() const +{ + return cached_up_osd_features; +} + +void OSDMap::dedup(const OSDMap *o, OSDMap *n) +{ + using ceph::encode; + if (o->epoch == n->epoch) + return; + + int diff = 0; + + // do addrs match? + if (o->max_osd != n->max_osd) + diff++; + for (int i = 0; i < o->max_osd && i < n->max_osd; i++) { + if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] && + *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i]) + n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i]; + else + diff++; + if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] && + *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i]) + n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i]; + else + diff++; + if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] && + *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i]) + n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i]; + else + diff++; + if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] && + *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i]) + n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i]; + else + diff++; + } + if (diff == 0) { + // zoinks, no differences at all! + n->osd_addrs = o->osd_addrs; + } + + // does crush match? 
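+ // (compare the canonical encodings; if the bytes match, share the old
+ // map's CrushWrapper rather than keeping two identical copies.)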
+ bufferlist oc, nc; + encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT); + encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT); + if (oc.contents_equal(nc)) { + n->crush = o->crush; + } + + // does pg_temp match? + if (*o->pg_temp == *n->pg_temp) + n->pg_temp = o->pg_temp; + + // does primary_temp match? + if (o->primary_temp->size() == n->primary_temp->size()) { + if (*o->primary_temp == *n->primary_temp) + n->primary_temp = o->primary_temp; + } + + // do uuids match? + if (o->osd_uuid->size() == n->osd_uuid->size() && + *o->osd_uuid == *n->osd_uuid) + n->osd_uuid = o->osd_uuid; +} + +void OSDMap::clean_temps(CephContext *cct, + const OSDMap& oldmap, + const OSDMap& nextmap, + Incremental *pending_inc) +{ + ldout(cct, 10) << __func__ << dendl; + + for (auto pg : *nextmap.pg_temp) { + // if pool does not exist, remove any existing pg_temps associated with + // it. we don't care about pg_temps on the pending_inc either; if there + // are new_pg_temp entries on the pending, clear them out just as well. + if (!nextmap.have_pg_pool(pg.first.pool())) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " for nonexistent pool " << pg.first.pool() << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + // all osds down? + unsigned num_up = 0; + for (auto o : pg.second) { + if (!nextmap.is_down(o)) { + ++num_up; + break; + } + } + if (num_up == 0) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " with all down osds" << pg.second << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + // redundant pg_temp? + vector<int> raw_up; + int primary; + nextmap.pg_to_raw_up(pg.first, &raw_up, &primary); + bool remove = false; + if (raw_up == pg.second) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " " + << pg.second << " that matches raw_up mapping" << dendl; + remove = true; + } + // oversized pg_temp? + if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " " + << pg.second << " exceeds pool size" << dendl; + remove = true; + } + if (remove) { + if (oldmap.pg_temp->count(pg.first)) + pending_inc->new_pg_temp[pg.first].clear(); + else + pending_inc->new_pg_temp.erase(pg.first); + } + } + + for (auto &pg : *nextmap.primary_temp) { + // primary down? + if (nextmap.is_down(pg.second)) { + ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first + << " to down " << pg.second << dendl; + pending_inc->new_primary_temp[pg.first] = -1; + continue; + } + // redundant primary_temp? 
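+ // (if the primary we would pick without the primary_temp entry is already
+ // the same osd, the entry is a no-op and can be removed.)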
+ vector<int> real_up, templess_up; + int real_primary, templess_primary; + pg_t pgid = pg.first; + nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary); + nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary); + if (real_primary == templess_primary){ + ldout(cct, 10) << __func__ << " removing primary_temp " + << pgid << " -> " << real_primary + << " (unnecessary/redundant)" << dendl; + if (oldmap.primary_temp->count(pgid)) + pending_inc->new_primary_temp[pgid] = -1; + else + pending_inc->new_primary_temp.erase(pgid); + } + } +} + +void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const +{ + upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size()); + for (auto& p : pg_upmap) + upmap_pgs->push_back(p.first); + for (auto& p : pg_upmap_items) + upmap_pgs->push_back(p.first); +} + +bool OSDMap::check_pg_upmaps( + CephContext *cct, + const vector<pg_t>& to_check, + vector<pg_t> *to_cancel, + map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const +{ + bool any_change = false; + map<int, map<int, float>> rule_weight_map; + for (auto& pg : to_check) { + const pg_pool_t *pi = get_pg_pool(pg.pool()); + if (!pi || pg.ps() >= pi->get_pg_num_pending()) { + ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source" + << dendl; + to_cancel->push_back(pg); + continue; + } + if (pi->is_pending_merge(pg, nullptr)) { + ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge" + << dendl; + to_cancel->push_back(pg); + continue; + } + vector<int> raw, up; + pg_to_raw_upmap(pg, &raw, &up); + auto crush_rule = get_pg_pool_crush_rule(pg); + auto r = crush->verify_upmap(cct, + crush_rule, + get_pg_pool_size(pg), + up); + if (r < 0) { + ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg + << " returning " << r + << dendl; + to_cancel->push_back(pg); + continue; + } + // below we check against crush-topology changing.. 
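+ // (the per-rule osd weight map is computed once and cached in
+ // rule_weight_map; an upmap target that is absent from the map, or whose
+ // adjusted weight comes out to 0, has left the rule's subtree or been
+ // marked out, so the upmap gets cancelled.)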
+ map<int, float> weight_map; + auto it = rule_weight_map.find(crush_rule); + if (it == rule_weight_map.end()) { + auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map); + if (r < 0) { + lderr(cct) << __func__ << " unable to get crush weight_map for " + << "crush_rule " << crush_rule + << dendl; + continue; + } + rule_weight_map[crush_rule] = weight_map; + } else { + weight_map = it->second; + } + ldout(cct, 10) << __func__ << " pg " << pg + << " weight_map " << weight_map + << dendl; + for (auto osd : up) { + auto it = weight_map.find(osd); + if (it == weight_map.end()) { + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has " + << "been moved out of the specific crush-tree" + << dendl; + to_cancel->push_back(pg); + break; + } + auto adjusted_weight = get_weightf(it->first) * it->second; + if (adjusted_weight == 0) { + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd + << " is out/crush-out" + << dendl; + to_cancel->push_back(pg); + break; + } + } + if (!to_cancel->empty() && to_cancel->back() == pg) + continue; + // okay, upmap is valid + // continue to check if it is still necessary + auto i = pg_upmap.find(pg); + if (i != pg_upmap.end() && raw == i->second) { + ldout(cct, 10) << " removing redundant pg_upmap " + << i->first << " " << i->second + << dendl; + to_cancel->push_back(pg); + continue; + } + auto j = pg_upmap_items.find(pg); + if (j != pg_upmap_items.end()) { + mempool::osdmap::vector<pair<int,int>> newmap; + for (auto& p : j->second) { + if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) { + // cancel mapping if source osd does not exist anymore + continue; + } + if (p.second != CRUSH_ITEM_NONE && p.second < max_osd && + p.second >= 0 && osd_weight[p.second] == 0) { + // cancel mapping if target osd is out + continue; + } + newmap.push_back(p); + } + if (newmap.empty()) { + ldout(cct, 10) << " removing no-op pg_upmap_items " + << j->first << " " << j->second + << dendl; + to_cancel->push_back(pg); + } else if (newmap != j->second) { + ldout(cct, 10) << " simplifying partially no-op pg_upmap_items " + << j->first << " " << j->second + << " -> " << newmap + << dendl; + to_remap->insert({pg, newmap}); + any_change = true; + } + } + } + any_change = any_change || !to_cancel->empty(); + return any_change; +} + +void OSDMap::clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc, + const vector<pg_t>& to_cancel, + const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const +{ + for (auto &pg: to_cancel) { + auto i = pending_inc->new_pg_upmap.find(pg); + if (i != pending_inc->new_pg_upmap.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap entry " + << i->first << "->" << i->second + << dendl; + pending_inc->new_pg_upmap.erase(i); + } + auto j = pg_upmap.find(pg); + if (j != pg_upmap.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry " + << j->first << "->" << j->second + << dendl; + pending_inc->old_pg_upmap.insert(pg); + } + auto p = pending_inc->new_pg_upmap_items.find(pg); + if (p != pending_inc->new_pg_upmap_items.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap_items entry " + << p->first << "->" << p->second + << dendl; + pending_inc->new_pg_upmap_items.erase(p); + } + auto q = pg_upmap_items.find(pg); + if (q != pg_upmap_items.end()) { + ldout(cct, 10) << __func__ << " cancel invalid " + << "pg_upmap_items entry " + << q->first << "->" << q->second + << dendl; + pending_inc->old_pg_upmap_items.insert(pg); 
+ } + } + for (auto& i : to_remap) + pending_inc->new_pg_upmap_items[i.first] = i.second; +} + +bool OSDMap::clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc) const +{ + ldout(cct, 10) << __func__ << dendl; + vector<pg_t> to_check; + vector<pg_t> to_cancel; + map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap; + + get_upmap_pgs(&to_check); + auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap); + clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap); + return any_change; +} + +int OSDMap::apply_incremental(const Incremental &inc) +{ + new_blacklist_entries = false; + if (inc.epoch == 1) + fsid = inc.fsid; + else if (inc.fsid != fsid) + return -EINVAL; + + ceph_assert(inc.epoch == epoch+1); + + epoch++; + modified = inc.modified; + + // full map? + if (inc.fullmap.length()) { + bufferlist bl(inc.fullmap); + decode(bl); + return 0; + } + + // nope, incremental. + if (inc.new_flags >= 0) { + flags = inc.new_flags; + // the below is just to cover a newly-upgraded luminous mon + // cluster that has to set require_jewel_osds or + // require_kraken_osds before the osds can be upgraded to + // luminous. + if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) { + if (require_osd_release < CEPH_RELEASE_KRAKEN) { + require_osd_release = CEPH_RELEASE_KRAKEN; + } + } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) { + if (require_osd_release < CEPH_RELEASE_JEWEL) { + require_osd_release = CEPH_RELEASE_JEWEL; + } + } + } + + if (inc.new_max_osd >= 0) + set_max_osd(inc.new_max_osd); + + if (inc.new_pool_max != -1) + pool_max = inc.new_pool_max; + + for (const auto &pool : inc.new_pools) { + pools[pool.first] = pool.second; + pools[pool.first].last_change = epoch; + } + + new_removed_snaps = inc.new_removed_snaps; + new_purged_snaps = inc.new_purged_snaps; + for (auto p = new_removed_snaps.begin(); + p != new_removed_snaps.end(); + ++p) { + removed_snaps_queue[p->first].union_of(p->second); + } + for (auto p = new_purged_snaps.begin(); + p != new_purged_snaps.end(); + ++p) { + auto q = removed_snaps_queue.find(p->first); + ceph_assert(q != removed_snaps_queue.end()); + q->second.subtract(p->second); + if (q->second.empty()) { + removed_snaps_queue.erase(q); + } + } + + if (inc.new_last_up_change != utime_t()) { + last_up_change = inc.new_last_up_change; + } + if (inc.new_last_in_change != utime_t()) { + last_in_change = inc.new_last_in_change; + } + + for (const auto &pname : inc.new_pool_names) { + auto pool_name_entry = pool_name.find(pname.first); + if (pool_name_entry != pool_name.end()) { + name_pool.erase(pool_name_entry->second); + pool_name_entry->second = pname.second; + } else { + pool_name[pname.first] = pname.second; + } + name_pool[pname.second] = pname.first; + } + + for (const auto &pool : inc.old_pools) { + pools.erase(pool); + name_pool.erase(pool_name[pool]); + pool_name.erase(pool); + } + + for (const auto &weight : inc.new_weight) { + set_weight(weight.first, weight.second); + + // if we are marking in, clear the AUTOOUT and NEW bits, and clear + // xinfo old_weight. 
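+ // (CEPH_OSD_OUT is weight 0 and CEPH_OSD_IN is 0x10000, so any non-zero
+ // new weight means the osd is at least partially in.)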
+ if (weight.second) { + osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW); + osd_xinfo[weight.first].old_weight = 0; + } + } + + for (const auto &primary_affinity : inc.new_primary_affinity) { + set_primary_affinity(primary_affinity.first, primary_affinity.second); + } + + // erasure_code_profiles + for (const auto &profile : inc.old_erasure_code_profiles) + erasure_code_profiles.erase(profile); + + for (const auto &profile : inc.new_erasure_code_profiles) { + set_erasure_code_profile(profile.first, profile.second); + } + + // up/down + for (const auto &state : inc.new_state) { + const auto osd = state.first; + int s = state.second ? state.second : CEPH_OSD_UP; + if ((osd_state[osd] & CEPH_OSD_UP) && + (s & CEPH_OSD_UP)) { + osd_info[osd].down_at = epoch; + osd_xinfo[osd].down_stamp = modified; + } + if ((osd_state[osd] & CEPH_OSD_EXISTS) && + (s & CEPH_OSD_EXISTS)) { + // osd is destroyed; clear out anything interesting. + (*osd_uuid)[osd] = uuid_d(); + osd_info[osd] = osd_info_t(); + osd_xinfo[osd] = osd_xinfo_t(); + set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + osd_addrs->client_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t()); + osd_state[osd] = 0; + } else { + osd_state[osd] ^= s; + } + } + + for (const auto &client : inc.new_up_client) { + osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; + osd_addrs->client_addrs[client.first].reset( + new entity_addrvec_t(client.second)); + osd_addrs->hb_back_addrs[client.first].reset( + new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second)); + osd_addrs->hb_front_addrs[client.first].reset( + new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second)); + + osd_info[client.first].up_from = epoch; + } + + for (const auto &cluster : inc.new_up_cluster) + osd_addrs->cluster_addrs[cluster.first].reset( + new entity_addrvec_t(cluster.second)); + + // info + for (const auto &thru : inc.new_up_thru) + osd_info[thru.first].up_thru = thru.second; + + for (const auto &interval : inc.new_last_clean_interval) { + osd_info[interval.first].last_clean_begin = interval.second.first; + osd_info[interval.first].last_clean_end = interval.second.second; + } + + for (const auto &lost : inc.new_lost) + osd_info[lost.first].lost_at = lost.second; + + // xinfo + for (const auto &xinfo : inc.new_xinfo) + osd_xinfo[xinfo.first] = xinfo.second; + + // uuid + for (const auto &uuid : inc.new_uuid) + (*osd_uuid)[uuid.first] = uuid.second; + + // pg rebuild + for (const auto &pg : inc.new_pg_temp) { + if (pg.second.empty()) + pg_temp->erase(pg.first); + else + pg_temp->set(pg.first, pg.second); + } + if (!inc.new_pg_temp.empty()) { + // make sure pg_temp is efficiently stored + pg_temp->rebuild(); + } + + for (const auto &pg : inc.new_primary_temp) { + if (pg.second == -1) + primary_temp->erase(pg.first); + else + (*primary_temp)[pg.first] = pg.second; + } + + for (auto& p : inc.new_pg_upmap) { + pg_upmap[p.first] = p.second; + } + for (auto& pg : inc.old_pg_upmap) { + pg_upmap.erase(pg); + } + for (auto& p : inc.new_pg_upmap_items) { + pg_upmap_items[p.first] = p.second; + } + for (auto& pg : inc.old_pg_upmap_items) { + pg_upmap_items.erase(pg); + } + + // blacklist + if (!inc.new_blacklist.empty()) { + blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end()); + new_blacklist_entries = true; + } + for (const auto &addr : 
inc.old_blacklist) + blacklist.erase(addr); + + for (auto& i : inc.new_crush_node_flags) { + if (i.second) { + crush_node_flags[i.first] = i.second; + } else { + crush_node_flags.erase(i.first); + } + } + + for (auto& i : inc.new_device_class_flags) { + if (i.second) { + device_class_flags[i.first] = i.second; + } else { + device_class_flags.erase(i.first); + } + } + + // cluster snapshot? + if (inc.cluster_snapshot.length()) { + cluster_snapshot = inc.cluster_snapshot; + cluster_snapshot_epoch = inc.epoch; + } else { + cluster_snapshot.clear(); + cluster_snapshot_epoch = 0; + } + + if (inc.new_nearfull_ratio >= 0) { + nearfull_ratio = inc.new_nearfull_ratio; + } + if (inc.new_backfillfull_ratio >= 0) { + backfillfull_ratio = inc.new_backfillfull_ratio; + } + if (inc.new_full_ratio >= 0) { + full_ratio = inc.new_full_ratio; + } + if (inc.new_require_min_compat_client > 0) { + require_min_compat_client = inc.new_require_min_compat_client; + } + if (inc.new_require_osd_release >= 0) { + require_osd_release = inc.new_require_osd_release; + if (require_osd_release >= CEPH_RELEASE_LUMINOUS) { + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } + } + + if (inc.new_require_osd_release >= 0) { + require_osd_release = inc.new_require_osd_release; + if (require_osd_release >= CEPH_RELEASE_NAUTILUS) { + flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT; + } + } + // do new crush map last (after up/down stuff) + if (inc.crush.length()) { + bufferlist bl(inc.crush); + auto blp = bl.cbegin(); + crush.reset(new CrushWrapper); + crush->decode(blp); + if (require_osd_release >= CEPH_RELEASE_LUMINOUS) { + // only increment if this is a luminous-encoded osdmap, lest + // the mon's crush_version diverge from what the osds or others + // are decoding and applying on their end. if we won't encode + // it in the canonical version, don't change it. 
+ ++crush_version; + } + for (auto it = device_class_flags.begin(); + it != device_class_flags.end();) { + const char* class_name = crush->get_class_name(it->first); + if (!class_name) // device class is gone + it = device_class_flags.erase(it); + else + it++; + } + } + + calc_num_osds(); + _calc_up_osd_features(); + return 0; +} + +// mapping +int OSDMap::map_to_pg( + int64_t poolid, + const string& name, + const string& key, + const string& nspace, + pg_t *pg) const +{ + // calculate ps (placement seed) + const pg_pool_t *pool = get_pg_pool(poolid); + if (!pool) + return -ENOENT; + ps_t ps; + if (!key.empty()) + ps = pool->hash_key(key, nspace); + else + ps = pool->hash_key(name, nspace); + *pg = pg_t(ps, poolid); + return 0; +} + +int OSDMap::object_locator_to_pg( + const object_t& oid, const object_locator_t& loc, pg_t &pg) const +{ + if (loc.hash >= 0) { + if (!get_pg_pool(loc.get_pool())) { + return -ENOENT; + } + pg = pg_t(loc.hash, loc.get_pool()); + return 0; + } + return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg); +} + +ceph_object_layout OSDMap::make_object_layout( + object_t oid, int pg_pool, string nspace) const +{ + object_locator_t loc(pg_pool, nspace); + + ceph_object_layout ol; + pg_t pgid = object_locator_to_pg(oid, loc); + ol.ol_pgid = pgid.get_old_pg().v; + ol.ol_stripe_unit = 0; + return ol; +} + +void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool, + vector<int>& osds) const +{ + if (pool.can_shift_osds()) { + unsigned removed = 0; + for (unsigned i = 0; i < osds.size(); i++) { + if (!exists(osds[i])) { + removed++; + continue; + } + if (removed) { + osds[i - removed] = osds[i]; + } + } + if (removed) + osds.resize(osds.size() - removed); + } else { + for (auto& osd : osds) { + if (!exists(osd)) + osd = CRUSH_ITEM_NONE; + } + } +} + +void OSDMap::_pg_to_raw_osds( + const pg_pool_t& pool, pg_t pg, + vector<int> *osds, + ps_t *ppps) const +{ + // map to osds[] + ps_t pps = pool.raw_pg_to_pps(pg); // placement ps + unsigned size = pool.get_size(); + + // what crush rule? + int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size); + if (ruleno >= 0) + crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool()); + + _remove_nonexistent_osds(pool, *osds); + + if (ppps) + *ppps = pps; +} + +int OSDMap::_pick_primary(const vector<int>& osds) const +{ + for (auto osd : osds) { + if (osd != CRUSH_ITEM_NONE) { + return osd; + } + } + return -1; +} + +void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const +{ + pg_t pg = pi.raw_pg_to_pg(raw_pg); + auto p = pg_upmap.find(pg); + if (p != pg_upmap.end()) { + // make sure targets aren't marked out + for (auto osd : p->second) { + if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 && + osd_weight[osd] == 0) { + // reject/ignore the explicit mapping + return; + } + } + *raw = vector<int>(p->second.begin(), p->second.end()); + // continue to check and apply pg_upmap_items if any + } + + auto q = pg_upmap_items.find(pg); + if (q != pg_upmap_items.end()) { + // NOTE: this approach does not allow a bidirectional swap, + // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. 
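+ // (for illustration: a single item [1,3] applied to [0,1,2] yields
+ // [0,3,2], assuming osd 3 exists and is not marked out; each pair is an
+ // independent one-way "from -> to" substitution.)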
+ for (auto& r : q->second) { + // make sure the replacement value doesn't already appear + bool exists = false; + ssize_t pos = -1; + for (unsigned i = 0; i < raw->size(); ++i) { + int osd = (*raw)[i]; + if (osd == r.second) { + exists = true; + break; + } + // ignore mapping if target is marked out (or invalid osd id) + if (osd == r.first && + pos < 0 && + !(r.second != CRUSH_ITEM_NONE && r.second < max_osd && + r.second >= 0 && osd_weight[r.second] == 0)) { + pos = i; + } + } + if (!exists && pos >= 0) { + (*raw)[pos] = r.second; + } + } + } +} + +// pg -> (up osd list) +void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw, + vector<int> *up) const +{ + if (pool.can_shift_osds()) { + // shift left + up->clear(); + up->reserve(raw.size()); + for (unsigned i=0; i<raw.size(); i++) { + if (!exists(raw[i]) || is_down(raw[i])) + continue; + up->push_back(raw[i]); + } + } else { + // set down/dne devices to NONE + up->resize(raw.size()); + for (int i = raw.size() - 1; i >= 0; --i) { + if (!exists(raw[i]) || is_down(raw[i])) { + (*up)[i] = CRUSH_ITEM_NONE; + } else { + (*up)[i] = raw[i]; + } + } + } +} + +void OSDMap::_apply_primary_affinity(ps_t seed, + const pg_pool_t& pool, + vector<int> *osds, + int *primary) const +{ + // do we have any non-default primary_affinity values for these osds? + if (!osd_primary_affinity) + return; + + bool any = false; + for (const auto osd : *osds) { + if (osd != CRUSH_ITEM_NONE && + (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + any = true; + break; + } + } + if (!any) + return; + + // pick the primary. feed both the seed (for the pg) and the osd + // into the hash/rng so that a proportional fraction of an osd's pgs + // get rejected as primary. + int pos = -1; + for (unsigned i = 0; i < osds->size(); ++i) { + int o = (*osds)[i]; + if (o == CRUSH_ITEM_NONE) + continue; + unsigned a = (*osd_primary_affinity)[o]; + if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + seed, o) >> 16) >= a) { + // we chose not to use this primary. note it anyway as a + // fallback in case we don't pick anyone else, but keep looking. + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = (*osds)[pos]; + + if (pool.can_shift_osds() && pos > 0) { + // move the new primary to the front. 
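+ // (for illustration: [4,7,9] with the chosen primary at pos 2 becomes
+ // [9,4,7]; the relative order of the remaining osds is preserved.)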
+ for (int i = pos; i > 0; --i) { + (*osds)[i] = (*osds)[i-1]; + } + (*osds)[0] = *primary; + } +} + +void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg, + vector<int> *temp_pg, int *temp_primary) const +{ + pg = pool.raw_pg_to_pg(pg); + const auto p = pg_temp->find(pg); + temp_pg->clear(); + if (p != pg_temp->end()) { + for (unsigned i=0; i<p->second.size(); i++) { + if (!exists(p->second[i]) || is_down(p->second[i])) { + if (pool.can_shift_osds()) { + continue; + } else { + temp_pg->push_back(CRUSH_ITEM_NONE); + } + } else { + temp_pg->push_back(p->second[i]); + } + } + } + const auto &pp = primary_temp->find(pg); + *temp_primary = -1; + if (pp != primary_temp->end()) { + *temp_primary = pp->second; + } else if (!temp_pg->empty()) { // apply pg_temp's primary + for (unsigned i = 0; i < temp_pg->size(); ++i) { + if ((*temp_pg)[i] != CRUSH_ITEM_NONE) { + *temp_primary = (*temp_pg)[i]; + break; + } + } + } +} + +void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) { + *primary = -1; + raw->clear(); + return; + } + _pg_to_raw_osds(*pool, pg, raw, NULL); + *primary = _pick_primary(*raw); +} + +void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw, + vector<int> *raw_upmap) const +{ + auto pool = get_pg_pool(pg.pool()); + if (!pool) { + raw_upmap->clear(); + return; + } + _pg_to_raw_osds(*pool, pg, raw, NULL); + *raw_upmap = *raw; + _apply_upmap(*pool, pg, raw_upmap); +} + +void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) { + *primary = -1; + up->clear(); + return; + } + vector<int> raw; + ps_t pps; + _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_upmap(*pool, pg, &raw); + _raw_to_up_osds(*pool, raw, up); + *primary = _pick_primary(raw); + _apply_primary_affinity(pps, *pool, up, primary); +} + +void OSDMap::_pg_to_up_acting_osds( + const pg_t& pg, vector<int> *up, int *up_primary, + vector<int> *acting, int *acting_primary, + bool raw_pg_to_pg) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool || + (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) { + if (up) + up->clear(); + if (up_primary) + *up_primary = -1; + if (acting) + acting->clear(); + if (acting_primary) + *acting_primary = -1; + return; + } + vector<int> raw; + vector<int> _up; + vector<int> _acting; + int _up_primary; + int _acting_primary; + ps_t pps; + _get_temp_osds(*pool, pg, &_acting, &_acting_primary); + if (_acting.empty() || up || up_primary) { + _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_upmap(*pool, pg, &raw); + _raw_to_up_osds(*pool, raw, &_up); + _up_primary = _pick_primary(_up); + _apply_primary_affinity(pps, *pool, &_up, &_up_primary); + if (_acting.empty()) { + _acting = _up; + if (_acting_primary == -1) { + _acting_primary = _up_primary; + } + } + + if (up) + up->swap(_up); + if (up_primary) + *up_primary = _up_primary; + } + + if (acting) + acting->swap(_acting); + if (acting_primary) + *acting_primary = _acting_primary; +} + +int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep) +{ + if (!nrep) + nrep = acting.size(); + for (int i=0; i<nrep; i++) + if (acting[i] == osd) + return i; + return -1; +} + +int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep) +{ + return calc_pg_rank(osd, acting, nrep); +} + +bool OSDMap::primary_changed( + int oldprimary, + const vector<int> &oldacting, + int newprimary, + const vector<int> &newacting) +{ + if (oldacting.empty() && 
newacting.empty()) + return false; // both still empty + if (oldacting.empty() ^ newacting.empty()) + return true; // was empty, now not, or vice versa + if (oldprimary != newprimary) + return true; // primary changed + if (calc_pg_rank(oldprimary, oldacting) != + calc_pg_rank(newprimary, newacting)) + return true; + return false; // same primary (tho replicas may have changed) +} + +uint64_t OSDMap::get_encoding_features() const +{ + uint64_t f = SIGNIFICANT_FEATURES; + if (require_osd_release < CEPH_RELEASE_NAUTILUS) { + f &= ~CEPH_FEATURE_SERVER_NAUTILUS; + } + if (require_osd_release < CEPH_RELEASE_MIMIC) { + f &= ~CEPH_FEATURE_SERVER_MIMIC; + } + if (require_osd_release < CEPH_RELEASE_LUMINOUS) { + f &= ~(CEPH_FEATURE_SERVER_LUMINOUS | + CEPH_FEATURE_CRUSH_CHOOSE_ARGS); + } + if (require_osd_release < CEPH_RELEASE_KRAKEN) { + f &= ~(CEPH_FEATURE_SERVER_KRAKEN | + CEPH_FEATURE_MSG_ADDR2); + } + if (require_osd_release < CEPH_RELEASE_JEWEL) { + f &= ~(CEPH_FEATURE_SERVER_JEWEL | + CEPH_FEATURE_NEW_OSDOP_ENCODING | + CEPH_FEATURE_CRUSH_TUNABLES5); + } + return f; +} + +// serialize, unserialize +void OSDMap::encode_client_old(bufferlist& bl) const +{ + using ceph::encode; + __u16 v = 5; + encode(v, bl); + + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + // for encode(pools, bl); + __u32 n = pools.size(); + encode(n, bl); + + for (const auto &pool : pools) { + n = pool.first; + encode(n, bl); + encode(pool.second, bl, 0); + } + // for encode(pool_name, bl); + n = pool_name.size(); + encode(n, bl); + for (const auto &pname : pool_name) { + n = pname.first; + encode(n, bl); + encode(pname.second, bl); + } + // for encode(pool_max, bl); + n = pool_max; + encode(n, bl); + + encode(flags, bl); + + encode(max_osd, bl); + { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + encode(osd_addrs->client_addrs, bl, 0); + + // for encode(pg_temp, bl); + n = pg_temp->size(); + encode(n, bl); + for (const auto pg : *pg_temp) { + old_pg_t opg = pg.first.get_old_pg(); + encode(opg, bl); + encode(pg.second, bl); + } + + // crush + bufferlist cbl; + crush->encode(cbl, 0 /* legacy (no) features */); + encode(cbl, bl); +} + +void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + encode_client_old(bl); + return; + } + + __u16 v = 6; + encode(v, bl); + + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + encode(pools, bl, features); + encode(pool_name, bl); + encode(pool_max, bl); + + encode(flags, bl); + + encode(max_osd, bl); + { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + encode(osd_addrs->client_addrs, bl, features); + + encode(*pg_temp, bl); + + // crush + bufferlist cbl; + crush->encode(cbl, 0 /* legacy (no) features */); + encode(cbl, bl); + + // extended + __u16 ev = 10; + encode(ev, bl); + encode(osd_addrs->hb_back_addrs, bl, features); + encode(osd_info, bl); + encode(blacklist, bl, features); + encode(osd_addrs->cluster_addrs, bl, features); + encode(cluster_snapshot_epoch, bl); + encode(cluster_snapshot, bl); + encode(*osd_uuid, bl); + encode(osd_xinfo, bl); + encode(osd_addrs->hb_front_addrs, bl, features); +} + +/* for a description of osdmap versions, and when they were introduced, please + * refer to + * 
doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::encode(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) { + encode_classic(bl, features); + return; + } + + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. + ceph_assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + size_t crc_offset; + std::optional<buffer::list::contiguous_filler> crc_filler; + + // meta-encoding: how we include client-used and osd-specific data + ENCODE_START(8, 7, bl); + + { + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + uint8_t v = 9; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 3; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 6; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 7; + } + ENCODE_START(v, 1, bl); // client-usable data + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + encode(pools, bl, features); + encode(pool_name, bl); + encode(pool_max, bl); + + if (v < 4) { + decltype(flags) f = flags; + if (require_osd_release >= CEPH_RELEASE_LUMINOUS) + f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES; + else if (require_osd_release == CEPH_RELEASE_KRAKEN) + f |= CEPH_OSDMAP_REQUIRE_KRAKEN; + else if (require_osd_release == CEPH_RELEASE_JEWEL) + f |= CEPH_OSDMAP_REQUIRE_JEWEL; + encode(f, bl); + } else { + encode(flags, bl); + } + + encode(max_osd, bl); + if (v >= 5) { + encode(osd_state, bl); + } else { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + if (v >= 8) { + encode(osd_addrs->client_addrs, bl, features); + } else { + encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features); + } + + encode(*pg_temp, bl); + encode(*primary_temp, bl); + if (osd_primary_affinity) { + encode(*osd_primary_affinity, bl); + } else { + vector<__u32> v; + encode(v, bl); + } + + // crush + bufferlist cbl; + crush->encode(cbl, features); + encode(cbl, bl); + encode(erasure_code_profiles, bl); + + if (v >= 4) { + encode(pg_upmap, bl); + encode(pg_upmap_items, bl); + } else { + ceph_assert(pg_upmap.empty()); + ceph_assert(pg_upmap_items.empty()); + } + if (v >= 6) { + encode(crush_version, bl); + } + if (v >= 7) { + encode(new_removed_snaps, bl); + encode(new_purged_snaps, bl); + } + if (v >= 9) { + encode(last_up_change, bl); + encode(last_in_change, bl); + } + ENCODE_FINISH(bl); // client-usable data + } + + { + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + uint8_t target_v = 9; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + target_v = 1; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + target_v = 5; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + target_v = 6; + } + ENCODE_START(target_v, 1, bl); // extended, osd-only data + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features); + } else { + encode(osd_addrs->hb_back_addrs, bl, features); + } + encode(osd_info, bl); + { + // put this in a sorted, ordered map<> so that we encode in a + // deterministic order. 
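+      // (a stable ordering also keeps the whole-map crc, filled in further
+      // below, identical across different encoders of the same map)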
+ map<entity_addr_t,utime_t> blacklist_map; + for (const auto &addr : blacklist) + blacklist_map.insert(make_pair(addr.first, addr.second)); + encode(blacklist_map, bl, features); + } + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features); + } else { + encode(osd_addrs->cluster_addrs, bl, features); + } + encode(cluster_snapshot_epoch, bl); + encode(cluster_snapshot, bl); + encode(*osd_uuid, bl); + encode(osd_xinfo, bl); + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features); + } else { + encode(osd_addrs->hb_front_addrs, bl, features); + } + if (target_v >= 2) { + encode(nearfull_ratio, bl); + encode(full_ratio, bl); + encode(backfillfull_ratio, bl); + } + // 4 was string-based new_require_min_compat_client + if (target_v >= 5) { + encode(require_min_compat_client, bl); + encode(require_osd_release, bl); + } + if (target_v >= 6) { + encode(removed_snaps_queue, bl); + } + if (target_v >= 8) { + encode(crush_node_flags, bl); + } + if (target_v >= 9) { + encode(device_class_flags, bl); + } + ENCODE_FINISH(bl); // osd-only data + } + + crc_offset = bl.length(); + crc_filler = bl.append_hole(sizeof(uint32_t)); + tail_offset = bl.length(); + + ENCODE_FINISH(bl); // meta-encoding wrapper + + // fill in crc + bufferlist front; + front.substr_of(bl, start_offset, crc_offset - start_offset); + crc = front.crc32c(-1); + if (tail_offset < bl.length()) { + bufferlist tail; + tail.substr_of(bl, tail_offset, bl.length() - tail_offset); + crc = tail.crc32c(crc); + } + ceph_le32 crc_le; + crc_le = crc; + crc_filler->copy_in(4, (char*)&crc_le); + crc_defined = true; +} + +/* for a description of osdmap versions, and when they were introduced, please + * refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::decode(bufferlist& bl) +{ + auto p = bl.cbegin(); + decode(p); +} + +void OSDMap::decode_classic(bufferlist::const_iterator& p) +{ + using ceph::decode; + __u32 n, t; + __u16 v; + decode(v, p); + + // base + decode(fsid, p); + decode(epoch, p); + decode(created, p); + decode(modified, p); + + if (v < 6) { + if (v < 4) { + int32_t max_pools = 0; + decode(max_pools, p); + pool_max = max_pools; + } + pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(pools[t], p); + } + if (v == 4) { + decode(n, p); + pool_max = n; + } else if (v == 5) { + pool_name.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(pool_name[t], p); + } + decode(n, p); + pool_max = n; + } + } else { + decode(pools, p); + decode(pool_name, p); + decode(pool_max, p); + } + // kludge around some old bug that zeroed out pool_max (#2307) + if (pools.size() && pool_max < pools.rbegin()->first) { + pool_max = pools.rbegin()->first; + } + + decode(flags, p); + + decode(max_osd, p); + { + vector<uint8_t> os; + decode(os, p); + osd_state.resize(os.size()); + for (unsigned i = 0; i < os.size(); ++i) { + osd_state[i] = os[i]; + } + } + decode(osd_weight, p); + decode(osd_addrs->client_addrs, p); + if (v <= 5) { + pg_temp->clear(); + decode(n, p); + while (n--) { + old_pg_t opg; + ::decode_raw(opg, p); + mempool::osdmap::vector<int32_t> v; + decode(v, p); + pg_temp->set(pg_t(opg), v); + } + } else { + decode(*pg_temp, p); + } + + // crush + bufferlist cbl; + decode(cbl, p); + auto cblp = cbl.cbegin(); + crush->decode(cblp); + + // extended + __u16 ev = 0; + if (v >= 5) + decode(ev, p); + decode(osd_addrs->hb_back_addrs, p); + decode(osd_info, p); + if (v < 5) + decode(pool_name, p); + + decode(blacklist, p); + if (ev >= 6) 
+ decode(osd_addrs->cluster_addrs, p); + else + osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size()); + + if (ev >= 7) { + decode(cluster_snapshot_epoch, p); + decode(cluster_snapshot, p); + } + + if (ev >= 8) { + decode(*osd_uuid, p); + } else { + osd_uuid->resize(max_osd); + } + if (ev >= 9) + decode(osd_xinfo, p); + else + osd_xinfo.resize(max_osd); + + if (ev >= 10) + decode(osd_addrs->hb_front_addrs, p); + else + osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size()); + + osd_primary_affinity.reset(); + + post_decode(); +} + +void OSDMap::decode(bufferlist::const_iterator& bl) +{ + using ceph::decode; + /** + * Older encodings of the OSDMap had a single struct_v which + * covered the whole encoding, and was prior to our modern + * stuff which includes a compatv and a size. So if we see + * a struct_v < 7, we must rewind to the beginning and use our + * classic decoder. + */ + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + bufferlist crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper + if (struct_v < 7) { + bl.seek(start_offset); + decode_classic(bl); + return; + } + /** + * Since we made it past that hurdle, we can use our normal paths. + */ + { + DECODE_START(9, bl); // client-usable data + // base + decode(fsid, bl); + decode(epoch, bl); + decode(created, bl); + decode(modified, bl); + + decode(pools, bl); + decode(pool_name, bl); + decode(pool_max, bl); + + decode(flags, bl); + + decode(max_osd, bl); + if (struct_v >= 5) { + decode(osd_state, bl); + } else { + vector<uint8_t> os; + decode(os, bl); + osd_state.resize(os.size()); + for (unsigned i = 0; i < os.size(); ++i) { + osd_state[i] = os[i]; + } + } + decode(osd_weight, bl); + decode(osd_addrs->client_addrs, bl); + + decode(*pg_temp, bl); + decode(*primary_temp, bl); + // dates back to firefly. version increased from 2 to 3 still in firefly. + // do we really still need to keep this around? even for old clients? + if (struct_v >= 2) { + osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>); + decode(*osd_primary_affinity, bl); + if (osd_primary_affinity->empty()) + osd_primary_affinity.reset(); + } else { + osd_primary_affinity.reset(); + } + + // crush + bufferlist cbl; + decode(cbl, bl); + auto cblp = cbl.cbegin(); + crush->decode(cblp); + // added in firefly; version increased in luminous, so it affects + // giant, hammer, infernallis, jewel, and kraken. probably should be left + // alone until we require clients to be all luminous? + if (struct_v >= 3) { + decode(erasure_code_profiles, bl); + } else { + erasure_code_profiles.clear(); + } + // version increased from 3 to 4 still in luminous, so same as above + // applies. + if (struct_v >= 4) { + decode(pg_upmap, bl); + decode(pg_upmap_items, bl); + } else { + pg_upmap.clear(); + pg_upmap_items.clear(); + } + // again, version increased from 5 to 6 still in luminous, so above + // applies. + if (struct_v >= 6) { + decode(crush_version, bl); + } + // version increase from 6 to 7 in mimic + if (struct_v >= 7) { + decode(new_removed_snaps, bl); + decode(new_purged_snaps, bl); + } + // version increase from 7 to 8, 8 to 9, in nautilus. 
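+    // (v9 is what added the last_up_change/last_in_change stamps decoded below)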
+ if (struct_v >= 9) { + decode(last_up_change, bl); + decode(last_in_change, bl); + } + DECODE_FINISH(bl); // client-usable data + } + + { + DECODE_START(9, bl); // extended, osd-only data + decode(osd_addrs->hb_back_addrs, bl); + decode(osd_info, bl); + decode(blacklist, bl); + decode(osd_addrs->cluster_addrs, bl); + decode(cluster_snapshot_epoch, bl); + decode(cluster_snapshot, bl); + decode(*osd_uuid, bl); + decode(osd_xinfo, bl); + decode(osd_addrs->hb_front_addrs, bl); + // + if (struct_v >= 2) { + decode(nearfull_ratio, bl); + decode(full_ratio, bl); + } else { + nearfull_ratio = 0; + full_ratio = 0; + } + if (struct_v >= 3) { + decode(backfillfull_ratio, bl); + } else { + backfillfull_ratio = 0; + } + if (struct_v == 4) { + string r; + decode(r, bl); + if (r.length()) + require_min_compat_client = ceph_release_from_name(r.c_str()); + } + if (struct_v >= 5) { + decode(require_min_compat_client, bl); + decode(require_osd_release, bl); + if (require_osd_release >= CEPH_RELEASE_NAUTILUS) { + flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT; + } + if (require_osd_release >= CEPH_RELEASE_LUMINOUS) { + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } + } else { + if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) { + // only for compat with post-kraken pre-luminous test clusters + require_osd_release = CEPH_RELEASE_LUMINOUS; + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) { + require_osd_release = CEPH_RELEASE_KRAKEN; + } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) { + require_osd_release = CEPH_RELEASE_JEWEL; + } else { + require_osd_release = 0; + } + } + if (struct_v >= 6) { + decode(removed_snaps_queue, bl); + } + if (struct_v >= 8) { + decode(crush_node_flags, bl); + } else { + crush_node_flags.clear(); + } + if (struct_v >= 9) { + decode(device_class_flags, bl); + } else { + device_class_flags.clear(); + } + DECODE_FINISH(bl); // osd-only data + } + + if (struct_v >= 8) { + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + decode(crc, bl); + tail_offset = bl.get_off(); + crc_defined = true; + } else { + crc_defined = false; + crc = 0; + } + + DECODE_FINISH(bl); // wrapper + + if (tail_offset) { + // verify crc + uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + bufferlist tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (crc != actual) { + ostringstream ss; + ss << "bad crc, actual " << actual << " != expected " << crc; + string s = ss.str(); + throw buffer::malformed_input(s.c_str()); + } + } + + post_decode(); +} + +void OSDMap::post_decode() +{ + // index pool names + name_pool.clear(); + for (const auto &pname : pool_name) { + name_pool[pname.second] = pname.first; + } + + calc_num_osds(); + _calc_up_osd_features(); +} + +void OSDMap::dump_erasure_code_profiles( + const mempool::osdmap::map<string,map<string,string>>& profiles, + Formatter *f) +{ + f->open_object_section("erasure_code_profiles"); + for (const auto &profile : profiles) { + f->open_object_section(profile.first.c_str()); + for (const auto &profm : profile.second) { + f->dump_string(profm.first.c_str(), profm.second.c_str()); + } + f->close_section(); + } + f->close_section(); +} + +void OSDMap::dump(Formatter *f) const +{ + f->dump_int("epoch", get_epoch()); + f->dump_stream("fsid") << get_fsid(); + f->dump_stream("created") << get_created(); + f->dump_stream("modified") << 
get_modified(); + f->dump_stream("last_up_change") << last_up_change; + f->dump_stream("last_in_change") << last_in_change; + f->dump_string("flags", get_flag_string()); + f->dump_unsigned("flags_num", flags); + f->open_array_section("flags_set"); + set<string> flagset; + get_flag_set(&flagset); + for (auto p : flagset) { + f->dump_string("flag", p); + } + f->close_section(); + f->dump_unsigned("crush_version", get_crush_version()); + f->dump_float("full_ratio", full_ratio); + f->dump_float("backfillfull_ratio", backfillfull_ratio); + f->dump_float("nearfull_ratio", nearfull_ratio); + f->dump_string("cluster_snapshot", get_cluster_snapshot()); + f->dump_int("pool_max", get_pool_max()); + f->dump_int("max_osd", get_max_osd()); + f->dump_string("require_min_compat_client", + ceph_release_name(require_min_compat_client)); + f->dump_string("min_compat_client", + ceph_release_name(get_min_compat_client())); + f->dump_string("require_osd_release", + ceph_release_name(require_osd_release)); + + f->open_array_section("pools"); + for (const auto &pool : pools) { + std::string name("<unknown>"); + const auto &pni = pool_name.find(pool.first); + if (pni != pool_name.end()) + name = pni->second; + f->open_object_section("pool"); + f->dump_int("pool", pool.first); + f->dump_string("pool_name", name); + pool.second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osds"); + for (int i=0; i<get_max_osd(); i++) + if (exists(i)) { + f->open_object_section("osd_info"); + f->dump_int("osd", i); + f->dump_stream("uuid") << get_uuid(i); + f->dump_int("up", is_up(i)); + f->dump_int("in", is_in(i)); + f->dump_float("weight", get_weightf(i)); + f->dump_float("primary_affinity", get_primary_affinityf(i)); + get_info(i).dump(f); + f->dump_object("public_addrs", get_addrs(i)); + f->dump_object("cluster_addrs", get_cluster_addrs(i)); + f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(i)); + f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(i)); + // compat + f->dump_stream("public_addr") << get_addrs(i).get_legacy_str(); + f->dump_stream("cluster_addr") << get_cluster_addrs(i).get_legacy_str(); + f->dump_stream("heartbeat_back_addr") + << get_hb_back_addrs(i).get_legacy_str(); + f->dump_stream("heartbeat_front_addr") + << get_hb_front_addrs(i).get_legacy_str(); + + set<string> st; + get_state(i, st); + f->open_array_section("state"); + for (const auto &state : st) + f->dump_string("state", state); + f->close_section(); + + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_xinfo"); + for (int i=0; i<get_max_osd(); i++) { + if (exists(i)) { + f->open_object_section("xinfo"); + f->dump_int("osd", i); + osd_xinfo[i].dump(f); + f->close_section(); + } + } + f->close_section(); + + f->open_array_section("pg_upmap"); + for (auto& p : pg_upmap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << p.first; + f->open_array_section("osds"); + for (auto q : p.second) { + f->dump_int("osd", q); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("pg_upmap_items"); + for (auto& p : pg_upmap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << p.first; + f->open_array_section("mappings"); + for (auto& q : p.second) { + f->open_object_section("mapping"); + f->dump_int("from", q.first); + f->dump_int("to", q.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("pg_temp"); + pg_temp->dump(f); + 
f->close_section(); + + f->open_array_section("primary_temp"); + for (const auto &pg : *primary_temp) { + f->dump_stream("pgid") << pg.first; + f->dump_int("osd", pg.second); + } + f->close_section(); // primary_temp + + f->open_object_section("blacklist"); + for (const auto &addr : blacklist) { + stringstream ss; + ss << addr.first; + f->dump_stream(ss.str().c_str()) << addr.second; + } + f->close_section(); + + dump_erasure_code_profiles(erasure_code_profiles, f); + + f->open_array_section("removed_snaps_queue"); + for (auto& p : removed_snaps_queue) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_removed_snaps"); + for (auto& p : new_removed_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_purged_snaps"); + for (auto& p : new_purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_object_section("crush_node_flags"); + for (auto& i : crush_node_flags) { + string s = crush->item_exists(i.first) ? crush->get_item_name(i.first) + : stringify(i.first); + f->open_array_section(s.c_str()); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_object_section("device_class_flags"); + for (auto& i : device_class_flags) { + const char* class_name = crush->get_class_name(i.first); + string s = class_name ? 
class_name : stringify(i.first); + f->open_array_section(s.c_str()); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); +} + +void OSDMap::generate_test_instances(list<OSDMap*>& o) +{ + o.push_back(new OSDMap); + + CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY); + o.push_back(new OSDMap); + uuid_d fsid; + o.back()->build_simple(cct, 1, fsid, 16); + o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp + o.back()->blacklist[entity_addr_t()] = utime_t(5, 6); + cct->put(); +} + +string OSDMap::get_flag_string(unsigned f) +{ + string s; + if ( f& CEPH_OSDMAP_NEARFULL) + s += ",nearfull"; + if (f & CEPH_OSDMAP_FULL) + s += ",full"; + if (f & CEPH_OSDMAP_PAUSERD) + s += ",pauserd"; + if (f & CEPH_OSDMAP_PAUSEWR) + s += ",pausewr"; + if (f & CEPH_OSDMAP_PAUSEREC) + s += ",pauserec"; + if (f & CEPH_OSDMAP_NOUP) + s += ",noup"; + if (f & CEPH_OSDMAP_NODOWN) + s += ",nodown"; + if (f & CEPH_OSDMAP_NOOUT) + s += ",noout"; + if (f & CEPH_OSDMAP_NOIN) + s += ",noin"; + if (f & CEPH_OSDMAP_NOBACKFILL) + s += ",nobackfill"; + if (f & CEPH_OSDMAP_NOREBALANCE) + s += ",norebalance"; + if (f & CEPH_OSDMAP_NORECOVER) + s += ",norecover"; + if (f & CEPH_OSDMAP_NOSCRUB) + s += ",noscrub"; + if (f & CEPH_OSDMAP_NODEEP_SCRUB) + s += ",nodeep-scrub"; + if (f & CEPH_OSDMAP_NOTIERAGENT) + s += ",notieragent"; + if (f & CEPH_OSDMAP_NOSNAPTRIM) + s += ",nosnaptrim"; + if (f & CEPH_OSDMAP_SORTBITWISE) + s += ",sortbitwise"; + if (f & CEPH_OSDMAP_REQUIRE_JEWEL) + s += ",require_jewel_osds"; + if (f & CEPH_OSDMAP_REQUIRE_KRAKEN) + s += ",require_kraken_osds"; + if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS) + s += ",require_luminous_osds"; + if (f & CEPH_OSDMAP_RECOVERY_DELETES) + s += ",recovery_deletes"; + if (f & CEPH_OSDMAP_PURGED_SNAPDIRS) + s += ",purged_snapdirs"; + if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT) + s += ",pglog_hardlimit"; + if (s.length()) + s.erase(0, 1); + return s; +} + +string OSDMap::get_flag_string() const +{ + return get_flag_string(flags); +} + +void OSDMap::print_pools(ostream& out) const +{ + for (const auto &pool : pools) { + std::string name("<unknown>"); + const auto &pni = pool_name.find(pool.first); + if (pni != pool_name.end()) + name = pni->second; + out << "pool " << pool.first + << " '" << name + << "' " << pool.second << "\n"; + + for (const auto &snap : pool.second.snaps) + out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n"; + + if (!pool.second.removed_snaps.empty()) + out << "\tremoved_snaps " << pool.second.removed_snaps << "\n"; + auto p = removed_snaps_queue.find(pool.first); + if (p != removed_snaps_queue.end()) { + out << "\tremoved_snaps_queue " << p->second << "\n"; + } + } + out << std::endl; +} + +void OSDMap::print(ostream& out) const +{ + out << "epoch " << get_epoch() << "\n" + << "fsid " << get_fsid() << "\n" + << "created " << get_created() << "\n" + << "modified " << get_modified() << "\n"; + + out << "flags " << get_flag_string() << "\n"; + out << "crush_version " << get_crush_version() << "\n"; + out << "full_ratio " << full_ratio << "\n"; + out << "backfillfull_ratio " << backfillfull_ratio << "\n"; + out << "nearfull_ratio " << nearfull_ratio << "\n"; + if (require_min_compat_client > 0) { + out << "require_min_compat_client " + << ceph_release_name(require_min_compat_client) << "\n"; + } + out << "min_compat_client " << ceph_release_name(get_min_compat_client()) + << "\n"; + if 
(require_osd_release > 0) { + out << "require_osd_release " << ceph_release_name(require_osd_release) + << "\n"; + } + if (get_cluster_snapshot().length()) + out << "cluster_snapshot " << get_cluster_snapshot() << "\n"; + out << "\n"; + + print_pools(out); + + out << "max_osd " << get_max_osd() << "\n"; + for (int i=0; i<get_max_osd(); i++) { + if (exists(i)) { + out << "osd." << i; + out << (is_up(i) ? " up ":" down"); + out << (is_in(i) ? " in ":" out"); + out << " weight " << get_weightf(i); + if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) + out << " primary_affinity " << get_primary_affinityf(i); + const osd_info_t& info(get_info(i)); + out << " " << info; + out << " " << get_addrs(i) << " " << get_cluster_addrs(i); + set<string> st; + get_state(i, st); + out << " " << st; + if (!get_uuid(i).is_zero()) + out << " " << get_uuid(i); + out << "\n"; + } + } + out << std::endl; + + for (auto& p : pg_upmap) { + out << "pg_upmap " << p.first << " " << p.second << "\n"; + } + for (auto& p : pg_upmap_items) { + out << "pg_upmap_items " << p.first << " " << p.second << "\n"; + } + + for (const auto pg : *pg_temp) + out << "pg_temp " << pg.first << " " << pg.second << "\n"; + + for (const auto pg : *primary_temp) + out << "primary_temp " << pg.first << " " << pg.second << "\n"; + + for (const auto &addr : blacklist) + out << "blacklist " << addr.first << " expires " << addr.second << "\n"; +} + +class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> { +public: + typedef CrushTreeDumper::Dumper<TextTable> Parent; + + OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + unsigned f) + : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { } + + bool should_dump_leaf(int i) const override { + if (!filter) { + return true; // normal case + } + if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || + ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) || + ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) || + ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) || + ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) { + return true; + } + return false; + } + + bool should_dump_empty_bucket() const override { + return !filter; + } + + void init_table(TextTable *tbl) { + tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT); + tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT); + } + void dump(TextTable *tbl, string& bucket) { + init_table(tbl); + + if (!bucket.empty()) { + set_root(bucket); + Parent::dump(tbl); + } else { + Parent::dump(tbl); + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) { + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl); + } + } + } + } + +protected: + void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override { + const char *c = crush->get_item_class(qi.id); + if (!c) + c = ""; + *tbl << qi.id + << c + << weightf_t(qi.weight); + + ostringstream name; + for (int k = 0; k < qi.depth; k++) + name << " "; + if (qi.is_bucket()) { + name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " " + << crush->get_item_name(qi.id); + } else { + name << "osd." 
<< qi.id; + } + *tbl << name.str(); + + if (!qi.is_bucket()) { + if (!osdmap->exists(qi.id)) { + *tbl << "DNE" + << 0; + } else { + string s; + if (osdmap->is_up(qi.id)) { + s = "up"; + } else if (osdmap->is_destroyed(qi.id)) { + s = "destroyed"; + } else { + s = "down"; + } + *tbl << s + << weightf_t(osdmap->get_weightf(qi.id)) + << weightf_t(osdmap->get_primary_affinityf(qi.id)); + } + } + *tbl << TextTable::endrow; + } + +private: + const OSDMap *osdmap; + const unsigned filter; +}; + +class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper { +public: + typedef CrushTreeDumper::FormattingDumper Parent; + + OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + unsigned f) + : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { } + + bool should_dump_leaf(int i) const override { + if (!filter) { + return true; // normal case + } + if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || + ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) || + ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) || + ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) || + ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) { + return true; + } + return false; + } + + bool should_dump_empty_bucket() const override { + return !filter; + } + + void dump(Formatter *f, string& bucket) { + if (!bucket.empty()) { + set_root(bucket); + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + } else { + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + f->open_array_section("stray"); + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f); + } + f->close_section(); + } + } + +protected: + void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override { + Parent::dump_item_fields(qi, f); + if (!qi.is_bucket()) + { + string s; + if (osdmap->is_up(qi.id)) { + s = "up"; + } else if (osdmap->is_destroyed(qi.id)) { + s = "destroyed"; + } else { + s = "down"; + } + f->dump_unsigned("exists", (int)osdmap->exists(qi.id)); + f->dump_string("status", s); + f->dump_float("reweight", osdmap->get_weightf(qi.id)); + f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id)); + } + } + +private: + const OSDMap *osdmap; + const unsigned filter; +}; + +void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const +{ + if (f) { + OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket); + } else { + ceph_assert(out); + TextTable tbl; + OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket); + *out << tbl; + } +} + +void OSDMap::print_summary(Formatter *f, ostream& out, + const string& prefix, bool extra) const +{ + if (f) { + f->open_object_section("osdmap"); + f->dump_int("epoch", get_epoch()); + f->dump_int("num_osds", get_num_osds()); + f->dump_int("num_up_osds", get_num_up_osds()); + f->dump_int("num_in_osds", get_num_in_osds()); + f->dump_unsigned("num_remapped_pgs", get_num_pg_temp()); + f->close_section(); + } else { + utime_t now = ceph_clock_now(); + out << get_num_osds() << " osds: " + << get_num_up_osds() << " up"; + if (last_up_change != utime_t()) { + out << " (since " << utimespan_str(now - last_up_change) << ")"; + } + out << ", " << get_num_in_osds() << " in"; + if (last_in_change != utime_t()) { + out << " (since " << utimespan_str(now - last_in_change) << ")"; + } + if (extra) + out << "; epoch: e" << get_epoch(); + if 
(get_num_pg_temp())
+      out << "; " << get_num_pg_temp() << " remapped pgs";
+    out << "\n";
+    uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
+    if (important_flags)
+      out << prefix << "flags " << get_flag_string(important_flags) << "\n";
+  }
+}
+
+void OSDMap::print_oneline_summary(ostream& out) const
+{
+  out << "e" << get_epoch() << ": "
+      << get_num_osds() << " total, "
+      << get_num_up_osds() << " up, "
+      << get_num_in_osds() << " in";
+}
+
+bool OSDMap::crush_rule_in_use(int rule_id) const
+{
+  for (const auto &pool : pools) {
+    if (pool.second.crush_rule == rule_id)
+      return true;
+  }
+  return false;
+}
+
+int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
+                                 ostream *ss) const
+{
+  for (auto& i : pools) {
+    auto& pool = i.second;
+    int ruleno = pool.get_crush_rule();
+    if (!newcrush->rule_exists(ruleno)) {
+      *ss << "pool " << i.first << " references crush_rule " << ruleno
+          << " but it is not present";
+      return -EINVAL;
+    }
+    if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
+      *ss << "rule " << ruleno << " mask ruleset does not match rule id";
+      return -EINVAL;
+    }
+    if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
+      *ss << "pool " << i.first << " type does not match rule " << ruleno;
+      return -EINVAL;
+    }
+    int poolsize = pool.get_size();
+    if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
+        poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
+      *ss << "pool " << i.first << " size " << poolsize << " does not"
+          << " fall within rule " << ruleno
+          << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
+          << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
+                                  int nosd, int pg_bits, int pgp_bits,
+                                  bool default_pool)
+{
+  ldout(cct, 10) << "build_simple on " << nosd
+                 << " osds" << dendl;
+  epoch = e;
+  set_fsid(fsid);
+  created = modified = ceph_clock_now();
+
+  if (nosd >= 0) {
+    set_max_osd(nosd);
+  } else {
+    // count osds
+    int maxosd = 0;
+    const auto& conf = cct->_conf;
+    vector<string> sections;
+    conf.get_all_sections(sections);
+
+    for (auto &section : sections) {
+      if (section.find("osd.") != 0)
+        continue;
+
+      const char *begin = section.c_str() + 4;
+      char *end = (char*)begin;
+      int o = strtol(begin, &end, 10);
+      if (*end != '\0')
+        continue;
+
+      if (o > cct->_conf->mon_max_osd) {
+        lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
+        return -ERANGE;
+      }
+
+      if (o > maxosd)
+        maxosd = o;
+    }
+
+    set_max_osd(maxosd + 1);
+  }
+
+
+  stringstream ss;
+  int r;
+  if (nosd >= 0)
+    r = build_simple_crush_map(cct, *crush, nosd, &ss);
+  else
+    r = build_simple_crush_map_from_conf(cct, *crush, &ss);
+  ceph_assert(r == 0);
+
+  int poolbase = get_max_osd() ?
get_max_osd() : 1; + + const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct); + ceph_assert(default_replicated_rule >= 0); + + if (default_pool) { + // pgp_num <= pg_num + if (pgp_bits > pg_bits) + pgp_bits = pg_bits; + + vector<string> pool_names; + pool_names.push_back("rbd"); + for (auto &plname : pool_names) { + int64_t pool = ++pool_max; + pools[pool].type = pg_pool_t::TYPE_REPLICATED; + pools[pool].flags = cct->_conf->osd_pool_default_flags; + if (cct->_conf->osd_pool_default_flag_hashpspool) + pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL); + if (cct->_conf->osd_pool_default_flag_nodelete) + pools[pool].set_flag(pg_pool_t::FLAG_NODELETE); + if (cct->_conf->osd_pool_default_flag_nopgchange) + pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE); + if (cct->_conf->osd_pool_default_flag_nosizechange) + pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE); + pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size"); + pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size( + pools[pool].size); + pools[pool].crush_rule = default_replicated_rule; + pools[pool].object_hash = CEPH_STR_HASH_RJENKINS; + pools[pool].set_pg_num(poolbase << pg_bits); + pools[pool].set_pgp_num(poolbase << pgp_bits); + pools[pool].set_pg_num_target(poolbase << pg_bits); + pools[pool].set_pgp_num_target(poolbase << pgp_bits); + pools[pool].last_change = epoch; + pools[pool].application_metadata.insert( + {pg_pool_t::APPLICATION_NAME_RBD, {}}); + auto m = pg_pool_t::get_pg_autoscale_mode_by_name( + cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode")); + pools[pool].pg_autoscale_mode = m >= 0 ? m : 0; + pool_name[pool] = plname; + name_pool[plname] = pool; + } + } + + for (int i=0; i<get_max_osd(); i++) { + set_state(i, 0); + set_weight(i, CEPH_OSD_OUT); + } + + map<string,string> profile_map; + r = get_erasure_code_profile_default(cct, profile_map, &ss); + if (r < 0) { + lderr(cct) << ss.str() << dendl; + return r; + } + set_erasure_code_profile("default", profile_map); + return 0; +} + +int OSDMap::get_erasure_code_profile_default(CephContext *cct, + map<string,string> &profile_map, + ostream *ss) +{ + int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"), + *ss, + &profile_map); + return r; +} + +int OSDMap::_build_crush_types(CrushWrapper& crush) +{ + crush.set_type_name(0, "osd"); + crush.set_type_name(1, "host"); + crush.set_type_name(2, "chassis"); + crush.set_type_name(3, "rack"); + crush.set_type_name(4, "row"); + crush.set_type_name(5, "pdu"); + crush.set_type_name(6, "pod"); + crush.set_type_name(7, "room"); + crush.set_type_name(8, "datacenter"); + crush.set_type_name(9, "zone"); + crush.set_type_name(10, "region"); + crush.set_type_name(11, "root"); + return 11; +} + +int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush, + int nosd, ostream *ss) +{ + crush.create(); + + // root + int root_type = _build_crush_types(crush); + int rootid; + int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT, + root_type, 0, NULL, NULL, &rootid); + ceph_assert(r == 0); + crush.set_item_name(rootid, "default"); + + for (int o=0; o<nosd; o++) { + map<string,string> loc; + loc["host"] = "localhost"; + loc["rack"] = "localrack"; + loc["root"] = "default"; + ldout(cct, 10) << " adding osd." 
<< o << " at " << loc << dendl;
+    char name[32];
+    snprintf(name, sizeof(name), "osd.%d", o);
+    crush.insert_item(cct, o, 1.0, name, loc);
+  }
+
+  build_simple_crush_rules(cct, crush, "default", ss);
+
+  crush.finalize();
+
+  return 0;
+}
+
+int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
+                                             CrushWrapper& crush,
+                                             ostream *ss)
+{
+  const auto& conf = cct->_conf;
+
+  crush.create();
+
+  // root
+  int root_type = _build_crush_types(crush);
+  int rootid;
+  int r = crush.add_bucket(0, 0,
+                           CRUSH_HASH_DEFAULT,
+                           root_type, 0, NULL, NULL, &rootid);
+  ceph_assert(r == 0);
+  crush.set_item_name(rootid, "default");
+
+  // add osds
+  vector<string> sections;
+  conf.get_all_sections(sections);
+
+  for (auto &section : sections) {
+    if (section.find("osd.") != 0)
+      continue;
+
+    const char *begin = section.c_str() + 4;
+    char *end = (char*)begin;
+    int o = strtol(begin, &end, 10);
+    if (*end != '\0')
+      continue;
+
+    string host, rack, row, room, dc, pool;
+    vector<string> sectiontmp;
+    sectiontmp.push_back("osd");
+    sectiontmp.push_back(section);
+    conf.get_val_from_conf_file(sectiontmp, "host", host, false);
+    conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
+    conf.get_val_from_conf_file(sectiontmp, "row", row, false);
+    conf.get_val_from_conf_file(sectiontmp, "room", room, false);
+    conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
+    conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
+
+    if (host.length() == 0)
+      host = "unknownhost";
+    if (rack.length() == 0)
+      rack = "unknownrack";
+
+    map<string,string> loc;
+    loc["host"] = host;
+    loc["rack"] = rack;
+    if (row.size())
+      loc["row"] = row;
+    if (room.size())
+      loc["room"] = room;
+    if (dc.size())
+      loc["datacenter"] = dc;
+    loc["root"] = "default";
+
+    ldout(cct, 5) << " adding osd."
<< o << " at " << loc << dendl; + crush.insert_item(cct, o, 1.0, section, loc); + } + + build_simple_crush_rules(cct, crush, "default", ss); + + crush.finalize(); + + return 0; +} + + +int OSDMap::build_simple_crush_rules( + CephContext *cct, + CrushWrapper& crush, + const string& root, + ostream *ss) +{ + int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct); + string failure_domain = + crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type); + + int r; + r = crush.add_simple_rule_at( + "replicated_rule", root, failure_domain, "", + "firstn", pg_pool_t::TYPE_REPLICATED, + crush_rule, ss); + if (r < 0) + return r; + // do not add an erasure rule by default or else we will implicitly + // require the crush_v2 feature of clients + return 0; +} + +int OSDMap::summarize_mapping_stats( + OSDMap *newmap, + const set<int64_t> *pools, + std::string *out, + Formatter *f) const +{ + set<int64_t> ls; + if (pools) { + ls = *pools; + } else { + for (auto &p : get_pools()) + ls.insert(p.first); + } + + unsigned total_pg = 0; + unsigned moved_pg = 0; + vector<unsigned> base_by_osd(get_max_osd(), 0); + vector<unsigned> new_by_osd(get_max_osd(), 0); + for (int64_t pool_id : ls) { + const pg_pool_t *pi = get_pg_pool(pool_id); + vector<int> up, up2; + int up_primary; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, pool_id); + total_pg += pi->get_size(); + pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr); + for (int osd : up) { + if (osd >= 0 && osd < get_max_osd()) + ++base_by_osd[osd]; + } + if (newmap) { + newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr); + for (int osd : up2) { + if (osd >= 0 && osd < get_max_osd()) + ++new_by_osd[osd]; + } + if (pi->type == pg_pool_t::TYPE_ERASURE) { + for (unsigned i=0; i<up.size(); ++i) { + if (up[i] != up2[i]) { + ++moved_pg; + } + } + } else if (pi->type == pg_pool_t::TYPE_REPLICATED) { + for (int osd : up) { + if (std::find(up2.begin(), up2.end(), osd) == up2.end()) { + ++moved_pg; + } + } + } else { + ceph_abort_msg("unhandled pool type"); + } + } + } + } + + unsigned num_up_in = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) + ++num_up_in; + } + if (!num_up_in) { + return -EINVAL; + } + + float avg_pg = (float)total_pg / (float)num_up_in; + float base_stddev = 0, new_stddev = 0; + int min = -1, max = -1; + unsigned min_base_pg = 0, max_base_pg = 0; + unsigned min_new_pg = 0, max_new_pg = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) { + float base_diff = (float)base_by_osd[osd] - avg_pg; + base_stddev += base_diff * base_diff; + float new_diff = (float)new_by_osd[osd] - avg_pg; + new_stddev += new_diff * new_diff; + if (min < 0 || base_by_osd[osd] < min_base_pg) { + min = osd; + min_base_pg = base_by_osd[osd]; + min_new_pg = new_by_osd[osd]; + } + if (max < 0 || base_by_osd[osd] > max_base_pg) { + max = osd; + max_base_pg = base_by_osd[osd]; + max_new_pg = new_by_osd[osd]; + } + } + } + base_stddev = sqrt(base_stddev / num_up_in); + new_stddev = sqrt(new_stddev / num_up_in); + + float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in))); + + ostringstream ss; + if (f) + f->open_object_section("utilization"); + if (newmap) { + if (f) { + f->dump_unsigned("moved_pgs", moved_pg); + f->dump_unsigned("total_pgs", total_pg); + } else { + float percent = 0; + if (total_pg) + percent = (float)moved_pg * 100.0 / (float)total_pg; + ss << "moved " << moved_pg << " / " << total_pg + << " (" << percent << "%)\n"; + } 
+ } + if (f) { + f->dump_float("avg_pgs", avg_pg); + f->dump_float("std_dev", base_stddev); + f->dump_float("expected_baseline_std_dev", edev); + if (newmap) + f->dump_float("new_std_dev", new_stddev); + } else { + ss << "avg " << avg_pg << "\n"; + ss << "stddev " << base_stddev; + if (newmap) + ss << " -> " << new_stddev; + ss << " (expected baseline " << edev << ")\n"; + } + if (min >= 0) { + if (f) { + f->dump_unsigned("min_osd", min); + f->dump_unsigned("min_osd_pgs", min_base_pg); + if (newmap) + f->dump_unsigned("new_min_osd_pgs", min_new_pg); + } else { + ss << "min osd." << min << " with " << min_base_pg; + if (newmap) + ss << " -> " << min_new_pg; + ss << " pgs (" << (float)min_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)min_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (max >= 0) { + if (f) { + f->dump_unsigned("max_osd", max); + f->dump_unsigned("max_osd_pgs", max_base_pg); + if (newmap) + f->dump_unsigned("new_max_osd_pgs", max_new_pg); + } else { + ss << "max osd." << max << " with " << max_base_pg; + if (newmap) + ss << " -> " << max_new_pg; + ss << " pgs (" << (float)max_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)max_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (f) + f->close_section(); + if (out) + *out = ss.str(); + return 0; +} + +bool OSDMap::try_pg_upmap( + CephContext *cct, + pg_t pg, ///< pg to potentially remap + const set<int>& overfull, ///< osds we'd want to evacuate + const vector<int>& underfull, ///< osds to move to, in order of preference + const vector<int>& more_underfull, ///< more osds only slightly underfull + vector<int> *orig, + vector<int> *out) ///< resulting alternative mapping +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) + return false; + int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(), + pool->get_size()); + if (rule < 0) + return false; + + // make sure there is something there to remap + bool any = false; + for (auto osd : *orig) { + if (overfull.count(osd)) { + any = true; + break; + } + } + if (!any) { + return false; + } + + int r = crush->try_remap_rule( + cct, + rule, + pool->get_size(), + overfull, underfull, + more_underfull, + *orig, + out); + if (r < 0) + return false; + if (*out == *orig) + return false; + return true; +} + +int OSDMap::calc_pg_upmaps( + CephContext *cct, + uint32_t max_deviation, + int max, + const set<int64_t>& only_pools, + OSDMap::Incremental *pending_inc) +{ + ldout(cct, 10) << __func__ << " pools " << only_pools << dendl; + OSDMap tmp; + // Can't be less than 1 pg + if (max_deviation < 1) + max_deviation = 1; + tmp.deepish_copy_from(*this); + int num_changed = 0; + map<int,set<pg_t>> pgs_by_osd; + int total_pgs = 0; + float osd_weight_total = 0; + map<int,float> osd_weight; + for (auto& i : pools) { + if (!only_pools.empty() && !only_pools.count(i.first)) + continue; + for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) { + pg_t pg(ps, i.first); + vector<int> up; + tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); + ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl; + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + } + total_pgs += i.second.get_size() * i.second.get_pg_num(); + + map<int,float> pmap; + int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(), + i.second.get_type(), + i.second.get_size()); + tmp.crush->get_rule_weight_osd_map(ruleno, &pmap); + ldout(cct,20) << __func__ << " pool " << i.first + << " ruleno " << ruleno + << " weight-map " << pmap 
+ << dendl; + for (auto p : pmap) { + auto adjusted_weight = tmp.get_weightf(p.first) * p.second; + if (adjusted_weight == 0) { + continue; + } + osd_weight[p.first] += adjusted_weight; + osd_weight_total += adjusted_weight; + } + } + for (auto& i : osd_weight) { + int pgs = 0; + auto p = pgs_by_osd.find(i.first); + if (p != pgs_by_osd.end()) + pgs = p->second.size(); + else + pgs_by_osd.emplace(i.first, set<pg_t>()); + ldout(cct, 20) << " osd." << i.first << " weight " << i.second + << " pgs " << pgs << dendl; + } + if (osd_weight_total == 0) { + lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl; + return 0; + } + float pgs_per_weight = total_pgs / osd_weight_total; + ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl; + ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl; + + if (max <= 0) { + lderr(cct) << __func__ << " abort due to max <= 0" << dendl; + return 0; + } + float stddev = 0; + map<int,float> osd_deviation; // osd, deviation(pgs) + multimap<float,int> deviation_osd; // deviation(pgs), osd + float cur_max_deviation = 0; + for (auto& i : pgs_by_osd) { + // make sure osd is still there (belongs to this crush-tree) + ceph_assert(osd_weight.count(i.first)); + float target = osd_weight[i.first] * pgs_per_weight; + float deviation = (float)i.second.size() - target; + ldout(cct, 20) << " osd." << i.first + << "\tpgs " << i.second.size() + << "\ttarget " << target + << "\tdeviation " << deviation + << dendl; + osd_deviation[i.first] = deviation; + deviation_osd.insert(make_pair(deviation, i.first)); + stddev += deviation * deviation; + if (fabsf(deviation) > cur_max_deviation) + cur_max_deviation = fabsf(deviation); + } + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { + ldout(cct, 10) << __func__ << " distribution is almost perfect" + << dendl; + return 0; + } + bool skip_overfull = false; + auto aggressive = + cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively"); + auto local_fallback_retries = + cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries"); + while (max--) { + ldout(cct, 30) << "Top of loop #" << max+1 << dendl; + // build overfull and underfull + set<int> overfull; + set<int> more_overfull; + bool using_more_overfull = false; + vector<int> underfull; + vector<int> more_underfull; + for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) { + ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl; + if (i->first <= 0) + break; + if (i->first > max_deviation) { + ldout(cct, 30) << " add overfull osd." << i->second << dendl; + overfull.insert(i->second); + } else { + more_overfull.insert(i->second); + } + } + + for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) { + ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl; + if (i->first >= 0) + break; + if (i->first < -(int)max_deviation) { + ldout(cct, 30) << " add underfull osd." 
<< i->second << dendl; + underfull.push_back(i->second); + } else { + more_underfull.push_back(i->second); + } + } + if (underfull.empty() && overfull.empty()) { + ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl; + break; + } + if (overfull.empty() && !underfull.empty()) { + ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl; + overfull = more_overfull; + using_more_overfull = true; + } + + ldout(cct, 10) << " overfull " << overfull + << " underfull " << underfull + << dendl; + set<pg_t> to_skip; + uint64_t local_fallback_retried = 0; + + retry: + + set<pg_t> to_unmap; + map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap; + auto temp_pgs_by_osd = pgs_by_osd; + // always start with fullest, break if we find any changes to make + for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) { + if (skip_overfull && !underfull.empty()) { + ldout(cct, 10) << " skipping overfull " << dendl; + break; // fall through to check underfull + } + int osd = p->second; + float deviation = p->first; + float target = osd_weight[osd] * pgs_per_weight; + ldout(cct, 10) << " Overfull search osd." << osd + << " target " << target + << " deviation " << deviation + << dendl; + ceph_assert(target > 0); + if (!using_more_overfull && deviation <= max_deviation) { + ldout(cct, 10) << " osd." << osd + << " target " << target + << " deviation " << deviation + << " < max deviation " << max_deviation + << dendl; + break; + } + + vector<pg_t> pgs; + pgs.reserve(pgs_by_osd[osd].size()); + for (auto& pg : pgs_by_osd[osd]) { + if (to_skip.count(pg)) + continue; + pgs.push_back(pg); + } + if (aggressive) { + // shuffle PG list so they all get equal (in)attention + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(pgs.begin(), pgs.end(), rng); + } + // look for remaps we can un-remap + for (auto pg : pgs) { + auto p = tmp.pg_upmap_items.find(pg); + if (p == tmp.pg_upmap_items.end()) + continue; + mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items; + for (auto q : p->second) { + if (q.second == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << q.first << " -> " << q.second + << " which remapped " << pg + << " into overfull osd." << osd + << dendl; + temp_pgs_by_osd[q.second].erase(pg); + temp_pgs_by_osd[q.first].insert(pg); + } else { + new_upmap_items.push_back(q); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << p->second + << " remapped " << pg << " into overfull osd." << osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + goto test_change; + } else if (new_upmap_items.size() != p->second.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < p->second.size()); + ldout(cct, 10) << " existing pg_upmap_items " << p->second + << " remapped " << pg << " into overfull osd." 
<< osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } + + // try upmap + for (auto pg : pgs) { + auto temp_it = tmp.pg_upmap.find(pg); + if (temp_it != tmp.pg_upmap.end()) { + // leave pg_upmap alone + // it must be specified by admin since balancer does not + // support pg_upmap yet + ldout(cct, 10) << " " << pg << " already has pg_upmap " + << temp_it->second << ", skipping" + << dendl; + continue; + } + auto pg_pool_size = tmp.get_pg_pool_size(pg); + mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items; + set<int> existing; + auto it = tmp.pg_upmap_items.find(pg); + if (it != tmp.pg_upmap_items.end() && + it->second.size() >= (size_t)pg_pool_size) { + ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items " + << it->second << ", skipping" + << dendl; + continue; + } else if (it != tmp.pg_upmap_items.end()) { + ldout(cct, 10) << " " << pg << " already has pg_upmap_items " + << it->second + << dendl; + new_upmap_items = it->second; + // build existing too (for dedup) + for (auto i : it->second) { + existing.insert(i.first); + existing.insert(i.second); + } + // fall through + // to see if we can append more remapping pairs + } + ldout(cct, 10) << " trying " << pg << dendl; + vector<int> raw, orig, out; + tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too + if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) { + continue; + } + ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl; + if (orig.size() != out.size()) { + continue; + } + ceph_assert(orig != out); + int pos = -1; + float max_dev = 0; + for (unsigned i = 0; i < out.size(); ++i) { + if (orig[i] == out[i]) + continue; // skip invalid remappings + if (existing.count(orig[i]) || existing.count(out[i])) + continue; // we want new remappings only! + if (osd_deviation[orig[i]] > max_dev) { + max_dev = osd_deviation[orig[i]]; + pos = i; + ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl; + } + } + if (pos != -1) { + int i = pos; + ldout(cct, 10) << " will try adding new remapping pair " + << orig[i] << " -> " << out[i] << " for " << pg + << (orig[i] != osd ? " NOT selected osd" : "") + << dendl; + existing.insert(orig[i]); + existing.insert(out[i]); + temp_pgs_by_osd[orig[i]].erase(pg); + temp_pgs_by_osd[out[i]].insert(pg); + ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size); + new_upmap_items.push_back(make_pair(orig[i], out[i])); + // append new remapping pairs slowly + // This way we can make sure that each tiny change will + // definitely make distribution of PGs converging to + // the perfect status. + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } + } + + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for overfull osds" + << dendl; + for (auto& p : deviation_osd) { + if (std::find(underfull.begin(), underfull.end(), p.second) == + underfull.end()) + break; + int osd = p.second; + float deviation = p.first; + float target = osd_weight[osd] * pgs_per_weight; + ceph_assert(target > 0); + if (fabsf(deviation) < max_deviation) { + // respect max_deviation too + ldout(cct, 10) << " osd." 
<< osd + << " target " << target + << " deviation " << deviation + << " -> absolute " << fabsf(deviation) + << " < max " << max_deviation + << dendl; + break; + } + // look for remaps we can un-remap + vector<pair<pg_t, + mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates; + candidates.reserve(tmp.pg_upmap_items.size()); + for (auto& i : tmp.pg_upmap_items) { + if (to_skip.count(i.first)) + continue; + if (!only_pools.empty() && !only_pools.count(i.first.pool())) + continue; + candidates.push_back(make_pair(i.first, i.second)); + } + if (aggressive) { + // shuffle candidates so they all get equal (in)attention + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(candidates.begin(), candidates.end(), rng); + } + for (auto& i : candidates) { + auto pg = i.first; + mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items; + for (auto& j : i.second) { + if (j.first == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << j.first << " -> " << j.second + << " which remapped " << pg + << " out from underfull osd." << osd + << dendl; + temp_pgs_by_osd[j.second].erase(pg); + temp_pgs_by_osd[j.first].insert(pg); + } else { + new_upmap_items.push_back(j); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << i.second + << " remapped " << pg + << " out from underfull osd." << osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + goto test_change; + } else if (new_upmap_items.size() != i.second.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < i.second.size()); + ldout(cct, 10) << " existing pg_upmap_items " << i.second + << " remapped " << pg + << " out from underfull osd." << osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } + } + + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for underfull osds" + << dendl; + if (!aggressive) { + ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl; + break; + } else if (!skip_overfull) { + // safe to quit because below here we know + // we've done checking both overfull and underfull osds.. + ldout(cct, 10) << " break due to not being able to find any" + << " further optimizations" + << dendl; + break; + } + // restart with fullest and do exhaustive searching + skip_overfull = false; + continue; + + test_change: + + // test change, apply if change is good + ceph_assert(to_unmap.size() || to_upmap.size()); + float new_stddev = 0; + map<int,float> temp_osd_deviation; + multimap<float,int> temp_deviation_osd; + float cur_max_deviation = 0; + for (auto& i : temp_pgs_by_osd) { + // make sure osd is still there (belongs to this crush-tree) + ceph_assert(osd_weight.count(i.first)); + float target = osd_weight[i.first] * pgs_per_weight; + float deviation = (float)i.second.size() - target; + ldout(cct, 20) << " osd." 
<< i.first + << "\tpgs " << i.second.size() + << "\ttarget " << target + << "\tdeviation " << deviation + << dendl; + temp_osd_deviation[i.first] = deviation; + temp_deviation_osd.insert(make_pair(deviation, i.first)); + new_stddev += deviation * deviation; + if (fabsf(deviation) > cur_max_deviation) + cur_max_deviation = fabsf(deviation); + } + ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl; + if (new_stddev >= stddev) { + if (!aggressive) { + ldout(cct, 10) << " break because stddev is not decreasing" + << " and aggressive mode is not enabled" + << dendl; + break; + } + local_fallback_retried++; + if (local_fallback_retried >= local_fallback_retries) { + // does not make progress + // flip *skip_overfull* so both overfull and underfull + // get equal (in)attention + skip_overfull = !skip_overfull; + ldout(cct, 10) << " hit local_fallback_retries " + << local_fallback_retries + << dendl; + continue; + } + for (auto& i : to_unmap) + to_skip.insert(i); + for (auto& i : to_upmap) + to_skip.insert(i.first); + ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried + << " to_skip " << to_skip + << dendl; + goto retry; + } + + // ready to go + ceph_assert(new_stddev < stddev); + stddev = new_stddev; + pgs_by_osd = temp_pgs_by_osd; + osd_deviation = temp_osd_deviation; + deviation_osd = temp_deviation_osd; + for (auto& i : to_unmap) { + ldout(cct, 10) << " unmap pg " << i << dendl; + ceph_assert(tmp.pg_upmap_items.count(i)); + tmp.pg_upmap_items.erase(i); + pending_inc->old_pg_upmap_items.insert(i); + ++num_changed; + } + for (auto& i : to_upmap) { + ldout(cct, 10) << " upmap pg " << i.first + << " new pg_upmap_items " << i.second + << dendl; + tmp.pg_upmap_items[i.first] = i.second; + pending_inc->new_pg_upmap_items[i.first] = i.second; + ++num_changed; + } + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { + ldout(cct, 10) << __func__ << " Optimization plan is almost perfect" + << dendl; + break; + } + } + ldout(cct, 10) << " num_changed = " << num_changed << dendl; + return num_changed; +} + +int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const +{ + return crush->get_leaves(name, osds); +} + +// get pools whose crush rules might reference the given osd +void OSDMap::get_pool_ids_by_osd(CephContext *cct, + int osd, + set<int64_t> *pool_ids) const +{ + ceph_assert(pool_ids); + set<int> raw_rules; + int r = crush->get_rules_by_osd(osd, &raw_rules); + if (r < 0) { + lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r) + << dendl; + ceph_assert(r >= 0); + } + set<int> rules; + for (auto &i: raw_rules) { + // exclude any dead rule + if (crush_rule_in_use(i)) { + rules.insert(i); + } + } + for (auto &r: rules) { + get_pool_ids_by_rule(r, pool_ids); + } +} + +template <typename F> +class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> { +public: + typedef CrushTreeDumper::Dumper<F> Parent; + + OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + const PGMap& pgmap_, bool tree_, + const string& class_name_, + const string& item_name_) : + Parent(crush, osdmap_->get_pool_names()), + osdmap(osdmap_), + pgmap(pgmap_), + tree(tree_), + class_name(class_name_), + item_name(item_name_), + min_var(-1), + max_var(-1), + stddev(0), + sum(0) { + if (osdmap->crush->name_exists(item_name)) { + // filter out items we are allowed to dump + auto item_id = osdmap->crush->get_item_id(item_name); + allowed.insert(item_id); 
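+ // 'allowed' is seeded with the named item itself; the call below expands it
+ // to every CRUSH descendant of that item, so should_dump() can restrict the
+ // report to just that subtree.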
+ osdmap->crush->get_all_children(item_id, &allowed); + } + average_util = average_utilization(); + } + +protected: + + bool should_dump(int id) const { + if (!allowed.empty() && !allowed.count(id)) // filter by name + return false; + if (id >= 0 && !class_name.empty()) { + const char* item_class_name = osdmap->crush->get_item_class(id); + if (!item_class_name || // not bound to a class yet + item_class_name != class_name) // or already bound to + // a different class + return false; + } + return true; + } + + set<int> get_dumped_osds() { + if (class_name.empty() && item_name.empty()) { + // old way, all + return {}; + } + return dumped_osds; + } + + void dump_stray(F *f) { + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !this->is_touched(i)) + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f); + } + } + + void dump_item(const CrushTreeDumper::Item &qi, F *f) override { + if (!tree && qi.is_bucket()) + return; + if (!should_dump(qi.id)) + return; + + if (!qi.is_bucket()) + dumped_osds.insert(qi.id); + float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id); + int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0, + kb_used_meta = 0, kb_avail = 0; + double util = 0; + if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data, + &kb_used_omap, &kb_used_meta, &kb_avail)) + if (kb_used && kb) + util = 100.0 * (double)kb_used / (double)kb; + + double var = 1.0; + if (average_util) + var = util / average_util; + + size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id); + + dump_item(qi, reweight, kb, kb_used, + kb_used_data, kb_used_omap, kb_used_meta, + kb_avail, util, var, num_pgs, f); + + if (!qi.is_bucket() && reweight > 0) { + if (min_var < 0 || var < min_var) + min_var = var; + if (max_var < 0 || var > max_var) + max_var = var; + + double dev = util - average_util; + dev *= dev; + stddev += reweight * dev; + sum += reweight; + } + } + + virtual void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + F *f) = 0; + + double dev() { + return sum > 0 ? sqrt(stddev / sum) : 0; + } + + double average_utilization() { + int64_t kb = 0, kb_used = 0; + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (!osdmap->exists(i) || + osdmap->get_weight(i) == 0 || + !should_dump(i)) + continue; + int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i, + kb_avail_i; + if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i, + &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) { + kb += kb_i; + kb_used += kb_used_i; + } + } + return kb > 0 ? 
100.0 * (double)kb_used / (double)kb : 0; + } + + bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used, + int64_t* kb_used_data, + int64_t* kb_used_omap, + int64_t* kb_used_meta, + int64_t* kb_avail) const { + const osd_stat_t *p = pgmap.get_osd_stat(id); + if (!p) return false; + *kb = p->statfs.kb(); + *kb_used = p->statfs.kb_used_raw(); + *kb_used_data = p->statfs.kb_used_data(); + *kb_used_omap = p->statfs.kb_used_omap(); + *kb_used_meta = p->statfs.kb_used_internal_metadata(); + *kb_avail = p->statfs.kb_avail(); + + return true; + } + + bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used, + int64_t* kb_used_data, + int64_t* kb_used_omap, + int64_t* kb_used_meta, + int64_t* kb_avail) const { + if (id >= 0) { + if (osdmap->is_out(id) || !should_dump(id)) { + *kb = 0; + *kb_used = 0; + *kb_used_data = 0; + *kb_used_omap = 0; + *kb_used_meta = 0; + *kb_avail = 0; + return true; + } + return get_osd_utilization(id, kb, kb_used, kb_used_data, + kb_used_omap, kb_used_meta, kb_avail); + } + + *kb = 0; + *kb_used = 0; + *kb_used_data = 0; + *kb_used_omap = 0; + *kb_used_meta = 0; + *kb_avail = 0; + + for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) { + int item = osdmap->crush->get_bucket_item(id, k); + int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0, + kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0; + if (!get_bucket_utilization(item, &kb_i, &kb_used_i, + &kb_used_data_i, &kb_used_omap_i, + &kb_used_meta_i, &kb_avail_i)) + return false; + *kb += kb_i; + *kb_used += kb_used_i; + *kb_used_data += kb_used_data_i; + *kb_used_omap += kb_used_omap_i; + *kb_used_meta += kb_used_meta_i; + *kb_avail += kb_avail_i; + } + return true; + } + +protected: + const OSDMap *osdmap; + const PGMap& pgmap; + bool tree; + const string class_name; + const string item_name; + double average_util; + double min_var; + double max_var; + double stddev; + double sum; + set<int> allowed; + set<int> dumped_osds; +}; + + +class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> { +public: + typedef OSDUtilizationDumper<TextTable> Parent; + + OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap, + const PGMap& pgmap, bool tree, + const string& class_name, + const string& item_name) : + Parent(crush, osdmap, pgmap, tree, class_name, item_name) {} + + void dump(TextTable *tbl) { + tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT); + if (tree) + tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT); + + Parent::dump(tbl); + + dump_stray(tbl); + + auto sum = pgmap.get_osd_sum(get_dumped_osds()); + *tbl << "" + << "" + << "" << "TOTAL" + << byte_u_t(sum.statfs.total) + << 
byte_u_t(sum.statfs.get_used_raw()) + << byte_u_t(sum.statfs.allocated) + << byte_u_t(sum.statfs.omap_allocated) + << byte_u_t(sum.statfs.internal_metadata) + << byte_u_t(sum.statfs.available) + << lowprecision_t(average_util) + << "" + << TextTable::endrow; + } + +protected: + struct lowprecision_t { + float v; + explicit lowprecision_t(float _v) : v(_v) {} + }; + friend std::ostream &operator<<(ostream& out, const lowprecision_t& v); + + using OSDUtilizationDumper<TextTable>::dump_item; + void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + TextTable *tbl) override { + const char *c = crush->get_item_class(qi.id); + if (!c) + c = ""; + *tbl << qi.id + << c + << weightf_t(qi.weight) + << weightf_t(reweight) + << byte_u_t(kb << 10) + << byte_u_t(kb_used << 10) + << byte_u_t(kb_used_data << 10) + << byte_u_t(kb_used_omap << 10) + << byte_u_t(kb_used_meta << 10) + << byte_u_t(kb_avail << 10) + << lowprecision_t(util) + << lowprecision_t(var); + + if (qi.is_bucket()) { + *tbl << "-"; + *tbl << ""; + } else { + *tbl << num_pgs; + if (osdmap->is_up(qi.id)) { + *tbl << "up"; + } else if (osdmap->is_destroyed(qi.id)) { + *tbl << "destroyed"; + } else { + *tbl << "down"; + } + } + + if (tree) { + ostringstream name; + for (int k = 0; k < qi.depth; k++) + name << " "; + if (qi.is_bucket()) { + int type = crush->get_bucket_type(qi.id); + name << crush->get_type_name(type) << " " + << crush->get_item_name(qi.id); + } else { + name << "osd." << qi.id; + } + *tbl << name.str(); + } + + *tbl << TextTable::endrow; + } + +public: + string summary() { + ostringstream out; + out << "MIN/MAX VAR: " << lowprecision_t(min_var) + << "/" << lowprecision_t(max_var) << " " + << "STDDEV: " << lowprecision_t(dev()); + return out.str(); + } +}; + +ostream& operator<<(ostream& out, + const OSDUtilizationPlainDumper::lowprecision_t& v) +{ + if (v.v < -0.01) { + return out << "-"; + } else if (v.v < 0.001) { + return out << "0"; + } else { + std::streamsize p = out.precision(); + return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p); + } +} + +class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> { +public: + typedef OSDUtilizationDumper<Formatter> Parent; + + OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap, + const PGMap& pgmap, bool tree, + const string& class_name, + const string& item_name) : + Parent(crush, osdmap, pgmap, tree, class_name, item_name) {} + + void dump(Formatter *f) { + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + + f->open_array_section("stray"); + dump_stray(f); + f->close_section(); + } + +protected: + using OSDUtilizationDumper<Formatter>::dump_item; + void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + Formatter *f) override { + f->open_object_section("item"); + CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f); + f->dump_float("reweight", reweight); + f->dump_int("kb", kb); + f->dump_int("kb_used", kb_used); + f->dump_int("kb_used_data", kb_used_data); + f->dump_int("kb_used_omap", kb_used_omap); + f->dump_int("kb_used_meta", kb_used_meta); + f->dump_int("kb_avail", kb_avail); + 
f->dump_float("utilization", util); + f->dump_float("var", var); + f->dump_unsigned("pgs", num_pgs); + if (!qi.is_bucket()) { + if (osdmap->is_up(qi.id)) { + f->dump_string("status", "up"); + } else if (osdmap->is_destroyed(qi.id)) { + f->dump_string("status", "destroyed"); + } else { + f->dump_string("status", "down"); + } + } + CrushTreeDumper::dump_bucket_children(crush, qi, f); + f->close_section(); + } + +public: + void summary(Formatter *f) { + f->open_object_section("summary"); + auto sum = pgmap.get_osd_sum(get_dumped_osds()); + auto& s = sum.statfs; + + f->dump_int("total_kb", s.kb()); + f->dump_int("total_kb_used", s.kb_used_raw()); + f->dump_int("total_kb_used_data", s.kb_used_data()); + f->dump_int("total_kb_used_omap", s.kb_used_omap()); + f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata()); + f->dump_int("total_kb_avail", s.kb_avail()); + f->dump_float("average_utilization", average_util); + f->dump_float("min_var", min_var); + f->dump_float("max_var", max_var); + f->dump_float("dev", dev()); + f->close_section(); + } +}; + +void print_osd_utilization(const OSDMap& osdmap, + const PGMap& pgmap, + ostream& out, + Formatter *f, + bool tree, + const string& class_name, + const string& item_name) +{ + const CrushWrapper *crush = osdmap.crush.get(); + if (f) { + f->open_object_section("df"); + OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, + class_name, item_name); + d.dump(f); + d.summary(f); + f->close_section(); + f->flush(out); + } else { + OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, + class_name, item_name); + TextTable tbl; + d.dump(&tbl); + out << tbl << d.summary() << "\n"; + } +} + +void OSDMap::check_health(CephContext *cct, + health_check_map_t *checks) const +{ + int num_osds = get_num_osds(); + + // OSD_DOWN + // OSD_$subtree_DOWN + // OSD_ORPHAN + if (num_osds >= 0) { + int num_in_osds = 0; + int num_down_in_osds = 0; + set<int> osds; + set<int> down_in_osds; + set<int> up_in_osds; + set<int> subtree_up; + unordered_map<int, set<int> > subtree_type_down; + unordered_map<int, int> num_osds_subtree; + int max_type = crush->get_max_type_id(); + + for (int i = 0; i < get_max_osd(); i++) { + if (!exists(i)) { + if (crush->item_exists(i)) { + osds.insert(i); + } + continue; + } + if (is_out(i)) + continue; + ++num_in_osds; + if (down_in_osds.count(i) || up_in_osds.count(i)) + continue; + if (!is_up(i)) { + down_in_osds.insert(i); + int parent_id = 0; + int current = i; + for (int type = 0; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + int r = crush->get_immediate_parent_id(current, &parent_id); + if (r == -ENOENT) + break; + // break early if this parent is already marked as up + if (subtree_up.count(parent_id)) + break; + type = crush->get_bucket_type(parent_id); + if (!subtree_type_is_down( + cct, parent_id, type, + &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down)) + break; + current = parent_id; + } + } + } + + // calculate the number of down osds in each down subtree and + // store it in num_osds_subtree + for (int type = 1; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + for (auto j = subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + list<int> children; + int num = 0; + int num_children = crush->get_children(*j, &children); + if (num_children == 0) + continue; + for (auto l = children.begin(); l != children.end(); ++l) { + if (*l >= 0) { + ++num; + } else if (num_osds_subtree[*l] > 0) { + num = num + num_osds_subtree[*l]; + } + } 
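+ // at this point 'num' counts every OSD beneath this down bucket, including
+ // totals already rolled up for nested subtrees that are themselves down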
+ num_osds_subtree[*j] = num; + } + } + num_down_in_osds = down_in_osds.size(); + ceph_assert(num_down_in_osds <= num_in_osds); + if (num_down_in_osds > 0) { + // summary of down subtree types and osds + for (int type = max_type; type > 0; type--) { + if (!crush->get_type_name(type)) + continue; + if (subtree_type_down[type].size() > 0) { + ostringstream ss; + ss << subtree_type_down[type].size() << " " + << crush->get_type_name(type); + if (subtree_type_down[type].size() > 1) { + ss << "s"; + } + int sum_down_osds = 0; + for (auto j = subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + sum_down_osds = sum_down_osds + num_osds_subtree[*j]; + } + ss << " (" << sum_down_osds << " osds) down"; + string err = string("OSD_") + + string(crush->get_type_name(type)) + "_DOWN"; + boost::to_upper(err); + auto& d = checks->add(err, HEALTH_WARN, ss.str()); + for (auto j = subtree_type_down[type].rbegin(); + j != subtree_type_down[type].rend(); + ++j) { + ostringstream ss; + ss << crush->get_type_name(type); + ss << " "; + ss << crush->get_item_name(*j); + // at the top level, do not print location + if (type != max_type) { + ss << " ("; + ss << crush->get_full_location_ordered_string(*j); + ss << ")"; + } + int num = num_osds_subtree[*j]; + ss << " (" << num << " osds)"; + ss << " is down"; + d.detail.push_back(ss.str()); + } + } + } + ostringstream ss; + ss << down_in_osds.size() << " osds down"; + auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str()); + for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) { + ostringstream ss; + ss << "osd." << *it << " ("; + ss << crush->get_full_location_ordered_string(*it); + ss << ") is down"; + d.detail.push_back(ss.str()); + } + } + + if (!osds.empty()) { + ostringstream ss; + ss << osds.size() << " osds exist in the crush map but not in the osdmap"; + auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str()); + for (auto osd : osds) { + ostringstream ss; + ss << "osd." << osd << " exists in crush map but not in osdmap"; + d.detail.push_back(ss.str()); + } + } + } + + std::list<std::string> scrub_messages; + bool noscrub = false, nodeepscrub = false; + for (const auto &p : pools) { + if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) { + ostringstream ss; + ss << "Pool " << get_pool_name(p.first) << " has noscrub flag"; + scrub_messages.push_back(ss.str()); + noscrub = true; + } + if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) { + ostringstream ss; + ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag"; + scrub_messages.push_back(ss.str()); + nodeepscrub = true; + } + } + if (noscrub || nodeepscrub) { + string out = ""; + out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : ""; + out += nodeepscrub ? "nodeep-scrub" : ""; + auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK, + "Some pool(s) have the " + out + " flag(s) set"); + d.detail.splice(d.detail.end(), scrub_messages); + } + + // OSD_OUT_OF_ORDER_FULL + { + // An osd could configure failsafe ratio, to something different + // but for now assume it is the same here. + float fsr = cct->_conf->osd_failsafe_full_ratio; + if (fsr > 1.0) fsr /= 100; + float fr = get_full_ratio(); + float br = get_backfillfull_ratio(); + float nr = get_nearfull_ratio(); + + list<string> detail; + // These checks correspond to how OSDService::check_full_status() in an OSD + // handles the improper setting of these values. 
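+ // example of the cascade below (values are hypothetical): if nearfull were
+ // configured at 0.90 but backfillfull at 0.85, backfillfull is treated as
+ // 0.90 here and a detail line is recorded; full_ratio and
+ // osd_failsafe_full_ratio are bumped the same way further down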
+ if (br < nr) { + ostringstream ss; + ss << "backfillfull_ratio (" << br + << ") < nearfull_ratio (" << nr << "), increased"; + detail.push_back(ss.str()); + br = nr; + } + if (fr < br) { + ostringstream ss; + ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br + << "), increased"; + detail.push_back(ss.str()); + fr = br; + } + if (fsr < fr) { + ostringstream ss; + ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr + << "), increased"; + detail.push_back(ss.str()); + } + if (!detail.empty()) { + auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR, + "full ratio(s) out of order"); + d.detail.swap(detail); + } + } + + // OSD_FULL + // OSD_NEARFULL + // OSD_BACKFILLFULL + // OSD_FAILSAFE_FULL + { + set<int> full, backfillfull, nearfull; + get_full_osd_counts(&full, &backfillfull, &nearfull); + if (full.size()) { + ostringstream ss; + ss << full.size() << " full osd(s)"; + auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str()); + for (auto& i: full) { + ostringstream ss; + ss << "osd." << i << " is full"; + d.detail.push_back(ss.str()); + } + } + if (backfillfull.size()) { + ostringstream ss; + ss << backfillfull.size() << " backfillfull osd(s)"; + auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str()); + for (auto& i: backfillfull) { + ostringstream ss; + ss << "osd." << i << " is backfill full"; + d.detail.push_back(ss.str()); + } + } + if (nearfull.size()) { + ostringstream ss; + ss << nearfull.size() << " nearfull osd(s)"; + auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str()); + for (auto& i: nearfull) { + ostringstream ss; + ss << "osd." << i << " is near full"; + d.detail.push_back(ss.str()); + } + } + } + + // OSDMAP_FLAGS + { + // warn about flags + uint64_t warn_flags = + CEPH_OSDMAP_NEARFULL | + CEPH_OSDMAP_FULL | + CEPH_OSDMAP_PAUSERD | + CEPH_OSDMAP_PAUSEWR | + CEPH_OSDMAP_PAUSEREC | + CEPH_OSDMAP_NOUP | + CEPH_OSDMAP_NODOWN | + CEPH_OSDMAP_NOIN | + CEPH_OSDMAP_NOOUT | + CEPH_OSDMAP_NOBACKFILL | + CEPH_OSDMAP_NORECOVER | + CEPH_OSDMAP_NOSCRUB | + CEPH_OSDMAP_NODEEP_SCRUB | + CEPH_OSDMAP_NOTIERAGENT | + CEPH_OSDMAP_NOSNAPTRIM | + CEPH_OSDMAP_NOREBALANCE; + if (test_flag(warn_flags)) { + ostringstream ss; + ss << get_flag_string(get_flags() & warn_flags) + << " flag(s) set"; + checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str()); + } + } + + // OSD_FLAGS + { + list<string> detail; + const unsigned flags = + CEPH_OSD_NOUP | + CEPH_OSD_NOIN | + CEPH_OSD_NODOWN | + CEPH_OSD_NOOUT; + for (int i = 0; i < max_osd; ++i) { + if (osd_state[i] & flags) { + ostringstream ss; + set<string> states; + OSDMap::calc_state_set(osd_state[i] & flags, states); + ss << "osd." << i << " has flags " << states; + detail.push_back(ss.str()); + } + } + for (auto& i : crush_node_flags) { + if (i.second && crush->item_exists(i.first)) { + ostringstream ss; + set<string> states; + OSDMap::calc_state_set(i.second, states); + int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first); + const char *tn = crush->get_type_name(t); + ss << (tn ? 
tn : "node") << " " + << crush->get_item_name(i.first) << " has flags " << states; + detail.push_back(ss.str()); + } + } + for (auto& i : device_class_flags) { + const char* class_name = crush->get_class_name(i.first); + if (i.second && class_name) { + ostringstream ss; + set<string> states; + OSDMap::calc_state_set(i.second, states); + ss << "device class '" << class_name << "' has flags " << states; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set"; + auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } + + // OLD_CRUSH_TUNABLES + if (cct->_conf->mon_warn_on_legacy_crush_tunables) { + string min = crush->get_min_required_version(); + if (min < cct->_conf->mon_crush_min_required_version) { + ostringstream ss; + ss << "crush map has legacy tunables (require " << min + << ", min is " << cct->_conf->mon_crush_min_required_version << ")"; + auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str()); + d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables"); + } + } + + // OLD_CRUSH_STRAW_CALC_VERSION + if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) { + if (crush->get_straw_calc_version() == 0) { + ostringstream ss; + ss << "crush map has straw_calc_version=0"; + auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str()); + d.detail.push_back( + "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables"); + } + } + + // CACHE_POOL_NO_HIT_SET + if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) { + list<string> detail; + for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin(); + p != pools.end(); + ++p) { + const pg_pool_t& info = p->second; + if (info.cache_mode_requires_hit_set() && + info.hit_set_params.get_type() == HitSet::TYPE_NONE) { + ostringstream ss; + ss << "pool '" << get_pool_name(p->first) + << "' with cache_mode " << info.get_cache_mode_name() + << " needs hit_set_type to be set but it is not"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " cache pools are missing hit_sets"; + auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } + + // OSD_NO_SORTBITWISE + if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) { + ostringstream ss; + ss << "'sortbitwise' flag is not set"; + checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str()); + } + + // OSD_UPGRADE_FINISHED + // none of these (yet) since we don't run until luminous upgrade is done. 
+ + // POOL_NEARFULL/BACKFILLFULL/FULL + { + list<string> full_detail, backfillfull_detail, nearfull_detail; + for (auto it : get_pools()) { + const pg_pool_t &pool = it.second; + const string& pool_name = get_pool_name(it.first); + if (pool.has_flag(pg_pool_t::FLAG_FULL)) { + stringstream ss; + if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) { + // may run out of space too, + // but we want EQUOTA taking precedence + ss << "pool '" << pool_name << "' is full (running out of quota)"; + } else { + ss << "pool '" << pool_name << "' is full (no space)"; + } + full_detail.push_back(ss.str()); + } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) { + stringstream ss; + ss << "pool '" << pool_name << "' is backfillfull"; + backfillfull_detail.push_back(ss.str()); + } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) { + stringstream ss; + ss << "pool '" << pool_name << "' is nearfull"; + nearfull_detail.push_back(ss.str()); + } + } + if (!full_detail.empty()) { + ostringstream ss; + ss << full_detail.size() << " pool(s) full"; + auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str()); + d.detail.swap(full_detail); + } + if (!backfillfull_detail.empty()) { + ostringstream ss; + ss << backfillfull_detail.size() << " pool(s) backfillfull"; + auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str()); + d.detail.swap(backfillfull_detail); + } + if (!nearfull_detail.empty()) { + ostringstream ss; + ss << nearfull_detail.size() << " pool(s) nearfull"; + auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str()); + d.detail.swap(nearfull_detail); + } + } + + // POOL_PG_NUM_NOT_POWER_OF_TWO + if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) { + list<string> detail; + for (auto it : get_pools()) { + if (!isp2(it.second.get_pg_num_target())) { + ostringstream ss; + ss << "pool '" << get_pool_name(it.first) + << "' pg_num " << it.second.get_pg_num_target() + << " is not a power of two"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) have non-power-of-two pg_num"; + auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN, + ss.str()); + d.detail.swap(detail); + } + } + + // POOL_NO_REDUNDANCY + if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy")) + { + list<string> detail; + for (auto it : get_pools()) { + if (it.second.get_size() == 1) { + ostringstream ss; + ss << "pool '" << get_pool_name(it.first) + << "' has no replicas configured"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) have no replicas configured"; + auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } +} + +int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out, + ostream *ss) const +{ + out->clear(); + for (auto i = ls.begin(); i != ls.end(); ++i) { + if (i == ls.begin() && + (*i == "any" || *i == "all" || *i == "*")) { + get_all_osds(*out); + break; + } + long osd = parse_osd_id(i->c_str(), ss); + if (osd < 0) { + *ss << "invalid osd id '" << *i << "'"; + return -EINVAL; + } + out->insert(osd); + } + return 0; +} + +void OSDMap::get_random_up_osds_by_subtree(int n, // whoami + string &subtree, + int limit, // how many + set<int> skip, + set<int> *want) const { + if (limit <= 0) + return; + int subtree_type = crush->get_type_id(subtree); + if (subtree_type < 1) + return; + vector<int> subtrees; + crush->get_subtree_of_type(subtree_type, &subtrees); + std::random_device rd; + 
std::default_random_engine rng{rd()}; + std::shuffle(subtrees.begin(), subtrees.end(), rng); + for (auto s : subtrees) { + if (limit <= 0) + break; + if (crush->subtree_contains(s, n)) + continue; + vector<int> osds; + crush->get_children_of_type(s, 0, &osds); + if (osds.empty()) + continue; + vector<int> up_osds; + for (auto o : osds) { + if (is_up(o) && !skip.count(o)) + up_osds.push_back(o); + } + if (up_osds.empty()) + continue; + auto it = up_osds.begin(); + std::advance(it, (n % up_osds.size())); + want->insert(*it); + --limit; + } +} + +float OSDMap::pool_raw_used_rate(int64_t poolid) const +{ + const pg_pool_t *pool = get_pg_pool(poolid); + assert(pool != nullptr); + + switch (pool->get_type()) { + case pg_pool_t::TYPE_REPLICATED: + return pool->get_size(); + break; + case pg_pool_t::TYPE_ERASURE: + { + auto& ecp = + get_erasure_code_profile(pool->erasure_code_profile); + auto pm = ecp.find("m"); + auto pk = ecp.find("k"); + if (pm != ecp.end() && pk != ecp.end()) { + int k = atoi(pk->second.c_str()); + int m = atoi(pm->second.c_str()); + int mk = m + k; + ceph_assert(mk != 0); + ceph_assert(k != 0); + return (float)mk / k; + } else { + return 0.0; + } + } + break; + default: + ceph_abort_msg("unrecognized pool type"); + } +} + +unsigned OSDMap::get_osd_crush_node_flags(int osd) const +{ + unsigned flags = 0; + if (!crush_node_flags.empty()) { + // the map will contain type -> name + std::map<std::string,std::string> ploc = crush->get_full_location(osd); + for (auto& i : ploc) { + int id = crush->get_item_id(i.second); + auto p = crush_node_flags.find(id); + if (p != crush_node_flags.end()) { + flags |= p->second; + } + } + } + return flags; +} + +unsigned OSDMap::get_crush_node_flags(int id) const +{ + unsigned flags = 0; + auto it = crush_node_flags.find(id); + if (it != crush_node_flags.end()) + flags = it->second; + return flags; +} + +unsigned OSDMap::get_device_class_flags(int id) const +{ + unsigned flags = 0; + auto it = device_class_flags.find(id); + if (it != device_class_flags.end()) + flags = it->second; + return flags; +} diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h new file mode 100644 index 00000000..64c5f88d --- /dev/null +++ b/src/osd/OSDMap.h @@ -0,0 +1,1531 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_OSDMAP_H +#define CEPH_OSDMAP_H + +/* + * describe properties of the OSD cluster. + * disks, disk groups, total # osds, + * + */ +#include "include/types.h" +#include "osd_types.h" + +//#include "include/ceph_features.h" +#include "crush/CrushWrapper.h" +#include <vector> +#include <list> +#include <set> +#include <map> +#include <memory> +#include "include/btree_map.h" + +// forward declaration +class CephContext; +class CrushWrapper; +class health_check_map_t; + +/* + * we track up to two intervals during which the osd was alive and + * healthy. the most recent is [up_from,up_thru), where up_thru is + * the last epoch the osd is known to have _started_. i.e., a lower + * bound on the actual osd death. 
down_at (if it is > up_from) is an + * upper bound on the actual osd death. + * + * the second is the last_clean interval [first,last]. in that case, + * the last interval is the last epoch known to have been either + * _finished_, or during which the osd cleanly shut down. when + * possible, we push this forward to the epoch the osd was eventually + * marked down. + * + * the lost_at is used to allow build_prior to proceed without waiting + * for an osd to recover. In certain cases, progress may be blocked + * because an osd is down that may contain updates (i.e., a pg may have + * gone rw during an interval). If the osd can't be brought online, we + * can force things to proceed knowing that we _might_ be losing some + * acked writes. If the osd comes back to life later, that's fine to, + * but those writes will still be lost (the divergent objects will be + * thrown out). + */ +struct osd_info_t { + epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown + epoch_t last_clean_end; + epoch_t up_from; // epoch osd marked up + epoch_t up_thru; // lower bound on actual osd death (if > up_from) + epoch_t down_at; // upper bound on actual osd death (if > up_from) + epoch_t lost_at; // last epoch we decided data was "lost" + + osd_info_t() : last_clean_begin(0), last_clean_end(0), + up_from(0), up_thru(0), down_at(0), lost_at(0) {} + + void dump(Formatter *f) const; + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + static void generate_test_instances(list<osd_info_t*>& o); +}; +WRITE_CLASS_ENCODER(osd_info_t) + +ostream& operator<<(ostream& out, const osd_info_t& info); + +struct osd_xinfo_t { + utime_t down_stamp; ///< timestamp when we were last marked down + float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy + __u32 laggy_interval; ///< average interval between being marked laggy and recovering + uint64_t features; ///< features supported by this osd we should know about + __u32 old_weight; ///< weight prior to being auto marked out + + osd_xinfo_t() : laggy_probability(0), laggy_interval(0), + features(0), old_weight(0) {} + + void dump(Formatter *f) const; + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + static void generate_test_instances(list<osd_xinfo_t*>& o); +}; +WRITE_CLASS_ENCODER(osd_xinfo_t) + +ostream& operator<<(ostream& out, const osd_xinfo_t& xi); + + +struct PGTempMap { +#if 1 + bufferlist data; + typedef btree::btree_map<pg_t,ceph_le32*> map_t; + map_t map; + + void encode(bufferlist& bl) const { + using ceph::encode; + uint32_t n = map.size(); + encode(n, bl); + for (auto &p : map) { + encode(p.first, bl); + bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32)); + } + } + void decode(bufferlist::const_iterator& p) { + using ceph::decode; + data.clear(); + map.clear(); + uint32_t n; + decode(n, p); + if (!n) + return; + auto pstart = p; + size_t start_off = pstart.get_off(); + vector<pair<pg_t,size_t>> offsets; + offsets.resize(n); + for (unsigned i=0; i<n; ++i) { + pg_t pgid; + decode(pgid, p); + offsets[i].first = pgid; + offsets[i].second = p.get_off() - start_off; + uint32_t vn; + decode(vn, p); + p.advance(vn * sizeof(int32_t)); + } + size_t len = p.get_off() - start_off; + pstart.copy(len, data); + if (data.get_num_buffers() > 1) { + data.rebuild(); + } + //map.reserve(n); + char *start = data.c_str(); + for (auto i : offsets) { + map.insert(map.end(), make_pair(i.first, (ceph_le32*)(start + i.second))); + } + } + 
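+ // rebuild(): round-trips the map through encode()/decode() so that 'data'
+ // becomes one contiguous buffer again and the raw pointers held in 'map'
+ // are refreshed (e.g. after set()/erase() calls have left stale or
+ // fragmented storage behind)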
void rebuild() {
+ bufferlist bl;
+ encode(bl);
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+ friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+ return
+ l.map.size() == r.map.size() &&
+ l.data.contents_equal(r.data);
+ }
+
+ class iterator {
+ map_t::const_iterator it;
+ map_t::const_iterator end;
+ pair<pg_t,vector<int32_t>> current;
+ void init_current() {
+ if (it != end) {
+ current.first = it->first;
+ ceph_assert(it->second);
+ current.second.resize(*it->second);
+ ceph_le32 *p = it->second + 1;
+ for (uint32_t n = 0; n < *it->second; ++n, ++p) {
+ current.second[n] = *p;
+ }
+ }
+ }
+ public:
+ iterator(map_t::const_iterator p,
+ map_t::const_iterator e)
+ : it(p), end(e) {
+ init_current();
+ }
+
+ const pair<pg_t,vector<int32_t>>& operator*() const {
+ return current;
+ }
+ const pair<pg_t,vector<int32_t>>* operator->() const {
+ return &current;
+ }
+ friend bool operator==(const iterator& l, const iterator& r) {
+ return l.it == r.it;
+ }
+ friend bool operator!=(const iterator& l, const iterator& r) {
+ return l.it != r.it;
+ }
+ iterator& operator++() {
+ ++it;
+ if (it != end)
+ init_current();
+ return *this;
+ }
+ iterator operator++(int) {
+ iterator r = *this;
+ ++it;
+ if (it != end)
+ init_current();
+ return r;
+ }
+ };
+ iterator begin() const {
+ return iterator(map.begin(), map.end());
+ }
+ iterator end() const {
+ return iterator(map.end(), map.end());
+ }
+ iterator find(pg_t pgid) const {
+ return iterator(map.find(pgid), map.end());
+ }
+ size_t size() const {
+ return map.size();
+ }
+ size_t count(pg_t pgid) const {
+ return map.count(pgid);
+ }
+ void erase(pg_t pgid) {
+ map.erase(pgid);
+ }
+ void clear() {
+ map.clear();
+ data.clear();
+ }
+ void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+ using ceph::encode;
+ size_t need = sizeof(ceph_le32) * (1 + v.size());
+ if (need < data.get_append_buffer_unused_tail_length()) {
+ bufferptr z(data.get_append_buffer_unused_tail_length());
+ z.zero();
+ data.append(z.c_str(), z.length());
+ }
+ encode(v, data);
+ map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size());
+ }
+ mempool::osdmap::vector<int32_t> get(pg_t pgid) {
+ mempool::osdmap::vector<int32_t> v;
+ ceph_le32 *p = map[pgid];
+ size_t n = *p++;
+ v.resize(n);
+ for (size_t i = 0; i < n; ++i, ++p) {
+ v[i] = *p;
+ }
+ return v;
+ }
+#else
+ // trivial implementation
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;
+
+ void encode(bufferlist& bl) const {
+ encode(pg_temp, bl);
+ }
+ void decode(bufferlist::const_iterator& p) {
+ decode(pg_temp, p);
+ }
+ friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+ return
+ l.pg_temp.size() == r.pg_temp.size() &&
+ l.pg_temp == r.pg_temp;
+ }
+
+ class iterator {
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
+ public:
+ iterator(mempool::osdmap::map<pg_t,
+ mempool::osdmap::vector<int32_t> >::const_iterator p)
+ : it(p) {}
+
+ pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
+ return *it;
+ }
+ const pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
+ return &*it;
+ }
+ friend bool operator==(const iterator& l, const iterator& r) {
+ return l.it == r.it;
+ }
+ friend bool operator!=(const iterator& l, const iterator& r) {
+ return l.it != r.it;
+ }
+ iterator& operator++() {
+ ++it;
+ return *this;
+ }
+ iterator operator++(int) {
+ iterator r = *this;
+ ++it;
+ return r;
+ }
+ };
+ iterator begin() const {
+ return iterator(pg_temp.cbegin());
+ }
+
iterator end() const { + return iterator(pg_temp.cend()); + } + iterator find(pg_t pgid) const { + return iterator(pg_temp.find(pgid)); + } + size_t size() const { + return pg_temp.size(); + } + size_t count(pg_t pgid) const { + return pg_temp.count(pgid); + } + void erase(pg_t pgid) { + pg_temp.erase(pgid); + } + void clear() { + pg_temp.clear(); + } + void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) { + pg_temp[pgid] = v; + } + const mempool::osdmap::vector<int32_t>& get(pg_t pgid) { + return pg_temp.at(pgid); + } +#endif + void dump(Formatter *f) const { + for (const auto &pg : *this) { + f->open_object_section("osds"); + f->dump_stream("pgid") << pg.first; + f->open_array_section("osds"); + for (const auto osd : pg.second) + f->dump_int("osd", osd); + f->close_section(); + f->close_section(); + } + } +}; +WRITE_CLASS_ENCODER(PGTempMap) + +/** OSDMap + */ +class OSDMap { +public: + MEMPOOL_CLASS_HELPERS(); + + typedef interval_set< + snapid_t, + mempool::osdmap::flat_map<snapid_t,snapid_t>> snap_interval_set_t; + + class Incremental { + public: + MEMPOOL_CLASS_HELPERS(); + + /// feature bits we were encoded with. the subsequent OSDMap + /// encoding should match. + uint64_t encode_features; + uuid_d fsid; + epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch + utime_t modified; + int64_t new_pool_max; //incremented by the OSDMonitor on each pool create + int32_t new_flags; + int8_t new_require_osd_release = -1; + + // full (rare) + bufferlist fullmap; // in lieu of below. + bufferlist crush; + + // incremental + int32_t new_max_osd; + mempool::osdmap::map<int64_t,pg_pool_t> new_pools; + mempool::osdmap::map<int64_t,string> new_pool_names; + mempool::osdmap::set<int64_t> old_pools; + mempool::osdmap::map<string,map<string,string> > new_erasure_code_profiles; + mempool::osdmap::vector<string> old_erasure_code_profiles; + mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client; + mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster; + mempool::osdmap::map<int32_t,uint32_t> new_state; // XORed onto previous state. 
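+ // note: because new_state is XORed onto the previous state, marking an up
+ // osd down is expressed as new_state[osd] = CEPH_OSD_UP (applying the
+ // incremental flips the UP bit off)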
+ mempool::osdmap::map<int32_t,uint32_t> new_weight; + mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp; // [] to remove + mempool::osdmap::map<pg_t, int32_t> new_primary_temp; // [-1] to remove + mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity; + mempool::osdmap::map<int32_t,epoch_t> new_up_thru; + mempool::osdmap::map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval; + mempool::osdmap::map<int32_t,epoch_t> new_lost; + mempool::osdmap::map<int32_t,uuid_d> new_uuid; + mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo; + + mempool::osdmap::map<entity_addr_t,utime_t> new_blacklist; + mempool::osdmap::vector<entity_addr_t> old_blacklist; + mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up; + mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up; + + mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap; + mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> new_pg_upmap_items; + mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items; + mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps; + mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps; + + mempool::osdmap::map<int32_t,uint32_t> new_crush_node_flags; + mempool::osdmap::map<int32_t,uint32_t> new_device_class_flags; + + string cluster_snapshot; + + float new_nearfull_ratio = -1; + float new_backfillfull_ratio = -1; + float new_full_ratio = -1; + + int8_t new_require_min_compat_client = -1; + + utime_t new_last_up_change, new_last_in_change; + + mutable bool have_crc; ///< crc values are defined + uint32_t full_crc; ///< crc of the resulting OSDMap + mutable uint32_t inc_crc; ///< crc of this incremental + + int get_net_marked_out(const OSDMap *previous) const; + int get_net_marked_down(const OSDMap *previous) const; + int identify_osd(uuid_d u) const; + + void encode_client_old(bufferlist& bl) const; + void encode_classic(bufferlist& bl, uint64_t features) const; + void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const; + void decode_classic(bufferlist::const_iterator &p); + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<Incremental*>& o); + + explicit Incremental(epoch_t e=0) : + encode_features(0), + epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1), + have_crc(false), full_crc(0), inc_crc(0) { + } + explicit Incremental(bufferlist &bl) { + auto p = std::cbegin(bl); + decode(p); + } + explicit Incremental(bufferlist::const_iterator &p) { + decode(p); + } + + pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) { + if (new_pools.count(pool) == 0) + new_pools[pool] = *orig; + return &new_pools[pool]; + } + bool has_erasure_code_profile(const string &name) const { + auto i = new_erasure_code_profiles.find(name); + return i != new_erasure_code_profiles.end(); + } + void set_erasure_code_profile(const string &name, + const map<string,string>& profile) { + new_erasure_code_profiles[name] = profile; + } + mempool::osdmap::map<string,map<string,string>> get_erasure_code_profiles() const { + return new_erasure_code_profiles; + } + + /// propagate update pools' snap metadata to any of their tiers + int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base); + + /// filter out osds with any pending state changing + size_t get_pending_state_osds(vector<int> *osds) { + ceph_assert(osds); + osds->clear(); + + for (auto &p : new_state) { + osds->push_back(p.first); + } + + return osds->size(); + 
} + + bool pending_osd_has_state(int osd, unsigned state) { + return new_state.count(osd) && (new_state[osd] & state) != 0; + } + + bool pending_osd_state_set(int osd, unsigned state) { + if (pending_osd_has_state(osd, state)) + return false; + new_state[osd] |= state; + return true; + } + + // cancel the specified pending osd state if there is any + // return ture on success, false otherwise. + bool pending_osd_state_clear(int osd, unsigned state) { + if (!pending_osd_has_state(osd, state)) { + // never has been set or already has been cancelled. + return false; + } + + new_state[osd] &= ~state; + if (!new_state[osd]) { + // all flags cleared + new_state.erase(osd); + } + return true; + } + + }; + +private: + uuid_d fsid; + epoch_t epoch; // what epoch of the osd cluster descriptor is this + utime_t created, modified; // epoch start time + int32_t pool_max; // the largest pool num, ever + + uint32_t flags; + + int num_osd; // not saved; see calc_num_osds + int num_up_osd; // not saved; see calc_num_osds + int num_in_osd; // not saved; see calc_num_osds + + int32_t max_osd; + vector<uint32_t> osd_state; + + mempool::osdmap::map<int32_t,uint32_t> crush_node_flags; // crush node -> CEPH_OSD_* flags + mempool::osdmap::map<int32_t,uint32_t> device_class_flags; // device class -> CEPH_OSD_* flags + + utime_t last_up_change, last_in_change; + + // These features affect OSDMap[::Incremental] encoding, or the + // encoding of some type embedded therein (CrushWrapper, something + // from osd_types, etc.). + static constexpr uint64_t SIGNIFICANT_FEATURES = + CEPH_FEATUREMASK_PGID64 | + CEPH_FEATUREMASK_PGPOOL3 | + CEPH_FEATUREMASK_OSDENC | + CEPH_FEATUREMASK_OSDMAP_ENC | + CEPH_FEATUREMASK_OSD_POOLRESEND | + CEPH_FEATUREMASK_NEW_OSDOP_ENCODING | + CEPH_FEATUREMASK_MSG_ADDR2 | + CEPH_FEATUREMASK_CRUSH_TUNABLES5 | + CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS | + CEPH_FEATUREMASK_SERVER_LUMINOUS | + CEPH_FEATUREMASK_SERVER_MIMIC | + CEPH_FEATUREMASK_SERVER_NAUTILUS; + + struct addrs_s { + mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > client_addrs; + mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > cluster_addrs; + mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_back_addrs; + mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_front_addrs; + }; + std::shared_ptr<addrs_s> osd_addrs; + + entity_addrvec_t _blank_addrvec; + + mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" + mempool::osdmap::vector<osd_info_t> osd_info; + std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild) + std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. 
while we rebuild) + std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline + + // remap (post-CRUSH, pre-up) + mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg + mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set + + mempool::osdmap::map<int64_t,pg_pool_t> pools; + mempool::osdmap::map<int64_t,string> pool_name; + mempool::osdmap::map<string,map<string,string> > erasure_code_profiles; + mempool::osdmap::map<string,int64_t> name_pool; + + std::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid; + mempool::osdmap::vector<osd_xinfo_t> osd_xinfo; + + mempool::osdmap::unordered_map<entity_addr_t,utime_t> blacklist; + + /// queue of snaps to remove + mempool::osdmap::map<int64_t, snap_interval_set_t> removed_snaps_queue; + + /// removed_snaps additions this epoch + mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps; + + /// removed_snaps removals this epoch + mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps; + + epoch_t cluster_snapshot_epoch; + string cluster_snapshot; + bool new_blacklist_entries; + + float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0; + + /// min compat client we want to support + uint8_t require_min_compat_client = 0; // CEPH_RELEASE_* + +public: + /// require osds to run at least this release + uint8_t require_osd_release = 0; // CEPH_RELEASE_* + +private: + mutable uint64_t cached_up_osd_features; + + mutable bool crc_defined; + mutable uint32_t crc; + + void _calc_up_osd_features(); + + public: + bool have_crc() const { return crc_defined; } + uint32_t get_crc() const { return crc; } + + std::shared_ptr<CrushWrapper> crush; // hierarchical map +private: + uint32_t crush_version = 1; + + friend class OSDMonitor; + + public: + OSDMap() : epoch(0), + pool_max(0), + flags(0), + num_osd(0), num_up_osd(0), num_in_osd(0), + max_osd(0), + osd_addrs(std::make_shared<addrs_s>()), + pg_temp(std::make_shared<PGTempMap>()), + primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()), + osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()), + cluster_snapshot_epoch(0), + new_blacklist_entries(false), + cached_up_osd_features(0), + crc_defined(false), crc(0), + crush(std::make_shared<CrushWrapper>()) { + } + +private: + OSDMap(const OSDMap& other) = default; + OSDMap& operator=(const OSDMap& other) = default; +public: + + /// return feature mask subset that is relevant to OSDMap encoding + static uint64_t get_significant_features(uint64_t features) { + return SIGNIFICANT_FEATURES & features; + } + + uint64_t get_encoding_features() const; + + void deepish_copy_from(const OSDMap& o) { + *this = o; + primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp)); + pg_temp.reset(new PGTempMap(*o.pg_temp)); + osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid)); + + if (o.osd_primary_affinity) + osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity)); + + // NOTE: this still references shared entity_addrvec_t's. + osd_addrs.reset(new addrs_s(*o.osd_addrs)); + + // NOTE: we do not copy crush. note that apply_incremental will + // allocate a new CrushWrapper, though. 
+ } + + // map info + const uuid_d& get_fsid() const { return fsid; } + void set_fsid(uuid_d& f) { fsid = f; } + + epoch_t get_epoch() const { return epoch; } + void inc_epoch() { epoch++; } + + void set_epoch(epoch_t e); + + uint32_t get_crush_version() const { + return crush_version; + } + + /* stamps etc */ + const utime_t& get_created() const { return created; } + const utime_t& get_modified() const { return modified; } + + bool is_blacklisted(const entity_addr_t& a) const; + bool is_blacklisted(const entity_addrvec_t& a) const; + void get_blacklist(list<pair<entity_addr_t,utime_t > > *bl) const; + void get_blacklist(std::set<entity_addr_t> *bl) const; + + string get_cluster_snapshot() const { + if (cluster_snapshot_epoch == epoch) + return cluster_snapshot; + return string(); + } + + float get_full_ratio() const { + return full_ratio; + } + float get_backfillfull_ratio() const { + return backfillfull_ratio; + } + float get_nearfull_ratio() const { + return nearfull_ratio; + } + void get_full_pools(CephContext *cct, + set<int64_t> *full, + set<int64_t> *backfillfull, + set<int64_t> *nearfull) const; + void get_full_osd_counts(set<int> *full, set<int> *backfill, + set<int> *nearfull) const; + + + /***** cluster state *****/ + /* osds */ + int get_max_osd() const { return max_osd; } + void set_max_osd(int m); + + unsigned get_num_osds() const { + return num_osd; + } + unsigned get_num_up_osds() const { + return num_up_osd; + } + unsigned get_num_in_osds() const { + return num_in_osd; + } + /// recalculate cached values for get_num{,_up,_in}_osds + int calc_num_osds(); + + void get_all_osds(set<int32_t>& ls) const; + void get_up_osds(set<int32_t>& ls) const; + void get_out_osds(set<int32_t>& ls) const; + void get_out_existing_osds(std::set<int32_t>& ls) const; + unsigned get_num_pg_temp() const { + return pg_temp->size(); + } + + int get_flags() const { return flags; } + bool test_flag(int f) const { return flags & f; } + void set_flag(int f) { flags |= f; } + void clear_flag(int f) { flags &= ~f; } + + void get_flag_set(set<string> *flagset) const; + + static void calc_state_set(int state, set<string>& st); + + int get_state(int o) const { + ceph_assert(o < max_osd); + return osd_state[o]; + } + int get_state(int o, set<string>& st) const { + ceph_assert(o < max_osd); + unsigned t = osd_state[o]; + calc_state_set(t, st); + return osd_state[o]; + } + void set_state(int o, unsigned s) { + ceph_assert(o < max_osd); + osd_state[o] = s; + } + void set_weight(int o, unsigned w) { + ceph_assert(o < max_osd); + osd_weight[o] = w; + if (w) + osd_state[o] |= CEPH_OSD_EXISTS; + } + unsigned get_weight(int o) const { + ceph_assert(o < max_osd); + return osd_weight[o]; + } + float get_weightf(int o) const { + return (float)get_weight(o) / (float)CEPH_OSD_IN; + } + void adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const; + + void set_primary_affinity(int o, int w) { + ceph_assert(o < max_osd); + if (!osd_primary_affinity) + osd_primary_affinity.reset( + new mempool::osdmap::vector<__u32>( + max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)); + (*osd_primary_affinity)[o] = w; + } + unsigned get_primary_affinity(int o) const { + ceph_assert(o < max_osd); + if (!osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + return (*osd_primary_affinity)[o]; + } + float get_primary_affinityf(int o) const { + return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY; + } + + bool has_erasure_code_profile(const string &name) const { + auto i = 
erasure_code_profiles.find(name); + return i != erasure_code_profiles.end(); + } + int get_erasure_code_profile_default(CephContext *cct, + map<string,string> &profile_map, + ostream *ss); + void set_erasure_code_profile(const string &name, + const map<string,string>& profile) { + erasure_code_profiles[name] = profile; + } + const map<string,string> &get_erasure_code_profile( + const string &name) const { + static map<string,string> empty; + auto i = erasure_code_profiles.find(name); + if (i == erasure_code_profiles.end()) + return empty; + else + return i->second; + } + const mempool::osdmap::map<string,map<string,string> > &get_erasure_code_profiles() const { + return erasure_code_profiles; + } + + bool exists(int osd) const { + //assert(osd >= 0); + return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS); + } + + bool is_destroyed(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED); + } + + bool is_up(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_UP); + } + + bool has_been_up_since(int osd, epoch_t epoch) const { + return is_up(osd) && get_up_from(osd) <= epoch; + } + + bool is_down(int osd) const { + return !is_up(osd); + } + + bool is_out(int osd) const { + return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT; + } + + bool is_in(int osd) const { + return !is_out(osd); + } + + unsigned get_osd_crush_node_flags(int osd) const; + unsigned get_crush_node_flags(int id) const; + unsigned get_device_class_flags(int id) const; + + bool is_noup_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP); + } + + bool is_nodown_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN); + } + + bool is_noin_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN); + } + + bool is_noout_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT); + } + + bool is_noup(int osd) const { + if (test_flag(CEPH_OSDMAP_NOUP)) // global? + return true; + if (is_noup_by_osd(osd)) // by osd? + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node? + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class? 
+ return true; + return false; + } + + bool is_nodown(int osd) const { + if (test_flag(CEPH_OSDMAP_NODOWN)) + return true; + if (is_nodown_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NODOWN) + return true; + return false; + } + + bool is_noin(int osd) const { + if (test_flag(CEPH_OSDMAP_NOIN)) + return true; + if (is_noin_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOIN) + return true; + return false; + } + + bool is_noout(int osd) const { + if (test_flag(CEPH_OSDMAP_NOOUT)) + return true; + if (is_noout_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOOUT) + return true; + return false; + } + + /** + * check if an entire crush subtree is down + */ + bool subtree_is_down(int id, set<int> *down_cache) const; + bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const; + + bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds, + set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const; + + int identify_osd(const entity_addr_t& addr) const; + int identify_osd(const uuid_d& u) const; + int identify_osd_on_all_channels(const entity_addr_t& addr) const; + + bool have_addr(const entity_addr_t& addr) const { + return identify_osd(addr) >= 0; + } + int find_osd_on_ip(const entity_addr_t& ip) const; + + const entity_addrvec_t& get_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->client_addrs[osd] ? + *osd_addrs->client_addrs[osd] : _blank_addrvec; + } + const entity_addrvec_t& get_most_recent_addrs(int osd) const { + return get_addrs(osd); + } + const entity_addrvec_t &get_cluster_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->cluster_addrs[osd] ? + *osd_addrs->cluster_addrs[osd] : _blank_addrvec; + } + const entity_addrvec_t &get_hb_back_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->hb_back_addrs[osd] ? + *osd_addrs->hb_back_addrs[osd] : _blank_addrvec; + } + const entity_addrvec_t &get_hb_front_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->hb_front_addrs[osd] ? 
+ *osd_addrs->hb_front_addrs[osd] : _blank_addrvec; + } + + const uuid_d& get_uuid(int osd) const { + ceph_assert(exists(osd)); + return (*osd_uuid)[osd]; + } + + const epoch_t& get_up_from(int osd) const { + ceph_assert(exists(osd)); + return osd_info[osd].up_from; + } + const epoch_t& get_up_thru(int osd) const { + ceph_assert(exists(osd)); + return osd_info[osd].up_thru; + } + const epoch_t& get_down_at(int osd) const { + ceph_assert(exists(osd)); + return osd_info[osd].down_at; + } + const osd_info_t& get_info(int osd) const { + ceph_assert(osd < max_osd); + return osd_info[osd]; + } + + const osd_xinfo_t& get_xinfo(int osd) const { + ceph_assert(osd < max_osd); + return osd_xinfo[osd]; + } + + int get_next_up_osd_after(int n) const { + if (get_max_osd() == 0) + return -1; + for (int i = n + 1; i != n; ++i) { + if (i >= get_max_osd()) + i = 0; + if (i == n) + break; + if (is_up(i)) + return i; + } + return -1; + } + + int get_previous_up_osd_before(int n) const { + if (get_max_osd() == 0) + return -1; + for (int i = n - 1; i != n; --i) { + if (i < 0) + i = get_max_osd() - 1; + if (i == n) + break; + if (is_up(i)) + return i; + } + return -1; + } + + + void get_random_up_osds_by_subtree(int n, // whoami + string &subtree, + int limit, // how many + set<int> skip, + set<int> *want) const; + + /** + * get feature bits required by the current structure + * + * @param entity_type [in] what entity type we are asking about + * @param mask [out] set of all possible map-related features we could set + * @return feature bits used by this map + */ + uint64_t get_features(int entity_type, uint64_t *mask) const; + + /** + * get oldest *client* version (firefly, hammer, etc.) that can connect given + * the feature bits required (according to get_features()). + */ + uint8_t get_min_compat_client() const; + + /** + * gets the required minimum *client* version that can connect to the cluster. 
+ */ + uint8_t get_require_min_compat_client() const; + + /** + * get intersection of features supported by up osds + */ + uint64_t get_up_osd_features() const; + + void get_upmap_pgs(vector<pg_t> *upmap_pgs) const; + bool check_pg_upmaps( + CephContext *cct, + const vector<pg_t>& to_check, + vector<pg_t> *to_cancel, + map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const; + void clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc, + const vector<pg_t>& to_cancel, + const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const; + bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const; + + int apply_incremental(const Incremental &inc); + + /// try to re-use/reference addrs in oldmap from newmap + static void dedup(const OSDMap *oldmap, OSDMap *newmap); + + static void clean_temps(CephContext *cct, + const OSDMap& oldmap, + const OSDMap& nextmap, + Incremental *pending_inc); + + // serialize, unserialize +private: + void encode_client_old(bufferlist& bl) const; + void encode_classic(bufferlist& bl, uint64_t features) const; + void decode_classic(bufferlist::const_iterator& p); + void post_decode(); +public: + void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const; + void decode(bufferlist& bl); + void decode(bufferlist::const_iterator& bl); + + + /**** mapping facilities ****/ + int map_to_pg( + int64_t pool, + const string& name, + const string& key, + const string& nspace, + pg_t *pg) const; + int object_locator_to_pg(const object_t& oid, const object_locator_t& loc, + pg_t &pg) const; + pg_t object_locator_to_pg(const object_t& oid, + const object_locator_t& loc) const { + pg_t pg; + int ret = object_locator_to_pg(oid, loc, pg); + ceph_assert(ret == 0); + return pg; + } + + + static object_locator_t file_to_object_locator(const file_layout_t& layout) { + return object_locator_t(layout.pool_id, layout.pool_ns); + } + + ceph_object_layout file_to_object_layout(object_t oid, + file_layout_t& layout) const { + return make_object_layout(oid, layout.pool_id, layout.pool_ns); + } + + ceph_object_layout make_object_layout(object_t oid, int pg_pool, + string nspace) const; + + int get_pg_num(int pg_pool) const + { + const pg_pool_t *pool = get_pg_pool(pg_pool); + ceph_assert(NULL != pool); + return pool->get_pg_num(); + } + + bool pg_exists(pg_t pgid) const { + const pg_pool_t *p = get_pg_pool(pgid.pool()); + return p && pgid.ps() < p->get_pg_num(); + } + + int get_pg_pool_min_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_min_size(); + } + + int get_pg_pool_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_size(); + } + + int get_pg_pool_crush_rule(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_crush_rule(); + } + +private: + /// pg -> (raw osd list) + void _pg_to_raw_osds( + const pg_pool_t& pool, pg_t pg, + vector<int> *osds, + ps_t *ppps) const; + int _pick_primary(const vector<int>& osds) const; + void _remove_nonexistent_osds(const pg_pool_t& pool, vector<int>& osds) const; + + void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool, + vector<int> *osds, int *primary) const; + + /// apply pg_upmap[_items] mappings + void _apply_upmap(const pg_pool_t& pi, pg_t pg, vector<int> *raw) const; + + /// pg -> (up osd 
list) + void _raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw, + vector<int> *up) const; + + + /** + * Get the pg and primary temp, if they are specified. + * @param temp_pg [out] Will be empty or contain the temp PG mapping on return + * @param temp_primary [out] Will be the value in primary_temp, or a value derived + * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary. + */ + void _get_temp_osds(const pg_pool_t& pool, pg_t pg, + vector<int> *temp_pg, int *temp_primary) const; + + /** + * map to up and acting. Fills in whatever fields are non-NULL. + */ + void _pg_to_up_acting_osds(const pg_t& pg, vector<int> *up, int *up_primary, + vector<int> *acting, int *acting_primary, + bool raw_pg_to_pg = true) const; + +public: + /*** + * This is suitable only for looking at raw CRUSH outputs. It skips + * applying the temp and up checks and should not be used + * by anybody for data mapping purposes. + * raw and primary must be non-NULL + */ + void pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const; + void pg_to_raw_upmap(pg_t pg, vector<int> *raw, + vector<int> *raw_upmap) const; + /// map a pg to its acting set. @return acting set size + void pg_to_acting_osds(const pg_t& pg, vector<int> *acting, + int *acting_primary) const { + _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary); + } + void pg_to_acting_osds(pg_t pg, vector<int>& acting) const { + return pg_to_acting_osds(pg, &acting, NULL); + } + /** + * This does not apply temp overrides and should not be used + * by anybody for data mapping purposes. Specify both pointers. + */ + void pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const; + /** + * map a pg to its acting set as well as its up set. You must use + * the acting set for data mapping purposes, but some users will + * also find the up set useful for things like deciding what to + * set as pg_temp. + * Each of these pointers must be non-NULL. 
+ */ + void pg_to_up_acting_osds(pg_t pg, vector<int> *up, int *up_primary, + vector<int> *acting, int *acting_primary) const { + _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary); + } + void pg_to_up_acting_osds(pg_t pg, vector<int>& up, vector<int>& acting) const { + int up_primary, acting_primary; + pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary); + } + bool pg_is_ec(pg_t pg) const { + auto i = pools.find(pg.pool()); + ceph_assert(i != pools.end()); + return i->second.is_erasure(); + } + bool get_primary_shard(const pg_t& pgid, spg_t *out) const { + auto i = get_pools().find(pgid.pool()); + if (i == get_pools().end()) { + return false; + } + if (!i->second.is_erasure()) { + *out = spg_t(pgid); + return true; + } + int primary; + vector<int> acting; + pg_to_acting_osds(pgid, &acting, &primary); + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == primary) { + *out = spg_t(pgid, shard_id_t(i)); + return true; + } + } + return false; + } + bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const { + auto i = get_pools().find(pgid.pool()); + if (i == get_pools().end()) { + return false; + } + vector<int> acting; + pg_to_acting_osds(pgid, &acting, primary); + if (i->second.is_erasure()) { + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == *primary) { + *out = spg_t(pgid, shard_id_t(i)); + return true; + } + } + } else { + *out = spg_t(pgid); + return true; + } + return false; + } + + const mempool::osdmap::map<int64_t,snap_interval_set_t>& + get_removed_snaps_queue() const { + return removed_snaps_queue; + } + const mempool::osdmap::map<int64_t,snap_interval_set_t>& + get_new_removed_snaps() const { + return new_removed_snaps; + } + const mempool::osdmap::map<int64_t,snap_interval_set_t>& + get_new_purged_snaps() const { + return new_purged_snaps; + } + + int64_t lookup_pg_pool_name(const string& name) const { + auto p = name_pool.find(name); + if (p == name_pool.end()) + return -ENOENT; + return p->second; + } + + int64_t get_pool_max() const { + return pool_max; + } + const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const { + return pools; + } + mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() { + return pools; + } + void get_pool_ids_by_rule(int rule_id, set<int64_t> *pool_ids) const { + ceph_assert(pool_ids); + for (auto &p: pools) { + if (p.second.get_crush_rule() == rule_id) { + pool_ids->insert(p.first); + } + } + } + void get_pool_ids_by_osd(CephContext *cct, + int osd, + set<int64_t> *pool_ids) const; + const string& get_pool_name(int64_t p) const { + auto i = pool_name.find(p); + ceph_assert(i != pool_name.end()); + return i->second; + } + const mempool::osdmap::map<int64_t,string>& get_pool_names() const { + return pool_name; + } + bool have_pg_pool(int64_t p) const { + return pools.count(p); + } + const pg_pool_t* get_pg_pool(int64_t p) const { + auto i = pools.find(p); + if (i != pools.end()) + return &i->second; + return NULL; + } + unsigned get_pg_size(pg_t pg) const { + auto p = pools.find(pg.pool()); + ceph_assert(p != pools.end()); + return p->second.get_size(); + } + int get_pg_type(pg_t pg) const { + auto p = pools.find(pg.pool()); + ceph_assert(p != pools.end()); + return p->second.get_type(); + } + + + pg_t raw_pg_to_pg(pg_t pg) const { + auto p = pools.find(pg.pool()); + ceph_assert(p != pools.end()); + return p->second.raw_pg_to_pg(pg); + } + + // pg -> acting primary osd + int get_pg_acting_primary(pg_t pg) const { + int primary = -1; + _pg_to_up_acting_osds(pg, nullptr, 
nullptr, nullptr, &primary); + return primary; + } + + /* + * check whether an spg_t maps to a particular osd + */ + bool is_up_acting_osd_shard(spg_t pg, int osd) const { + vector<int> up, acting; + _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false); + if (pg.shard == shard_id_t::NO_SHARD) { + if (calc_pg_role(osd, acting, acting.size()) >= 0 || + calc_pg_role(osd, up, up.size()) >= 0) + return true; + } else { + if (pg.shard < (int)acting.size() && acting[pg.shard] == osd) + return true; + if (pg.shard < (int)up.size() && up[pg.shard] == osd) + return true; + } + return false; + } + + + /* what replica # is a given osd? 0 primary, -1 for none. */ + static int calc_pg_rank(int osd, const vector<int>& acting, int nrep=0); + static int calc_pg_role(int osd, const vector<int>& acting, int nrep=0); + static bool primary_changed( + int oldprimary, + const vector<int> &oldacting, + int newprimary, + const vector<int> &newacting); + + /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ + int get_pg_acting_rank(pg_t pg, int osd) const { + vector<int> group; + pg_to_acting_osds(pg, group); + return calc_pg_rank(osd, group, group.size()); + } + /* role is -1 (stray), 0 (primary), 1 (replica) */ + int get_pg_acting_role(const pg_t& pg, int osd) const { + vector<int> group; + pg_to_acting_osds(pg, group); + return calc_pg_role(osd, group, group.size()); + } + + bool osd_is_valid_op_target(pg_t pg, int osd) const { + int primary; + vector<int> group; + pg_to_acting_osds(pg, &group, &primary); + if (osd == primary) + return true; + if (pg_is_ec(pg)) + return false; + + return calc_pg_role(osd, group, group.size()) >= 0; + } + + bool try_pg_upmap( + CephContext *cct, + pg_t pg, ///< pg to potentially remap + const set<int>& overfull, ///< osds we'd want to evacuate + const vector<int>& underfull, ///< osds to move to, in order of preference + const vector<int>& more_underfull, ///< less full osds to move to, in order of preference + vector<int> *orig, + vector<int> *out); ///< resulting alternative mapping + + int calc_pg_upmaps( + CephContext *cct, + uint32_t max_deviation, ///< max deviation from target (value >= 1) + int max_iterations, ///< max iterations to run + const set<int64_t>& pools, ///< [optional] restrict to pool + Incremental *pending_inc + ); + + int get_osds_by_bucket_name(const string &name, set<int> *osds) const; + + bool have_pg_upmaps(pg_t pg) const { + return pg_upmap.count(pg) || + pg_upmap_items.count(pg); + } + + /* + * handy helpers to build simple maps... + */ + /** + * Build an OSD map suitable for basic usage. If **num_osd** is >= 0 + * it will be initialized with the specified number of OSDs in a + * single host. If **num_osd** is < 0 the layout of the OSD map will + * be built by reading the content of the configuration file. + * + * @param cct [in] in core ceph context + * @param e [in] initial epoch + * @param fsid [in] id of the cluster + * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0 + * @return **0** on success, negative errno on error. 
+ */ +private: + int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits, + bool default_pool); +public: + int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd) { + return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false); + } + int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits) { + return build_simple_optioned(cct, e, fsid, num_osd, + pg_bits, pgp_bits, true); + } + static int _build_crush_types(CrushWrapper& crush); + static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush, + int num_osd, ostream *ss); + static int build_simple_crush_map_from_conf(CephContext *cct, + CrushWrapper& crush, + ostream *ss); + static int build_simple_crush_rules( + CephContext *cct, CrushWrapper& crush, + const string& root, + ostream *ss); + + bool crush_rule_in_use(int rule_id) const; + + int validate_crush_rules(CrushWrapper *crush, ostream *ss) const; + + void clear_temp() { + pg_temp->clear(); + primary_temp->clear(); + } + +private: + void print_osd_line(int cur, ostream *out, Formatter *f) const; +public: + void print(ostream& out) const; + void print_pools(ostream& out) const; + void print_summary(Formatter *f, ostream& out, const string& prefix, bool extra=false) const; + void print_oneline_summary(ostream& out) const; + + enum { + DUMP_IN = 1, // only 'in' osds + DUMP_OUT = 2, // only 'out' osds + DUMP_UP = 4, // only 'up' osds + DUMP_DOWN = 8, // only 'down' osds + DUMP_DESTROYED = 16, // only 'destroyed' osds + }; + void print_tree(Formatter *f, ostream *out, unsigned dump_flags=0, string bucket="") const; + + int summarize_mapping_stats( + OSDMap *newmap, + const set<int64_t> *pools, + std::string *out, + Formatter *f) const; + + string get_flag_string() const; + static string get_flag_string(unsigned flags); + static void dump_erasure_code_profiles( + const mempool::osdmap::map<string,map<string,string> > &profiles, + Formatter *f); + void dump(Formatter *f) const; + static void generate_test_instances(list<OSDMap*>& o); + bool check_new_blacklist_entries() const { return new_blacklist_entries; } + + void check_health(CephContext *cct, health_check_map_t *checks) const; + + int parse_osd_id_list(const vector<string>& ls, + set<int> *out, + ostream *ss) const; + + float pool_raw_used_rate(int64_t poolid) const; + +}; +WRITE_CLASS_ENCODER_FEATURES(OSDMap) +WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental) + +typedef std::shared_ptr<const OSDMap> OSDMapRef; + +inline ostream& operator<<(ostream& out, const OSDMap& m) { + m.print_oneline_summary(out); + return out; +} + +class PGMap; + +void print_osd_utilization(const OSDMap& osdmap, + const PGMap& pgmap, + ostream& out, + Formatter *f, + bool tree, + const string& class_name, + const string& item_name); + +#endif diff --git a/src/osd/OSDMapMapping.cc b/src/osd/OSDMapMapping.cc new file mode 100644 index 00000000..285b2d8f --- /dev/null +++ b/src/osd/OSDMapMapping.cc @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "OSDMapMapping.h" +#include "OSDMap.h" + +#define dout_subsys ceph_subsys_mon + +#include "common/debug.h" + +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMapMapping, osdmapmapping, + osdmap_mapping); + +// ensure that we have a PoolMappings for each pool and that +// the dimensions (pg_num and size) match up. 
+void OSDMapMapping::_init_mappings(const OSDMap& osdmap) +{ + num_pgs = 0; + auto q = pools.begin(); + for (auto& p : osdmap.get_pools()) { + num_pgs += p.second.get_pg_num(); + // drop unneeded pools + while (q != pools.end() && q->first < p.first) { + q = pools.erase(q); + } + if (q != pools.end() && q->first == p.first) { + if (q->second.pg_num != p.second.get_pg_num() || + q->second.size != p.second.get_size()) { + // pg_num changed + q = pools.erase(q); + } else { + // keep it + ++q; + continue; + } + } + pools.emplace(p.first, PoolMapping(p.second.get_size(), + p.second.get_pg_num(), + p.second.is_erasure())); + } + pools.erase(q, pools.end()); + ceph_assert(pools.size() == osdmap.get_pools().size()); +} + +void OSDMapMapping::update(const OSDMap& osdmap) +{ + _start(osdmap); + for (auto& p : osdmap.get_pools()) { + _update_range(osdmap, p.first, 0, p.second.get_pg_num()); + } + _finish(osdmap); + //_dump(); // for debugging +} + +void OSDMapMapping::update(const OSDMap& osdmap, pg_t pgid) +{ + _update_range(osdmap, pgid.pool(), pgid.ps(), pgid.ps() + 1); +} + +void OSDMapMapping::_build_rmap(const OSDMap& osdmap) +{ + acting_rmap.resize(osdmap.get_max_osd()); + //up_rmap.resize(osdmap.get_max_osd()); + for (auto& v : acting_rmap) { + v.resize(0); + } + //for (auto& v : up_rmap) { + // v.resize(0); + //} + for (auto& p : pools) { + pg_t pgid(0, p.first); + for (unsigned ps = 0; ps < p.second.pg_num; ++ps) { + pgid.set_ps(ps); + int32_t *row = &p.second.table[p.second.row_size() * ps]; + for (int i = 0; i < row[2]; ++i) { + if (row[4 + i] != CRUSH_ITEM_NONE) { + acting_rmap[row[4 + i]].push_back(pgid); + } + } + //for (int i = 0; i < row[3]; ++i) { + //up_rmap[row[4 + p.second.size + i]].push_back(pgid); + //} + } + } +} + +void OSDMapMapping::_finish(const OSDMap& osdmap) +{ + _build_rmap(osdmap); + epoch = osdmap.get_epoch(); +} + +void OSDMapMapping::_dump() +{ + for (auto& p : pools) { + cout << "pool " << p.first << std::endl; + for (unsigned i = 0; i < p.second.table.size(); ++i) { + cout << " " << p.second.table[i]; + if (i % p.second.row_size() == p.second.row_size() - 1) + cout << std::endl; + } + } +} + +void OSDMapMapping::_update_range( + const OSDMap& osdmap, + int64_t pool, + unsigned pg_begin, + unsigned pg_end) +{ + auto i = pools.find(pool); + ceph_assert(i != pools.end()); + ceph_assert(pg_begin <= pg_end); + ceph_assert(pg_end <= i->second.pg_num); + for (unsigned ps = pg_begin; ps < pg_end; ++ps) { + vector<int> up, acting; + int up_primary, acting_primary; + osdmap.pg_to_up_acting_osds( + pg_t(ps, pool), + &up, &up_primary, &acting, &acting_primary); + i->second.set(ps, std::move(up), up_primary, + std::move(acting), acting_primary); + } +} + +// --------------------------- + +void ParallelPGMapper::Job::finish_one() +{ + Context *fin = nullptr; + { + std::lock_guard l(lock); + if (--shards == 0) { + if (!aborted) { + finish = ceph_clock_now(); + complete(); + } + cond.Signal(); + fin = onfinish; + onfinish = nullptr; + } + } + if (fin) { + fin->complete(0); + } +} + +void ParallelPGMapper::WQ::_process(Item *i, ThreadPool::TPHandle &h) +{ + ldout(m->cct, 20) << __func__ << " " << i->job << " pool " << i->pool + << " [" << i->begin << "," << i->end << ")" + << " pgs " << i->pgs + << dendl; + if (!i->pgs.empty()) + i->job->process(i->pgs); + else + i->job->process(i->pool, i->begin, i->end); + i->job->finish_one(); + delete i; +} + +void ParallelPGMapper::queue( + Job *job, + unsigned pgs_per_item, + const vector<pg_t>& input_pgs) +{ + bool any = false; + if 
(!input_pgs.empty()) { + unsigned i = 0; + vector<pg_t> item_pgs; + item_pgs.reserve(pgs_per_item); + for (auto& pg : input_pgs) { + if (i < pgs_per_item) { + ++i; + item_pgs.push_back(pg); + } + if (i >= pgs_per_item) { + job->start_one(); + wq.queue(new Item(job, item_pgs)); + i = 0; + item_pgs.clear(); + any = true; + } + } + if (!item_pgs.empty()) { + job->start_one(); + wq.queue(new Item(job, item_pgs)); + any = true; + } + ceph_assert(any); + return; + } + // no input pgs, load all from map + for (auto& p : job->osdmap->get_pools()) { + for (unsigned ps = 0; ps < p.second.get_pg_num(); ps += pgs_per_item) { + unsigned ps_end = std::min(ps + pgs_per_item, p.second.get_pg_num()); + job->start_one(); + wq.queue(new Item(job, p.first, ps, ps_end)); + ldout(cct, 20) << __func__ << " " << job << " " << p.first << " [" << ps + << "," << ps_end << ")" << dendl; + any = true; + } + } + ceph_assert(any); +} diff --git a/src/osd/OSDMapMapping.h b/src/osd/OSDMapMapping.h new file mode 100644 index 00000000..b0965fc1 --- /dev/null +++ b/src/osd/OSDMapMapping.h @@ -0,0 +1,352 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#ifndef CEPH_OSDMAPMAPPING_H +#define CEPH_OSDMAPMAPPING_H + +#include <vector> +#include <map> + +#include "osd/osd_types.h" +#include "common/WorkQueue.h" +#include "common/Cond.h" + +class OSDMap; + +/// work queue to perform work on batches of pgids on multiple CPUs +class ParallelPGMapper { +public: + struct Job { + utime_t start, finish; + unsigned shards = 0; + const OSDMap *osdmap; + bool aborted = false; + Context *onfinish = nullptr; + + Mutex lock = {"ParallelPGMapper::Job::lock"}; + Cond cond; + + Job(const OSDMap *om) : start(ceph_clock_now()), osdmap(om) {} + virtual ~Job() { + ceph_assert(shards == 0); + } + + // child must implement either form of process + virtual void process(const vector<pg_t>& pgs) = 0; + virtual void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) = 0; + virtual void complete() = 0; + + void set_finish_event(Context *fin) { + lock.Lock(); + if (shards == 0) { + // already done. 
+ lock.Unlock(); + fin->complete(0); + } else { + // set finisher + onfinish = fin; + lock.Unlock(); + } + } + bool is_done() { + std::lock_guard l(lock); + return shards == 0; + } + utime_t get_duration() { + return finish - start; + } + void wait() { + std::lock_guard l(lock); + while (shards > 0) { + cond.Wait(lock); + } + } + bool wait_for(double duration) { + utime_t until = start; + until += duration; + std::lock_guard l(lock); + while (shards > 0) { + if (ceph_clock_now() >= until) { + return false; + } + cond.Wait(lock); + } + return true; + } + void abort() { + Context *fin = nullptr; + { + std::lock_guard l(lock); + aborted = true; + fin = onfinish; + onfinish = nullptr; + while (shards > 0) { + cond.Wait(lock); + } + } + if (fin) { + fin->complete(-ECANCELED); + } + } + + void start_one() { + std::lock_guard l(lock); + ++shards; + } + void finish_one(); + }; + +protected: + CephContext *cct; + + struct Item { + Job *job; + int64_t pool; + unsigned begin, end; + vector<pg_t> pgs; + + Item(Job *j, vector<pg_t> pgs) : job(j), pgs(pgs) {} + Item(Job *j, int64_t p, unsigned b, unsigned e) + : job(j), + pool(p), + begin(b), + end(e) {} + }; + std::deque<Item*> q; + + struct WQ : public ThreadPool::WorkQueue<Item> { + ParallelPGMapper *m; + + WQ(ParallelPGMapper *m_, ThreadPool *tp) + : ThreadPool::WorkQueue<Item>("ParallelPGMapper::WQ", 0, 0, tp), + m(m_) {} + + bool _enqueue(Item *i) override { + m->q.push_back(i); + return true; + } + void _dequeue(Item *i) override { + ceph_abort(); + } + Item *_dequeue() override { + while (!m->q.empty()) { + Item *i = m->q.front(); + m->q.pop_front(); + if (i->job->aborted) { + i->job->finish_one(); + delete i; + } else { + return i; + } + } + return nullptr; + } + + void _process(Item *i, ThreadPool::TPHandle &h) override; + + void _clear() override { + ceph_assert(_empty()); + } + + bool _empty() override { + return m->q.empty(); + } + } wq; + +public: + ParallelPGMapper(CephContext *cct, ThreadPool *tp) + : cct(cct), + wq(this, tp) {} + + void queue( + Job *job, + unsigned pgs_per_item, + const vector<pg_t>& input_pgs); + + void drain() { + wq.drain(); + } +}; + + +/// a precalculated mapping of every PG for a given OSDMap +class OSDMapMapping { +public: + MEMPOOL_CLASS_HELPERS(); +private: + + struct PoolMapping { + MEMPOOL_CLASS_HELPERS(); + + unsigned size = 0; + unsigned pg_num = 0; + bool erasure = false; + mempool::osdmap_mapping::vector<int32_t> table; + + size_t row_size() const { + return + 1 + // acting_primary + 1 + // up_primary + 1 + // num acting + 1 + // num up + size + // acting + size; // up + } + + PoolMapping(int s, int p, bool e) + : size(s), + pg_num(p), + erasure(e), + table(pg_num * row_size()) { + } + + void get(size_t ps, + std::vector<int> *up, + int *up_primary, + std::vector<int> *acting, + int *acting_primary) const { + const int32_t *row = &table[row_size() * ps]; + if (acting_primary) { + *acting_primary = row[0]; + } + if (up_primary) { + *up_primary = row[1]; + } + if (acting) { + acting->resize(row[2]); + for (int i = 0; i < row[2]; ++i) { + (*acting)[i] = row[4 + i]; + } + } + if (up) { + up->resize(row[3]); + for (int i = 0; i < row[3]; ++i) { + (*up)[i] = row[4 + size + i]; + } + } + } + + void set(size_t ps, + const std::vector<int>& up, + int up_primary, + const std::vector<int>& acting, + int acting_primary) { + int32_t *row = &table[row_size() * ps]; + row[0] = acting_primary; + row[1] = up_primary; + // these should always be <= the pool size, but just in case, avoid + // blowing out the array. 
Note that our mapping is not completely + // accurate in this case--this is just to avoid crashing. + row[2] = std::min<int32_t>(acting.size(), size); + row[3] = std::min<int32_t>(up.size(), size); + for (int i = 0; i < row[2]; ++i) { + row[4 + i] = acting[i]; + } + for (int i = 0; i < row[3]; ++i) { + row[4 + size + i] = up[i]; + } + } + }; + + mempool::osdmap_mapping::map<int64_t,PoolMapping> pools; + mempool::osdmap_mapping::vector< + mempool::osdmap_mapping::vector<pg_t>> acting_rmap; // osd -> pg + //unused: mempool::osdmap_mapping::vector<std::vector<pg_t>> up_rmap; // osd -> pg + epoch_t epoch = 0; + uint64_t num_pgs = 0; + + void _init_mappings(const OSDMap& osdmap); + void _update_range( + const OSDMap& map, + int64_t pool, + unsigned pg_begin, unsigned pg_end); + + void _build_rmap(const OSDMap& osdmap); + + void _start(const OSDMap& osdmap) { + _init_mappings(osdmap); + } + void _finish(const OSDMap& osdmap); + + void _dump(); + + friend class ParallelPGMapper; + + struct MappingJob : public ParallelPGMapper::Job { + OSDMapMapping *mapping; + MappingJob(const OSDMap *osdmap, OSDMapMapping *m) + : Job(osdmap), mapping(m) { + mapping->_start(*osdmap); + } + void process(const vector<pg_t>& pgs) override {} + void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override { + mapping->_update_range(*osdmap, pool, ps_begin, ps_end); + } + void complete() override { + mapping->_finish(*osdmap); + } + }; + +public: + void get(pg_t pgid, + std::vector<int> *up, + int *up_primary, + std::vector<int> *acting, + int *acting_primary) const { + auto p = pools.find(pgid.pool()); + ceph_assert(p != pools.end()); + ceph_assert(pgid.ps() < p->second.pg_num); + p->second.get(pgid.ps(), up, up_primary, acting, acting_primary); + } + + bool get_primary_and_shard(pg_t pgid, + int *acting_primary, + spg_t *spgid) { + auto p = pools.find(pgid.pool()); + ceph_assert(p != pools.end()); + ceph_assert(pgid.ps() < p->second.pg_num); + vector<int> acting; + p->second.get(pgid.ps(), nullptr, nullptr, &acting, acting_primary); + if (p->second.erasure) { + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == *acting_primary) { + *spgid = spg_t(pgid, shard_id_t(i)); + return true; + } + } + return false; + } else { + *spgid = spg_t(pgid); + return true; + } + } + + const mempool::osdmap_mapping::vector<pg_t>& get_osd_acting_pgs(unsigned osd) { + ceph_assert(osd < acting_rmap.size()); + return acting_rmap[osd]; + } + + void update(const OSDMap& map); + void update(const OSDMap& map, pg_t pgid); + + std::unique_ptr<MappingJob> start_update( + const OSDMap& map, + ParallelPGMapper& mapper, + unsigned pgs_per_item) { + std::unique_ptr<MappingJob> job(new MappingJob(&map, this)); + mapper.queue(job.get(), pgs_per_item, {}); + return job; + } + + epoch_t get_epoch() const { + return epoch; + } + + uint64_t get_num_pgs() const { + return num_pgs; + } +}; + + +#endif diff --git a/src/osd/ObjectVersioner.h b/src/osd/ObjectVersioner.h new file mode 100644 index 00000000..f7d75633 --- /dev/null +++ b/src/osd/ObjectVersioner.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#ifndef CEPH_OSD_OBJECTVERSIONER_H
+#define CEPH_OSD_OBJECTVERSIONER_H
+
+class ObjectVersioner {
+ public:
+ pobject_t oid;
+
+ void get_versions(list<version_t>& ls);
+ version_t head(); // newest
+ version_t committed(); // last committed
+ version_t tail(); // oldest
+
+ /*
+ * prepare a new version, starting with "raw" transaction t.
+ */
+ void prepare(ObjectStore::Transaction& t, version_t v);
+ void rollback_to(version_t v);
+ void commit_to(version_t v);
+};
+
+#endif
diff --git a/src/osd/OpQueueItem.cc b/src/osd/OpQueueItem.cc
new file mode 100644
index 00000000..1deb1e7a
--- /dev/null
+++ b/src/osd/OpQueueItem.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "OpQueueItem.h"
+#include "OSD.h"
+
+void PGOpItem::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_op(pg, op, handle);
+ pg->unlock();
+}
+
+void PGPeeringItem::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
+}
+
+void PGSnapTrim::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ pg->snap_trimmer(epoch_queued);
+ pg->unlock();
+}
+
+void PGScrub::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ pg->scrub(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGRecovery::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, handle);
+ pg->unlock();
+}
+
+void PGRecoveryContext::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ c.release()->complete(handle);
+ pg->unlock();
+}
+
+void PGDelete::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_delete(sdata, pg.get(), epoch_queued, handle);
+}
diff --git a/src/osd/OpQueueItem.h b/src/osd/OpQueueItem.h
new file mode 100644
index 00000000..558c5c88
--- /dev/null
+++ b/src/osd/OpQueueItem.h
@@ -0,0 +1,342 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#pragma once + +#include <ostream> + +#include "include/types.h" +#include "include/utime.h" +#include "osd/OpRequest.h" +#include "osd/PG.h" +#include "PGPeeringEvent.h" + +class OSD; +class OSDShard; + +class OpQueueItem { +public: + class OrderLocker { + public: + using Ref = unique_ptr<OrderLocker>; + virtual void lock() = 0; + virtual void unlock() = 0; + virtual ~OrderLocker() {} + }; + // Abstraction for operations queueable in the op queue + class OpQueueable { + public: + enum class op_type_t { + client_op, + peering_event, + bg_snaptrim, + bg_recovery, + bg_scrub, + bg_pg_delete + }; + using Ref = std::unique_ptr<OpQueueable>; + + /// Items with the same queue token will end up in the same shard + virtual uint32_t get_queue_token() const = 0; + + /* Items will be dequeued and locked atomically w.r.t. other items with the + * same ordering token */ + virtual const spg_t& get_ordering_token() const = 0; + virtual OrderLocker::Ref get_order_locker(PGRef pg) = 0; + virtual op_type_t get_op_type() const = 0; + virtual boost::optional<OpRequestRef> maybe_get_op() const { + return boost::none; + } + + virtual uint64_t get_reserved_pushes() const { + return 0; + } + + virtual bool is_peering() const { + return false; + } + virtual bool peering_requires_pg() const { + ceph_abort(); + } + virtual const PGCreateInfo *creates_pg() const { + return nullptr; + } + + virtual ostream &print(ostream &rhs) const = 0; + + virtual void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) = 0; + virtual ~OpQueueable() {} + friend ostream& operator<<(ostream& out, const OpQueueable& q) { + return q.print(out); + } + + }; + +private: + OpQueueable::Ref qitem; + int cost; + unsigned priority; + utime_t start_time; + uint64_t owner; ///< global id (e.g., client.XXX) + epoch_t map_epoch; ///< an epoch we expect the PG to exist in + +public: + OpQueueItem( + OpQueueable::Ref &&item, + int cost, + unsigned priority, + utime_t start_time, + uint64_t owner, + epoch_t e) + : qitem(std::move(item)), + cost(cost), + priority(priority), + start_time(start_time), + owner(owner), + map_epoch(e) + {} + OpQueueItem(OpQueueItem &&) = default; + OpQueueItem(const OpQueueItem &) = delete; + OpQueueItem &operator=(OpQueueItem &&) = default; + OpQueueItem &operator=(const OpQueueItem &) = delete; + + OrderLocker::Ref get_order_locker(PGRef pg) { + return qitem->get_order_locker(pg); + } + uint32_t get_queue_token() const { + return qitem->get_queue_token(); + } + const spg_t& get_ordering_token() const { + return qitem->get_ordering_token(); + } + using op_type_t = OpQueueable::op_type_t; + OpQueueable::op_type_t get_op_type() const { + return qitem->get_op_type(); + } + boost::optional<OpRequestRef> maybe_get_op() const { + return qitem->maybe_get_op(); + } + uint64_t get_reserved_pushes() const { + return qitem->get_reserved_pushes(); + } + void run(OSD *osd, OSDShard *sdata,PGRef& pg, ThreadPool::TPHandle &handle) { + qitem->run(osd, sdata, pg, handle); + } + unsigned get_priority() const { return priority; } + int get_cost() const { return cost; } + utime_t get_start_time() const { return start_time; } + uint64_t get_owner() const { return owner; } + epoch_t get_map_epoch() const { return map_epoch; } + + bool is_peering() const { + return qitem->is_peering(); + } + + const PGCreateInfo *creates_pg() const { + return qitem->creates_pg(); + } + + bool peering_requires_pg() const { + return qitem->peering_requires_pg(); + } + + friend ostream& operator<<(ostream& out, const OpQueueItem& 
item) { + out << "OpQueueItem(" + << item.get_ordering_token() << " " << *item.qitem + << " prio " << item.get_priority() + << " cost " << item.get_cost() + << " e" << item.get_map_epoch(); + if (item.get_reserved_pushes()) { + out << " reserved_pushes " << item.get_reserved_pushes(); + } + return out << ")"; + } +}; // class OpQueueItem + +/// Implements boilerplate for operations queued for the pg lock +class PGOpQueueable : public OpQueueItem::OpQueueable { + spg_t pgid; +protected: + const spg_t& get_pgid() const { + return pgid; + } +public: + explicit PGOpQueueable(spg_t pg) : pgid(pg) {} + uint32_t get_queue_token() const override final { + return get_pgid().ps(); + } + + const spg_t& get_ordering_token() const override final { + return get_pgid(); + } + + OpQueueItem::OrderLocker::Ref get_order_locker(PGRef pg) override final { + class Locker : public OpQueueItem::OrderLocker { + PGRef pg; + public: + explicit Locker(PGRef pg) : pg(pg) {} + void lock() override final { + pg->lock(); + } + void unlock() override final { + pg->unlock(); + } + }; + return OpQueueItem::OrderLocker::Ref( + new Locker(pg)); + } +}; + +class PGOpItem : public PGOpQueueable { + OpRequestRef op; +public: + PGOpItem(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {} + op_type_t get_op_type() const override final { + return op_type_t::client_op; + } + ostream &print(ostream &rhs) const override final { + return rhs << "PGOpItem(op=" << *(op->get_req()) << ")"; + } + boost::optional<OpRequestRef> maybe_get_op() const override final { + return op; + } + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) override final; +}; + +class PGPeeringItem : public PGOpQueueable { + PGPeeringEventRef evt; +public: + PGPeeringItem(spg_t pg, PGPeeringEventRef e) : PGOpQueueable(pg), evt(e) {} + op_type_t get_op_type() const override final { + return op_type_t::peering_event; + } + ostream &print(ostream &rhs) const override final { + return rhs << "PGPeeringEvent(" << evt->get_desc() << ")"; + } + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) override final; + bool is_peering() const override { + return true; + } + bool peering_requires_pg() const override { + return evt->requires_pg; + } + const PGCreateInfo *creates_pg() const override { + return evt->create_info.get(); + } +}; + +class PGSnapTrim : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGSnapTrim( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), epoch_queued(epoch_queued) {} + op_type_t get_op_type() const override final { + return op_type_t::bg_snaptrim; + } + ostream &print(ostream &rhs) const override final { + return rhs << "PGSnapTrim(pgid=" << get_pgid() + << "epoch_queued=" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) override final; +}; + +class PGScrub : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGScrub( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), epoch_queued(epoch_queued) {} + op_type_t get_op_type() const override final { + return op_type_t::bg_scrub; + } + ostream &print(ostream &rhs) const override final { + return rhs << "PGScrub(pgid=" << get_pgid() + << "epoch_queued=" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) override final; +}; + +class PGRecovery : public PGOpQueueable { + epoch_t epoch_queued; + uint64_t reserved_pushes; +public: + PGRecovery( + spg_t pg, + epoch_t epoch_queued, + 
uint64_t reserved_pushes) + : PGOpQueueable(pg), + epoch_queued(epoch_queued), + reserved_pushes(reserved_pushes) {} + op_type_t get_op_type() const override final { + return op_type_t::bg_recovery; + } + virtual ostream &print(ostream &rhs) const override final { + return rhs << "PGRecovery(pgid=" << get_pgid() + << "epoch_queued=" << epoch_queued + << "reserved_pushes=" << reserved_pushes + << ")"; + } + virtual uint64_t get_reserved_pushes() const override final { + return reserved_pushes; + } + virtual void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) override final; +}; + +class PGRecoveryContext : public PGOpQueueable { + unique_ptr<GenContext<ThreadPool::TPHandle&>> c; + epoch_t epoch; +public: + PGRecoveryContext(spg_t pgid, + GenContext<ThreadPool::TPHandle&> *c, epoch_t epoch) + : PGOpQueueable(pgid), + c(c), epoch(epoch) {} + op_type_t get_op_type() const override final { + return op_type_t::bg_recovery; + } + ostream &print(ostream &rhs) const override final { + return rhs << "PGRecoveryContext(pgid=" << get_pgid() + << " c=" << c.get() << " epoch=" << epoch + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) override final; +}; + +class PGDelete : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGDelete( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), + epoch_queued(epoch_queued) {} + op_type_t get_op_type() const override final { + return op_type_t::bg_pg_delete; + } + ostream &print(ostream &rhs) const override final { + return rhs << "PGDelete(" << get_pgid() + << " e" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) override final; +}; diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc new file mode 100644 index 00000000..35ff7f28 --- /dev/null +++ b/src/osd/OpRequest.cc @@ -0,0 +1,195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "OpRequest.h" +#include "common/Formatter.h" +#include <iostream> +#include <vector> +#include "common/debug.h" +#include "common/config.h" +#include "msg/Message.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "include/ceph_assert.h" +#include "osd/osd_types.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/oprequest.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) 
+#endif + +OpRequest::OpRequest(Message *req, OpTracker *tracker) : + TrackedOp(tracker, req->get_recv_stamp()), + rmw_flags(0), request(req), + hit_flag_points(0), latest_flag_point(0), + hitset_inserted(false) +{ + if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) { + // don't warn as quickly for low priority ops + warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple; + } + if (req->get_type() == CEPH_MSG_OSD_OP) { + reqid = static_cast<MOSDOp*>(req)->get_reqid(); + } else if (req->get_type() == MSG_OSD_REPOP) { + reqid = static_cast<MOSDRepOp*>(req)->reqid; + } else if (req->get_type() == MSG_OSD_REPOPREPLY) { + reqid = static_cast<MOSDRepOpReply*>(req)->reqid; + } + req_src_inst = req->get_source_inst(); +} + +void OpRequest::_dump(Formatter *f) const +{ + Message *m = request; + f->dump_string("flag_point", state_string()); + if (m->get_orig_source().is_client()) { + f->open_object_section("client_info"); + stringstream client_name, client_addr; + client_name << req_src_inst.name; + client_addr << req_src_inst.addr; + f->dump_string("client", client_name.str()); + f->dump_string("client_addr", client_addr.str()); + f->dump_unsigned("tid", m->get_tid()); + f->close_section(); // client_info + } + { + f->open_array_section("events"); + std::lock_guard l(lock); + for (auto& i : events) { + f->dump_object("event", i); + } + f->close_section(); + } +} + +void OpRequest::_dump_op_descriptor_unlocked(ostream& stream) const +{ + get_req()->print(stream); +} + +void OpRequest::_unregistered() { + request->clear_data(); + request->clear_payload(); + request->release_message_throttle(); + request->set_connection(nullptr); +} + +bool OpRequest::check_rmw(int flag) const { + ceph_assert(rmw_flags != 0); + return rmw_flags & flag; +} +bool OpRequest::may_read() const { + return need_read_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_READ); +} +bool OpRequest::may_write() const { + return need_write_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_WRITE); +} +bool OpRequest::may_cache() const { return check_rmw(CEPH_OSD_RMW_FLAG_CACHE); } +bool OpRequest::rwordered_forced() const { + return check_rmw(CEPH_OSD_RMW_FLAG_RWORDERED); +} +bool OpRequest::rwordered() const { + return may_write() || may_cache() || rwordered_forced(); +} + +bool OpRequest::includes_pg_op() { return check_rmw(CEPH_OSD_RMW_FLAG_PGOP); } +bool OpRequest::need_read_cap() const { + return check_rmw(CEPH_OSD_RMW_FLAG_READ); +} +bool OpRequest::need_write_cap() const { + return check_rmw(CEPH_OSD_RMW_FLAG_WRITE); +} +bool OpRequest::need_promote() { + return check_rmw(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE); +} +bool OpRequest::need_skip_handle_cache() { + return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE); +} +bool OpRequest::need_skip_promote() { + return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE); +} + +void OpRequest::set_rmw_flags(int flags) { +#ifdef WITH_LTTNG + int old_rmw_flags = rmw_flags; +#endif + rmw_flags |= flags; + tracepoint(oprequest, set_rmw_flags, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc, + flags, old_rmw_flags, rmw_flags); +} + +void OpRequest::set_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_READ); } +void OpRequest::set_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_WRITE); } +void OpRequest::set_class_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_READ); } +void OpRequest::set_class_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_WRITE); } +void OpRequest::set_pg_op() { set_rmw_flags(CEPH_OSD_RMW_FLAG_PGOP); } +void OpRequest::set_cache() { 
set_rmw_flags(CEPH_OSD_RMW_FLAG_CACHE); } +void OpRequest::set_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE); } +void OpRequest::set_skip_handle_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE); } +void OpRequest::set_skip_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE); } +void OpRequest::set_force_rwordered() { set_rmw_flags(CEPH_OSD_RMW_FLAG_RWORDERED); } + +void OpRequest::mark_flag_point(uint8_t flag, const char *s) { +#ifdef WITH_LTTNG + uint8_t old_flags = hit_flag_points; +#endif + mark_event(s); + hit_flag_points |= flag; + latest_flag_point = flag; + tracepoint(oprequest, mark_flag_point, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc, rmw_flags, + flag, s, old_flags, hit_flag_points); +} + +void OpRequest::mark_flag_point_string(uint8_t flag, const string& s) { +#ifdef WITH_LTTNG + uint8_t old_flags = hit_flag_points; +#endif + mark_event(s); + hit_flag_points |= flag; + latest_flag_point = flag; + tracepoint(oprequest, mark_flag_point, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc, rmw_flags, + flag, s.c_str(), old_flags, hit_flag_points); +} + +bool OpRequest::filter_out(const set<string>& filters) +{ + set<entity_addr_t> addrs; + for (auto it = filters.begin(); it != filters.end(); it++) { + entity_addr_t addr; + if (addr.parse((*it).c_str())) { + addrs.insert(addr); + } + } + if (addrs.empty()) + return true; + + entity_addr_t cmp_addr = req_src_inst.addr; + if (addrs.count(cmp_addr)) { + return true; + } + cmp_addr.set_nonce(0); + if (addrs.count(cmp_addr)) { + return true; + } + cmp_addr.set_port(0); + if (addrs.count(cmp_addr)) { + return true; + } + + return false; +} + +ostream& operator<<(ostream& out, const OpRequest::ClassInfo& i) +{ + out << "class " << i.class_name << " method " << i.method_name + << " rd " << i.read << " wr " << i.write << " wl " << i.whitelisted; + return out; +} diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h new file mode 100644 index 00000000..184d26ac --- /dev/null +++ b/src/osd/OpRequest.h @@ -0,0 +1,179 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 New Dream Network/Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef OPREQUEST_H_ +#define OPREQUEST_H_ + +#include "osd/osd_types.h" +#include "common/TrackedOp.h" + +/** + * The OpRequest takes in a Message* and takes over a single reference + * to it, which it puts() when destroyed. 
+ */ +struct OpRequest : public TrackedOp { + friend class OpTracker; + + // rmw flags + int rmw_flags; + + bool check_rmw(int flag) const ; + bool may_read() const; + bool may_write() const; + bool may_cache() const; + bool rwordered_forced() const; + bool rwordered() const; + bool includes_pg_op(); + bool need_read_cap() const; + bool need_write_cap() const; + bool need_promote(); + bool need_skip_handle_cache(); + bool need_skip_promote(); + void set_read(); + void set_write(); + void set_cache(); + void set_class_read(); + void set_class_write(); + void set_pg_op(); + void set_promote(); + void set_skip_handle_cache(); + void set_skip_promote(); + void set_force_rwordered(); + + struct ClassInfo { + ClassInfo(std::string&& class_name, std::string&& method_name, + bool read, bool write, bool whitelisted) : + class_name(std::move(class_name)), method_name(std::move(method_name)), + read(read), write(write), whitelisted(whitelisted) + {} + const std::string class_name; + const std::string method_name; + const bool read, write, whitelisted; + }; + + void add_class(std::string&& class_name, std::string&& method_name, + bool read, bool write, bool whitelisted) { + classes_.emplace_back(std::move(class_name), std::move(method_name), + read, write, whitelisted); + } + + std::vector<ClassInfo> classes() const { + return classes_; + } + + void _dump(Formatter *f) const override; + + bool has_feature(uint64_t f) const { + return request->get_connection()->has_feature(f); + } + +private: + Message *request; /// the logical request we are tracking + osd_reqid_t reqid; + entity_inst_t req_src_inst; + uint8_t hit_flag_points; + uint8_t latest_flag_point; + utime_t dequeued_time; + static const uint8_t flag_queued_for_pg=1 << 0; + static const uint8_t flag_reached_pg = 1 << 1; + static const uint8_t flag_delayed = 1 << 2; + static const uint8_t flag_started = 1 << 3; + static const uint8_t flag_sub_op_sent = 1 << 4; + static const uint8_t flag_commit_sent = 1 << 5; + + std::vector<ClassInfo> classes_; + + OpRequest(Message *req, OpTracker *tracker); + +protected: + void _dump_op_descriptor_unlocked(ostream& stream) const override; + void _unregistered() override; + bool filter_out(const set<string>& filters) override; + +public: + ~OpRequest() override { + request->put(); + } + + bool check_send_map = true; ///< true until we check if sender needs a map + epoch_t sent_epoch = 0; ///< client's map epoch + epoch_t min_epoch = 0; ///< min epoch needed to handle this msg + + bool hitset_inserted; + const Message *get_req() const { return request; } + Message *get_nonconst_req() { return request; } + + entity_name_t get_source() { + if (request) { + return request->get_source(); + } else { + return entity_name_t(); + } + } + + std::string_view state_string() const override { + switch(latest_flag_point) { + case flag_queued_for_pg: return "queued for pg"; + case flag_reached_pg: return "reached pg"; + case flag_delayed: return "delayed"; + case flag_started: return "started"; + case flag_sub_op_sent: return "waiting for sub ops"; + case flag_commit_sent: return "commit sent; apply or cleanup"; + default: break; + } + return "no flag points reached"; + } + + void mark_queued_for_pg() { + mark_flag_point(flag_queued_for_pg, "queued_for_pg"); + } + void mark_reached_pg() { + mark_flag_point(flag_reached_pg, "reached_pg"); + } + void mark_delayed(const string& s) { + mark_flag_point_string(flag_delayed, s); + } + void mark_started() { + mark_flag_point(flag_started, "started"); + } + void mark_sub_op_sent(const 
string& s) { + mark_flag_point_string(flag_sub_op_sent, s); + } + void mark_commit_sent() { + mark_flag_point(flag_commit_sent, "commit_sent"); + } + + utime_t get_dequeued_time() const { + return dequeued_time; + } + void set_dequeued_time(utime_t deq_time) { + dequeued_time = deq_time; + } + + osd_reqid_t get_reqid() const { + return reqid; + } + + typedef boost::intrusive_ptr<OpRequest> Ref; + +private: + void set_rmw_flags(int flags); + void mark_flag_point(uint8_t flag, const char *s); + void mark_flag_point_string(uint8_t flag, const string& s); +}; + +typedef OpRequest::Ref OpRequestRef; + +ostream& operator<<(ostream& out, const OpRequest::ClassInfo& i); + +#endif /* OPREQUEST_H_ */ diff --git a/src/osd/PG.cc b/src/osd/PG.cc new file mode 100644 index 00000000..fda8d569 --- /dev/null +++ b/src/osd/PG.cc @@ -0,0 +1,10082 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "PG.h" +// #include "msg/Messenger.h" +#include "messages/MOSDRepScrub.h" +// #include "common/cmdparse.h" +// #include "common/ceph_context.h" + +#include "common/errno.h" +#include "common/config.h" +#include "OSD.h" +#include "OpRequest.h" +#include "ScrubStore.h" +#include "Session.h" + +#include "common/Timer.h" +#include "common/perf_counters.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDPGNotify.h" +// #include "messages/MOSDPGLog.h" +#include "messages/MOSDPGRemove.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGTrim.h" +#include "messages/MOSDPGScan.h" +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MBackfillReserve.h" +#include "messages/MRecoveryReserve.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDECSubOpWrite.h" +#include "messages/MOSDECSubOpWriteReply.h" +#include "messages/MOSDECSubOpRead.h" +#include "messages/MOSDECSubOpReadReply.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDScrubReserve.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDRepScrubMap.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +#include "common/BackTrace.h" +#include "common/EventTrace.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/pg.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) 
+#endif + +#include <sstream> + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can +// easily skip them +const string infover_key("_infover"); +const string info_key("_info"); +const string biginfo_key("_biginfo"); +const string epoch_key("_epoch"); +const string fastinfo_key("_fastinfo"); + +template <class T> +static ostream& _prefix(std::ostream *_dout, T *t) +{ + return t->gen_prefix(*_dout); +} + +void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state) +{ + // Ignore trimming state machine for now + if (::strstr(state, "Trimming") != NULL) { + return; + } else if (pi != nullptr) { + pi->enter_state(entime, state); + } else { + // Store current state since we can't reliably take the PG lock here + if ( tmppi == nullptr) { + tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance); + } + + thispg = pg; + tmppi->enter_state(entime, state); + } +} + +void PGStateHistory::exit(const char* state) { + // Ignore trimming state machine for now + // Do nothing if PG is being destroyed! + if (::strstr(state, "Trimming") != NULL || pg_in_destructor) { + return; + } else { + bool ilocked = false; + if(!thispg->is_locked()) { + thispg->lock(); + ilocked = true; + } + if (pi == nullptr) { + buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release())); + pi = buffer.back().get(); + pi->setepoch(thispg->get_osdmap_epoch()); + } + + pi->exit_state(ceph_clock_now()); + if (::strcmp(state, "Reset") == 0) { + this->reset(); + } + if(ilocked) { + thispg->unlock(); + } + } +} + +void PGStateHistory::dump(Formatter* f) const { + f->open_array_section("history"); + for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) { + f->open_object_section("states"); + f->dump_stream("epoch") << (*pi)->this_epoch; + for (auto she : (*pi)->state_history) { + f->dump_string("state", std::get<2>(she)); + f->dump_stream("enter") << std::get<0>(she); + f->dump_stream("exit") << std::get<1>(she); + } + f->close_section(); + } + f->close_section(); +} + +void PG::get(const char* tag) +{ + int after = ++ref; + lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " + << "tag " << (tag ? tag : "(none") << " " + << (after - 1) << " -> " << after << dendl; +#ifdef PG_DEBUG_REFS + std::lock_guard l(_ref_id_lock); + _tag_counts[tag]++; +#endif +} + +void PG::put(const char* tag) +{ +#ifdef PG_DEBUG_REFS + { + std::lock_guard l(_ref_id_lock); + auto tag_counts_entry = _tag_counts.find(tag); + ceph_assert(tag_counts_entry != _tag_counts.end()); + --tag_counts_entry->second; + if (tag_counts_entry->second == 0) { + _tag_counts.erase(tag_counts_entry); + } + } +#endif + auto local_cct = cct; + int after = --ref; + lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " " + << "tag " << (tag ? 
tag : "(none") << " " + << (after + 1) << " -> " << after + << dendl; + if (after == 0) + delete this; +} + +#ifdef PG_DEBUG_REFS +uint64_t PG::get_with_id() +{ + ref++; + std::lock_guard l(_ref_id_lock); + uint64_t id = ++_ref_id; + BackTrace bt(0); + stringstream ss; + bt.print(ss); + lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid + << " got id " << id << " " + << (ref - 1) << " -> " << ref + << dendl; + ceph_assert(!_live_ids.count(id)); + _live_ids.insert(make_pair(id, ss.str())); + return id; +} + +void PG::put_with_id(uint64_t id) +{ + int newref = --ref; + lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid + << " put id " << id << " " + << (newref + 1) << " -> " << newref + << dendl; + { + std::lock_guard l(_ref_id_lock); + ceph_assert(_live_ids.count(id)); + _live_ids.erase(id); + } + if (newref) + delete this; +} + +void PG::dump_live_ids() +{ + std::lock_guard l(_ref_id_lock); + dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl; + for (map<uint64_t, string>::iterator i = _live_ids.begin(); + i != _live_ids.end(); + ++i) { + dout(0) << "\t\tid: " << *i << dendl; + } + dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl; + for (map<string, uint64_t>::iterator i = _tag_counts.begin(); + i != _tag_counts.end(); + ++i) { + dout(0) << "\t\tid: " << *i << dendl; + } +} +#endif + + +void PGPool::update(CephContext *cct, OSDMapRef map) +{ + const pg_pool_t *pi = map->get_pg_pool(id); + if (!pi) { + return; // pool has been deleted + } + info = *pi; + name = map->get_pool_name(id); + + bool updated = false; + if ((map->get_epoch() != cached_epoch + 1) || + (pi->get_snap_epoch() == map->get_epoch())) { + updated = true; + } + + if (map->require_osd_release >= CEPH_RELEASE_MIMIC) { + // mimic tracks removed_snaps_queue in the OSDmap and purged_snaps + // in the pg_info_t, with deltas for both in each OSDMap. we don't + // need to (and can't) track it here. + cached_removed_snaps.clear(); + newly_removed_snaps.clear(); + } else { + // legacy (<= luminous) removed_snaps tracking + if (updated) { + if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) { + pi->build_removed_snaps(newly_removed_snaps); + if (cached_removed_snaps.subset_of(newly_removed_snaps)) { + interval_set<snapid_t> removed_snaps = newly_removed_snaps; + newly_removed_snaps.subtract(cached_removed_snaps); + cached_removed_snaps.swap(removed_snaps); + } else { + lgeneric_subdout(cct, osd, 0) << __func__ + << " cached_removed_snaps shrank from " << cached_removed_snaps + << " to " << newly_removed_snaps << dendl; + cached_removed_snaps.swap(newly_removed_snaps); + newly_removed_snaps.clear(); + } + } else { + newly_removed_snaps.clear(); + } + } else { + /* 1) map->get_epoch() == cached_epoch + 1 && + * 2) pi->get_snap_epoch() != map->get_epoch() + * + * From the if branch, 1 && 2 must be true. From 2, we know that + * this map didn't change the set of removed snaps. From 1, we + * know that our cached_removed_snaps matches the previous map. + * Thus, from 1 && 2, cached_removed snaps matches the current + * set of removed snaps and all we have to do is clear + * newly_removed_snaps. + */ + newly_removed_snaps.clear(); + } + lgeneric_subdout(cct, osd, 20) + << "PGPool::update cached_removed_snaps " + << cached_removed_snaps + << " newly_removed_snaps " + << newly_removed_snaps + << " snapc " << snapc + << (updated ? 
" (updated)":" (no change)") + << dendl; + if (cct->_conf->osd_debug_verify_cached_snaps) { + interval_set<snapid_t> actual_removed_snaps; + pi->build_removed_snaps(actual_removed_snaps); + if (!(actual_removed_snaps == cached_removed_snaps)) { + lgeneric_derr(cct) << __func__ + << ": mismatch between the actual removed snaps " + << actual_removed_snaps + << " and pool.cached_removed_snaps " + << " pool.cached_removed_snaps " << cached_removed_snaps + << dendl; + } + ceph_assert(actual_removed_snaps == cached_removed_snaps); + } + } + if (info.is_pool_snaps_mode() && updated) { + snapc = pi->get_snap_context(); + } + cached_epoch = map->get_epoch(); +} + +PG::PG(OSDService *o, OSDMapRef curmap, + const PGPool &_pool, spg_t p) : + pg_id(p), + coll(p), + osd(o), + cct(o->cct), + osdmap_ref(curmap), + pool(_pool), + osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()), + snap_mapper( + cct, + &osdriver, + p.ps(), + p.get_split_bits(_pool.info.get_pg_num()), + _pool.id, + p.shard), + last_persisted_osdmap(curmap->get_epoch()), + deleting(false), + trace_endpoint("0.0.0.0", 0, "PG"), + dirty_info(false), dirty_big_info(false), + info(p), + info_struct_v(0), + pg_log(cct), + pgmeta_oid(p.make_pgmeta_oid()), + missing_loc(this), + stat_queue_item(this), + scrub_queued(false), + recovery_queued(false), + recovery_ops_active(0), + role(-1), + state(0), + send_notify(false), + pg_whoami(osd->whoami, p.shard), + need_up_thru(false), + last_peering_reset(0), + heartbeat_peer_lock("PG::heartbeat_peer_lock"), + backfill_reserved(false), + backfill_reserving(false), + flushes_in_progress(0), + pg_stats_publish_lock("PG::pg_stats_publish_lock"), + pg_stats_publish_valid(false), + finish_sync_event(NULL), + backoff_lock("PG::backoff_lock"), + scrub_after_recovery(false), + save_req_scrub(false), + active_pushes(0), + recovery_state(this), + peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT), + acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT), + upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT), + last_epoch(0), + last_require_osd_release(curmap->require_osd_release) +{ +#ifdef PG_DEBUG_REFS + osd->add_pgid(p, this); +#endif +#ifdef WITH_BLKIN + std::stringstream ss; + ss << "PG " << info.pgid; + trace_endpoint.copy_name(ss.str()); +#endif +} + +PG::~PG() +{ + pgstate_history.set_pg_in_destructor(); +#ifdef PG_DEBUG_REFS + osd->remove_pgid(info.pgid, this); +#endif +} + +void PG::lock(bool no_lockdep) const +{ + _lock.Lock(no_lockdep); + // if we have unrecorded dirty state with the lock dropped, there is a bug + ceph_assert(!dirty_info); + ceph_assert(!dirty_big_info); + + dout(30) << "lock" << dendl; +} + +std::ostream& PG::gen_prefix(std::ostream& out) const +{ + OSDMapRef mapref = osdmap_ref; + if (_lock.is_locked_by_me()) { + out << "osd." << osd->whoami + << " pg_epoch: " << (mapref ? mapref->get_epoch():0) + << " " << *this << " "; + } else { + out << "osd." << osd->whoami + << " pg_epoch: " << (mapref ? mapref->get_epoch():0) + << " pg[" << info.pgid << "(unlocked)] "; + } + return out; +} + +/********* PG **********/ + +void PG::proc_master_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, + pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from) +{ + dout(10) << "proc_master_log for osd." << from << ": " + << olog << " " << omissing << dendl; + ceph_assert(!is_peered() && is_primary()); + + // merge log into our own log to build master log. 
no need to + // make any adjustments to their missing map; we are taking their + // log to be authoritative (i.e., their entries are by definitely + // non-divergent). + merge_log(t, oinfo, olog, from); + peer_info[from] = oinfo; + dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl; + might_have_unfound.insert(from); + + // See doc/dev/osd_internals/last_epoch_started + if (oinfo.last_epoch_started > info.last_epoch_started) { + info.last_epoch_started = oinfo.last_epoch_started; + dirty_info = true; + } + if (oinfo.last_interval_started > info.last_interval_started) { + info.last_interval_started = oinfo.last_interval_started; + dirty_info = true; + } + update_history(oinfo.history); + ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les || + info.last_epoch_started >= info.history.last_epoch_started); + + peer_missing[from].claim(omissing); +} + +void PG::proc_replica_log( + pg_info_t &oinfo, + const pg_log_t &olog, + pg_missing_t& omissing, + pg_shard_t from) +{ + dout(10) << "proc_replica_log for osd." << from << ": " + << oinfo << " " << olog << " " << omissing << dendl; + + pg_log.proc_replica_log(oinfo, olog, omissing, from); + + peer_info[from] = oinfo; + dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl; + might_have_unfound.insert(from); + + for (map<hobject_t, pg_missing_item>::const_iterator i = + omissing.get_items().begin(); + i != omissing.get_items().end(); + ++i) { + dout(20) << " after missing " << i->first << " need " << i->second.need + << " have " << i->second.have << dendl; + } + peer_missing[from].claim(omissing); +} + +bool PG::proc_replica_info( + pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch) +{ + map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from); + if (p != peer_info.end() && p->second.last_update == oinfo.last_update) { + dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl; + return false; + } + + if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) { + dout(10) << " got info " << oinfo << " from down osd." << from + << " discarding" << dendl; + return false; + } + + dout(10) << " got osd." << from << " " << oinfo << dendl; + ceph_assert(is_primary()); + peer_info[from] = oinfo; + might_have_unfound.insert(from); + + update_history(oinfo.history); + + // stray? + if (!is_up(from) && !is_acting(from)) { + dout(10) << " osd." << from << " has stray content: " << oinfo << dendl; + stray_set.insert(from); + if (is_clean()) { + purge_strays(); + } + } + + // was this a new info? if so, update peers! 
+ if (p == peer_info.end()) + update_heartbeat_peers(); + + return true; +} + +void PG::remove_snap_mapped_object( + ObjectStore::Transaction &t, const hobject_t &soid) +{ + t.remove( + coll, + ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard)); + clear_object_snap_mapping(&t, soid); +} + +void PG::clear_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid) +{ + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + if (soid.snap < CEPH_MAXSNAP) { + int r = snap_mapper.remove_oid( + soid, + &_t); + if (!(r == 0 || r == -ENOENT)) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + } +} + +void PG::update_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps) +{ + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + ceph_assert(soid.snap < CEPH_MAXSNAP); + int r = snap_mapper.remove_oid( + soid, + &_t); + if (!(r == 0 || r == -ENOENT)) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + snap_mapper.add_oid( + soid, + snaps, + &_t); +} + +void PG::merge_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from) +{ + PGLogEntryHandler rollbacker{this, &t}; + pg_log.merge_log( + oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info); +} + +void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead) +{ + PGLogEntryHandler rollbacker{this, &t}; + pg_log.rewind_divergent_log( + newhead, info, &rollbacker, dirty_info, dirty_big_info); +} + +/* + * Process information from a replica to determine if it could have any + * objects that i need. + * + * TODO: if the missing set becomes very large, this could get expensive. + * Instead, we probably want to just iterate over our unfound set. + */ +bool PG::search_for_missing( + const pg_info_t &oinfo, const pg_missing_t &omissing, + pg_shard_t from, + RecoveryCtx *ctx) +{ + uint64_t num_unfound_before = missing_loc.num_unfound(); + bool found_missing = missing_loc.add_source_info( + from, oinfo, omissing, ctx->handle); + if (found_missing && num_unfound_before != missing_loc.num_unfound()) + publish_stats_to_osd(); + // avoid doing this if the peer is empty. This is abit of paranoia + // to avoid doing something rash if add_source_info() above + // incorrectly decided we found something new. (if the peer has + // last_update=0'0 that's impossible.) 
+ if (found_missing && + oinfo.last_update != eversion_t()) { + pg_info_t tinfo(oinfo); + tinfo.pgid.shard = pg_whoami.shard; + (*(ctx->info_map))[from.osd].push_back( + make_pair( + pg_notify_t( + from.shard, pg_whoami.shard, + get_osdmap_epoch(), + get_osdmap_epoch(), + tinfo), + past_intervals)); + } + return found_missing; +} + + +// MissingLoc + +bool PG::MissingLoc::readable_with_acting( + const hobject_t &hoid, + const set<pg_shard_t> &acting) const { + if (!needs_recovery(hoid)) + return true; + if (is_deleted(hoid)) + return false; + auto missing_loc_entry = missing_loc.find(hoid); + if (missing_loc_entry == missing_loc.end()) + return false; + const set<pg_shard_t> &locs = missing_loc_entry->second; + ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl; + set<pg_shard_t> have_acting; + for (set<pg_shard_t>::const_iterator i = locs.begin(); + i != locs.end(); + ++i) { + if (acting.count(*i)) + have_acting.insert(*i); + } + return (*is_readable)(have_acting); +} + +void PG::MissingLoc::add_batch_sources_info( + const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle) +{ + ldout(pg->cct, 10) << __func__ << ": adding sources in batch " + << sources.size() << dendl; + unsigned loop = 0; + bool sources_updated = false; + for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin(); + i != needs_recovery_map.end(); + ++i) { + if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) { + handle->reset_tp_timeout(); + loop = 0; + } + if (i->second.is_delete()) + continue; + + auto p = missing_loc.find(i->first); + if (p == missing_loc.end()) { + p = missing_loc.emplace(i->first, set<pg_shard_t>()).first; + } else { + _dec_count(p->second); + } + missing_loc[i->first].insert(sources.begin(), sources.end()); + _inc_count(p->second); + + if (!sources_updated) { + missing_loc_sources.insert(sources.begin(), sources.end()); + sources_updated = true; + } + } +} + +bool PG::MissingLoc::add_source_info( + pg_shard_t fromosd, + const pg_info_t &oinfo, + const pg_missing_t &omissing, + ThreadPool::TPHandle* handle) +{ + bool found_missing = false; + unsigned loop = 0; + bool sources_updated = false; + // found items? + for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin(); + p != needs_recovery_map.end(); + ++p) { + const hobject_t &soid(p->first); + eversion_t need = p->second.need; + if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) { + handle->reset_tp_timeout(); + loop = 0; + } + if (p->second.is_delete()) { + ldout(pg->cct, 10) << __func__ << " " << soid + << " delete, ignoring source" << dendl; + continue; + } + if (oinfo.last_update < need) { + ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." << fromosd + << " (last_update " << oinfo.last_update + << " < needed " << need << ")" << dendl; + continue; + } + if (!oinfo.last_backfill.is_max() && + !oinfo.last_backfill_bitwise) { + ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." << fromosd + << " (last_backfill " << oinfo.last_backfill + << " but with wrong sort order)" + << dendl; + continue; + } + if (p->first >= oinfo.last_backfill) { + // FIXME: this is _probably_ true, although it could conceivably + // be in the undefined region! Hmm! + ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." 
<< fromosd + << " (past last_backfill " << oinfo.last_backfill + << ")" << dendl; + continue; + } + if (omissing.is_missing(soid)) { + ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." << fromosd << dendl; + continue; + } + + ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need + << " is on osd." << fromosd << dendl; + + { + auto p = missing_loc.find(soid); + if (p == missing_loc.end()) { + p = missing_loc.emplace(soid, set<pg_shard_t>()).first; + } else { + _dec_count(p->second); + } + p->second.insert(fromosd); + _inc_count(p->second); + } + + if (!sources_updated) { + missing_loc_sources.insert(fromosd); + sources_updated = true; + } + found_missing = true; + } + + ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map + << dendl; + return found_missing; +} + +void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap) +{ + set<pg_shard_t> now_down; + for (set<pg_shard_t>::iterator p = missing_loc_sources.begin(); + p != missing_loc_sources.end(); + ) { + if (osdmap->is_up(p->osd)) { + ++p; + continue; + } + ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl; + now_down.insert(*p); + missing_loc_sources.erase(p++); + } + + if (now_down.empty()) { + ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl; + } else { + ldout(pg->cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are " + << missing_loc_sources << dendl; + + // filter missing_loc + map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin(); + while (p != missing_loc.end()) { + set<pg_shard_t>::iterator q = p->second.begin(); + bool changed = false; + while (q != p->second.end()) { + if (now_down.count(*q)) { + if (!changed) { + changed = true; + _dec_count(p->second); + } + p->second.erase(q++); + } else { + ++q; + } + } + if (p->second.empty()) { + missing_loc.erase(p++); + } else { + if (changed) { + _inc_count(p->second); + } + ++p; + } + } + } +} + +void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map) +{ + auto &missing = pg_log.get_missing(); + uint64_t unfound = get_num_unfound(); + + dout(10) << __func__ << " " + << missing.num_missing() << " missing, " + << unfound << " unfound" + << dendl; + + std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin(); + std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end(); + for (; m != mend; ++m) { + pg_shard_t peer(*m); + + if (!get_osdmap()->is_up(peer.osd)) { + dout(20) << __func__ << " skipping down osd." << peer << dendl; + continue; + } + + if (peer_purged.count(peer)) { + dout(20) << __func__ << " skipping purged osd." << peer << dendl; + continue; + } + + map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer); + if (iter != peer_info.end() && + (iter->second.is_empty() || iter->second.dne())) { + // ignore empty peers + continue; + } + + // If we've requested any of this stuff, the pg_missing_t information + // should be on its way. + // TODO: coalsce requested_* into a single data structure + if (peer_missing.find(peer) != peer_missing.end()) { + dout(20) << __func__ << ": osd." << peer + << ": we already have pg_missing_t" << dendl; + continue; + } + if (peer_log_requested.find(peer) != peer_log_requested.end()) { + dout(20) << __func__ << ": osd." 
<< peer + << ": in peer_log_requested" << dendl; + continue; + } + if (peer_missing_requested.find(peer) != peer_missing_requested.end()) { + dout(20) << __func__ << ": osd." << peer + << ": in peer_missing_requested" << dendl; + continue; + } + + // Request missing + dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t" + << dendl; + peer_missing_requested.insert(peer); + query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] = + pg_query_t( + pg_query_t::FULLLOG, + peer.shard, pg_whoami.shard, + info.history, get_osdmap_epoch()); + } +} + +/******* PG ***********/ +bool PG::needs_recovery() const +{ + ceph_assert(is_primary()); + + auto &missing = pg_log.get_missing(); + + if (missing.num_missing()) { + dout(10) << __func__ << " primary has " << missing.num_missing() + << " missing" << dendl; + return true; + } + + ceph_assert(!acting_recovery_backfill.empty()); + set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end(); + set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin(); + for (; a != end; ++a) { + if (*a == get_primary()) continue; + pg_shard_t peer = *a; + map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer); + if (pm == peer_missing.end()) { + dout(10) << __func__ << " osd." << peer << " doesn't have missing set" + << dendl; + continue; + } + if (pm->second.num_missing()) { + dout(10) << __func__ << " osd." << peer << " has " + << pm->second.num_missing() << " missing" << dendl; + return true; + } + } + + dout(10) << __func__ << " is recovered" << dendl; + return false; +} + +bool PG::needs_backfill() const +{ + ceph_assert(is_primary()); + + // We can assume that only possible osds that need backfill + // are on the backfill_targets vector nodes. + set<pg_shard_t>::const_iterator end = backfill_targets.end(); + set<pg_shard_t>::const_iterator a = backfill_targets.begin(); + for (; a != end; ++a) { + pg_shard_t peer = *a; + map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer); + if (!pi->second.last_backfill.is_max()) { + dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl; + return true; + } + } + + dout(10) << __func__ << " does not need backfill" << dendl; + return false; +} + + +void PG::check_past_interval_bounds() const +{ + auto oldest_epoch = osd->get_superblock().oldest_map; + auto rpib = get_required_past_interval_bounds( + info, + oldest_epoch); + if (rpib.first >= rpib.second) { + // do not warn if the start bound is dictated by oldest_map; the + // past intervals are presumably appropriate given the pg info. 
+ if (!past_intervals.empty() && + rpib.first > oldest_epoch) { + osd->clog->error() << info.pgid << " required past_interval bounds are" + << " empty [" << rpib << ") but past_intervals is not: " + << past_intervals; + derr << info.pgid << " required past_interval bounds are" + << " empty [" << rpib << ") but past_intervals is not: " + << past_intervals << dendl; + } + } else { + if (past_intervals.empty()) { + osd->clog->error() << info.pgid << " required past_interval bounds are" + << " not empty [" << rpib << ") but past_intervals " + << past_intervals << " is empty"; + derr << info.pgid << " required past_interval bounds are" + << " not empty [" << rpib << ") but past_intervals " + << past_intervals << " is empty" << dendl; + ceph_assert(!past_intervals.empty()); + } + + auto apib = past_intervals.get_bounds(); + if (apib.first > rpib.first) { + osd->clog->error() << info.pgid << " past_intervals [" << apib + << ") start interval does not contain the required" + << " bound [" << rpib << ") start"; + derr << info.pgid << " past_intervals [" << apib + << ") start interval does not contain the required" + << " bound [" << rpib << ") start" << dendl; + ceph_abort_msg("past_interval start interval mismatch"); + } + if (apib.second != rpib.second) { + osd->clog->error() << info.pgid << " past_interal bound [" << apib + << ") end does not match required [" << rpib + << ") end"; + derr << info.pgid << " past_interal bound [" << apib + << ") end does not match required [" << rpib + << ") end" << dendl; + ceph_abort_msg("past_interval end mismatch"); + } + } +} + +bool PG::adjust_need_up_thru(const OSDMapRef osdmap) +{ + epoch_t up_thru = osdmap->get_up_thru(osd->whoami); + if (need_up_thru && + up_thru >= info.history.same_interval_since) { + dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl; + need_up_thru = false; + return true; + } + return false; +} + +void PG::remove_down_peer_info(const OSDMapRef osdmap) +{ + // Remove any downed osds from peer_info + bool removed = false; + map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin(); + while (p != peer_info.end()) { + if (!osdmap->is_up(p->first.osd)) { + dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl; + peer_missing.erase(p->first); + peer_log_requested.erase(p->first); + peer_missing_requested.erase(p->first); + peer_purged.erase(p->first); // so we can re-purge if necessary + peer_info.erase(p++); + removed = true; + } else + ++p; + } + + // if we removed anyone, update peers (which include peer_info) + if (removed) + update_heartbeat_peers(); + check_recovery_sources(osdmap); +} + +/* + * Returns true unless there is a non-lost OSD in might_have_unfound. + */ +bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const +{ + ceph_assert(is_primary()); + + set<pg_shard_t>::const_iterator peer = might_have_unfound.begin(); + set<pg_shard_t>::const_iterator mend = might_have_unfound.end(); + for (; peer != mend; ++peer) { + if (peer_missing.count(*peer)) + continue; + map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer); + if (iter != peer_info.end() && + (iter->second.is_empty() || iter->second.dne())) + continue; + if (!osdmap->exists(peer->osd)) + continue; + const osd_info_t &osd_info(osdmap->get_info(peer->osd)); + if (osd_info.lost_at <= osd_info.up_from) { + // If there is even one OSD in might_have_unfound that isn't lost, we + // still might retrieve our unfound. 
+ return false; + } + } + dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound + << " have been queried or are marked lost" << dendl; + return true; +} + +PastIntervals::PriorSet PG::build_prior() +{ + if (1) { + // sanity check + for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin(); + it != peer_info.end(); + ++it) { + ceph_assert(info.history.last_epoch_started >= it->second.history.last_epoch_started); + } + } + + const OSDMap &osdmap = *get_osdmap(); + PastIntervals::PriorSet prior = past_intervals.get_prior_set( + pool.info.is_erasure(), + info.history.last_epoch_started, + get_pgbackend()->get_is_recoverable_predicate(), + [&](epoch_t start, int osd, epoch_t *lost_at) { + const osd_info_t *pinfo = 0; + if (osdmap.exists(osd)) { + pinfo = &osdmap.get_info(osd); + if (lost_at) + *lost_at = pinfo->lost_at; + } + + if (osdmap.is_up(osd)) { + return PastIntervals::UP; + } else if (!pinfo) { + return PastIntervals::DNE; + } else if (pinfo->lost_at > start) { + return PastIntervals::LOST; + } else { + return PastIntervals::DOWN; + } + }, + up, + acting, + this); + + if (prior.pg_down) { + state_set(PG_STATE_DOWN); + } + + if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) { + dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami) + << " < same_since " << info.history.same_interval_since + << ", must notify monitor" << dendl; + need_up_thru = true; + } else { + dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami) + << " >= same_since " << info.history.same_interval_since + << ", all is well" << dendl; + need_up_thru = false; + } + set_probe_targets(prior.probe); + return prior; +} + +void PG::clear_primary_state() +{ + dout(10) << "clear_primary_state" << dendl; + + // clear peering state + stray_set.clear(); + peer_log_requested.clear(); + peer_missing_requested.clear(); + peer_info.clear(); + peer_bytes.clear(); + peer_missing.clear(); + need_up_thru = false; + peer_last_complete_ondisk.clear(); + peer_activated.clear(); + min_last_complete_ondisk = eversion_t(); + pg_trim_to = eversion_t(); + might_have_unfound.clear(); + projected_log = PGLog::IndexedLog(); + + last_update_ondisk = eversion_t(); + + snap_trimq.clear(); + + finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread + + missing_loc.clear(); + + release_pg_backoffs(); + + pg_log.reset_recovery_pointers(); + + scrubber.reserved_peers.clear(); + scrub_after_recovery = false; + save_req_scrub = false; + + agent_clear(); +} + +PG::Scrubber::Scrubber() + : local_reserved(false), remote_reserved(false), reserve_failed(false), + epoch_start(0), + active(false), + shallow_errors(0), deep_errors(0), fixed(0), + must_scrub(false), must_deep_scrub(false), must_repair(false), + need_auto(false), req_scrub(false), time_for_deep(false), + auto_repair(false), + check_repair(false), + deep_scrub_on_error(false), + num_digest_updates_pending(0), + state(INACTIVE), + deep(false) +{} + +PG::Scrubber::~Scrubber() {} + +/** + * find_best_info + * + * Returns an iterator to the best info in infos sorted by: + * 1) Prefer newer last_update + * 2) Prefer longer tail if it brings another info into contiguity + * 3) Prefer current primary + */ +map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info( + const map<pg_shard_t, pg_info_t> &infos, + bool restrict_to_up_acting, + bool *history_les_bound) const +{ + ceph_assert(history_les_bound); + /* See doc/dev/osd_internals/last_epoch_started.rst before attempting + * to 
make changes to this process. Also, make sure to update it + * when you find bugs! */ + eversion_t min_last_update_acceptable = eversion_t::max(); + epoch_t max_last_epoch_started_found = 0; + for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin(); + i != infos.end(); + ++i) { + if (!cct->_conf->osd_find_best_info_ignore_history_les && + max_last_epoch_started_found < i->second.history.last_epoch_started) { + *history_les_bound = true; + max_last_epoch_started_found = i->second.history.last_epoch_started; + } + if (!i->second.is_incomplete() && + max_last_epoch_started_found < i->second.last_epoch_started) { + *history_les_bound = false; + max_last_epoch_started_found = i->second.last_epoch_started; + } + } + for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin(); + i != infos.end(); + ++i) { + if (max_last_epoch_started_found <= i->second.last_epoch_started) { + if (min_last_update_acceptable > i->second.last_update) + min_last_update_acceptable = i->second.last_update; + } + } + if (min_last_update_acceptable == eversion_t::max()) + return infos.end(); + + map<pg_shard_t, pg_info_t>::const_iterator best = infos.end(); + // find osd with newest last_update (oldest for ec_pool). + // if there are multiples, prefer + // - a longer tail, if it brings another peer into log contiguity + // - the current primary + for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin(); + p != infos.end(); + ++p) { + if (restrict_to_up_acting && !is_up(p->first) && + !is_acting(p->first)) + continue; + // Only consider peers with last_update >= min_last_update_acceptable + if (p->second.last_update < min_last_update_acceptable) + continue; + // Disqualify anyone with a too old last_epoch_started + if (p->second.last_epoch_started < max_last_epoch_started_found) + continue; + // Disqualify anyone who is incomplete (not fully backfilled) + if (p->second.is_incomplete()) + continue; + if (best == infos.end()) { + best = p; + continue; + } + // Prefer newer last_update + if (pool.info.require_rollback()) { + if (p->second.last_update > best->second.last_update) + continue; + if (p->second.last_update < best->second.last_update) { + best = p; + continue; + } + } else { + if (p->second.last_update < best->second.last_update) + continue; + if (p->second.last_update > best->second.last_update) { + best = p; + continue; + } + } + + // Prefer longer tail + if (p->second.log_tail > best->second.log_tail) { + continue; + } else if (p->second.log_tail < best->second.log_tail) { + best = p; + continue; + } + + if (!p->second.has_missing() && best->second.has_missing()) { + dout(10) << __func__ << " prefer osd." << p->first + << " because it is complete while best has missing" + << dendl; + best = p; + continue; + } else if (p->second.has_missing() && !best->second.has_missing()) { + dout(10) << __func__ << " skipping osd." << p->first + << " because it has missing while best is complete" + << dendl; + continue; + } else { + // both are complete or have missing + // fall through + } + + // prefer current primary (usually the caller), all things being equal + if (p->first == pg_whoami) { + dout(10) << "calc_acting prefer osd." 
<< p->first + << " because it is current primary" << dendl; + best = p; + continue; + } + } + return best; +} + +void PG::calc_ec_acting( + map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, + unsigned size, + const vector<int> &acting, + const vector<int> &up, + const map<pg_shard_t, pg_info_t> &all_info, + bool restrict_to_up_acting, + vector<int> *_want, + set<pg_shard_t> *backfill, + set<pg_shard_t> *acting_backfill, + ostream &ss) +{ + vector<int> want(size, CRUSH_ITEM_NONE); + map<shard_id_t, set<pg_shard_t> > all_info_by_shard; + for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin(); + i != all_info.end(); + ++i) { + all_info_by_shard[i->first.shard].insert(i->first); + } + for (uint8_t i = 0; i < want.size(); ++i) { + ss << "For position " << (unsigned)i << ": "; + if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE && + !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() && + all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >= + auth_log_shard->second.log_tail) { + ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl; + want[i] = up[i]; + continue; + } + if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) { + ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i)) + << " and "; + backfill->insert(pg_shard_t(up[i], shard_id_t(i))); + } + + if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE && + !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() && + all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >= + auth_log_shard->second.log_tail) { + ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl; + want[i] = acting[i]; + } else if (!restrict_to_up_acting) { + for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin(); + j != all_info_by_shard[shard_id_t(i)].end(); + ++j) { + ceph_assert(j->shard == i); + if (!all_info.find(*j)->second.is_incomplete() && + all_info.find(*j)->second.last_update >= + auth_log_shard->second.log_tail) { + ss << " selecting stray: " << *j << std::endl; + want[i] = j->osd; + break; + } + } + if (want[i] == CRUSH_ITEM_NONE) + ss << " failed to fill position " << (int)i << std::endl; + } + } + + for (uint8_t i = 0; i < want.size(); ++i) { + if (want[i] != CRUSH_ITEM_NONE) { + acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i))); + } + } + acting_backfill->insert(backfill->begin(), backfill->end()); + _want->swap(want); +} + +/** + * calculate the desired acting set. + * + * Choose an appropriate acting set. Prefer up[0], unless it is + * incomplete, or another osd has a longer tail that allows us to + * bring other up nodes up to date. + */ +void PG::calc_replicated_acting( + map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, + uint64_t force_auth_primary_missing_objects, + unsigned size, + const vector<int> &acting, + const vector<int> &up, + pg_shard_t up_primary, + const map<pg_shard_t, pg_info_t> &all_info, + bool restrict_to_up_acting, + vector<int> *want, + set<pg_shard_t> *backfill, + set<pg_shard_t> *acting_backfill, + const OSDMapRef osdmap, + ostream &ss) +{ + pg_shard_t auth_log_shard_id = auth_log_shard->first; + + ss << __func__ << " newest update on osd." << auth_log_shard_id + << " with " << auth_log_shard->second + << (restrict_to_up_acting ? 
" restrict_to_up_acting" : "") << std::endl; + + // select primary + auto primary = all_info.find(up_primary); + if (up.size() && + !primary->second.is_incomplete() && + primary->second.last_update >= + auth_log_shard->second.log_tail) { + if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { + auto approx_missing_objects = + primary->second.stats.stats.sum.num_objects_missing; + auto auth_version = auth_log_shard->second.last_update.version; + auto primary_version = primary->second.last_update.version; + if (auth_version > primary_version) { + approx_missing_objects += auth_version - primary_version; + } else { + approx_missing_objects += primary_version - auth_version; + } + if ((uint64_t)approx_missing_objects > + force_auth_primary_missing_objects) { + primary = auth_log_shard; + ss << "up_primary: " << up_primary << ") has approximate " + << approx_missing_objects + << "(>" << force_auth_primary_missing_objects <<") " + << "missing objects, osd." << auth_log_shard_id + << " selected as primary instead" + << std::endl; + } else { + ss << "up_primary: " << up_primary << ") selected as primary" + << std::endl; + } + } else { + ss << "up_primary: " << up_primary << ") selected as primary" << std::endl; + } + } else { + ceph_assert(!auth_log_shard->second.is_incomplete()); + ss << "up[0] needs backfill, osd." << auth_log_shard_id + << " selected as primary instead" << std::endl; + primary = auth_log_shard; + } + + ss << __func__ << " primary is osd." << primary->first + << " with " << primary->second << std::endl; + want->push_back(primary->first.osd); + acting_backfill->insert(primary->first); + + /* We include auth_log_shard->second.log_tail because in GetLog, + * we will request logs back to the min last_update over our + * acting_backfill set, which will result in our log being extended + * as far backwards as necessary to pick up any peers which can + * be log recovered by auth_log_shard's log */ + eversion_t oldest_auth_log_entry = + std::min(primary->second.log_tail, auth_log_shard->second.log_tail); + + // select replicas that have log contiguity with primary. + // prefer up, then acting, then any peer_info osds + for (auto i : up) { + pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD); + if (up_cand == primary->first) + continue; + const pg_info_t &cur_info = all_info.find(up_cand)->second; + if (cur_info.is_incomplete() || + cur_info.last_update < oldest_auth_log_entry) { + ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl; + backfill->insert(up_cand); + acting_backfill->insert(up_cand); + } else { + want->push_back(i); + acting_backfill->insert(up_cand); + ss << " osd." << i << " (up) accepted " << cur_info << std::endl; + } + } + + if (want->size() >= size) { + return; + } + + std::vector<std::pair<eversion_t, int>> candidate_by_last_update; + candidate_by_last_update.reserve(acting.size()); + // This no longer has backfill OSDs, but they are covered above. 
+ for (auto i : acting) { + pg_shard_t acting_cand(i, shard_id_t::NO_SHARD); + // skip up osds we already considered above + if (acting_cand == primary->first) + continue; + vector<int>::const_iterator up_it = find(up.begin(), up.end(), i); + if (up_it != up.end()) + continue; + + const pg_info_t &cur_info = all_info.find(acting_cand)->second; + if (cur_info.is_incomplete() || + cur_info.last_update < oldest_auth_log_entry) { + ss << " shard " << acting_cand << " (acting) REJECTED " + << cur_info << std::endl; + } else { + candidate_by_last_update.push_back(make_pair(cur_info.last_update, i)); + } + } + + auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs, + const std::pair<eversion_t, int> &rhs) { + return lhs.first > rhs.first; + }; + // sort by last_update, in descending order. + std::sort(candidate_by_last_update.begin(), + candidate_by_last_update.end(), sort_by_eversion); + for (auto &p: candidate_by_last_update) { + ceph_assert(want->size() < size); + want->push_back(p.second); + pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD); + acting_backfill->insert(s); + ss << " shard " << s << " (acting) accepted " + << all_info.find(s)->second << std::endl; + if (want->size() >= size) { + return; + } + } + + if (restrict_to_up_acting) { + return; + } + candidate_by_last_update.clear(); + candidate_by_last_update.reserve(all_info.size()); // overestimate but fine + // continue to search stray to find more suitable peers + for (auto &i : all_info) { + // skip up osds we already considered above + if (i.first == primary->first) + continue; + vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd); + if (up_it != up.end()) + continue; + vector<int>::const_iterator acting_it = find( + acting.begin(), acting.end(), i.first.osd); + if (acting_it != acting.end()) + continue; + + if (i.second.is_incomplete() || + i.second.last_update < oldest_auth_log_entry) { + ss << " shard " << i.first << " (stray) REJECTED " << i.second + << std::endl; + } else { + candidate_by_last_update.push_back( + make_pair(i.second.last_update, i.first.osd)); + } + } + + if (candidate_by_last_update.empty()) { + // save us some effort + return; + } + + // sort by last_update, in descending order. + std::sort(candidate_by_last_update.begin(), + candidate_by_last_update.end(), sort_by_eversion); + + for (auto &p: candidate_by_last_update) { + ceph_assert(want->size() < size); + want->push_back(p.second); + pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD); + acting_backfill->insert(s); + ss << " shard " << s << " (stray) accepted " + << all_info.find(s)->second << std::endl; + if (want->size() >= size) { + return; + } + } +} + +bool PG::recoverable_and_ge_min_size(const vector<int> &want) const +{ + unsigned num_want_acting = 0; + set<pg_shard_t> have; + for (int i = 0; i < (int)want.size(); ++i) { + if (want[i] != CRUSH_ITEM_NONE) { + ++num_want_acting; + have.insert( + pg_shard_t( + want[i], + pool.info.is_erasure() ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + } + // We go incomplete if below min_size for ec_pools since backfill + // does not currently maintain rollbackability + // Otherwise, we will go "peered", but not "active" + if (num_want_acting < pool.info.min_size && + (pool.info.is_erasure() || + !cct->_conf->osd_allow_recovery_below_min_size)) { + dout(10) << __func__ << " failed, below min size" << dendl; + return false; + } + + /* Check whether we have enough acting shards to later perform recovery */ + boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate( + get_pgbackend()->get_is_recoverable_predicate()); + if (!(*recoverable_predicate)(have)) { + dout(10) << __func__ << " failed, not recoverable" << dendl; + return false; + } + + return true; +} + +void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info, + const pg_info_t &auth_info, + vector<int> *want, + set<pg_shard_t> *async_recovery, + const OSDMapRef osdmap) const +{ + set<pair<int, pg_shard_t> > candidates_by_cost; + for (uint8_t i = 0; i < want->size(); ++i) { + if ((*want)[i] == CRUSH_ITEM_NONE) + continue; + + // Considering log entries to recover is accurate enough for + // now. We could use minimum_to_decode_with_cost() later if + // necessary. + pg_shard_t shard_i((*want)[i], shard_id_t(i)); + // do not include strays + if (stray_set.find(shard_i) != stray_set.end()) + continue; + // Do not include an osd that is not up, since choosing it as + // an async_recovery_target will move it out of the acting set. + // This results in it being identified as a stray during peering, + // because it is no longer in the up or acting set. + if (!is_up(shard_i)) + continue; + auto shard_info = all_info.find(shard_i)->second; + // for ec pools we rollback all entries past the authoritative + // last_update *before* activation. This is relatively inexpensive + // compared to recovery, since it is purely local, so treat shards + // past the authoritative last_update the same as those equal to it. 
+ version_t auth_version = auth_info.last_update.version; + version_t candidate_version = shard_info.last_update.version; + if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { + auto approx_missing_objects = + shard_info.stats.stats.sum.num_objects_missing; + if (auth_version > candidate_version) { + approx_missing_objects += auth_version - candidate_version; + } + if (static_cast<uint64_t>(approx_missing_objects) > + cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { + candidates_by_cost.emplace(approx_missing_objects, shard_i); + } + } else { + if (auth_version > candidate_version && + (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { + candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i)); + } + } + } + + dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost + << dendl; + + // take out as many osds as we can for async recovery, in order of cost + for (auto rit = candidates_by_cost.rbegin(); + rit != candidates_by_cost.rend(); ++rit) { + pg_shard_t cur_shard = rit->second; + vector<int> candidate_want(*want); + candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE; + if (recoverable_and_ge_min_size(candidate_want)) { + want->swap(candidate_want); + async_recovery->insert(cur_shard); + } + } + dout(20) << __func__ << " result want=" << *want + << " async_recovery=" << *async_recovery << dendl; +} + +void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info, + const pg_info_t &auth_info, + vector<int> *want, + set<pg_shard_t> *async_recovery, + const OSDMapRef osdmap) const +{ + set<pair<int, pg_shard_t> > candidates_by_cost; + for (auto osd_num : *want) { + pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD); + // do not include strays + if (stray_set.find(shard_i) != stray_set.end()) + continue; + // Do not include an osd that is not up, since choosing it as + // an async_recovery_target will move it out of the acting set. + // This results in it being identified as a stray during peering, + // because it is no longer in the up or acting set. 
+ if (!is_up(shard_i)) + continue; + auto shard_info = all_info.find(shard_i)->second; + // use the approximate magnitude of the difference in length of + // logs plus historical missing objects as the cost of recovery + version_t auth_version = auth_info.last_update.version; + version_t candidate_version = shard_info.last_update.version; + if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { + auto approx_missing_objects = + shard_info.stats.stats.sum.num_objects_missing; + if (auth_version > candidate_version) { + approx_missing_objects += auth_version - candidate_version; + } else { + approx_missing_objects += candidate_version - auth_version; + } + if (static_cast<uint64_t>(approx_missing_objects) > + cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { + candidates_by_cost.emplace(approx_missing_objects, shard_i); + } + } else { + size_t approx_entries; + if (auth_version > candidate_version) { + approx_entries = auth_version - candidate_version; + } else { + approx_entries = candidate_version - auth_version; + } + if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { + candidates_by_cost.insert(make_pair(approx_entries, shard_i)); + } + } + } + + dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost + << dendl; + // take out as many osds as we can for async recovery, in order of cost + for (auto rit = candidates_by_cost.rbegin(); + rit != candidates_by_cost.rend(); ++rit) { + if (want->size() <= pool.info.min_size) { + break; + } + pg_shard_t cur_shard = rit->second; + vector<int> candidate_want(*want); + for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) { + if (*it == cur_shard.osd) { + candidate_want.erase(it); + want->swap(candidate_want); + async_recovery->insert(cur_shard); + break; + } + } + } + dout(20) << __func__ << " result want=" << *want + << " async_recovery=" << *async_recovery << dendl; +} + +/** + * choose acting + * + * calculate the desired acting, and request a change with the monitor + * if it differs from the current acting. + * + * if restrict_to_up_acting=true, we filter out anything that's not in + * up/acting. in order to lift this restriction, we need to + * 1) check whether it's worth switching the acting set any time we get + * a new pg info (not just here, when recovery finishes) + * 2) check whether anything in want_acting went down on each new map + * (and, if so, calculate a new want_acting) + * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap) + * TODO! + */ +bool PG::choose_acting(pg_shard_t &auth_log_shard_id, + bool restrict_to_up_acting, + bool *history_les_bound) +{ + map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end()); + all_info[pg_whoami] = info; + + if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) { + for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin(); + p != all_info.end(); + ++p) { + dout(10) << __func__ << " all_info osd." 
<< p->first << " " << p->second << dendl; + } + } + + map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard = + find_best_info(all_info, restrict_to_up_acting, history_les_bound); + + if (auth_log_shard == all_info.end()) { + if (up != acting) { + dout(10) << __func__ << " no suitable info found (incomplete backfills?)," + << " reverting to up" << dendl; + want_acting = up; + vector<int> empty; + osd->queue_want_pg_temp(info.pgid.pgid, empty); + } else { + dout(10) << __func__ << " failed" << dendl; + ceph_assert(want_acting.empty()); + } + return false; + } + + ceph_assert(!auth_log_shard->second.is_incomplete()); + auth_log_shard_id = auth_log_shard->first; + + set<pg_shard_t> want_backfill, want_acting_backfill; + vector<int> want; + stringstream ss; + if (!pool.info.is_erasure()) + calc_replicated_acting( + auth_log_shard, + cct->_conf.get_val<uint64_t>( + "osd_force_auth_primary_missing_objects"), + get_osdmap()->get_pg_size(info.pgid.pgid), + acting, + up, + up_primary, + all_info, + restrict_to_up_acting, + &want, + &want_backfill, + &want_acting_backfill, + get_osdmap(), + ss); + else + calc_ec_acting( + auth_log_shard, + get_osdmap()->get_pg_size(info.pgid.pgid), + acting, + up, + all_info, + restrict_to_up_acting, + &want, + &want_backfill, + &want_acting_backfill, + ss); + dout(10) << ss.str() << dendl; + + if (!recoverable_and_ge_min_size(want)) { + want_acting.clear(); + return false; + } + + set<pg_shard_t> want_async_recovery; + if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) { + if (pool.info.is_erasure()) { + choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap()); + } else { + choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap()); + } + } + while (want.size() > pool.info.size) { + // async recovery should have taken out as many osds as it can. + // if not, then always evict the last peer + // (will get synchronously recovered later) + dout(10) << __func__ << " evicting osd." << want.back() + << " from oversized want " << want << dendl; + want.pop_back(); + } + if (want != acting) { + dout(10) << __func__ << " want " << want << " != acting " << acting + << ", requesting pg_temp change" << dendl; + want_acting = want; + + if (!cct->_conf->osd_debug_no_acting_change) { + if (want_acting == up) { + // There can't be any pending backfill if + // want is the same as crush map up OSDs. 
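[Editor's note: a toy sketch of the pg_temp decision made in choose_acting, under the assumption that queue_want_pg_temp below is a stand-in for the real OSDService call. When the desired set differs from acting, an empty request clears any existing pg_temp (want equals the CRUSH up set), otherwise an explicit temporary mapping is requested and peering waits for the new map.]

// Sketch only: the want/acting/up decision, not the real PG interface.
#include <iostream>
#include <vector>

static void queue_want_pg_temp(const std::vector<int>& osds) {
  std::cout << "pg_temp request:";
  for (int o : osds) std::cout << " " << o;
  if (osds.empty()) std::cout << " (empty: clear any existing pg_temp)";
  std::cout << "\n";
}

// Returns true when the current acting set already matches what we want,
// i.e. peering can proceed without waiting for a new osdmap.
static bool request_acting_change(const std::vector<int>& want,
                                  const std::vector<int>& acting,
                                  const std::vector<int>& up) {
  if (want == acting)
    return true;                 // nothing to change
  if (want == up)
    queue_want_pg_temp({});      // want is just the CRUSH mapping: drop pg_temp
  else
    queue_want_pg_temp(want);    // ask for an explicit temporary mapping
  return false;                  // wait for the osdmap to reflect the change
}

int main() {
  request_acting_change({0, 1, 2}, {0, 1, 3}, {0, 1, 2});  // clears pg_temp
  request_acting_change({0, 1}, {0, 1, 3}, {0, 1, 3});     // explicit pg_temp
}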
+ ceph_assert(want_backfill.empty()); + vector<int> empty; + osd->queue_want_pg_temp(info.pgid.pgid, empty); + } else + osd->queue_want_pg_temp(info.pgid.pgid, want); + } + return false; + } + want_acting.clear(); + acting_recovery_backfill = want_acting_backfill; + dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl; + ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill); + if (backfill_targets.empty()) { + // Caller is GetInfo + backfill_targets = want_backfill; + } + // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete + ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery()); + if (async_recovery_targets.empty() || !needs_recovery()) { + async_recovery_targets = want_async_recovery; + } + // Will not change if already set because up would have had to change + // Verify that nothing in backfill is in stray_set + for (set<pg_shard_t>::iterator i = want_backfill.begin(); + i != want_backfill.end(); + ++i) { + ceph_assert(stray_set.find(*i) == stray_set.end()); + } + dout(10) << "choose_acting want=" << want << " backfill_targets=" + << want_backfill << " async_recovery_targets=" + << async_recovery_targets << dendl; + return true; +} + +/* Build the might_have_unfound set. + * + * This is used by the primary OSD during recovery. + * + * This set tracks the OSDs which might have unfound objects that the primary + * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we + * will remove the OSD from the set. + */ +void PG::build_might_have_unfound() +{ + ceph_assert(might_have_unfound.empty()); + ceph_assert(is_primary()); + + dout(10) << __func__ << dendl; + + check_past_interval_bounds(); + + might_have_unfound = past_intervals.get_might_have_unfound( + pg_whoami, + pool.info.is_erasure()); + + // include any (stray) peers + for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin(); + p != peer_info.end(); + ++p) + might_have_unfound.insert(p->first); + + dout(15) << __func__ << ": built " << might_have_unfound << dendl; +} + +void PG::activate(ObjectStore::Transaction& t, + epoch_t activation_epoch, + map<int, map<spg_t,pg_query_t> >& query_map, + map<int, + vector< + pair<pg_notify_t, + PastIntervals> > > *activator_map, + RecoveryCtx *ctx) +{ + ceph_assert(!is_peered()); + ceph_assert(scrubber.callbacks.empty()); + ceph_assert(callbacks_for_degraded_object.empty()); + + // twiddle pg state + state_clear(PG_STATE_DOWN); + + send_notify = false; + + if (is_primary()) { + // only update primary last_epoch_started if we will go active + if (acting.size() >= pool.info.min_size) { + ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les || + info.last_epoch_started <= activation_epoch); + info.last_epoch_started = activation_epoch; + info.last_interval_started = info.history.same_interval_since; + } + } else if (is_acting(pg_whoami)) { + /* update last_epoch_started on acting replica to whatever the primary sent + * unless it's smaller (could happen if we are going peered rather than + * active, see doc/dev/osd_internals/last_epoch_started.rst) */ + if (info.last_epoch_started < activation_epoch) { + info.last_epoch_started = activation_epoch; + info.last_interval_started = info.history.same_interval_since; + } + } + + auto &missing = pg_log.get_missing(); + + if (is_primary()) { + last_update_ondisk = info.last_update; + min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)! 
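[Editor's note: a small sketch of the last_epoch_started rules applied at the top of activate(), using plain integers for epochs; the helper name and values are hypothetical. The primary only advances last_epoch_started when it will actually go active (acting size at least min_size), while an acting replica adopts the epoch the primary sent unless it already has a newer one.]

// Sketch only: illustrative epoch bookkeeping, not the real PG code.
#include <algorithm>
#include <cstddef>
#include <iostream>

using epoch_t = unsigned;

static epoch_t update_les(bool primary, std::size_t acting_size,
                          std::size_t min_size, epoch_t current_les,
                          epoch_t activation_epoch) {
  if (primary) {
    // only advance if this PG will actually go active (not merely peered)
    return acting_size >= min_size ? activation_epoch : current_les;
  }
  // acting replica: take what the primary sent unless it is older
  return std::max(current_les, activation_epoch);
}

int main() {
  std::cout << update_les(true, 3, 2, 40, 45) << "\n";   // primary, active: 45
  std::cout << update_les(true, 1, 2, 40, 45) << "\n";   // primary, peered only: 40
  std::cout << update_les(false, 3, 2, 48, 45) << "\n";  // replica keeps newer: 48
}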
+ } + last_update_applied = info.last_update; + last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to(); + + need_up_thru = false; + + // write pg info, log + dirty_info = true; + dirty_big_info = true; // maybe + + // find out when we commit + t.register_on_complete( + new C_PG_ActivateCommitted( + this, + get_osdmap_epoch(), + activation_epoch)); + + if (is_primary()) { + // initialize snap_trimq + if (get_osdmap()->require_osd_release < CEPH_RELEASE_MIMIC) { + dout(20) << "activate - purged_snaps " << info.purged_snaps + << " cached_removed_snaps " << pool.cached_removed_snaps + << dendl; + snap_trimq = pool.cached_removed_snaps; + } else { + auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue(); + auto p = removed_snaps_queue.find(info.pgid.pgid.pool()); + snap_trimq.clear(); + if (p != removed_snaps_queue.end()) { + dout(20) << "activate - purged_snaps " << info.purged_snaps + << " removed_snaps " << p->second + << dendl; + for (auto q : p->second) { + snap_trimq.insert(q.first, q.second); + } + } + } + interval_set<snapid_t> purged; + purged.intersection_of(snap_trimq, info.purged_snaps); + snap_trimq.subtract(purged); + + if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) { + // adjust purged_snaps: PG may have been inactive while snaps were pruned + // from the removed_snaps_queue in the osdmap. update local purged_snaps + // reflect only those snaps that we thought were pruned and were still in + // the queue. + info.purged_snaps.swap(purged); + } + } + + // init complete pointer + if (missing.num_missing() == 0) { + dout(10) << "activate - no missing, moving last_complete " << info.last_complete + << " -> " << info.last_update << dendl; + info.last_complete = info.last_update; + info.stats.stats.sum.num_objects_missing = 0; + pg_log.reset_recovery_pointers(); + } else { + dout(10) << "activate - not complete, " << missing << dendl; + info.stats.stats.sum.num_objects_missing = missing.num_missing(); + pg_log.activate_not_complete(info); + } + + log_weirdness(); + + // if primary.. + if (is_primary()) { + ceph_assert(ctx); + // start up replicas + + ceph_assert(!acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == pg_whoami) continue; + pg_shard_t peer = *i; + ceph_assert(peer_info.count(peer)); + pg_info_t& pi = peer_info[peer]; + + dout(10) << "activate peer osd." << peer << " " << pi << dendl; + + MOSDPGLog *m = 0; + ceph_assert(peer_missing.count(peer)); + pg_missing_t& pm = peer_missing[peer]; + + bool needs_past_intervals = pi.dne(); + + /* + * cover case where peer sort order was different and + * last_backfill cannot be interpreted + */ + bool force_restart_backfill = + !pi.last_backfill.is_max() && + !pi.last_backfill_bitwise; + + if (pi.last_update == info.last_update && !force_restart_backfill) { + // empty log + if (!pi.last_backfill.is_max()) + osd->clog->info() << info.pgid << " continuing backfill to osd." + << peer + << " from (" << pi.log_tail << "," << pi.last_update + << "] " << pi.last_backfill + << " to " << info.last_update; + if (!pi.is_empty() && activator_map) { + dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl; + (*activator_map)[peer.osd].push_back( + make_pair( + pg_notify_t( + peer.shard, pg_whoami.shard, + get_osdmap_epoch(), + get_osdmap_epoch(), + info), + past_intervals)); + } else { + dout(10) << "activate peer osd." 
<< peer << " is up to date, but sending pg_log anyway" << dendl; + m = new MOSDPGLog( + i->shard, pg_whoami.shard, + get_osdmap_epoch(), info, + last_peering_reset); + } + } else if ( + pg_log.get_tail() > pi.last_update || + pi.last_backfill == hobject_t() || + force_restart_backfill || + (backfill_targets.count(*i) && pi.last_backfill.is_max())) { + /* ^ This last case covers a situation where a replica is not contiguous + * with the auth_log, but is contiguous with this replica. Reshuffling + * the active set to handle this would be tricky, so instead we just go + * ahead and backfill it anyway. This is probably preferrable in any + * case since the replica in question would have to be significantly + * behind. + */ + // backfill + osd->clog->debug() << info.pgid << " starting backfill to osd." << peer + << " from (" << pi.log_tail << "," << pi.last_update + << "] " << pi.last_backfill + << " to " << info.last_update; + + pi.last_update = info.last_update; + pi.last_complete = info.last_update; + pi.set_last_backfill(hobject_t()); + pi.last_epoch_started = info.last_epoch_started; + pi.last_interval_started = info.last_interval_started; + pi.history = info.history; + pi.hit_set = info.hit_set; + // Save num_bytes for reservation request, can't be negative + peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes); + pi.stats.stats.clear(); + + // initialize peer with our purged_snaps. + pi.purged_snaps = info.purged_snaps; + + m = new MOSDPGLog( + i->shard, pg_whoami.shard, + get_osdmap_epoch(), pi, + last_peering_reset /* epoch to create pg at */); + + // send some recent log, so that op dup detection works well. + m->log.copy_up_to(cct, pg_log.get_log(), cct->_conf->osd_min_pg_log_entries); + m->info.log_tail = m->log.tail; + pi.log_tail = m->log.tail; // sigh... + + pm.clear(); + } else { + // catch up + ceph_assert(pg_log.get_tail() <= pi.last_update); + m = new MOSDPGLog( + i->shard, pg_whoami.shard, + get_osdmap_epoch(), info, + last_peering_reset /* epoch to create pg at */); + // send new stuff to append to replicas log + m->log.copy_after(cct, pg_log.get_log(), pi.last_update); + } + + // share past_intervals if we are creating the pg on the replica + // based on whether our info for that peer was dne() *before* + // updating pi.history in the backfill block above. + if (m && needs_past_intervals) + m->past_intervals = past_intervals; + + // update local version of peer's missing list! + if (m && pi.last_backfill != hobject_t()) { + for (list<pg_log_entry_t>::iterator p = m->log.log.begin(); + p != m->log.log.end(); + ++p) { + if (p->soid <= pi.last_backfill && + !p->is_error()) { + if (perform_deletes_during_peering() && p->is_delete()) { + pm.rm(p->soid, p->version); + } else { + pm.add_next_event(*p); + } + } + } + } + + if (m) { + dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl; + //m->log.print(cout); + osd->send_message_osd_cluster(peer.osd, m, get_osdmap_epoch()); + } + + // peer now has + pi.last_update = info.last_update; + + // update our missing + if (pm.num_missing() == 0) { + pi.last_complete = pi.last_update; + dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl; + } else { + dout(10) << "activate peer osd." 
<< peer << " " << pi << " missing " << pm << dendl; + } + } + + // Set up missing_loc + set<pg_shard_t> complete_shards; + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl; + if (*i == get_primary()) { + missing_loc.add_active_missing(missing); + if (!missing.have_missing()) + complete_shards.insert(*i); + } else { + auto peer_missing_entry = peer_missing.find(*i); + ceph_assert(peer_missing_entry != peer_missing.end()); + missing_loc.add_active_missing(peer_missing_entry->second); + if (!peer_missing_entry->second.have_missing() && + peer_info[*i].last_backfill.is_max()) + complete_shards.insert(*i); + } + } + + // If necessary, create might_have_unfound to help us find our unfound objects. + // NOTE: It's important that we build might_have_unfound before trimming the + // past intervals. + might_have_unfound.clear(); + if (needs_recovery()) { + // If only one shard has missing, we do a trick to add all others as recovery + // source, this is considered safe since the PGLogs have been merged locally, + // and covers vast majority of the use cases, like one OSD/host is down for + // a while for hardware repairing + if (complete_shards.size() + 1 == acting_recovery_backfill.size()) { + missing_loc.add_batch_sources_info(complete_shards, ctx->handle); + } else { + missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(), + ctx->handle); + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == pg_whoami) continue; + dout(10) << __func__ << ": adding " << *i << " as a source" << dendl; + ceph_assert(peer_missing.count(*i)); + ceph_assert(peer_info.count(*i)); + missing_loc.add_source_info( + *i, + peer_info[*i], + peer_missing[*i], + ctx->handle); + } + } + for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin(); + i != peer_missing.end(); + ++i) { + if (is_acting_recovery_backfill(i->first)) + continue; + ceph_assert(peer_info.count(i->first)); + search_for_missing( + peer_info[i->first], + i->second, + i->first, + ctx); + } + + build_might_have_unfound(); + + // Always call now so _update_calc_stats() will be accurate + discover_all_missing(query_map); + } + + // num_objects_degraded if calculated should reflect this too, unless no + // missing and we are about to go clean. + if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) { + state_set(PG_STATE_UNDERSIZED); + } + + state_set(PG_STATE_ACTIVATING); + release_pg_backoffs(); + projected_last_update = info.last_update; + } + if (acting.size() >= pool.info.min_size) { + PGLogEntryHandler handler{this, &t}; + pg_log.roll_forward(&handler); + } +} + +bool PG::op_has_sufficient_caps(OpRequestRef& op) +{ + // only check MOSDOp + if (op->get_req()->get_type() != CEPH_MSG_OSD_OP) + return true; + + const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req()); + + auto priv = req->get_connection()->get_priv(); + auto session = static_cast<Session*>(priv.get()); + if (!session) { + dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl; + return false; + } + OSDCap& caps = session->caps; + priv.reset(); + + const string &key = req->get_hobj().get_key().empty() ? 
+ req->get_oid().name : + req->get_hobj().get_key(); + + bool cap = caps.is_capable(pool.name, req->get_hobj().nspace, + pool.info.application_metadata, + key, + op->need_read_cap(), + op->need_write_cap(), + op->classes(), + session->get_peer_socket_addr()); + + dout(20) << "op_has_sufficient_caps " + << "session=" << session + << " pool=" << pool.id << " (" << pool.name + << " " << req->get_hobj().nspace + << ")" + << " pool_app_metadata=" << pool.info.application_metadata + << " need_read_cap=" << op->need_read_cap() + << " need_write_cap=" << op->need_write_cap() + << " classes=" << op->classes() + << " -> " << (cap ? "yes" : "NO") + << dendl; + return cap; +} + +void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch) +{ + lock(); + if (pg_has_reset_since(epoch)) { + dout(10) << "_activate_committed " << epoch + << ", that was an old interval" << dendl; + } else if (is_primary()) { + ceph_assert(!peer_activated.count(pg_whoami)); + peer_activated.insert(pg_whoami); + dout(10) << "_activate_committed " << epoch + << " peer_activated now " << peer_activated + << " last_interval_started " << info.history.last_interval_started + << " last_epoch_started " << info.history.last_epoch_started + << " same_interval_since " << info.history.same_interval_since << dendl; + ceph_assert(!acting_recovery_backfill.empty()); + if (peer_activated.size() == acting_recovery_backfill.size()) + all_activated_and_committed(); + } else { + dout(10) << "_activate_committed " << epoch << " telling primary" << dendl; + MOSDPGInfo *m = new MOSDPGInfo(epoch); + pg_notify_t i = pg_notify_t( + get_primary().shard, pg_whoami.shard, + get_osdmap_epoch(), + get_osdmap_epoch(), + info); + + i.info.history.last_epoch_started = activation_epoch; + i.info.history.last_interval_started = i.info.history.same_interval_since; + if (acting.size() >= pool.info.min_size) { + state_set(PG_STATE_ACTIVE); + } else { + state_set(PG_STATE_PEERED); + } + + m->pg_list.push_back(make_pair(i, PastIntervals())); + osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap_epoch()); + + // waiters + if (flushes_in_progress == 0) { + requeue_ops(waiting_for_peered); + } else if (!waiting_for_peered.empty()) { + dout(10) << __func__ << " flushes in progress, moving " + << waiting_for_peered.size() << " items to waiting_for_flush" + << dendl; + ceph_assert(waiting_for_flush.empty()); + waiting_for_flush.swap(waiting_for_peered); + } + } + + ceph_assert(!dirty_info); + + unlock(); +} + +/* + * update info.history.last_epoch_started ONLY after we and all + * replicas have activated AND committed the activate transaction + * (i.e. the peering results are stable on disk). + */ +void PG::all_activated_and_committed() +{ + dout(10) << "all_activated_and_committed" << dendl; + ceph_assert(is_primary()); + ceph_assert(peer_activated.size() == acting_recovery_backfill.size()); + ceph_assert(!acting_recovery_backfill.empty()); + ceph_assert(blocked_by.empty()); + + // Degraded? 
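[Editor's note: a toy sketch of the activation barrier tracked through peer_activated in _activate_committed() and released in all_activated_and_committed(). The struct and names below are illustrative stand-ins; only the counting behaviour is taken from the code above.]

// Sketch only: every member of acting_recovery_backfill (including the
// primary itself) must commit the activate transaction before the event fires.
#include <iostream>
#include <set>

struct ActivationBarrier {
  std::set<int> acting_recovery_backfill;  // osds that must commit
  std::set<int> peer_activated;            // osds that have committed so far

  void ack(int osd) {
    peer_activated.insert(osd);
    if (peer_activated == acting_recovery_backfill)
      std::cout << "queue AllReplicasActivated\n";   // barrier released
    else
      std::cout << "waiting, " << peer_activated.size() << "/"
                << acting_recovery_backfill.size() << " activated\n";
  }
};

int main() {
  ActivationBarrier b{{0, 3, 7}, {}};
  b.ack(0);  // primary's own commit
  b.ack(7);
  b.ack(3);  // last ack releases the barrier
}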
+ _update_calc_stats(); + if (info.stats.stats.sum.num_objects_degraded) { + state_set(PG_STATE_DEGRADED); + } else { + state_clear(PG_STATE_DEGRADED); + } + + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + AllReplicasActivated()))); +} + +bool PG::requeue_scrub(bool high_priority) +{ + ceph_assert(is_locked()); + if (scrub_queued) { + dout(10) << __func__ << ": already queued" << dendl; + return false; + } else { + dout(10) << __func__ << ": queueing" << dendl; + scrub_queued = true; + osd->queue_for_scrub(this, high_priority); + return true; + } +} + +void PG::queue_recovery() +{ + if (!is_primary() || !is_peered()) { + dout(10) << "queue_recovery -- not primary or not peered " << dendl; + ceph_assert(!recovery_queued); + } else if (recovery_queued) { + dout(10) << "queue_recovery -- already queued" << dendl; + } else { + dout(10) << "queue_recovery -- queuing" << dendl; + recovery_queued = true; + osd->queue_for_recovery(this); + } +} + +bool PG::queue_scrub() +{ + ceph_assert(is_locked()); + if (is_scrubbing()) { + return false; + } + // An interrupted recovery repair could leave this set. + state_clear(PG_STATE_REPAIR); + if (scrubber.need_auto) { + scrubber.must_scrub = true; + scrubber.must_deep_scrub = true; + scrubber.auto_repair = true; + scrubber.need_auto = false; + } + scrubber.priority = scrubber.must_scrub ? + cct->_conf->osd_requested_scrub_priority : get_scrub_priority(); + scrubber.must_scrub = false; + state_set(PG_STATE_SCRUBBING); + if (scrubber.must_deep_scrub) { + state_set(PG_STATE_DEEP_SCRUB); + scrubber.must_deep_scrub = false; + } + if (scrubber.must_repair || scrubber.auto_repair) { + state_set(PG_STATE_REPAIR); + scrubber.must_repair = false; + } + requeue_scrub(); + return true; +} + +unsigned PG::get_scrub_priority() +{ + // a higher value -> a higher priority + int64_t pool_scrub_priority = 0; + pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority); + return pool_scrub_priority > 0 ? 
pool_scrub_priority : cct->_conf->osd_scrub_priority; +} + +void PG::try_mark_clean() +{ + if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) { + state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); + state_set(PG_STATE_CLEAN); + info.history.last_epoch_clean = get_osdmap_epoch(); + info.history.last_interval_clean = info.history.same_interval_since; + past_intervals.clear(); + dirty_big_info = true; + dirty_info = true; + } + + if (is_active()) { + kick_snap_trim(); + } else if (is_peered()) { + if (is_clean()) { + bool target; + if (pool.info.is_pending_merge(info.pgid.pgid, &target)) { + if (target) { + ldout(cct, 10) << "ready to merge (target)" << dendl; + osd->set_ready_to_merge_target(this, + info.last_update, + info.history.last_epoch_started, + info.history.last_epoch_clean); + } else { + ldout(cct, 10) << "ready to merge (source)" << dendl; + osd->set_ready_to_merge_source(this, info.last_update); + } + } + } else { + ldout(cct, 10) << "not clean, not ready to merge" << dendl; + // we should have notified OSD in Active state entry point + } + } + + state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL); + + share_pg_info(); + publish_stats_to_osd(); + requeue_ops(waiting_for_clean_to_primary_repair); +} + +bool PG::set_force_recovery(bool b) +{ + bool did = false; + if (b) { + if (!(state & PG_STATE_FORCED_RECOVERY) && + (state & (PG_STATE_DEGRADED | + PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERING))) { + dout(20) << __func__ << " set" << dendl; + state_set(PG_STATE_FORCED_RECOVERY); + publish_stats_to_osd(); + did = true; + } + } else if (state & PG_STATE_FORCED_RECOVERY) { + dout(20) << __func__ << " clear" << dendl; + state_clear(PG_STATE_FORCED_RECOVERY); + publish_stats_to_osd(); + did = true; + } + if (did) { + dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl; + osd->local_reserver.update_priority(info.pgid, get_recovery_priority()); + } + return did; +} + +bool PG::set_force_backfill(bool b) +{ + bool did = false; + if (b) { + if (!(state & PG_STATE_FORCED_BACKFILL) && + (state & (PG_STATE_DEGRADED | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILLING))) { + dout(10) << __func__ << " set" << dendl; + state_set(PG_STATE_FORCED_BACKFILL); + publish_stats_to_osd(); + did = true; + } + } else if (state & PG_STATE_FORCED_BACKFILL) { + dout(10) << __func__ << " clear" << dendl; + state_clear(PG_STATE_FORCED_BACKFILL); + publish_stats_to_osd(); + did = true; + } + if (did) { + dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl; + osd->local_reserver.update_priority(info.pgid, get_backfill_priority()); + } + return did; +} + +int PG::clamp_recovery_priority(int priority, int pool_recovery_priority, int max) +{ + static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range"); + static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type"); + + ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX); + + // User can't set this too high anymore, but might be a legacy value + if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX) + pool_recovery_priority = OSD_POOL_PRIORITY_MAX; + if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN) + pool_recovery_priority = OSD_POOL_PRIORITY_MIN; + // Shift range from min to max to 0 to max - min + pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN); + ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN)); + + priority += 
pool_recovery_priority; + + // Clamp to valid range + if (priority > max) { + return max; + } else if (priority < OSD_RECOVERY_PRIORITY_MIN) { + return OSD_RECOVERY_PRIORITY_MIN; + } else { + return priority; + } +} + +unsigned PG::get_recovery_priority() +{ + // a higher value -> a higher priority + int ret = OSD_RECOVERY_PRIORITY_BASE; + int base = ret; + + if (state & PG_STATE_FORCED_RECOVERY) { + ret = OSD_RECOVERY_PRIORITY_FORCED; + } else { + // XXX: This priority boost isn't so much about inactive, but about data-at-risk + if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) { + base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE; + // inactive: no. of replicas < min_size, highest priority since it blocks IO + ret = base + (pool.info.min_size - info.stats.avail_no_missing.size()); + } + + int64_t pool_recovery_priority = 0; + pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); + + ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]); + } + dout(20) << __func__ << " recovery priority is " << ret << dendl; + return static_cast<unsigned>(ret); +} + +unsigned PG::get_backfill_priority() +{ + // a higher value -> a higher priority + int ret = OSD_BACKFILL_PRIORITY_BASE; + int base = ret; + + if (state & PG_STATE_FORCED_BACKFILL) { + ret = OSD_BACKFILL_PRIORITY_FORCED; + } else { + if (acting.size() < pool.info.min_size) { + base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE; + // inactive: no. of replicas < min_size, highest priority since it blocks IO + ret = base + (pool.info.min_size - acting.size()); + + } else if (is_undersized()) { + // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas + ceph_assert(pool.info.size > actingset.size()); + base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE; + ret = base + (pool.info.size - actingset.size()); + + } else if (is_degraded()) { + // degraded: baseline degraded + base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE; + } + + // Adjust with pool's recovery priority + int64_t pool_recovery_priority = 0; + pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); + + ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]); + } + + dout(20) << __func__ << " backfill priority is " << ret << dendl; + return static_cast<unsigned>(ret); +} + +unsigned PG::get_delete_priority() +{ + auto state = get_osdmap()->get_state(osd->whoami); + if (state & (CEPH_OSD_BACKFILLFULL | + CEPH_OSD_FULL)) { + return OSD_DELETE_PRIORITY_FULL; + } else if (state & CEPH_OSD_NEARFULL) { + return OSD_DELETE_PRIORITY_FULLISH; + } else { + return OSD_DELETE_PRIORITY_NORMAL; + } +} + +Context *PG::finish_recovery() +{ + dout(10) << "finish_recovery" << dendl; + ceph_assert(info.last_complete == info.last_update); + + clear_recovery_state(); + + /* + * sync all this before purging strays. but don't block! 
+ */ + finish_sync_event = new C_PG_FinishRecovery(this); + return finish_sync_event; +} + +void PG::_finish_recovery(Context *c) +{ + lock(); + // When recovery is initiated by a repair, that flag is left on + state_clear(PG_STATE_REPAIR); + if (deleting) { + unlock(); + return; + } + if (c == finish_sync_event) { + dout(10) << "_finish_recovery" << dendl; + finish_sync_event = 0; + purge_strays(); + + publish_stats_to_osd(); + + if (scrub_after_recovery) { + dout(10) << "_finish_recovery requeueing for scrub" << dendl; + scrub_after_recovery = false; + scrubber.must_deep_scrub = true; + scrubber.check_repair = true; + // We remember whether req_scrub was set when scrub_after_recovery set to true + scrubber.req_scrub = save_req_scrub; + queue_scrub(); + } + } else { + dout(10) << "_finish_recovery -- stale" << dendl; + } + unlock(); +} + +void PG::start_recovery_op(const hobject_t& soid) +{ + dout(10) << "start_recovery_op " << soid +#ifdef DEBUG_RECOVERY_OIDS + << " (" << recovering_oids << ")" +#endif + << dendl; + ceph_assert(recovery_ops_active >= 0); + recovery_ops_active++; +#ifdef DEBUG_RECOVERY_OIDS + recovering_oids.insert(soid); +#endif + osd->start_recovery_op(this, soid); +} + +void PG::finish_recovery_op(const hobject_t& soid, bool dequeue) +{ + dout(10) << "finish_recovery_op " << soid +#ifdef DEBUG_RECOVERY_OIDS + << " (" << recovering_oids << ")" +#endif + << dendl; + ceph_assert(recovery_ops_active > 0); + recovery_ops_active--; +#ifdef DEBUG_RECOVERY_OIDS + ceph_assert(recovering_oids.count(soid)); + recovering_oids.erase(recovering_oids.find(soid)); +#endif + osd->finish_recovery_op(this, soid, dequeue); + + if (!dequeue) { + queue_recovery(); + } +} + +void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) +{ + child->update_snap_mapper_bits(split_bits); + child->update_osdmap_ref(get_osdmap()); + + child->pool = pool; + + // Log + pg_log.split_into(child_pgid, split_bits, &(child->pg_log)); + child->info.last_complete = info.last_complete; + + info.last_update = pg_log.get_head(); + child->info.last_update = child->pg_log.get_head(); + + child->info.last_user_version = info.last_user_version; + + info.log_tail = pg_log.get_tail(); + child->info.log_tail = child->pg_log.get_tail(); + + // reset last_complete, we might have modified pg_log & missing above + pg_log.reset_complete_to(&info); + child->pg_log.reset_complete_to(&child->info); + + // Info + child->info.history = info.history; + child->info.history.epoch_created = get_osdmap_epoch(); + child->info.purged_snaps = info.purged_snaps; + + if (info.last_backfill.is_max()) { + child->info.set_last_backfill(hobject_t::get_max()); + } else { + // restart backfill on parent and child to be safe. we could + // probably do better in the bitwise sort case, but it's more + // fragile (there may be special work to do on backfill completion + // in the future). 
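[Editor's note: a compact sketch of the clamp_recovery_priority arithmetic shown above: the pool's recovery priority is clamped to the legal pool range, shifted so it is non-negative, added to the base priority, and the result is clamped again. The constant values below are illustrative placeholders, not the real OSD_* constants from the Ceph headers.]

// Sketch only: priority clamping arithmetic with made-up constants.
#include <algorithm>
#include <iostream>

constexpr int OSD_RECOVERY_PRIORITY_MIN = 0;    // illustrative
constexpr int OSD_POOL_PRIORITY_MIN     = -10;  // illustrative
constexpr int OSD_POOL_PRIORITY_MAX     = 10;   // illustrative

static int clamp_priority(int priority, int pool_recovery_priority, int max) {
  // clamp a possibly-legacy pool value into [POOL_MIN, POOL_MAX]
  pool_recovery_priority =
      std::clamp(pool_recovery_priority, OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX);
  // shift [POOL_MIN, POOL_MAX] to [0, POOL_MAX - POOL_MIN] and add to the base
  priority += pool_recovery_priority - OSD_POOL_PRIORITY_MIN;
  // final result stays inside [RECOVERY_MIN, max]
  return std::clamp(priority, OSD_RECOVERY_PRIORITY_MIN, max);
}

int main() {
  std::cout << clamp_priority(180, 5, 254) << "\n";   // base plus shifted pool value
  std::cout << clamp_priority(250, 10, 254) << "\n";  // clamped to the maximum
}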
+ info.set_last_backfill(hobject_t()); + child->info.set_last_backfill(hobject_t()); + // restarting backfill implies that the missing set is empty, + // since it is only used for objects prior to last_backfill + pg_log.reset_backfill(); + child->pg_log.reset_backfill(); + } + + child->info.stats = info.stats; + child->info.stats.parent_split_bits = split_bits; + info.stats.stats_invalid = true; + child->info.stats.stats_invalid = true; + child->info.last_epoch_started = info.last_epoch_started; + child->info.last_interval_started = info.last_interval_started; + + child->snap_trimq = snap_trimq; + + // There can't be recovery/backfill going on now + int primary, up_primary; + vector<int> newup, newacting; + get_osdmap()->pg_to_up_acting_osds( + child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary); + child->init_primary_up_acting( + newup, + newacting, + up_primary, + primary); + child->role = OSDMap::calc_pg_role(osd->whoami, child->acting); + + // this comparison includes primary rank via pg_shard_t + if (get_primary() != child->get_primary()) + child->info.history.same_primary_since = get_osdmap_epoch(); + + child->info.stats.up = up; + child->info.stats.up_primary = up_primary; + child->info.stats.acting = acting; + child->info.stats.acting_primary = primary; + child->info.stats.mapping_epoch = get_osdmap_epoch(); + + // History + child->past_intervals = past_intervals; + + _split_into(child_pgid, child, split_bits); + + // release all backoffs for simplicity + release_backoffs(hobject_t(), hobject_t::get_max()); + + child->on_new_interval(); + + child->send_notify = !child->is_primary(); + + child->dirty_info = true; + child->dirty_big_info = true; + dirty_info = true; + dirty_big_info = true; +} + +void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out) +{ + out->resize(childpgs.size() + 1); + info.stats.stats.sum.split(*out); +} + +void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t) +{ + info.stats.stats.sum = stats; + write_if_dirty(*t); +} + +void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx, + unsigned split_bits, + const pg_merge_meta_t& last_pg_merge_meta) +{ + dout(10) << __func__ << " from " << sources << " split_bits " << split_bits + << dendl; + bool incomplete = false; + if (info.last_complete != info.last_update || + info.is_incomplete() || + info.dne()) { + dout(10) << __func__ << " target incomplete" << dendl; + incomplete = true; + } + if (last_pg_merge_meta.source_pgid != pg_t()) { + if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) { + dout(10) << __func__ << " target doesn't match expected parent " + << last_pg_merge_meta.source_pgid.get_parent() + << " of source_pgid " << last_pg_merge_meta.source_pgid + << dendl; + incomplete = true; + } + if (info.last_update != last_pg_merge_meta.target_version) { + dout(10) << __func__ << " target version doesn't match expected " + << last_pg_merge_meta.target_version << dendl; + incomplete = true; + } + } + + PGLogEntryHandler handler{this, rctx->transaction}; + pg_log.roll_forward(&handler); + + info.last_complete = info.last_update; // to fake out trim() + pg_log.reset_recovery_pointers(); + pg_log.trim(info.last_update, info); + + vector<PGLog*> log_from; + for (auto& i : sources) { + auto& source = i.second; + if (!source) { + dout(10) << __func__ << " source " << i.first << " missing" << dendl; + incomplete = true; + continue; + } + if (source->info.last_complete != source->info.last_update || + 
source->info.is_incomplete() || + source->info.dne()) { + dout(10) << __func__ << " source " << source->pg_id << " incomplete" + << dendl; + incomplete = true; + } + if (last_pg_merge_meta.source_pgid != pg_t()) { + if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) { + dout(10) << __func__ << " source " << source->info.pgid.pgid + << " doesn't match expected source pgid " + << last_pg_merge_meta.source_pgid << dendl; + incomplete = true; + } + if (source->info.last_update != last_pg_merge_meta.source_version) { + dout(10) << __func__ << " source version doesn't match expected " + << last_pg_merge_meta.target_version << dendl; + incomplete = true; + } + } + + // prepare log + PGLogEntryHandler handler{source.get(), rctx->transaction}; + source->pg_log.roll_forward(&handler); + source->info.last_complete = source->info.last_update; // to fake out trim() + source->pg_log.reset_recovery_pointers(); + source->pg_log.trim(source->info.last_update, source->info); + log_from.push_back(&source->pg_log); + + // wipe out source's pgmeta + rctx->transaction->remove(source->coll, source->pgmeta_oid); + + // merge (and destroy source collection) + rctx->transaction->merge_collection(source->coll, coll, split_bits); + + // combine stats + info.stats.add(source->info.stats); + + // pull up last_update + info.last_update = std::max(info.last_update, source->info.last_update); + + // adopt source's PastIntervals if target has none. we can do this since + // pgp_num has been reduced prior to the merge, so the OSD mappings for + // the PGs are identical. + if (past_intervals.empty() && !source->past_intervals.empty()) { + dout(10) << __func__ << " taking source's past_intervals" << dendl; + past_intervals = source->past_intervals; + } + } + + // merge_collection does this, but maybe all of our sources were missing. + rctx->transaction->collection_set_bits(coll, split_bits); + + info.last_complete = info.last_update; + info.log_tail = info.last_update; + if (incomplete) { + info.last_backfill = hobject_t(); + } + + snap_mapper.update_bits(split_bits); + + // merge logs + pg_log.merge_from(log_from, info.last_update); + + // make sure we have a meaningful last_epoch_started/clean (if we were a + // placeholder) + if (info.history.epoch_created == 0) { + // start with (a) source's history, since these PGs *should* have been + // remapped in concert with each other... + info.history = sources.begin()->second->info.history; + + // we use the last_epoch_{started,clean} we got from + // the caller, which are the epochs that were reported by the PGs were + // found to be ready for merge. + info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean; + info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started; + info.last_epoch_started = last_pg_merge_meta.last_epoch_started; + dout(10) << __func__ + << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/" + << last_pg_merge_meta.last_epoch_clean + << " from pool last_dec_*, source pg history was " + << sources.begin()->second->info.history + << dendl; + + // above we have pulled down source's history and we need to check + // history.epoch_created again to confirm that source is not a placeholder + // too. 
(peering requires a sane history.same_interval_since value for any + // non-newly created pg and below here we know we are basically iterating + // back a series of past maps to fake a merge process, hence we need to + // fix history.same_interval_since first so that start_peering_interval() + // will not complain) + if (info.history.epoch_created == 0) { + dout(10) << __func__ << " both merge target and source are placeholders," + << " set sis to lec " << info.history.last_epoch_clean + << dendl; + info.history.same_interval_since = info.history.last_epoch_clean; + } + + // if the past_intervals start is later than last_epoch_clean, it + // implies the source repeered again but the target didn't, or + // that the source became clean in a later epoch than the target. + // avoid the discrepancy but adjusting the interval start + // backwards to match so that check_past_interval_bounds() will + // not complain. + auto pib = past_intervals.get_bounds(); + if (info.history.last_epoch_clean < pib.first) { + dout(10) << __func__ << " last_epoch_clean " + << info.history.last_epoch_clean << " < past_interval start " + << pib.first << ", adjusting start backwards" << dendl; + past_intervals.adjust_start_backwards(info.history.last_epoch_clean); + } + + // Similarly, if the same_interval_since value is later than + // last_epoch_clean, the next interval change will result in a + // past_interval start that is later than last_epoch_clean. This + // can happen if we use the pg_history values from the merge + // source. Adjust the same_interval_since value backwards if that + // happens. (We trust the les and lec values more because they came from + // the real target, whereas the history value we stole from the source.) + if (info.history.last_epoch_started < info.history.same_interval_since) { + dout(10) << __func__ << " last_epoch_started " + << info.history.last_epoch_started << " < same_interval_since " + << info.history.same_interval_since + << ", adjusting pg_history backwards" << dendl; + info.history.same_interval_since = info.history.last_epoch_clean; + // make sure same_{up,primary}_since are <= same_interval_since + info.history.same_up_since = std::min( + info.history.same_up_since, info.history.same_interval_since); + info.history.same_primary_since = std::min( + info.history.same_primary_since, info.history.same_interval_since); + } + } + + dirty_info = true; + dirty_big_info = true; +} + +void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end) +{ + ConnectionRef con = s->con; + if (!con) // OSD::ms_handle_reset clears s->con without a lock + return; + BackoffRef b(s->have_backoff(info.pgid, begin)); + if (b) { + derr << __func__ << " already have backoff for " << s << " begin " << begin + << " " << *b << dendl; + ceph_abort(); + } + std::lock_guard l(backoff_lock); + { + b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end); + backoffs[begin].insert(b); + s->add_backoff(b); + dout(10) << __func__ << " session " << s << " added " << *b << dendl; + } + con->send_message( + new MOSDBackoff( + info.pgid, + get_osdmap_epoch(), + CEPH_OSD_BACKOFF_OP_BLOCK, + b->id, + begin, + end)); +} + +void PG::release_backoffs(const hobject_t& begin, const hobject_t& end) +{ + dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl; + vector<BackoffRef> bv; + { + std::lock_guard l(backoff_lock); + auto p = backoffs.lower_bound(begin); + while (p != backoffs.end()) { + int r = cmp(p->first, end); + dout(20) << __func__ << " ? 
" << r << " " << p->first + << " " << p->second << dendl; + // note: must still examine begin=end=p->first case + if (r > 0 || (r == 0 && begin < end)) { + break; + } + dout(20) << __func__ << " checking " << p->first + << " " << p->second << dendl; + auto q = p->second.begin(); + while (q != p->second.end()) { + dout(20) << __func__ << " checking " << *q << dendl; + int r = cmp((*q)->begin, begin); + if (r == 0 || (r > 0 && (*q)->end < end)) { + bv.push_back(*q); + q = p->second.erase(q); + } else { + ++q; + } + } + if (p->second.empty()) { + p = backoffs.erase(p); + } else { + ++p; + } + } + } + for (auto b : bv) { + std::lock_guard l(b->lock); + dout(10) << __func__ << " " << *b << dendl; + if (b->session) { + ceph_assert(b->pg == this); + ConnectionRef con = b->session->con; + if (con) { // OSD::ms_handle_reset clears s->con without a lock + con->send_message( + new MOSDBackoff( + info.pgid, + get_osdmap_epoch(), + CEPH_OSD_BACKOFF_OP_UNBLOCK, + b->id, + b->begin, + b->end)); + } + if (b->is_new()) { + b->state = Backoff::STATE_DELETING; + } else { + b->session->rm_backoff(b); + b->session.reset(); + } + b->pg.reset(); + } + } +} + +void PG::clear_backoffs() +{ + dout(10) << __func__ << " " << dendl; + map<hobject_t,set<BackoffRef>> ls; + { + std::lock_guard l(backoff_lock); + ls.swap(backoffs); + } + for (auto& p : ls) { + for (auto& b : p.second) { + std::lock_guard l(b->lock); + dout(10) << __func__ << " " << *b << dendl; + if (b->session) { + ceph_assert(b->pg == this); + if (b->is_new()) { + b->state = Backoff::STATE_DELETING; + } else { + b->session->rm_backoff(b); + b->session.reset(); + } + b->pg.reset(); + } + } + } +} + +// called by Session::clear_backoffs() +void PG::rm_backoff(BackoffRef b) +{ + dout(10) << __func__ << " " << *b << dendl; + std::lock_guard l(backoff_lock); + ceph_assert(b->lock.is_locked_by_me()); + ceph_assert(b->pg == this); + auto p = backoffs.find(b->begin); + // may race with release_backoffs() + if (p != backoffs.end()) { + auto q = p->second.find(b); + if (q != p->second.end()) { + p->second.erase(q); + if (p->second.empty()) { + backoffs.erase(p); + } + } + } +} + +void PG::clear_recovery_state() +{ + dout(10) << "clear_recovery_state" << dendl; + + pg_log.reset_recovery_pointers(); + finish_sync_event = 0; + + hobject_t soid; + while (recovery_ops_active > 0) { +#ifdef DEBUG_RECOVERY_OIDS + soid = *recovering_oids.begin(); +#endif + finish_recovery_op(soid, true); + } + + async_recovery_targets.clear(); + backfill_targets.clear(); + backfill_info.clear(); + peer_backfill_info.clear(); + waiting_on_backfill.clear(); + _clear_recovery_state(); // pg impl specific hook +} + +void PG::cancel_recovery() +{ + dout(10) << "cancel_recovery" << dendl; + clear_recovery_state(); +} + + +void PG::purge_strays() +{ + if (is_premerge()) { + dout(10) << "purge_strays " << stray_set << " but premerge, doing nothing" + << dendl; + return; + } + if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) { + return; + } + dout(10) << "purge_strays " << stray_set << dendl; + + bool removed = false; + for (set<pg_shard_t>::iterator p = stray_set.begin(); + p != stray_set.end(); + ++p) { + ceph_assert(!is_acting_recovery_backfill(*p)); + if (get_osdmap()->is_up(p->osd)) { + dout(10) << "sending PGRemove to osd." 
<< *p << dendl; + vector<spg_t> to_remove; + to_remove.push_back(spg_t(info.pgid.pgid, p->shard)); + MOSDPGRemove *m = new MOSDPGRemove( + get_osdmap_epoch(), + to_remove); + osd->send_message_osd_cluster(p->osd, m, get_osdmap_epoch()); + } else { + dout(10) << "not sending PGRemove to down osd." << *p << dendl; + } + peer_missing.erase(*p); + peer_info.erase(*p); + peer_purged.insert(*p); + removed = true; + } + + // if we removed anyone, update peers (which include peer_info) + if (removed) + update_heartbeat_peers(); + + stray_set.clear(); + + // clear _requested maps; we may have to peer() again if we discover + // (more) stray content + peer_log_requested.clear(); + peer_missing_requested.clear(); +} + +void PG::set_probe_targets(const set<pg_shard_t> &probe_set) +{ + std::lock_guard l(heartbeat_peer_lock); + probe_targets.clear(); + for (set<pg_shard_t>::iterator i = probe_set.begin(); + i != probe_set.end(); + ++i) { + probe_targets.insert(i->osd); + } +} + +void PG::clear_probe_targets() +{ + std::lock_guard l(heartbeat_peer_lock); + probe_targets.clear(); +} + +void PG::update_heartbeat_peers() +{ + ceph_assert(is_locked()); + + if (!is_primary()) + return; + + set<int> new_peers; + for (unsigned i=0; i<acting.size(); i++) { + if (acting[i] != CRUSH_ITEM_NONE) + new_peers.insert(acting[i]); + } + for (unsigned i=0; i<up.size(); i++) { + if (up[i] != CRUSH_ITEM_NONE) + new_peers.insert(up[i]); + } + for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin(); + p != peer_info.end(); + ++p) + new_peers.insert(p->first.osd); + + bool need_update = false; + heartbeat_peer_lock.Lock(); + if (new_peers == heartbeat_peers) { + dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl; + } else { + dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl; + heartbeat_peers.swap(new_peers); + need_update = true; + } + heartbeat_peer_lock.Unlock(); + + if (need_update) + osd->need_heartbeat_peer_update(); +} + + +bool PG::check_in_progress_op( + const osd_reqid_t &r, + eversion_t *version, + version_t *user_version, + int *return_code) const +{ + return ( + projected_log.get_request(r, version, user_version, return_code) || + pg_log.get_log().get_request(r, version, user_version, return_code)); +} + +static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard) +{ + for (auto&p : pgs) + if (p.shard == shard) + return true; + return false; +} + +static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard) +{ + for (auto&p : pgs) { + if (p == skip) + continue; + if (p.shard == shard) + return p; + } + return pg_shard_t(); +} + +void PG::_update_calc_stats() +{ + info.stats.version = info.last_update; + info.stats.created = info.history.epoch_created; + info.stats.last_scrub = info.history.last_scrub; + info.stats.last_scrub_stamp = info.history.last_scrub_stamp; + info.stats.last_deep_scrub = info.history.last_deep_scrub; + info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp; + info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp; + info.stats.last_epoch_clean = info.history.last_epoch_clean; + + info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version; + info.stats.ondisk_log_size = info.stats.log_size; + info.stats.log_start = pg_log.get_tail(); + info.stats.ondisk_log_start = pg_log.get_tail(); + info.stats.snaptrimq_len = snap_trimq.size(); + + unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid); + + // In rare case 
that upset is too large (usually transient), use as target + // for calculations below. + unsigned target = std::max(num_shards, (unsigned)upset.size()); + // For undersized actingset may be larger with OSDs out + unsigned nrep = std::max(actingset.size(), upset.size()); + // calc num_object_copies + info.stats.stats.calc_copies(std::max(target, nrep)); + info.stats.stats.sum.num_objects_degraded = 0; + info.stats.stats.sum.num_objects_unfound = 0; + info.stats.stats.sum.num_objects_misplaced = 0; + info.stats.avail_no_missing.clear(); + info.stats.object_location_counts.clear(); + + // We should never hit this condition, but if end up hitting it, + // make sure to update num_objects and set PG_STATE_INCONSISTENT. + if (info.stats.stats.sum.num_objects < 0) { + dout(0) << __func__ << " negative num_objects = " + << info.stats.stats.sum.num_objects << " setting it to 0 " + << dendl; + info.stats.stats.sum.num_objects = 0; + state_set(PG_STATE_INCONSISTENT); + } + + if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) { + dout(20) << __func__ << " actingset " << actingset << " upset " + << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl; + dout(20) << __func__ << " acting " << acting << " up " << up << dendl; + + ceph_assert(!acting_recovery_backfill.empty()); + + bool estimate = false; + + // NOTE: we only generate degraded, misplaced and unfound + // values for the summation, not individual stat categories. + int64_t num_objects = info.stats.stats.sum.num_objects; + + // Objects missing from up nodes, sorted by # objects. + boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects; + // Objects missing from nodes not in up, sort by # objects + boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects; + + // Fill missing_target_objects/acting_source_objects + + { + int64_t missing; + + // Primary first + missing = pg_log.get_missing().num_missing(); + ceph_assert(acting_recovery_backfill.count(pg_whoami)); + if (upset.count(pg_whoami)) { + missing_target_objects.insert(make_pair(missing, pg_whoami)); + } else { + acting_source_objects.insert(make_pair(missing, pg_whoami)); + } + info.stats.stats.sum.num_objects_missing_on_primary = missing; + if (missing == 0) + info.stats.avail_no_missing.push_back(pg_whoami); + dout(20) << __func__ << " shard " << pg_whoami + << " primary objects " << num_objects + << " missing " << missing + << dendl; + } + + // All other peers + for (auto& peer : peer_info) { + // Primary should not be in the peer_info, skip if it is. + if (peer.first == pg_whoami) continue; + int64_t missing = 0; + int64_t peer_num_objects = + std::max((int64_t)0, peer.second.stats.stats.sum.num_objects); + // Backfill targets always track num_objects accurately + // all other peers track missing accurately. 
+ if (is_backfill_targets(peer.first)) { + missing = std::max((int64_t)0, num_objects - peer_num_objects); + } else { + if (peer_missing.count(peer.first)) { + missing = peer_missing[peer.first].num_missing(); + } else { + dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl; + if (is_recovering()) { + estimate = true; + } + missing = std::max((int64_t)0, num_objects - peer_num_objects); + } + } + if (upset.count(peer.first)) { + missing_target_objects.insert(make_pair(missing, peer.first)); + } else if (actingset.count(peer.first)) { + acting_source_objects.insert(make_pair(missing, peer.first)); + } + peer.second.stats.stats.sum.num_objects_missing = missing; + if (missing == 0) + info.stats.avail_no_missing.push_back(peer.first); + dout(20) << __func__ << " shard " << peer.first + << " objects " << peer_num_objects + << " missing " << missing + << dendl; + } + + // Compute object_location_counts + for (auto& ml: missing_loc.get_missing_locs()) { + info.stats.object_location_counts[ml.second]++; + dout(30) << __func__ << " " << ml.first << " object_location_counts[" + << ml.second << "]=" << info.stats.object_location_counts[ml.second] + << dendl; + } + int64_t not_missing = num_objects - missing_loc.get_missing_locs().size(); + if (not_missing) { + // During recovery we know upset == actingset and is being populated + // During backfill we know that all non-missing objects are in the actingset + info.stats.object_location_counts[actingset] = not_missing; + } + dout(30) << __func__ << " object_location_counts[" + << upset << "]=" << info.stats.object_location_counts[upset] + << dendl; + dout(20) << __func__ << " object_location_counts " + << info.stats.object_location_counts << dendl; + + // A misplaced object is not stored on the correct OSD + int64_t misplaced = 0; + // a degraded objects has fewer replicas or EC shards than the pool specifies. 
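[Editor's note: a worked mini-example of the per-bucket degraded/misplaced arithmetic used in the recovering branch of _update_calc_stats() for a replicated pool. The struct and values are hypothetical; the logic mirrors the counting above: each up OSD lacking an object contributes a degraded count, except that copies held on OSDs outside the up set turn that many degraded counts into misplaced ones.]

// Sketch only: degraded vs misplaced accounting for a 3-way replicated pool.
#include <algorithm>
#include <cstdint>
#include <iostream>

struct MissingBucket {
  int64_t objects;  // number of objects in this (up, other) bucket
  int up;           // copies present on up OSDs
  int other;        // copies present on OSDs outside the up set
};

int main() {
  const int up_size = 3;  // replicated pool with 3 up OSDs
  int64_t degraded = 0, misplaced = 0;

  for (const MissingBucket& b : {MissingBucket{10, 2, 0},    // one copy short
                                 MissingBucket{4, 1, 1}}) {  // one short, one stray
    int missing_shards = up_size - b.up;               // copies the up set lacks
    int64_t odegraded = b.objects * missing_shards;
    // copies that exist elsewhere are misplaced rather than degraded
    int more_osds = std::min(missing_shards, b.other);
    int64_t omisplaced = b.objects * more_osds;
    degraded += odegraded - omisplaced;
    misplaced += omisplaced;
  }
  std::cout << "degraded " << degraded << " misplaced " << misplaced << "\n";
  // prints: degraded 14 misplaced 4
}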
+ int64_t degraded = 0; + + if (is_recovering()) { + for (auto& sml: missing_loc.get_missing_by_count()) { + for (auto& ml: sml.second) { + int missing_shards; + if (sml.first == shard_id_t::NO_SHARD) { + dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl; + missing_shards = (int)upset.size() - ml.first.up; + } else { + // Handle shards not even in upset below + if (!find_shard(upset, sml.first)) + continue; + missing_shards = std::max(0, 1 - ml.first.up); + dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl; + } + int odegraded = ml.second * missing_shards; + // Copies on other osds but limited to the possible degraded + int more_osds = std::min(missing_shards, ml.first.other); + int omisplaced = ml.second * more_osds; + ceph_assert(omisplaced <= odegraded); + odegraded -= omisplaced; + + misplaced += omisplaced; + degraded += odegraded; + } + } + + dout(20) << __func__ << " missing based degraded " << degraded << dendl; + dout(20) << __func__ << " missing based misplaced " << misplaced << dendl; + + // Handle undersized case + if (pool.info.is_replicated()) { + // Add degraded for missing targets (num_objects missing) + ceph_assert(target >= upset.size()); + unsigned needed = target - upset.size(); + degraded += num_objects * needed; + } else { + for (unsigned i = 0 ; i < num_shards; ++i) { + shard_id_t shard(i); + + if (!find_shard(upset, shard)) { + pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard); + + if (pgs != pg_shard_t()) { + int64_t missing; + + if (pgs == pg_whoami) + missing = info.stats.stats.sum.num_objects_missing_on_primary; + else + missing = peer_info[pgs].stats.stats.sum.num_objects_missing; + + degraded += missing; + misplaced += std::max((int64_t)0, num_objects - missing); + } else { + // No shard anywhere + degraded += num_objects; + } + } + } + } + goto out; + } + + // Handle undersized case + if (pool.info.is_replicated()) { + // Add to missing_target_objects + ceph_assert(target >= missing_target_objects.size()); + unsigned needed = target - missing_target_objects.size(); + if (needed) + missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD))); + } else { + for (unsigned i = 0 ; i < num_shards; ++i) { + shard_id_t shard(i); + bool found = false; + for (const auto& t : missing_target_objects) { + if (std::get<1>(t).shard == shard) { + found = true; + break; + } + } + if (!found) + missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard))); + } + } + + for (const auto& item : missing_target_objects) + dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl; + for (const auto& item : acting_source_objects) + dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl; + + // Handle all objects not in missing for remapped + // or backfill + for (auto m = missing_target_objects.rbegin(); + m != missing_target_objects.rend(); ++m) { + + int64_t extra_missing = -1; + + if (pool.info.is_replicated()) { + if (!acting_source_objects.empty()) { + auto extra_copy = acting_source_objects.begin(); + extra_missing = std::get<0>(*extra_copy); + acting_source_objects.erase(extra_copy); + } + } else { // Erasure coded + // Use corresponding shard + for (const auto& a : acting_source_objects) { + if (std::get<1>(a).shard == std::get<1>(*m).shard) { + extra_missing = 
std::get<0>(a); + acting_source_objects.erase(a); + break; + } + } + } + + if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) { + // We don't know which of the objects on the target + // are part of extra_missing so assume are all degraded. + misplaced += std::get<0>(*m) - extra_missing; + degraded += extra_missing; + } else { + // 1. extra_missing == -1, more targets than sources so degraded + // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing + // previously degraded are now present on the target. + degraded += std::get<0>(*m); + } + } + // If there are still acting that haven't been accounted for + // then they are misplaced + for (const auto& a : acting_source_objects) { + int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a)); + dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl; + misplaced += extra_misplaced; + } +out: + // NOTE: Tests use these messages to verify this code + dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl; + dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "")<< dendl; + + info.stats.stats.sum.num_objects_degraded = degraded; + info.stats.stats.sum.num_objects_unfound = get_num_unfound(); + info.stats.stats.sum.num_objects_misplaced = misplaced; + } +} + +void PG::_update_blocked_by() +{ + // set a max on the number of blocking peers we report. if we go + // over, report a random subset. keep the result sorted. + unsigned keep = std::min<unsigned>(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by); + unsigned skip = blocked_by.size() - keep; + info.stats.blocked_by.clear(); + info.stats.blocked_by.resize(keep); + unsigned pos = 0; + for (set<int>::iterator p = blocked_by.begin(); + p != blocked_by.end() && keep > 0; + ++p) { + if (skip > 0 && (rand() % (skip + keep) < skip)) { + --skip; + } else { + info.stats.blocked_by[pos++] = *p; + --keep; + } + } +} + +void PG::publish_stats_to_osd() +{ + if (!is_primary()) + return; + + pg_stats_publish_lock.Lock(); + + if (info.stats.stats.sum.num_scrub_errors) + state_set(PG_STATE_INCONSISTENT); + else { + state_clear(PG_STATE_INCONSISTENT); + state_clear(PG_STATE_FAILED_REPAIR); + } + + utime_t now = ceph_clock_now(); + if (info.stats.state != state) { + info.stats.last_change = now; + // Optimistic estimation, if we just find out an inactive PG, + // assumt it is active till now. + if (!(state & PG_STATE_ACTIVE) && + (info.stats.state & PG_STATE_ACTIVE)) + info.stats.last_active = now; + + if ((state & PG_STATE_ACTIVE) && + !(info.stats.state & PG_STATE_ACTIVE)) + info.stats.last_became_active = now; + if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) && + !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))) + info.stats.last_became_peered = now; + info.stats.state = state; + } + + _update_calc_stats(); + if (info.stats.stats.sum.num_objects_degraded) { + state_set(PG_STATE_DEGRADED); + } else { + state_clear(PG_STATE_DEGRADED); + } + _update_blocked_by(); + + pg_stat_t pre_publish = info.stats; + pre_publish.stats.add(unstable_stats); + utime_t cutoff = now; + cutoff -= cct->_conf->osd_pg_stat_report_interval_max; + + if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) { + // share (some of) our purged_snaps via the pg_stats. limit # of intervals + // because we don't want to make the pg_stat_t structures too expensive. 
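[Editor's note: a small self-contained sketch of the capped, sorted random-subset selection that _update_blocked_by() performs above, with std::mt19937 standing in for rand() and a hard-coded cap standing in for osd_max_pg_blocked_by. Because the source set is iterated in order and elements are dropped with probability skip/(skip+keep), the reported subset is random but stays sorted.]

// Sketch only: selection sampling over a sorted set of blocking peers.
#include <algorithm>
#include <iostream>
#include <random>
#include <set>
#include <vector>

int main() {
  std::set<int> blocked_by = {2, 3, 5, 11, 17, 23, 29};
  const unsigned max_report = 4;   // stand-in for osd_max_pg_blocked_by

  unsigned keep = std::min<unsigned>(blocked_by.size(), max_report);
  unsigned skip = blocked_by.size() - keep;

  std::mt19937 rng{std::random_device{}()};
  std::vector<int> report;
  report.reserve(keep);
  for (int osd : blocked_by) {
    if (keep == 0)
      break;
    // drop this element with probability skip / (skip + keep)
    if (skip > 0 &&
        std::uniform_int_distribution<unsigned>(0, skip + keep - 1)(rng) < skip) {
      --skip;
    } else {
      report.push_back(osd);   // result stays sorted because the set is sorted
      --keep;
    }
  }
  for (int osd : report) std::cout << "blocked by osd." << osd << "\n";
}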
+ unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch; + unsigned num = 0; + auto i = info.purged_snaps.begin(); + while (num < max && i != info.purged_snaps.end()) { + pre_publish.purged_snaps.insert(i.get_start(), i.get_len()); + ++num; + ++i; + } + dout(20) << __func__ << " reporting purged_snaps " + << pre_publish.purged_snaps << dendl; + } + + if (pg_stats_publish_valid && pre_publish == pg_stats_publish && + info.stats.last_fresh > cutoff) { + dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch + << ": no change since " << info.stats.last_fresh << dendl; + } else { + // update our stat summary and timestamps + info.stats.reported_epoch = get_osdmap_epoch(); + ++info.stats.reported_seq; + + info.stats.last_fresh = now; + + if (info.stats.state & PG_STATE_CLEAN) + info.stats.last_clean = now; + if (info.stats.state & PG_STATE_ACTIVE) + info.stats.last_active = now; + if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) + info.stats.last_peered = now; + info.stats.last_unstale = now; + if ((info.stats.state & PG_STATE_DEGRADED) == 0) + info.stats.last_undegraded = now; + if ((info.stats.state & PG_STATE_UNDERSIZED) == 0) + info.stats.last_fullsized = now; + + pg_stats_publish_valid = true; + pg_stats_publish = pre_publish; + + dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch + << ":" << pg_stats_publish.reported_seq << dendl; + } + pg_stats_publish_lock.Unlock(); +} + +void PG::clear_publish_stats() +{ + dout(15) << "clear_stats" << dendl; + pg_stats_publish_lock.Lock(); + pg_stats_publish_valid = false; + pg_stats_publish_lock.Unlock(); +} + +/** + * initialize a newly instantiated pg + * + * Initialize PG state, as when a PG is initially created, or when it + * is first instantiated on the current node. 
+ * + * @param role our role/rank + * @param newup up set + * @param newacting acting set + * @param history pg history + * @param pi past_intervals + * @param backfill true if info should be marked as backfill + * @param t transaction to write out our new state in + */ +void PG::init( + int role, + const vector<int>& newup, int new_up_primary, + const vector<int>& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + bool backfill, + ObjectStore::Transaction *t) +{ + dout(10) << "init role " << role << " up " << newup << " acting " << newacting + << " history " << history + << " past_intervals " << pi + << dendl; + + set_role(role); + init_primary_up_acting( + newup, + newacting, + new_up_primary, + new_acting_primary); + + info.history = history; + past_intervals = pi; + + info.stats.up = up; + info.stats.up_primary = new_up_primary; + info.stats.acting = acting; + info.stats.acting_primary = new_acting_primary; + info.stats.mapping_epoch = info.history.same_interval_since; + + if (backfill) { + dout(10) << __func__ << ": Setting backfill" << dendl; + info.set_last_backfill(hobject_t()); + info.last_complete = info.last_update; + pg_log.mark_log_for_rewrite(); + } + + on_new_interval(); + + dirty_info = true; + dirty_big_info = true; + write_if_dirty(*t); +} + +void PG::shutdown() +{ + ch->flush(); + lock(); + on_shutdown(); + unlock(); +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +void PG::upgrade(ObjectStore *store) +{ + dout(0) << __func__ << " " << info_struct_v << " -> " << latest_struct_v + << dendl; + ceph_assert(info_struct_v <= 10); + ObjectStore::Transaction t; + + // <do upgrade steps here> + + // finished upgrade! + ceph_assert(info_struct_v == 10); + + // update infover_key + if (info_struct_v < latest_struct_v) { + map<string,bufferlist> v; + __u8 ver = latest_struct_v; + encode(ver, v[infover_key]); + t.omap_setkeys(coll, pgmeta_oid, v); + } + + dirty_info = true; + dirty_big_info = true; + write_if_dirty(t); + + ObjectStore::CollectionHandle ch = store->open_collection(coll); + int r = store->queue_transaction(ch, std::move(t)); + if (r != 0) { + derr << __func__ << ": queue_transaction returned " + << cpp_strerror(r) << dendl; + ceph_abort(); + } + ceph_assert(r == 0); + + C_SaferCond waiter; + if (!ch->flush_commit(&waiter)) { + waiter.wait(); + } +} + +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" + +int PG::_prepare_write_info(CephContext* cct, + map<string,bufferlist> *km, + epoch_t epoch, + pg_info_t &info, pg_info_t &last_written_info, + PastIntervals &past_intervals, + bool dirty_big_info, + bool dirty_epoch, + bool try_fast_info, + PerfCounters *logger) +{ + if (dirty_epoch) { + encode(epoch, (*km)[epoch_key]); + } + + if (logger) + logger->inc(l_osd_pg_info); + + // try to do info efficiently? 
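  /* Sketch of the fast path below, assuming pg_fast_info_t holds only the
   * per-write fields of pg_info_t (last_update/last_complete and the stat
   * sums): try_apply_to() patches those fields onto the last fully written
   * info, and if the result equals the current info it is enough to persist
   * the small fastinfo key; any other difference falls through to a full
   * re-encode of pg_info_t. */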
+ if (!dirty_big_info && try_fast_info && + info.last_update > last_written_info.last_update) { + pg_fast_info_t fast; + fast.populate_from(info); + bool did = fast.try_apply_to(&last_written_info); + ceph_assert(did); // we verified last_update increased above + if (info == last_written_info) { + encode(fast, (*km)[fastinfo_key]); + if (logger) + logger->inc(l_osd_pg_fastinfo); + return 0; + } + generic_dout(30) << __func__ << " fastinfo failed, info:\n"; + { + JSONFormatter jf(true); + jf.dump_object("info", info); + jf.flush(*_dout); + } + { + *_dout << "\nlast_written_info:\n"; + JSONFormatter jf(true); + jf.dump_object("last_written_info", last_written_info); + jf.flush(*_dout); + } + *_dout << dendl; + } + last_written_info = info; + + // info. store purged_snaps separately. + interval_set<snapid_t> purged_snaps; + purged_snaps.swap(info.purged_snaps); + encode(info, (*km)[info_key]); + purged_snaps.swap(info.purged_snaps); + + if (dirty_big_info) { + // potentially big stuff + bufferlist& bigbl = (*km)[biginfo_key]; + encode(past_intervals, bigbl); + encode(info.purged_snaps, bigbl); + //dout(20) << "write_info bigbl " << bigbl.length() << dendl; + if (logger) + logger->inc(l_osd_pg_biginfo); + } + + return 0; +} + +void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits) +{ + coll_t coll(pgid); + t.create_collection(coll, bits); +} + +void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool) +{ + coll_t coll(pgid); + + if (pool) { + // Give a hint to the PG collection + bufferlist hint; + uint32_t pg_num = pool->get_pg_num(); + uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num; + encode(pg_num, hint); + encode(expected_num_objects_pg, hint); + uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS; + t.collection_hint(coll, hint_type, hint); + } + + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + t.touch(coll, pgmeta_oid); + map<string,bufferlist> values; + __u8 struct_v = latest_struct_v; + encode(struct_v, values[infover_key]); + t.omap_setkeys(coll, pgmeta_oid, values); +} + +void PG::prepare_write_info(map<string,bufferlist> *km) +{ + info.stats.stats.add(unstable_stats); + unstable_stats.clear(); + + bool need_update_epoch = last_epoch < get_osdmap_epoch(); + int ret = _prepare_write_info(cct, km, get_osdmap_epoch(), + info, + last_written_info, + past_intervals, + dirty_big_info, need_update_epoch, + cct->_conf->osd_fast_info, + osd->logger); + ceph_assert(ret == 0); + if (need_update_epoch) + last_epoch = get_osdmap_epoch(); + last_persisted_osdmap = last_epoch; + + dirty_info = false; + dirty_big_info = false; +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +bool PG::_has_removal_flag(ObjectStore *store, + spg_t pgid) +{ + coll_t coll(pgid); + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + + // first try new way + set<string> keys; + keys.insert("_remove"); + map<string,bufferlist> values; + auto ch = store->open_collection(coll); + ceph_assert(ch); + if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 && + values.size() == 1) + return true; + + return false; +} + +int PG::peek_map_epoch(ObjectStore *store, + spg_t pgid, + epoch_t *pepoch) +{ + coll_t coll(pgid); + ghobject_t legacy_infos_oid(OSD::make_infos_oid()); + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + epoch_t cur_epoch = 0; + + // validate collection name + ceph_assert(coll.is_pg()); + + // try for v8 + set<string> keys; + 
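  /* From struct_v >= 8 on, the version and epoch live in the pgmeta
   * object's omap under infover_key and epoch_key, so the epoch can be
   * peeked here without decoding the full pg_info_t. */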
keys.insert(infover_key); + keys.insert(epoch_key); + map<string,bufferlist> values; + auto ch = store->open_collection(coll); + ceph_assert(ch); + int r = store->omap_get_values(ch, pgmeta_oid, keys, &values); + if (r == 0) { + ceph_assert(values.size() == 2); + + // sanity check version + auto bp = values[infover_key].cbegin(); + __u8 struct_v = 0; + decode(struct_v, bp); + ceph_assert(struct_v >= 8); + + // get epoch + bp = values[epoch_key].begin(); + decode(cur_epoch, bp); + } else { + // probably bug 10617; see OSD::load_pgs() + return -1; + } + + *pepoch = cur_epoch; + return 0; +} + +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" + +void PG::write_if_dirty(ObjectStore::Transaction& t) +{ + map<string,bufferlist> km; + if (dirty_big_info || dirty_info) + prepare_write_info(&km); + pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback()); + if (!km.empty()) + t.omap_setkeys(coll, pgmeta_oid, km); +} + +void PG::add_log_entry(const pg_log_entry_t& e, bool applied) +{ + // raise last_complete only if we were previously up to date + if (info.last_complete == info.last_update) + info.last_complete = e.version; + + // raise last_update. + ceph_assert(e.version > info.last_update); + info.last_update = e.version; + + // raise user_version, if it increased (it may have not get bumped + // by all logged updates) + if (e.user_version > info.last_user_version) + info.last_user_version = e.user_version; + + // log mutation + pg_log.add(e, applied); + dout(10) << "add_log_entry " << e << dendl; +} + + +void PG::append_log( + const vector<pg_log_entry_t>& logv, + eversion_t trim_to, + eversion_t roll_forward_to, + ObjectStore::Transaction &t, + bool transaction_applied, + bool async) +{ + if (transaction_applied) + update_snap_map(logv, t); + + /* The primary has sent an info updating the history, but it may not + * have arrived yet. We want to make sure that we cannot remember this + * write without remembering that it happened in an interval which went + * active in epoch history.last_epoch_started. + */ + if (info.last_epoch_started != info.history.last_epoch_started) { + info.history.last_epoch_started = info.last_epoch_started; + } + if (info.last_interval_started != info.history.last_interval_started) { + info.history.last_interval_started = info.last_interval_started; + } + dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl; + + PGLogEntryHandler handler{this, &t}; + if (!transaction_applied) { + /* We must be a backfill or async recovery peer, so it's ok if we apply + * out-of-turn since we won't be considered when + * determining a min possible last_update. + * + * We skip_rollforward() here, which advances the crt, without + * doing an actual rollforward. This avoids cleaning up entries + * from the backend and we do not end up in a situation, where the + * object is deleted before we can _merge_object_divergent_entries(). + */ + pg_log.skip_rollforward(); + } + + for (vector<pg_log_entry_t>::const_iterator p = logv.begin(); + p != logv.end(); + ++p) { + add_log_entry(*p, transaction_applied); + + /* We don't want to leave the rollforward artifacts around + * here past last_backfill. 
It's ok for the same reason as + * above */ + if (transaction_applied && + p->soid > info.last_backfill) { + pg_log.roll_forward(&handler); + } + } + auto last = logv.rbegin(); + if (is_primary() && last != logv.rend()) { + projected_log.skip_can_rollback_to_to_head(); + projected_log.trim(cct, last->version, nullptr, nullptr, nullptr); + } + + if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) { + pg_log.roll_forward_to( + roll_forward_to, + &handler); + last_rollback_info_trimmed_to_applied = roll_forward_to; + } + + dout(10) << __func__ << " approx pg log length = " + << pg_log.get_log().approx_size() << dendl; + dout(10) << __func__ << " transaction_applied = " + << transaction_applied << dendl; + if (!transaction_applied || async) + dout(10) << __func__ << " " << pg_whoami + << " is async_recovery or backfill target" << dendl; + pg_log.trim(trim_to, info, transaction_applied, async); + + // update the local pg, pg log + dirty_info = true; + write_if_dirty(t); +} + +bool PG::check_log_for_corruption(ObjectStore *store) +{ + /// TODO: this method needs to work with the omap log + return true; +} + +//! Get the name we're going to save our corrupt page log as +std::string PG::get_corrupt_pg_log_name() const +{ + const int MAX_BUF = 512; + char buf[MAX_BUF]; + struct tm tm_buf; + time_t my_time(time(NULL)); + const struct tm *t = localtime_r(&my_time, &tm_buf); + int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t); + if (ret == 0) { + dout(0) << "strftime failed" << dendl; + return "corrupt_log_unknown_time"; + } + string out(buf); + out += stringify(info.pgid); + return out; +} + +int PG::read_info( + ObjectStore *store, spg_t pgid, const coll_t &coll, + pg_info_t &info, PastIntervals &past_intervals, + __u8 &struct_v) +{ + set<string> keys; + keys.insert(infover_key); + keys.insert(info_key); + keys.insert(biginfo_key); + keys.insert(fastinfo_key); + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + map<string,bufferlist> values; + auto ch = store->open_collection(coll); + ceph_assert(ch); + int r = store->omap_get_values(ch, pgmeta_oid, keys, &values); + ceph_assert(r == 0); + ceph_assert(values.size() == 3 || + values.size() == 4); + + auto p = values[infover_key].cbegin(); + decode(struct_v, p); + ceph_assert(struct_v >= 10); + + p = values[info_key].begin(); + decode(info, p); + + p = values[biginfo_key].begin(); + decode(past_intervals, p); + decode(info.purged_snaps, p); + + p = values[fastinfo_key].begin(); + if (!p.end()) { + pg_fast_info_t fast; + decode(fast, p); + fast.try_apply_to(&info); + } + return 0; +} + +void PG::read_state(ObjectStore *store) +{ + int r = read_info(store, pg_id, coll, info, past_intervals, + info_struct_v); + ceph_assert(r >= 0); + + if (info_struct_v < compat_struct_v) { + derr << "PG needs upgrade, but on-disk data is too old; upgrade to" + << " an older version first." 
<< dendl; + ceph_abort_msg("PG too old to upgrade"); + } + + last_written_info = info; + + ostringstream oss; + pg_log.read_log_and_missing( + store, + ch, + pgmeta_oid, + info, + oss, + cct->_conf->osd_ignore_stale_divergent_priors, + cct->_conf->osd_debug_verify_missing_on_start); + if (oss.tellp()) + osd->clog->error() << oss.str(); + + // log any weirdness + log_weirdness(); + + if (info_struct_v < latest_struct_v) { + upgrade(store); + } + + // initialize current mapping + { + int primary, up_primary; + vector<int> acting, up; + get_osdmap()->pg_to_up_acting_osds( + pg_id.pgid, &up, &up_primary, &acting, &primary); + init_primary_up_acting( + up, + acting, + up_primary, + primary); + int rr = OSDMap::calc_pg_role(osd->whoami, acting); + if (pool.info.is_replicated() || rr == pg_whoami.shard) + set_role(rr); + else + set_role(-1); + } + + // init pool options + store->set_collection_opts(ch, pool.info.opts); + + PG::RecoveryCtx rctx(0, 0, 0, new ObjectStore::Transaction); + handle_initialize(&rctx); + // note: we don't activate here because we know the OSD will advance maps + // during boot. + write_if_dirty(*rctx.transaction); + store->queue_transaction(ch, std::move(*rctx.transaction)); + delete rctx.transaction; +} + +void PG::log_weirdness() +{ + if (pg_log.get_tail() != info.log_tail) + osd->clog->error() << info.pgid + << " info mismatch, log.tail " << pg_log.get_tail() + << " != info.log_tail " << info.log_tail; + if (pg_log.get_head() != info.last_update) + osd->clog->error() << info.pgid + << " info mismatch, log.head " << pg_log.get_head() + << " != info.last_update " << info.last_update; + + if (!pg_log.get_log().empty()) { + // sloppy check + if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail())) + osd->clog->error() << info.pgid + << " log bound mismatch, info (tail,head] (" + << pg_log.get_tail() << "," << pg_log.get_head() << "]" + << " actual [" + << pg_log.get_log().log.begin()->version << "," + << pg_log.get_log().log.rbegin()->version << "]"; + } + + if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) { + osd->clog->error() << info.pgid + << " caller_ops.size " << pg_log.get_log().caller_ops.size() + << " > log size " << pg_log.get_log().log.size(); + } +} + +void PG::update_snap_map( + const vector<pg_log_entry_t> &log_entries, + ObjectStore::Transaction &t) +{ + for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin(); + i != log_entries.end(); + ++i) { + OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); + if (i->soid.snap < CEPH_MAXSNAP) { + if (i->is_delete()) { + int r = snap_mapper.remove_oid( + i->soid, + &_t); + if (r != 0) + derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl; + // On removal tolerate missing key corruption + ceph_assert(r == 0 || r == -ENOENT); + } else if (i->is_update()) { + ceph_assert(i->snaps.length() > 0); + vector<snapid_t> snaps; + bufferlist snapbl = i->snaps; + auto p = snapbl.cbegin(); + try { + decode(snaps, p); + } catch (...) 
{ + derr << __func__ << " decode snaps failure on " << *i << dendl; + snaps.clear(); + } + set<snapid_t> _snaps(snaps.begin(), snaps.end()); + + if (i->is_clone() || i->is_promote()) { + snap_mapper.add_oid( + i->soid, + _snaps, + &_t); + } else if (i->is_modify()) { + int r = snap_mapper.update_snaps( + i->soid, + _snaps, + 0, + &_t); + ceph_assert(r == 0); + } else { + ceph_assert(i->is_clean()); + } + } + } + } +} + +/** + * filter trimming|trimmed snaps out of snapcontext + */ +void PG::filter_snapc(vector<snapid_t> &snaps) +{ + // nothing needs to trim, we can return immediately + if (snap_trimq.empty() && info.purged_snaps.empty()) + return; + + bool filtering = false; + vector<snapid_t> newsnaps; + for (vector<snapid_t>::iterator p = snaps.begin(); + p != snaps.end(); + ++p) { + if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) { + if (!filtering) { + // start building a new vector with what we've seen so far + dout(10) << "filter_snapc filtering " << snaps << dendl; + newsnaps.insert(newsnaps.begin(), snaps.begin(), p); + filtering = true; + } + dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl; + } else { + if (filtering) + newsnaps.push_back(*p); // continue building new vector + } + } + if (filtering) { + snaps.swap(newsnaps); + dout(10) << "filter_snapc result " << snaps << dendl; + } +} + +void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m) +{ + for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin(); + it != m.end(); + ++it) + requeue_ops(it->second); + m.clear(); +} + +void PG::requeue_op(OpRequestRef op) +{ + auto p = waiting_for_map.find(op->get_source()); + if (p != waiting_for_map.end()) { + dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")" + << dendl; + p->second.push_front(op); + } else { + dout(20) << __func__ << " " << op << dendl; + osd->enqueue_front( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)), + op->get_req()->get_cost(), + op->get_req()->get_priority(), + op->get_req()->get_recv_stamp(), + op->get_req()->get_source().num(), + get_osdmap_epoch())); + } +} + +void PG::requeue_ops(list<OpRequestRef> &ls) +{ + for (list<OpRequestRef>::reverse_iterator i = ls.rbegin(); + i != ls.rend(); + ++i) { + requeue_op(*i); + } + ls.clear(); +} + +void PG::requeue_map_waiters() +{ + epoch_t epoch = get_osdmap_epoch(); + auto p = waiting_for_map.begin(); + while (p != waiting_for_map.end()) { + if (epoch < p->second.front()->min_epoch) { + dout(20) << __func__ << " " << p->first << " front op " + << p->second.front() << " must still wait, doing nothing" + << dendl; + ++p; + } else { + dout(20) << __func__ << " " << p->first << " " << p->second << dendl; + for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) { + auto req = *q; + osd->enqueue_front(OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, req)), + req->get_req()->get_cost(), + req->get_req()->get_priority(), + req->get_req()->get_recv_stamp(), + req->get_req()->get_source().num(), + epoch)); + } + p = waiting_for_map.erase(p); + } + } +} + + +// ========================================================================================== +// SCRUB + +/* + * when holding pg and sched_scrub_lock, then the states are: + * scheduling: + * scrubber.local_reserved = true + * scrubber.active = false + * scrubber.reserved_peers includes whoami + * osd->scrubs_local++ + * scheduling, replica declined: + * scrubber.local_reserved = true + * scrubber.reserved_peers includes -1 
+ * osd->scrub_local++ + * pending: + * scrubber.local_reserved = true + * scrubber.active = false + * scrubber.reserved_peers.size() == acting.size(); + * pg on scrub_wq + * osd->scrub_local++ + * scrubbing: + * scrubber.local_reserved = true; + * scrubber.active = true + * scrubber.reserved_peers empty + */ + +// returns true if a scrub has been newly kicked off +bool PG::sched_scrub() +{ + ceph_assert(is_locked()); + ceph_assert(!is_scrubbing()); + if (!(is_primary() && is_active() && is_clean())) { + return false; + } + + // All processing the first time through commits us to whatever + // choices are made. + if (!scrubber.local_reserved) { + dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl; + + bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || + pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)); + bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || + pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)); + bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0); + bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair + && get_pgbackend()->auto_repair_supported()); + + scrubber.time_for_deep = false; + // Clear these in case user issues the scrub/repair command during + // the scheduling of the scrub/repair (e.g. request reservation) + scrubber.deep_scrub_on_error = false; + scrubber.auto_repair = false; + + // All periodic scrub handling goes here because must_scrub is + // always set for must_deep_scrub and must_repair. + if (!scrubber.must_scrub) { + ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair); + // Handle deep scrub determination only if allowed + if (allow_deep_scrub) { + // Initial entry and scheduled scrubs without nodeep_scrub set get here + if (scrubber.need_auto) { + dout(20) << __func__ << ": need repair after scrub errors" << dendl; + scrubber.time_for_deep = true; + } else { + double deep_scrub_interval = 0; + pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval); + if (deep_scrub_interval <= 0) { + deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; + } + scrubber.time_for_deep = ceph_clock_now() >= + info.history.last_deep_scrub_stamp + deep_scrub_interval; + + bool deep_coin_flip = false; + // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees + // we will deep scrub because this function is called often. + if (!scrubber.time_for_deep && allow_scrub) + deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100; + dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl; + + scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip); + } + + if (!scrubber.time_for_deep && has_deep_errors) { + osd->clog->info() << "osd." << osd->whoami + << " pg " << info.pgid + << " Deep scrub errors, upgrading scrub to deep-scrub"; + scrubber.time_for_deep = true; + } + + if (try_to_auto_repair) { + if (scrubber.time_for_deep) { + dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl; + scrubber.auto_repair = true; + } else if (allow_scrub) { + dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl; + scrubber.deep_scrub_on_error = true; + } + } + } else { // !allow_deep_scrub + dout(20) << __func__ << ": nodeep_scrub set" << dendl; + if (has_deep_errors) { + osd->clog->error() << "osd." 
<< osd->whoami + << " pg " << info.pgid + << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set"; + return false; + } + } + + //NOSCRUB so skip regular scrubs + if (!allow_scrub && !scrubber.time_for_deep) { + return false; + } + // scrubber.must_scrub + } else if (!scrubber.must_deep_scrub && has_deep_errors) { + osd->clog->error() << "osd." << osd->whoami + << " pg " << info.pgid + << " Regular scrub request, deep-scrub details will be lost"; + } + // Unless precluded this was handle above + scrubber.need_auto = false; + + ceph_assert(scrubber.reserved_peers.empty()); + bool allow_scrubing = cct->_conf->osd_scrub_during_recovery || + (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) || + !osd->is_recovery_active(); + if (allow_scrubing && + osd->inc_scrubs_local()) { + dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl; + scrubber.local_reserved = true; + scrubber.reserved_peers.insert(pg_whoami); + scrub_reserve_replicas(); + } else { + dout(20) << __func__ << ": failed to reserve locally" << dendl; + return false; + } + } + + if (scrubber.local_reserved) { + if (scrubber.reserve_failed) { + dout(20) << __func__ << ": failed, a peer declined" << dendl; + clear_scrub_reserved(); + scrub_unreserve_replicas(); + return false; + } else if (scrubber.reserved_peers.size() == actingset.size()) { + dout(20) << __func__ << ": success, reserved self and replicas" << dendl; + if (scrubber.time_for_deep) { + dout(10) << __func__ << ": scrub will be deep" << dendl; + state_set(PG_STATE_DEEP_SCRUB); + scrubber.time_for_deep = false; + } + queue_scrub(); + } else { + // none declined, since scrubber.reserved is set + dout(20) << __func__ << ": reserved " << scrubber.reserved_peers + << ", waiting for replicas" << dendl; + } + } + return true; +} + +bool PG::is_scrub_registered() +{ + return !scrubber.scrub_reg_stamp.is_zero(); +} + +void PG::reg_next_scrub() +{ + if (!is_primary()) + return; + + utime_t reg_stamp; + bool must = false; + if (scrubber.must_scrub || scrubber.need_auto) { + // Set the smallest time that isn't utime_t() + reg_stamp = Scrubber::scrub_must_stamp(); + must = true; + } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) { + reg_stamp = ceph_clock_now(); + must = true; + } else { + reg_stamp = info.history.last_scrub_stamp; + } + // note down the sched_time, so we can locate this scrub, and remove it + // later on. 
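  /* Sketch of the scheduling effect, assuming OSDService::reg_pg_scrub
   * queues (stamp, intervals) into the OSD-wide scrub schedule: a periodic
   * scrub uses last_scrub_stamp plus the pool/global min/max intervals to
   * decide when the PG becomes eligible and overdue, while a must/need_auto
   * scrub uses the smallest non-zero stamp so it sorts to the front. */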
+ double scrub_min_interval = 0, scrub_max_interval = 0; + pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval); + pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval); + ceph_assert(!is_scrub_registered()); + scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid, + reg_stamp, + scrub_min_interval, + scrub_max_interval, + must); + dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time " + << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl; +} + +void PG::unreg_next_scrub() +{ + if (is_scrub_registered()) { + osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp); + scrubber.scrub_reg_stamp = utime_t(); + } +} + +void PG::on_info_history_change() +{ + unreg_next_scrub(); + reg_next_scrub(); +} + +void PG::scrub_requested(bool deep, bool repair, bool need_auto) +{ + unreg_next_scrub(); + if (need_auto) { + scrubber.need_auto = true; + } else { + scrubber.must_scrub = true; + scrubber.must_deep_scrub = deep || repair; + scrubber.must_repair = repair; + // User might intervene, so clear this + scrubber.need_auto = false; + scrubber.req_scrub = true; + } + reg_next_scrub(); +} + +void PG::do_replica_scrub_map(OpRequestRef op) +{ + const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req()); + dout(7) << __func__ << " " << *m << dendl; + if (m->map_epoch < info.history.same_interval_since) { + dout(10) << __func__ << " discarding old from " + << m->map_epoch << " < " << info.history.same_interval_since + << dendl; + return; + } + if (!scrubber.is_chunky_scrub_active()) { + dout(10) << __func__ << " scrub isn't active" << dendl; + return; + } + + op->mark_started(); + + auto p = const_cast<bufferlist&>(m->get_data()).cbegin(); + scrubber.received_maps[m->from].decode(p, info.pgid.pool()); + dout(10) << "map version is " + << scrubber.received_maps[m->from].valid_through + << dendl; + + dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom + << dendl; + ceph_assert(scrubber.waiting_on_whom.count(m->from)); + scrubber.waiting_on_whom.erase(m->from); + if (m->preempted) { + dout(10) << __func__ << " replica was preempted, setting flag" << dendl; + scrub_preempted = true; + } + if (scrubber.waiting_on_whom.empty()) { + requeue_scrub(ops_blocked_by_scrub()); + } +} + +// send scrub v3 messages (chunky scrub) +void PG::_request_scrub_map( + pg_shard_t replica, eversion_t version, + hobject_t start, hobject_t end, + bool deep, + bool allow_preemption) +{ + ceph_assert(replica != pg_whoami); + dout(10) << "scrub requesting scrubmap from osd." << replica + << " deep " << (int)deep << dendl; + MOSDRepScrub *repscrubop = new MOSDRepScrub( + spg_t(info.pgid.pgid, replica.shard), version, + get_osdmap_epoch(), + get_last_peering_reset(), + start, end, deep, + allow_preemption, + scrubber.priority, + ops_blocked_by_scrub()); + // default priority, we want the rep scrub processed prior to any recovery + // or client io messages (we are holding a lock!) 
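  /* The replica answers this request with an MOSDRepScrubMap once it has
   * built its chunk map (BUILD_MAP_REPLICA below); do_replica_scrub_map()
   * above records the map and drops the sender from waiting_on_whom. */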
+ osd->send_message_osd_cluster( + replica.osd, repscrubop, get_osdmap_epoch()); +} + +void PG::handle_scrub_reserve_request(OpRequestRef op) +{ + dout(7) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + if (scrubber.local_reserved) { + dout(10) << __func__ << " ignoring reserve request: Already reserved" + << dendl; + return; + } + if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) && + osd->inc_scrubs_remote()) { + scrubber.remote_reserved = true; + } else { + dout(20) << __func__ << ": failed to reserve remotely" << dendl; + scrubber.remote_reserved = false; + } + const MOSDScrubReserve *m = + static_cast<const MOSDScrubReserve*>(op->get_req()); + Message *reply = new MOSDScrubReserve( + spg_t(info.pgid.pgid, primary.shard), + m->map_epoch, + scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, + pg_whoami); + osd->send_message_osd_cluster(reply, op->get_req()->get_connection()); +} + +void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) +{ + dout(7) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + if (!scrubber.local_reserved) { + dout(10) << "ignoring obsolete scrub reserve reply" << dendl; + return; + } + if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) { + dout(10) << " already had osd." << from << " reserved" << dendl; + } else { + dout(10) << " osd." << from << " scrub reserve = success" << dendl; + scrubber.reserved_peers.insert(from); + sched_scrub(); + } +} + +void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) +{ + dout(7) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + if (!scrubber.local_reserved) { + dout(10) << "ignoring obsolete scrub reserve reply" << dendl; + return; + } + if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) { + dout(10) << " already had osd." << from << " reserved" << dendl; + } else { + /* One decline stops this pg from being scheduled for scrubbing. */ + dout(10) << " osd." << from << " scrub reserve = fail" << dendl; + scrubber.reserve_failed = true; + sched_scrub(); + } +} + +void PG::handle_scrub_reserve_release(OpRequestRef op) +{ + dout(7) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + clear_scrub_reserved(); +} + +// We can zero the value of primary num_bytes as just an atomic. 
+// However, setting above zero reserves space for backfill and requires +// the OSDService::stat_lock which protects all OSD usage +void PG::set_reserved_num_bytes(int64_t primary, int64_t local) { + ceph_assert(osd->stat_lock.is_locked_by_me()); + primary_num_bytes.store(primary); + local_num_bytes.store(local); + return; +} + +void PG::clear_reserved_num_bytes() { + primary_num_bytes.store(0); + local_num_bytes.store(0); + return; +} + +void PG::reject_reservation() +{ + clear_reserved_num_bytes(); + osd->send_message_osd_cluster( + primary.osd, + new MBackfillReserve( + MBackfillReserve::REJECT_TOOFULL, + spg_t(info.pgid.pgid, primary.shard), + get_osdmap_epoch()), + get_osdmap_epoch()); +} + +void PG::schedule_backfill_retry(float delay) +{ + std::lock_guard lock(osd->recovery_request_lock); + osd->recovery_request_timer.add_event_after( + delay, + new QueuePeeringEvt<RequestBackfill>( + this, get_osdmap_epoch(), + RequestBackfill())); +} + +void PG::schedule_recovery_retry(float delay) +{ + std::lock_guard lock(osd->recovery_request_lock); + osd->recovery_request_timer.add_event_after( + delay, + new QueuePeeringEvt<DoRecovery>( + this, get_osdmap_epoch(), + DoRecovery())); +} + +void PG::clear_scrub_reserved() +{ + scrubber.reserved_peers.clear(); + scrubber.reserve_failed = false; + + if (scrubber.local_reserved) { + scrubber.local_reserved = false; + osd->dec_scrubs_local(); + } + if (scrubber.remote_reserved) { + scrubber.remote_reserved = false; + osd->dec_scrubs_remote(); + } +} + +void PG::scrub_reserve_replicas() +{ + ceph_assert(backfill_targets.empty()); + for (set<pg_shard_t>::iterator i = actingset.begin(); + i != actingset.end(); + ++i) { + if (*i == pg_whoami) continue; + dout(10) << "scrub requesting reserve from osd." << *i << dendl; + osd->send_message_osd_cluster( + i->osd, + new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), + get_osdmap_epoch(), + MOSDScrubReserve::REQUEST, pg_whoami), + get_osdmap_epoch()); + } +} + +void PG::scrub_unreserve_replicas() +{ + ceph_assert(backfill_targets.empty()); + for (set<pg_shard_t>::iterator i = actingset.begin(); + i != actingset.end(); + ++i) { + if (*i == pg_whoami) continue; + dout(10) << "scrub requesting unreserve from osd." << *i << dendl; + osd->send_message_osd_cluster( + i->osd, + new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), + get_osdmap_epoch(), + MOSDScrubReserve::RELEASE, pg_whoami), + get_osdmap_epoch()); + } +} + +void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs) +{ + ObjectStore::Transaction t; + eversion_t trimmed_to = last_rollback_info_trimmed_to_applied; + for (vector<ghobject_t>::const_iterator i = rollback_obs.begin(); + i != rollback_obs.end(); + ++i) { + if (i->generation < trimmed_to.version) { + dout(10) << __func__ << "osd." << osd->whoami + << " pg " << info.pgid + << " found obsolete rollback obj " + << *i << " generation < trimmed_to " + << trimmed_to + << "...repaired" << dendl; + t.remove(coll, *i); + } + } + if (!t.empty()) { + derr << __func__ << ": queueing trans to clean up obsolete rollback objs" + << dendl; + osd->store->queue_transaction(ch, std::move(t), NULL); + } +} + +void PG::_scan_snaps(ScrubMap &smap) +{ + hobject_t head; + SnapSet snapset; + + // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify + // caller using clean_meta_map(), and it works properly. 
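  /* Example of the repair performed below, under the assumption that the
   * head's SnapSet is authoritative: if a clone's clone_snaps say {3,4}
   * but snap_mapper.get_snaps() returns only {3}, the stale mapper entry
   * is removed, re-added as {3,4}, and the repair is reported to the
   * cluster log. */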
+ dout(20) << __func__ << " start" << dendl; + + for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin(); + i != smap.objects.rend(); + ++i) { + const hobject_t &hoid = i->first; + ScrubMap::object &o = i->second; + + dout(20) << __func__ << " " << hoid << dendl; + + ceph_assert(!hoid.is_snapdir()); + if (hoid.is_head()) { + // parse the SnapSet + bufferlist bl; + if (o.attrs.find(SS_ATTR) == o.attrs.end()) { + continue; + } + bl.push_back(o.attrs[SS_ATTR]); + auto p = bl.cbegin(); + try { + decode(snapset, p); + } catch(...) { + continue; + } + head = hoid.get_head(); + continue; + } + if (hoid.snap < CEPH_MAXSNAP) { + // check and if necessary fix snap_mapper + if (hoid.get_head() != head) { + derr << __func__ << " no head for " << hoid << " (have " << head << ")" + << dendl; + continue; + } + set<snapid_t> obj_snaps; + auto p = snapset.clone_snaps.find(hoid.snap); + if (p == snapset.clone_snaps.end()) { + derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset + << dendl; + continue; + } + obj_snaps.insert(p->second.begin(), p->second.end()); + set<snapid_t> cur_snaps; + int r = snap_mapper.get_snaps(hoid, &cur_snaps); + if (r != 0 && r != -ENOENT) { + derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + if (r == -ENOENT || cur_snaps != obj_snaps) { + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); + if (r == 0) { + r = snap_mapper.remove_oid(hoid, &_t); + if (r != 0) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) + << dendl; + ceph_abort(); + } + osd->clog->error() << "osd." << osd->whoami + << " found snap mapper error on pg " + << info.pgid + << " oid " << hoid << " snaps in mapper: " + << cur_snaps << ", oi: " + << obj_snaps + << "...repaired"; + } else { + osd->clog->error() << "osd." << osd->whoami + << " found snap mapper error on pg " + << info.pgid + << " oid " << hoid << " snaps missing in mapper" + << ", should be: " + << obj_snaps + << " was " << cur_snaps << " r " << r + << "...repaired"; + } + snap_mapper.add_oid(hoid, obj_snaps, &_t); + + // wait for repair to apply to avoid confusing other bits of the system. + { + Cond my_cond; + Mutex my_lock("PG::_scan_snaps my_lock"); + int r = 0; + bool done; + t.register_on_applied_sync( + new C_SafeCond(&my_lock, &my_cond, &done, &r)); + r = osd->store->queue_transaction(ch, std::move(t)); + if (r != 0) { + derr << __func__ << ": queue_transaction got " << cpp_strerror(r) + << dendl; + } else { + my_lock.Lock(); + while (!done) + my_cond.Wait(my_lock); + my_lock.Unlock(); + } + } + } + } + } +} + +void PG::_repair_oinfo_oid(ScrubMap &smap) +{ + for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin(); + i != smap.objects.rend(); + ++i) { + const hobject_t &hoid = i->first; + ScrubMap::object &o = i->second; + + bufferlist bl; + if (o.attrs.find(OI_ATTR) == o.attrs.end()) { + continue; + } + bl.push_back(o.attrs[OI_ATTR]); + object_info_t oi; + try { + oi.decode(bl); + } catch(...) { + continue; + } + if (oi.soid != hoid) { + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); + osd->clog->error() << "osd." 
<< osd->whoami + << " found object info error on pg " + << info.pgid + << " oid " << hoid << " oid in object info: " + << oi.soid + << "...repaired"; + // Fix object info + oi.soid = hoid; + bl.clear(); + encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + bufferptr bp(bl.c_str(), bl.length()); + o.attrs[OI_ATTR] = bp; + + t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl); + int r = osd->store->queue_transaction(ch, std::move(t)); + if (r != 0) { + derr << __func__ << ": queue_transaction got " << cpp_strerror(r) + << dendl; + } + } + } +} +int PG::build_scrub_map_chunk( + ScrubMap &map, + ScrubMapBuilder &pos, + hobject_t start, + hobject_t end, + bool deep, + ThreadPool::TPHandle &handle) +{ + dout(10) << __func__ << " [" << start << "," << end << ") " + << " pos " << pos + << dendl; + + // start + while (pos.empty()) { + pos.deep = deep; + map.valid_through = info.last_update; + + // objects + vector<ghobject_t> rollback_obs; + pos.ret = get_pgbackend()->objects_list_range( + start, + end, + &pos.ls, + &rollback_obs); + if (pos.ret < 0) { + dout(5) << "objects_list_range error: " << pos.ret << dendl; + return pos.ret; + } + if (pos.ls.empty()) { + break; + } + _scan_rollback_obs(rollback_obs); + pos.pos = 0; + return -EINPROGRESS; + } + + // scan objects + while (!pos.done()) { + int r = get_pgbackend()->be_scan_list(map, pos); + if (r == -EINPROGRESS) { + return r; + } + } + + // finish + dout(20) << __func__ << " finishing" << dendl; + ceph_assert(pos.done()); + _repair_oinfo_oid(map); + if (!is_primary()) { + ScrubMap for_meta_scrub; + // In case we restarted smaller chunk, clear old data + scrubber.cleaned_meta_map.clear_from(scrubber.start); + scrubber.cleaned_meta_map.insert(map); + scrubber.clean_meta_map(for_meta_scrub); + _scan_snaps(for_meta_scrub); + } + + dout(20) << __func__ << " done, got " << map.objects.size() << " items" + << dendl; + return 0; +} + +void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) { + if (!store) + return; + struct OnComplete : Context { + std::unique_ptr<Scrub::Store> store; + explicit OnComplete( + std::unique_ptr<Scrub::Store> &&store) + : store(std::move(store)) {} + void finish(int) override {} + }; + store->cleanup(t); + t->register_on_complete(new OnComplete(std::move(store))); + ceph_assert(!store); +} + +void PG::repair_object( + const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers, + pg_shard_t bad_peer) +{ + list<pg_shard_t> op_shards; + for (auto i : *ok_peers) { + op_shards.push_back(i.second); + } + dout(10) << "repair_object " << soid << " bad_peer osd." + << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl; + ScrubMap::object &po = ok_peers->back().first; + eversion_t v; + bufferlist bv; + bv.push_back(po.attrs[OI_ATTR]); + object_info_t oi; + try { + auto bliter = bv.cbegin(); + decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl; + ceph_abort(); + } + if (bad_peer != primary) { + peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false); + } else { + // We should only be scrubbing if the PG is clean. + ceph_assert(waiting_for_unreadable_object.empty()); + + pg_log.missing_add(soid, oi.version, eversion_t()); + + pg_log.set_last_requested(0); + dout(10) << __func__ << ": primary = " << primary << dendl; + } + + if (is_ec_pg() || bad_peer == primary) { + // we'd better collect all shard for EC pg, and prepare good peers as the + // source of pull in the case of replicated pg. 
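  /* Sketch of the reasoning here: EC shards on different OSDs are not
   * interchangeable, and a primary holding the bad copy of a replicated
   * object cannot be its own recovery source, so in both cases the object
   * is added to missing_loc with every ok peer listed as a candidate
   * location for recovery to pull from. */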
+ missing_loc.add_missing(soid, oi.version, eversion_t()); + list<pair<ScrubMap::object, pg_shard_t> >::iterator i; + for (i = ok_peers->begin(); + i != ok_peers->end(); + ++i) + missing_loc.add_location(soid, i->second); + } +} + +/* replica_scrub + * + * Wait for last_update_applied to match msg->scrub_to as above. Wait + * for pushes to complete in case of recent recovery. Build a single + * scrubmap of objects that are in the range [msg->start, msg->end). + */ +void PG::replica_scrub( + OpRequestRef op, + ThreadPool::TPHandle &handle) +{ + const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req()); + ceph_assert(!scrubber.active_rep_scrub); + dout(7) << "replica_scrub" << dendl; + + if (msg->map_epoch < info.history.same_interval_since) { + dout(10) << "replica_scrub discarding old replica_scrub from " + << msg->map_epoch << " < " << info.history.same_interval_since + << dendl; + return; + } + + ceph_assert(msg->chunky); + if (active_pushes > 0) { + dout(10) << "waiting for active pushes to finish" << dendl; + scrubber.active_rep_scrub = op; + return; + } + + scrubber.state = Scrubber::BUILD_MAP_REPLICA; + scrubber.replica_scrub_start = msg->min_epoch; + scrubber.start = msg->start; + scrubber.end = msg->end; + scrubber.max_end = msg->end; + scrubber.deep = msg->deep; + scrubber.epoch_start = info.history.same_interval_since; + if (msg->priority) { + scrubber.priority = msg->priority; + } else { + scrubber.priority = get_scrub_priority(); + } + + scrub_can_preempt = msg->allow_preemption; + scrub_preempted = false; + scrubber.replica_scrubmap_pos.reset(); + + requeue_scrub(msg->high_priority); +} + +/* Scrub: + * PG_STATE_SCRUBBING is set when the scrub is queued + * + * scrub will be chunky if all OSDs in PG support chunky scrub + * scrub will fail if OSDs are too old. 
+ */ +void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle) +{ + if (cct->_conf->osd_scrub_sleep > 0 && + (scrubber.state == PG::Scrubber::NEW_CHUNK || + scrubber.state == PG::Scrubber::INACTIVE) && + scrubber.needs_sleep) { + ceph_assert(!scrubber.sleeping); + dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl; + + // Do an async sleep so we don't block the op queue + OSDService *osds = osd; + spg_t pgid = get_pgid(); + int state = scrubber.state; + auto scrub_requeue_callback = + new FunctionContext([osds, pgid, state](int r) { + PGRef pg = osds->osd->lookup_lock_pg(pgid); + if (pg == nullptr) { + lgeneric_dout(osds->osd->cct, 20) + << "scrub_requeue_callback: Could not find " + << "PG " << pgid << " can't complete scrub requeue after sleep" + << dendl; + return; + } + pg->scrubber.sleeping = false; + pg->scrubber.needs_sleep = false; + lgeneric_dout(pg->cct, 20) + << "scrub_requeue_callback: slept for " + << ceph_clock_now() - pg->scrubber.sleep_start + << ", re-queuing scrub with state " << state << dendl; + pg->scrub_queued = false; + pg->requeue_scrub(); + pg->scrubber.sleep_start = utime_t(); + pg->unlock(); + }); + std::lock_guard l(osd->sleep_lock); + osd->sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep, + scrub_requeue_callback); + scrubber.sleeping = true; + scrubber.sleep_start = ceph_clock_now(); + return; + } + if (pg_has_reset_since(queued)) { + return; + } + ceph_assert(scrub_queued); + scrub_queued = false; + scrubber.needs_sleep = true; + + // for the replica + if (!is_primary() && + scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) { + chunky_scrub(handle); + return; + } + + if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) { + dout(10) << "scrub -- not primary or active or not clean" << dendl; + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_REPAIR); + state_clear(PG_STATE_DEEP_SCRUB); + publish_stats_to_osd(); + return; + } + + if (!scrubber.active) { + ceph_assert(backfill_targets.empty()); + + scrubber.deep = state_test(PG_STATE_DEEP_SCRUB); + + dout(10) << "starting a new chunky scrub" << dendl; + } + + chunky_scrub(handle); +} + +void PG::abort_scrub() +{ + scrub_clear_state(); + scrub_unreserve_replicas(); +} + +/* + * Chunky scrub scrubs objects one chunk at a time with writes blocked for that + * chunk. + * + * The object store is partitioned into chunks which end on hash boundaries. 
For + * each chunk, the following logic is performed: + * + * (1) Block writes on the chunk + * (2) Request maps from replicas + * (3) Wait for pushes to be applied (after recovery) + * (4) Wait for writes to flush on the chunk + * (5) Wait for maps from replicas + * (6) Compare / repair all scrub maps + * (7) Wait for digest updates to apply + * + * This logic is encoded in the mostly linear state machine: + * + * +------------------+ + * _________v__________ | + * | | | + * | INACTIVE | | + * |____________________| | + * | | + * | +----------+ | + * _________v___v______ | | + * | | | | + * | NEW_CHUNK | | | + * |____________________| | | + * | | | + * _________v__________ | | + * | | | | + * | WAIT_PUSHES | | | + * |____________________| | | + * | | | + * _________v__________ | | + * | | | | + * | WAIT_LAST_UPDATE | | | + * |____________________| | | + * | | | + * _________v__________ | | + * | | | | + * | BUILD_MAP | | | + * |____________________| | | + * | | | + * _________v__________ | | + * | | | | + * | WAIT_REPLICAS | | | + * |____________________| | | + * | | | + * _________v__________ | | + * | | | | + * | COMPARE_MAPS | | | + * |____________________| | | + * | | | + * | | | + * _________v__________ | | + * | | | | + * |WAIT_DIGEST_UPDATES | | | + * |____________________| | | + * | | | | + * | +----------+ | + * _________v__________ | + * | | | + * | FINISH | | + * |____________________| | + * | | + * +------------------+ + * + * The primary determines the last update from the subset by walking the log. If + * it sees a log entry pertaining to a file in the chunk, it tells the replicas + * to wait until that update is applied before building a scrub map. Both the + * primary and replicas will wait for any active pushes to be applied. + * + * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq. + * + * scrubber.state encodes the current state of the scrub (refer to state diagram + * for details). + */ +void PG::chunky_scrub(ThreadPool::TPHandle &handle) +{ + // check for map changes + if (scrubber.is_chunky_scrub_active()) { + if (scrubber.epoch_start != info.history.same_interval_since) { + dout(10) << "scrub pg changed, aborting" << dendl; + abort_scrub(); + return; + } + } + + bool done = false; + int ret; + + while (!done) { + dout(20) << "scrub state " << Scrubber::state_string(scrubber.state) + << " [" << scrubber.start << "," << scrubber.end << ")" + << " max_end " << scrubber.max_end << dendl; + + switch (scrubber.state) { + case PG::Scrubber::INACTIVE: + dout(10) << "scrub start" << dendl; + ceph_assert(is_primary()); + + publish_stats_to_osd(); + scrubber.epoch_start = info.history.same_interval_since; + scrubber.active = true; + + { + ObjectStore::Transaction t; + scrubber.cleanup_store(&t); + scrubber.store.reset(Scrub::Store::create(osd->store, &t, + info.pgid, coll)); + osd->store->queue_transaction(ch, std::move(t), nullptr); + } + + // Don't include temporary objects when scrubbing + scrubber.start = info.pgid.pgid.get_hobj_start(); + scrubber.state = PG::Scrubber::NEW_CHUNK; + + { + bool repair = state_test(PG_STATE_REPAIR); + bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); + const char *mode = (repair ? "repair": (deep_scrub ? 
"deep-scrub" : "scrub")); + stringstream oss; + oss << info.pgid.pgid << " " << mode << " starts" << std::endl; + osd->clog->debug(oss); + } + + scrubber.preempt_left = cct->_conf.get_val<uint64_t>( + "osd_scrub_max_preemptions"); + scrubber.preempt_divisor = 1; + break; + + case PG::Scrubber::NEW_CHUNK: + scrubber.primary_scrubmap = ScrubMap(); + scrubber.received_maps.clear(); + + // begin (possible) preemption window + if (scrub_preempted) { + scrubber.preempt_left--; + scrubber.preempt_divisor *= 2; + dout(10) << __func__ << " preempted, " << scrubber.preempt_left + << " left" << dendl; + scrub_preempted = false; + } + scrub_can_preempt = scrubber.preempt_left > 0; + + { + /* get the start and end of our scrub chunk + * + * Our scrub chunk has an important restriction we're going to need to + * respect. We can't let head be start or end. + * Using a half-open interval means that if end == head, + * we'd scrub/lock head and the clone right next to head in different + * chunks which would allow us to miss clones created between + * scrubbing that chunk and scrubbing the chunk including head. + * This isn't true for any of the other clones since clones can + * only be created "just to the left of" head. There is one exception + * to this: promotion of clones which always happens to the left of the + * left-most clone, but promote_object checks the scrubber in that + * case, so it should be ok. Also, it's ok to "miss" clones at the + * left end of the range if we are a tier because they may legitimately + * not exist (see _scrub). + */ + ceph_assert(scrubber.preempt_divisor > 0); + int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min / + scrubber.preempt_divisor); + int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max / + scrubber.preempt_divisor); + hobject_t start = scrubber.start; + hobject_t candidate_end; + vector<hobject_t> objects; + ret = get_pgbackend()->objects_list_partial( + start, + min, + max, + &objects, + &candidate_end); + ceph_assert(ret >= 0); + + if (!objects.empty()) { + hobject_t back = objects.back(); + while (candidate_end.is_head() && + candidate_end == back.get_head()) { + candidate_end = back; + objects.pop_back(); + if (objects.empty()) { + ceph_assert(0 == + "Somehow we got more than 2 objects which" + "have the same head but are not clones"); + } + back = objects.back(); + } + if (candidate_end.is_head()) { + ceph_assert(candidate_end != back.get_head()); + candidate_end = candidate_end.get_object_boundary(); + } + } else { + ceph_assert(candidate_end.is_max()); + } + + if (!_range_available_for_scrub(scrubber.start, candidate_end)) { + // we'll be requeued by whatever made us unavailable for scrub + dout(10) << __func__ << ": scrub blocked somewhere in range " + << "[" << scrubber.start << ", " << candidate_end << ")" + << dendl; + done = true; + break; + } + scrubber.end = candidate_end; + if (scrubber.end > scrubber.max_end) + scrubber.max_end = scrubber.end; + } + + // walk the log to find the latest update that affects our chunk + scrubber.subset_last_update = eversion_t(); + for (auto p = projected_log.log.rbegin(); + p != projected_log.log.rend(); + ++p) { + if (p->soid >= scrubber.start && + p->soid < scrubber.end) { + scrubber.subset_last_update = p->version; + break; + } + } + if (scrubber.subset_last_update == eversion_t()) { + for (list<pg_log_entry_t>::const_reverse_iterator p = + pg_log.get_log().log.rbegin(); + p != pg_log.get_log().log.rend(); + ++p) { + if (p->soid >= scrubber.start && + p->soid < scrubber.end) { + 
scrubber.subset_last_update = p->version; + break; + } + } + } + + scrubber.state = PG::Scrubber::WAIT_PUSHES; + break; + + case PG::Scrubber::WAIT_PUSHES: + if (active_pushes == 0) { + scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE; + } else { + dout(15) << "wait for pushes to apply" << dendl; + done = true; + } + break; + + case PG::Scrubber::WAIT_LAST_UPDATE: + if (last_update_applied < scrubber.subset_last_update) { + // will be requeued by op_applied + dout(15) << "wait for EC read/modify/writes to queue" << dendl; + done = true; + break; + } + + // ask replicas to scan + scrubber.waiting_on_whom.insert(pg_whoami); + + // request maps from replicas + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == pg_whoami) continue; + _request_scrub_map(*i, scrubber.subset_last_update, + scrubber.start, scrubber.end, scrubber.deep, + scrubber.preempt_left > 0); + scrubber.waiting_on_whom.insert(*i); + } + dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom + << dendl; + + scrubber.state = PG::Scrubber::BUILD_MAP; + scrubber.primary_scrubmap_pos.reset(); + break; + + case PG::Scrubber::BUILD_MAP: + ceph_assert(last_update_applied >= scrubber.subset_last_update); + + // build my own scrub map + if (scrub_preempted) { + dout(10) << __func__ << " preempted" << dendl; + scrubber.state = PG::Scrubber::BUILD_MAP_DONE; + break; + } + ret = build_scrub_map_chunk( + scrubber.primary_scrubmap, + scrubber.primary_scrubmap_pos, + scrubber.start, scrubber.end, + scrubber.deep, + handle); + if (ret == -EINPROGRESS) { + requeue_scrub(); + done = true; + break; + } + scrubber.state = PG::Scrubber::BUILD_MAP_DONE; + break; + + case PG::Scrubber::BUILD_MAP_DONE: + if (scrubber.primary_scrubmap_pos.ret < 0) { + dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret + << ", aborting" << dendl; + scrub_clear_state(); + scrub_unreserve_replicas(); + return; + } + dout(10) << __func__ << " waiting_on_whom was " + << scrubber.waiting_on_whom << dendl; + ceph_assert(scrubber.waiting_on_whom.count(pg_whoami)); + scrubber.waiting_on_whom.erase(pg_whoami); + + scrubber.state = PG::Scrubber::WAIT_REPLICAS; + break; + + case PG::Scrubber::WAIT_REPLICAS: + if (!scrubber.waiting_on_whom.empty()) { + // will be requeued by sub_op_scrub_map + dout(10) << "wait for replicas to build scrub map" << dendl; + done = true; + break; + } + // Since repair is only by request and we need to scrub afterward + // treat the same as req_scrub. 
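  /* If noscrub/nodeep-scrub was set after this scrub was scheduled, a
   * periodic scrub bails out here at the chunk boundary; an explicitly
   * requested scrub or repair (req_scrub) keeps going despite the flags. */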
+ if (!scrubber.req_scrub) { + if (state_test(PG_STATE_DEEP_SCRUB)) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || + pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) { + dout(10) << "nodeep_scrub set, aborting" << dendl; + abort_scrub(); + return; + } + } else if (state_test(PG_STATE_SCRUBBING)) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { + dout(10) << "noscrub set, aborting" << dendl; + abort_scrub(); + return; + } + } + } + // end (possible) preemption window + scrub_can_preempt = false; + if (scrub_preempted) { + dout(10) << __func__ << " preempted, restarting chunk" << dendl; + scrubber.state = PG::Scrubber::NEW_CHUNK; + } else { + scrubber.state = PG::Scrubber::COMPARE_MAPS; + } + break; + + case PG::Scrubber::COMPARE_MAPS: + ceph_assert(last_update_applied >= scrubber.subset_last_update); + ceph_assert(scrubber.waiting_on_whom.empty()); + + scrub_compare_maps(); + scrubber.start = scrubber.end; + scrubber.run_callbacks(); + + // requeue the writes from the chunk that just finished + requeue_ops(waiting_for_scrub); + + scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES; + + // fall-thru + + case PG::Scrubber::WAIT_DIGEST_UPDATES: + if (scrubber.num_digest_updates_pending) { + dout(10) << __func__ << " waiting on " + << scrubber.num_digest_updates_pending + << " digest updates" << dendl; + done = true; + break; + } + + scrubber.preempt_left = cct->_conf.get_val<uint64_t>( + "osd_scrub_max_preemptions"); + scrubber.preempt_divisor = 1; + + if (!(scrubber.end.is_max())) { + scrubber.state = PG::Scrubber::NEW_CHUNK; + requeue_scrub(); + done = true; + } else { + scrubber.state = PG::Scrubber::FINISH; + } + + break; + + case PG::Scrubber::FINISH: + scrub_finish(); + scrubber.state = PG::Scrubber::INACTIVE; + done = true; + + if (!snap_trimq.empty()) { + dout(10) << "scrub finished, requeuing snap_trimmer" << dendl; + snap_trimmer_scrub_complete(); + } + + break; + + case PG::Scrubber::BUILD_MAP_REPLICA: + // build my own scrub map + if (scrub_preempted) { + dout(10) << __func__ << " preempted" << dendl; + ret = 0; + } else { + ret = build_scrub_map_chunk( + scrubber.replica_scrubmap, + scrubber.replica_scrubmap_pos, + scrubber.start, scrubber.end, + scrubber.deep, + handle); + } + if (ret == -EINPROGRESS) { + requeue_scrub(); + done = true; + break; + } + // reply + { + MOSDRepScrubMap *reply = new MOSDRepScrubMap( + spg_t(info.pgid.pgid, get_primary().shard), + scrubber.replica_scrub_start, + pg_whoami); + reply->preempted = scrub_preempted; + ::encode(scrubber.replica_scrubmap, reply->get_data()); + osd->send_message_osd_cluster( + get_primary().osd, reply, + scrubber.replica_scrub_start); + } + scrub_preempted = false; + scrub_can_preempt = false; + scrubber.state = PG::Scrubber::INACTIVE; + scrubber.replica_scrubmap = ScrubMap(); + scrubber.replica_scrubmap_pos = ScrubMapBuilder(); + scrubber.start = hobject_t(); + scrubber.end = hobject_t(); + scrubber.max_end = hobject_t(); + done = true; + break; + + default: + ceph_abort(); + } + } + dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state) + << " [" << scrubber.start << "," << scrubber.end << ")" + << " max_end " << scrubber.max_end << dendl; +} + +bool PG::write_blocked_by_scrub(const hobject_t& soid) +{ + if (soid < scrubber.start || soid >= scrubber.end) { + return false; + } + if (scrub_can_preempt) { + if (!scrub_preempted) { + dout(10) << __func__ << " " << soid << " preempted" << dendl; + scrub_preempted = true; + } else { + dout(10) 
<< __func__ << " " << soid << " already preempted" << dendl; + } + return false; + } + return true; +} + +bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end) +{ + // does [start, end] intersect [scrubber.start, scrubber.max_end) + return (start < scrubber.max_end && + end >= scrubber.start); +} + +void PG::scrub_clear_state(bool has_error) +{ + ceph_assert(is_locked()); + state_clear(PG_STATE_SCRUBBING); + if (!has_error) + state_clear(PG_STATE_REPAIR); + state_clear(PG_STATE_DEEP_SCRUB); + publish_stats_to_osd(); + + scrubber.req_scrub = false; + // local -> nothing. + if (scrubber.local_reserved) { + osd->dec_scrubs_local(); + scrubber.local_reserved = false; + scrubber.reserved_peers.clear(); + } + + requeue_ops(waiting_for_scrub); + + scrubber.reset(); + + // type-specific state clear + _scrub_clear_state(); +} + +void PG::scrub_compare_maps() +{ + dout(10) << __func__ << " has maps, analyzing" << dendl; + + // construct authoritative scrub map for type specific scrubbing + scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap); + map<hobject_t, + pair<boost::optional<uint32_t>, + boost::optional<uint32_t>>> missing_digest; + + map<pg_shard_t, ScrubMap *> maps; + maps[pg_whoami] = &scrubber.primary_scrubmap; + + for (const auto& i : acting_recovery_backfill) { + if (i == pg_whoami) continue; + dout(2) << __func__ << " replica " << i << " has " + << scrubber.received_maps[i].objects.size() + << " items" << dendl; + maps[i] = &scrubber.received_maps[i]; + } + + set<hobject_t> master_set; + + // Construct master set + for (const auto map : maps) { + for (const auto i : map.second->objects) { + master_set.insert(i.first); + } + } + + stringstream ss; + get_pgbackend()->be_omap_checks(maps, master_set, + scrubber.omap_stats, ss); + + if (!ss.str().empty()) { + osd->clog->warn(ss); + } + + if (acting.size() > 1) { + dout(10) << __func__ << " comparing replica scrub maps" << dendl; + + // Map from object with errors to good peer + map<hobject_t, list<pg_shard_t>> authoritative; + + dout(2) << __func__ << " osd." 
<< acting[0] << " has " + << scrubber.primary_scrubmap.objects.size() << " items" << dendl; + + ss.str(""); + ss.clear(); + + get_pgbackend()->be_compare_scrubmaps( + maps, + master_set, + state_test(PG_STATE_REPAIR), + scrubber.missing, + scrubber.inconsistent, + authoritative, + missing_digest, + scrubber.shallow_errors, + scrubber.deep_errors, + scrubber.store.get(), + info.pgid, acting, + ss); + dout(2) << ss.str() << dendl; + + if (!ss.str().empty()) { + osd->clog->error(ss); + } + + for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin(); + i != authoritative.end(); + ++i) { + list<pair<ScrubMap::object, pg_shard_t> > good_peers; + for (list<pg_shard_t>::const_iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j)); + } + scrubber.authoritative.insert( + make_pair( + i->first, + good_peers)); + } + + for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin(); + i != authoritative.end(); + ++i) { + scrubber.cleaned_meta_map.objects.erase(i->first); + scrubber.cleaned_meta_map.objects.insert( + *(maps[i->second.back()]->objects.find(i->first)) + ); + } + } + + ScrubMap for_meta_scrub; + scrubber.clean_meta_map(for_meta_scrub); + + // ok, do the pg-type specific scrubbing + scrub_snapshot_metadata(for_meta_scrub, missing_digest); + // Called here on the primary can use an authoritative map if it isn't the primary + _scan_snaps(for_meta_scrub); + if (!scrubber.store->empty()) { + if (state_test(PG_STATE_REPAIR)) { + dout(10) << __func__ << ": discarding scrub results" << dendl; + scrubber.store->flush(nullptr); + } else { + dout(10) << __func__ << ": updating scrub object" << dendl; + ObjectStore::Transaction t; + scrubber.store->flush(&t); + osd->store->queue_transaction(ch, std::move(t), nullptr); + } + } +} + +bool PG::scrub_process_inconsistent() +{ + dout(10) << __func__ << ": checking authoritative" << dendl; + bool repair = state_test(PG_STATE_REPAIR); + bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); + const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub")); + + // authoriative only store objects which missing or inconsistent. 
+ if (!scrubber.authoritative.empty()) { + stringstream ss; + ss << info.pgid << " " << mode << " " + << scrubber.missing.size() << " missing, " + << scrubber.inconsistent.size() << " inconsistent objects"; + dout(2) << ss.str() << dendl; + osd->clog->error(ss); + if (repair) { + state_clear(PG_STATE_CLEAN); + for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i = + scrubber.authoritative.begin(); + i != scrubber.authoritative.end(); + ++i) { + set<pg_shard_t>::iterator j; + + auto missing_entry = scrubber.missing.find(i->first); + if (missing_entry != scrubber.missing.end()) { + for (j = missing_entry->second.begin(); + j != missing_entry->second.end(); + ++j) { + repair_object( + i->first, + &(i->second), + *j); + ++scrubber.fixed; + } + } + if (scrubber.inconsistent.count(i->first)) { + for (j = scrubber.inconsistent[i->first].begin(); + j != scrubber.inconsistent[i->first].end(); + ++j) { + repair_object(i->first, + &(i->second), + *j); + ++scrubber.fixed; + } + } + } + } + } + return (!scrubber.authoritative.empty() && repair); +} + +bool PG::ops_blocked_by_scrub() const { + return (waiting_for_scrub.size() != 0); +} + +// the part that actually finalizes a scrub +void PG::scrub_finish() +{ + dout(20) << __func__ << dendl; + bool repair = state_test(PG_STATE_REPAIR); + bool do_auto_scrub = false; + // if the repair request comes from auto-repair and large number of errors, + // we would like to cancel auto-repair + if (repair && scrubber.auto_repair + && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) { + state_clear(PG_STATE_REPAIR); + repair = false; + } + bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); + const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub")); + + // if a regular scrub had errors within the limit, do a deep scrub to auto repair. + if (scrubber.deep_scrub_on_error + && scrubber.authoritative.size() + && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) { + ceph_assert(!deep_scrub); + do_auto_scrub = true; + dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl; + } + scrubber.deep_scrub_on_error = false; + + // type-specific finish (can tally more errors) + _scrub_finish(); + + bool has_error = scrub_process_inconsistent(); + + { + stringstream oss; + oss << info.pgid.pgid << " " << mode << " "; + int total_errors = scrubber.shallow_errors + scrubber.deep_errors; + if (total_errors) + oss << total_errors << " errors"; + else + oss << "ok"; + if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) + oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors + << " remaining deep scrub error details lost)"; + if (repair) + oss << ", " << scrubber.fixed << " fixed"; + if (total_errors) + osd->clog->error(oss); + else + osd->clog->debug(oss); + } + + // finish up + unreg_next_scrub(); + utime_t now = ceph_clock_now(); + info.history.last_scrub = info.last_update; + info.history.last_scrub_stamp = now; + if (scrubber.deep) { + info.history.last_deep_scrub = info.last_update; + info.history.last_deep_scrub_stamp = now; + } + // Since we don't know which errors were fixed, we can only clear them + // when every one has been fixed. 
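+  // Three outcomes when repairing: every counted error was fixed (clear the
+  // error counters), some errors were handled so a deep scrub is scheduled
+  // after recovery to re-count them, or errors remain that repair cannot
+  // touch and the PG is flagged PG_STATE_FAILED_REPAIR.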
+ if (repair) { + if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) { + ceph_assert(deep_scrub); + scrubber.shallow_errors = scrubber.deep_errors = 0; + dout(20) << __func__ << " All may be fixed" << dendl; + } else if (has_error) { + // Deep scrub in order to get corrected error counts + scrub_after_recovery = true; + save_req_scrub = scrubber.req_scrub; + dout(20) << __func__ << " Set scrub_after_recovery, req_scrub=" << save_req_scrub << dendl; + } else if (scrubber.shallow_errors || scrubber.deep_errors) { + // We have errors but nothing can be fixed, so there is no repair + // possible. + state_set(PG_STATE_FAILED_REPAIR); + dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors) + << " error(s) present with no repair possible" << dendl; + } + } + if (deep_scrub) { + if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0)) + info.history.last_clean_scrub_stamp = now; + info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors; + info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors; + info.stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects; + info.stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes; + info.stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys; + dout(25) << __func__ << " shard " << pg_whoami << " num_omap_bytes = " + << info.stats.stats.sum.num_omap_bytes << " num_omap_keys = " + << info.stats.stats.sum.num_omap_keys << dendl; + } else { + info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors; + // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent + // because of deep-scrub errors + if (scrubber.shallow_errors == 0) + info.history.last_clean_scrub_stamp = now; + } + info.stats.stats.sum.num_scrub_errors = + info.stats.stats.sum.num_shallow_scrub_errors + + info.stats.stats.sum.num_deep_scrub_errors; + if (scrubber.check_repair) { + scrubber.check_repair = false; + if (info.stats.stats.sum.num_scrub_errors) { + state_set(PG_STATE_FAILED_REPAIR); + dout(10) << __func__ << " " << info.stats.stats.sum.num_scrub_errors + << " error(s) still present after re-scrub" << dendl; + } + } + publish_stats_to_osd(); + + { + ObjectStore::Transaction t; + dirty_info = true; + write_if_dirty(t); + int tr = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(tr == 0); + } + + + if (has_error) { + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + DoRecovery()))); + } + + scrub_clear_state(has_error); + scrub_unreserve_replicas(); + + if (do_auto_scrub) { + scrub_requested(false, false, true); + } else { + reg_next_scrub(); + } + + if (is_active() && is_primary()) { + share_pg_info(); + } +} + +void PG::share_pg_info() +{ + dout(10) << "share_pg_info" << dendl; + + // share new pg_info_t with replicas + ceph_assert(!acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == pg_whoami) continue; + auto pg_shard = *i; + auto peer = peer_info.find(pg_shard); + if (peer != peer_info.end()) { + peer->second.last_epoch_started = info.last_epoch_started; + peer->second.last_interval_started = info.last_interval_started; + peer->second.history.merge(info.history); + } + MOSDPGInfo *m = new MOSDPGInfo(get_osdmap_epoch()); + m->pg_list.push_back( + make_pair( + pg_notify_t( + pg_shard.shard, pg_whoami.shard, + get_osdmap_epoch(), + 
get_osdmap_epoch(), + info), + past_intervals)); + osd->send_message_osd_cluster(pg_shard.osd, m, get_osdmap_epoch()); + } +} + +bool PG::append_log_entries_update_missing( + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to, + boost::optional<eversion_t> roll_forward_to) +{ + ceph_assert(!entries.empty()); + ceph_assert(entries.begin()->version > info.last_update); + + PGLogEntryHandler rollbacker{this, &t}; + bool invalidate_stats = + pg_log.append_new_log_entries(info.last_backfill, + info.last_backfill_bitwise, + entries, + &rollbacker); + + if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) { + pg_log.roll_forward(&rollbacker); + } + if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) { + pg_log.roll_forward_to(*roll_forward_to, &rollbacker); + last_rollback_info_trimmed_to_applied = *roll_forward_to; + } + + info.last_update = pg_log.get_head(); + + if (pg_log.get_missing().num_missing() == 0) { + // advance last_complete since nothing else is missing! + info.last_complete = info.last_update; + } + info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats; + + dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl; + if (trim_to) + pg_log.trim(*trim_to, info); + dirty_info = true; + write_if_dirty(t); + return invalidate_stats; +} + + +void PG::merge_new_log_entries( + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + ObjectStore::Transaction &t, + boost::optional<eversion_t> trim_to, + boost::optional<eversion_t> roll_forward_to) +{ + dout(10) << __func__ << " " << entries << dendl; + ceph_assert(is_primary()); + + bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to); + for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + pg_shard_t peer(*i); + if (peer == pg_whoami) continue; + ceph_assert(peer_missing.count(peer)); + ceph_assert(peer_info.count(peer)); + pg_missing_t& pmissing(peer_missing[peer]); + dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl; + pg_info_t& pinfo(peer_info[peer]); + bool invalidate_stats = PGLog::append_log_entries_update_missing( + pinfo.last_backfill, + info.last_backfill_bitwise, + entries, + true, + NULL, + pmissing, + NULL, + this); + pinfo.last_update = info.last_update; + pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats; + rebuild_missing = rebuild_missing || invalidate_stats; + } + + if (!rebuild_missing) { + return; + } + + for (auto &&i: entries) { + missing_loc.rebuild( + i.soid, + pg_whoami, + acting_recovery_backfill, + info, + pg_log.get_missing(), + peer_missing, + peer_info); + } +} + +void PG::update_history(const pg_history_t& new_history) +{ + if (info.history.merge(new_history)) { + dout(20) << __func__ << " advanced history from " << new_history << dendl; + dirty_info = true; + if (info.history.last_epoch_clean >= info.history.same_interval_since) { + dout(20) << __func__ << " clearing past_intervals" << dendl; + past_intervals.clear(); + dirty_big_info = true; + } + } + on_info_history_change(); +} + +void PG::fulfill_info( + pg_shard_t from, const pg_query_t &query, + pair<pg_shard_t, pg_info_t> ¬ify_info) +{ + ceph_assert(from == primary); + ceph_assert(query.type == pg_query_t::INFO); + + // info + dout(10) << "sending info" << dendl; + notify_info = make_pair(from, 
info); +} + +void PG::fulfill_log( + pg_shard_t from, const pg_query_t &query, epoch_t query_epoch) +{ + dout(10) << "log request from " << from << dendl; + ceph_assert(from == primary); + ceph_assert(query.type != pg_query_t::INFO); + ConnectionRef con = osd->get_con_osd_cluster( + from.osd, get_osdmap_epoch()); + if (!con) return; + + MOSDPGLog *mlog = new MOSDPGLog( + from.shard, pg_whoami.shard, + get_osdmap_epoch(), + info, query_epoch); + mlog->missing = pg_log.get_missing(); + + // primary -> other, when building master log + if (query.type == pg_query_t::LOG) { + dout(10) << " sending info+missing+log since " << query.since + << dendl; + if (query.since != eversion_t() && query.since < pg_log.get_tail()) { + osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since + << " when my log.tail is " << pg_log.get_tail() + << ", sending full log instead"; + mlog->log = pg_log.get_log(); // primary should not have requested this!! + } else + mlog->log.copy_after(cct, pg_log.get_log(), query.since); + } + else if (query.type == pg_query_t::FULLLOG) { + dout(10) << " sending info+missing+full log" << dendl; + mlog->log = pg_log.get_log(); + } + + dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl; + + osd->share_map_peer(from.osd, con.get(), get_osdmap()); + osd->send_message_osd_cluster(mlog, con.get()); +} + +void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx) +{ + if (query.query.type == pg_query_t::INFO) { + pair<pg_shard_t, pg_info_t> notify_info; + update_history(query.query.history); + fulfill_info(query.from, query.query, notify_info); + rctx->send_notify( + notify_info.first, + pg_notify_t( + notify_info.first.shard, pg_whoami.shard, + query.query_epoch, + get_osdmap_epoch(), + notify_info.second), + past_intervals); + } else { + update_history(query.query.history); + fulfill_log(query.from, query.query, query.query_epoch); + } +} + +void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap) +{ + bool changed = false; + if (osdmap->test_flag(CEPH_OSDMAP_FULL) && + !lastmap->test_flag(CEPH_OSDMAP_FULL)) { + dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl; + changed = true; + } + const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool()); + if (!pi) { + return; // pool deleted + } + if (pi->has_flag(pg_pool_t::FLAG_FULL)) { + const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool()); + if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) { + dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl; + changed = true; + } + } + if (changed) { + info.history.last_epoch_marked_full = osdmap->get_epoch(); + dirty_info = true; + } +} + +bool PG::should_restart_peering( + int newupprimary, + int newactingprimary, + const vector<int>& newup, + const vector<int>& newacting, + OSDMapRef lastmap, + OSDMapRef osdmap) +{ + if (PastIntervals::is_new_interval( + primary.osd, + newactingprimary, + acting, + newacting, + up_primary.osd, + newupprimary, + up, + newup, + osdmap, + lastmap, + info.pgid.pgid)) { + dout(20) << "new interval newup " << newup + << " newacting " << newacting << dendl; + return true; + } + if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) { + dout(10) << __func__ << " osd transitioned from down -> up" << dendl; + return true; + } + return false; +} + +bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) +{ + if (last_peering_reset > reply_epoch || + last_peering_reset > query_epoch) { + dout(10) << "old_peering_msg reply_epoch " << 
reply_epoch << " query_epoch " << query_epoch + << " last_peering_reset " << last_peering_reset + << dendl; + return true; + } + return false; +} + +void PG::set_last_peering_reset() +{ + dout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl; + if (last_peering_reset != get_osdmap_epoch()) { + last_peering_reset = get_osdmap_epoch(); + reset_interval_flush(); + } +} + +struct FlushState { + PGRef pg; + epoch_t epoch; + FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {} + ~FlushState() { + pg->lock(); + if (!pg->pg_has_reset_since(epoch)) + pg->on_flushed(); + pg->unlock(); + } +}; +typedef std::shared_ptr<FlushState> FlushStateRef; + +void PG::start_flush(ObjectStore::Transaction *t) +{ + // flush in progress ops + FlushStateRef flush_trigger (std::make_shared<FlushState>( + this, get_osdmap_epoch())); + flushes_in_progress++; + t->register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger)); + t->register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger)); +} + +void PG::reset_interval_flush() +{ + dout(10) << "Clearing blocked outgoing recovery messages" << dendl; + recovery_state.clear_blocked_outgoing(); + + Context *c = new QueuePeeringEvt<IntervalFlush>( + this, get_osdmap_epoch(), IntervalFlush()); + if (!ch->flush_commit(c)) { + dout(10) << "Beginning to block outgoing recovery messages" << dendl; + recovery_state.begin_block_outgoing(); + } else { + dout(10) << "Not blocking outgoing recovery messages" << dendl; + delete c; + } +} + +/* Called before initializing peering during advance_map */ +void PG::start_peering_interval( + const OSDMapRef lastmap, + const vector<int>& newup, int new_up_primary, + const vector<int>& newacting, int new_acting_primary, + ObjectStore::Transaction *t) +{ + const OSDMapRef osdmap = get_osdmap(); + + set_last_peering_reset(); + + vector<int> oldacting, oldup; + int oldrole = get_role(); + + if (is_primary()) { + osd->clear_ready_to_merge(this); + } + + pg_shard_t old_acting_primary = get_primary(); + pg_shard_t old_up_primary = up_primary; + bool was_old_primary = is_primary(); + bool was_old_replica = is_replica(); + + acting.swap(oldacting); + up.swap(oldup); + init_primary_up_acting( + newup, + newacting, + new_up_primary, + new_acting_primary); + + if (info.stats.up != up || + info.stats.acting != acting || + info.stats.up_primary != new_up_primary || + info.stats.acting_primary != new_acting_primary) { + info.stats.up = up; + info.stats.up_primary = new_up_primary; + info.stats.acting = acting; + info.stats.acting_primary = new_acting_primary; + info.stats.mapping_epoch = osdmap->get_epoch(); + } + + pg_stats_publish_lock.Lock(); + pg_stats_publish_valid = false; + pg_stats_publish_lock.Unlock(); + + // This will now be remapped during a backfill in cases + // that it would not have been before. + if (up != acting) + state_set(PG_STATE_REMAPPED); + else + state_clear(PG_STATE_REMAPPED); + + int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size()); + if (pool.info.is_replicated() || role == pg_whoami.shard) + set_role(role); + else + set_role(-1); + + // did acting, up, primary|acker change? 
+ if (!lastmap) { + dout(10) << " no lastmap" << dendl; + dirty_info = true; + dirty_big_info = true; + info.history.same_interval_since = osdmap->get_epoch(); + } else { + std::stringstream debug; + ceph_assert(info.history.same_interval_since != 0); + boost::scoped_ptr<IsPGRecoverablePredicate> recoverable( + get_is_recoverable_predicate()); + bool new_interval = PastIntervals::check_new_interval( + old_acting_primary.osd, + new_acting_primary, + oldacting, newacting, + old_up_primary.osd, + new_up_primary, + oldup, newup, + info.history.same_interval_since, + info.history.last_epoch_clean, + osdmap, + lastmap, + info.pgid.pgid, + recoverable.get(), + &past_intervals, + &debug); + dout(10) << __func__ << ": check_new_interval output: " + << debug.str() << dendl; + if (new_interval) { + if (osdmap->get_epoch() == osd->get_superblock().oldest_map && + info.history.last_epoch_clean < osdmap->get_epoch()) { + dout(10) << " map gap, clearing past_intervals and faking" << dendl; + // our information is incomplete and useless; someone else was clean + // after everything we know if osdmaps were trimmed. + past_intervals.clear(); + } else { + dout(10) << " noting past " << past_intervals << dendl; + } + dirty_info = true; + dirty_big_info = true; + info.history.same_interval_since = osdmap->get_epoch(); + if (osdmap->have_pg_pool(info.pgid.pgid.pool()) && + info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()), + osdmap->get_pg_num(info.pgid.pgid.pool()), + nullptr)) { + info.history.last_epoch_split = osdmap->get_epoch(); + } + } + } + + if (old_up_primary != up_primary || + oldup != up) { + info.history.same_up_since = osdmap->get_epoch(); + } + // this comparison includes primary rank via pg_shard_t + if (old_acting_primary != get_primary()) { + info.history.same_primary_since = osdmap->get_epoch(); + } + + on_new_interval(); + + dout(1) << __func__ << " up " << oldup << " -> " << up + << ", acting " << oldacting << " -> " << acting + << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary + << ", up_primary " << old_up_primary << " -> " << new_up_primary + << ", role " << oldrole << " -> " << role + << ", features acting " << acting_features + << " upacting " << upacting_features + << dendl; + + // deactivate. + state_clear(PG_STATE_ACTIVE); + state_clear(PG_STATE_PEERED); + state_clear(PG_STATE_PREMERGE); + state_clear(PG_STATE_DOWN); + state_clear(PG_STATE_RECOVERY_WAIT); + state_clear(PG_STATE_RECOVERY_TOOFULL); + state_clear(PG_STATE_RECOVERING); + + peer_purged.clear(); + acting_recovery_backfill.clear(); + scrub_queued = false; + + // reset primary/replica state? + if (was_old_primary || is_primary()) { + osd->remove_want_pg_temp(info.pgid.pgid); + } else if (was_old_replica || is_replica()) { + osd->remove_want_pg_temp(info.pgid.pgid); + } + clear_primary_state(); + + + // pg->on_* + on_change(t); + + projected_last_update = eversion_t(); + + ceph_assert(!deleting); + + // should we tell the primary we are here? + send_notify = !is_primary(); + + if (role != oldrole || + was_old_primary != is_primary()) { + // did primary change? + if (was_old_primary != is_primary()) { + state_clear(PG_STATE_CLEAN); + clear_publish_stats(); + } + + on_role_change(); + + // take active waiters + requeue_ops(waiting_for_peered); + + } else { + // no role change. + // did primary change? 
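+    // Even without a role change, a reshuffled replica set means the primary
+    // can no longer consider the PG clean until peering completes again.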
+ if (get_primary() != old_acting_primary) { + dout(10) << *this << " " << oldacting << " -> " << acting + << ", acting primary " + << old_acting_primary << " -> " << get_primary() + << dendl; + } else { + // primary is the same. + if (is_primary()) { + // i am (still) primary. but my replica set changed. + state_clear(PG_STATE_CLEAN); + + dout(10) << oldacting << " -> " << acting + << ", replicas changed" << dendl; + } + } + } + cancel_recovery(); + + if (acting.empty() && !up.empty() && up_primary == pg_whoami) { + dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl; + osd->queue_want_pg_temp(info.pgid.pgid, acting); + } +} + +void PG::on_new_interval() +{ + const OSDMapRef osdmap = get_osdmap(); + + on_info_history_change(); + + // initialize features + acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) { + if (*p == CRUSH_ITEM_NONE) + continue; + uint64_t f = osdmap->get_xinfo(*p).features; + acting_features &= f; + upacting_features &= f; + } + for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) { + if (*p == CRUSH_ITEM_NONE) + continue; + upacting_features &= osdmap->get_xinfo(*p).features; + } + + _on_new_interval(); +} + +void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) +{ + ceph_assert(!is_primary()); + + update_history(oinfo.history); + if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) { + info.stats.stats.sum.num_scrub_errors = 0; + info.stats.stats.sum.num_shallow_scrub_errors = 0; + info.stats.stats.sum.num_deep_scrub_errors = 0; + dirty_info = true; + } + + if (!(info.purged_snaps == oinfo.purged_snaps)) { + dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps + << dendl; + info.purged_snaps = oinfo.purged_snaps; + dirty_info = true; + dirty_big_info = true; + } +} + +ostream& operator<<(ostream& out, const PG& pg) +{ + out << "pg[" << pg.info + << " " << pg.up; + if (pg.acting != pg.up) + out << "/" << pg.acting; + if (pg.is_ec_pg()) + out << "p" << pg.get_primary(); + if (!pg.async_recovery_targets.empty()) + out << " async=[" << pg.async_recovery_targets << "]"; + if (!pg.backfill_targets.empty()) + out << " backfill=[" << pg.backfill_targets << "]"; + out << " r=" << pg.get_role(); + out << " lpr=" << pg.get_last_peering_reset(); + + if (pg.deleting) + out << " DELETING"; + + if (!pg.past_intervals.empty()) { + out << " pi=[" << pg.past_intervals.get_bounds() + << ")/" << pg.past_intervals.size(); + } + + if (pg.is_peered()) { + if (pg.last_update_ondisk != pg.info.last_update) + out << " luod=" << pg.last_update_ondisk; + if (pg.last_update_applied != pg.info.last_update) + out << " lua=" << pg.last_update_applied; + } + + if (pg.recovery_ops_active) + out << " rops=" << pg.recovery_ops_active; + + if (pg.pg_log.get_tail() != pg.info.log_tail || + pg.pg_log.get_head() != pg.info.last_update) + out << " (info mismatch, " << pg.pg_log.get_log() << ")"; + + if (!pg.pg_log.get_log().empty()) { + if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) { + out << " (log bound mismatch, actual=[" + << pg.pg_log.get_log().log.begin()->version << "," + << pg.pg_log.get_log().log.rbegin()->version << "]"; + out << ")"; + } + } + + out << " crt=" << pg.pg_log.get_can_rollback_to(); + + if (pg.last_complete_ondisk != pg.info.last_complete) + out << " lcod " << pg.last_complete_ondisk; + + if (pg.is_primary()) { + out << " mlcod " << 
pg.min_last_complete_ondisk; + } + + out << " " << pg_state_string(pg.get_state()); + if (pg.should_send_notify()) + out << " NOTIFY"; + + if (pg.scrubber.must_repair) + out << " MUST_REPAIR"; + if (pg.scrubber.auto_repair) + out << " AUTO_REPAIR"; + if (pg.scrubber.check_repair) + out << " CHECK_REPAIR"; + if (pg.scrubber.deep_scrub_on_error) + out << " DEEP_SCRUB_ON_ERROR"; + if (pg.scrubber.must_deep_scrub) + out << " MUST_DEEP_SCRUB"; + if (pg.scrubber.must_scrub) + out << " MUST_SCRUB"; + if (pg.scrubber.time_for_deep) + out << " TIME_FOR_DEEP"; + if (pg.scrubber.need_auto) + out << " NEED_AUTO"; + if (pg.scrubber.req_scrub) + out << " REQ_SCRUB"; + + //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]"; + if (pg.pg_log.get_missing().num_missing()) { + out << " m=" << pg.pg_log.get_missing().num_missing(); + if (pg.is_primary()) { + uint64_t unfound = pg.get_num_unfound(); + if (unfound) + out << " u=" << unfound; + } + } + if (!pg.is_clean()) { + out << " mbc=" << pg.missing_loc.get_missing_by_count(); + } + if (!pg.snap_trimq.empty()) { + out << " trimq="; + // only show a count if the set is large + if (pg.snap_trimq.num_intervals() > 16) { + out << pg.snap_trimq.size(); + } else { + out << pg.snap_trimq; + } + } + if (!pg.info.purged_snaps.empty()) { + out << " ps="; // snap trim queue / purged snaps + if (pg.info.purged_snaps.num_intervals() > 16) { + out << pg.info.purged_snaps.size(); + } else { + out << pg.info.purged_snaps; + } + } + + out << "]"; + + + return out; +} + +bool PG::can_discard_op(OpRequestRef& op) +{ + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) { + dout(20) << " discard " << *m << dendl; + return true; + } + + if (m->get_map_epoch() < info.history.same_primary_since) { + dout(7) << " changed after " << m->get_map_epoch() + << ", dropping " << *m << dendl; + return true; + } + + if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) { + // >= luminous client + if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) { + // >= nautilus client + if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) { + dout(7) << __func__ << " sent before last_force_op_resend " + << pool.info.last_force_op_resend + << ", dropping" << *m << dendl; + return true; + } + } else { + // == < nautilus client (luminous or mimic) + if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) { + dout(7) << __func__ << " sent before last_force_op_resend_prenautilus " + << pool.info.last_force_op_resend_prenautilus + << ", dropping" << *m << dendl; + return true; + } + } + if (m->get_map_epoch() < info.history.last_epoch_split) { + dout(7) << __func__ << " pg split in " + << info.history.last_epoch_split << ", dropping" << dendl; + return true; + } + } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) { + // < luminous client + if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) { + dout(7) << __func__ << " sent before last_force_op_resend_preluminous " + << pool.info.last_force_op_resend_preluminous + << ", dropping" << *m << dendl; + return true; + } + } + + return false; +} + +template<typename T, int MSGTYPE> +bool PG::can_discard_replica_op(OpRequestRef& op) +{ + const T *m = static_cast<const T *>(op->get_req()); + ceph_assert(m->get_type() == MSGTYPE); + + int from = m->get_source().num(); + + // if a repop is replied after a replica goes down in a new osdmap, and + // before 
the pg advances to this new osdmap, the repop replies before this + // repop can be discarded by that replica OSD, because the primary resets the + // connection to it when handling the new osdmap marking it down, and also + // resets the messenger sesssion when the replica reconnects. to avoid the + // out-of-order replies, the messages from that replica should be discarded. + OSDMapRef next_map = osd->get_next_osdmap(); + if (next_map->is_down(from)) + return true; + /* Mostly, this overlaps with the old_peering_msg + * condition. An important exception is pushes + * sent by replicas not in the acting set, since + * if such a replica goes down it does not cause + * a new interval. */ + if (next_map->get_down_at(from) >= m->map_epoch) + return true; + + // same pg? + // if pg changes _at all_, we reset and repeer! + if (old_peering_msg(m->map_epoch, m->map_epoch)) { + dout(10) << "can_discard_replica_op pg changed " << info.history + << " after " << m->map_epoch + << ", dropping" << dendl; + return true; + } + return false; +} + +bool PG::can_discard_scan(OpRequestRef op) +{ + const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_SCAN); + + if (old_peering_msg(m->map_epoch, m->query_epoch)) { + dout(10) << " got old scan, ignoring" << dendl; + return true; + } + return false; +} + +bool PG::can_discard_backfill(OpRequestRef op) +{ + const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL); + + if (old_peering_msg(m->map_epoch, m->query_epoch)) { + dout(10) << " got old backfill, ignoring" << dendl; + return true; + } + + return false; + +} + +bool PG::can_discard_request(OpRequestRef& op) +{ + switch (op->get_req()->get_type()) { + case CEPH_MSG_OSD_OP: + return can_discard_op(op); + case CEPH_MSG_OSD_BACKOFF: + return false; // never discard + case MSG_OSD_REPOP: + return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op); + case MSG_OSD_PG_PUSH: + return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op); + case MSG_OSD_PG_PULL: + return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op); + case MSG_OSD_PG_PUSH_REPLY: + return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op); + case MSG_OSD_REPOPREPLY: + return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op); + case MSG_OSD_PG_RECOVERY_DELETE: + return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op); + + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op); + + case MSG_OSD_EC_WRITE: + return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op); + case MSG_OSD_EC_WRITE_REPLY: + return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op); + case MSG_OSD_EC_READ: + return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op); + case MSG_OSD_EC_READ_REPLY: + return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op); + case MSG_OSD_REP_SCRUB: + return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op); + case MSG_OSD_SCRUB_RESERVE: + return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op); + case MSG_OSD_REP_SCRUBMAP: + return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op); + case MSG_OSD_PG_UPDATE_LOG_MISSING: + return can_discard_replica_op< + MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op); + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + 
return can_discard_replica_op< + MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op); + + case MSG_OSD_PG_SCAN: + return can_discard_scan(op); + case MSG_OSD_PG_BACKFILL: + return can_discard_backfill(op); + case MSG_OSD_PG_BACKFILL_REMOVE: + return can_discard_replica_op<MOSDPGBackfillRemove, + MSG_OSD_PG_BACKFILL_REMOVE>(op); + } + return true; +} + +void PG::take_waiters() +{ + dout(10) << "take_waiters" << dendl; + requeue_map_waiters(); +} + +void PG::do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rctx) +{ + dout(10) << __func__ << ": " << evt->get_desc() << dendl; + ceph_assert(have_same_or_newer_map(evt->get_epoch_sent())); + if (old_peering_evt(evt)) { + dout(10) << "discard old " << evt->get_desc() << dendl; + } else { + recovery_state.handle_event(evt, rctx); + } + // write_if_dirty regardless of path above to ensure we capture any work + // done by OSD::advance_pg(). + write_if_dirty(*rctx->transaction); +} + +void PG::queue_peering_event(PGPeeringEventRef evt) +{ + if (old_peering_evt(evt)) + return; + osd->osd->enqueue_peering_evt(info.pgid, evt); +} + +void PG::queue_null(epoch_t msg_epoch, + epoch_t query_epoch) +{ + dout(10) << "null" << dendl; + queue_peering_event( + PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch, + NullEvt()))); +} + +void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx) +{ + /* + * if we couldn't start any recovery ops and things are still + * unfound, see if we can discover more missing object locations. + * It may be that our initial locations were bad and we errored + * out while trying to pull. + */ + discover_all_missing(*rctx->query_map); + if (rctx->query_map->empty()) { + string action; + if (state_test(PG_STATE_BACKFILLING)) { + auto evt = PGPeeringEventRef( + new PGPeeringEvent( + queued, + queued, + PG::UnfoundBackfill())); + queue_peering_event(evt); + action = "in backfill"; + } else if (state_test(PG_STATE_RECOVERING)) { + auto evt = PGPeeringEventRef( + new PGPeeringEvent( + queued, + queued, + PG::UnfoundRecovery())); + queue_peering_event(evt); + action = "in recovery"; + } else { + action = "already out of recovery/backfill"; + } + dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl; + } else { + dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl; + queue_recovery(); + } +} + +void PG::handle_advance_map( + OSDMapRef osdmap, OSDMapRef lastmap, + vector<int>& newup, int up_primary, + vector<int>& newacting, int acting_primary, + RecoveryCtx *rctx) +{ + ceph_assert(lastmap->get_epoch() == osdmap_ref->get_epoch()); + ceph_assert(lastmap == osdmap_ref); + dout(10) << "handle_advance_map " + << newup << "/" << newacting + << " -- " << up_primary << "/" << acting_primary + << dendl; + update_osdmap_ref(osdmap); + osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch()); + + pool.update(cct, osdmap); + + AdvMap evt( + osdmap, lastmap, newup, up_primary, + newacting, acting_primary); + recovery_state.handle_event(evt, rctx); + if (pool.info.last_change == osdmap_ref->get_epoch()) { + on_pool_change(); + update_store_with_options(); + } + last_require_osd_release = osdmap->require_osd_release; +} + +void PG::handle_activate_map(RecoveryCtx *rctx) +{ + dout(10) << "handle_activate_map " << dendl; + ActMap evt; + recovery_state.handle_event(evt, rctx); + if (osdmap_ref->get_epoch() - last_persisted_osdmap > + cct->_conf->osd_pg_epoch_persisted_max_stale) { + dout(20) << __func__ << ": Dirtying info: 
last_persisted is " + << last_persisted_osdmap + << " while current is " << osdmap_ref->get_epoch() << dendl; + dirty_info = true; + } else { + dout(20) << __func__ << ": Not dirtying info: last_persisted is " + << last_persisted_osdmap + << " while current is " << osdmap_ref->get_epoch() << dendl; + } + if (osdmap_ref->check_new_blacklist_entries()) { + check_blacklisted_watchers(); + } + write_if_dirty(*rctx->transaction); +} + +void PG::handle_initialize(RecoveryCtx *rctx) +{ + dout(10) << __func__ << dendl; + Initialize evt; + recovery_state.handle_event(evt, rctx); +} + +void PG::handle_query_state(Formatter *f) +{ + dout(10) << "handle_query_state" << dendl; + QueryState q(f); + recovery_state.handle_event(q, 0); +} + +void PG::init_collection_pool_opts() +{ + auto r = osd->store->set_collection_opts(ch, pool.info.opts); + if (r < 0 && r != -EOPNOTSUPP) { + derr << __func__ << " set_collection_opts returns error:" << r << dendl; + } +} + +void PG::update_store_with_options() +{ + init_collection_pool_opts(); +} + +struct C_DeleteMore : public Context { + PGRef pg; + epoch_t epoch; + C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {} + void finish(int r) override { + ceph_abort(); + } + void complete(int r) override { + ceph_assert(r == 0); + pg->lock(); + if (!pg->pg_has_reset_since(epoch)) { + pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch); + } + pg->unlock(); + delete this; + } +}; + +ghobject_t PG::_delete_some(ObjectStore::Transaction *t, + ghobject_t _next) +{ + dout(10) << __func__ << dendl; + + { + float osd_delete_sleep = osd->osd->get_osd_delete_sleep(); + if (osd_delete_sleep > 0 && delete_needs_sleep) { + epoch_t e = get_osdmap()->get_epoch(); + PGRef pgref(this); + auto delete_requeue_callback = new FunctionContext([this, pgref, e](int r) { + dout(20) << __func__ << " wake up at " + << ceph_clock_now() + << ", re-queuing delete" << dendl; + lock(); + delete_needs_sleep = false; + if (!pg_has_reset_since(e)) { + osd->queue_for_pg_delete(get_pgid(), e); + } + unlock(); + }); + + utime_t delete_schedule_time = ceph_clock_now(); + delete_schedule_time += osd_delete_sleep; + Mutex::Locker l(osd->sleep_lock); + osd->sleep_timer.add_event_at(delete_schedule_time, + delete_requeue_callback); + dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl; + return _next; + } + } + + delete_needs_sleep = true; + + ghobject_t next; + + vector<ghobject_t> olist; + int max = std::min(osd->store->get_ideal_list_max(), + (int)cct->_conf->osd_target_transaction_size); + + osd->store->collection_list( + ch, + _next, + ghobject_t::get_max(), + max, + &olist, + &next); + dout(20) << __func__ << " " << olist << dendl; + + // make sure we've removed everything + // by one more listing from the beginning + if (_next != ghobject_t() && olist.empty()) { + next = ghobject_t(); + osd->store->collection_list( + ch, + next, + ghobject_t::get_max(), + max, + &olist, + &next); + if (!olist.empty()) { + dout(0) << __func__ << " additional unexpected onode list" + <<" (new onodes has appeared since PG removal started" + << olist << dendl; + } + } + + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + int64_t num = 0; + for (auto& oid : olist) { + if (oid == pgmeta_oid) { + continue; + } + if (oid.is_pgmeta()) { + osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid + << " during PG removal"; + } + int r = snap_mapper.remove_oid(oid.hobj, &_t); + if (r != 0 && r != -ENOENT) { + ceph_abort(); + } + t->remove(coll, oid); + ++num; + } + if (num) { + dout(20) 
<< __func__ << " deleting " << num << " objects" << dendl; + Context *fin = new C_DeleteMore(this, get_osdmap_epoch()); + t->register_on_commit(fin); + } else { + dout(20) << __func__ << " finished" << dendl; + if (cct->_conf->osd_inject_failure_on_pg_removal) { + _exit(1); + } + + // final flush here to ensure completions drop refs. Of particular concern + // are the SnapMapper ContainerContexts. + { + PGRef pgref(this); + PGLog::clear_info_log(info.pgid, t); + t->remove_collection(coll); + t->register_on_commit(new ContainerContext<PGRef>(pgref)); + t->register_on_applied(new ContainerContext<PGRef>(pgref)); + osd->store->queue_transaction(ch, std::move(*t)); + } + ch->flush(); + + if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) { + dout(1) << __func__ << " raced with merge, reinstantiating" << dendl; + ch = osd->store->create_new_collection(coll); + _create(*t, + info.pgid, + info.pgid.get_split_bits(pool.info.get_pg_num())); + _init(*t, info.pgid, &pool.info); + last_epoch = 0; // to ensure pg epoch is also written + dirty_info = true; + dirty_big_info = true; + } else { + deleted = true; + + // cancel reserver here, since the PG is about to get deleted and the + // exit() methods don't run when that happens. + osd->local_reserver.cancel_reservation(info.pgid); + + osd->logger->dec(l_osd_pg_removing); + } + } + return next; +} + +// Compute pending backfill data +static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes) +{ + lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB" + << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl; + return std::max((int64_t)0, bf_bytes - local_bytes); +} + +int PG::pg_stat_adjust(osd_stat_t *ns) +{ + osd_stat_t &new_stat = *ns; + if (is_primary()) { + return 0; + } + // Adjust the kb_used by adding pending backfill data + uint64_t reserved_num_bytes = get_reserved_num_bytes(); + + // For now we don't consider projected space gains here + // I suggest we have an optional 2 pass backfill that frees up + // space in a first pass. This could be triggered when at nearfull + // or near to backfillfull. + if (reserved_num_bytes > 0) { + // TODO: Handle compression by adjusting by the PGs average + // compression precentage. 
+ dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB" + << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl; + if (new_stat.statfs.available > reserved_num_bytes) + new_stat.statfs.available -= reserved_num_bytes; + else + new_stat.statfs.available = 0; + dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl; + return 1; + } + return 0; +} + + +/*------------ Recovery State Machine----------------*/ +#undef dout_prefix +#define dout_prefix (context< RecoveryMachine >().pg->gen_prefix(*_dout) \ + << "state<" << get_state_name() << ">: ") + +/*------Crashed-------*/ +PG::RecoveryState::Crashed::Crashed(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Crashed") +{ + context< RecoveryMachine >().log_enter(state_name); + ceph_abort_msg("we got a bad state machine event"); +} + + +/*------Initial-------*/ +PG::RecoveryState::Initial::Initial(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Initial") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->proc_replica_info( + notify.from, notify.notify.info, notify.notify.epoch_sent); + pg->set_last_peering_reset(); + return transit< Primary >(); +} + +boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i) +{ + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(!pg->is_primary()); + post_event(i); + return transit< Stray >(); +} + +boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i) +{ + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(!pg->is_primary()); + post_event(i); + return transit< Stray >(); +} + +void PG::RecoveryState::Initial::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur); +} + +/*------Started-------*/ +PG::RecoveryState::Started::Started(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result +PG::RecoveryState::Started::react(const IntervalFlush&) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl; + context< RecoveryMachine >().pg->recovery_state.end_block_outgoing(); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Started advmap" << dendl; + pg->check_full_transition(advmap.lastmap, advmap.osdmap); + if (pg->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset" + << dendl; + post_event(advmap); + return transit< Reset >(); + } + pg->remove_down_peer_info(advmap.osdmap); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->close_section(); + return discard_event(); +} + +void PG::RecoveryState::Started::exit() +{ + 
context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_started_latency, dur); +} + +/*--------Reset---------*/ +PG::RecoveryState::Reset::Reset(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Reset") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + + pg->flushes_in_progress = 0; + pg->set_last_peering_reset(); +} + +boost::statechart::result +PG::RecoveryState::Reset::react(const IntervalFlush&) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl; + context< RecoveryMachine >().pg->recovery_state.end_block_outgoing(); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Reset advmap" << dendl; + + pg->check_full_transition(advmap.lastmap, advmap.osdmap); + + if (pg->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again" + << dendl; + pg->start_peering_interval( + advmap.lastmap, + advmap.newup, advmap.up_primary, + advmap.newacting, advmap.acting_primary, + context< RecoveryMachine >().get_cur_transaction()); + } + pg->remove_down_peer_info(advmap.osdmap); + pg->check_past_interval_bounds(); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&) +{ + PG *pg = context< RecoveryMachine >().pg; + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< RecoveryMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + pg->info), + pg->past_intervals); + } + + pg->update_heartbeat_peers(); + pg->take_waiters(); + + return transit< Started >(); +} + +boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->close_section(); + return discard_event(); +} + +void PG::RecoveryState::Reset::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur); +} + +/*-------Start---------*/ +PG::RecoveryState::Start::Start(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Start") +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + if (pg->is_primary()) { + ldout(pg->cct, 1) << "transitioning to Primary" << dendl; + post_event(MakePrimary()); + } else { //is_stray + ldout(pg->cct, 1) << "transitioning to Stray" << dendl; + post_event(MakeStray()); + } +} + +void PG::RecoveryState::Start::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_start_latency, dur); +} + +/*---------Primary--------*/ +PG::RecoveryState::Primary::Primary(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine 
>().pg, "Started/Primary") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(pg->want_acting.empty()); + + // set CREATING bit until we have peered for the first time. + if (pg->info.history.last_epoch_started == 0) { + pg->state_set(PG_STATE_CREATING); + // use the history timestamp, which ultimately comes from the + // monitor in the create case. + utime_t t = pg->info.history.last_scrub_stamp; + pg->info.stats.last_fresh = t; + pg->info.stats.last_active = t; + pg->info.stats.last_change = t; + pg->info.stats.last_peered = t; + pg->info.stats.last_clean = t; + pg->info.stats.last_unstale = t; + pg->info.stats.last_undegraded = t; + pg->info.stats.last_fullsized = t; + pg->info.stats.last_scrub_stamp = t; + pg->info.stats.last_deep_scrub_stamp = t; + pg->info.stats.last_clean_scrub_stamp = t; + } +} + +boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl; + pg->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 7) << "handle ActMap primary" << dendl; + pg->publish_stats_to_osd(); + pg->take_waiters(); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Primary::react( + const SetForceRecovery&) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->set_force_recovery(true); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Primary::react( + const UnsetForceRecovery&) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->set_force_recovery(false); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Primary::react( + const RequestScrub& evt) +{ + PG *pg = context< RecoveryMachine >().pg; + if (pg->is_primary()) { + pg->scrub_requested(evt.deep, evt.repair); + ldout(pg->cct,10) << "marking for scrub" << dendl; + } + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Primary::react( + const SetForceBackfill&) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->set_force_backfill(true); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Primary::react( + const UnsetForceBackfill&) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->set_force_backfill(false); + return discard_event(); +} + +void PG::RecoveryState::Primary::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->want_acting.clear(); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur); + pg->clear_primary_state(); + pg->state_clear(PG_STATE_CREATING); +} + +/*---------Peering--------*/ +PG::RecoveryState::Peering::Peering(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"), + history_les_bound(false) +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(!pg->is_peered()); + ceph_assert(!pg->is_peering()); + ceph_assert(pg->is_primary()); + pg->state_set(PG_STATE_PEERING); +} + +boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Peering advmap" << 
dendl; + if (prior_set.affected_by_map(*(advmap.osdmap), pg)) { + ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl; + post_event(advmap); + return transit< Reset >(); + } + + pg->adjust_need_up_thru(advmap.osdmap); + + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q) +{ + PG *pg = context< RecoveryMachine >().pg; + + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("past_intervals"); + pg->past_intervals.dump(q.f); + q.f->close_section(); + + q.f->open_array_section("probing_osds"); + for (set<pg_shard_t>::iterator p = prior_set.probe.begin(); + p != prior_set.probe.end(); + ++p) + q.f->dump_stream("osd") << *p; + q.f->close_section(); + + if (prior_set.pg_down) + q.f->dump_string("blocked", "peering is blocked due to down osds"); + + q.f->open_array_section("down_osds_we_would_probe"); + for (set<int>::iterator p = prior_set.down.begin(); + p != prior_set.down.end(); + ++p) + q.f->dump_int("osd", *p); + q.f->close_section(); + + q.f->open_array_section("peering_blocked_by"); + for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin(); + p != prior_set.blocked_by.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_int("osd", p->first); + q.f->dump_int("current_lost_at", p->second); + q.f->dump_string("comment", "starting or marking this osd lost may let us proceed"); + q.f->close_section(); + } + q.f->close_section(); + + if (history_les_bound) { + q.f->open_array_section("peering_blocked_by_detail"); + q.f->open_object_section("item"); + q.f->dump_string("detail","peering_blocked_by_history_les_bound"); + q.f->close_section(); + q.f->close_section(); + } + + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::Peering::exit() +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Leaving Peering" << dendl; + context< RecoveryMachine >().log_exit(state_name, enter_time); + pg->state_clear(PG_STATE_PEERING); + pg->clear_probe_targets(); + + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur); +} + + +/*------Backfilling-------*/ +PG::RecoveryState::Backfilling::Backfilling(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->backfill_reserved = true; + pg->queue_recovery(); + pg->state_clear(PG_STATE_BACKFILL_TOOFULL); + pg->state_clear(PG_STATE_BACKFILL_WAIT); + pg->state_set(PG_STATE_BACKFILLING); + pg->publish_stats_to_osd(); +} + +void PG::RecoveryState::Backfilling::backfill_release_reservations() +{ + PG *pg = context< RecoveryMachine >().pg; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin(); + it != pg->backfill_targets.end(); + ++it) { + ceph_assert(*it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::RELEASE, + spg_t(pg->info.pgid.pgid, it->shard), + pg->get_osdmap_epoch()), + con.get()); + } + } +} + +void PG::RecoveryState::Backfilling::cancel_backfill() +{ + PG *pg = context< RecoveryMachine >().pg; + backfill_release_reservations(); + if (!pg->waiting_on_backfill.empty()) { + 
pg->waiting_on_backfill.clear(); + pg->finish_recovery_op(hobject_t::get_max()); + } +} + +boost::statechart::result +PG::RecoveryState::Backfilling::react(const Backfilled &c) +{ + backfill_release_reservations(); + return transit<Recovered>(); +} + +boost::statechart::result +PG::RecoveryState::Backfilling::react(const DeferBackfill &c) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl; + pg->state_set(PG_STATE_BACKFILL_WAIT); + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + pg->schedule_backfill_retry(c.delay); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl; + pg->state_set(PG_STATE_BACKFILL_UNFOUND); + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_TOOFULL); + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_WAIT); + cancel_backfill(); + if (pg->needs_backfill()) { + return transit<WaitLocalBackfillReserved>(); + } else { + // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore + return discard_event(); + } +} + +void PG::RecoveryState::Backfilling::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->backfill_reserved = false; + pg->backfill_reserving = false; + pg->state_clear(PG_STATE_BACKFILLING); + pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur); +} + +/*--WaitRemoteBackfillReserved--*/ + +PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"), + backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin()) +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_WAIT); + pg->publish_stats_to_osd(); + post_event(RemoteBackfillReserved()); +} + +boost::statechart::result +PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + + int64_t num_bytes = pg->info.stats.stats.sum.num_bytes; + ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl; + if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) { + //The primary never backfills itself + ceph_assert(*backfill_osd_it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + backfill_osd_it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::REQUEST, + spg_t(pg->info.pgid.pgid, backfill_osd_it->shard), + pg->get_osdmap_epoch(), + 
pg->get_backfill_priority(), + num_bytes, + pg->peer_bytes[*backfill_osd_it]), + con.get()); + } + ++backfill_osd_it; + } else { + pg->peer_bytes.clear(); + post_event(AllBackfillsReserved()); + } + return discard_event(); +} + +void PG::RecoveryState::WaitRemoteBackfillReserved::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur); +} + +void PG::RecoveryState::WaitRemoteBackfillReserved::retry() +{ + PG *pg = context< RecoveryMachine >().pg; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + + // Send CANCEL to all previously acquired reservations + set<pg_shard_t>::const_iterator it, begin, end; + begin = context< Active >().remote_shards_to_reserve_backfill.begin(); + end = context< Active >().remote_shards_to_reserve_backfill.end(); + ceph_assert(begin != end); + for (it = begin; it != backfill_osd_it; ++it) { + //The primary never backfills itself + ceph_assert(*it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::RELEASE, + spg_t(pg->info.pgid.pgid, it->shard), + pg->get_osdmap_epoch()), + con.get()); + } + } + + pg->state_clear(PG_STATE_BACKFILL_WAIT); + pg->publish_stats_to_osd(); + + pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval); +} + +boost::statechart::result +PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_TOOFULL); + retry(); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt) +{ + retry(); + return transit<NotBackfilling>(); +} + +/*--WaitLocalBackfillReserved--*/ +PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_WAIT); + pg->osd->local_reserver.request_reservation( + pg->info.pgid, + new QueuePeeringEvt<LocalBackfillReserved>( + pg, pg->get_osdmap_epoch(), + LocalBackfillReserved()), + pg->get_backfill_priority(), + new QueuePeeringEvt<DeferBackfill>( + pg, pg->get_osdmap_epoch(), + DeferBackfill(0.0))); + pg->publish_stats_to_osd(); +} + +void PG::RecoveryState::WaitLocalBackfillReserved::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur); +} + +/*----NotBackfilling------*/ +PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_REPAIR); + pg->publish_stats_to_osd(); +} + +boost::statechart::result +PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt) +{ + return discard_event(); +} + +boost::statechart::result 
+PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt) +{ + return discard_event(); +} + +void PG::RecoveryState::NotBackfilling::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_BACKFILL_UNFOUND); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur); +} + +/*----NotRecovering------*/ +PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->publish_stats_to_osd(); +} + +void PG::RecoveryState::NotRecovering::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_RECOVERY_UNFOUND); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur); +} + +/*---RepNotRecovering----*/ +PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result +PG::RecoveryState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->reject_reservation(); + post_event(RemoteReservationRejectedTooFull()); + return discard_event(); +} + +void PG::RecoveryState::RepNotRecovering::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur); +} + +/*---RepWaitRecoveryReserved--*/ +PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result +PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MRecoveryReserve( + MRecoveryReserve::GRANT, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return transit<RepRecovering>(); +} + +boost::statechart::result +PG::RecoveryState::RepWaitRecoveryReserved::react( + const RemoteReservationCanceled &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + return transit<RepNotRecovering>(); +} + +void PG::RecoveryState::RepWaitRecoveryReserved::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur); +} + +/*-RepWaitBackfillReserved*/ +PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result 
+PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ // Use tentative_backfill_full() to make sure enough
+ // space is available to handle target bytes from primary.
+
+ // TODO: If we passed num_objects from primary we could account for
+ // an estimate of the metadata overhead.
+
+ // TODO: If we had compressed_allocated and compressed_original from primary
+ // we could compute compression ratio and adjust accordingly.
+
+ // XXX: There is no way to get omap overhead and this would only apply
+ // to whatever possibly different partition that is storing the database.
+
+ // update_osd_stat() from heartbeat will do this on a new
+ // statfs using pg->primary_num_bytes.
+ uint64_t pending_adjustment = 0;
+ int64_t primary_num_bytes = evt.primary_num_bytes;
+ int64_t local_num_bytes = evt.local_num_bytes;
+ if (primary_num_bytes) {
+ // For an erasure coded pool, overestimate by a full stripe per object
+ // because we don't know how each object is rounded to the nearest stripe
+ if (pg->pool.info.is_erasure()) {
+ primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
+ primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
+ local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
+ local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
+ }
+ pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
+ ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
+ << " local " << (local_num_bytes >> 10) << "KiB"
+ << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
+ << dendl;
+ }
+ // This lock protects not only the OSDService stats but also the setting of
+ // the pg's primary_num_bytes; that's why we don't immediately unlock
+ Mutex::Locker l(pg->osd->stat_lock);
+ osd_stat_t cur_stat = pg->osd->osd_stat;
+ if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
+ (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
+ ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
+ << dendl;
+ post_event(RejectTooFullRemoteReservation());
+ } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
+ pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
+ ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
+ << dendl;
+ post_event(RejectTooFullRemoteReservation());
+ } else {
+ Context *preempt = nullptr;
+ // Don't reserve space if we skipped the reservation check; this is used
+ // to test the other backfill full check AND in case a corruption
+ // of num_bytes requires ignoring that value and trying the
+ // backfill anyway.
+ if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
+ pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
+ else
+ pg->clear_reserved_num_bytes();
+ // Use un-ec-adjusted bytes for stats.
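+ // (the EC-adjusted values above are only used for the tentative full
+ // check and the reservation below; the stored stat keeps the unadjusted
+ // local byte count.)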
+ pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes; + if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) { + // older peers will interpret preemption as TOOFULL + preempt = new QueuePeeringEvt<RemoteBackfillPreempted>( + pg, pg->get_osdmap_epoch(), + RemoteBackfillPreempted()); + } + pg->osd->remote_reserver.request_reservation( + pg->info.pgid, + new QueuePeeringEvt<RemoteBackfillReserved>( + pg, pg->get_osdmap_epoch(), + RemoteBackfillReserved()), + evt.priority, + preempt); + } + return transit<RepWaitBackfillReserved>(); +} + +boost::statechart::result +PG::RecoveryState::RepNotRecovering::react(const RequestRecoveryPrio &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + + // fall back to a local reckoning of priority of primary doesn't pass one + // (pre-mimic compat) + int prio = evt.priority ? evt.priority : pg->get_recovery_priority(); + + Context *preempt = nullptr; + if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) { + // older peers can't handle this + preempt = new QueuePeeringEvt<RemoteRecoveryPreempted>( + pg, pg->get_osdmap_epoch(), + RemoteRecoveryPreempted()); + } + + pg->osd->remote_reserver.request_reservation( + pg->info.pgid, + new QueuePeeringEvt<RemoteRecoveryReserved>( + pg, pg->get_osdmap_epoch(), + RemoteRecoveryReserved()), + prio, + preempt); + return transit<RepWaitRecoveryReserved>(); +} + +void PG::RecoveryState::RepWaitBackfillReserved::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur); +} + +boost::statechart::result +PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MBackfillReserve( + MBackfillReserve::GRANT, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return transit<RepRecovering>(); +} + +boost::statechart::result +PG::RecoveryState::RepWaitBackfillReserved::react( + const RejectTooFullRemoteReservation &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->reject_reservation(); + post_event(RemoteReservationRejectedTooFull()); + return discard_event(); +} + +boost::statechart::result +PG::RecoveryState::RepWaitBackfillReserved::react( + const RemoteReservationRejectedTooFull &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + return transit<RepNotRecovering>(); +} + +boost::statechart::result +PG::RecoveryState::RepWaitBackfillReserved::react( + const RemoteReservationCanceled &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + return transit<RepNotRecovering>(); +} + +/*---RepRecovering-------*/ +PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result +PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MRecoveryReserve( + MRecoveryReserve::REVOKE, + 
spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return discard_event(); +} + +boost::statechart::result +PG::RecoveryState::RepRecovering::react(const BackfillTooFull &) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MBackfillReserve( + MBackfillReserve::REVOKE_TOOFULL, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return discard_event(); +} + +boost::statechart::result +PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MBackfillReserve( + MBackfillReserve::REVOKE, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return discard_event(); +} + +void PG::RecoveryState::RepRecovering::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur); +} + +/*------Activating--------*/ +PG::RecoveryState::Activating::Activating(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +void PG::RecoveryState::Activating::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur); +} + +PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + + // Make sure all nodes that part of the recovery aren't full + if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery && + pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) { + post_event(RecoveryTooFull()); + return; + } + + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); + pg->state_set(PG_STATE_RECOVERY_WAIT); + pg->osd->local_reserver.request_reservation( + pg->info.pgid, + new QueuePeeringEvt<LocalRecoveryReserved>( + pg, pg->get_osdmap_epoch(), + LocalRecoveryReserved()), + pg->get_recovery_priority(), + new QueuePeeringEvt<DeferRecovery>( + pg, pg->get_osdmap_epoch(), + DeferRecovery(0.0))); + pg->publish_stats_to_osd(); +} + +boost::statechart::result +PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_RECOVERY_TOOFULL); + pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval); + return transit<NotRecovering>(); +} + +void PG::RecoveryState::WaitLocalRecoveryReserved::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur); +} + +PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx) + : 
my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"), + remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin()) +{ + context< RecoveryMachine >().log_enter(state_name); + post_event(RemoteRecoveryReserved()); +} + +boost::statechart::result +PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) { + PG *pg = context< RecoveryMachine >().pg; + + if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) { + ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + remote_recovery_reservation_it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MRecoveryReserve( + MRecoveryReserve::REQUEST, + spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard), + pg->get_osdmap_epoch(), + pg->get_recovery_priority()), + con.get()); + } + ++remote_recovery_reservation_it; + } else { + post_event(AllRemotesReserved()); + } + return discard_event(); +} + +void PG::RecoveryState::WaitRemoteRecoveryReserved::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur); +} + +PG::RecoveryState::Recovering::Recovering(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering") +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_RECOVERY_WAIT); + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); + pg->state_set(PG_STATE_RECOVERING); + ceph_assert(!pg->state_test(PG_STATE_ACTIVATING)); + pg->publish_stats_to_osd(); + pg->queue_recovery(); +} + +void PG::RecoveryState::Recovering::release_reservations(bool cancel) +{ + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(cancel || !pg->pg_log.get_missing().have_missing()); + + // release remote reservations + for (set<pg_shard_t>::const_iterator i = + context< Active >().remote_shards_to_reserve_recovery.begin(); + i != context< Active >().remote_shards_to_reserve_recovery.end(); + ++i) { + if (*i == pg->pg_whoami) // skip myself + continue; + ConnectionRef con = pg->osd->get_con_osd_cluster( + i->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MRecoveryReserve( + MRecoveryReserve::RELEASE, + spg_t(pg->info.pgid.pgid, i->shard), + pg->get_osdmap_epoch()), + con.get()); + } + } +} + +boost::statechart::result +PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_FORCED_RECOVERY); + release_reservations(); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + return transit<Recovered>(); +} + +boost::statechart::result +PG::RecoveryState::Recovering::react(const RequestBackfill &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_FORCED_RECOVERY); + release_reservations(); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + pg->publish_stats_to_osd(); + // transit any async_recovery_targets back into acting + // so pg won't have to stay undersized for long + // as backfill might take a long time to complete.. 
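+ // re-running choose_acting() here (with the flag that restricts the
+ // choice to up/acting OSDs) folds the async recovery targets back into
+ // the acting set before backfill starts.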
+ if (!pg->async_recovery_targets.empty()) { + pg_shard_t auth_log_shard; + bool history_les_bound = false; + pg->choose_acting(auth_log_shard, true, &history_les_bound); + } + return transit<WaitLocalBackfillReserved>(); +} + +boost::statechart::result +PG::RecoveryState::Recovering::react(const DeferRecovery &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + if (!pg->state_test(PG_STATE_RECOVERING)) { + // we may have finished recovery and have an AllReplicasRecovered + // event queued to move us to the next state. + ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl; + return discard_event(); + } + ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl; + pg->state_set(PG_STATE_RECOVERY_WAIT); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + release_reservations(true); + pg->schedule_recovery_retry(evt.delay); + return transit<NotRecovering>(); +} + +boost::statechart::result +PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl; + pg->state_set(PG_STATE_RECOVERY_UNFOUND); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + release_reservations(true); + return transit<NotRecovering>(); +} + +void PG::RecoveryState::Recovering::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->state_clear(PG_STATE_RECOVERING); + pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur); +} + +PG::RecoveryState::Recovered::Recovered(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered") +{ + pg_shard_t auth_log_shard; + + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + + ceph_assert(!pg->needs_recovery()); + + // if we finished backfill, all acting are active; recheck if + // DEGRADED | UNDERSIZED is appropriate. + ceph_assert(!pg->acting_recovery_backfill.empty()); + if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= + pg->acting_recovery_backfill.size()) { + pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); + pg->publish_stats_to_osd(); + } + + // adjust acting set? (e.g. because backfill completed...) 
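+ // if acting still differs from up (e.g. a pg_temp mapping was kept for
+ // backfill), request an acting-set change; otherwise, if async recovery
+ // targets remain, fold them back in via choose_acting().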
+ bool history_les_bound = false; + if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard, + true, &history_les_bound)) { + ceph_assert(pg->want_acting.size()); + } else if (!pg->async_recovery_targets.empty()) { + pg->choose_acting(auth_log_shard, true, &history_les_bound); + } + + if (context< Active >().all_replicas_activated && + pg->async_recovery_targets.empty()) + post_event(GoClean()); +} + +void PG::RecoveryState::Recovered::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur); +} + +PG::RecoveryState::Clean::Clean(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean") +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + + if (pg->info.last_complete != pg->info.last_update) { + ceph_abort(); + } + Context *c = pg->finish_recovery(); + context< RecoveryMachine >().get_cur_transaction()->register_on_commit(c); + + pg->try_mark_clean(); +} + +void PG::RecoveryState::Clean::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_CLEAN); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur); +} + +template <typename T> +set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in) +{ + set<int> osds_found; + set<pg_shard_t> out; + for (typename T::const_iterator i = in.begin(); + i != in.end(); + ++i) { + if (*i != skip && !osds_found.count(i->osd)) { + osds_found.insert(i->osd); + out.insert(*i); + } + } + return out; +} + +/*---------Active---------*/ +PG::RecoveryState::Active::Active(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"), + remote_shards_to_reserve_recovery( + unique_osd_shard_set( + context< RecoveryMachine >().pg->pg_whoami, + context< RecoveryMachine >().pg->acting_recovery_backfill)), + remote_shards_to_reserve_backfill( + unique_osd_shard_set( + context< RecoveryMachine >().pg->pg_whoami, + context< RecoveryMachine >().pg->backfill_targets)), + all_replicas_activated(false) +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + + ceph_assert(!pg->backfill_reserving); + ceph_assert(!pg->backfill_reserved); + ceph_assert(pg->is_primary()); + ldout(pg->cct, 10) << "In Active, about to call activate" << dendl; + pg->start_flush(context< RecoveryMachine >().get_cur_transaction()); + pg->activate(*context< RecoveryMachine >().get_cur_transaction(), + pg->get_osdmap_epoch(), + *context< RecoveryMachine >().get_query_map(), + context< RecoveryMachine >().get_info_map(), + context< RecoveryMachine >().get_recovery_ctx()); + + // everyone has to commit/ack before we are truly active + pg->blocked_by.clear(); + for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin(); + p != pg->acting_recovery_backfill.end(); + ++p) { + if (p->shard != pg->pg_whoami.shard) { + pg->blocked_by.insert(p->shard); + } + } + pg->publish_stats_to_osd(); + ldout(pg->cct, 10) << "Activate Finished" << dendl; +} + +boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap) +{ + PG *pg = context< RecoveryMachine >().pg; + if (pg->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + 
advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + ldout(pg->cct, 10) << "Active advmap interval change, fast return" << dendl; + return forward_event(); + } + ldout(pg->cct, 10) << "Active advmap" << dendl; + bool need_publish = false; + + if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) { + const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps(); + auto i = new_removed_snaps.find(pg->info.pgid.pool()); + if (i != new_removed_snaps.end()) { + bool bad = false; + for (auto j : i->second) { + if (pg->snap_trimq.intersects(j.first, j.second)) { + decltype(pg->snap_trimq) added, overlap; + added.insert(j.first, j.second); + overlap.intersection_of(pg->snap_trimq, added); + if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) { + lderr(pg->cct) << __func__ << " removed_snaps already contains " + << overlap << ", but this is the first mimic+ osdmap," + << " so it's expected" << dendl; + } else { + lderr(pg->cct) << __func__ << " removed_snaps already contains " + << overlap << dendl; + bad = true; + } + pg->snap_trimq.union_of(added); + } else { + pg->snap_trimq.insert(j.first, j.second); + } + } + if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) { + // at upgrade, we report *all* previously removed snaps as removed in + // the first mimic epoch. remove the ones we previously divined were + // removed (and subsequently purged) from the trimq. + lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps" + << " from new removed_snaps" << dendl; + pg->snap_trimq.subtract(pg->info.purged_snaps); + } + ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second + << ", snap_trimq now " << pg->snap_trimq << dendl; + ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps); + pg->dirty_info = true; + pg->dirty_big_info = true; + } + + const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps(); + auto j = new_purged_snaps.find(pg->info.pgid.pool()); + if (j != new_purged_snaps.end()) { + bool bad = false; + for (auto k : j->second) { + if (!pg->info.purged_snaps.contains(k.first, k.second)) { + decltype(pg->info.purged_snaps) rm, overlap; + rm.insert(k.first, k.second); + overlap.intersection_of(pg->info.purged_snaps, rm); + lderr(pg->cct) << __func__ << " purged_snaps does not contain " + << rm << ", only " << overlap << dendl; + pg->info.purged_snaps.subtract(overlap); + // This can currently happen in the normal (if unlikely) course of + // events. Because adding snaps to purged_snaps does not increase + // the pg version or add a pg log entry, we don't reliably propagate + // purged_snaps additions to other OSDs. + // One example: + // - purge S + // - primary and replicas update purged_snaps + // - no object updates + // - pg mapping changes, new primary on different node + // - new primary pg version == eversion_t(), so info is not + // propagated. + //bad = true; + } else { + pg->info.purged_snaps.erase(k.first, k.second); + } + } + ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second + << ", now " << pg->info.purged_snaps << dendl; + ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps); + pg->dirty_info = true; + pg->dirty_big_info = true; + } + if (pg->dirty_big_info) { + // share updated purged_snaps to mgr/mon so that we (a) stop reporting + // purged snaps and (b) perhaps share more snaps that we have purged + // but didn't fit in pg_stat_t. 
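+ // share_pg_info() pushes the updated pg_info_t to the other OSDs serving
+ // this PG right away; need_publish defers the stats refresh to the end of
+ // this handler.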
+ need_publish = true; + pg->share_pg_info(); + } + } else if (!pg->pool.newly_removed_snaps.empty()) { + pg->snap_trimq.union_of(pg->pool.newly_removed_snaps); + ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl; + pg->dirty_info = true; + pg->dirty_big_info = true; + } + + for (size_t i = 0; i < pg->want_acting.size(); i++) { + int osd = pg->want_acting[i]; + if (!advmap.osdmap->is_up(osd)) { + pg_shard_t osd_with_shard(osd, shard_id_t(i)); + ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard)); + } + } + + /* Check for changes in pool size (if the acting set changed as a result, + * this does not matter) */ + if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) != + pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) { + if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) { + pg->state_clear(PG_STATE_UNDERSIZED); + } else { + pg->state_set(PG_STATE_UNDERSIZED); + } + // degraded changes will be detected by call from publish_stats_to_osd() + need_publish = true; + } + + // if we haven't reported our PG stats in a long time, do so now. + if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) { + ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch) + << " epochs" << dendl; + need_publish = true; + } + + if (need_publish) + pg->publish_stats_to_osd(); + + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::Active::react(const ActMap&) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Active: handling ActMap" << dendl; + ceph_assert(pg->is_primary()); + + if (pg->have_unfound()) { + // object may have become unfound + pg->discover_all_missing(*context< RecoveryMachine >().get_query_map()); + } + + if (pg->cct->_conf->osd_check_for_log_corruption) + pg->check_log_for_corruption(pg->osd->store); + + uint64_t unfound = pg->missing_loc.num_unfound(); + if (unfound > 0 && + pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) { + if (pg->cct->_conf->osd_auto_mark_unfound_lost) { + pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound + << " objects unfound and apparently lost, would automatically " + << "mark these objects lost but this feature is not yet implemented " + << "(osd_auto_mark_unfound_lost)"; + } else + pg->osd->clog->error() << pg->info.pgid.pgid << " has " + << unfound << " objects unfound and apparently lost"; + } + + if (pg->is_active()) { + ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl; + pg->kick_snap_trim(); + } + + if (pg->is_peered() && + !pg->is_clean() && + !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) && + (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) { + pg->queue_recovery(); + } + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(pg->is_primary()); + if (pg->peer_info.count(notevt.from)) { + ldout(pg->cct, 10) << "Active: got notify from " << notevt.from + << ", already have info from that osd, ignoring" + << dendl; + } else if (pg->peer_purged.count(notevt.from)) { + ldout(pg->cct, 10) << "Active: got notify from " << notevt.from + << ", already purged that peer, ignoring" + << dendl; + } else { + ldout(pg->cct, 10) << "Active: got notify from " << notevt.from + << ", calling proc_replica_info and discover_all_missing" + << dendl; + 
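+ // a notify from a peer we have no info for yet may reveal copies of
+ // objects we currently count as unfound, so re-run missing discovery
+ // below when that is a possibility.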
pg->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent); + if (pg->have_unfound() || (pg->is_degraded() && pg->might_have_unfound.count(notevt.from))) { + pg->discover_all_missing(*context< RecoveryMachine >().get_query_map()); + } + } + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Active::react(const MTrim& trim) +{ + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(pg->is_primary()); + + // peer is informing us of their last_complete_ondisk + ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl; + pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to; + + // trim log when the pg is recovered + pg->calc_min_last_complete_ondisk(); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(pg->is_primary()); + + ceph_assert(!pg->acting_recovery_backfill.empty()); + // don't update history (yet) if we are active and primary; the replica + // may be telling us they have activated (and committed) but we can't + // share that until _everyone_ does the same. + if (pg->is_acting_recovery_backfill(infoevt.from) && + pg->peer_activated.count(infoevt.from) == 0) { + ldout(pg->cct, 10) << " peer osd." << infoevt.from + << " activated and committed" << dendl; + pg->peer_activated.insert(infoevt.from); + pg->blocked_by.erase(infoevt.from.shard); + pg->publish_stats_to_osd(); + if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) { + pg->all_activated_and_committed(); + } + } + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "searching osd." 
<< logevt.from + << " log for unfound items" << dendl; + pg->proc_replica_log( + logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from); + bool got_missing = pg->search_for_missing( + pg->peer_info[logevt.from], + pg->peer_missing[logevt.from], + logevt.from, + context< RecoveryMachine >().get_recovery_ctx()); + // If there are missing AND we are "fully" active then start recovery now + if (got_missing && pg->state_test(PG_STATE_ACTIVE)) { + post_event(DoRecovery()); + } + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q) +{ + PG *pg = context< RecoveryMachine >().pg; + + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + { + q.f->open_array_section("might_have_unfound"); + for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin(); + p != pg->might_have_unfound.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_stream("osd") << *p; + if (pg->peer_missing.count(*p)) { + q.f->dump_string("status", "already probed"); + } else if (pg->peer_missing_requested.count(*p)) { + q.f->dump_string("status", "querying"); + } else if (!pg->get_osdmap()->is_up(p->osd)) { + q.f->dump_string("status", "osd is down"); + } else { + q.f->dump_string("status", "not queried"); + } + q.f->close_section(); + } + q.f->close_section(); + } + { + q.f->open_object_section("recovery_progress"); + pg->dump_recovery_info(q.f); + q.f->close_section(); + } + + { + q.f->open_object_section("scrub"); + q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start; + q.f->dump_bool("scrubber.active", pg->scrubber.active); + q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state)); + q.f->dump_stream("scrubber.start") << pg->scrubber.start; + q.f->dump_stream("scrubber.end") << pg->scrubber.end; + q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end; + q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update; + q.f->dump_bool("scrubber.deep", pg->scrubber.deep); + { + q.f->open_array_section("scrubber.waiting_on_whom"); + for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin(); + p != pg->scrubber.waiting_on_whom.end(); + ++p) { + q.f->dump_stream("shard") << *p; + } + q.f->close_section(); + } + q.f->close_section(); + } + + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg_t pgid = pg->info.pgid.pgid; + + all_replicas_activated = true; + + pg->state_clear(PG_STATE_ACTIVATING); + pg->state_clear(PG_STATE_CREATING); + pg->state_clear(PG_STATE_PREMERGE); + + bool merge_target; + if (pg->pool.info.is_pending_merge(pgid, &merge_target)) { + pg->state_set(PG_STATE_PEERED); + pg->state_set(PG_STATE_PREMERGE); + + if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) { + if (merge_target) { + pg_t src = pgid; + src.set_ps(pg->pool.info.get_pg_num_pending()); + assert(src.get_parent() == pgid); + pg->osd->set_not_ready_to_merge_target(pgid, src); + } else { + pg->osd->set_not_ready_to_merge_source(pgid); + } + } + } else if (pg->acting.size() < pg->pool.info.min_size) { + pg->state_set(PG_STATE_PEERED); + } else { + pg->state_set(PG_STATE_ACTIVE); + } + + if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) { + pg->osd->send_pg_created(pgid); + } + + pg->info.history.last_epoch_started = pg->info.last_epoch_started; + 
pg->info.history.last_interval_started = pg->info.last_interval_started; + pg->dirty_info = true; + + pg->share_pg_info(); + pg->publish_stats_to_osd(); + + pg->check_local(); + + // waiters + if (pg->flushes_in_progress == 0) { + pg->requeue_ops(pg->waiting_for_peered); + } else if (!pg->waiting_for_peered.empty()) { + ldout(pg->cct, 10) << __func__ << " flushes in progress, moving " + << pg->waiting_for_peered.size() + << " items to waiting_for_flush" + << dendl; + ceph_assert(pg->waiting_for_flush.empty()); + pg->waiting_for_flush.swap(pg->waiting_for_peered); + } + + pg->on_activate(); + + return discard_event(); +} + +void PG::RecoveryState::Active::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + + pg->blocked_by.clear(); + pg->backfill_reserved = false; + pg->backfill_reserving = false; + pg->state_clear(PG_STATE_ACTIVATING); + pg->state_clear(PG_STATE_DEGRADED); + pg->state_clear(PG_STATE_UNDERSIZED); + pg->state_clear(PG_STATE_BACKFILL_TOOFULL); + pg->state_clear(PG_STATE_BACKFILL_WAIT); + pg->state_clear(PG_STATE_RECOVERY_WAIT); + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_active_latency, dur); + pg->agent_stop(); +} + +/*------ReplicaActive-----*/ +PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive") +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + pg->start_flush(context< RecoveryMachine >().get_cur_transaction()); +} + + +boost::statechart::result PG::RecoveryState::ReplicaActive::react( + const Activate& actevt) { + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl; + map<int, map<spg_t, pg_query_t> > query_map; + pg->activate(*context< RecoveryMachine >().get_cur_transaction(), + actevt.activation_epoch, + query_map, NULL, NULL); + ldout(pg->cct, 10) << "Activate Finished" << dendl; + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(), + infoevt.info); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "received log from " << logevt.from << dendl; + ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction(); + pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from); + ceph_assert(pg->pg_log.get_head() == pg->info.last_update); + + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MTrim& trim) +{ + PG *pg = context< RecoveryMachine >().pg; + // primary is instructing us to trim + pg->pg_log.trim(trim.trim_to, pg->info); + pg->dirty_info = true; + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&) +{ + PG *pg = context< RecoveryMachine >().pg; + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< RecoveryMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap_epoch(), + 
pg->get_osdmap_epoch(), + pg->info), + pg->past_intervals); + } + pg->take_waiters(); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::ReplicaActive::react( + const MQuery& query) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx()); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::ReplicaActive::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur); +} + +/*-------Stray---*/ +PG::RecoveryState::Stray::Stray(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Stray") +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(!pg->is_peered()); + ceph_assert(!pg->is_peering()); + ceph_assert(!pg->is_primary()); + + if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) { + ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl; + post_event(DeleteStart()); + } else { + pg->start_flush(context< RecoveryMachine >().get_cur_transaction()); + } +} + +boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt) +{ + PG *pg = context< RecoveryMachine >().pg; + MOSDPGLog *msg = logevt.msg.get(); + ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl; + + ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction(); + if (msg->info.last_backfill == hobject_t()) { + // restart backfill + pg->info = msg->info; + pg->on_info_history_change(); + pg->dirty_info = true; + pg->dirty_big_info = true; // maybe. + + PGLogEntryHandler rollbacker{pg, t}; + pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker); + + pg->pg_log.reset_backfill(); + } else { + pg->merge_log(*t, msg->info, msg->log, logevt.from); + } + + ceph_assert(pg->pg_log.get_head() == pg->info.last_update); + + post_event(Activate(logevt.msg->info.last_epoch_started)); + return transit<ReplicaActive>(); +} + +boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "got info from osd." 
<< infoevt.from << " " << infoevt.info << dendl; + + if (pg->info.last_update > infoevt.info.last_update) { + // rewind divergent log entries + ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction(); + pg->rewind_divergent_log(*t, infoevt.info.last_update); + pg->info.stats = infoevt.info.stats; + pg->info.hit_set = infoevt.info.hit_set; + } + + ceph_assert(infoevt.info.last_update == pg->info.last_update); + ceph_assert(pg->pg_log.get_head() == pg->info.last_update); + + post_event(Activate(infoevt.info.last_epoch_started)); + return transit<ReplicaActive>(); +} + +boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx()); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&) +{ + PG *pg = context< RecoveryMachine >().pg; + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< RecoveryMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + pg->info), + pg->past_intervals); + } + pg->take_waiters(); + return discard_event(); +} + +void PG::RecoveryState::Stray::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur); +} + + +/*--------ToDelete----------*/ +PG::RecoveryState::ToDelete::ToDelete(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/ToDelete") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->osd->logger->inc(l_osd_pg_removing); +} + +void PG::RecoveryState::ToDelete::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + // note: on a successful removal, this path doesn't execute. see + // _delete_some(). 
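+ // we only get here if the delete did not run to completion (e.g. it was
+ // interrupted or preempted), so undo the removing-PG counter and drop the
+ // local reservation.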
+ pg->osd->logger->dec(l_osd_pg_removing); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); +} + +/*----WaitDeleteReserved----*/ +PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, + "Started/ToDelete/WaitDeleteReseved") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + context<ToDelete>().priority = pg->get_delete_priority(); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + pg->osd->local_reserver.request_reservation( + pg->info.pgid, + new QueuePeeringEvt<DeleteReserved>( + pg, pg->get_osdmap_epoch(), + DeleteReserved()), + context<ToDelete>().priority, + new QueuePeeringEvt<DeleteInterrupted>( + pg, pg->get_osdmap_epoch(), + DeleteInterrupted())); +} + +boost::statechart::result PG::RecoveryState::ToDelete::react( + const ActMap& evt) +{ + PG *pg = context< RecoveryMachine >().pg; + if (pg->get_delete_priority() != priority) { + ldout(pg->cct,10) << __func__ << " delete priority changed, resetting" + << dendl; + return transit<ToDelete>(); + } + return discard_event(); +} + +void PG::RecoveryState::WaitDeleteReserved::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); +} + +/*----Deleting-----*/ +PG::RecoveryState::Deleting::Deleting(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/ToDelete/Deleting") +{ + start = ceph::mono_clock::now(); + + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + pg->deleting = true; + ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction(); + pg->on_removal(t); + t->register_on_commit(new C_DeleteMore(pg, pg->get_osdmap_epoch())); +} + +boost::statechart::result PG::RecoveryState::Deleting::react( + const DeleteSome& evt) +{ + PG *pg = context< RecoveryMachine >().pg; + next = pg->_delete_some(context<RecoveryMachine>().get_cur_transaction(), + next); + return discard_event(); +} + +void PG::RecoveryState::Deleting::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + pg->deleting = false; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + ldout(pg->cct, 20) << "Deleting::" << __func__ << this <<" finished in " + << ceph::mono_clock::now() - start + << dendl; +} + +/*--------GetInfo---------*/ +PG::RecoveryState::GetInfo::GetInfo(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo") +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + pg->check_past_interval_bounds(); + PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; + + ceph_assert(pg->blocked_by.empty()); + + prior_set = pg->build_prior(); + + pg->reset_min_peer_features(); + get_infos(); + if (prior_set.pg_down) { + post_event(IsDown()); + } else if (peer_info_requested.empty()) { + post_event(GotInfo()); + } +} + +void PG::RecoveryState::GetInfo::get_infos() +{ + PG *pg = context< RecoveryMachine >().pg; + PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; + + pg->blocked_by.clear(); + for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin(); + it != prior_set.probe.end(); + ++it) { + pg_shard_t peer = *it; + if (peer == pg->pg_whoami) { + continue; + } + if (pg->peer_info.count(peer)) { + ldout(pg->cct, 10) << " have osd." 
<< peer << " info " << pg->peer_info[peer] << dendl; + continue; + } + if (peer_info_requested.count(peer)) { + ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl; + pg->blocked_by.insert(peer.osd); + } else if (!pg->get_osdmap()->is_up(peer.osd)) { + ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl; + } else { + ldout(pg->cct, 10) << " querying info from osd." << peer << dendl; + context< RecoveryMachine >().send_query( + peer, pg_query_t(pg_query_t::INFO, + it->shard, pg->pg_whoami.shard, + pg->info.history, + pg->get_osdmap_epoch())); + peer_info_requested.insert(peer); + pg->blocked_by.insert(peer.osd); + } + } + + pg->publish_stats_to_osd(); +} + +boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt) +{ + PG *pg = context< RecoveryMachine >().pg; + + set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from); + if (p != peer_info_requested.end()) { + peer_info_requested.erase(p); + pg->blocked_by.erase(infoevt.from.osd); + } + + epoch_t old_start = pg->info.history.last_epoch_started; + if (pg->proc_replica_info( + infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) { + // we got something new ... + PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; + if (old_start < pg->info.history.last_epoch_started) { + ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl; + prior_set = pg->build_prior(); + + // filter out any osds that got dropped from the probe set from + // peer_info_requested. this is less expensive than restarting + // peering (which would re-probe everyone). + set<pg_shard_t>::iterator p = peer_info_requested.begin(); + while (p != peer_info_requested.end()) { + if (prior_set.probe.count(*p) == 0) { + ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl; + peer_info_requested.erase(p++); + } else { + ++p; + } + } + get_infos(); + } + ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: " + << hex << infoevt.features << dec << dendl; + pg->apply_peer_features(infoevt.features); + + // are we done getting everything? 
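+ // once every probed peer has replied and nothing in the prior set is
+ // still down, GotInfo moves peering on to selecting an authoritative log
+ // (the GetLog state).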
+ if (peer_info_requested.empty() && !prior_set.pg_down) { + ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl; + ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl; + ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl; + post_event(GotInfo()); + } + } + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q) +{ + PG *pg = context< RecoveryMachine >().pg; + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("requested_info_from"); + for (set<pg_shard_t>::iterator p = peer_info_requested.begin(); + p != peer_info_requested.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_stream("osd") << *p; + if (pg->peer_info.count(*p)) { + q.f->open_object_section("got_info"); + pg->peer_info[*p].dump(q.f); + q.f->close_section(); + } + q.f->close_section(); + } + q.f->close_section(); + + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::GetInfo::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur); + pg->blocked_by.clear(); +} + +/*------GetLog------------*/ +PG::RecoveryState::GetLog::GetLog(my_context ctx) + : my_base(ctx), + NamedState( + context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"), + msg(0) +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + + // adjust acting? + if (!pg->choose_acting(auth_log_shard, false, + &context< Peering >().history_les_bound)) { + if (!pg->want_acting.empty()) { + post_event(NeedActingChange()); + } else { + post_event(IsIncomplete()); + } + return; + } + + // am i the best? + if (auth_log_shard == pg->pg_whoami) { + post_event(GotLog()); + return; + } + + const pg_info_t& best = pg->peer_info[auth_log_shard]; + + // am i broken? + if (pg->info.last_update < best.log_tail) { + ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl; + post_event(IsIncomplete()); + return; + } + + // how much log to request? + eversion_t request_log_from = pg->info.last_update; + ceph_assert(!pg->acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin(); + p != pg->acting_recovery_backfill.end(); + ++p) { + if (*p == pg->pg_whoami) continue; + pg_info_t& ri = pg->peer_info[*p]; + if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail && + ri.last_update < request_log_from) + request_log_from = ri.last_update; + } + + // how much? + ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl; + context<RecoveryMachine>().send_query( + auth_log_shard, + pg_query_t( + pg_query_t::LOG, + auth_log_shard.shard, pg->pg_whoami.shard, + request_log_from, pg->info.history, + pg->get_osdmap_epoch())); + + ceph_assert(pg->blocked_by.empty()); + pg->blocked_by.insert(auth_log_shard.osd); + pg->publish_stats_to_osd(); +} + +boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap) +{ + PG *pg = context< RecoveryMachine >().pg; + // make sure our log source didn't go down. 
we need to check + // explicitly because it may not be part of the prior set, which + // means the Peering state check won't catch it going down. + if (!advmap.osdmap->is_up(auth_log_shard.osd)) { + ldout(pg->cct, 10) << "GetLog: auth_log_shard osd." + << auth_log_shard.osd << " went down" << dendl; + post_event(advmap); + return transit< Reset >(); + } + + // let the Peering state do its checks. + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(!msg); + if (logevt.from != auth_log_shard) { + ldout(pg->cct, 10) << "GetLog: discarding log from " + << "non-auth_log_shard osd." << logevt.from << dendl; + return discard_event(); + } + ldout(pg->cct, 10) << "GetLog: received master log from osd" + << logevt.from << dendl; + msg = logevt.msg; + post_event(GotLog()); + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "leaving GetLog" << dendl; + if (msg) { + ldout(pg->cct, 10) << "processing master log" << dendl; + pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(), + msg->info, msg->log, msg->missing, + auth_log_shard); + } + pg->start_flush(context< RecoveryMachine >().get_cur_transaction()); + return transit< GetMissing >(); +} + +boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_stream("auth_log_shard") << auth_log_shard; + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::GetLog::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur); + pg->blocked_by.clear(); +} + +/*------WaitActingChange--------*/ +PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/WaitActingChange") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap) +{ + PG *pg = context< RecoveryMachine >().pg; + OSDMapRef osdmap = advmap.osdmap; + + ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl; + for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) { + if (!osdmap->is_up(*p)) { + ldout(pg->cct, 10) << " want_acting target osd." 
<< *p << " went down, resetting" << dendl; + post_event(advmap); + return transit< Reset >(); + } + } + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl; + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl; + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl; + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "waiting for pg acting set to change"); + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::WaitActingChange::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur); +} + +/*------Down--------*/ +PG::RecoveryState::Down::Down(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + + pg->state_clear(PG_STATE_PEERING); + pg->state_set(PG_STATE_DOWN); + + auto &prior_set = context< Peering >().prior_set; + ceph_assert(pg->blocked_by.empty()); + pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); + pg->publish_stats_to_osd(); +} + +void PG::RecoveryState::Down::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + + pg->state_clear(PG_STATE_DOWN); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_down_latency, dur); + + pg->blocked_by.clear(); +} + +boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", + "not enough up instances of this PG to go active"); + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::Down::react(const MNotifyRec& infoevt) +{ + PG *pg = context< RecoveryMachine >().pg; + + ceph_assert(pg->is_primary()); + epoch_t old_start = pg->info.history.last_epoch_started; + if (!pg->peer_info.count(infoevt.from) && + pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) { + pg->update_history(infoevt.notify.info.history); + } + // if we got something new to make pg escape down state + if (pg->info.history.last_epoch_started > old_start) { + ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl; + pg->state_clear(PG_STATE_DOWN); + pg->state_set(PG_STATE_PEERING); + return transit< GetInfo >(); + } + + return discard_event(); +} + + +/*------Incomplete--------*/ +PG::RecoveryState::Incomplete::Incomplete(my_context ctx) + : 
my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + + pg->state_clear(PG_STATE_PEERING); + pg->state_set(PG_STATE_INCOMPLETE); + + PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; + ceph_assert(pg->blocked_by.empty()); + pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); + pg->publish_stats_to_osd(); +} + +boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) { + PG *pg = context< RecoveryMachine >().pg; + int64_t poolnum = pg->info.pgid.pool(); + + // Reset if min_size turned smaller than the previous value; the pg might now be able to go active + if (!advmap.osdmap->have_pg_pool(poolnum) || + advmap.lastmap->get_pools().find(poolnum)->second.min_size > + advmap.osdmap->get_pools().find(poolnum)->second.min_size) { + post_event(advmap); + return transit< Reset >(); + } + + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) { + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl; + if (pg->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) { + // We got something new, try again! + return transit< GetLog >(); + } else { + return discard_event(); + } +} + +boost::statechart::result PG::RecoveryState::Incomplete::react( + const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "not enough complete instances of this PG"); + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::Incomplete::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + + pg->state_clear(PG_STATE_INCOMPLETE); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur); + + pg->blocked_by.clear(); +} + +/*------GetMissing--------*/ +PG::RecoveryState::GetMissing::GetMissing(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing") +{ + context< RecoveryMachine >().log_enter(state_name); + + PG *pg = context< RecoveryMachine >().pg; + ceph_assert(!pg->acting_recovery_backfill.empty()); + eversion_t since; + for (set<pg_shard_t>::iterator i = pg->acting_recovery_backfill.begin(); + i != pg->acting_recovery_backfill.end(); + ++i) { + if (*i == pg->get_primary()) continue; + const pg_info_t& pi = pg->peer_info[*i]; + // reset this to make sure the pg_missing_t is initialized and + // has the correct semantics even if we don't need to get a + // missing set from a shard. This way later additions due to + // lost+unfound delete work properly. + pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering(); + + if (pi.is_empty()) + continue; // no pg data, nothing divergent + + if (pi.last_update < pg->pg_log.get_tail()) { + ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl; + pg->peer_missing[*i].clear(); + continue; + } + if (pi.last_backfill == hobject_t()) { + ldout(pg->cct, 10) << " osd."
<< *i << " will fully backfill; can infer empty missing set" << dendl; + pg->peer_missing[*i].clear(); + continue; + } + + if (pi.last_update == pi.last_complete && // peer has no missing + pi.last_update == pg->info.last_update) { // peer is up to date + // replica has no missing and identical log as us. no need to + // pull anything. + // FIXME: we can do better here. if last_update==last_complete we + // can infer the rest! + ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl; + pg->peer_missing[*i].clear(); + continue; + } + + // We pull the log from the peer's last_epoch_started to ensure we + // get enough log to detect divergent updates. + since.epoch = pi.last_epoch_started; + ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing + if (pi.log_tail <= since) { + ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl; + context< RecoveryMachine >().send_query( + *i, + pg_query_t( + pg_query_t::LOG, + i->shard, pg->pg_whoami.shard, + since, pg->info.history, + pg->get_osdmap_epoch())); + } else { + ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i + << " (want since " << since << " < log.tail " + << pi.log_tail << ")" << dendl; + context< RecoveryMachine >().send_query( + *i, pg_query_t( + pg_query_t::FULLLOG, + i->shard, pg->pg_whoami.shard, + pg->info.history, pg->get_osdmap_epoch())); + } + peer_missing_requested.insert(*i); + pg->blocked_by.insert(i->osd); + } + + if (peer_missing_requested.empty()) { + if (pg->need_up_thru) { + ldout(pg->cct, 10) << " still need up_thru update before going active" + << dendl; + post_event(NeedUpThru()); + return; + } + + // all good! + post_event(Activate(pg->get_osdmap_epoch())); + } else { + pg->publish_stats_to_osd(); + } +} + +boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt) +{ + PG *pg = context< RecoveryMachine >().pg; + + peer_missing_requested.erase(logevt.from); + pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from); + + if (peer_missing_requested.empty()) { + if (pg->need_up_thru) { + ldout(pg->cct, 10) << " still need up_thru update before going active" + << dendl; + post_event(NeedUpThru()); + } else { + ldout(pg->cct, 10) << "Got last missing, don't need missing " + << "posting Activate" << dendl; + post_event(Activate(pg->get_osdmap_epoch())); + } + } + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q) +{ + PG *pg = context< RecoveryMachine >().pg; + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("peer_missing_requested"); + for (set<pg_shard_t>::iterator p = peer_missing_requested.begin(); + p != peer_missing_requested.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_stream("osd") << *p; + if (pg->peer_missing.count(*p)) { + q.f->open_object_section("got_missing"); + pg->peer_missing[*p].dump(q.f); + q.f->close_section(); + } + q.f->close_section(); + } + q.f->close_section(); + + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::GetMissing::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur); + pg->blocked_by.clear(); +} + +/*------WaitUpThru--------*/ 
+PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru") +{ + context< RecoveryMachine >().log_enter(state_name); +} + +boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am) +{ + PG *pg = context< RecoveryMachine >().pg; + if (!pg->need_up_thru) { + post_event(Activate(pg->get_osdmap_epoch())); + } + return forward_event(); +} + +boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl; + pg->peer_missing[logevt.from].claim(logevt.msg->missing); + pg->peer_info[logevt.from] = logevt.msg->info; + return discard_event(); +} + +boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd"); + q.f->close_section(); + return forward_event(); +} + +void PG::RecoveryState::WaitUpThru::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur); +} + +/*----RecoveryState::RecoveryMachine Methods-----*/ +#undef dout_prefix +#define dout_prefix pg->gen_prefix(*_dout) + +void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 5) << "enter " << state_name << dendl; + pg->osd->pg_recovery_stats.log_enter(state_name); +} + +void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time) +{ + utime_t dur = ceph_clock_now() - enter_time; + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl; + pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time, + event_count, event_time); + event_count = 0; + event_time = utime_t(); +} + + +/*---------------------------------------------------*/ +#undef dout_prefix +#define dout_prefix ((debug_pg ? 
debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ") + +void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) { + ceph_assert(!rctx); + ceph_assert(!orig_ctx); + orig_ctx = new_ctx; + if (new_ctx) { + if (messages_pending_flush) { + rctx = RecoveryCtx(*messages_pending_flush, *new_ctx); + } else { + rctx = *new_ctx; + } + rctx->start_time = ceph_clock_now(); + } +} + +void PG::RecoveryState::begin_block_outgoing() { + ceph_assert(!messages_pending_flush); + ceph_assert(orig_ctx); + ceph_assert(rctx); + messages_pending_flush = BufferedRecoveryMessages(); + rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx); +} + +void PG::RecoveryState::clear_blocked_outgoing() { + ceph_assert(orig_ctx); + ceph_assert(rctx); + messages_pending_flush = boost::optional<BufferedRecoveryMessages>(); +} + +void PG::RecoveryState::end_block_outgoing() { + ceph_assert(messages_pending_flush); + ceph_assert(orig_ctx); + ceph_assert(rctx); + + rctx = RecoveryCtx(*orig_ctx); + rctx->accept_buffered_messages(*messages_pending_flush); + messages_pending_flush = boost::optional<BufferedRecoveryMessages>(); +} + +void PG::RecoveryState::end_handle() { + if (rctx) { + utime_t dur = ceph_clock_now() - rctx->start_time; + machine.event_time += dur; + } + + machine.event_count++; + rctx = boost::optional<RecoveryCtx>(); + orig_ctx = NULL; +} + +ostream& operator<<(ostream& out, const PG::BackfillInterval& bi) +{ + out << "BackfillInfo(" << bi.begin << "-" << bi.end + << " " << bi.objects.size() << " objects"; + if (!bi.objects.empty()) + out << " " << bi.objects; + out << ")"; + return out; +} + +void PG::dump_pgstate_history(Formatter *f) +{ + lock(); + pgstate_history.dump(f); + unlock(); +} + +void PG::dump_missing(Formatter *f) +{ + for (auto& i : pg_log.get_missing().get_items()) { + f->open_object_section("object"); + f->dump_object("oid", i.first); + f->dump_object("missing_info", i.second); + if (missing_loc.needs_recovery(i.first)) { + f->dump_bool("unfound", missing_loc.is_unfound(i.first)); + f->open_array_section("locations"); + for (auto l : missing_loc.get_locations(i.first)) { + f->dump_object("shard", l); + } + f->close_section(); + } + f->close_section(); + } +} + +void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f) +{ + pg_stats_publish_lock.Lock(); + if (pg_stats_publish_valid) { + f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean()); + } + pg_stats_publish_lock.Unlock(); +} + +void PG::with_heartbeat_peers(std::function<void(int)> f) +{ + heartbeat_peer_lock.Lock(); + for (auto p : heartbeat_peers) { + f(p); + } + for (auto p : probe_targets) { + f(p); + } + heartbeat_peer_lock.Unlock(); +} diff --git a/src/osd/PG.h b/src/osd/PG.h new file mode 100644 index 00000000..7655493b --- /dev/null +++ b/src/osd/PG.h @@ -0,0 +1,3168 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_PG_H +#define CEPH_PG_H + +#include <boost/statechart/custom_reaction.hpp> +#include <boost/statechart/event.hpp> +#include <boost/statechart/simple_state.hpp> +#include <boost/statechart/state.hpp> +#include <boost/statechart/state_machine.hpp> +#include <boost/statechart/transition.hpp> +#include <boost/statechart/event_base.hpp> +#include <boost/scoped_ptr.hpp> +#include <boost/circular_buffer.hpp> +#include <boost/container/flat_set.hpp> +#include "include/mempool.h" + +// re-include our assert to clobber boost's +#include "include/ceph_assert.h" + +#include "include/types.h" +#include "include/stringify.h" +#include "osd_types.h" +#include "include/xlist.h" +#include "SnapMapper.h" +#include "Session.h" +#include "common/Timer.h" + +#include "PGLog.h" +#include "OSDMap.h" +#include "messages/MOSDPGLog.h" +#include "include/str_list.h" +#include "PGBackend.h" +#include "PGPeeringEvent.h" + +#include "mgr/OSDPerfMetricTypes.h" + +#include <atomic> +#include <list> +#include <memory> +#include <stack> +#include <string> +#include <tuple> + +//#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs +//#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks + +class OSD; +class OSDService; +class OSDShard; +class OSDShardPGSlot; +class MOSDOp; +class MOSDPGScan; +class MOSDPGBackfill; +class MOSDPGInfo; + +class PG; +struct OpRequest; +typedef OpRequest::Ref OpRequestRef; +class MOSDPGLog; +class CephContext; +class DynamicPerfStats; + +namespace Scrub { + class Store; +} + +using state_history_entry = std::tuple<utime_t, utime_t, const char*>; +using embedded_state = std::pair<utime_t, const char*>; + +struct PGStateInstance { + // Time spent in pg states + + void setepoch(const epoch_t current_epoch) { + this_epoch = current_epoch; + } + + void enter_state(const utime_t entime, const char* state) { + embedded_states.push(std::make_pair(entime, state)); + } + + void exit_state(const utime_t extime) { + embedded_state this_state = embedded_states.top(); + state_history.push_back(state_history_entry{ + this_state.first, extime, this_state.second}); + embedded_states.pop(); + } + + epoch_t this_epoch; + utime_t enter_time; + std::vector<state_history_entry> state_history; + std::stack<embedded_state> embedded_states; +}; + +class PGStateHistory { + // Member access protected with the PG lock +public: + PGStateHistory() : buffer(10) {} + + void enter(PG* pg, const utime_t entime, const char* state); + + void exit(const char* state); + + void reset() { + pi = nullptr; + } + + void set_pg_in_destructor() { pg_in_destructor = true; } + + void dump(Formatter* f) const; + + string get_current_state() { + if (pi == nullptr) return "unknown"; + return std::get<1>(pi->embedded_states.top()); + } + +private: + bool pg_in_destructor = false; + PG* thispg = nullptr; + std::unique_ptr<PGStateInstance> tmppi; + PGStateInstance* pi = nullptr; + boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer; + +}; + +#ifdef PG_DEBUG_REFS +#include "common/tracked_int_ptr.hpp" + uint64_t get_with_id(PG *pg); + void put_with_id(PG *pg, uint64_t id); + typedef TrackedIntPtr<PG> PGRef; +#else + typedef boost::intrusive_ptr<PG> PGRef; +#endif + +class PGRecoveryStats { + struct per_state_info { + uint64_t enter, exit; // enter/exit counts + uint64_t events; + utime_t event_time; // time spent processing events + utime_t total_time; // total time in state + utime_t min_time, max_time; + + // cppcheck-suppress unreachableCode + 
per_state_info() : enter(0), exit(0), events(0) {} + }; + map<const char *,per_state_info> info; + Mutex lock; + + public: + PGRecoveryStats() : lock("PGRecoverStats::lock") {} + + void reset() { + std::lock_guard l(lock); + info.clear(); + } + void dump(ostream& out) { + std::lock_guard l(lock); + for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) { + per_state_info& i = p->second; + out << i.enter << "\t" << i.exit << "\t" + << i.events << "\t" << i.event_time << "\t" + << i.total_time << "\t" + << i.min_time << "\t" << i.max_time << "\t" + << p->first << "\n"; + } + } + + void dump_formatted(Formatter *f) { + std::lock_guard l(lock); + f->open_array_section("pg_recovery_stats"); + for (map<const char *,per_state_info>::iterator p = info.begin(); + p != info.end(); ++p) { + per_state_info& i = p->second; + f->open_object_section("recovery_state"); + f->dump_int("enter", i.enter); + f->dump_int("exit", i.exit); + f->dump_int("events", i.events); + f->dump_stream("event_time") << i.event_time; + f->dump_stream("total_time") << i.total_time; + f->dump_stream("min_time") << i.min_time; + f->dump_stream("max_time") << i.max_time; + vector<string> states; + get_str_vec(p->first, "/", states); + f->open_array_section("nested_states"); + for (vector<string>::iterator st = states.begin(); + st != states.end(); ++st) { + f->dump_string("state", *st); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + } + + void log_enter(const char *s) { + std::lock_guard l(lock); + info[s].enter++; + } + void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) { + std::lock_guard l(lock); + per_state_info &i = info[s]; + i.exit++; + i.total_time += dur; + if (dur > i.max_time) + i.max_time = dur; + if (dur < i.min_time || i.min_time == utime_t()) + i.min_time = dur; + i.events += events; + i.event_time += event_dur; + } +}; + +struct PGPool { + CephContext* cct; + epoch_t cached_epoch; + int64_t id; + string name; + + pg_pool_t info; + SnapContext snapc; // the default pool snapc, ready to go. 
+ + // these two sets are for < mimic only + interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set + interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch + + PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info, + const string& name) + : cct(cct), + cached_epoch(map->get_epoch()), + id(i), + name(name), + info(info) { + snapc = info.get_snap_context(); + if (map->require_osd_release < CEPH_RELEASE_MIMIC) { + info.build_removed_snaps(cached_removed_snaps); + } + } + + void update(CephContext *cct, OSDMapRef map); +}; + +/** PG - Replica Placement Group + * + */ + +class PG : public DoutPrefixProvider { +public: + // -- members -- + const spg_t pg_id; + const coll_t coll; + + ObjectStore::CollectionHandle ch; + + struct RecoveryCtx; + + // -- methods -- + std::ostream& gen_prefix(std::ostream& out) const override; + CephContext *get_cct() const override { + return cct; + } + unsigned get_subsys() const override { + return ceph_subsys_osd; + } + + const OSDMapRef& get_osdmap() const { + ceph_assert(is_locked()); + ceph_assert(osdmap_ref); + return osdmap_ref; + } + epoch_t get_osdmap_epoch() const { + return osdmap_ref->get_epoch(); + } + + void lock_suspend_timeout(ThreadPool::TPHandle &handle) { + handle.suspend_tp_timeout(); + lock(); + handle.reset_tp_timeout(); + } + void lock(bool no_lockdep = false) const; + void unlock() const { + //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl; + ceph_assert(!dirty_info); + ceph_assert(!dirty_big_info); + _lock.Unlock(); + } + bool is_locked() const { + return _lock.is_locked(); + } + + const spg_t& get_pgid() const { + return pg_id; + } + + const PGPool& get_pool() const { + return pool; + } + uint64_t get_last_user_version() const { + return info.last_user_version; + } + const pg_history_t& get_history() const { + return info.history; + } + bool get_need_up_thru() const { + return need_up_thru; + } + epoch_t get_same_interval_since() const { + return info.history.same_interval_since; + } + + void set_last_scrub_stamp(utime_t t) { + info.stats.last_scrub_stamp = t; + info.history.last_scrub_stamp = t; + } + + void set_last_deep_scrub_stamp(utime_t t) { + info.stats.last_deep_scrub_stamp = t; + info.history.last_deep_scrub_stamp = t; + } + + bool is_deleting() const { + return deleting; + } + bool is_deleted() const { + return deleted; + } + bool is_replica() const { + return role > 0; + } + bool is_primary() const { + return pg_whoami == primary; + } + bool pg_has_reset_since(epoch_t e) { + ceph_assert(is_locked()); + return deleted || e < get_last_peering_reset(); + } + + bool is_ec_pg() const { + return pool.info.is_erasure(); + } + int get_role() const { + return role; + } + const vector<int> get_acting() const { + return acting; + } + int get_acting_primary() const { + return primary.osd; + } + pg_shard_t get_primary() const { + return primary; + } + const vector<int> get_up() const { + return up; + } + int get_up_primary() const { + return up_primary.osd; + } + const PastIntervals& get_past_intervals() const { + return past_intervals; + } + + /// initialize created PG + void init( + int role, + const vector<int>& up, + int up_primary, + const vector<int>& acting, + int acting_primary, + const pg_history_t& history, + const PastIntervals& pim, + bool backfill, + ObjectStore::Transaction *t); + + /// read existing pg state off disk + void read_state(ObjectStore *store); + static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch); + + static int 
get_latest_struct_v() { + return latest_struct_v; + } + static int get_compat_struct_v() { + return compat_struct_v; + } + static int read_info( + ObjectStore *store, spg_t pgid, const coll_t &coll, + pg_info_t &info, PastIntervals &past_intervals, + __u8 &); + static bool _has_removal_flag(ObjectStore *store, spg_t pgid); + + void rm_backoff(BackoffRef b); + + void update_snap_mapper_bits(uint32_t bits) { + snap_mapper.update_bits(bits); + } + void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v); + virtual void split_colls( + spg_t child, + int split_bits, + int seed, + const pg_pool_t *pool, + ObjectStore::Transaction *t) = 0; + void split_into(pg_t child_pgid, PG *child, unsigned split_bits); + void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx, + unsigned split_bits, + const pg_merge_meta_t& last_pg_merge_meta); + void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t); + + void scrub(epoch_t queued, ThreadPool::TPHandle &handle); + + bool is_scrub_registered(); + void reg_next_scrub(); + void unreg_next_scrub(); + + void on_info_history_change(); + + void scrub_requested(bool deep, bool repair, bool need_auto = false); + + bool is_forced_recovery_or_backfill() const { + return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL); + } + bool set_force_recovery(bool b); + bool set_force_backfill(bool b); + + void queue_peering_event(PGPeeringEventRef evt); + void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx); + void queue_null(epoch_t msg_epoch, epoch_t query_epoch); + void queue_flushed(epoch_t started_at); + void handle_advance_map( + OSDMapRef osdmap, OSDMapRef lastmap, + vector<int>& newup, int up_primary, + vector<int>& newacting, int acting_primary, + RecoveryCtx *rctx); + void handle_activate_map(RecoveryCtx *rctx); + void handle_initialize(RecoveryCtx *rctx); + void handle_query_state(Formatter *f); + + /** + * @param ops_begun returns how many recovery ops the function started + * @returns true if any useful work was accomplished; false otherwise + */ + virtual bool start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, + uint64_t *ops_begun) = 0; + + // more work after the above, but with a RecoveryCtx + void find_unfound(epoch_t queued, RecoveryCtx *rctx); + + virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0; + + void dump_pgstate_history(Formatter *f); + void dump_missing(Formatter *f); + + void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f); + void with_heartbeat_peers(std::function<void(int)> f); + + void shutdown(); + virtual void on_shutdown() = 0; + + bool get_must_scrub() const { + return scrubber.must_scrub; + } + bool sched_scrub(); + + virtual void do_request( + OpRequestRef& op, + ThreadPool::TPHandle &handle + ) = 0; + virtual void clear_cache() = 0; + virtual int get_cache_obj_count() = 0; + + virtual void snap_trimmer(epoch_t epoch_queued) = 0; + virtual int do_command( + cmdmap_t cmdmap, + ostream& ss, + bufferlist& idata, + bufferlist& odata, + ConnectionRef conn, + ceph_tid_t tid) = 0; + + virtual bool agent_work(int max) = 0; + virtual bool agent_work(int max, int agent_flush_quota) = 0; + virtual void agent_stop() = 0; + virtual void agent_delay() = 0; + virtual void agent_clear() = 0; + virtual void agent_choose_mode_restart() = 0; + + virtual void on_removal(ObjectStore::Transaction *t) = 0; + + ghobject_t _delete_some(ObjectStore::Transaction *t, + ghobject_t _next); + + virtual void set_dynamic_perf_stats_queries( 
+ const std::list<OSDPerfMetricQuery> &queries) { + } + virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) { + } + + // reference counting +#ifdef PG_DEBUG_REFS + uint64_t get_with_id(); + void put_with_id(uint64_t); + void dump_live_ids(); +#endif + void get(const char* tag); + void put(const char* tag); + int get_num_ref() { + return ref; + } + + // ctor + PG(OSDService *o, OSDMapRef curmap, + const PGPool &pool, spg_t p); + ~PG() override; + + // prevent copying + explicit PG(const PG& rhs) = delete; + PG& operator=(const PG& rhs) = delete; + +protected: + // ------------- + // protected + OSDService *osd; +public: + OSDShard *osd_shard = nullptr; + OSDShardPGSlot *pg_slot = nullptr; +protected: + CephContext *cct; + + // osdmap + OSDMapRef osdmap_ref; + + PGPool pool; + + // locking and reference counting. + // I destroy myself when the reference count hits zero. + // lock() should be called before doing anything. + // get() should be called on pointer copy (to another thread, etc.). + // put() should be called on destruction of some previously copied pointer. + // unlock() when done with the current pointer (_most common_). + mutable Mutex _lock = {"PG::_lock"}; + + std::atomic<unsigned int> ref{0}; + +#ifdef PG_DEBUG_REFS + Mutex _ref_id_lock = {"PG::_ref_id_lock"}; + map<uint64_t, string> _live_ids; + map<string, uint64_t> _tag_counts; + uint64_t _ref_id = 0; + + friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); } + friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); } +#endif + +private: + friend void intrusive_ptr_add_ref(PG *pg) { + pg->get("intptr"); + } + friend void intrusive_ptr_release(PG *pg) { + pg->put("intptr"); + } + + + // ===================== + +protected: + OSDriver osdriver; + SnapMapper snap_mapper; + bool eio_errors_to_process = false; + + virtual PGBackend *get_pgbackend() = 0; + virtual const PGBackend* get_pgbackend() const = 0; + +protected: + /*** PG ****/ + /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done + IsPGRecoverablePredicate *get_is_recoverable_predicate() const { + return get_pgbackend()->get_is_recoverable_predicate(); + } +protected: + epoch_t last_persisted_osdmap; + + void requeue_map_waiters(); + + void update_osdmap_ref(OSDMapRef newmap) { + ceph_assert(_lock.is_locked_by_me()); + osdmap_ref = std::move(newmap); + } + +protected: + + + bool deleting; // true while in removing or OSD is shutting down + atomic<bool> deleted = {false}; + + ZTracer::Endpoint trace_endpoint; + + +protected: + bool dirty_info, dirty_big_info; + +protected: + // pg state + pg_info_t info; ///< current pg info + pg_info_t last_written_info; ///< last written info + __u8 info_struct_v = 0; + static const __u8 latest_struct_v = 10; + // v10 is the new past_intervals encoding + // v9 was fastinfo_key addition + // v8 was the move to a per-pg pgmeta object + // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad + // (first appeared in cuttlefish). 
+ static const __u8 compat_struct_v = 10; + void upgrade(ObjectStore *store); + +protected: + PGLog pg_log; + ghobject_t pgmeta_oid; + + // ------------------ + // MissingLoc + + class MissingLoc { + public: + // a loc_count indicates how many locations we know in each of + // these distinct sets + struct loc_count_t { + int up = 0; //< up + int other = 0; //< other + + friend bool operator<(const loc_count_t& l, + const loc_count_t& r) { + return (l.up < r.up || + (l.up == r.up && + (l.other < r.other))); + } + friend ostream& operator<<(ostream& out, const loc_count_t& l) { + ceph_assert(l.up >= 0); + ceph_assert(l.other >= 0); + return out << "(" << l.up << "+" << l.other << ")"; + } + }; + + + private: + + loc_count_t _get_count(const set<pg_shard_t>& shards) { + loc_count_t r; + for (auto s : shards) { + if (pg->upset.count(s)) { + r.up++; + } else { + r.other++; + } + } + return r; + } + + map<hobject_t, pg_missing_item> needs_recovery_map; + map<hobject_t, set<pg_shard_t> > missing_loc; + set<pg_shard_t> missing_loc_sources; + + // for every entry in missing_loc, we count how many of each type of shard we have, + // and maintain totals here. The sum of the values for this map will always equal + // missing_loc.size(). + map < shard_id_t, map<loc_count_t,int> > missing_by_count; + + void pgs_by_shard_id(const set<pg_shard_t>& s, map< shard_id_t, set<pg_shard_t> >& pgsbs) { + if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) { + int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid); + // For completely missing shards initialize with empty set<pg_shard_t> + for (int i = 0 ; i < num_shards ; ++i) { + shard_id_t shard(i); + pgsbs[shard]; + } + for (auto pgs: s) + pgsbs[pgs.shard].insert(pgs); + } else { + pgsbs[shard_id_t::NO_SHARD] = s; + } + } + + void _inc_count(const set<pg_shard_t>& s) { + map< shard_id_t, set<pg_shard_t> > pgsbs; + pgs_by_shard_id(s, pgsbs); + for (auto shard: pgsbs) + ++missing_by_count[shard.first][_get_count(shard.second)]; + } + void _dec_count(const set<pg_shard_t>& s) { + map< shard_id_t, set<pg_shard_t> > pgsbs; + pgs_by_shard_id(s, pgsbs); + for (auto shard: pgsbs) { + auto p = missing_by_count[shard.first].find(_get_count(shard.second)); + ceph_assert(p != missing_by_count[shard.first].end()); + if (--p->second == 0) { + missing_by_count[shard.first].erase(p); + } + } + } + + PG *pg; + set<pg_shard_t> empty_set; + public: + boost::scoped_ptr<IsPGReadablePredicate> is_readable; + boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable; + explicit MissingLoc(PG *pg) + : pg(pg) { } + void set_backend_predicates( + IsPGReadablePredicate *_is_readable, + IsPGRecoverablePredicate *_is_recoverable) { + is_readable.reset(_is_readable); + is_recoverable.reset(_is_recoverable); + } + std::ostream& gen_prefix(std::ostream& out) const { + return pg->gen_prefix(out); + } + bool needs_recovery( + const hobject_t &hoid, + eversion_t *v = 0) const { + map<hobject_t, pg_missing_item>::const_iterator i = + needs_recovery_map.find(hoid); + if (i == needs_recovery_map.end()) + return false; + if (v) + *v = i->second.need; + return true; + } + bool is_deleted(const hobject_t &hoid) const { + auto i = needs_recovery_map.find(hoid); + if (i == needs_recovery_map.end()) + return false; + return i->second.is_delete(); + } + bool is_unfound(const hobject_t &hoid) const { + auto it = needs_recovery_map.find(hoid); + if (it == needs_recovery_map.end()) { + return false; + } + if (it->second.is_delete()) { + return false; + } + auto mit = missing_loc.find(hoid); + 
return mit == missing_loc.end() || !(*is_recoverable)(mit->second); + } + bool readable_with_acting( + const hobject_t &hoid, + const set<pg_shard_t> &acting) const; + uint64_t num_unfound() const { + uint64_t ret = 0; + for (map<hobject_t, pg_missing_item>::const_iterator i = + needs_recovery_map.begin(); + i != needs_recovery_map.end(); + ++i) { + if (i->second.is_delete()) + continue; + auto mi = missing_loc.find(i->first); + if (mi == missing_loc.end() || !(*is_recoverable)(mi->second)) + ++ret; + } + return ret; + } + + bool have_unfound() const { + for (map<hobject_t, pg_missing_item>::const_iterator i = + needs_recovery_map.begin(); + i != needs_recovery_map.end(); + ++i) { + if (i->second.is_delete()) + continue; + auto mi = missing_loc.find(i->first); + if (mi == missing_loc.end() || !(*is_recoverable)(mi->second)) + return true; + } + return false; + } + void clear() { + needs_recovery_map.clear(); + missing_loc.clear(); + missing_loc_sources.clear(); + missing_by_count.clear(); + } + + void add_location(const hobject_t &hoid, pg_shard_t location) { + auto p = missing_loc.find(hoid); + if (p == missing_loc.end()) { + p = missing_loc.emplace(hoid, set<pg_shard_t>()).first; + } else { + _dec_count(p->second); + } + p->second.insert(location); + _inc_count(p->second); + } + void remove_location(const hobject_t &hoid, pg_shard_t location) { + auto p = missing_loc.find(hoid); + if (p != missing_loc.end()) { + _dec_count(p->second); + p->second.erase(location); + if (p->second.empty()) { + missing_loc.erase(p); + } else { + _inc_count(p->second); + } + } + } + + void clear_location(const hobject_t &hoid) { + auto p = missing_loc.find(hoid); + if (p != missing_loc.end()) { + _dec_count(p->second); + missing_loc.erase(p); + } + } + + void add_active_missing(const pg_missing_t &missing) { + for (map<hobject_t, pg_missing_item>::const_iterator i = + missing.get_items().begin(); + i != missing.get_items().end(); + ++i) { + map<hobject_t, pg_missing_item>::const_iterator j = + needs_recovery_map.find(i->first); + if (j == needs_recovery_map.end()) { + needs_recovery_map.insert(*i); + } else { + lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for " + << i->first << " have " << j->second + << " tried to add " << i->second << dendl; + ceph_assert(i->second.need == j->second.need); + } + } + } + + void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) { + needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete); + } + void revise_need(const hobject_t &hoid, eversion_t need) { + auto it = needs_recovery_map.find(hoid); + ceph_assert(it != needs_recovery_map.end()); + it->second.need = need; + } + + /// Adds info about a possible recovery source + bool add_source_info( + pg_shard_t source, ///< [in] source + const pg_info_t &oinfo, ///< [in] info + const pg_missing_t &omissing, ///< [in] (optional) missing + ThreadPool::TPHandle* handle ///< [in] ThreadPool handle + ); ///< @return whether a new object location was discovered + + /// Adds recovery sources in batch + void add_batch_sources_info( + const set<pg_shard_t> &sources, ///< [in] a set of resources which can be used for all objects + ThreadPool::TPHandle* handle ///< [in] ThreadPool handle + ); + + /// Uses osdmap to update structures for now down sources + void check_recovery_sources(const OSDMapRef& osdmap); + + /// Call when hoid is no longer missing in acting set + void recovered(const hobject_t &hoid) { + needs_recovery_map.erase(hoid); + auto p = 
missing_loc.find(hoid); + if (p != missing_loc.end()) { + _dec_count(p->second); + missing_loc.erase(p); + } + } + + /// Call to update structures for hoid after a change + void rebuild( + const hobject_t &hoid, + pg_shard_t self, + const set<pg_shard_t> to_recover, + const pg_info_t &info, + const pg_missing_t &missing, + const map<pg_shard_t, pg_missing_t> &pmissing, + const map<pg_shard_t, pg_info_t> &pinfo) { + recovered(hoid); + boost::optional<pg_missing_item> item; + auto miter = missing.get_items().find(hoid); + if (miter != missing.get_items().end()) { + item = miter->second; + } else { + for (auto &&i: to_recover) { + if (i == self) + continue; + auto pmiter = pmissing.find(i); + ceph_assert(pmiter != pmissing.end()); + miter = pmiter->second.get_items().find(hoid); + if (miter != pmiter->second.get_items().end()) { + item = miter->second; + break; + } + } + } + if (!item) + return; // recovered! + + needs_recovery_map[hoid] = *item; + if (item->is_delete()) + return; + auto mliter = + missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first; + ceph_assert(info.last_backfill.is_max()); + ceph_assert(info.last_update >= item->need); + if (!missing.is_missing(hoid)) + mliter->second.insert(self); + for (auto &&i: pmissing) { + if (i.first == self) + continue; + auto pinfoiter = pinfo.find(i.first); + ceph_assert(pinfoiter != pinfo.end()); + if (item->need <= pinfoiter->second.last_update && + hoid <= pinfoiter->second.last_backfill && + !i.second.is_missing(hoid)) + mliter->second.insert(i.first); + } + _inc_count(mliter->second); + } + + const set<pg_shard_t> &get_locations(const hobject_t &hoid) const { + auto it = missing_loc.find(hoid); + return it == missing_loc.end() ? empty_set : it->second; + } + const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const { + return missing_loc; + } + const map<hobject_t, pg_missing_item> &get_needs_recovery() const { + return needs_recovery_map; + } + const map < shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const { + return missing_by_count; + } + } missing_loc; + + PastIntervals past_intervals; + + interval_set<snapid_t> snap_trimq; + + /* You should not use these items without taking their respective queue locks + * (if they have one) */ + xlist<PG*>::item stat_queue_item; + bool scrub_queued; + bool recovery_queued; + + int recovery_ops_active; + set<pg_shard_t> waiting_on_backfill; +#ifdef DEBUG_RECOVERY_OIDS + multiset<hobject_t> recovering_oids; +#endif + +protected: + int role; // 0 = primary, 1 = replica, -1=none. + uint64_t state; // PG_STATE_* + + bool send_notify; ///< true if we are non-primary and should notify the primary + +protected: + eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active() + eversion_t last_complete_ondisk; // last_complete that has committed. + eversion_t last_update_applied; + + // entries <= last_rollback_info_trimmed_to_applied have been trimmed + eversion_t last_rollback_info_trimmed_to_applied; + + // primary state +protected: + pg_shard_t primary; + pg_shard_t pg_whoami; + pg_shard_t up_primary; + vector<int> up, acting, want_acting; + // acting_recovery_backfill contains shards that are acting, + // async recovery targets, or backfill targets. 
+ set<pg_shard_t> acting_recovery_backfill, actingset, upset; + map<pg_shard_t,eversion_t> peer_last_complete_ondisk; + eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk + eversion_t pg_trim_to; + + set<int> blocked_by; ///< osds we are blocked by (for pg stats) + +protected: + // [primary only] content recovery state + struct BufferedRecoveryMessages { + map<int, map<spg_t, pg_query_t> > query_map; + map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map; + map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list; + }; + +public: + bool dne() { return info.dne(); } + struct RecoveryCtx { + utime_t start_time; + map<int, map<spg_t, pg_query_t> > *query_map; + map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map; + map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list; + ObjectStore::Transaction *transaction; + ThreadPool::TPHandle* handle; + RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map, + map<int, + vector<pair<pg_notify_t, PastIntervals> > > *info_map, + map<int, + vector<pair<pg_notify_t, PastIntervals> > > *notify_list, + ObjectStore::Transaction *transaction) + : query_map(query_map), info_map(info_map), + notify_list(notify_list), + transaction(transaction), + handle(NULL) {} + + RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx) + : query_map(&(buf.query_map)), + info_map(&(buf.info_map)), + notify_list(&(buf.notify_list)), + transaction(rctx.transaction), + handle(rctx.handle) {} + + void accept_buffered_messages(BufferedRecoveryMessages &m) { + ceph_assert(query_map); + ceph_assert(info_map); + ceph_assert(notify_list); + for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin(); + i != m.query_map.end(); + ++i) { + map<spg_t, pg_query_t> &omap = (*query_map)[i->first]; + for (map<spg_t, pg_query_t>::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + omap[j->first] = j->second; + } + } + for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i + = m.info_map.begin(); + i != m.info_map.end(); + ++i) { + vector<pair<pg_notify_t, PastIntervals> > &ovec = + (*info_map)[i->first]; + ovec.reserve(ovec.size() + i->second.size()); + ovec.insert(ovec.end(), i->second.begin(), i->second.end()); + } + for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i + = m.notify_list.begin(); + i != m.notify_list.end(); + ++i) { + vector<pair<pg_notify_t, PastIntervals> > &ovec = + (*notify_list)[i->first]; + ovec.reserve(ovec.size() + i->second.size()); + ovec.insert(ovec.end(), i->second.begin(), i->second.end()); + } + } + + void send_notify(pg_shard_t to, + const pg_notify_t &info, const PastIntervals &pi) { + ceph_assert(notify_list); + (*notify_list)[to.osd].push_back(make_pair(info, pi)); + } + }; +protected: + + PGStateHistory pgstate_history; + + struct NamedState { + const char *state_name; + utime_t enter_time; + PG* pg; + const char *get_state_name() { return state_name; } + NamedState(PG *pg_, const char *state_name_) + : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) { + pg->pgstate_history.enter(pg, enter_time, state_name); + } + virtual ~NamedState() { pg->pgstate_history.exit(state_name); } + }; + + + +protected: + + /* + * peer_info -- projected (updates _before_ replicas ack) + * peer_missing -- committed (updates _after_ replicas ack) + */ + + bool need_up_thru; + set<pg_shard_t> stray_set; // non-acting osds that have PG data. 
+ map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior) + map<pg_shard_t, int64_t> peer_bytes; // Peer's num_bytes from peer_info + set<pg_shard_t> peer_purged; // peers purged + map<pg_shard_t, pg_missing_t> peer_missing; + set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps) + set<pg_shard_t> peer_missing_requested; + + // i deleted these strays; ignore racing PGInfo from them + set<pg_shard_t> peer_activated; + + // primary-only, recovery-only state + set<pg_shard_t> might_have_unfound; // These osds might have objects on them + // which are unfound on the primary + epoch_t last_peering_reset; + + epoch_t get_last_peering_reset() const { + return last_peering_reset; + } + + /* heartbeat peers */ + void set_probe_targets(const set<pg_shard_t> &probe_set); + void clear_probe_targets(); + + Mutex heartbeat_peer_lock; + set<int> heartbeat_peers; + set<int> probe_targets; + +public: + /** + * BackfillInterval + * + * Represents the objects in a range [begin, end) + * + * Possible states: + * 1) begin == end == hobject_t() indicates the the interval is unpopulated + * 2) Else, objects contains all objects in [begin, end) + */ + struct BackfillInterval { + // info about a backfill interval on a peer + eversion_t version; /// version at which the scan occurred + map<hobject_t,eversion_t> objects; + hobject_t begin; + hobject_t end; + + /// clear content + void clear() { + *this = BackfillInterval(); + } + + /// clear objects list only + void clear_objects() { + objects.clear(); + } + + /// reinstantiate with a new start+end position and sort order + void reset(hobject_t start) { + clear(); + begin = end = start; + } + + /// true if there are no objects in this interval + bool empty() const { + return objects.empty(); + } + + /// true if interval extends to the end of the range + bool extends_to_end() const { + return end.is_max(); + } + + /// removes items <= soid and adjusts begin to the first object + void trim_to(const hobject_t &soid) { + trim(); + while (!objects.empty() && + objects.begin()->first <= soid) { + pop_front(); + } + } + + /// Adjusts begin to the first object + void trim() { + if (!objects.empty()) + begin = objects.begin()->first; + else + begin = end; + } + + /// drop first entry, and adjust @begin accordingly + void pop_front() { + ceph_assert(!objects.empty()); + objects.erase(objects.begin()); + trim(); + } + + /// dump + void dump(Formatter *f) const { + f->dump_stream("begin") << begin; + f->dump_stream("end") << end; + f->open_array_section("objects"); + for (map<hobject_t, eversion_t>::const_iterator i = + objects.begin(); + i != objects.end(); + ++i) { + f->open_object_section("object"); + f->dump_stream("object") << i->first; + f->dump_stream("version") << i->second; + f->close_section(); + } + f->close_section(); + } + }; + +protected: + BackfillInterval backfill_info; + map<pg_shard_t, BackfillInterval> peer_backfill_info; + bool backfill_reserved; + bool backfill_reserving; + + set<pg_shard_t> backfill_targets, async_recovery_targets; + + // The primary's num_bytes and local num_bytes for this pg, only valid + // during backfill for non-primary shards. 
+ // Both of these are adjusted for EC to reflect the on-disk bytes + std::atomic<int64_t> primary_num_bytes = 0; + std::atomic<int64_t> local_num_bytes = 0; + +public: + bool is_backfill_targets(pg_shard_t osd) { + return backfill_targets.count(osd); + } + + // Space reserved for backfill is primary_num_bytes - local_num_bytes + // Don't care that difference itself isn't atomic + uint64_t get_reserved_num_bytes() { + int64_t primary = primary_num_bytes.load(); + int64_t local = local_num_bytes.load(); + if (primary > local) + return primary - local; + else + return 0; + } + + bool is_remote_backfilling() { + return primary_num_bytes.load() > 0; + } + + void set_reserved_num_bytes(int64_t primary, int64_t local); + void clear_reserved_num_bytes(); + + // If num_bytes are inconsistent and local_num- goes negative + // it's ok, because it would then be ignored. + + // The value of num_bytes could be negative, + // but we don't let local_num_bytes go negative. + void add_local_num_bytes(int64_t num_bytes) { + if (num_bytes) { + int64_t prev_bytes = local_num_bytes.load(); + int64_t new_bytes; + do { + new_bytes = prev_bytes + num_bytes; + if (new_bytes < 0) + new_bytes = 0; + } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes)); + } + } + void sub_local_num_bytes(int64_t num_bytes) { + ceph_assert(num_bytes >= 0); + if (num_bytes) { + int64_t prev_bytes = local_num_bytes.load(); + int64_t new_bytes; + do { + new_bytes = prev_bytes - num_bytes; + if (new_bytes < 0) + new_bytes = 0; + } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes)); + } + } + // The value of num_bytes could be negative, + // but we don't let info.stats.stats.sum.num_bytes go negative. + void add_num_bytes(int64_t num_bytes) { + ceph_assert(_lock.is_locked_by_me()); + if (num_bytes) { + info.stats.stats.sum.num_bytes += num_bytes; + if (info.stats.stats.sum.num_bytes < 0) { + info.stats.stats.sum.num_bytes = 0; + } + } + } + void sub_num_bytes(int64_t num_bytes) { + ceph_assert(_lock.is_locked_by_me()); + ceph_assert(num_bytes >= 0); + if (num_bytes) { + info.stats.stats.sum.num_bytes -= num_bytes; + if (info.stats.stats.sum.num_bytes < 0) { + info.stats.stats.sum.num_bytes = 0; + } + } + } + + // Only used in testing so not worried about needing the PG lock here + int64_t get_stats_num_bytes() { + Mutex::Locker l(_lock); + int num_bytes = info.stats.stats.sum.num_bytes; + if (pool.info.is_erasure()) { + num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); + // Round up each object by a stripe + num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects; + } + int64_t lnb = local_num_bytes.load(); + if (lnb && lnb != num_bytes) { + lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch " + << lnb << " vs stats " + << info.stats.stats.sum.num_bytes << " / chunk " + << get_pgbackend()->get_ec_data_chunk_count() + << dendl; + } + return num_bytes; + } + +protected: + + /* + * blocked request wait hierarchy + * + * In order to preserve request ordering we need to be careful about the + * order in which blocked requests get requeued. Generally speaking, we + * push the requests back up to the op_wq in reverse order (most recent + * request first) so that they come back out again in the original order. + * However, because there are multiple wait queues, we need to requeue + * waitlists in order. Generally speaking, we requeue the wait lists + * that are checked first. 
+ * + * Here are the various wait lists, in the order they are used during + * request processing, with notes: + * + * - waiting_for_map + * - may start or stop blocking at any time (depending on client epoch) + * - waiting_for_peered + * - !is_peered() or flushes_in_progress + * - only starts blocking on interval change; never restarts + * - waiting_for_active + * - !is_active() + * - only starts blocking on interval change; never restarts + * - waiting_for_flush + * - is_active() and flushes_in_progress + * - waiting for final flush during activate + * - waiting_for_scrub + * - starts and stops blocking for varying intervals during scrub + * - waiting_for_unreadable_object + * - never restarts once object is readable (* except for EIO?) + * - waiting_for_degraded_object + * - never restarts once object is writeable (* except for EIO?) + * - waiting_for_blocked_object + * - starts and stops based on proxied op activity + * - obc rwlocks + * - starts and stops based on read/write activity + * + * Notes: + * + * 1. During an interval change, we requeue *everything* in the above order. + * + * 2. When an obc rwlock is released, we check for a scrub block and requeue + * the op there if it applies. We ignore the unreadable/degraded/blocked + * queues because we assume they cannot apply at that time (this is + * probably mostly true). + * + * 3. The requeue_ops helper will push ops onto the waiting_for_map list if + * it is non-empty. + * + * These three behaviors are generally sufficient to maintain ordering, with + * the possible exception of cases where we make an object degraded or + * unreadable that was previously okay, e.g. when scrub or op processing + * encounters an unexpected error. FIXME. + */ + + // pg waiters + unsigned flushes_in_progress; + + // ops with newer maps than ours (or blocked behind them) + // track these by client, since inter-request ordering doesn't otherwise + // matter.
+ unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map; + + // ops waiting on peered + list<OpRequestRef> waiting_for_peered; + + // ops waiting on active (require peered as well) + list<OpRequestRef> waiting_for_active; + list<OpRequestRef> waiting_for_flush; + list<OpRequestRef> waiting_for_scrub; + + list<OpRequestRef> waiting_for_cache_not_full; + list<OpRequestRef> waiting_for_clean_to_primary_repair; + map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object, + waiting_for_degraded_object, + waiting_for_blocked_object; + + set<hobject_t> objects_blocked_on_cache_full; + map<hobject_t,snapid_t> objects_blocked_on_degraded_snap; + map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion; + + // Callbacks should assume pg (and nothing else) is locked + map<hobject_t, list<Context*>> callbacks_for_degraded_object; + + map<eversion_t, + list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk; + + void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m); + void requeue_op(OpRequestRef op); + void requeue_ops(list<OpRequestRef> &l); + + // stats that persist lazily + object_stat_collection_t unstable_stats; + + // publish stats + Mutex pg_stats_publish_lock; + bool pg_stats_publish_valid; + pg_stat_t pg_stats_publish; + + void _update_calc_stats(); + void _update_blocked_by(); + friend class TestOpsSocketHook; + void publish_stats_to_osd(); + void clear_publish_stats(); + + void clear_primary_state(); + + bool is_acting_recovery_backfill(pg_shard_t osd) const { + return acting_recovery_backfill.count(osd); + } + bool is_acting(pg_shard_t osd) const { + return has_shard(pool.info.is_erasure(), acting, osd); + } + bool is_up(pg_shard_t osd) const { + return has_shard(pool.info.is_erasure(), up, osd); + } + static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) { + if (ec) { + return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd; + } else { + return std::find(v.begin(), v.end(), osd.osd) != v.end(); + } + } + + bool needs_recovery() const; + bool needs_backfill() const; + + /// clip calculated priority to reasonable range + int clamp_recovery_priority(int prio, int pool_recovery_prio, int max); + /// get log recovery reservation priority + unsigned get_recovery_priority(); + /// get backfill reservation priority + unsigned get_backfill_priority(); + /// get priority for pg deletion + unsigned get_delete_priority(); + + void try_mark_clean(); ///< mark an active pg clean + + /// return [start,end) bounds for required past_intervals + static pair<epoch_t, epoch_t> get_required_past_interval_bounds( + const pg_info_t &info, + epoch_t oldest_map) { + epoch_t start = std::max( + info.history.last_epoch_clean ? 
info.history.last_epoch_clean : + info.history.epoch_pool_created, + oldest_map); + epoch_t end = std::max( + info.history.same_interval_since, + info.history.epoch_pool_created); + return make_pair(start, end); + } + void check_past_interval_bounds() const; + PastIntervals::PriorSet build_prior(); + + void remove_down_peer_info(const OSDMapRef osdmap); + + bool adjust_need_up_thru(const OSDMapRef osdmap); + + bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const; + virtual void dump_recovery_info(Formatter *f) const = 0; + + void calc_min_last_complete_ondisk() { + eversion_t min = last_complete_ondisk; + ceph_assert(!acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == get_primary()) continue; + if (peer_last_complete_ondisk.count(*i) == 0) + return; // we don't have complete info + eversion_t a = peer_last_complete_ondisk[*i]; + if (a < min) + min = a; + } + if (min == min_last_complete_ondisk) + return; + min_last_complete_ondisk = min; + return; + } + + virtual void calc_trim_to() = 0; + + virtual void calc_trim_to_aggressive() = 0; + + void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog, + pg_missing_t& omissing, pg_shard_t from); + void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, + pg_missing_t& omissing, pg_shard_t from); + bool proc_replica_info( + pg_shard_t from, const pg_info_t &info, epoch_t send_epoch); + + struct PGLogEntryHandler : public PGLog::LogEntryHandler { + PG *pg; + ObjectStore::Transaction *t; + PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {} + + // LogEntryHandler + void remove(const hobject_t &hoid) override { + pg->get_pgbackend()->remove(hoid, t); + } + void try_stash(const hobject_t &hoid, version_t v) override { + pg->get_pgbackend()->try_stash(hoid, v, t); + } + void rollback(const pg_log_entry_t &entry) override { + ceph_assert(entry.can_rollback()); + pg->get_pgbackend()->rollback(entry, t); + } + void rollforward(const pg_log_entry_t &entry) override { + pg->get_pgbackend()->rollforward(entry, t); + } + void trim(const pg_log_entry_t &entry) override { + pg->get_pgbackend()->trim(entry, t); + } + }; + + void update_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid, + const set<snapid_t> &snaps); + void clear_object_snap_mapping( + ObjectStore::Transaction *t, const hobject_t &soid); + void remove_snap_mapped_object( + ObjectStore::Transaction& t, const hobject_t& soid); + void merge_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, + pg_log_t &olog, pg_shard_t from); + void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead); + bool search_for_missing( + const pg_info_t &oinfo, const pg_missing_t &omissing, + pg_shard_t fromosd, + RecoveryCtx*); + + void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map); + + map<pg_shard_t, pg_info_t>::const_iterator find_best_info( + const map<pg_shard_t, pg_info_t> &infos, + bool restrict_to_up_acting, + bool *history_les_bound) const; + static void calc_ec_acting( + map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, + unsigned size, + const vector<int> &acting, + const vector<int> &up, + const map<pg_shard_t, pg_info_t> &all_info, + bool restrict_to_up_acting, + vector<int> *want, + set<pg_shard_t> *backfill, + set<pg_shard_t> *acting_backfill, + ostream &ss); + static void calc_replicated_acting( + map<pg_shard_t, pg_info_t>::const_iterator 
auth_log_shard, + uint64_t force_auth_primary_missing_objects, + unsigned size, + const vector<int> &acting, + const vector<int> &up, + pg_shard_t up_primary, + const map<pg_shard_t, pg_info_t> &all_info, + bool restrict_to_up_acting, + vector<int> *want, + set<pg_shard_t> *backfill, + set<pg_shard_t> *acting_backfill, + const OSDMapRef osdmap, + ostream &ss); + void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info, + const pg_info_t &auth_info, + vector<int> *want, + set<pg_shard_t> *async_recovery, + const OSDMapRef osdmap) const; + void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info, + const pg_info_t &auth_info, + vector<int> *want, + set<pg_shard_t> *async_recovery, + const OSDMapRef osdmap) const; + + bool recoverable_and_ge_min_size(const vector<int> &want) const; + bool choose_acting(pg_shard_t &auth_log_shard, + bool restrict_to_up_acting, + bool *history_les_bound); + void build_might_have_unfound(); + void activate( + ObjectStore::Transaction& t, + epoch_t activation_epoch, + map<int, map<spg_t,pg_query_t> >& query_map, + map<int, + vector<pair<pg_notify_t, PastIntervals> > > *activator_map, + RecoveryCtx *ctx); + + struct C_PG_ActivateCommitted : public Context { + PGRef pg; + epoch_t epoch; + epoch_t activation_epoch; + C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae) + : pg(p), epoch(e), activation_epoch(ae) {} + void finish(int r) override { + pg->_activate_committed(epoch, activation_epoch); + } + }; + void _activate_committed(epoch_t epoch, epoch_t activation_epoch); + void all_activated_and_committed(); + + void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info); + + bool have_unfound() const { + return missing_loc.have_unfound(); + } + uint64_t get_num_unfound() const { + return missing_loc.num_unfound(); + } + bool all_missing_unfound() const { + const auto& missing = pg_log.get_missing(); + if (!missing.have_missing()) + return false; + for (auto& m : missing.get_items()) { + if (!missing_loc.is_unfound(m.first)) + return false; + } + return true; + } + + virtual void check_local() = 0; + + void purge_strays(); + + void update_heartbeat_peers(); + + Context *finish_sync_event; + + Context *finish_recovery(); + void _finish_recovery(Context *c); + struct C_PG_FinishRecovery : public Context { + PGRef pg; + explicit C_PG_FinishRecovery(PG *p) : pg(p) {} + void finish(int r) override { + pg->_finish_recovery(this); + } + }; + void cancel_recovery(); + void clear_recovery_state(); + virtual void _clear_recovery_state() = 0; + virtual void check_recovery_sources(const OSDMapRef& newmap) = 0; + void start_recovery_op(const hobject_t& soid); + void finish_recovery_op(const hobject_t& soid, bool dequeue=false); + + virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0; + + friend class C_OSD_RepModify_Commit; + friend class C_DeleteMore; + + // -- backoff -- + Mutex backoff_lock; // orders inside Backoff::lock + map<hobject_t,set<BackoffRef>> backoffs; + + void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end); + void release_backoffs(const hobject_t& begin, const hobject_t& end); + void release_backoffs(const hobject_t& o) { + release_backoffs(o, o); + } + void clear_backoffs(); + + void add_pg_backoff(SessionRef s) { + hobject_t begin = info.pgid.pgid.get_hobj_start(); + hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + add_backoff(s, begin, end); + } +public: + void release_pg_backoffs() { + hobject_t begin = 
info.pgid.pgid.get_hobj_start(); + hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + release_backoffs(begin, end); + } +protected: + + // -- scrub -- +public: + struct Scrubber { + Scrubber(); + ~Scrubber(); + + // metadata + set<pg_shard_t> reserved_peers; + bool local_reserved, remote_reserved, reserve_failed; + epoch_t epoch_start; + + // common to both scrubs + bool active; + set<pg_shard_t> waiting_on_whom; + int shallow_errors; + int deep_errors; + int fixed; + ScrubMap primary_scrubmap; + ScrubMapBuilder primary_scrubmap_pos; + epoch_t replica_scrub_start = 0; + ScrubMap replica_scrubmap; + ScrubMapBuilder replica_scrubmap_pos; + map<pg_shard_t, ScrubMap> received_maps; + OpRequestRef active_rep_scrub; + utime_t scrub_reg_stamp; // stamp we registered for + + static utime_t scrub_must_stamp() { return utime_t(0,1); } + + omap_stat_t omap_stats = (const struct omap_stat_t){ 0 }; + + // For async sleep + bool sleeping = false; + bool needs_sleep = true; + utime_t sleep_start; + + // flags to indicate explicitly requested scrubs (by admin) + bool must_scrub, must_deep_scrub, must_repair, need_auto, req_scrub; + + // Priority to use for scrub scheduling + unsigned priority = 0; + + bool time_for_deep; + // this flag indicates whether we would like to do auto-repair of the PG or not + bool auto_repair; + // this flag indicates that we are scrubbing post repair to verify everything is fixed + bool check_repair; + // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors, + // we should deep scrub in order to auto repair + bool deep_scrub_on_error; + + // Maps from objects with errors to missing/inconsistent peers + map<hobject_t, set<pg_shard_t>> missing; + map<hobject_t, set<pg_shard_t>> inconsistent; + + // Map from object with errors to good peers + map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative; + + // Cleaned map pending snap metadata scrub + ScrubMap cleaned_meta_map; + + void clean_meta_map(ScrubMap &for_meta_scrub) { + if (end.is_max() || + cleaned_meta_map.objects.empty()) { + cleaned_meta_map.swap(for_meta_scrub); + } else { + auto iter = cleaned_meta_map.objects.end(); + --iter; // not empty, see if clause + auto begin = cleaned_meta_map.objects.begin(); + if (iter->first.has_snapset()) { + ++iter; + } else { + while (iter != begin) { + auto next = iter--; + if (next->first.get_head() != iter->first.get_head()) { + ++iter; + break; + } + } + } + for_meta_scrub.objects.insert(begin, iter); + cleaned_meta_map.objects.erase(begin, iter); + } + } + + // digest updates which we are waiting on + int num_digest_updates_pending; + + // chunky scrub + hobject_t start, end; // [start,end) + hobject_t max_end; // Largest end that may have been sent to replicas + eversion_t subset_last_update; + + // chunky scrub state + enum State { + INACTIVE, + NEW_CHUNK, + WAIT_PUSHES, + WAIT_LAST_UPDATE, + BUILD_MAP, + BUILD_MAP_DONE, + WAIT_REPLICAS, + COMPARE_MAPS, + WAIT_DIGEST_UPDATES, + FINISH, + BUILD_MAP_REPLICA, + } state; + + std::unique_ptr<Scrub::Store> store; + // deep scrub + bool deep; + int preempt_left; + int preempt_divisor; + + list<Context*> callbacks; + void add_callback(Context *context) { + callbacks.push_back(context); + } + void run_callbacks() { + list<Context*> to_run; + to_run.swap(callbacks); + for (list<Context*>::iterator i = to_run.begin(); + i != to_run.end(); + ++i) { + (*i)->complete(0); + } + } + + static const char *state_string(const PG::Scrubber::State& state) { + const char *ret 
= NULL;
+      switch( state )
+      {
+        case INACTIVE: ret = "INACTIVE"; break;
+        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
+        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
+        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
+        case BUILD_MAP: ret = "BUILD_MAP"; break;
+        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
+        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
+        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
+        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
+        case FINISH: ret = "FINISH"; break;
+        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
+      }
+      return ret;
+    }
+
+    bool is_chunky_scrub_active() const { return state != INACTIVE; }
+
+    // clear all state
+    void reset() {
+      active = false;
+      waiting_on_whom.clear();
+      if (active_rep_scrub) {
+        active_rep_scrub = OpRequestRef();
+      }
+      received_maps.clear();
+
+      must_scrub = false;
+      must_deep_scrub = false;
+      must_repair = false;
+      need_auto = false;
+      req_scrub = false;
+      time_for_deep = false;
+      auto_repair = false;
+      check_repair = false;
+      deep_scrub_on_error = false;
+
+      state = PG::Scrubber::INACTIVE;
+      start = hobject_t();
+      end = hobject_t();
+      max_end = hobject_t();
+      subset_last_update = eversion_t();
+      shallow_errors = 0;
+      deep_errors = 0;
+      fixed = 0;
+      omap_stats = (const struct omap_stat_t){ 0 };
+      deep = false;
+      run_callbacks();
+      inconsistent.clear();
+      missing.clear();
+      authoritative.clear();
+      num_digest_updates_pending = 0;
+      primary_scrubmap = ScrubMap();
+      primary_scrubmap_pos.reset();
+      replica_scrubmap = ScrubMap();
+      replica_scrubmap_pos.reset();
+      cleaned_meta_map = ScrubMap();
+      sleeping = false;
+      needs_sleep = true;
+      sleep_start = utime_t();
+    }
+
+    void create_results(const hobject_t& obj);
+    void cleanup_store(ObjectStore::Transaction *t);
+  } scrubber;
+
+protected:
+  bool scrub_after_recovery;
+  bool save_req_scrub; // Saved for scrub_after_recovery
+
+  int active_pushes;
+
+  bool scrub_can_preempt = false;
+  bool scrub_preempted = false;
+
+  // we allow some number of preemptions of the scrub, which means we do
+  // not block. then we start to block. once we start blocking, we do
+  // not stop until the scrub range is completed.
+ bool write_blocked_by_scrub(const hobject_t &soid); + + /// true if the given range intersects the scrub interval in any way + bool range_intersects_scrub(const hobject_t &start, const hobject_t& end); + + void repair_object( + const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers, + pg_shard_t bad_peer); + + void abort_scrub(); + void chunky_scrub(ThreadPool::TPHandle &handle); + void scrub_compare_maps(); + /** + * return true if any inconsistency/missing is repaired, false otherwise + */ + bool scrub_process_inconsistent(); + bool ops_blocked_by_scrub() const; + void scrub_finish(); + void scrub_clear_state(bool keep_repair = false); + void _scan_snaps(ScrubMap &map); + void _repair_oinfo_oid(ScrubMap &map); + void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs); + void _request_scrub_map(pg_shard_t replica, eversion_t version, + hobject_t start, hobject_t end, bool deep, + bool allow_preemption); + int build_scrub_map_chunk( + ScrubMap &map, + ScrubMapBuilder &pos, + hobject_t start, hobject_t end, bool deep, + ThreadPool::TPHandle &handle); + /** + * returns true if [begin, end) is good to scrub at this time + * a false return value obliges the implementer to requeue scrub when the + * condition preventing scrub clears + */ + virtual bool _range_available_for_scrub( + const hobject_t &begin, const hobject_t &end) = 0; + virtual void scrub_snapshot_metadata( + ScrubMap &map, + const std::map<hobject_t, + pair<boost::optional<uint32_t>, + boost::optional<uint32_t>>> &missing_digest) { } + virtual void _scrub_clear_state() { } + virtual void _scrub_finish() { } + void clear_scrub_reserved(); + void scrub_reserve_replicas(); + void scrub_unreserve_replicas(); + bool scrub_all_replicas_reserved() const; + + void replica_scrub( + OpRequestRef op, + ThreadPool::TPHandle &handle); + void do_replica_scrub_map(OpRequestRef op); + + void handle_scrub_reserve_request(OpRequestRef op); + void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from); + void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from); + void handle_scrub_reserve_release(OpRequestRef op); + + void reject_reservation(); + void schedule_backfill_retry(float retry); + void schedule_recovery_retry(float retry); + + // -- recovery state -- + + template <class EVT> + struct QueuePeeringEvt : Context { + PGRef pg; + epoch_t epoch; + EVT evt; + QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) : + pg(pg), epoch(epoch), evt(evt) {} + void finish(int r) override { + pg->lock(); + pg->queue_peering_event(PGPeeringEventRef( + new PGPeeringEvent( + epoch, + epoch, + evt))); + pg->unlock(); + } + }; + + + struct QueryState : boost::statechart::event< QueryState > { + Formatter *f; + explicit QueryState(Formatter *f) : f(f) {} + void print(std::ostream *out) const { + *out << "Query"; + } + }; + +public: + int pg_stat_adjust(osd_stat_t *new_stat); +protected: + + struct AdvMap : boost::statechart::event< AdvMap > { + OSDMapRef osdmap; + OSDMapRef lastmap; + vector<int> newup, newacting; + int up_primary, acting_primary; + AdvMap( + OSDMapRef osdmap, OSDMapRef lastmap, + vector<int>& newup, int up_primary, + vector<int>& newacting, int acting_primary): + osdmap(osdmap), lastmap(lastmap), + newup(newup), + newacting(newacting), + up_primary(up_primary), + acting_primary(acting_primary) {} + void print(std::ostream *out) const { + *out << "AdvMap"; + } + }; + + struct ActMap : boost::statechart::event< ActMap > { + ActMap() : boost::statechart::event< ActMap >() {} + void print(std::ostream *out) 
const { + *out << "ActMap"; + } + }; + struct Activate : boost::statechart::event< Activate > { + epoch_t activation_epoch; + explicit Activate(epoch_t q) : boost::statechart::event< Activate >(), + activation_epoch(q) {} + void print(std::ostream *out) const { + *out << "Activate from " << activation_epoch; + } + }; +public: + struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> { + explicit UnfoundBackfill() {} + void print(std::ostream *out) const { + *out << "UnfoundBackfill"; + } + }; + struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> { + explicit UnfoundRecovery() {} + void print(std::ostream *out) const { + *out << "UnfoundRecovery"; + } + }; + + struct RequestScrub : boost::statechart::event<RequestScrub> { + bool deep; + bool repair; + explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {} + void print(std::ostream *out) const { + *out << "RequestScrub(" << (deep ? "deep" : "shallow") + << (repair ? " repair" : ""); + } + }; + +protected: + TrivialEvent(Initialize) + TrivialEvent(GotInfo) + TrivialEvent(NeedUpThru) + TrivialEvent(Backfilled) + TrivialEvent(LocalBackfillReserved) + TrivialEvent(RejectTooFullRemoteReservation) + public: + TrivialEvent(RequestBackfill) + protected: + TrivialEvent(RemoteRecoveryPreempted) + TrivialEvent(RemoteBackfillPreempted) + TrivialEvent(BackfillTooFull) + TrivialEvent(RecoveryTooFull) + + TrivialEvent(MakePrimary) + TrivialEvent(MakeStray) + TrivialEvent(NeedActingChange) + TrivialEvent(IsIncomplete) + TrivialEvent(IsDown) + + TrivialEvent(AllReplicasRecovered) + TrivialEvent(DoRecovery) + TrivialEvent(LocalRecoveryReserved) + public: + protected: + TrivialEvent(AllRemotesReserved) + TrivialEvent(AllBackfillsReserved) + TrivialEvent(GoClean) + + TrivialEvent(AllReplicasActivated) + + TrivialEvent(IntervalFlush) + + public: + TrivialEvent(DeleteStart) + TrivialEvent(DeleteSome) + + TrivialEvent(SetForceRecovery) + TrivialEvent(UnsetForceRecovery) + TrivialEvent(SetForceBackfill) + TrivialEvent(UnsetForceBackfill) + + protected: + TrivialEvent(DeleteReserved) + TrivialEvent(DeleteInterrupted) + + /* Encapsulates PG recovery process */ + class RecoveryState { + void start_handle(RecoveryCtx *new_ctx); + void end_handle(); + public: + void begin_block_outgoing(); + void end_block_outgoing(); + void clear_blocked_outgoing(); + private: + + /* States */ + struct Initial; + class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > { + RecoveryState *state; + public: + PG *pg; + + utime_t event_time; + uint64_t event_count; + + void clear_event_counters() { + event_time = utime_t(); + event_count = 0; + } + + void log_enter(const char *state_name); + void log_exit(const char *state_name, utime_t duration); + + RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {} + + /* Accessor functions for state methods */ + ObjectStore::Transaction* get_cur_transaction() { + ceph_assert(state->rctx); + ceph_assert(state->rctx->transaction); + return state->rctx->transaction; + } + + void send_query(pg_shard_t to, const pg_query_t &query) { + ceph_assert(state->rctx); + ceph_assert(state->rctx->query_map); + (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] = + query; + } + + map<int, map<spg_t, pg_query_t> > *get_query_map() { + ceph_assert(state->rctx); + ceph_assert(state->rctx->query_map); + return state->rctx->query_map; + } + + map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() { + ceph_assert(state->rctx); + 
ceph_assert(state->rctx->info_map); + return state->rctx->info_map; + } + + RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); } + + void send_notify(pg_shard_t to, + const pg_notify_t &info, const PastIntervals &pi) { + ceph_assert(state->rctx); + state->rctx->send_notify(to, info, pi); + } + }; + friend class RecoveryMachine; + + /* States */ + // Initial + // Reset + // Start + // Started + // Primary + // WaitActingChange + // Peering + // GetInfo + // GetLog + // GetMissing + // WaitUpThru + // Incomplete + // Active + // Activating + // Clean + // Recovered + // Backfilling + // WaitRemoteBackfillReserved + // WaitLocalBackfillReserved + // NotBackfilling + // NotRecovering + // Recovering + // WaitRemoteRecoveryReserved + // WaitLocalRecoveryReserved + // ReplicaActive + // RepNotRecovering + // RepRecovering + // RepWaitBackfillReserved + // RepWaitRecoveryReserved + // Stray + // ToDelete + // WaitDeleteReserved + // Deleting + // Crashed + + struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState { + explicit Crashed(my_context ctx); + }; + + struct Reset; + + struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState { + explicit Initial(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::transition< Initialize, Reset >, + boost::statechart::custom_reaction< NullEvt >, + boost::statechart::transition< boost::statechart::event_base, Crashed > + > reactions; + + boost::statechart::result react(const MNotifyRec&); + boost::statechart::result react(const MInfoRec&); + boost::statechart::result react(const MLogRec&); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState { + explicit Reset(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< NullEvt >, + boost::statechart::custom_reaction< IntervalFlush >, + boost::statechart::transition< boost::statechart::event_base, Crashed > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const IntervalFlush&); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Start; + + struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState { + explicit Started(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< IntervalFlush >, + // ignored + boost::statechart::custom_reaction< NullEvt >, + boost::statechart::custom_reaction<SetForceRecovery>, + boost::statechart::custom_reaction<UnsetForceRecovery>, + boost::statechart::custom_reaction<SetForceBackfill>, + boost::statechart::custom_reaction<UnsetForceBackfill>, + boost::statechart::custom_reaction<RequestScrub>, + // crash + boost::statechart::transition< boost::statechart::event_base, Crashed > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const IntervalFlush&); + 
boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Primary; + struct Stray; + + struct Start : boost::statechart::state< Start, Started >, NamedState { + explicit Start(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::transition< MakePrimary, Primary >, + boost::statechart::transition< MakeStray, Stray > + > reactions; + }; + + struct Peering; + struct WaitActingChange; + struct Incomplete; + struct Down; + + struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState { + explicit Primary(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction<SetForceRecovery>, + boost::statechart::custom_reaction<UnsetForceRecovery>, + boost::statechart::custom_reaction<SetForceBackfill>, + boost::statechart::custom_reaction<UnsetForceBackfill>, + boost::statechart::custom_reaction<RequestScrub> + > reactions; + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const MNotifyRec&); + boost::statechart::result react(const SetForceRecovery&); + boost::statechart::result react(const UnsetForceRecovery&); + boost::statechart::result react(const SetForceBackfill&); + boost::statechart::result react(const UnsetForceBackfill&); + boost::statechart::result react(const RequestScrub&); + }; + + struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>, + NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MNotifyRec > + > reactions; + explicit WaitActingChange(my_context ctx); + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const MLogRec&); + boost::statechart::result react(const MInfoRec&); + boost::statechart::result react(const MNotifyRec&); + void exit(); + }; + + struct GetInfo; + struct Active; + + struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState { + PastIntervals::PriorSet prior_set; + bool history_les_bound; //< need osd_find_best_info_ignore_history_les + + explicit Peering(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::transition< Activate, Active >, + boost::statechart::custom_reaction< AdvMap > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const AdvMap &advmap); + }; + + struct WaitLocalRecoveryReserved; + struct Activating; + struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState { + explicit Active(my_context ctx); + void exit(); + + const set<pg_shard_t> remote_shards_to_reserve_recovery; + const set<pg_shard_t> remote_shards_to_reserve_backfill; + bool all_replicas_activated; + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MTrim >, + 
boost::statechart::custom_reaction< Backfilled >, + boost::statechart::custom_reaction< AllReplicasActivated >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, + boost::statechart::custom_reaction< RemoteReservationRevoked>, + boost::statechart::custom_reaction< DoRecovery> + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const MNotifyRec& notevt); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MTrim& trimevt); + boost::statechart::result react(const Backfilled&) { + return discard_event(); + } + boost::statechart::result react(const AllReplicasActivated&); + boost::statechart::result react(const DeferRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const DeferBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteReservationRevokedTooFull&) { + return discard_event(); + } + boost::statechart::result react(const RemoteReservationRevoked&) { + return discard_event(); + } + boost::statechart::result react(const DoRecovery&) { + return discard_event(); + } + }; + + struct Clean : boost::statechart::state< Clean, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction<SetForceRecovery>, + boost::statechart::custom_reaction<SetForceBackfill> + > reactions; + explicit Clean(my_context ctx); + void exit(); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Recovered : boost::statechart::state< Recovered, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< GoClean, Clean >, + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction< AllReplicasActivated > + > reactions; + explicit Recovered(my_context ctx); + void exit(); + boost::statechart::result react(const AllReplicasActivated&) { + post_event(GoClean()); + return forward_event(); + } + }; + + struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< Backfilled >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, + boost::statechart::custom_reaction< RemoteReservationRevoked> + > reactions; + explicit Backfilling(my_context ctx); + boost::statechart::result react(const RemoteReservationRejectedTooFull& evt) { + // for compat with old peers + post_event(RemoteReservationRevokedTooFull()); + return discard_event(); + } + void backfill_release_reservations(); + boost::statechart::result react(const Backfilled& evt); + 
boost::statechart::result react(const RemoteReservationRevokedTooFull& evt); + boost::statechart::result react(const RemoteReservationRevoked& evt); + boost::statechart::result react(const DeferBackfill& evt); + boost::statechart::result react(const UnfoundBackfill& evt); + void cancel_backfill(); + void exit(); + }; + + struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationRevoked >, + boost::statechart::transition< AllBackfillsReserved, Backfilling > + > reactions; + set<pg_shard_t>::const_iterator backfill_osd_it; + explicit WaitRemoteBackfillReserved(my_context ctx); + void retry(); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved& evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull& evt); + boost::statechart::result react(const RemoteReservationRevoked& evt); + }; + + struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteBackfillReserved > + > reactions; + explicit WaitLocalBackfillReserved(my_context ctx); + boost::statechart::result react(const RemoteBackfillReserved& evt) { + /* no-op */ + return discard_event(); + } + void exit(); + }; + + struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>, + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull > + > reactions; + explicit NotBackfilling(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved& evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull& evt); + }; + + struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< UnfoundRecovery > + > reactions; + explicit NotRecovering(my_context ctx); + boost::statechart::result react(const DeferRecovery& evt) { + /* no-op */ + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + /* no-op */ + return discard_event(); + } + void exit(); + }; + + struct ToDelete; + struct RepNotRecovering; + struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState { + explicit ReplicaActive(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MQuery >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MTrim >, + boost::statechart::custom_reaction< Activate >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + 
boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteBackfillPreempted >, + boost::statechart::custom_reaction< RemoteRecoveryPreempted >, + boost::statechart::custom_reaction< RecoveryDone >, + boost::statechart::transition<DeleteStart, ToDelete> + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MTrim& trimevt); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const MQuery&); + boost::statechart::result react(const Activate&); + boost::statechart::result react(const RecoveryDone&) { + return discard_event(); + } + boost::statechart::result react(const DeferRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const DeferBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteBackfillPreempted& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteRecoveryPreempted& evt) { + return discard_event(); + } + }; + + struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< RecoveryDone, RepNotRecovering >, + // for compat with old peers + boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >, + boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, + boost::statechart::custom_reaction< BackfillTooFull >, + boost::statechart::custom_reaction< RemoteRecoveryPreempted >, + boost::statechart::custom_reaction< RemoteBackfillPreempted > + > reactions; + explicit RepRecovering(my_context ctx); + boost::statechart::result react(const RemoteRecoveryPreempted &evt); + boost::statechart::result react(const BackfillTooFull &evt); + boost::statechart::result react(const RemoteBackfillPreempted &evt); + void exit(); + }; + + struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RejectTooFullRemoteReservation >, + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationCanceled > + > reactions; + explicit RepWaitBackfillReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved &evt); + boost::statechart::result react(const RejectTooFullRemoteReservation &evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull &evt); + boost::statechart::result react(const RemoteReservationCanceled &evt); + }; + + struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + // for compat with old peers + boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >, + boost::statechart::custom_reaction< RemoteReservationCanceled > + > reactions; + explicit RepWaitRecoveryReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteRecoveryReserved 
&evt); + boost::statechart::result react(const RemoteReservationRejectedTooFull &evt) { + // for compat with old peers + post_event(RemoteReservationCanceled()); + return discard_event(); + } + boost::statechart::result react(const RemoteReservationCanceled &evt); + }; + + struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RequestRecoveryPrio >, + boost::statechart::custom_reaction< RequestBackfillPrio >, + boost::statechart::custom_reaction< RejectTooFullRemoteReservation >, + boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >, + boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers + > reactions; + explicit RepNotRecovering(my_context ctx); + boost::statechart::result react(const RequestRecoveryPrio &evt); + boost::statechart::result react(const RequestBackfillPrio &evt); + boost::statechart::result react(const RemoteBackfillReserved &evt) { + // my reservation completion raced with a RELEASE from primary + return discard_event(); + } + boost::statechart::result react(const RemoteRecoveryReserved &evt) { + // my reservation completion raced with a RELEASE from primary + return discard_event(); + } + boost::statechart::result react(const RejectTooFullRemoteReservation &evt); + void exit(); + }; + + struct Recovering : boost::statechart::state< Recovering, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< AllReplicasRecovered >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< RequestBackfill > + > reactions; + explicit Recovering(my_context ctx); + void exit(); + void release_reservations(bool cancel = false); + boost::statechart::result react(const AllReplicasRecovered &evt); + boost::statechart::result react(const DeferRecovery& evt); + boost::statechart::result react(const UnfoundRecovery& evt); + boost::statechart::result react(const RequestBackfill &evt); + }; + + struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + boost::statechart::transition< AllRemotesReserved, Recovering > + > reactions; + set<pg_shard_t>::const_iterator remote_recovery_reservation_it; + explicit WaitRemoteRecoveryReserved(my_context ctx); + boost::statechart::result react(const RemoteRecoveryReserved &evt); + void exit(); + }; + + struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >, + boost::statechart::custom_reaction< RecoveryTooFull > + > reactions; + explicit WaitLocalRecoveryReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RecoveryTooFull &evt); + }; + + struct Activating : boost::statechart::state< Activating, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::transition< AllReplicasRecovered, Recovered >, + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + 
boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved > + > reactions; + explicit Activating(my_context ctx); + void exit(); + }; + + struct Stray : boost::statechart::state< Stray, Started >, + NamedState { + explicit Stray(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< MQuery >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< RecoveryDone >, + boost::statechart::transition<DeleteStart, ToDelete> + > reactions; + boost::statechart::result react(const MQuery& query); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const RecoveryDone&) { + return discard_event(); + } + }; + + struct WaitDeleteReserved; + struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState { + unsigned priority = 0; + typedef boost::mpl::list < + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< DeleteSome > + > reactions; + explicit ToDelete(my_context ctx); + boost::statechart::result react(const ActMap &evt); + boost::statechart::result react(const DeleteSome &evt) { + // happens if we drop out of Deleting due to reprioritization etc. + return discard_event(); + } + void exit(); + }; + + struct Deleting; + struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved, + ToDelete>, NamedState { + typedef boost::mpl::list < + boost::statechart::transition<DeleteReserved, Deleting> + > reactions; + explicit WaitDeleteReserved(my_context ctx); + void exit(); + }; + + struct Deleting : boost::statechart::state<Deleting, + ToDelete>, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< DeleteSome >, + boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved> + > reactions; + ghobject_t next; + ceph::mono_clock::time_point start; + explicit Deleting(my_context ctx); + boost::statechart::result react(const DeleteSome &evt); + void exit(); + }; + + struct GetLog; + + struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState { + set<pg_shard_t> peer_info_requested; + + explicit GetInfo(my_context ctx); + void exit(); + void get_infos(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::transition< GotInfo, GetLog >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::transition< IsDown, Down > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MNotifyRec& infoevt); + }; + + struct GotLog : boost::statechart::event< GotLog > { + GotLog() : boost::statechart::event< GotLog >() {} + }; + + struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState { + pg_shard_t auth_log_shard; + boost::intrusive_ptr<MOSDPGLog> msg; + + explicit GetLog(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< GotLog >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::transition< NeedActingChange, WaitActingChange >, + boost::statechart::transition< IsIncomplete, Incomplete > + > reactions; + boost::statechart::result react(const AdvMap&); + 
boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const GotLog&); + }; + + struct WaitUpThru; + + struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState { + set<pg_shard_t> peer_missing_requested; + + explicit GetMissing(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::transition< NeedUpThru, WaitUpThru > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MLogRec& logevt); + }; + + struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState { + explicit WaitUpThru(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MLogRec > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const ActMap& am); + boost::statechart::result react(const MLogRec& logrec); + }; + + struct Down : boost::statechart::state< Down, Peering>, NamedState { + explicit Down(my_context ctx); + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< MNotifyRec > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MNotifyRec& infoevt); + void exit(); + }; + + struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction< QueryState > + > reactions; + explicit Incomplete(my_context ctx); + boost::statechart::result react(const AdvMap &advmap); + boost::statechart::result react(const MNotifyRec& infoevt); + boost::statechart::result react(const QueryState& q); + void exit(); + }; + + RecoveryMachine machine; + PG *pg; + + /// context passed in by state machine caller + RecoveryCtx *orig_ctx; + + /// populated if we are buffering messages pending a flush + boost::optional<BufferedRecoveryMessages> messages_pending_flush; + + /** + * populated between start_handle() and end_handle(), points into + * the message lists for messages_pending_flush while blocking messages + * or into orig_ctx otherwise + */ + boost::optional<RecoveryCtx> rctx; + + public: + explicit RecoveryState(PG *pg) + : machine(this, pg), pg(pg), orig_ctx(0) { + machine.initiate(); + } + + void handle_event(const boost::statechart::event_base &evt, + RecoveryCtx *rctx) { + start_handle(rctx); + machine.process_event(evt); + end_handle(); + } + + void handle_event(PGPeeringEventRef evt, + RecoveryCtx *rctx) { + start_handle(rctx); + machine.process_event(evt->get_event()); + end_handle(); + } + + } recovery_state; + + + + uint64_t peer_features; + uint64_t acting_features; + uint64_t upacting_features; + + epoch_t last_epoch; + + /// most recently consumed osdmap's require_osd_version + unsigned last_require_osd_release = 0; + bool delete_needs_sleep = false; + +protected: + void reset_min_peer_features() { + peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + } + uint64_t get_min_peer_features() const { return peer_features; } + void apply_peer_features(uint64_t f) { peer_features &= f; } + + uint64_t get_min_acting_features() const 
{ return acting_features; } + uint64_t get_min_upacting_features() const { return upacting_features; } + bool perform_deletes_during_peering() const { + return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)); + } + + bool hard_limit_pglog() const { + return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT)); + } + + void init_primary_up_acting( + const vector<int> &newup, + const vector<int> &newacting, + int new_up_primary, + int new_acting_primary) { + actingset.clear(); + acting = newacting; + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] != CRUSH_ITEM_NONE) + actingset.insert( + pg_shard_t( + acting[i], + pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + upset.clear(); + up = newup; + for (uint8_t i = 0; i < up.size(); ++i) { + if (up[i] != CRUSH_ITEM_NONE) + upset.insert( + pg_shard_t( + up[i], + pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + if (!pool.info.is_erasure()) { + up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD); + primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD); + return; + } + up_primary = pg_shard_t(); + primary = pg_shard_t(); + for (uint8_t i = 0; i < up.size(); ++i) { + if (up[i] == new_up_primary) { + up_primary = pg_shard_t(up[i], shard_id_t(i)); + break; + } + } + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == new_acting_primary) { + primary = pg_shard_t(acting[i], shard_id_t(i)); + break; + } + } + ceph_assert(up_primary.osd == new_up_primary); + ceph_assert(primary.osd == new_acting_primary); + } + + void set_role(int r) { + role = r; + } + + bool state_test(uint64_t m) const { return (state & m) != 0; } + void state_set(uint64_t m) { state |= m; } + void state_clear(uint64_t m) { state &= ~m; } + + bool is_complete() const { return info.last_complete == info.last_update; } + bool should_send_notify() const { return send_notify; } + + uint64_t get_state() const { return state; } + bool is_active() const { return state_test(PG_STATE_ACTIVE); } + bool is_activating() const { return state_test(PG_STATE_ACTIVATING); } + bool is_peering() const { return state_test(PG_STATE_PEERING); } + bool is_down() const { return state_test(PG_STATE_DOWN); } + bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); } + bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); } + bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); } + bool is_clean() const { return state_test(PG_STATE_CLEAN); } + bool is_degraded() const { return state_test(PG_STATE_DEGRADED); } + bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); } + bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } + bool is_remapped() const { return state_test(PG_STATE_REMAPPED); } + bool is_peered() const { + return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED); + } + bool is_recovering() const { return state_test(PG_STATE_RECOVERING); } + bool is_premerge() const { return state_test(PG_STATE_PREMERGE); } + bool is_repair() const { return state_test(PG_STATE_REPAIR); } + + bool is_empty() const { return info.last_update == eversion_t(0,0); } + + // pg on-disk state + void do_pending_flush(); + +public: + static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits); + static void _init(ObjectStore::Transaction& t, + spg_t pgid, const pg_pool_t *pool); + +protected: + void prepare_write_info(map<string,bufferlist> *km); + + void update_store_with_options(); + +public: + static int 
_prepare_write_info( + CephContext* cct, + map<string,bufferlist> *km, + epoch_t epoch, + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + bool dirty_big_info, + bool dirty_epoch, + bool try_fast_info, + PerfCounters *logger = nullptr); + + void write_if_dirty(RecoveryCtx *rctx) { + write_if_dirty(*rctx->transaction); + } +protected: + void write_if_dirty(ObjectStore::Transaction& t); + + PGLog::IndexedLog projected_log; + bool check_in_progress_op( + const osd_reqid_t &r, + eversion_t *version, + version_t *user_version, + int *return_code) const; + eversion_t projected_last_update; + eversion_t get_next_version() const { + eversion_t at_version( + get_osdmap_epoch(), + projected_last_update.version+1); + ceph_assert(at_version > info.last_update); + ceph_assert(at_version > pg_log.get_head()); + ceph_assert(at_version > projected_last_update); + return at_version; + } + + void add_log_entry(const pg_log_entry_t& e, bool applied); + void append_log( + const vector<pg_log_entry_t>& logv, + eversion_t trim_to, + eversion_t roll_forward_to, + ObjectStore::Transaction &t, + bool transaction_applied = true, + bool async = false); + bool check_log_for_corruption(ObjectStore *store); + + std::string get_corrupt_pg_log_name() const; + + void update_snap_map( + const vector<pg_log_entry_t> &log_entries, + ObjectStore::Transaction& t); + + void filter_snapc(vector<snapid_t> &snaps); + + void log_weirdness(); + + virtual void kick_snap_trim() = 0; + virtual void snap_trimmer_scrub_complete() = 0; + bool requeue_scrub(bool high_priority = false); + void queue_recovery(); + bool queue_scrub(); + unsigned get_scrub_priority(); + + /// share pg info after a pg is active + void share_pg_info(); + + + bool append_log_entries_update_missing( + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + ObjectStore::Transaction &t, + boost::optional<eversion_t> trim_to, + boost::optional<eversion_t> roll_forward_to); + + /** + * Merge entries updating missing as necessary on all + * acting_recovery_backfill logs and missings (also missing_loc) + */ + void merge_new_log_entries( + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + ObjectStore::Transaction &t, + boost::optional<eversion_t> trim_to, + boost::optional<eversion_t> roll_forward_to); + + void reset_interval_flush(); + void start_peering_interval( + const OSDMapRef lastmap, + const vector<int>& newup, int up_primary, + const vector<int>& newacting, int acting_primary, + ObjectStore::Transaction *t); + void on_new_interval(); + virtual void _on_new_interval() = 0; + void start_flush(ObjectStore::Transaction *t); + void set_last_peering_reset(); + + void update_history(const pg_history_t& history); + void fulfill_info(pg_shard_t from, const pg_query_t &query, + pair<pg_shard_t, pg_info_t> ¬ify_info); + void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch); + void fulfill_query(const MQuery& q, RecoveryCtx *rctx); + void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap); + + bool should_restart_peering( + int newupprimary, + int newactingprimary, + const vector<int>& newup, + const vector<int>& newacting, + OSDMapRef lastmap, + OSDMapRef osdmap); + + // OpRequest queueing + bool can_discard_op(OpRequestRef& op); + bool can_discard_scan(OpRequestRef op); + bool can_discard_backfill(OpRequestRef op); + bool can_discard_request(OpRequestRef& op); + + template<typename T, int MSGTYPE> + bool can_discard_replica_op(OpRequestRef& op); + + bool old_peering_msg(epoch_t reply_epoch, 
epoch_t query_epoch); + bool old_peering_evt(PGPeeringEventRef evt) { + return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested()); + } + static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) { + return e <= cur_epoch; + } + bool have_same_or_newer_map(epoch_t e) { + return e <= get_osdmap_epoch(); + } + + bool op_has_sufficient_caps(OpRequestRef& op); + + + // recovery bits + void take_waiters(); + + + // abstract bits + friend class FlushState; + +public: + void init_collection_pool_opts(); +protected: + virtual void on_role_change() = 0; + virtual void on_pool_change() = 0; + virtual void on_change(ObjectStore::Transaction *t) = 0; + virtual void on_activate() = 0; + virtual void on_flushed() = 0; + virtual void check_blacklisted_watchers() = 0; + + friend ostream& operator<<(ostream& out, const PG& pg); +}; + + +ostream& operator<<(ostream& out, const PG::BackfillInterval& bi); + +#endif diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc new file mode 100644 index 00000000..6c193c6a --- /dev/null +++ b/src/osd/PGBackend.cc @@ -0,0 +1,1310 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013,2014 Inktank Storage, Inc. + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "common/errno.h" +#include "common/scrub_types.h" +#include "ReplicatedBackend.h" +#include "ScrubStore.h" +#include "ECBackend.h" +#include "PGBackend.h" +#include "OSD.h" +#include "erasure-code/ErasureCodePlugin.h" +#include "OSDMap.h" +#include "PGLog.h" +#include "common/LogClient.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) { + return pgb->get_parent()->gen_dbg_prefix(*_dout); +} + +void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v, + RecoveryHandle *h) +{ + ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0); + for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { + if (shard == get_parent()->whoami_shard()) + continue; + if (get_parent()->get_shard_missing(shard).is_missing(oid)) { + dout(20) << __func__ << " will remove " << oid << " " << v << " from " + << shard << dendl; + h->deletes[shard].push_back(make_pair(oid, v)); + get_parent()->begin_peer_recover(shard, oid); + } + } +} + +void PGBackend::send_recovery_deletes(int prio, + const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes) +{ + epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch(); + for (const auto& p : deletes) { + const auto& shard = p.first; + const auto& objects = p.second; + ConnectionRef con = get_parent()->get_con_osd_cluster( + shard.osd, + get_osdmap_epoch()); + if (!con) + continue; + auto it = objects.begin(); + while (it != objects.end()) { + uint64_t cost = 0; + uint64_t deletes = 0; + spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard); + MOSDPGRecoveryDelete *msg = + new 
MOSDPGRecoveryDelete(get_parent()->whoami_shard(), + target_pg, + get_osdmap_epoch(), + min_epoch); + msg->set_priority(prio); + + while (it != objects.end() && + cost < cct->_conf->osd_max_push_cost && + deletes < cct->_conf->osd_max_push_objects) { + dout(20) << __func__ << ": sending recovery delete << " << it->first + << " " << it->second << " to osd." << shard << dendl; + msg->objects.push_back(*it); + cost += cct->_conf->osd_push_per_object_cost; + ++deletes; + ++it; + } + + msg->set_cost(cost); + get_parent()->send_message_osd_cluster(msg, con); + } + } +} + +bool PGBackend::handle_message(OpRequestRef op) +{ + switch (op->get_req()->get_type()) { + case MSG_OSD_PG_RECOVERY_DELETE: + handle_recovery_delete(op); + return true; + + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + handle_recovery_delete_reply(op); + return true; + + default: + break; + } + + return _handle_message(op); +} + +void PGBackend::handle_recovery_delete(OpRequestRef op) +{ + const MOSDPGRecoveryDelete *m = static_cast<const MOSDPGRecoveryDelete *>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE); + dout(20) << __func__ << " " << op << dendl; + + op->mark_started(); + + C_GatherBuilder gather(cct); + for (const auto &p : m->objects) { + get_parent()->remove_missing_object(p.first, p.second, gather.new_sub()); + } + + MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply; + reply->from = get_parent()->whoami_shard(); + reply->set_priority(m->get_priority()); + reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard); + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->objects = m->objects; + ConnectionRef conn = m->get_connection(); + + gather.set_finisher(new FunctionContext( + [=](int r) { + if (r != -EAGAIN) { + get_parent()->send_message_osd_cluster(reply, conn.get()); + } else { + reply->put(); + } + })); + gather.activate(); +} + +void PGBackend::handle_recovery_delete_reply(OpRequestRef op) +{ + const MOSDPGRecoveryDeleteReply *m = static_cast<const MOSDPGRecoveryDeleteReply *>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY); + dout(20) << __func__ << " " << op << dendl; + + for (const auto &p : m->objects) { + ObjectRecoveryInfo recovery_info; + hobject_t oid = p.first; + recovery_info.version = p.second; + get_parent()->on_peer_recover(m->from, oid, recovery_info); + bool peers_recovered = true; + for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { + if (shard == get_parent()->whoami_shard()) + continue; + if (get_parent()->get_shard_missing(shard).is_missing(oid)) { + dout(20) << __func__ << " " << oid << " still missing on at least " + << shard << dendl; + peers_recovered = false; + break; + } + } + if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) { + dout(20) << __func__ << " completed recovery, local_missing = " + << get_parent()->get_local_missing() << dendl; + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + get_parent()->on_global_recover(p.first, stat_diff, true); + } + } +} + +void PGBackend::rollback( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t) +{ + + struct RollbackVisitor : public ObjectModDesc::Visitor { + const hobject_t &hoid; + PGBackend *pg; + ObjectStore::Transaction t; + RollbackVisitor( + const hobject_t &hoid, + PGBackend *pg) : hoid(hoid), pg(pg) {} + void append(uint64_t old_size) override { + ObjectStore::Transaction temp; + pg->rollback_append(hoid, old_size, &temp); + temp.append(t); 
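      // Note: temp currently holds just this op's rollback; appending the
      // previously accumulated transaction and then swapping puts this op in
      // front, so the combined transaction replays the recorded mods'
      // rollbacks in reverse order of visitation.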
+ temp.swap(t); + } + void setattrs(map<string, boost::optional<bufferlist> > &attrs) override { + ObjectStore::Transaction temp; + pg->rollback_setattrs(hoid, attrs, &temp); + temp.append(t); + temp.swap(t); + } + void rmobject(version_t old_version) override { + ObjectStore::Transaction temp; + pg->rollback_stash(hoid, old_version, &temp); + temp.append(t); + temp.swap(t); + } + void try_rmobject(version_t old_version) override { + ObjectStore::Transaction temp; + pg->rollback_try_stash(hoid, old_version, &temp); + temp.append(t); + temp.swap(t); + } + void create() override { + ObjectStore::Transaction temp; + pg->rollback_create(hoid, &temp); + temp.append(t); + temp.swap(t); + } + void update_snaps(const set<snapid_t> &snaps) override { + ObjectStore::Transaction temp; + pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp); + temp.append(t); + temp.swap(t); + } + void rollback_extents( + version_t gen, + const vector<pair<uint64_t, uint64_t> > &extents) override { + ObjectStore::Transaction temp; + pg->rollback_extents(gen, extents, hoid, &temp); + temp.append(t); + temp.swap(t); + } + }; + + ceph_assert(entry.mod_desc.can_rollback()); + RollbackVisitor vis(entry.soid, this); + entry.mod_desc.visit(&vis); + t->append(vis.t); +} + +struct Trimmer : public ObjectModDesc::Visitor { + const hobject_t &soid; + PGBackend *pg; + ObjectStore::Transaction *t; + Trimmer( + const hobject_t &soid, + PGBackend *pg, + ObjectStore::Transaction *t) + : soid(soid), pg(pg), t(t) {} + void rmobject(version_t old_version) override { + pg->trim_rollback_object( + soid, + old_version, + t); + } + // try_rmobject defaults to rmobject + void rollback_extents( + version_t gen, + const vector<pair<uint64_t, uint64_t> > &extents) override { + pg->trim_rollback_object( + soid, + gen, + t); + } +}; + +void PGBackend::rollforward( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t) +{ + auto dpp = get_parent()->get_dpp(); + ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl; + if (!entry.can_rollback()) + return; + Trimmer trimmer(entry.soid, this, t); + entry.mod_desc.visit(&trimmer); +} + +void PGBackend::trim( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t) +{ + if (!entry.can_rollback()) + return; + Trimmer trimmer(entry.soid, this, t); + entry.mod_desc.visit(&trimmer); +} + +void PGBackend::try_stash( + const hobject_t &hoid, + version_t v, + ObjectStore::Transaction *t) +{ + t->try_rename( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ghobject_t(hoid, v, get_parent()->whoami_shard().shard)); +} + +void PGBackend::remove( + const hobject_t &hoid, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + get_parent()->pgb_clear_object_snap_mapping(hoid, t); +} + +void PGBackend::on_change_cleanup(ObjectStore::Transaction *t) +{ + dout(10) << __func__ << dendl; + // clear temp + for (set<hobject_t>::iterator i = temp_contents.begin(); + i != temp_contents.end(); + ++i) { + dout(10) << __func__ << ": Removing oid " + << *i << " from the temp collection" << dendl; + t->remove( + coll, + ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + } + temp_contents.clear(); +} + +int PGBackend::objects_list_partial( + const hobject_t &begin, + int min, + int max, + vector<hobject_t> *ls, + hobject_t *next) +{ + ceph_assert(ls); + // Starts with the smallest generation to make sure the result list + // has 
the marker object (it might have multiple generations + // though, which would be filtered). + ghobject_t _next; + if (!begin.is_min()) + _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard); + ls->reserve(max); + int r = 0; + + if (min > max) + min = max; + + while (!_next.is_max() && ls->size() < (unsigned)min) { + vector<ghobject_t> objects; + if (HAVE_FEATURE(parent->min_upacting_features(), + OSD_FIXED_COLLECTION_LIST)) { + r = store->collection_list( + ch, + _next, + ghobject_t::get_max(), + max - ls->size(), + &objects, + &_next); + } else { + r = store->collection_list_legacy( + ch, + _next, + ghobject_t::get_max(), + max - ls->size(), + &objects, + &_next); + } + if (r != 0) { + derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl; + break; + } + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (i->is_pgmeta() || i->hobj.is_temp()) { + continue; + } + if (i->is_no_gen()) { + ls->push_back(i->hobj); + } + } + } + if (r == 0) + *next = _next.hobj; + return r; +} + +int PGBackend::objects_list_range( + const hobject_t &start, + const hobject_t &end, + vector<hobject_t> *ls, + vector<ghobject_t> *gen_obs) +{ + ceph_assert(ls); + vector<ghobject_t> objects; + int r; + if (HAVE_FEATURE(parent->min_upacting_features(), + OSD_FIXED_COLLECTION_LIST)) { + r = store->collection_list( + ch, + ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + INT_MAX, + &objects, + NULL); + } else { + r = store->collection_list_legacy( + ch, + ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + INT_MAX, + &objects, + NULL); + } + ls->reserve(objects.size()); + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (i->is_pgmeta() || i->hobj.is_temp()) { + continue; + } + if (i->is_no_gen()) { + ls->push_back(i->hobj); + } else if (gen_obs) { + gen_obs->push_back(*i); + } + } + return r; +} + +int PGBackend::objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out) +{ + bufferptr bp; + int r = store->getattr( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + attr.c_str(), + bp); + if (r >= 0 && out) { + out->clear(); + out->push_back(std::move(bp)); + } + return r; +} + +int PGBackend::objects_get_attrs( + const hobject_t &hoid, + map<string, bufferlist> *out) +{ + return store->getattrs( + ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + *out); +} + +void PGBackend::rollback_setattrs( + const hobject_t &hoid, + map<string, boost::optional<bufferlist> > &old_attrs, + ObjectStore::Transaction *t) { + map<string, bufferlist> to_set; + ceph_assert(!hoid.is_temp()); + for (map<string, boost::optional<bufferlist> >::iterator i = old_attrs.begin(); + i != old_attrs.end(); + ++i) { + if (i->second) { + to_set[i->first] = i->second.get(); + } else { + t->rmattr( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + i->first); + } + } + t->setattrs( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + to_set); +} + +void PGBackend::rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->truncate( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, 
get_parent()->whoami_shard().shard), + old_size); +} + +void PGBackend::rollback_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + t->collection_move_rename( + coll, + ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); +} + +void PGBackend::rollback_try_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + t->try_rename( + coll, + ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); +} + +void PGBackend::rollback_extents( + version_t gen, + const vector<pair<uint64_t, uint64_t> > &extents, + const hobject_t &hoid, + ObjectStore::Transaction *t) { + auto shard = get_parent()->whoami_shard().shard; + for (auto &&extent: extents) { + t->clone_range( + coll, + ghobject_t(hoid, gen, shard), + ghobject_t(hoid, ghobject_t::NO_GEN, shard), + extent.first, + extent.second, + extent.first); + } + t->remove( + coll, + ghobject_t(hoid, gen, shard)); +} + +void PGBackend::trim_rollback_object( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t) { + ceph_assert(!hoid.is_temp()); + t->remove( + coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard)); +} + +PGBackend *PGBackend::build_pg_backend( + const pg_pool_t &pool, + const map<string,string>& profile, + Listener *l, + coll_t coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct) +{ + ErasureCodeProfile ec_profile = profile; + switch (pool.type) { + case pg_pool_t::TYPE_REPLICATED: { + return new ReplicatedBackend(l, coll, ch, store, cct); + } + case pg_pool_t::TYPE_ERASURE: { + ErasureCodeInterfaceRef ec_impl; + stringstream ss; + ceph::ErasureCodePluginRegistry::instance().factory( + profile.find("plugin")->second, + cct->_conf.get_val<std::string>("erasure_code_dir"), + ec_profile, + &ec_impl, + &ss); + ceph_assert(ec_impl); + return new ECBackend( + l, + coll, + ch, + store, + cct, + ec_impl, + pool.stripe_width); + } + default: + ceph_abort(); + return NULL; + } +} + +int PGBackend::be_scan_list( + ScrubMap &map, + ScrubMapBuilder &pos) +{ + dout(10) << __func__ << " " << pos << dendl; + ceph_assert(!pos.done()); + ceph_assert(pos.pos < pos.ls.size()); + hobject_t& poid = pos.ls[pos.pos]; + + struct stat st; + int r = store->stat( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + &st, + true); + if (r == 0) { + ScrubMap::object &o = map.objects[poid]; + o.size = st.st_size; + ceph_assert(!o.negative); + store->getattrs( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + o.attrs); + + if (pos.deep) { + r = be_deep_scrub(poid, map, pos, o); + } + dout(25) << __func__ << " " << poid << dendl; + } else if (r == -ENOENT) { + dout(25) << __func__ << " " << poid << " got " << r + << ", skipping" << dendl; + } else if (r == -EIO) { + dout(25) << __func__ << " " << poid << " got " << r + << ", stat_error" << dendl; + ScrubMap::object &o = map.objects[poid]; + o.stat_error = true; + } else { + derr << __func__ << " got: " << cpp_strerror(r) << dendl; + ceph_abort(); + } + if (r == 
-EINPROGRESS) { + return -EINPROGRESS; + } + pos.next_object(); + return 0; +} + +bool PGBackend::be_compare_scrub_objects( + pg_shard_t auth_shard, + const ScrubMap::object &auth, + const object_info_t& auth_oi, + const ScrubMap::object &candidate, + shard_info_wrapper &shard_result, + inconsistent_obj_wrapper &obj_result, + ostream &errorstream, + bool has_snapset) +{ + enum { CLEAN, FOUND_ERROR } error = CLEAN; + if (auth.digest_present && candidate.digest_present) { + if (auth.digest != candidate.digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "data_digest 0x" << std::hex << candidate.digest + << " != data_digest 0x" << auth.digest << std::dec + << " from shard " << auth_shard; + obj_result.set_data_digest_mismatch(); + } + } + if (auth.omap_digest_present && candidate.omap_digest_present) { + if (auth.omap_digest != candidate.omap_digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest + << " != omap_digest 0x" << auth.omap_digest << std::dec + << " from shard " << auth_shard; + obj_result.set_omap_digest_mismatch(); + } + } + if (parent->get_pool().is_replicated()) { + if (auth_oi.is_data_digest() && candidate.digest_present) { + if (auth_oi.data_digest != candidate.digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "data_digest 0x" << std::hex << candidate.digest + << " != data_digest 0x" << auth_oi.data_digest << std::dec + << " from auth oi " << auth_oi; + shard_result.set_data_digest_mismatch_info(); + } + } + if (auth_oi.is_omap_digest() && candidate.omap_digest_present) { + if (auth_oi.omap_digest != candidate.omap_digest) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest + << " != omap_digest 0x" << auth_oi.omap_digest << std::dec + << " from auth oi " << auth_oi; + shard_result.set_omap_digest_mismatch_info(); + } + } + } + if (candidate.stat_error) + return error == FOUND_ERROR; + if (!shard_result.has_info_missing() + && !shard_result.has_info_corrupted()) { + bufferlist can_bl, auth_bl; + auto can_attr = candidate.attrs.find(OI_ATTR); + auto auth_attr = auth.attrs.find(OI_ATTR); + + ceph_assert(auth_attr != auth.attrs.end()); + ceph_assert(can_attr != candidate.attrs.end()); + + can_bl.push_back(can_attr->second); + auth_bl.push_back(auth_attr->second); + if (!can_bl.contents_equal(auth_bl)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + obj_result.set_object_info_inconsistency(); + errorstream << "object info inconsistent "; + } + } + if (has_snapset) { + if (!shard_result.has_snapset_missing() + && !shard_result.has_snapset_corrupted()) { + bufferlist can_bl, auth_bl; + auto can_attr = candidate.attrs.find(SS_ATTR); + auto auth_attr = auth.attrs.find(SS_ATTR); + + ceph_assert(auth_attr != auth.attrs.end()); + ceph_assert(can_attr != candidate.attrs.end()); + + can_bl.push_back(can_attr->second); + auth_bl.push_back(auth_attr->second); + if (!can_bl.contents_equal(auth_bl)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + obj_result.set_snapset_inconsistency(); + errorstream << "snapset inconsistent "; + } + } + } + if (parent->get_pool().is_erasure()) { + if (!shard_result.has_hinfo_missing() + && !shard_result.has_hinfo_corrupted()) { + bufferlist can_bl, auth_bl; + auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key()); + auto auth_hi = 
auth.attrs.find(ECUtil::get_hinfo_key()); + + ceph_assert(auth_hi != auth.attrs.end()); + ceph_assert(can_hi != candidate.attrs.end()); + + can_bl.push_back(can_hi->second); + auth_bl.push_back(auth_hi->second); + if (!can_bl.contents_equal(auth_bl)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + obj_result.set_hinfo_inconsistency(); + errorstream << "hinfo inconsistent "; + } + } + } + uint64_t oi_size = be_get_ondisk_size(auth_oi.size); + if (oi_size != candidate.size) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "size " << candidate.size + << " != size " << oi_size + << " from auth oi " << auth_oi; + shard_result.set_size_mismatch_info(); + } + if (auth.size != candidate.size) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "size " << candidate.size + << " != size " << auth.size + << " from shard " << auth_shard; + obj_result.set_size_mismatch(); + } + // If the replica is too large and we didn't already count it for this object + // + if (candidate.size > cct->_conf->osd_max_object_size + && !obj_result.has_size_too_large()) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "size " << candidate.size + << " > " << cct->_conf->osd_max_object_size + << " is too large"; + obj_result.set_size_too_large(); + } + for (map<string,bufferptr>::const_iterator i = auth.attrs.begin(); + i != auth.attrs.end(); + ++i) { + // We check system keys seperately + if (i->first == OI_ATTR || i->first[0] != '_') + continue; + if (!candidate.attrs.count(i->first)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "attr name mismatch '" << i->first << "'"; + obj_result.set_attr_name_mismatch(); + } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "attr value mismatch '" << i->first << "'"; + obj_result.set_attr_value_mismatch(); + } + } + for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin(); + i != candidate.attrs.end(); + ++i) { + // We check system keys seperately + if (i->first == OI_ATTR || i->first[0] != '_') + continue; + if (!auth.attrs.count(i->first)) { + if (error != CLEAN) + errorstream << ", "; + error = FOUND_ERROR; + errorstream << "attr name mismatch '" << i->first << "'"; + obj_result.set_attr_name_mismatch(); + } + } + return error == FOUND_ERROR; +} + +static int dcount(const object_info_t &oi) +{ + int count = 0; + if (oi.is_data_digest()) + count++; + if (oi.is_omap_digest()) + count++; + return count; +} + +map<pg_shard_t, ScrubMap *>::const_iterator + PGBackend::be_select_auth_object( + const hobject_t &obj, + const map<pg_shard_t,ScrubMap*> &maps, + object_info_t *auth_oi, + map<pg_shard_t, shard_info_wrapper> &shard_map, + bool &digest_match, + spg_t pgid, + ostream &errorstream) +{ + eversion_t auth_version; + + // Create list of shards with primary first so it will be auth copy all + // other things being equal. 
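  // Illustrative ordering with hypothetical, error-free candidates:
  //   osd.0 (primary): object_info version 15'42, data and omap digests recorded
  //   osd.1          : object_info version 15'42, data digest only
  //   osd.2          : object_info version 15'40
  // The loop below keeps osd.0 as auth: a newer object_info version always wins,
  // a tie is broken in favour of the copy with more recorded digests (dcount),
  // and scanning the primary first makes it win when everything else is equal.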
+ list<pg_shard_t> shards; + for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin(); + j != maps.end(); + ++j) { + if (j->first == get_parent()->whoami_shard()) + continue; + shards.push_back(j->first); + } + shards.push_front(get_parent()->whoami_shard()); + + map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end(); + digest_match = true; + for (auto &l : shards) { + ostringstream shard_errorstream; + bool error = false; + map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l); + map<hobject_t, ScrubMap::object>::iterator i = + j->second->objects.find(obj); + if (i == j->second->objects.end()) { + continue; + } + auto& shard_info = shard_map[j->first]; + if (j->first == get_parent()->whoami_shard()) + shard_info.primary = true; + if (i->second.read_error) { + shard_info.set_read_error(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a read error"; + } + if (i->second.ec_hash_mismatch) { + shard_info.set_ec_hash_mismatch(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had an ec hash mismatch"; + } + if (i->second.ec_size_mismatch) { + shard_info.set_ec_size_mismatch(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had an ec size mismatch"; + } + + object_info_t oi; + bufferlist bl; + map<string, bufferptr>::iterator k; + SnapSet ss; + bufferlist ss_bl, hk_bl; + + if (i->second.stat_error) { + shard_info.set_stat_error(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a stat error"; + // With stat_error no further checking + // We don't need to also see a missing_object_info_attr + goto out; + } + + // We won't pick an auth copy if the snapset is missing or won't decode. + ceph_assert(!obj.is_snapdir()); + if (obj.is_head()) { + k = i->second.attrs.find(SS_ATTR); + if (k == i->second.attrs.end()) { + shard_info.set_snapset_missing(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a missing snapset key"; + } else { + ss_bl.push_back(k->second); + try { + auto bliter = ss_bl.cbegin(); + decode(ss, bliter); + } catch (...) { + // invalid snapset, probably corrupt + shard_info.set_snapset_corrupted(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a corrupt snapset"; + } + } + } + + if (parent->get_pool().is_erasure()) { + ECUtil::HashInfo hi; + k = i->second.attrs.find(ECUtil::get_hinfo_key()); + if (k == i->second.attrs.end()) { + shard_info.set_hinfo_missing(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a missing hinfo key"; + } else { + hk_bl.push_back(k->second); + try { + auto bliter = hk_bl.cbegin(); + decode(hi, bliter); + } catch (...) { + // invalid snapset, probably corrupt + shard_info.set_hinfo_corrupted(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a corrupt hinfo"; + } + } + } + + k = i->second.attrs.find(OI_ATTR); + if (k == i->second.attrs.end()) { + // no object info on object, probably corrupt + shard_info.set_info_missing(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a missing info key"; + goto out; + } + bl.push_back(k->second); + try { + auto bliter = bl.cbegin(); + decode(oi, bliter); + } catch (...) 
{ + // invalid object info, probably corrupt + shard_info.set_info_corrupted(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate had a corrupt info"; + goto out; + } + + // This is automatically corrected in PG::_repair_oinfo_oid() + ceph_assert(oi.soid == obj); + + if (i->second.size != be_get_ondisk_size(oi.size)) { + shard_info.set_obj_size_info_mismatch(); + if (error) + shard_errorstream << ", "; + error = true; + shard_errorstream << "candidate size " << i->second.size << " info size " + << oi.size << " mismatch"; + } + + // digest_match will only be true if computed digests are the same + if (auth_version != eversion_t() + && auth->second->objects[obj].digest_present + && i->second.digest_present + && auth->second->objects[obj].digest != i->second.digest) { + digest_match = false; + dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest + << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec + << dendl; + } + + // Don't use this particular shard due to previous errors + // XXX: For now we can't pick one shard for repair and another's object info or snapset + if (shard_info.errors) + goto out; + + if (auth_version == eversion_t() || oi.version > auth_version || + (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) { + auth = j; + *auth_oi = oi; + auth_version = oi.version; + } + +out: + if (error) + errorstream << pgid.pgid << " shard " << l << " soid " << obj + << " : " << shard_errorstream.str() << "\n"; + // Keep scanning other shards + } + dout(10) << __func__ << ": selecting osd " << auth->first + << " for obj " << obj + << " with oi " << *auth_oi + << dendl; + return auth; +} + +void PGBackend::be_compare_scrubmaps( + const map<pg_shard_t,ScrubMap*> &maps, + const set<hobject_t> &master_set, + bool repair, + map<hobject_t, set<pg_shard_t>> &missing, + map<hobject_t, set<pg_shard_t>> &inconsistent, + map<hobject_t, list<pg_shard_t>> &authoritative, + map<hobject_t, pair<boost::optional<uint32_t>, + boost::optional<uint32_t>>> &missing_digest, + int &shallow_errors, int &deep_errors, + Scrub::Store *store, + const spg_t& pgid, + const vector<int> &acting, + ostream &errorstream) +{ + utime_t now = ceph_clock_now(); + + // Check maps against master set and each other + for (set<hobject_t>::const_iterator k = master_set.begin(); + k != master_set.end(); + ++k) { + object_info_t auth_oi; + map<pg_shard_t, shard_info_wrapper> shard_map; + + inconsistent_obj_wrapper object_error{*k}; + + bool digest_match; + map<pg_shard_t, ScrubMap *>::const_iterator auth = + be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match, + pgid, errorstream); + + list<pg_shard_t> auth_list; + set<pg_shard_t> object_errors; + if (auth == maps.end()) { + object_error.set_version(0); + object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, + deep_errors, get_parent()->whoami_shard()); + if (object_error.has_deep_errors()) + ++deep_errors; + else if (object_error.has_shallow_errors()) + ++shallow_errors; + store->add_object_error(k->pool, object_error); + errorstream << pgid.pgid << " soid " << *k + << " : failed to pick suitable object info\n"; + continue; + } + object_error.set_version(auth_oi.user_version); + ScrubMap::object& auth_object = auth->second->objects[*k]; + set<pg_shard_t> cur_missing; + set<pg_shard_t> cur_inconsistent; + bool fix_digest = false; + + for (auto j = maps.cbegin(); j != maps.cend(); ++j) { + if (j == auth) + 
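        // j == auth: this is the shard whose object_info was selected above,
        // so flag it; consumers of the shard map can then tell which copy was
        // treated as authoritative.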
shard_map[auth->first].selected_oi = true; + if (j->second->objects.count(*k)) { + shard_map[j->first].set_object(j->second->objects[*k]); + // Compare + stringstream ss; + bool found = be_compare_scrub_objects(auth->first, + auth_object, + auth_oi, + j->second->objects[*k], + shard_map[j->first], + object_error, + ss, + k->has_snapset()); + + dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "") + << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ") + << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "") + << dendl; + // If all replicas match, but they don't match object_info we can + // repair it by using missing_digest mechanism + if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1 + && digest_match && shard_map[j->first].only_data_digest_mismatch_info() + && auth_object.digest_present) { + // Set in missing_digests + fix_digest = true; + // Clear the error + shard_map[j->first].clear_data_digest_mismatch_info(); + errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n"; + } + // Some errors might have already been set in be_select_auth_object() + if (shard_map[j->first].errors != 0) { + cur_inconsistent.insert(j->first); + if (shard_map[j->first].has_deep_errors()) + ++deep_errors; + else + ++shallow_errors; + // Only true if be_compare_scrub_objects() found errors and put something + // in ss. + if (found) + errorstream << pgid << " shard " << j->first << " soid " << *k + << " : " << ss.str() << "\n"; + } else if (found) { + // Track possible shard to use as authoritative, if needed + // There are errors, without identifying the shard + object_errors.insert(j->first); + errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n"; + } else { + // XXX: The auth shard might get here that we don't know + // that it has the "correct" data. + auth_list.push_back(j->first); + } + } else { + cur_missing.insert(j->first); + shard_map[j->first].set_missing(); + shard_map[j->first].primary = (j->first == get_parent()->whoami_shard()); + // Can't have any other errors if there is no information available + ++shallow_errors; + errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n"; + } + object_error.add_shard(j->first, shard_map[j->first]); + } + + if (auth_list.empty()) { + if (object_errors.empty()) { + errorstream << pgid.pgid << " soid " << *k + << " : failed to pick suitable auth object\n"; + goto out; + } + // Object errors exist and nothing in auth_list + // Prefer the auth shard otherwise take first from list. + pg_shard_t shard; + if (object_errors.count(auth->first)) { + shard = auth->first; + } else { + shard = *(object_errors.begin()); + } + auth_list.push_back(shard); + object_errors.erase(shard); + } + // At this point auth_list is populated, so we add the object errors shards + // as inconsistent. 
+ cur_inconsistent.insert(object_errors.begin(), object_errors.end()); + if (!cur_missing.empty()) { + missing[*k] = cur_missing; + } + if (!cur_inconsistent.empty()) { + inconsistent[*k] = cur_inconsistent; + } + + if (fix_digest) { + boost::optional<uint32_t> data_digest, omap_digest; + ceph_assert(auth_object.digest_present); + data_digest = auth_object.digest; + if (auth_object.omap_digest_present) { + omap_digest = auth_object.omap_digest; + } + missing_digest[*k] = make_pair(data_digest, omap_digest); + } + if (!cur_inconsistent.empty() || !cur_missing.empty()) { + authoritative[*k] = auth_list; + } else if (!fix_digest && parent->get_pool().is_replicated()) { + enum { + NO = 0, + MAYBE = 1, + FORCE = 2, + } update = NO; + + if (auth_object.digest_present && !auth_oi.is_data_digest()) { + dout(20) << __func__ << " missing data digest on " << *k << dendl; + update = MAYBE; + } + if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) { + dout(20) << __func__ << " missing omap digest on " << *k << dendl; + update = MAYBE; + } + + // recorded digest != actual digest? + if (auth_oi.is_data_digest() && auth_object.digest_present && + auth_oi.data_digest != auth_object.digest) { + ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info()); + errorstream << pgid << " recorded data digest 0x" + << std::hex << auth_oi.data_digest << " != on disk 0x" + << auth_object.digest << std::dec << " on " << auth_oi.soid + << "\n"; + if (repair) + update = FORCE; + } + if (auth_oi.is_omap_digest() && auth_object.omap_digest_present && + auth_oi.omap_digest != auth_object.omap_digest) { + ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info()); + errorstream << pgid << " recorded omap digest 0x" + << std::hex << auth_oi.omap_digest << " != on disk 0x" + << auth_object.omap_digest << std::dec + << " on " << auth_oi.soid << "\n"; + if (repair) + update = FORCE; + } + + if (update != NO) { + utime_t age = now - auth_oi.local_mtime; + if (update == FORCE || + age > cct->_conf->osd_deep_scrub_update_digest_min_age) { + boost::optional<uint32_t> data_digest, omap_digest; + if (auth_object.digest_present) { + data_digest = auth_object.digest; + dout(20) << __func__ << " will update data digest on " << *k << dendl; + } + if (auth_object.omap_digest_present) { + omap_digest = auth_object.omap_digest; + dout(20) << __func__ << " will update omap digest on " << *k << dendl; + } + missing_digest[*k] = make_pair(data_digest, omap_digest); + } else { + dout(20) << __func__ << " missing digest but age " << age + << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age + << " on " << *k << dendl; + } + } + } +out: + if (object_error.has_deep_errors()) + ++deep_errors; + else if (object_error.has_shallow_errors()) + ++shallow_errors; + if (object_error.errors || object_error.union_shards.errors) { + store->add_object_error(k->pool, object_error); + } + } +} + +void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps, + const set<hobject_t> &master_set, + omap_stat_t& omap_stats, + ostream &warnstream) const +{ + bool needs_omap_check = false; + for (const auto& map : maps) { + if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) { + needs_omap_check = true; + break; + } + } + + if (!needs_omap_check) { + return; // Nothing to do + } + + // Iterate through objects and update omap stats + for (const auto& k : master_set) { + for (const auto& map : maps) { + if (map.first != get_parent()->primary_shard()) { + // Only set omap stats for the primary + 
continue; + } + auto it = map.second->objects.find(k); + if (it == map.second->objects.end()) + continue; + ScrubMap::object& obj = it->second; + omap_stats.omap_bytes += obj.object_omap_bytes; + omap_stats.omap_keys += obj.object_omap_keys; + if (obj.large_omap_object_found) { + pg_t pg; + auto osdmap = get_osdmap(); + osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg); + pg_t mpg = osdmap->raw_pg_to_pg(pg); + omap_stats.large_omap_objects++; + warnstream << "Large omap object found. Object: " << k + << " PG: " << pg << " (" << mpg << ")" + << " Key count: " << obj.large_omap_object_key_count + << " Size (bytes): " << obj.large_omap_object_value_size + << '\n'; + break; + } + } + } +} diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h new file mode 100644 index 00000000..18ef7235 --- /dev/null +++ b/src/osd/PGBackend.h @@ -0,0 +1,633 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013,2014 Inktank Storage, Inc. + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PGBACKEND_H +#define PGBACKEND_H + +#include "osd_types.h" +#include "common/WorkQueue.h" +#include "include/Context.h" +#include "os/ObjectStore.h" +#include "common/LogClient.h" +#include <string> +#include "PGTransaction.h" + +namespace Scrub { + class Store; +} +struct shard_info_wrapper; +struct inconsistent_obj_wrapper; + +//forward declaration +class OSDMap; +class PGLog; +typedef std::shared_ptr<const OSDMap> OSDMapRef; + + /** + * PGBackend + * + * PGBackend defines an interface for logic handling IO and + * replication on RADOS objects. The PGBackend implementation + * is responsible for: + * + * 1) Handling client operations + * 2) Handling object recovery + * 3) Handling object access + * 4) Handling scrub, deep-scrub, repair + */ + class PGBackend { + public: + CephContext* cct; + protected: + ObjectStore *store; + const coll_t coll; + ObjectStore::CollectionHandle &ch; + public: + /** + * Provides interfaces for PGBackend callbacks + * + * The intention is that the parent calls into the PGBackend + * implementation holding a lock and that the callbacks are + * called under the same locks. 
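 * (In this tree PrimaryLogPG is the concrete Listener; the lock in question
 * is the PG lock, and callbacks such as on_local_recover() are invoked while
 * it is still held.)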
+ */ + class Listener { + public: + /// Debugging + virtual DoutPrefixProvider *get_dpp() = 0; + + /// Recovery + + /** + * Called with the transaction recovering oid + */ + virtual void on_local_recover( + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info, + ObjectContextRef obc, + bool is_delete, + ObjectStore::Transaction *t + ) = 0; + + /** + * Called when transaction recovering oid is durable and + * applied on all replicas + */ + virtual void on_global_recover( + const hobject_t &oid, + const object_stat_sum_t &stat_diff, + bool is_delete + ) = 0; + + /** + * Called when peer is recovered + */ + virtual void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info + ) = 0; + + virtual void begin_peer_recover( + pg_shard_t peer, + const hobject_t oid) = 0; + + virtual void failed_push(const list<pg_shard_t> &from, + const hobject_t &soid, + const eversion_t &need = eversion_t()) = 0; + virtual void finish_degraded_object(const hobject_t oid) = 0; + virtual void primary_failed(const hobject_t &soid) = 0; + virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0; + virtual void cancel_pull(const hobject_t &soid) = 0; + + virtual void apply_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) = 0; + + /** + * Called when a read on the primary fails when pushing + */ + virtual void on_primary_error( + const hobject_t &oid, + eversion_t v + ) = 0; + + virtual void backfill_add_missing( + const hobject_t &oid, + eversion_t v + ) = 0; + + virtual void remove_missing_object(const hobject_t &oid, + eversion_t v, + Context *on_complete) = 0; + + + /** + * Bless a context + * + * Wraps a context in whatever outer layers the parent usually + * uses to call into the PGBackend + */ + virtual Context *bless_context(Context *c) = 0; + virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext( + GenContext<ThreadPool::TPHandle&> *c) = 0; + virtual GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext( + GenContext<ThreadPool::TPHandle&> *c) = 0; + + virtual void send_message(int to_osd, Message *m) = 0; + virtual void queue_transaction( + ObjectStore::Transaction&& t, + OpRequestRef op = OpRequestRef() + ) = 0; + virtual void queue_transactions( + vector<ObjectStore::Transaction>& tls, + OpRequestRef op = OpRequestRef() + ) = 0; + virtual epoch_t get_interval_start_epoch() const = 0; + virtual epoch_t get_last_peering_reset_epoch() const = 0; + + virtual const set<pg_shard_t> &get_acting_recovery_backfill_shards() const = 0; + virtual const set<pg_shard_t> &get_acting_shards() const = 0; + virtual const set<pg_shard_t> &get_backfill_shards() const = 0; + + virtual std::ostream& gen_dbg_prefix(std::ostream& out) const = 0; + + virtual const map<hobject_t, set<pg_shard_t>> &get_missing_loc_shards() + const = 0; + + virtual const pg_missing_tracker_t &get_local_missing() const = 0; + virtual void add_local_next_event(const pg_log_entry_t& e) = 0; + virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing() + const = 0; + virtual boost::optional<const pg_missing_const_i &> maybe_get_shard_missing( + pg_shard_t peer) const { + if (peer == primary_shard()) { + return get_local_missing(); + } else { + map<pg_shard_t, pg_missing_t>::const_iterator i = + get_shard_missing().find(peer); + if (i == get_shard_missing().end()) { + return boost::optional<const pg_missing_const_i &>(); + } else { + return i->second; + } + } + } + virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const { + auto 
m = maybe_get_shard_missing(peer); + ceph_assert(m); + return *m; + } + + virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0; + virtual const pg_info_t &get_shard_info(pg_shard_t peer) const { + if (peer == primary_shard()) { + return get_info(); + } else { + map<pg_shard_t, pg_info_t>::const_iterator i = + get_shard_info().find(peer); + ceph_assert(i != get_shard_info().end()); + return i->second; + } + } + + virtual const PGLog &get_log() const = 0; + virtual bool pgb_is_primary() const = 0; + virtual const OSDMapRef& pgb_get_osdmap() const = 0; + virtual epoch_t pgb_get_osdmap_epoch() const = 0; + virtual const pg_info_t &get_info() const = 0; + virtual const pg_pool_t &get_pool() const = 0; + + virtual ObjectContextRef get_obc( + const hobject_t &hoid, + const map<string, bufferlist> &attrs) = 0; + + virtual bool try_lock_for_read( + const hobject_t &hoid, + ObcLockManager &manager) = 0; + + virtual void release_locks(ObcLockManager &manager) = 0; + + virtual void op_applied( + const eversion_t &applied_version) = 0; + + virtual bool should_send_op( + pg_shard_t peer, + const hobject_t &hoid) = 0; + + virtual bool pg_is_undersized() const = 0; + virtual bool pg_is_repair() const = 0; + + virtual void log_operation( + const vector<pg_log_entry_t> &logv, + const boost::optional<pg_hit_set_history_t> &hset_history, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + bool transaction_applied, + ObjectStore::Transaction &t, + bool async = false) = 0; + + virtual void pgb_set_object_snap_mapping( + const hobject_t &soid, + const set<snapid_t> &snaps, + ObjectStore::Transaction *t) = 0; + + virtual void pgb_clear_object_snap_mapping( + const hobject_t &soid, + ObjectStore::Transaction *t) = 0; + + virtual void update_peer_last_complete_ondisk( + pg_shard_t fromosd, + eversion_t lcod) = 0; + + virtual void update_last_complete_ondisk( + eversion_t lcod) = 0; + + virtual void update_stats( + const pg_stat_t &stat) = 0; + + virtual void schedule_recovery_work( + GenContext<ThreadPool::TPHandle&> *c) = 0; + + virtual pg_shard_t whoami_shard() const = 0; + int whoami() const { + return whoami_shard().osd; + } + spg_t whoami_spg_t() const { + return get_info().pgid; + } + + virtual spg_t primary_spg_t() const = 0; + virtual pg_shard_t primary_shard() const = 0; + + virtual uint64_t min_upacting_features() const = 0; + virtual hobject_t get_temp_recovery_object(const hobject_t& target, + eversion_t version) = 0; + + virtual void send_message_osd_cluster( + int peer, Message *m, epoch_t from_epoch) = 0; + virtual void send_message_osd_cluster( + Message *m, Connection *con) = 0; + virtual void send_message_osd_cluster( + Message *m, const ConnectionRef& con) = 0; + virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0; + virtual entity_name_t get_cluster_msgr_name() = 0; + + virtual PerfCounters *get_logger() = 0; + + virtual ceph_tid_t get_tid() = 0; + + virtual LogClientTemp clog_error() = 0; + virtual LogClientTemp clog_warn() = 0; + + virtual bool check_failsafe_full() = 0; + + virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0; + + virtual bool pg_is_repair() = 0; + virtual void inc_osd_stat_repaired() = 0; + virtual void set_osd_stat_repaired(int64_t) = 0; + virtual bool pg_is_remote_backfilling() = 0; + virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0; + virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0; + virtual void pg_add_num_bytes(int64_t num_bytes) = 0; + virtual void pg_sub_num_bytes(int64_t 
num_bytes) = 0; + virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0; + virtual ~Listener() {} + }; + Listener *parent; + Listener *get_parent() const { return parent; } + PGBackend(CephContext* cct, Listener *l, ObjectStore *store, const coll_t &coll, + ObjectStore::CollectionHandle &ch) : + cct(cct), + store(store), + coll(coll), + ch(ch), + parent(l) {} + bool is_primary() const { return get_parent()->pgb_is_primary(); } + const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); } + epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); } + const pg_info_t &get_info() { return get_parent()->get_info(); } + + std::ostream& gen_prefix(std::ostream& out) const { + return parent->gen_dbg_prefix(out); + } + + /** + * RecoveryHandle + * + * We may want to recover multiple objects in the same set of + * messages. RecoveryHandle is an interface for the opaque + * object used by the implementation to store the details of + * the pending recovery operations. + */ + struct RecoveryHandle { + bool cache_dont_need; + map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > deletes; + + RecoveryHandle(): cache_dont_need(false) {} + virtual ~RecoveryHandle() {} + }; + + /// Get a fresh recovery operation + virtual RecoveryHandle *open_recovery_op() = 0; + + /// run_recovery_op: finish the operation represented by h + virtual void run_recovery_op( + RecoveryHandle *h, ///< [in] op to finish + int priority ///< [in] msg priority + ) = 0; + + void recover_delete_object(const hobject_t &oid, eversion_t v, + RecoveryHandle *h); + void send_recovery_deletes(int prio, + const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes); + + /** + * recover_object + * + * Triggers a recovery operation on the specified hobject_t + * onreadable must be called before onwriteable + * + * On each replica (primary included), get_parent()->on_not_missing() + * must be called when the transaction finalizing the recovery + * is queued. Similarly, get_parent()->on_readable() must be called + * when the transaction is applied in the backing store. + * + * get_parent()->on_not_degraded() should be called on the primary + * when writes can resume on the object. + * + * obc may be NULL if the primary lacks the object. 
+ * + * head may be NULL only if the head/snapdir is missing + * + * @param missing [in] set of info, missing pairs for queried nodes + * @param overlaps [in] mapping of object to file offset overlaps + */ + virtual int recover_object( + const hobject_t &hoid, ///< [in] object to recover + eversion_t v, ///< [in] version to recover + ObjectContextRef head, ///< [in] context of the head/snapdir object + ObjectContextRef obc, ///< [in] context of the object + RecoveryHandle *h ///< [in,out] handle to attach recovery op to + ) = 0; + + /** + * true if PGBackend can handle this message while inactive + * + * If it returns true, handle_message *must* also return true + */ + virtual bool can_handle_while_inactive(OpRequestRef op) = 0; + + /// gives PGBackend a crack at an incoming message + bool handle_message( + OpRequestRef op ///< [in] message received + ); ///< @return true if the message was handled + + /// the variant of handle_message that is overridden by child classes + virtual bool _handle_message(OpRequestRef op) = 0; + + virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0; + + + /** + * clean up any temporary on-disk state due to a pg interval change + */ + void on_change_cleanup(ObjectStore::Transaction *t); + /** + * implementation should clear itself, contexts blessed prior to on_change + * won't be called after on_change() + */ + virtual void on_change() = 0; + virtual void clear_recovery_state() = 0; + + virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0; + virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0; + virtual int get_ec_data_chunk_count() const { return 0; }; + virtual int get_ec_stripe_chunk_size() const { return 0; }; + + virtual void dump_recovery_info(Formatter *f) const = 0; + + private: + set<hobject_t> temp_contents; + public: + // Track contents of temp collection, clear on reset + void add_temp_obj(const hobject_t &oid) { + temp_contents.insert(oid); + } + void add_temp_objs(const set<hobject_t> &oids) { + temp_contents.insert(oids.begin(), oids.end()); + } + void clear_temp_obj(const hobject_t &oid) { + temp_contents.erase(oid); + } + void clear_temp_objs(const set<hobject_t> &oids) { + for (set<hobject_t>::const_iterator i = oids.begin(); + i != oids.end(); + ++i) { + temp_contents.erase(*i); + } + } + + virtual ~PGBackend() {} + + /// execute implementation specific transaction + virtual void submit_transaction( + const hobject_t &hoid, ///< [in] object + const object_stat_sum_t &delta_stats,///< [in] stat change + const eversion_t &at_version, ///< [in] version + PGTransactionUPtr &&t, ///< [in] trans to execute (move) + const eversion_t &trim_to, ///< [in] trim log to here + const eversion_t &roll_forward_to, ///< [in] trim rollback info to here + const vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t + /// [in] hitset history (if updated with this transaction) + boost::optional<pg_hit_set_history_t> &hset_history, + Context *on_all_commit, ///< [in] called when all commit + ceph_tid_t tid, ///< [in] tid + osd_reqid_t reqid, ///< [in] reqid + OpRequestRef op ///< [in] op + ) = 0; + + /// submit callback to be called in order with pending writes + virtual void call_write_ordered(std::function<void(void)> &&cb) = 0; + + void try_stash( + const hobject_t &hoid, + version_t v, + ObjectStore::Transaction *t); + + void rollback( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t); + + friend class LRBTrimmer; + void rollforward( + const pg_log_entry_t &entry, + 
ObjectStore::Transaction *t); + + void trim( + const pg_log_entry_t &entry, + ObjectStore::Transaction *t); + + void remove( + const hobject_t &hoid, + ObjectStore::Transaction *t); + + protected: + + void handle_recovery_delete(OpRequestRef op); + void handle_recovery_delete_reply(OpRequestRef op); + + /// Reapply old attributes + void rollback_setattrs( + const hobject_t &hoid, + map<string, boost::optional<bufferlist> > &old_attrs, + ObjectStore::Transaction *t); + + /// Truncate object to rollback append + virtual void rollback_append( + const hobject_t &hoid, + uint64_t old_size, + ObjectStore::Transaction *t); + + /// Unstash object to rollback stash + void rollback_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t); + + /// Unstash object to rollback stash + void rollback_try_stash( + const hobject_t &hoid, + version_t old_version, + ObjectStore::Transaction *t); + + /// Delete object to rollback create + void rollback_create( + const hobject_t &hoid, + ObjectStore::Transaction *t) { + remove(hoid, t); + } + + /// Clone the extents back into place + void rollback_extents( + version_t gen, + const vector<pair<uint64_t, uint64_t> > &extents, + const hobject_t &hoid, + ObjectStore::Transaction *t); + public: + + /// Trim object stashed at version + void trim_rollback_object( + const hobject_t &hoid, + version_t gen, + ObjectStore::Transaction *t); + + /// List objects in collection + int objects_list_partial( + const hobject_t &begin, + int min, + int max, + vector<hobject_t> *ls, + hobject_t *next); + + int objects_list_range( + const hobject_t &start, + const hobject_t &end, + vector<hobject_t> *ls, + vector<ghobject_t> *gen_obs=0); + + int objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out); + + virtual int objects_get_attrs( + const hobject_t &hoid, + map<string, bufferlist> *out); + + virtual int objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) = 0; + + virtual void objects_read_async( + const hobject_t &hoid, + const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>, + pair<bufferlist*, Context*> > > &to_read, + Context *on_complete, bool fast_read = false) = 0; + + virtual bool auto_repair_supported() const = 0; + int be_scan_list( + ScrubMap &map, + ScrubMapBuilder &pos); + bool be_compare_scrub_objects( + pg_shard_t auth_shard, + const ScrubMap::object &auth, + const object_info_t& auth_oi, + const ScrubMap::object &candidate, + shard_info_wrapper& shard_error, + inconsistent_obj_wrapper &result, + ostream &errorstream, + bool has_snapset); + map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object( + const hobject_t &obj, + const map<pg_shard_t,ScrubMap*> &maps, + object_info_t *auth_oi, + map<pg_shard_t, shard_info_wrapper> &shard_map, + bool &digest_match, + spg_t pgid, + ostream &errorstream); + void be_compare_scrubmaps( + const map<pg_shard_t,ScrubMap*> &maps, + const set<hobject_t> &master_set, + bool repair, + map<hobject_t, set<pg_shard_t>> &missing, + map<hobject_t, set<pg_shard_t>> &inconsistent, + map<hobject_t, list<pg_shard_t>> &authoritative, + map<hobject_t, pair<boost::optional<uint32_t>, + boost::optional<uint32_t>>> &missing_digest, + int &shallow_errors, int &deep_errors, + Scrub::Store *store, + const spg_t& pgid, + const vector<int> &acting, + ostream &errorstream); + virtual uint64_t be_get_ondisk_size( + uint64_t logical_size) = 0; + virtual int be_deep_scrub( + const hobject_t &oid, + ScrubMap &map, + 
ScrubMapBuilder &pos, + ScrubMap::object &o) = 0; + void be_omap_checks( + const map<pg_shard_t,ScrubMap*> &maps, + const set<hobject_t> &master_set, + omap_stat_t& omap_stats, + ostream &warnstream) const; + + static PGBackend *build_pg_backend( + const pg_pool_t &pool, + const map<string,string>& profile, + Listener *l, + coll_t coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct); +}; + +#endif diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc new file mode 100644 index 00000000..3aeeb55e --- /dev/null +++ b/src/osd/PGLog.cc @@ -0,0 +1,1018 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "PGLog.h" +#include "include/unordered_map.h" +#include "common/ceph_context.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +static ostream& _prefix(std::ostream *_dout, const PGLog *pglog) +{ + return pglog->gen_prefix(*_dout); +} + +//////////////////// PGLog::IndexedLog //////////////////// + +void PGLog::IndexedLog::split_out_child( + pg_t child_pgid, + unsigned split_bits, + PGLog::IndexedLog *target) +{ + unindex(); + *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits)); + index(); + target->index(); + reset_rollback_info_trimmed_to_riter(); +} + +void PGLog::IndexedLog::trim( + CephContext* cct, + eversion_t s, + set<eversion_t> *trimmed, + set<string>* trimmed_dups, + eversion_t *write_from_dups) +{ + ceph_assert(s <= can_rollback_to); + if (complete_to != log.end()) + lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl; + + auto earliest_dup_version = + log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked + ? 
0u + : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked + 1; + + lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl; + while (!log.empty()) { + const pg_log_entry_t &e = *log.begin(); + if (e.version > s) + break; + lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl; + if (trimmed) + trimmed->emplace(e.version); + + unindex(e); // remove from index, + + // add to dup list + if (e.version.version >= earliest_dup_version) { + if (write_from_dups != nullptr && *write_from_dups > e.version) { + lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl; + *write_from_dups = e.version; + } + dups.push_back(pg_log_dup_t(e)); + index(dups.back()); + uint32_t idx = 0; + for (const auto& extra : e.extra_reqids) { + int return_code = e.return_code; + if (return_code >= 0) { + auto it = e.extra_reqid_return_codes.find(idx); + if (it != e.extra_reqid_return_codes.end()) { + return_code = it->second; + } + } + ++idx; + + // note: extras have the same version as outer op + dups.push_back(pg_log_dup_t(e.version, extra.second, + extra.first, return_code)); + index(dups.back()); + } + } + + bool reset_complete_to = false; + // we are trimming past complete_to, so reset complete_to + if (complete_to != log.end() && e.version >= complete_to->version) + reset_complete_to = true; + if (rollback_info_trimmed_to_riter == log.rend() || + e.version == rollback_info_trimmed_to_riter->version) { + log.pop_front(); + rollback_info_trimmed_to_riter = log.rend(); + } else { + log.pop_front(); + } + + // reset complete_to to the beginning of the log + if (reset_complete_to) { + complete_to = log.begin(); + if (complete_to != log.end()) { + lgeneric_subdout(cct, osd, 20) << " moving complete_to to " + << log.begin()->version << dendl; + } else { + lgeneric_subdout(cct, osd, 20) << " log is now empty" << dendl; + } + } + } + + while (!dups.empty()) { + const auto& e = *dups.begin(); + if (e.version.version >= earliest_dup_version) + break; + lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl; + if (trimmed_dups) + trimmed_dups->insert(e.get_key_name()); + unindex(e); + dups.pop_front(); + } + + // raise tail? + if (tail < s) + tail = s; +} + +ostream& PGLog::IndexedLog::print(ostream& out) const +{ + out << *this << std::endl; + for (list<pg_log_entry_t>::const_iterator p = log.begin(); + p != log.end(); + ++p) { + out << *p << " " << + (logged_object(p->soid) ? "indexed" : "NOT INDEXED") << + std::endl; + ceph_assert(!p->reqid_is_indexed() || logged_req(p->reqid)); + } + + for (list<pg_log_dup_t>::const_iterator p = dups.begin(); + p != dups.end(); + ++p) { + out << *p << std::endl; + } + + return out; +} + +//////////////////// PGLog //////////////////// + +void PGLog::reset_backfill() +{ + missing.clear(); +} + +void PGLog::clear() { + missing.clear(); + log.clear(); + log_keys_debug.clear(); + undirty(); +} + +void PGLog::clear_info_log( + spg_t pgid, + ObjectStore::Transaction *t) { + coll_t coll(pgid); + t->remove(coll, pgid.make_pgmeta_oid()); +} + +void PGLog::trim( + eversion_t trim_to, + pg_info_t &info, + bool transaction_applied, + bool async) +{ + dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl; + // trim? 
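A side effect of the trim above: entries whose version falls within the last osd_pg_log_dups_tracked versions below the head are not simply dropped, they are converted into lightweight dup records so their request ids remain discoverable after the full entries are gone. A toy model of that bookkeeping, using plain integer versions and std::deque rather than Ceph's types:

    // Toy model of the dup bookkeeping in IndexedLog::trim(): entries trimmed
    // off the tail are kept as lightweight "dups" if they fall within the last
    // dups_tracked versions below the log head.
    #include <cstdint>
    #include <deque>
    #include <iostream>

    int main() {
      std::deque<uint64_t> log = {10, 11, 12, 13, 14, 15};  // versions, oldest first
      std::deque<uint64_t> dups;
      const uint64_t dups_tracked = 3;   // stands in for osd_pg_log_dups_tracked
      const uint64_t trim_to = 13;

      uint64_t head = log.back();
      uint64_t earliest_dup_version =
          head < dups_tracked ? 0 : head - dups_tracked + 1;  // 13 here

      while (!log.empty() && log.front() <= trim_to) {
        if (log.front() >= earliest_dup_version)
          dups.push_back(log.front());   // preserve as a dup record
        log.pop_front();                 // drop the full entry
      }

      std::cout << "oldest remaining entry: " << log.front() << "\n";  // 14
      std::cout << "dups kept: " << dups.size() << "\n";               // 1 (just 13)
    }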
+ if (trim_to > log.tail) { + dout(10) << __func__ << " missing = " << missing.num_missing() << dendl; + // Don't assert for async_recovery_targets or backfill_targets + // or whenever there are missing items + if (transaction_applied && !async && (missing.num_missing() == 0)) + ceph_assert(trim_to <= info.last_complete); + + dout(10) << "trim " << log << " to " << trim_to << dendl; + log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups); + info.log_tail = log.tail; + if (log.complete_to != log.log.end()) + dout(10) << " after trim complete_to " << log.complete_to->version << dendl; + } +} + +void PGLog::proc_replica_log( + pg_info_t &oinfo, + const pg_log_t &olog, + pg_missing_t& omissing, + pg_shard_t from) const +{ + dout(10) << "proc_replica_log for osd." << from << ": " + << oinfo << " " << olog << " " << omissing << dendl; + + if (olog.head < log.tail) { + dout(10) << __func__ << ": osd." << from << " does not overlap, not looking " + << "for divergent objects" << dendl; + return; + } + if (olog.head == log.head) { + dout(10) << __func__ << ": osd." << from << " same log head, not looking " + << "for divergent objects" << dendl; + return; + } + + /* + basically what we're doing here is rewinding the remote log, + dropping divergent entries, until we find something that matches + our master log. we then reset last_update to reflect the new + point up to which missing is accurate. + + later, in activate(), missing will get wound forward again and + we will send the peer enough log to arrive at the same state. + */ + + for (map<hobject_t, pg_missing_item>::const_iterator i = omissing.get_items().begin(); + i != omissing.get_items().end(); + ++i) { + dout(20) << " before missing " << i->first << " need " << i->second.need + << " have " << i->second.have << dendl; + } + + list<pg_log_entry_t>::const_reverse_iterator first_non_divergent = + log.log.rbegin(); + while (1) { + if (first_non_divergent == log.log.rend()) + break; + if (first_non_divergent->version <= olog.head) { + dout(20) << "merge_log point (usually last shared) is " + << *first_non_divergent << dendl; + break; + } + ++first_non_divergent; + } + + /* Because olog.head >= log.tail, we know that both pgs must at least have + * the event represented by log.tail. Similarly, because log.head >= olog.tail, + * we know that the even represented by olog.tail must be common to both logs. + * Furthermore, the event represented by a log tail was necessarily trimmed, + * thus neither olog.tail nor log.tail can be divergent. It's + * possible that olog/log contain no actual events between olog.head and + * max(log.tail, olog.tail), however, since they might have been split out. + * Thus, if we cannot find an event e such that + * log.tail <= e.version <= log.head, the last_update must actually be + * max(log.tail, olog.tail). + */ + eversion_t limit = std::max(olog.tail, log.tail); + eversion_t lu = + (first_non_divergent == log.log.rend() || + first_non_divergent->version < limit) ? + limit : + first_non_divergent->version; + + // we merge and adjust the replica's log, rollback the rollbackable divergent entry, + // remove the unrollbackable divergent entry and mark the according object as missing. + // the rollback boundary must choose crt of the olog which going to be merged. + // The replica log's(olog) crt will not be modified, so it could get passed + // to _merge_divergent_entries() directly. 
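Put differently, lu is the newest version the primary and the replica are known to share; everything the replica holds past lu is divergent and is rewound and re-merged below. A toy model of that search, with (epoch, version) pairs standing in for eversion_t (illustrative only, not Ceph's types):

    // Toy model of the last-shared-point search in proc_replica_log(): walk our
    // log from newest to oldest and stop at the first entry the replica is
    // guaranteed to have (version <= olog.head).
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    using ever = std::pair<uint32_t, uint64_t>;  // (epoch, version), compared lexicographically

    int main() {
      // the primary moved on in epoch 2 while the replica kept writing in epoch 1
      std::vector<ever> primary = {{1,10},{1,11},{1,12},{1,13},{2,14},{2,15}};
      std::vector<ever> replica = {{1,10},{1,11},{1,12},{1,13},{1,14},{1,15}};
      ever olog_head = replica.back();

      auto it = std::find_if(primary.rbegin(), primary.rend(),
                             [&](const ever& v) { return v <= olog_head; });
      ever lu = (it != primary.rend()) ? *it : primary.front();
      std::cout << "last shared (lu) = ("
                << lu.first << "," << lu.second << ")\n";  // (1,13)
      // replica entries (1,14) and (1,15) are newer than lu and therefore divergent
    }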
+ IndexedLog folog(olog); + auto divergent = folog.rewind_from_head(lu); + _merge_divergent_entries( + folog, + divergent, + oinfo, + olog.get_can_rollback_to(), + omissing, + 0, + this); + + if (lu < oinfo.last_update) { + dout(10) << " peer osd." << from << " last_update now " << lu << dendl; + oinfo.last_update = lu; + } + + if (omissing.have_missing()) { + eversion_t first_missing = + omissing.get_items().at(omissing.get_rmissing().begin()->second).need; + oinfo.last_complete = eversion_t(); + list<pg_log_entry_t>::const_iterator i = olog.log.begin(); + for (; + i != olog.log.end(); + ++i) { + if (i->version < first_missing) + oinfo.last_complete = i->version; + else + break; + } + } else { + oinfo.last_complete = oinfo.last_update; + } +} // proc_replica_log + +/** + * rewind divergent entries at the head of the log + * + * This rewinds entries off the head of our log that are divergent. + * This is used by replicas during activation. + * + * @param newhead new head to rewind to + */ +void PGLog::rewind_divergent_log(eversion_t newhead, + pg_info_t &info, LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info) +{ + dout(10) << "rewind_divergent_log truncate divergent future " << + newhead << dendl; + + // We need to preserve the original crt before it gets updated in rewind_from_head(). + // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback + // a divergent entry or not. + eversion_t original_crt = log.get_can_rollback_to(); + dout(20) << __func__ << " original_crt = " << original_crt << dendl; + if (info.last_complete > newhead) + info.last_complete = newhead; + + auto divergent = log.rewind_from_head(newhead); + if (!divergent.empty()) { + mark_dirty_from(divergent.front().version); + } + for (auto &&entry: divergent) { + dout(10) << "rewind_divergent_log future divergent " << entry << dendl; + } + info.last_update = newhead; + + _merge_divergent_entries( + log, + divergent, + info, + original_crt, + missing, + rollbacker, + this); + + dirty_info = true; + dirty_big_info = true; +} + +void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd, + pg_info_t &info, LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info) +{ + dout(10) << "merge_log " << olog << " from osd." << fromosd + << " into " << log << dendl; + + // Check preconditions + + // If our log is empty, the incoming log needs to have not been trimmed. + ceph_assert(!log.null() || olog.tail == eversion_t()); + // The logs must overlap. + ceph_assert(log.head >= olog.tail && olog.head >= log.tail); + + for (map<hobject_t, pg_missing_item>::const_iterator i = missing.get_items().begin(); + i != missing.get_items().end(); + ++i) { + dout(20) << "pg_missing_t sobject: " << i->first << dendl; + } + + bool changed = false; + + // extend on tail? + // this is just filling in history. it does not affect our + // missing set, as that should already be consistent with our + // current log. + eversion_t orig_tail = log.tail; + if (olog.tail < log.tail) { + dout(10) << "merge_log extending tail to " << olog.tail << dendl; + list<pg_log_entry_t>::iterator from = olog.log.begin(); + list<pg_log_entry_t>::iterator to; + eversion_t last; + for (to = from; + to != olog.log.end(); + ++to) { + if (to->version > log.tail) + break; + log.index(*to); + dout(15) << *to << dendl; + last = to->version; + } + mark_dirty_to(last); + + // splice into our log. 
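The splice that follows moves those already-indexed older entries in front of our own log without copying them; the same pattern with a plain std::list (the real code uses a mempool-backed list keyed by eversion_t):

    // Minimal sketch of extending a log's tail by splicing older history from a
    // peer log in front of our own; std::list::splice relinks nodes, it does not copy.
    #include <iostream>
    #include <list>

    int main() {
      std::list<int> log  = {13, 14, 15};          // our log
      std::list<int> olog = {10, 11, 12, 13, 14};  // peer log with older history

      // take only the peer entries strictly older than anything we already have
      auto from = olog.begin();
      auto to = from;
      while (to != olog.end() && *to < log.front())
        ++to;

      log.splice(log.begin(), olog, from, to);     // move [from, to) to our front

      for (int v : log) std::cout << v << ' ';     // 10 11 12 13 14 15
      std::cout << '\n';
    }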
+ log.log.splice(log.log.begin(), + olog.log, from, to); + + info.log_tail = log.tail = olog.tail; + changed = true; + } + + if (oinfo.stats.reported_seq < info.stats.reported_seq || // make sure reported always increases + oinfo.stats.reported_epoch < info.stats.reported_epoch) { + oinfo.stats.reported_seq = info.stats.reported_seq; + oinfo.stats.reported_epoch = info.stats.reported_epoch; + } + if (info.last_backfill.is_max()) + info.stats = oinfo.stats; + info.hit_set = oinfo.hit_set; + + // do we have divergent entries to throw out? + if (olog.head < log.head) { + rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info); + changed = true; + } + + // extend on head? + if (olog.head > log.head) { + dout(10) << "merge_log extending head to " << olog.head << dendl; + + // find start point in olog + list<pg_log_entry_t>::iterator to = olog.log.end(); + list<pg_log_entry_t>::iterator from = olog.log.end(); + eversion_t lower_bound = std::max(olog.tail, orig_tail); + while (1) { + if (from == olog.log.begin()) + break; + --from; + dout(20) << " ? " << *from << dendl; + if (from->version <= log.head) { + lower_bound = std::max(lower_bound, from->version); + ++from; + break; + } + } + dout(20) << "merge_log cut point (usually last shared) is " + << lower_bound << dendl; + mark_dirty_from(lower_bound); + + // We need to preserve the original crt before it gets updated in rewind_from_head(). + // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback + // a divergent entry or not. + eversion_t original_crt = log.get_can_rollback_to(); + dout(20) << __func__ << " original_crt = " << original_crt << dendl; + auto divergent = log.rewind_from_head(lower_bound); + // move aside divergent items + for (auto &&oe: divergent) { + dout(10) << "merge_log divergent " << oe << dendl; + } + log.roll_forward_to(log.head, rollbacker); + + mempool::osd_pglog::list<pg_log_entry_t> new_entries; + new_entries.splice(new_entries.end(), olog.log, from, to); + append_log_entries_update_missing( + info.last_backfill, + info.last_backfill_bitwise, + new_entries, + false, + &log, + missing, + rollbacker, + this); + + _merge_divergent_entries( + log, + divergent, + info, + original_crt, + missing, + rollbacker, + this); + + info.last_update = log.head = olog.head; + + // We cannot rollback into the new log entries + log.skip_can_rollback_to_to_head(); + + info.last_user_version = oinfo.last_user_version; + info.purged_snaps = oinfo.purged_snaps; + // update num_missing too + // we might have appended some more missing objects above + info.stats.stats.sum.num_objects_missing = missing.num_missing(); + + changed = true; + } + + // now handle dups + if (merge_log_dups(olog)) { + changed = true; + } + + dout(10) << "merge_log result " << log << " " << missing << + " changed=" << changed << dendl; + + if (changed) { + dirty_info = true; + dirty_big_info = true; + } +} + + +// returns true if any changes were made to log.dups +bool PGLog::merge_log_dups(const pg_log_t& olog) { + bool changed = false; + + if (!olog.dups.empty()) { + if (log.dups.empty()) { + dout(10) << "merge_log copying olog dups to log " << + olog.dups.front().version << " to " << + olog.dups.back().version << dendl; + changed = true; + dirty_from_dups = eversion_t(); + dirty_to_dups = eversion_t::max(); + // since our log.dups is empty just copy them + for (const auto& i : olog.dups) { + log.dups.push_back(i); + log.index(log.dups.back()); + } + } else { + // since our log.dups is not empty try to extend on 
each end + + if (olog.dups.back().version > log.dups.back().version) { + // extend the dups's tail (i.e., newer dups) + dout(10) << "merge_log extending dups tail to " << + olog.dups.back().version << dendl; + changed = true; + + auto log_tail_version = log.dups.back().version; + + auto insert_cursor = log.dups.end(); + eversion_t last_shared = eversion_t::max(); + for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) { + if (i->version <= log_tail_version) break; + log.dups.insert(insert_cursor, *i); + last_shared = i->version; + + auto prev = insert_cursor; + --prev; + // be sure to pass reference of copy in log.dups + log.index(*prev); + + --insert_cursor; // make sure we insert in reverse order + } + mark_dirty_from_dups(last_shared); + } + + if (olog.dups.front().version < log.dups.front().version) { + // extend the dups's head (i.e., older dups) + dout(10) << "merge_log extending dups head to " << + olog.dups.front().version << dendl; + changed = true; + + eversion_t last; + auto insert_cursor = log.dups.begin(); + for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) { + if (i->version >= insert_cursor->version) break; + log.dups.insert(insert_cursor, *i); + last = i->version; + auto prev = insert_cursor; + --prev; + // be sure to pass address of copy in log.dups + log.index(*prev); + } + mark_dirty_to_dups(last); + } + } + } + + // remove any dup entries that overlap with pglog + if (!log.dups.empty() && log.dups.back().version > log.tail) { + dout(10) << "merge_log removed dups overlapping log entries (" << + log.tail << "," << log.dups.back().version << "]" << dendl; + changed = true; + + while (!log.dups.empty() && log.dups.back().version > log.tail) { + log.unindex(log.dups.back()); + mark_dirty_from_dups(log.dups.back().version); + log.dups.pop_back(); + } + } + + return changed; +} + +void PGLog::check() { + if (!pg_log_debug) + return; + if (log.log.size() != log_keys_debug.size()) { + derr << "log.log.size() != log_keys_debug.size()" << dendl; + derr << "actual log:" << dendl; + for (list<pg_log_entry_t>::iterator i = log.log.begin(); + i != log.log.end(); + ++i) { + derr << " " << *i << dendl; + } + derr << "log_keys_debug:" << dendl; + for (set<string>::const_iterator i = log_keys_debug.begin(); + i != log_keys_debug.end(); + ++i) { + derr << " " << *i << dendl; + } + } + ceph_assert(log.log.size() == log_keys_debug.size()); + for (list<pg_log_entry_t>::iterator i = log.log.begin(); + i != log.log.end(); + ++i) { + ceph_assert(log_keys_debug.count(i->get_key_name())); + } +} + +// non-static +void PGLog::write_log_and_missing( + ObjectStore::Transaction& t, + map<string,bufferlist> *km, + const coll_t& coll, + const ghobject_t &log_oid, + bool require_rollback) +{ + if (is_dirty()) { + dout(6) << "write_log_and_missing with: " + << "dirty_to: " << dirty_to + << ", dirty_from: " << dirty_from + << ", writeout_from: " << writeout_from + << ", trimmed: " << trimmed + << ", trimmed_dups: " << trimmed_dups + << ", clear_divergent_priors: " << clear_divergent_priors + << dendl; + _write_log_and_missing( + t, km, log, coll, log_oid, + dirty_to, + dirty_from, + writeout_from, + std::move(trimmed), + std::move(trimmed_dups), + missing, + !touched_log, + require_rollback, + clear_divergent_priors, + dirty_to_dups, + dirty_from_dups, + write_from_dups, + &rebuilt_missing_with_deletes, + (pg_log_debug ? 
&log_keys_debug : nullptr)); + undirty(); + } else { + dout(10) << "log is not dirty" << dendl; + } +} + +// static +void PGLog::write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map<string,bufferlist> *km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + map<eversion_t, hobject_t> &divergent_priors, + bool require_rollback + ) +{ + _write_log_and_missing_wo_missing( + t, km, log, coll, log_oid, + divergent_priors, eversion_t::max(), eversion_t(), eversion_t(), + true, true, require_rollback, + eversion_t::max(), eversion_t(), eversion_t(), nullptr); +} + +// static +void PGLog::write_log_and_missing( + ObjectStore::Transaction& t, + map<string,bufferlist> *km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, + const pg_missing_tracker_t &missing, + bool require_rollback, + bool *rebuilt_missing_with_deletes) +{ + _write_log_and_missing( + t, km, log, coll, log_oid, + eversion_t::max(), + eversion_t(), + eversion_t(), + set<eversion_t>(), + set<string>(), + missing, + true, require_rollback, false, + eversion_t::max(), + eversion_t(), + eversion_t(), + rebuilt_missing_with_deletes, nullptr); +} + +// static +void PGLog::_write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map<string,bufferlist> *km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + map<eversion_t, hobject_t> &divergent_priors, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + bool dirty_divergent_priors, + bool touch_log, + bool require_rollback, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + set<string> *log_keys_debug + ) +{ + // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl; + if (touch_log) + t.touch(coll, log_oid); + if (dirty_to != eversion_t()) { + t.omap_rmkeyrange( + coll, log_oid, + eversion_t().get_key_name(), dirty_to.get_key_name()); + clear_up_to(log_keys_debug, dirty_to.get_key_name()); + } + if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) { + // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from.get_key_name(), eversion_t::max().get_key_name()); + clear_after(log_keys_debug, dirty_from.get_key_name()); + } + + for (list<pg_log_entry_t>::iterator p = log.log.begin(); + p != log.log.end() && p->version <= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + (*km)[p->get_key_name()].claim(bl); + } + + for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin(); + p != log.log.rend() && + (p->version >= dirty_from || p->version >= writeout_from) && + p->version >= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + (*km)[p->get_key_name()].claim(bl); + } + + if (log_keys_debug) { + for (map<string, bufferlist>::iterator i = (*km).begin(); + i != (*km).end(); + ++i) { + if (i->first[0] == '_') + continue; + ceph_assert(!log_keys_debug->count(i->first)); + log_keys_debug->insert(i->first); + } + } + + // process dups after log_keys_debug is filled, so dups do not + // end up in that set + if (dirty_to_dups != eversion_t()) { + pg_log_dup_t min, dirty_to_dup; + dirty_to_dup.version = dirty_to_dups; + t.omap_rmkeyrange( + coll, log_oid, + min.get_key_name(), dirty_to_dup.get_key_name()); + } + if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) { + pg_log_dup_t max, dirty_from_dup; + max.version = eversion_t::max(); + 
dirty_from_dup.version = dirty_from_dups; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from_dup.get_key_name(), max.get_key_name()); + } + + for (const auto& entry : log.dups) { + if (entry.version > dirty_to_dups) + break; + bufferlist bl; + encode(entry, bl); + (*km)[entry.get_key_name()].claim(bl); + } + + for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin(); + p != log.dups.rend() && + (p->version >= dirty_from_dups || p->version >= write_from_dups) && + p->version >= dirty_to_dups; + ++p) { + bufferlist bl; + encode(*p, bl); + (*km)[p->get_key_name()].claim(bl); + } + + if (dirty_divergent_priors) { + //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl; + encode(divergent_priors, (*km)["divergent_priors"]); + } + if (require_rollback) { + encode( + log.get_can_rollback_to(), + (*km)["can_rollback_to"]); + encode( + log.get_rollback_info_trimmed_to(), + (*km)["rollback_info_trimmed_to"]); + } +} + +// static +void PGLog::_write_log_and_missing( + ObjectStore::Transaction& t, + map<string,bufferlist>* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + set<eversion_t> &&trimmed, + set<string> &&trimmed_dups, + const pg_missing_tracker_t &missing, + bool touch_log, + bool require_rollback, + bool clear_divergent_priors, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + bool *rebuilt_missing_with_deletes, // in/out param + set<string> *log_keys_debug + ) { + set<string> to_remove; + to_remove.swap(trimmed_dups); + for (auto& t : trimmed) { + string key = t.get_key_name(); + if (log_keys_debug) { + auto it = log_keys_debug->find(key); + ceph_assert(it != log_keys_debug->end()); + log_keys_debug->erase(it); + } + to_remove.emplace(std::move(key)); + } + trimmed.clear(); + + if (touch_log) + t.touch(coll, log_oid); + if (dirty_to != eversion_t()) { + t.omap_rmkeyrange( + coll, log_oid, + eversion_t().get_key_name(), dirty_to.get_key_name()); + clear_up_to(log_keys_debug, dirty_to.get_key_name()); + } + if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) { + // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from.get_key_name(), eversion_t::max().get_key_name()); + clear_after(log_keys_debug, dirty_from.get_key_name()); + } + + for (list<pg_log_entry_t>::iterator p = log.log.begin(); + p != log.log.end() && p->version <= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + (*km)[p->get_key_name()].claim(bl); + } + + for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin(); + p != log.log.rend() && + (p->version >= dirty_from || p->version >= writeout_from) && + p->version >= dirty_to; + ++p) { + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + (*km)[p->get_key_name()].claim(bl); + } + + if (log_keys_debug) { + for (map<string, bufferlist>::iterator i = (*km).begin(); + i != (*km).end(); + ++i) { + if (i->first[0] == '_') + continue; + ceph_assert(!log_keys_debug->count(i->first)); + log_keys_debug->insert(i->first); + } + } + + // process dups after log_keys_debug is filled, so dups do not + // end up in that set + if (dirty_to_dups != eversion_t()) { + pg_log_dup_t min, dirty_to_dup; + dirty_to_dup.version = dirty_to_dups; + t.omap_rmkeyrange( + coll, log_oid, + min.get_key_name(), dirty_to_dup.get_key_name()); + } + if (dirty_to_dups != eversion_t::max() && dirty_from_dups != 
eversion_t::max()) { + pg_log_dup_t max, dirty_from_dup; + max.version = eversion_t::max(); + dirty_from_dup.version = dirty_from_dups; + t.omap_rmkeyrange( + coll, log_oid, + dirty_from_dup.get_key_name(), max.get_key_name()); + } + + for (const auto& entry : log.dups) { + if (entry.version > dirty_to_dups) + break; + bufferlist bl; + encode(entry, bl); + (*km)[entry.get_key_name()].claim(bl); + } + + for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin(); + p != log.dups.rend() && + (p->version >= dirty_from_dups || p->version >= write_from_dups) && + p->version >= dirty_to_dups; + ++p) { + bufferlist bl; + encode(*p, bl); + (*km)[p->get_key_name()].claim(bl); + } + + if (clear_divergent_priors) { + //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl; + to_remove.insert("divergent_priors"); + } + // since we encode individual missing items instead of a whole + // missing set, we need another key to store this bit of state + if (*rebuilt_missing_with_deletes) { + (*km)["may_include_deletes_in_missing"] = bufferlist(); + *rebuilt_missing_with_deletes = false; + } + missing.get_changed( + [&](const hobject_t &obj) { + string key = string("missing/") + obj.to_str(); + pg_missing_item item; + if (!missing.is_missing(obj, &item)) { + to_remove.insert(key); + } else { + uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0; + encode(make_pair(obj, item), (*km)[key], features); + } + }); + if (require_rollback) { + encode( + log.get_can_rollback_to(), + (*km)["can_rollback_to"]); + encode( + log.get_rollback_info_trimmed_to(), + (*km)["rollback_info_trimmed_to"]); + } + + if (!to_remove.empty()) + t.omap_rmkeys(coll, log_oid, to_remove); +} + +void PGLog::rebuild_missing_set_with_deletes( + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + const pg_info_t &info) +{ + // save entries not generated from the current log (e.g. added due + // to repair, EIO handling, or divergent_priors). 
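Besides carrying over those externally added items, the rebuild below walks the log from newest to oldest, considers each object once, and compares the version recorded on disk against the version the log expects. A simplified model of that comparison, with a std::map standing in for the on-disk object_info attributes (not the actual ObjectStore API):

    // Simplified model of the missing-set rebuild: per object (newest log entry
    // wins), an absent or stale on-disk version makes the object "missing".
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    struct LogEntry { std::string oid; uint64_t version; };

    int main() {
      // log iterated newest-first, as in the rebuild
      std::vector<LogEntry> log = {
        {"objB", 15}, {"objA", 14}, {"objB", 12},
      };
      std::map<std::string, uint64_t> on_disk = {{"objB", 12}};  // objA not on disk

      std::map<std::string, std::pair<uint64_t, uint64_t>> missing;  // oid -> (need, have)
      std::set<std::string> did;
      for (const auto& e : log) {
        if (!did.insert(e.oid).second)
          continue;                                  // only the newest entry per object
        auto it = on_disk.find(e.oid);
        if (it == on_disk.end())
          missing[e.oid] = {e.version, 0};           // nothing on disk: have = 0
        else if (it->second < e.version)
          missing[e.oid] = {e.version, it->second};  // stale on disk
      }

      for (const auto& m : missing)
        std::cout << m.first << " need " << m.second.first
                  << " have " << m.second.second << "\n";
      // objA need 14 have 0, objB need 15 have 12
    }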
+ map<hobject_t, pg_missing_item> extra_missing; + for (const auto& p : missing.get_items()) { + if (!log.logged_object(p.first)) { + dout(20) << __func__ << " extra missing entry: " << p.first + << " " << p.second << dendl; + extra_missing[p.first] = p.second; + } + } + missing.clear(); + missing.may_include_deletes = true; + + // go through the log and add items that are not present or older + // versions on disk, just as if we were reading the log + metadata + // off disk originally + set<hobject_t> did; + for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); + i != log.log.rend(); + ++i) { + if (i->version <= info.last_complete) + break; + if (i->soid > info.last_backfill || + i->is_error() || + did.find(i->soid) != did.end()) + continue; + did.insert(i->soid); + + bufferlist bv; + int r = store->getattr( + ch, + ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl; + + if (r >= 0) { + object_info_t oi(bv); + dout(20) << __func__ << " store version = " << oi.version << dendl; + if (oi.version < i->version) { + missing.add(i->soid, i->version, oi.version, i->is_delete()); + } + } else { + missing.add(i->soid, i->version, eversion_t(), i->is_delete()); + } + } + + for (const auto& p : extra_missing) { + missing.add(p.first, p.second.need, p.second.have, p.second.is_delete()); + } + rebuilt_missing_with_deletes = true; +} diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h new file mode 100644 index 00000000..6ff3eae8 --- /dev/null +++ b/src/osd/PGLog.h @@ -0,0 +1,1565 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#pragma once + +// re-include our assert to clobber boost's +#include "include/ceph_assert.h" +#include "osd_types.h" +#include "os/ObjectStore.h" +#include <list> + +constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0; +constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1; +constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2; +constexpr auto PGLOG_INDEXED_DUPS = 1 << 3; +constexpr auto PGLOG_INDEXED_ALL = PGLOG_INDEXED_OBJECTS + | PGLOG_INDEXED_CALLER_OPS + | PGLOG_INDEXED_EXTRA_CALLER_OPS + | PGLOG_INDEXED_DUPS; + +class CephContext; + +struct PGLog : DoutPrefixProvider { + std::ostream& gen_prefix(std::ostream& out) const override { + return out; + } + unsigned get_subsys() const override { + return static_cast<unsigned>(ceph_subsys_osd); + } + CephContext *get_cct() const override { + return cct; + } + + ////////////////////////////// sub classes ////////////////////////////// + struct LogEntryHandler { + virtual void rollback( + const pg_log_entry_t &entry) = 0; + virtual void rollforward( + const pg_log_entry_t &entry) = 0; + virtual void trim( + const pg_log_entry_t &entry) = 0; + virtual void remove( + const hobject_t &hoid) = 0; + virtual void try_stash( + const hobject_t &hoid, + version_t v) = 0; + virtual ~LogEntryHandler() {} + }; + +public: + /** + * IndexLog - adds in-memory index of the log, by oid. + * plus some methods to manipulate it all. 
+ */ + struct IndexedLog : public pg_log_t { + mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful! + mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops; + mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops; + mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index; + + // recovery pointers + list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item + version_t last_requested = 0; // last object requested by primary + + // + private: + mutable __u16 indexed_data = 0; + /** + * rollback_info_trimmed_to_riter points to the first log entry <= + * rollback_info_trimmed_to + * + * It's a reverse_iterator because rend() is a natural representation for + * tail, and rbegin() works nicely for head. + */ + mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator + rollback_info_trimmed_to_riter; + + /* + * return true if we need to mark the pglog as dirty + */ + template <typename F> + bool advance_can_rollback_to(eversion_t to, F &&f) { + bool dirty_log = to > can_rollback_to || to > rollback_info_trimmed_to; + if (dirty_log) { + if (to > can_rollback_to) + can_rollback_to = to; + + if (to > rollback_info_trimmed_to) + rollback_info_trimmed_to = to; + } + + while (rollback_info_trimmed_to_riter != log.rbegin()) { + --rollback_info_trimmed_to_riter; + if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) { + ++rollback_info_trimmed_to_riter; + break; + } + f(*rollback_info_trimmed_to_riter); + } + + return dirty_log; + } + + void reset_rollback_info_trimmed_to_riter() { + rollback_info_trimmed_to_riter = log.rbegin(); + while (rollback_info_trimmed_to_riter != log.rend() && + rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) + ++rollback_info_trimmed_to_riter; + } + + // indexes objects, caller ops and extra caller ops + public: + IndexedLog() : + complete_to(log.end()), + last_requested(0), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { } + + template <typename... Args> + explicit IndexedLog(Args&&... 
args) : + pg_log_t(std::forward<Args>(args)...), + complete_to(log.end()), + last_requested(0), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { + reset_rollback_info_trimmed_to_riter(); + index(); + } + + IndexedLog(const IndexedLog &rhs) : + pg_log_t(rhs), + complete_to(log.end()), + last_requested(rhs.last_requested), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { + reset_rollback_info_trimmed_to_riter(); + index(rhs.indexed_data); + } + + IndexedLog &operator=(const IndexedLog &rhs) { + this->~IndexedLog(); + new (this) IndexedLog(rhs); + return *this; + } + + void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) { + advance_can_rollback_to( + to, + [&](pg_log_entry_t &entry) { + h->trim(entry); + }); + } + bool roll_forward_to(eversion_t to, LogEntryHandler *h) { + return advance_can_rollback_to( + to, + [&](pg_log_entry_t &entry) { + h->rollforward(entry); + }); + } + + void skip_can_rollback_to_to_head() { + advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {}); + } + + mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) { + auto divergent = pg_log_t::rewind_from_head(newhead); + index(); + reset_rollback_info_trimmed_to_riter(); + return divergent; + } + + template <typename T> + void scan_log_after( + const eversion_t &bound, ///< [in] scan entries > bound + T &&f) const { + auto iter = log.rbegin(); + while (iter != log.rend() && iter->version > bound) + ++iter; + + while (true) { + if (iter == log.rbegin()) + break; + f(*(--iter)); + } + } + + /****/ + void claim_log_and_clear_rollback_info(const pg_log_t& o) { + // we must have already trimmed the old entries + ceph_assert(rollback_info_trimmed_to == head); + ceph_assert(rollback_info_trimmed_to_riter == log.rbegin()); + + *this = IndexedLog(o); + + skip_can_rollback_to_to_head(); + index(); + } + + void split_out_child( + pg_t child_pgid, + unsigned split_bits, + IndexedLog *target); + + void zero() { + // we must have already trimmed the old entries + ceph_assert(rollback_info_trimmed_to == head); + ceph_assert(rollback_info_trimmed_to_riter == log.rbegin()); + + unindex(); + pg_log_t::clear(); + rollback_info_trimmed_to_riter = log.rbegin(); + reset_recovery_pointers(); + } + void clear() { + skip_can_rollback_to_to_head(); + zero(); + } + void reset_recovery_pointers() { + complete_to = log.end(); + last_requested = 0; + } + + bool logged_object(const hobject_t& oid) const { + if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { + index_objects(); + } + return objects.count(oid); + } + + bool logged_req(const osd_reqid_t &r) const { + if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { + index_caller_ops(); + } + if (!caller_ops.count(r)) { + if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { + index_extra_caller_ops(); + } + return extra_caller_ops.count(r); + } + return true; + } + + bool get_request( + const osd_reqid_t &r, + eversion_t *version, + version_t *user_version, + int *return_code) const + { + ceph_assert(version); + ceph_assert(user_version); + ceph_assert(return_code); + ceph::unordered_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p; + if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { + index_caller_ops(); + } + p = caller_ops.find(r); + if (p != caller_ops.end()) { + *version = p->second->version; + *user_version = p->second->user_version; + *return_code = p->second->return_code; + return true; + } + + // warning: we will return *a* request for this reqid, but not + // necessarily the most recent. 
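Lookups like this only pay for an index the first time it is needed: index_caller_ops() and friends run on demand, and the indexed_data bitmask records which indexes are currently valid. A stripped-down sketch of that lazy, flag-guarded pattern (simplified types, not the real IndexedLog):

    // Lazy, flag-guarded indexing: build the reqid index on first use and
    // remember that it is valid via a bitmask, as IndexedLog does.
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Entry { std::string reqid; uint64_t version; };

    struct MiniLog {
      std::vector<Entry> log;
      mutable std::unordered_map<std::string, const Entry*> caller_ops;
      mutable uint16_t indexed_data = 0;
      static constexpr uint16_t INDEXED_CALLER_OPS = 1 << 1;

      void index_caller_ops() const {
        caller_ops.clear();
        for (const auto& e : log)
          caller_ops[e.reqid] = &e;
        indexed_data |= INDEXED_CALLER_OPS;
      }

      bool logged_req(const std::string& r) const {
        if (!(indexed_data & INDEXED_CALLER_OPS))
          index_caller_ops();                        // built lazily, once
        return caller_ops.count(r) > 0;
      }
    };

    int main() {
      MiniLog l;
      l.log = {{"client.4101:1", 10}, {"client.4101:2", 11}};
      std::cout << l.logged_req("client.4101:2") << ' '
                << l.logged_req("client.9999:7") << "\n";  // 1 0
    }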
+ if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { + index_extra_caller_ops(); + } + p = extra_caller_ops.find(r); + if (p != extra_caller_ops.end()) { + uint32_t idx = 0; + for (auto i = p->second->extra_reqids.begin(); + i != p->second->extra_reqids.end(); + ++idx, ++i) { + if (i->first == r) { + *version = p->second->version; + *user_version = i->second; + *return_code = p->second->return_code; + if (*return_code >= 0) { + auto it = p->second->extra_reqid_return_codes.find(idx); + if (it != p->second->extra_reqid_return_codes.end()) { + *return_code = it->second; + } + } + return true; + } + } + ceph_abort_msg("in extra_caller_ops but not extra_reqids"); + } + + if (!(indexed_data & PGLOG_INDEXED_DUPS)) { + index_dups(); + } + auto q = dup_index.find(r); + if (q != dup_index.end()) { + *version = q->second->version; + *user_version = q->second->user_version; + *return_code = q->second->return_code; + return true; + } + + return false; + } + + /// get a (bounded) list of recent reqids for the given object + void get_object_reqids(const hobject_t& oid, unsigned max, + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *pls, + mempool::osd_pglog::map<uint32_t, int> *return_codes) const { + // make sure object is present at least once before we do an + // O(n) search. + if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { + index_objects(); + } + if (objects.count(oid) == 0) + return; + + for (list<pg_log_entry_t>::const_reverse_iterator i = log.rbegin(); + i != log.rend(); + ++i) { + if (i->soid == oid) { + if (i->reqid_is_indexed()) { + if (i->op == pg_log_entry_t::ERROR) { + // propagate op errors to the cache tier's PG log + return_codes->emplace(pls->size(), i->return_code); + } + pls->push_back(make_pair(i->reqid, i->user_version)); + } + + pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end()); + if (pls->size() >= max) { + if (pls->size() > max) { + pls->resize(max); + } + return; + } + } + } + } + + void index(__u16 to_index = PGLOG_INDEXED_ALL) const { + // if to_index is 0, no need to run any of this code, especially + // loop below; this can happen with copy constructor for + // IndexedLog (and indirectly through assignment operator) + if (!to_index) return; + + if (to_index & PGLOG_INDEXED_OBJECTS) + objects.clear(); + if (to_index & PGLOG_INDEXED_CALLER_OPS) + caller_ops.clear(); + if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) + extra_caller_ops.clear(); + if (to_index & PGLOG_INDEXED_DUPS) { + dup_index.clear(); + for (auto& i : dups) { + dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i); + } + } + + constexpr __u16 any_log_entry_index = + PGLOG_INDEXED_OBJECTS | + PGLOG_INDEXED_CALLER_OPS | + PGLOG_INDEXED_EXTRA_CALLER_OPS; + + if (to_index & any_log_entry_index) { + for (list<pg_log_entry_t>::const_iterator i = log.begin(); + i != log.end(); + ++i) { + if (to_index & PGLOG_INDEXED_OBJECTS) { + if (i->object_is_indexed()) { + objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i)); + } + } + + if (to_index & PGLOG_INDEXED_CALLER_OPS) { + if (i->reqid_is_indexed()) { + caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i)); + } + } + + if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = i->extra_reqids.begin(); + j != i->extra_reqids.end(); + ++j) { + extra_caller_ops.insert( + make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i)))); + } + } + } + } + + indexed_data |= to_index; + } + + void index_objects() const { + index(PGLOG_INDEXED_OBJECTS); + } + + void index_caller_ops() const { + index(PGLOG_INDEXED_CALLER_OPS); + } + + void 
index_extra_caller_ops() const { + index(PGLOG_INDEXED_EXTRA_CALLER_OPS); + } + + void index_dups() const { + index(PGLOG_INDEXED_DUPS); + } + + void index(pg_log_entry_t& e) { + if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { + if (objects.count(e.soid) == 0 || + objects[e.soid]->version < e.version) + objects[e.soid] = &e; + } + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + // divergent merge_log indexes new before unindexing old + if (e.reqid_is_indexed()) { + caller_ops[e.reqid] = &e; + } + } + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + extra_caller_ops.insert(make_pair(j->first, &e)); + } + } + } + + void unindex() { + objects.clear(); + caller_ops.clear(); + extra_caller_ops.clear(); + dup_index.clear(); + indexed_data = 0; + } + + void unindex(const pg_log_entry_t& e) { + // NOTE: this only works if we remove from the _tail_ of the log! + if (indexed_data & PGLOG_INDEXED_OBJECTS) { + auto it = objects.find(e.soid); + if (it != objects.end() && it->second->version == e.version) + objects.erase(it); + } + if (e.reqid_is_indexed()) { + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + auto it = caller_ops.find(e.reqid); + // divergent merge_log indexes new before unindexing old + if (it != caller_ops.end() && it->second == &e) + caller_ops.erase(it); + } + } + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + for (ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*>::iterator k = + extra_caller_ops.find(j->first); + k != extra_caller_ops.end() && k->first == j->first; + ++k) { + if (k->second == &e) { + extra_caller_ops.erase(k); + break; + } + } + } + } + } + + void index(pg_log_dup_t& e) { + if (indexed_data & PGLOG_INDEXED_DUPS) { + dup_index[e.reqid] = &e; + } + } + + void unindex(const pg_log_dup_t& e) { + if (indexed_data & PGLOG_INDEXED_DUPS) { + auto i = dup_index.find(e.reqid); + if (i != dup_index.end()) { + dup_index.erase(i); + } + } + } + + // actors + void add(const pg_log_entry_t& e, bool applied = true) { + if (!applied) { + ceph_assert(get_can_rollback_to() == head); + } + + // make sure our buffers don't pin bigger buffers + e.mod_desc.trim_bl(); + + // add to log + log.push_back(e); + + // riter previously pointed to the previous entry + if (rollback_info_trimmed_to_riter == log.rbegin()) + ++rollback_info_trimmed_to_riter; + + ceph_assert(e.version > head); + ceph_assert(head.version == 0 || e.version.version > head.version); + head = e.version; + + // to our index + if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { + objects[e.soid] = &(log.back()); + } + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + if (e.reqid_is_indexed()) { + caller_ops[e.reqid] = &(log.back()); + } + } + + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + extra_caller_ops.insert(make_pair(j->first, &(log.back()))); + } + } + + if (!applied) { + skip_can_rollback_to_to_head(); + } + } // add + + void trim( + CephContext* cct, + eversion_t s, + set<eversion_t> *trimmed, + set<string>* trimmed_dups, + eversion_t *write_from_dups); + + ostream& print(ostream& out) const; + }; // IndexedLog + + +protected: + //////////////////// data members //////////////////// + + pg_missing_tracker_t missing; + IndexedLog log; + + eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to + eversion_t 
dirty_from; ///< must clear/writeout all keys >= dirty_from + eversion_t writeout_from; ///< must writout keys >= writeout_from + set<eversion_t> trimmed; ///< must clear keys in trimmed + eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups + eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups + eversion_t write_from_dups; ///< must write keys >= write_from_dups + set<string> trimmed_dups; ///< must clear keys in trimmed_dups + CephContext *cct; + bool pg_log_debug; + /// Log is clean on [dirty_to, dirty_from) + bool touched_log; + bool dirty_log; + bool clear_divergent_priors; + bool rebuilt_missing_with_deletes = false; + + void mark_dirty_to(eversion_t to) { + if (to > dirty_to) + dirty_to = to; + } + void mark_dirty_from(eversion_t from) { + if (from < dirty_from) + dirty_from = from; + } + void mark_writeout_from(eversion_t from) { + if (from < writeout_from) + writeout_from = from; + } + void mark_dirty_to_dups(eversion_t to) { + if (to > dirty_to_dups) + dirty_to_dups = to; + } + void mark_dirty_from_dups(eversion_t from) { + if (from < dirty_from_dups) + dirty_from_dups = from; + } +public: + bool is_dirty() const { + return !touched_log || dirty_log || + (dirty_to != eversion_t()) || + (dirty_from != eversion_t::max()) || + (writeout_from != eversion_t::max()) || + !(trimmed.empty()) || + !missing.is_clean() || + !(trimmed_dups.empty()) || + (dirty_to_dups != eversion_t()) || + (dirty_from_dups != eversion_t::max()) || + (write_from_dups != eversion_t::max()) || + rebuilt_missing_with_deletes; + } + void mark_log_for_rewrite() { + mark_dirty_to(eversion_t::max()); + mark_dirty_from(eversion_t()); + mark_dirty_to_dups(eversion_t::max()); + mark_dirty_from_dups(eversion_t()); + touched_log = false; + } + bool get_rebuilt_missing_with_deletes() const { + return rebuilt_missing_with_deletes; + } +protected: + + /// DEBUG + set<string> log_keys_debug; + static void clear_after(set<string> *log_keys_debug, const string &lb) { + if (!log_keys_debug) + return; + for (set<string>::iterator i = log_keys_debug->lower_bound(lb); + i != log_keys_debug->end(); + log_keys_debug->erase(i++)); + } + static void clear_up_to(set<string> *log_keys_debug, const string &ub) { + if (!log_keys_debug) + return; + for (set<string>::iterator i = log_keys_debug->begin(); + i != log_keys_debug->end() && *i < ub; + log_keys_debug->erase(i++)); + } + + void check(); + void undirty() { + dirty_to = eversion_t(); + dirty_from = eversion_t::max(); + touched_log = true; + dirty_log = false; + trimmed.clear(); + trimmed_dups.clear(); + writeout_from = eversion_t::max(); + check(); + missing.flush(); + dirty_to_dups = eversion_t(); + dirty_from_dups = eversion_t::max(); + write_from_dups = eversion_t::max(); + } +public: + + // cppcheck-suppress noExplicitConstructor + PGLog(CephContext *cct) : + dirty_from(eversion_t::max()), + writeout_from(eversion_t::max()), + dirty_from_dups(eversion_t::max()), + write_from_dups(eversion_t::max()), + cct(cct), + pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))), + touched_log(false), + dirty_log(false), + clear_divergent_priors(false) + { } + + void reset_backfill(); + + void clear(); + + //////////////////// get or set missing //////////////////// + + const pg_missing_tracker_t& get_missing() const { return missing; } + + void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) { + missing.add(oid, need, have, is_delete); + } + + void missing_add_next_entry(const 
pg_log_entry_t& e) { + missing.add_next_event(e); + } + + //////////////////// get or set log //////////////////// + + const IndexedLog &get_log() const { return log; } + + const eversion_t &get_tail() const { return log.tail; } + + void set_tail(eversion_t tail) { log.tail = tail; } + + const eversion_t &get_head() const { return log.head; } + + void set_head(eversion_t head) { log.head = head; } + + void set_last_requested(version_t last_requested) { + log.last_requested = last_requested; + } + + void index() { log.index(); } + + void unindex() { log.unindex(); } + + void add(const pg_log_entry_t& e, bool applied = true) { + mark_writeout_from(e.version); + log.add(e, applied); + } + + void reset_recovery_pointers() { log.reset_recovery_pointers(); } + + static void clear_info_log( + spg_t pgid, + ObjectStore::Transaction *t); + + void trim( + eversion_t trim_to, + pg_info_t &info, + bool transaction_applied = true, + bool async = false); + + void roll_forward_to( + eversion_t roll_forward_to, + LogEntryHandler *h) { + if (log.roll_forward_to( + roll_forward_to, + h)) + dirty_log = true; + } + + eversion_t get_can_rollback_to() const { + return log.get_can_rollback_to(); + } + + void roll_forward(LogEntryHandler *h) { + roll_forward_to( + log.head, + h); + } + + void skip_rollforward() { + log.skip_can_rollback_to_to_head(); + } + + //////////////////// get or set log & missing //////////////////// + + void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) { + log.trim_rollback_info_to(log.head, h); + log.claim_log_and_clear_rollback_info(o); + missing.clear(); + mark_dirty_to(eversion_t::max()); + mark_dirty_to_dups(eversion_t::max()); + } + + void split_into( + pg_t child_pgid, + unsigned split_bits, + PGLog *opg_log) { + log.split_out_child(child_pgid, split_bits, &opg_log->log); + missing.split_into(child_pgid, split_bits, &(opg_log->missing)); + opg_log->mark_dirty_to(eversion_t::max()); + opg_log->mark_dirty_to_dups(eversion_t::max()); + mark_dirty_to(eversion_t::max()); + mark_dirty_to_dups(eversion_t::max()); + if (missing.may_include_deletes) + opg_log->rebuilt_missing_with_deletes = true; + } + + void merge_from( + const vector<PGLog*>& sources, + eversion_t last_update) { + unindex(); + missing.clear(); + + vector<pg_log_t*> slogs; + for (auto s : sources) { + slogs.push_back(&s->log); + } + log.merge_from(slogs, last_update); + + index(); + + mark_log_for_rewrite(); + } + + void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) { + if (missing.is_missing(oid, v)) { + missing.got(oid, v); + info.stats.stats.sum.num_objects_missing = missing.num_missing(); + + // raise last_complete? 
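The walk that follows advances complete_to until it reaches the oldest version still recorded as missing, raising last_complete as it goes. A toy model, with integers standing in for eversion_t and a std::set for the missing tracker:

    // Toy model of raising last_complete in recover_got(): everything older
    // than the oldest outstanding need counts as complete.
    #include <cstdint>
    #include <iostream>
    #include <set>
    #include <vector>

    int main() {
      std::vector<uint64_t> log = {10, 11, 12, 13, 14, 15};
      std::set<uint64_t> missing_need = {13, 15};   // versions still to recover

      uint64_t last_complete = 0;
      for (uint64_t v : log) {
        if (!missing_need.empty() && v >= *missing_need.begin())
          break;                     // hit the oldest outstanding need
        last_complete = v;           // everything up to here is recovered
      }
      std::cout << "last_complete = " << last_complete << "\n";  // 12
      // once 13 is recovered, the same walk would advance last_complete to 14
    }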
+ if (missing.get_items().empty()) { + log.complete_to = log.log.end(); + info.last_complete = info.last_update; + } + auto oldest_need = missing.get_oldest_need(); + while (log.complete_to != log.log.end()) { + if (oldest_need <= log.complete_to->version) + break; + if (info.last_complete < log.complete_to->version) + info.last_complete = log.complete_to->version; + ++log.complete_to; + } + } + + ceph_assert(log.get_can_rollback_to() >= v); + } + + void reset_complete_to(pg_info_t *info) { + if (log.log.empty()) // caller is split_into() + return; + log.complete_to = log.log.begin(); + ceph_assert(log.complete_to != log.log.end()); + auto oldest_need = missing.get_oldest_need(); + if (oldest_need != eversion_t()) { + while (log.complete_to->version < oldest_need) { + ++log.complete_to; + ceph_assert(log.complete_to != log.log.end()); + } + } + if (!info) + return; + if (log.complete_to == log.log.begin()) { + info->last_complete = eversion_t(); + } else { + --log.complete_to; + info->last_complete = log.complete_to->version; + ++log.complete_to; + } + } + + void activate_not_complete(pg_info_t &info) { + reset_complete_to(&info); + log.last_requested = 0; + } + + void proc_replica_log(pg_info_t &oinfo, + const pg_log_t &olog, + pg_missing_t& omissing, pg_shard_t from) const; + + void rebuild_missing_set_with_deletes(ObjectStore *store, + ObjectStore::CollectionHandle& ch, + const pg_info_t &info); + +protected: + static void split_by_object( + mempool::osd_pglog::list<pg_log_entry_t> &entries, + map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) { + while (!entries.empty()) { + auto &out_list = (*out_entries)[entries.front().soid]; + out_list.splice(out_list.end(), entries, entries.begin()); + } + } + + /** + * _merge_object_divergent_entries + * + * There are 5 distinct cases: + * 1) There is a more recent update: in this case we assume we adjusted the + * store and missing during merge_log + * 2) The first entry in the divergent sequence is a create. This might + * either be because the object is a clone or because prior_version is + * eversion_t(). In this case the object does not exist and we must + * adjust missing and the store to match. + * 3) We are currently missing the object. In this case, we adjust the + * missing to our prior_version taking care to add a divergent_prior + * if necessary + * 4) We can rollback all of the entries. In this case, we do so using + * the rollbacker and return -- the object does not go into missing. + * 5) We cannot rollback at least 1 of the entries. In this case, we + * clear the object out of the store and add a missing entry at + * prior_version taking care to add a divergent_prior if + * necessary. 
+ */ + template <typename missing_type> + static void _merge_object_divergent_entries( + const IndexedLog &log, ///< [in] log to merge against + const hobject_t &hoid, ///< [in] object we are merging + const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge + const pg_info_t &info, ///< [in] info for merging entries + eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input InedexedLog + missing_type &missing, ///< [in,out] missing to adjust, use + LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + const DoutPrefixProvider *dpp ///< [in] logging provider + ) { + ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid + << " entries: " << orig_entries << dendl; + + if (hoid > info.last_backfill) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill" + << dendl; + return; + } + + // entries is non-empty + ceph_assert(!orig_entries.empty()); + // strip out and ignore ERROR entries + mempool::osd_pglog::list<pg_log_entry_t> entries; + eversion_t last; + bool seen_non_error = false; + for (list<pg_log_entry_t>::const_iterator i = orig_entries.begin(); + i != orig_entries.end(); + ++i) { + // all entries are on hoid + ceph_assert(i->soid == hoid); + // did not see error entries before this entry and this entry is not error + // then this entry is the first non error entry + bool first_non_error = ! seen_non_error && ! i->is_error(); + if (! i->is_error() ) { + // see a non error entry now + seen_non_error = true; + } + + // No need to check the first entry since it prior_version is unavailable + // in the list + // No need to check if the prior_version is the minimal version + // No need to check the first non-error entry since the leading error + // entries are not its prior version + if (i != orig_entries.begin() && i->prior_version != eversion_t() && + ! 
first_non_error) { + // in increasing order of version + ceph_assert(i->version > last); + // prior_version correct (unless it is an ERROR entry) + ceph_assert(i->prior_version == last || i->is_error()); + } + if (i->is_error()) { + ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl; + } else { + ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl; + entries.push_back(*i); + last = i->version; + } + } + if (entries.empty()) { + ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl; + return; + } + + const eversion_t prior_version = entries.begin()->prior_version; + const eversion_t first_divergent_update = entries.begin()->version; + const eversion_t last_divergent_update = entries.rbegin()->version; + const bool object_not_in_store = + !missing.is_missing(hoid) && + entries.rbegin()->is_delete(); + ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: " + << object_not_in_store << dendl; + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version: " << prior_version + << " first_divergent_update: " << first_divergent_update + << " last_divergent_update: " << last_divergent_update + << dendl; + + ceph::unordered_map<hobject_t, pg_log_entry_t*>::const_iterator objiter = + log.objects.find(hoid); + if (objiter != log.objects.end() && + objiter->second->version >= first_divergent_update) { + /// Case 1) + ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: " + << *objiter->second << ", already merged" << dendl; + + ceph_assert(objiter->second->version > last_divergent_update); + + // ensure missing has been updated appropriately + if (objiter->second->is_update() || + (missing.may_include_deletes && objiter->second->is_delete())) { + ceph_assert(missing.is_missing(hoid) && + missing.get_items().at(hoid).need == objiter->second->version); + } else { + ceph_assert(!missing.is_missing(hoid)); + } + missing.revise_have(hoid, eversion_t()); + if (rollbacker) { + if (!object_not_in_store) { + rollbacker->remove(hoid); + } + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + return; + } + + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + <<" has no more recent entries in log" << dendl; + if (prior_version == eversion_t() || entries.front().is_clone()) { + /// Case 2) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version or op type indicates creation," + << " deleting" + << dendl; + if (missing.is_missing(hoid)) + missing.rm(missing.get_items().find(hoid)); + if (rollbacker) { + if (!object_not_in_store) { + rollbacker->remove(hoid); + } + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + return; + } + + if (missing.is_missing(hoid)) { + /// Case 3) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing, " << missing.get_items().at(hoid) + << " adjusting" << dendl; + + if (missing.get_items().at(hoid).have == prior_version) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing.have is prior_version " << prior_version + << " removing from missing" << dendl; + missing.rm(missing.get_items().find(hoid)); + } else { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing.have is " << missing.get_items().at(hoid).have + << ", adjusting" << dendl; + missing.revise_need(hoid, prior_version, false); + if (prior_version <= info.log_tail) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version " << prior_version + << " <= info.log_tail " + << info.log_tail << dendl; + } + } + if (rollbacker) { + for (auto &&i: entries) { + 
rollbacker->trim(i); + } + } + return; + } + + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " must be rolled back or recovered," + << " attempting to rollback" + << dendl; + bool can_rollback = true; + // We are going to make an important decision based on the + // olog_can_rollback_to value we have received, better known it. + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " olog_can_rollback_to: " + << olog_can_rollback_to << dendl; + /// Distinguish between 4) and 5) + for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin(); + i != entries.rend(); + ++i) { + if (!i->can_rollback() || i->version <= olog_can_rollback_to) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback " + << *i << dendl; + can_rollback = false; + break; + } + } + + if (can_rollback) { + /// Case 4) + for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin(); + i != entries.rend(); + ++i) { + ceph_assert(i->can_rollback() && i->version > olog_can_rollback_to); + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " rolling back " << *i << dendl; + if (rollbacker) + rollbacker->rollback(*i); + } + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " rolled back" << dendl; + return; + } else { + /// Case 5) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, " + << "removing and adding to missing" << dendl; + if (rollbacker) { + if (!object_not_in_store) + rollbacker->remove(hoid); + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + missing.add(hoid, prior_version, eversion_t(), false); + if (prior_version <= info.log_tail) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version " << prior_version + << " <= info.log_tail " + << info.log_tail << dendl; + } + } + } + + /// Merge all entries using above + template <typename missing_type> + static void _merge_divergent_entries( + const IndexedLog &log, ///< [in] log to merge against + mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge + const pg_info_t &oinfo, ///< [in] info for merging entries + eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog + missing_type &omissing, ///< [in,out] missing to adjust, use + LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + const DoutPrefixProvider *dpp ///< [in] logging provider + ) { + map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split; + split_by_object(entries, &split); + for (map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>>::iterator i = split.begin(); + i != split.end(); + ++i) { + _merge_object_divergent_entries( + log, + i->first, + i->second, + oinfo, + olog_can_rollback_to, + omissing, + rollbacker, + dpp); + } + } + + /** + * Exists for use in TestPGLog for simply testing single divergent log + * cases + */ + void merge_old_entry( + ObjectStore::Transaction& t, + const pg_log_entry_t& oe, + const pg_info_t& info, + LogEntryHandler *rollbacker) { + mempool::osd_pglog::list<pg_log_entry_t> entries; + entries.push_back(oe); + _merge_object_divergent_entries( + log, + oe.soid, + entries, + info, + log.get_can_rollback_to(), + missing, + rollbacker, + this); + } + + bool merge_log_dups(const pg_log_t& olog); + +public: + + void rewind_divergent_log(eversion_t newhead, + pg_info_t &info, + LogEntryHandler *rollbacker, + bool &dirty_info, + bool &dirty_big_info); + + void merge_log(pg_info_t &oinfo, + pg_log_t &olog, + pg_shard_t from, + pg_info_t &info, LogEntryHandler *rollbacker, + bool &dirty_info, 
bool &dirty_big_info); + + template <typename missing_type> + static bool append_log_entries_update_missing( + const hobject_t &last_backfill, + bool last_backfill_bitwise, + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + bool maintain_rollback, + IndexedLog *log, + missing_type &missing, + LogEntryHandler *rollbacker, + const DoutPrefixProvider *dpp) { + bool invalidate_stats = false; + if (log && !entries.empty()) { + ceph_assert(log->head < entries.begin()->version); + } + for (list<pg_log_entry_t>::const_iterator p = entries.begin(); + p != entries.end(); + ++p) { + invalidate_stats = invalidate_stats || !p->is_error(); + if (log) { + ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl; + log->add(*p); + } + if (p->soid <= last_backfill && + !p->is_error()) { + if (missing.may_include_deletes) { + missing.add_next_event(*p); + } else { + if (p->is_delete()) { + missing.rm(p->soid, p->version); + } else { + missing.add_next_event(*p); + } + if (rollbacker) { + // hack to match PG::mark_all_unfound_lost + if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) { + rollbacker->try_stash(p->soid, p->version.version); + } else if (p->is_delete()) { + rollbacker->remove(p->soid); + } + } + } + } + } + return invalidate_stats; + } + bool append_new_log_entries( + const hobject_t &last_backfill, + bool last_backfill_bitwise, + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + LogEntryHandler *rollbacker) { + bool invalidate_stats = append_log_entries_update_missing( + last_backfill, + last_backfill_bitwise, + entries, + true, + &log, + missing, + rollbacker, + this); + if (!entries.empty()) { + mark_writeout_from(entries.begin()->version); + if (entries.begin()->is_lost_delete()) { + // hack: since lost deletes queue recovery directly, and don't + // go through activate_not_complete() again, our complete_to + // iterator may still point at log.end(). Reset it to point + // before these new lost_delete entries. 
This only occurs + // when lost+delete entries are initially added, which is + // always in a list of solely lost_delete entries, so it is + // sufficient to check whether the first entry is a + // lost_delete + reset_complete_to(nullptr); + } + } + return invalidate_stats; + } + + void write_log_and_missing( + ObjectStore::Transaction& t, + map<string,bufferlist> *km, + const coll_t& coll, + const ghobject_t &log_oid, + bool require_rollback); + + static void write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map<string,bufferlist>* km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors, + bool require_rollback); + + static void write_log_and_missing( + ObjectStore::Transaction& t, + map<string,bufferlist>* km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, + const pg_missing_tracker_t &missing, + bool require_rollback, + bool *rebuilt_missing_set_with_deletes); + + static void _write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map<string,bufferlist>* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + map<eversion_t, hobject_t> &divergent_priors, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + bool dirty_divergent_priors, + bool touch_log, + bool require_rollback, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + set<string> *log_keys_debug + ); + + static void _write_log_and_missing( + ObjectStore::Transaction& t, + map<string,bufferlist>* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + set<eversion_t> &&trimmed, + set<string> &&trimmed_dups, + const pg_missing_tracker_t &missing, + bool touch_log, + bool require_rollback, + bool clear_divergent_priors, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + bool *rebuilt_missing_with_deletes, + set<string> *log_keys_debug + ); + + void read_log_and_missing( + ObjectStore *store, + ObjectStore::CollectionHandle& ch, + ghobject_t pgmeta_oid, + const pg_info_t &info, + ostringstream &oss, + bool tolerate_divergent_missing_log, + bool debug_verify_stored_missing = false + ) { + return read_log_and_missing( + store, ch, pgmeta_oid, info, + log, missing, oss, + tolerate_divergent_missing_log, + &clear_divergent_priors, + this, + (pg_log_debug ? &log_keys_debug : nullptr), + debug_verify_stored_missing); + } + + template <typename missing_type> + static void read_log_and_missing( + ObjectStore *store, + ObjectStore::CollectionHandle &ch, + ghobject_t pgmeta_oid, + const pg_info_t &info, + IndexedLog &log, + missing_type &missing, + ostringstream &oss, + bool tolerate_divergent_missing_log, + bool *clear_divergent_priors = nullptr, + const DoutPrefixProvider *dpp = nullptr, + set<string> *log_keys_debug = nullptr, + bool debug_verify_stored_missing = false + ) { + ldpp_dout(dpp, 20) << "read_log_and_missing coll " << ch->cid + << " " << pgmeta_oid << dendl; + + // legacy? 
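+  // (the log entries, dups, missing set and divergent_priors are all kept
+  // in the pgmeta object's omap, so its data payload is expected to be empty)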
+ struct stat st; + int r = store->stat(ch, pgmeta_oid, &st); + ceph_assert(r == 0); + ceph_assert(st.st_size == 0); + + // will get overridden below if it had been recorded + eversion_t on_disk_can_rollback_to = info.last_update; + eversion_t on_disk_rollback_info_trimmed_to = eversion_t(); + ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, + pgmeta_oid); + map<eversion_t, hobject_t> divergent_priors; + bool must_rebuild = false; + missing.may_include_deletes = false; + list<pg_log_entry_t> entries; + list<pg_log_dup_t> dups; + if (p) { + for (p->seek_to_first(); p->valid() ; p->next()) { + // non-log pgmeta_oid keys are prefixed with _; skip those + if (p->key()[0] == '_') + continue; + bufferlist bl = p->value();//Copy bufferlist before creating iterator + auto bp = bl.cbegin(); + if (p->key() == "divergent_priors") { + decode(divergent_priors, bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size() + << " divergent_priors" << dendl; + must_rebuild = true; + debug_verify_stored_missing = false; + } else if (p->key() == "can_rollback_to") { + decode(on_disk_can_rollback_to, bp); + } else if (p->key() == "rollback_info_trimmed_to") { + decode(on_disk_rollback_info_trimmed_to, bp); + } else if (p->key() == "may_include_deletes_in_missing") { + missing.may_include_deletes = true; + } else if (p->key().substr(0, 7) == string("missing")) { + hobject_t oid; + pg_missing_item item; + decode(oid, bp); + decode(item, bp); + if (item.is_delete()) { + ceph_assert(missing.may_include_deletes); + } + missing.add(oid, item.need, item.have, item.is_delete()); + } else if (p->key().substr(0, 4) == string("dup_")) { + pg_log_dup_t dup; + decode(dup, bp); + if (!dups.empty()) { + ceph_assert(dups.back().version < dup.version); + } + dups.push_back(dup); + } else { + pg_log_entry_t e; + e.decode_with_checksum(bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl; + if (!entries.empty()) { + pg_log_entry_t last_e(entries.back()); + ceph_assert(last_e.version.version < e.version.version); + ceph_assert(last_e.version.epoch <= e.version.epoch); + } + entries.push_back(e); + if (log_keys_debug) + log_keys_debug->insert(e.get_key_name()); + } + } + } + log = IndexedLog( + info.last_update, + info.log_tail, + on_disk_can_rollback_to, + on_disk_rollback_info_trimmed_to, + std::move(entries), + std::move(dups)); + + if (must_rebuild || debug_verify_stored_missing) { + // build missing + if (debug_verify_stored_missing || info.last_complete < info.last_update) { + ldpp_dout(dpp, 10) + << "read_log_and_missing checking for missing items over interval (" + << info.last_complete + << "," << info.last_update << "]" << dendl; + + set<hobject_t> did; + set<hobject_t> checked; + set<hobject_t> skipped; + for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); + i != log.log.rend(); + ++i) { + if (!debug_verify_stored_missing && i->version <= info.last_complete) break; + if (i->soid > info.last_backfill) + continue; + if (i->is_error()) + continue; + if (did.count(i->soid)) continue; + did.insert(i->soid); + + if (!missing.may_include_deletes && i->is_delete()) + continue; + + bufferlist bv; + int r = store->getattr( + ch, + ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + if (oi.version < i->version) { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i + << " (have " << oi.version << ")" << dendl; + if (debug_verify_stored_missing) { + auto miter = missing.get_items().find(i->soid); + 
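+	      // debug check: the missing set loaded from disk must already
+	      // record this divergent object with the same 'need' version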
ceph_assert(miter != missing.get_items().end()); + ceph_assert(miter->second.need == i->version); + // the 'have' version is reset if an object is deleted, + // then created again + ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t()); + checked.insert(i->soid); + } else { + missing.add(i->soid, i->version, oi.version, i->is_delete()); + } + } + } else { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; + if (debug_verify_stored_missing) { + auto miter = missing.get_items().find(i->soid); + if (i->is_delete()) { + ceph_assert(miter == missing.get_items().end() || + (miter->second.need == i->version && + miter->second.have == eversion_t())); + } else { + ceph_assert(miter != missing.get_items().end()); + ceph_assert(miter->second.need == i->version); + ceph_assert(miter->second.have == eversion_t()); + } + checked.insert(i->soid); + } else { + missing.add(i->soid, i->version, eversion_t(), i->is_delete()); + } + } + } + if (debug_verify_stored_missing) { + for (auto &&i: missing.get_items()) { + if (checked.count(i.first)) + continue; + if (i.first > info.last_backfill) { + ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry " + << "found before last_backfill: " + << i.first << " " << i.second + << " last_backfill = " << info.last_backfill + << dendl; + ceph_abort_msg("invalid missing set entry found"); + } + bufferlist bv; + int r = store->getattr( + ch, + ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have); + } else { + ceph_assert(i.second.is_delete() || eversion_t() == i.second.have); + } + } + } else { + ceph_assert(must_rebuild); + for (map<eversion_t, hobject_t>::reverse_iterator i = + divergent_priors.rbegin(); + i != divergent_priors.rend(); + ++i) { + if (i->first <= info.last_complete) break; + if (i->second > info.last_backfill) + continue; + if (did.count(i->second)) continue; + did.insert(i->second); + bufferlist bv; + int r = store->getattr( + ch, + ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + /** + * 1) we see this entry in the divergent priors mapping + * 2) we didn't see an entry for this object in the log + * + * From 1 & 2 we know that either the object does not exist + * or it is at the version specified in the divergent_priors + * map since the object would have been deleted atomically + * with the addition of the divergent_priors entry, an older + * version would not have been recovered, and a newer version + * would show up in the log above. + */ + /** + * Unfortunately the assessment above is incorrect because of + * http://tracker.ceph.com/issues/17916 (we were incorrectly + * not removing the divergent_priors set from disk state!), + * so let's check that. 
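+	   * (if the on-disk object is newer than the divergent_priors entry,
+	   * tolerate_divergent_missing_log lets us log the mismatch and carry
+	   * on instead of asserting that the versions match exactly)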
+ */ + if (oi.version > i->first && tolerate_divergent_missing_log) { + ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i + << ") inconsistent with disk state (" << oi + << "), assuming it is tracker.ceph.com/issues/17916" + << dendl; + } else { + ceph_assert(oi.version == i->first); + } + } else { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; + missing.add(i->second, i->first, eversion_t(), false); + } + } + } + if (clear_divergent_priors) + (*clear_divergent_priors) = true; + } + } + + if (!must_rebuild) { + if (clear_divergent_priors) + (*clear_divergent_priors) = false; + missing.flush(); + } + ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl; + } // static read_log_and_missing +}; // struct PGLog diff --git a/src/osd/PGPeeringEvent.cc b/src/osd/PGPeeringEvent.cc new file mode 100644 index 00000000..52aff7dc --- /dev/null +++ b/src/osd/PGPeeringEvent.cc @@ -0,0 +1,8 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/mempool.h" +#include "osd/PGPeeringEvent.h" +#include "messages/MOSDPGLog.h" + +MEMPOOL_DEFINE_OBJECT_FACTORY(PGPeeringEvent, pg_peering_evt, osd); diff --git a/src/osd/PGPeeringEvent.h b/src/osd/PGPeeringEvent.h new file mode 100644 index 00000000..a4a557ef --- /dev/null +++ b/src/osd/PGPeeringEvent.h @@ -0,0 +1,189 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/statechart/event.hpp> + +#include "osd/osd_types.h" + +class MOSDPGLog; + +/// what we need to instantiate a pg +struct PGCreateInfo { + spg_t pgid; + epoch_t epoch = 0; + pg_history_t history; + PastIntervals past_intervals; + bool by_mon; + PGCreateInfo(spg_t p, epoch_t e, + const pg_history_t& h, + const PastIntervals& pi, + bool mon) + : pgid(p), epoch(e), history(h), past_intervals(pi), by_mon(mon) {} +}; + +class PGPeeringEvent { + epoch_t epoch_sent; + epoch_t epoch_requested; + string desc; +public: + boost::intrusive_ptr< const boost::statechart::event_base > evt; + bool requires_pg; + std::unique_ptr<PGCreateInfo> create_info; + MEMPOOL_CLASS_HELPERS(); + template <class T> + PGPeeringEvent( + epoch_t epoch_sent, + epoch_t epoch_requested, + const T &evt_, + bool req = true, + PGCreateInfo *ci = 0) + : epoch_sent(epoch_sent), + epoch_requested(epoch_requested), + evt(evt_.intrusive_from_this()), + requires_pg(req), + create_info(ci) { + stringstream out; + out << "epoch_sent: " << epoch_sent + << " epoch_requested: " << epoch_requested << " "; + evt_.print(&out); + if (create_info) { + out << " +create_info"; + } + desc = out.str(); + } + epoch_t get_epoch_sent() { + return epoch_sent; + } + epoch_t get_epoch_requested() { + return epoch_requested; + } + const boost::statechart::event_base &get_event() { + return *evt; + } + const string& get_desc() { + return desc; + } +}; +typedef std::shared_ptr<PGPeeringEvent> PGPeeringEventRef; + +struct MInfoRec : boost::statechart::event< MInfoRec > { + pg_shard_t from; + pg_info_t info; + epoch_t msg_epoch; + MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) : + from(from), info(info), msg_epoch(msg_epoch) {} + void print(std::ostream *out) const { + *out << "MInfoRec from " << from << " info: " << info; + } +}; + +struct MLogRec : boost::statechart::event< MLogRec > { + pg_shard_t from; + boost::intrusive_ptr<MOSDPGLog> msg; + MLogRec(pg_shard_t from, MOSDPGLog *msg) : + from(from), msg(msg) {} + void print(std::ostream *out) const { + 
*out << "MLogRec from " << from; + } +}; + +struct MNotifyRec : boost::statechart::event< MNotifyRec > { + spg_t pgid; + pg_shard_t from; + pg_notify_t notify; + uint64_t features; + PastIntervals past_intervals; + MNotifyRec(spg_t p, pg_shard_t from, const pg_notify_t ¬ify, uint64_t f, + const PastIntervals& pi) + : pgid(p), from(from), notify(notify), features(f), past_intervals(pi) {} + void print(std::ostream *out) const { + *out << "MNotifyRec " << pgid << " from " << from << " notify: " << notify + << " features: 0x" << hex << features << dec + << " " << past_intervals; + } +}; + +struct MQuery : boost::statechart::event< MQuery > { + spg_t pgid; + pg_shard_t from; + pg_query_t query; + epoch_t query_epoch; + MQuery(spg_t p, pg_shard_t from, const pg_query_t &query, epoch_t query_epoch) + : pgid(p), from(from), query(query), query_epoch(query_epoch) {} + void print(std::ostream *out) const { + *out << "MQuery " << pgid << " from " << from + << " query_epoch " << query_epoch + << " query: " << query; + } +}; + +struct MTrim : boost::statechart::event<MTrim> { + epoch_t epoch; + int from; + shard_id_t shard; + eversion_t trim_to; + MTrim(epoch_t epoch, int from, shard_id_t shard, eversion_t trim_to) + : epoch(epoch), from(from), shard(shard), trim_to(trim_to) {} + void print(std::ostream *out) const { + *out << "MTrim epoch " << epoch << " from " << from << " shard " << shard + << " trim_to " << trim_to; + } +}; + +struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > { + unsigned priority; + int64_t primary_num_bytes; + int64_t local_num_bytes; + explicit RequestBackfillPrio(unsigned prio, int64_t pbytes, int64_t lbytes) : + boost::statechart::event< RequestBackfillPrio >(), + priority(prio), primary_num_bytes(pbytes), local_num_bytes(lbytes) {} + void print(std::ostream *out) const { + *out << "RequestBackfillPrio: priority " << priority + << " primary bytes " << primary_num_bytes + << " local bytes " << local_num_bytes; + } +}; + +struct RequestRecoveryPrio : boost::statechart::event< RequestRecoveryPrio > { + unsigned priority; + explicit RequestRecoveryPrio(unsigned prio) : + boost::statechart::event< RequestRecoveryPrio >(), + priority(prio) {} + void print(std::ostream *out) const { + *out << "RequestRecoveryPrio: priority " << priority; + } +}; + +#define TrivialEvent(T) struct T : boost::statechart::event< T > { \ + T() : boost::statechart::event< T >() {} \ + void print(std::ostream *out) const { \ + *out << #T; \ + } \ + }; + +TrivialEvent(NullEvt) +TrivialEvent(RemoteBackfillReserved) +TrivialEvent(RemoteReservationRejectedTooFull) +TrivialEvent(RemoteReservationRevokedTooFull) +TrivialEvent(RemoteReservationRevoked) +TrivialEvent(RemoteReservationCanceled) +TrivialEvent(RemoteRecoveryReserved) +TrivialEvent(RecoveryDone) + +struct DeferRecovery : boost::statechart::event<DeferRecovery> { + float delay; + explicit DeferRecovery(float delay) : delay(delay) {} + void print(std::ostream *out) const { + *out << "DeferRecovery: delay " << delay; + } +}; + +struct DeferBackfill : boost::statechart::event<DeferBackfill> { + float delay; + explicit DeferBackfill(float delay) : delay(delay) {} + void print(std::ostream *out) const { + *out << "DeferBackfill: delay " << delay; + } +}; diff --git a/src/osd/PGTransaction.h b/src/osd/PGTransaction.h new file mode 100644 index 00000000..e3a7b8e1 --- /dev/null +++ b/src/osd/PGTransaction.h @@ -0,0 +1,579 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * 
Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef PGTRANSACTION_H +#define PGTRANSACTION_H + +#include <map> +#include <memory> +#include <boost/optional.hpp> + +#include "common/hobject.h" +#include "osd/osd_types.h" +#include "osd/osd_internal_types.h" +#include "common/interval_map.h" +#include "common/inline_variant.h" + +/** + * This class represents transactions which can be submitted to + * a PGBackend. For expediency, there are some constraints on + * the operations submitted: + * 1) Rename sources may only be referenced prior to the rename + * operation to the destination. + * 2) The graph formed by edges of source->destination for clones + * (Create) and Renames must be acyclic. + * 3) clone_range sources must not be modified by the same + * transaction + */ +class PGTransaction { +public: + map<hobject_t, ObjectContextRef> obc_map; + + class ObjectOperation { + public: + struct Init + { + struct None {}; + struct Create {}; + struct Clone { + hobject_t source; + }; + struct Rename { + hobject_t source; // must be temp object + }; + }; + using InitType = boost::variant< + Init::None, + Init::Create, + Init::Clone, + Init::Rename>; + + InitType init_type = Init::None(); + bool delete_first = false; + + /** + * is_none() && is_delete() indicates that we are deleting an + * object which already exists and not recreating it. delete_first means + * that the transaction logically removes the object. + + * There are really 4 cases: + + * 1) We are modifying an existing object (is_none() && + * !is_delete()) + * a) If it's an append, we just write into the log entry the old size + * b) If it's an actual overwrite, we save the old versions of the + * extents being overwritten and write those offsets into the log + * entry + * 2) We are removing and then recreating an object (!is_none() && is_delete()) + * -- stash + * 3) We are removing an object (is_none() && is_delete()) -- stash + * 4) We are creating an object (!is_none() && !is_delete()) -- create (no + * stash) + * + * Create, Clone, Rename are the three ways we can recreate it. + * ECBackend transaction planning needs this context + * to figure out how to perform the transaction. + */ + bool deletes_first() const { + return delete_first; + } + bool is_delete() const { + return boost::get<Init::None>(&init_type) != nullptr && delete_first; + } + bool is_none() const { + return boost::get<Init::None>(&init_type) != nullptr && !delete_first; + } + bool is_fresh_object() const { + return boost::get<Init::None>(&init_type) == nullptr; + } + bool is_rename() const { + return boost::get<Init::Rename>(&init_type) != nullptr; + } + bool has_source(hobject_t *source = nullptr) const { + return match( + init_type, + [&](const Init::Clone &op) -> bool { + if (source) + *source = op.source; + return true; + }, + [&](const Init::Rename &op) -> bool { + if (source) + *source = op.source; + return true; + }, + [&](const Init::None &) -> bool { return false; }, + [&](const Init::Create &) -> bool { return false; }); + } + + bool clear_omap = false; + + /** + * truncate + * <lowest, last> ? 
+ * + * truncate is represented as a pair because in the event of + * multiple truncates within a single transaction we need to + * remember the lowest truncate and the final object size + * (the last truncate). We also adjust the buffers map + * to account for truncates overriding previous writes */ + boost::optional<pair<uint64_t, uint64_t> > truncate = boost::none; + + std::map<string, boost::optional<bufferlist> > attr_updates; + + enum class OmapUpdateType {Remove, Insert}; + std::vector<std::pair<OmapUpdateType, bufferlist> > omap_updates; + + boost::optional<bufferlist> omap_header; + + /// (old, new) -- only valid with no truncate or buffer updates + boost::optional<pair<set<snapid_t>, set<snapid_t> > > updated_snaps; + + struct alloc_hint_t { + uint64_t expected_object_size; + uint64_t expected_write_size; + uint32_t flags; + }; + boost::optional<alloc_hint_t> alloc_hint; + + struct BufferUpdate { + struct Write { + bufferlist buffer; + uint32_t fadvise_flags; + }; + struct Zero { + uint64_t len; + }; + struct CloneRange { + hobject_t from; + uint64_t offset; + uint64_t len; + }; + }; + using BufferUpdateType = boost::variant< + BufferUpdate::Write, + BufferUpdate::Zero, + BufferUpdate::CloneRange>; + + private: + struct SplitMerger { + BufferUpdateType split( + uint64_t offset, + uint64_t len, + const BufferUpdateType &bu) const { + return match( + bu, + [&](const BufferUpdate::Write &w) -> BufferUpdateType { + bufferlist bl; + bl.substr_of(w.buffer, offset, len); + return BufferUpdate::Write{bl, w.fadvise_flags}; + }, + [&](const BufferUpdate::Zero &) -> BufferUpdateType { + return BufferUpdate::Zero{len}; + }, + [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType { + return BufferUpdate::CloneRange{c.from, c.offset + offset, len}; + }); + } + uint64_t length( + const BufferUpdateType &left) const { + return match( + left, + [&](const BufferUpdate::Write &w) -> uint64_t { + return w.buffer.length(); + }, + [&](const BufferUpdate::Zero &z) -> uint64_t { + return z.len; + }, + [&](const BufferUpdate::CloneRange &c) -> uint64_t { + return c.len; + }); + } + bool can_merge( + const BufferUpdateType &left, + const BufferUpdateType &right) const { + return match( + left, + [&](const BufferUpdate::Write &w) -> bool { + auto r = boost::get<BufferUpdate::Write>(&right); + return r != nullptr && (w.fadvise_flags == r->fadvise_flags); + }, + [&](const BufferUpdate::Zero &) -> bool { + auto r = boost::get<BufferUpdate::Zero>(&right); + return r != nullptr; + }, + [&](const BufferUpdate::CloneRange &c) -> bool { + return false; + }); + } + BufferUpdateType merge( + BufferUpdateType &&left, + BufferUpdateType &&right) const { + return match( + left, + [&](const BufferUpdate::Write &w) -> BufferUpdateType { + auto r = boost::get<BufferUpdate::Write>(&right); + ceph_assert(r && w.fadvise_flags == r->fadvise_flags); + bufferlist bl = w.buffer; + bl.append(r->buffer); + return BufferUpdate::Write{bl, w.fadvise_flags}; + }, + [&](const BufferUpdate::Zero &z) -> BufferUpdateType { + auto r = boost::get<BufferUpdate::Zero>(&right); + ceph_assert(r); + return BufferUpdate::Zero{z.len + r->len}; + }, + [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType { + ceph_abort_msg("violates can_merge condition"); + return left; + }); + } + }; + public: + using buffer_update_type = interval_map< + uint64_t, BufferUpdateType, SplitMerger>; + buffer_update_type buffer_updates; + + friend class PGTransaction; + }; + map<hobject_t, ObjectOperation> op_map; +private: + ObjectOperation 
&get_object_op_for_modify(const hobject_t &hoid) { + auto &op = op_map[hoid]; + ceph_assert(!op.is_delete()); + return op; + } + ObjectOperation &get_object_op(const hobject_t &hoid) { + return op_map[hoid]; + } +public: + void add_obc( + ObjectContextRef obc) { + ceph_assert(obc); + obc_map[obc->obs.oi.soid] = obc; + } + /// Sets up state for new object + void create( + const hobject_t &hoid + ) { + auto &op = op_map[hoid]; + ceph_assert(op.is_none() || op.is_delete()); + op.init_type = ObjectOperation::Init::Create(); + } + + /// Sets up state for target cloned from source + void clone( + const hobject_t &target, ///< [in] obj to clone to + const hobject_t &source ///< [in] obj to clone from + ) { + auto &op = op_map[target]; + ceph_assert(op.is_none() || op.is_delete()); + op.init_type = ObjectOperation::Init::Clone{source}; + } + + /// Sets up state for target renamed from source + void rename( + const hobject_t &target, ///< [in] to, must not exist, be non-temp + const hobject_t &source ///< [in] source (must be a temp object) + ) { + ceph_assert(source.is_temp()); + ceph_assert(!target.is_temp()); + auto &op = op_map[target]; + ceph_assert(op.is_none() || op.is_delete()); + + bool del_first = op.is_delete(); + auto iter = op_map.find(source); + if (iter != op_map.end()) { + op = iter->second; + op_map.erase(iter); + op.delete_first = del_first; + } + + op.init_type = ObjectOperation::Init::Rename{source}; + } + + /// Remove -- must not be called on rename target + void remove( + const hobject_t &hoid ///< [in] obj to remove + ) { + auto &op = get_object_op_for_modify(hoid); + if (!op.is_fresh_object()) { + ceph_assert(!op.updated_snaps); + op = ObjectOperation(); + op.delete_first = true; + } else { + ceph_assert(!op.is_rename()); + op_map.erase(hoid); // make it a noop if it's a fresh object + } + } + + void update_snaps( + const hobject_t &hoid, ///< [in] object for snaps + const set<snapid_t> &old_snaps,///< [in] old snaps value + const set<snapid_t> &new_snaps ///< [in] new snaps value + ) { + auto &op = get_object_op(hoid); + ceph_assert(!op.updated_snaps); + ceph_assert(op.buffer_updates.empty()); + ceph_assert(!op.truncate); + op.updated_snaps = make_pair( + old_snaps, + new_snaps); + } + + /// Clears, truncates + void omap_clear( + const hobject_t &hoid ///< [in] object to clear omap + ) { + auto &op = get_object_op_for_modify(hoid); + op.clear_omap = true; + op.omap_updates.clear(); + op.omap_header = boost::none; + } + void truncate( + const hobject_t &hoid, ///< [in] object + uint64_t off ///< [in] offset to truncate to + ) { + auto &op = get_object_op_for_modify(hoid); + ceph_assert(!op.updated_snaps); + op.buffer_updates.erase( + off, + std::numeric_limits<uint64_t>::max() - off); + if (!op.truncate || off < op.truncate->first) { + op.truncate = std::pair<uint64_t, uint64_t>(off, off); + } else { + op.truncate->second = off; + } + } + + /// Attr ops + void setattrs( + const hobject_t &hoid, ///< [in] object to write + map<string, bufferlist> &attrs ///< [in] attrs, may be cleared + ) { + auto &op = get_object_op_for_modify(hoid); + for (auto &&i: attrs) { + auto& d = op.attr_updates[i.first]; + d = i.second; + d->rebuild(); + } + } + void setattr( + const hobject_t &hoid, ///< [in] object to write + const string &attrname, ///< [in] attr to write + bufferlist &bl ///< [in] val to write, may be claimed + ) { + auto &op = get_object_op_for_modify(hoid); + auto& d = op.attr_updates[attrname]; + d = bl; + d->rebuild(); + } + void rmattr( + const hobject_t &hoid, ///< [in] 
object to write + const string &attrname ///< [in] attr to remove + ) { + auto &op = get_object_op_for_modify(hoid); + op.attr_updates[attrname] = boost::none; + } + + /// set alloc hint + void set_alloc_hint( + const hobject_t &hoid, ///< [in] object (must exist) + uint64_t expected_object_size, ///< [in] + uint64_t expected_write_size, + uint32_t flags + ) { + auto &op = get_object_op_for_modify(hoid); + op.alloc_hint = ObjectOperation::alloc_hint_t{ + expected_object_size, expected_write_size, flags}; + } + + /// Buffer updates + void write( + const hobject_t &hoid, ///< [in] object to write + uint64_t off, ///< [in] off at which to write + uint64_t len, ///< [in] len to write from bl + bufferlist &bl, ///< [in] bl to write will be claimed to len + uint32_t fadvise_flags = 0 ///< [in] fadvise hint + ) { + auto &op = get_object_op_for_modify(hoid); + ceph_assert(!op.updated_snaps); + ceph_assert(len > 0); + ceph_assert(len == bl.length()); + op.buffer_updates.insert( + off, + len, + ObjectOperation::BufferUpdate::Write{bl, fadvise_flags}); + } + void clone_range( + const hobject_t &from, ///< [in] from + const hobject_t &to, ///< [in] to + uint64_t fromoff, ///< [in] offset + uint64_t len, ///< [in] len + uint64_t tooff ///< [in] offset + ) { + auto &op = get_object_op_for_modify(to); + ceph_assert(!op.updated_snaps); + op.buffer_updates.insert( + tooff, + len, + ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len}); + } + void zero( + const hobject_t &hoid, ///< [in] object + uint64_t off, ///< [in] offset to start zeroing at + uint64_t len ///< [in] amount to zero + ) { + auto &op = get_object_op_for_modify(hoid); + ceph_assert(!op.updated_snaps); + op.buffer_updates.insert( + off, + len, + ObjectOperation::BufferUpdate::Zero{len}); + } + + /// Omap updates + void omap_setkeys( + const hobject_t &hoid, ///< [in] object to write + bufferlist &keys_bl ///< [in] encoded map<string, bufferlist> + ) { + auto &op = get_object_op_for_modify(hoid); + op.omap_updates.emplace_back( + make_pair( + ObjectOperation::OmapUpdateType::Insert, + keys_bl)); + } + void omap_setkeys( + const hobject_t &hoid, ///< [in] object to write + map<string, bufferlist> &keys ///< [in] omap keys, may be cleared + ) { + bufferlist bl; + encode(keys, bl); + omap_setkeys(hoid, bl); + } + void omap_rmkeys( + const hobject_t &hoid, ///< [in] object to write + bufferlist &keys_bl ///< [in] encode set<string> + ) { + auto &op = get_object_op_for_modify(hoid); + op.omap_updates.emplace_back( + make_pair( + ObjectOperation::OmapUpdateType::Remove, + keys_bl)); + } + void omap_rmkeys( + const hobject_t &hoid, ///< [in] object to write + set<string> &keys ///< [in] omap keys, may be cleared + ) { + bufferlist bl; + encode(keys, bl); + omap_rmkeys(hoid, bl); + } + void omap_setheader( + const hobject_t &hoid, ///< [in] object to write + bufferlist &header ///< [in] header + ) { + auto &op = get_object_op_for_modify(hoid); + op.omap_header = header; + } + + bool empty() const { + return op_map.empty(); + } + + uint64_t get_bytes_written() const { + uint64_t ret = 0; + for (auto &&i: op_map) { + for (auto &&j: i.second.buffer_updates) { + ret += j.get_len(); + } + } + return ret; + } + + void nop( + const hobject_t &hoid ///< [in] obj to which we are doing nothing + ) { + get_object_op_for_modify(hoid); + } + + /* Calls t() on all pair<hobject_t, ObjectOperation> & such that clone/rename + * sinks are always called before clone sources + * + * TODO: add a fast path for the single object case and possibly the single + * 
object clone from source case (make_writeable made a clone). + * + * This structure only requires that the source->sink graph be acyclic. + * This is much more general than is actually required by PrimaryLogPG. + * Only 4 flavors of multi-object transactions actually happen: + * 1) rename temp -> object for copyfrom + * 2) clone head -> clone, modify head for make_writeable on normal head write + * 3) clone clone -> head for rollback + * 4) 2 + 3 + * + * We can bypass the below logic for single object transactions trivially + * (including case 1 above since temp doesn't show up again). + * For 2-3, we could add something ad-hoc to ensure that they happen in the + * right order, but it actually seems easier to just do the graph construction. + */ + template <typename T> + void safe_create_traverse(T &&t) { + map<hobject_t, list<hobject_t>> dgraph; + list<hobject_t> stack; + + // Populate stack with roots, dgraph with edges + for (auto &&opair: op_map) { + hobject_t source; + if (opair.second.has_source(&source)) { + auto &l = dgraph[source]; + if (l.empty() && !op_map.count(source)) { + /* Source oids not in op_map need to be added as roots + * (but only once!) */ + stack.push_back(source); + } + l.push_back(opair.first); + } else { + stack.push_back(opair.first); + } + } + + /* Why don't we need to worry about accessing the same node + * twice? dgraph nodes always have in-degree at most 1 because + * the inverse graph nodes (source->dest) can have out-degree + * at most 1 (only one possible source). We do a post-order + * depth-first traversal here to ensure we call f on children + * before parents. + */ + while (!stack.empty()) { + hobject_t &cur = stack.front(); + auto diter = dgraph.find(cur); + if (diter == dgraph.end()) { + /* Leaf: pop and call t() */ + auto opiter = op_map.find(cur); + if (opiter != op_map.end()) + t(*opiter); + stack.pop_front(); + } else { + /* Internal node: push children onto stack, remove edge, + * recurse. When this node is encountered again, it'll + * be a leaf */ + ceph_assert(!diter->second.empty()); + stack.splice(stack.begin(), diter->second); + dgraph.erase(diter); + } + } + } +}; +using PGTransactionUPtr = std::unique_ptr<PGTransaction>; + +#endif diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc new file mode 100644 index 00000000..3b3e3e59 --- /dev/null +++ b/src/osd/PrimaryLogPG.cc @@ -0,0 +1,15554 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "boost/tuple/tuple.hpp" +#include "boost/intrusive_ptr.hpp" +#include "PG.h" +#include "PrimaryLogPG.h" +#include "OSD.h" +#include "OpRequest.h" +#include "ScrubStore.h" +#include "Session.h" +#include "objclass/objclass.h" + +#include "common/errno.h" +#include "common/scrub_types.h" +#include "common/perf_counters.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDPGTrim.h" +#include "messages/MOSDPGScan.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" +#include "messages/MCommandReply.h" +#include "messages/MOSDScrubReserve.h" +#include "mds/inode_backtrace.h" // Ugh +#include "common/EventTrace.h" + +#include "common/config.h" +#include "include/compat.h" +#include "mon/MonClient.h" +#include "osdc/Objecter.h" +#include "json_spirit/json_spirit_value.h" +#include "json_spirit/json_spirit_reader.h" +#include "include/ceph_assert.h" // json_spirit clobbers it +#include "include/rados/rados_types.hpp" + +#ifdef WITH_LTTNG +#include "tracing/osd.h" +#else +#define tracepoint(...) +#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap() +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +template <typename T> +static ostream& _prefix(std::ostream *_dout, T *pg) { + return pg->gen_prefix(*_dout); +} + + +#include <sstream> +#include <utility> + +#include <errno.h> + +MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd); + +PGLSFilter::PGLSFilter() : cct(nullptr) +{ +} + +PGLSFilter::~PGLSFilter() +{ +} + +/** + * The CopyCallback class defines an interface for completions to the + * copy_start code. Users of the copy infrastructure must implement + * one and give an instance of the class to start_copy. + * + * The implementer is responsible for making sure that the CopyCallback + * can associate itself with the correct copy operation. + */ +class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> { +protected: + CopyCallback() {} + /** + * results.get<0>() is the return code: 0 for success; -ECANCELED if + * the operation was cancelled by the local OSD; -errno for other issues. + * results.get<1>() is a pointer to a CopyResults object, which you are + * responsible for deleting. + */ + void finish(CopyCallbackResults results_) override = 0; + +public: + /// Provide the final size of the copied object to the CopyCallback + ~CopyCallback() override {} +}; + +template <typename T> +class PrimaryLogPG::BlessedGenContext : public GenContext<T> { + PrimaryLogPGRef pg; + unique_ptr<GenContext<T>> c; + epoch_t e; +public: + BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(T t) override { + pg->lock(); + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(t); + pg->unlock(); + } + bool sync_finish(T t) { + // we assume here all blessed/wrapped Contexts can complete synchronously. 
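+    // unlike finish(), sync_finish() completes the wrapped context without
+    // taking the PG lock or re-checking pg_has_reset_since()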
+ c.release()->complete(t); + return true; + } +}; + +GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext( + GenContext<ThreadPool::TPHandle&> *c) { + return new BlessedGenContext<ThreadPool::TPHandle&>( + this, c, get_osdmap_epoch()); +} + +template <typename T> +class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> { + PrimaryLogPGRef pg; + unique_ptr<GenContext<T>> c; + epoch_t e; +public: + UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(T t) override { + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(t); + } + bool sync_finish(T t) { + // we assume here all blessed/wrapped Contexts can complete synchronously. + c.release()->complete(t); + return true; + } +}; + +GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext( + GenContext<ThreadPool::TPHandle&> *c) { + return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>( + this, c, get_osdmap_epoch()); +} + +class PrimaryLogPG::BlessedContext : public Context { + PrimaryLogPGRef pg; + unique_ptr<Context> c; + epoch_t e; +public: + BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(int r) override { + pg->lock(); + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(r); + pg->unlock(); + } + bool sync_finish(int r) { + // we assume here all blessed/wrapped Contexts can complete synchronously. + c.release()->complete(r); + return true; + } +}; + +Context *PrimaryLogPG::bless_context(Context *c) { + return new BlessedContext(this, c, get_osdmap_epoch()); +} + +class PrimaryLogPG::C_PG_ObjectContext : public Context { + PrimaryLogPGRef pg; + ObjectContext *obc; + public: + C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) : + pg(p), obc(o) {} + void finish(int r) override { + pg->object_context_destructor_callback(obc); + } +}; + +struct OnReadComplete : public Context { + PrimaryLogPG *pg; + PrimaryLogPG::OpContext *opcontext; + OnReadComplete( + PrimaryLogPG *pg, + PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {} + void finish(int r) override { + opcontext->finish_read(pg); + } + ~OnReadComplete() override {} +}; + +class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context { + PrimaryLogPGRef pg; + ObjectContextRef obc; + public: + C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) : + pg(p), obc(o) {} + bool sync_finish(int r) override { + pg->_applied_recovered_object(obc); + return true; + } + void finish(int r) override { + pg->lock(); + pg->_applied_recovered_object(obc); + pg->unlock(); + } +}; + +class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context { + PrimaryLogPGRef pg; + epoch_t epoch; + eversion_t last_complete; + public: + C_OSD_CommittedPushedObject( + PrimaryLogPG *p, epoch_t epoch, eversion_t lc) : + pg(p), epoch(epoch), last_complete(lc) { + } + void finish(int r) override { + pg->_committed_pushed_object(epoch, last_complete); + } +}; + +class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context { + PrimaryLogPGRef pg; + public: + explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) : + pg(p) {} + bool sync_finish(int r) override { + pg->_applied_recovered_object_replica(); + return true; + } + void finish(int r) override { + pg->lock(); + pg->_applied_recovered_object_replica(); + pg->unlock(); + } +}; + +// OpContext +void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg) +{ + inflightreads = 1; + list<pair<boost::tuple<uint64_t, 
uint64_t, unsigned>, + pair<bufferlist*, Context*> > > in; + in.swap(pending_async_reads); + pg->pgbackend->objects_read_async( + obc->obs.oi.soid, + in, + new OnReadComplete(pg, this), pg->get_pool().fast_read); +} +void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg) +{ + ceph_assert(inflightreads > 0); + --inflightreads; + if (async_reads_complete()) { + ceph_assert(pg->in_progress_async_reads.size()); + ceph_assert(pg->in_progress_async_reads.front().second == this); + pg->in_progress_async_reads.pop_front(); + + // Restart the op context now that all reads have been + // completed. Read failures will be handled by the op finisher + pg->execute_ctx(this); + } +} + +class CopyFromCallback : public PrimaryLogPG::CopyCallback { +public: + PrimaryLogPG::CopyResults *results = nullptr; + PrimaryLogPG::OpContext *ctx; + OSDOp &osd_op; + + CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op) + : ctx(ctx), osd_op(osd_op) { + } + ~CopyFromCallback() override {} + + void finish(PrimaryLogPG::CopyCallbackResults results_) override { + results = results_.get<1>(); + int r = results_.get<0>(); + + // for finish_copyfrom + ctx->user_at_version = results->user_version; + + if (r >= 0) { + ctx->pg->execute_ctx(ctx); + } else { + if (r != -ECANCELED) { // on cancel just toss it out; client resends + if (ctx->op) + ctx->pg->osd->reply_op_error(ctx->op, r); + } else if (results->should_requeue) { + if (ctx->op) + ctx->pg->requeue_op(ctx->op); + } + ctx->pg->close_op_ctx(ctx); + } + } + + bool is_temp_obj_used() { + return results->started_temp_obj; + } + uint64_t get_data_size() { + return results->object_size; + } +}; + +struct CopyFromFinisher : public PrimaryLogPG::OpFinisher { + CopyFromCallback *copy_from_callback; + + explicit CopyFromFinisher(CopyFromCallback *copy_from_callback) + : copy_from_callback(copy_from_callback) { + } + + int execute() override { + // instance will be destructed after this method completes + copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback); + return 0; + } +}; + +// ====================== +// PGBackend::Listener + +void PrimaryLogPG::on_local_recover( + const hobject_t &hoid, + const ObjectRecoveryInfo &_recovery_info, + ObjectContextRef obc, + bool is_delete, + ObjectStore::Transaction *t + ) +{ + dout(10) << __func__ << ": " << hoid << dendl; + + ObjectRecoveryInfo recovery_info(_recovery_info); + clear_object_snap_mapping(t, hoid); + if (!is_delete && recovery_info.soid.is_snap()) { + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + set<snapid_t> snaps; + dout(20) << " snapset " << recovery_info.ss << dendl; + auto p = recovery_info.ss.clone_snaps.find(hoid.snap); + if (p != recovery_info.ss.clone_snaps.end()) { + snaps.insert(p->second.begin(), p->second.end()); + dout(20) << " snaps " << snaps << dendl; + snap_mapper.add_oid( + recovery_info.soid, + snaps, + &_t); + } else { + derr << __func__ << " " << hoid << " had no clone_snaps" << dendl; + } + } + if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) && + pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) { + ceph_assert(is_primary()); + const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second; + if (latest->op == pg_log_entry_t::LOST_REVERT && + latest->reverting_to == recovery_info.version) { + dout(10) << " got old revert version " << recovery_info.version + << " for " << *latest << dendl; + recovery_info.version = latest->version; + // update the attr to the revert event 
version + recovery_info.oi.prior_version = recovery_info.oi.version; + recovery_info.oi.version = latest->version; + bufferlist bl; + encode(recovery_info.oi, bl, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + ceph_assert(!pool.info.is_erasure()); + t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl); + if (obc) + obc->attr_cache[OI_ATTR] = bl; + } + } + + // keep track of active pushes for scrub + ++active_pushes; + + if (recovery_info.version > pg_log.get_can_rollback_to()) { + /* This can only happen during a repair, and even then, it would + * be one heck of a race. If we are repairing the object, the + * write in question must be fully committed, so it's not valid + * to roll it back anyway (and we'll be rolled forward shortly + * anyway) */ + PGLogEntryHandler h{this, t}; + pg_log.roll_forward_to(recovery_info.version, &h); + } + recover_got(recovery_info.soid, recovery_info.version); + + if (is_primary()) { + if (!is_delete) { + obc->obs.exists = true; + + bool got = obc->get_recovery_read(); + ceph_assert(got); + + ceph_assert(recovering.count(obc->obs.oi.soid)); + recovering[obc->obs.oi.soid] = obc; + obc->obs.oi = recovery_info.oi; // may have been updated above + } + + t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc)); + + publish_stats_to_osd(); + ceph_assert(missing_loc.needs_recovery(hoid)); + if (!is_delete) + missing_loc.add_location(hoid, pg_whoami); + release_backoffs(hoid); + if (!is_unreadable_object(hoid)) { + auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid); + if (unreadable_object_entry != waiting_for_unreadable_object.end()) { + dout(20) << " kicking unreadable waiters on " << hoid << dendl; + requeue_ops(unreadable_object_entry->second); + waiting_for_unreadable_object.erase(unreadable_object_entry); + } + } + } else { + t->register_on_applied( + new C_OSD_AppliedRecoveredObjectReplica(this)); + + } + + t->register_on_commit( + new C_OSD_CommittedPushedObject( + this, + get_osdmap_epoch(), + info.last_complete)); + + // update pg + dirty_info = true; + write_if_dirty(*t); +} + +void PrimaryLogPG::on_global_recover( + const hobject_t &soid, + const object_stat_sum_t &stat_diff, + bool is_delete) +{ + info.stats.stats.sum.add(stat_diff); + missing_loc.recovered(soid); + publish_stats_to_osd(); + dout(10) << "pushed " << soid << " to all replicas" << dendl; + map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid); + ceph_assert(i != recovering.end()); + + if (i->second && i->second->rwstate.recovery_read_marker) { + // recover missing won't have had an obc, but it gets filled in + // during on_local_recover + ceph_assert(i->second); + list<OpRequestRef> requeue_list; + i->second->drop_recovery_read(&requeue_list); + requeue_ops(requeue_list); + } + + backfills_in_flight.erase(soid); + + recovering.erase(i); + finish_recovery_op(soid); + release_backoffs(soid); + auto degraded_object_entry = waiting_for_degraded_object.find(soid); + if (degraded_object_entry != waiting_for_degraded_object.end()) { + dout(20) << " kicking degraded waiters on " << soid << dendl; + requeue_ops(degraded_object_entry->second); + waiting_for_degraded_object.erase(degraded_object_entry); + } + auto unreadable_object_entry = waiting_for_unreadable_object.find(soid); + if (unreadable_object_entry != waiting_for_unreadable_object.end()) { + dout(20) << " kicking unreadable waiters on " << soid << dendl; + requeue_ops(unreadable_object_entry->second); + waiting_for_unreadable_object.erase(unreadable_object_entry); + 
} + finish_degraded_object(soid); +} + +void PrimaryLogPG::on_peer_recover( + pg_shard_t peer, + const hobject_t &soid, + const ObjectRecoveryInfo &recovery_info) +{ + publish_stats_to_osd(); + // done! + peer_missing[peer].got(soid, recovery_info.version); + missing_loc.add_location(soid, peer); +} + +void PrimaryLogPG::begin_peer_recover( + pg_shard_t peer, + const hobject_t soid) +{ + peer_missing[peer].revise_have(soid, eversion_t()); +} + +void PrimaryLogPG::schedule_recovery_work( + GenContext<ThreadPool::TPHandle&> *c) +{ + osd->queue_recovery_context(this, c); +} + +void PrimaryLogPG::send_message_osd_cluster( + int peer, Message *m, epoch_t from_epoch) +{ + osd->send_message_osd_cluster(peer, m, from_epoch); +} + +void PrimaryLogPG::send_message_osd_cluster( + Message *m, Connection *con) +{ + osd->send_message_osd_cluster(m, con); +} + +void PrimaryLogPG::send_message_osd_cluster( + Message *m, const ConnectionRef& con) +{ + osd->send_message_osd_cluster(m, con); +} + +void PrimaryLogPG::on_primary_error( + const hobject_t &oid, + eversion_t v) +{ + dout(0) << __func__ << ": oid " << oid << " version " << v << dendl; + primary_failed(oid); + primary_error(oid, v); + backfill_add_missing(oid, v); +} + +void PrimaryLogPG::backfill_add_missing( + const hobject_t &oid, + eversion_t v) +{ + dout(0) << __func__ << ": oid " << oid << " version " << v << dendl; + backfills_in_flight.erase(oid); + missing_loc.add_missing(oid, v, eversion_t()); +} + +bool PrimaryLogPG::should_send_op( + pg_shard_t peer, + const hobject_t &hoid) { + if (peer == get_primary()) + return true; + ceph_assert(peer_info.count(peer)); + bool should_send = + hoid.pool != (int64_t)info.pgid.pool() || + hoid <= last_backfill_started || + hoid <= peer_info[peer].last_backfill; + if (!should_send) { + ceph_assert(is_backfill_targets(peer)); + dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer + << ", object " << hoid + << " beyond std::max(last_backfill_started " + << ", peer_info[peer].last_backfill " + << peer_info[peer].last_backfill << ")" << dendl; + return should_send; + } + if (async_recovery_targets.count(peer) && peer_missing[peer].is_missing(hoid)) { + should_send = false; + dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer + << ", object " << hoid + << " which is pending recovery in async_recovery_targets" << dendl; + } + return should_send; +} + + +ConnectionRef PrimaryLogPG::get_con_osd_cluster( + int peer, epoch_t from_epoch) +{ + return osd->get_con_osd_cluster(peer, from_epoch); +} + +PerfCounters *PrimaryLogPG::get_logger() +{ + return osd->logger; +} + + +// ==================== +// missing objects + +bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const +{ + return pg_log.get_missing().get_items().count(soid); +} + +void PrimaryLogPG::maybe_kick_recovery( + const hobject_t &soid) +{ + eversion_t v; + bool work_started = false; + if (!missing_loc.needs_recovery(soid, &v)) + return; + + map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid); + if (p != recovering.end()) { + dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl; + } else if (missing_loc.is_unfound(soid)) { + dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl; + } else { + dout(7) << "object " << soid << " v " << v << ", recovering." 
<< dendl; + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + if (is_missing_object(soid)) { + recover_missing(soid, v, cct->_conf->osd_client_op_priority, h); + } else if (missing_loc.is_deleted(soid)) { + prep_object_replica_deletes(soid, v, h, &work_started); + } else { + prep_object_replica_pushes(soid, v, h, &work_started); + } + pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority); + } +} + +void PrimaryLogPG::wait_for_unreadable_object( + const hobject_t& soid, OpRequestRef op) +{ + ceph_assert(is_unreadable_object(soid)); + maybe_kick_recovery(soid); + waiting_for_unreadable_object[soid].push_back(op); + op->mark_delayed("waiting for missing object"); +} + +bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid) +{ + /* The conditions below may clear (on_local_recover, before we queue + * the transaction) before we actually requeue the degraded waiters + * in on_global_recover after the transaction completes. + */ + if (waiting_for_degraded_object.count(soid)) + return true; + if (pg_log.get_missing().get_items().count(soid)) + return true; + ceph_assert(!acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + auto peer_missing_entry = peer_missing.find(peer); + // If an object is missing on an async_recovery_target, return false. + // This will not block the op and the object is async recovered later. + if (peer_missing_entry != peer_missing.end() && + peer_missing_entry->second.get_items().count(soid)) { + if (async_recovery_targets.count(peer)) + continue; + else + return true; + } + // Object is degraded if after last_backfill AND + // we are backfilling it + if (is_backfill_targets(peer) && + peer_info[peer].last_backfill <= soid && + last_backfill_started >= soid && + backfills_in_flight.count(soid)) + return true; + } + return false; +} + +bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid) +{ + for (auto &i: async_recovery_targets) { + auto peer_missing_entry = peer_missing.find(i); + if (peer_missing_entry != peer_missing.end() && + peer_missing_entry->second.get_items().count(soid)) { + dout(30) << __func__ << " " << soid << dendl; + return true; + } + } + return false; +} + +void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op) +{ + ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid)); + + maybe_kick_recovery(soid); + waiting_for_degraded_object[soid].push_back(op); + op->mark_delayed("waiting for degraded object"); +} + +void PrimaryLogPG::block_write_on_full_cache( + const hobject_t& _oid, OpRequestRef op) +{ + const hobject_t oid = _oid.get_head(); + dout(20) << __func__ << ": blocking object " << oid + << " on full cache" << dendl; + objects_blocked_on_cache_full.insert(oid); + waiting_for_cache_not_full.push_back(op); + op->mark_delayed("waiting for cache not full"); +} + +void PrimaryLogPG::block_for_clean( + const hobject_t& oid, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << oid + << " on primary repair" << dendl; + waiting_for_clean_to_primary_repair.push_back(op); + op->mark_delayed("waiting for clean to repair"); +} + +void PrimaryLogPG::block_write_on_snap_rollback( + const hobject_t& oid, ObjectContextRef obc, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << oid.get_head() + << " on snap promotion " << 
obc->obs.oi.soid << dendl; + // otherwise, we'd have blocked in do_op + ceph_assert(oid.is_head()); + ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0); + objects_blocked_on_snap_promotion[oid] = obc; + wait_for_blocked_object(obc->obs.oi.soid, op); +} + +void PrimaryLogPG::block_write_on_degraded_snap( + const hobject_t& snap, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << snap.get_head() + << " on degraded snap " << snap << dendl; + // otherwise, we'd have blocked in do_op + ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0); + objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap; + wait_for_degraded_object(snap, op); +} + +bool PrimaryLogPG::maybe_await_blocked_head( + const hobject_t &hoid, + OpRequestRef op) +{ + ObjectContextRef obc; + obc = object_contexts.lookup(hoid.get_head()); + if (obc) { + if (obc->is_blocked()) { + wait_for_blocked_object(obc->obs.oi.soid, op); + return true; + } else { + return false; + } + } + return false; +} + +void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op) +{ + dout(10) << __func__ << " " << soid << " " << op << dendl; + waiting_for_blocked_object[soid].push_back(op); + op->mark_delayed("waiting for blocked object"); +} + +void PrimaryLogPG::maybe_force_recovery() +{ + // no force if not in degraded/recovery/backfill states + if (!is_degraded() && + !state_test(PG_STATE_RECOVERING | + PG_STATE_RECOVERY_WAIT | + PG_STATE_BACKFILLING | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILL_TOOFULL)) + return; + + if (pg_log.get_log().approx_size() < + cct->_conf->osd_max_pg_log_entries * + cct->_conf->osd_force_recovery_pg_log_entries_factor) + return; + + // find the oldest missing object + version_t min_version = pg_log.get_log().head.version; + hobject_t soid; + if (!pg_log.get_missing().get_rmissing().empty()) { + min_version = pg_log.get_missing().get_rmissing().begin()->first; + soid = pg_log.get_missing().get_rmissing().begin()->second; + } + ceph_assert(!acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator it = acting_recovery_backfill.begin(); + it != acting_recovery_backfill.end(); + ++it) { + if (*it == get_primary()) continue; + pg_shard_t peer = *it; + auto it_missing = peer_missing.find(peer); + if (it_missing != peer_missing.end() && + !it_missing->second.get_rmissing().empty()) { + const auto& min_obj = peer_missing[peer].get_rmissing().begin(); + dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first + << " oid " << min_obj->second << dendl; + if (min_version > min_obj->first) { + min_version = min_obj->first; + soid = min_obj->second; + } + } + } + + // recover it + if (soid != hobject_t()) + maybe_kick_recovery(soid); +} + +class PGLSPlainFilter : public PGLSFilter { + string val; +public: + int init(bufferlist::const_iterator ¶ms) override + { + try { + decode(xattr, params); + decode(val, params); + } catch (buffer::error &e) { + return -EINVAL; + } + + return 0; + } + ~PGLSPlainFilter() override {} + bool filter(const hobject_t &obj, bufferlist& xattr_data, + bufferlist& outdata) override; +}; + +class PGLSParentFilter : public PGLSFilter { + inodeno_t parent_ino; +public: + CephContext* cct; + explicit PGLSParentFilter(CephContext* cct) : cct(cct) { + xattr = "_parent"; + } + int init(bufferlist::const_iterator ¶ms) override + { + try { + decode(parent_ino, params); + } catch (buffer::error &e) { + return -EINVAL; + } + generic_dout(0) << "parent_ino=" << parent_ino << dendl; + + return 0; + } + 
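+  // filter() (defined further below) matches objects whose "_parent"
+  // backtrace xattr lists an ancestor directory with this inode number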
~PGLSParentFilter() override {} + bool filter(const hobject_t &obj, bufferlist& xattr_data, + bufferlist& outdata) override; +}; + +bool PGLSParentFilter::filter(const hobject_t &obj, + bufferlist& xattr_data, bufferlist& outdata) +{ + auto iter = xattr_data.cbegin(); + inode_backtrace_t bt; + + generic_dout(0) << "PGLSParentFilter::filter" << dendl; + + decode(bt, iter); + + vector<inode_backpointer_t>::iterator vi; + for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) { + generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl; + if (vi->dirino == parent_ino) { + encode(*vi, outdata); + return true; + } + } + + return false; +} + +bool PGLSPlainFilter::filter(const hobject_t &obj, + bufferlist& xattr_data, bufferlist& outdata) +{ + if (val.size() != xattr_data.length()) + return false; + + if (memcmp(val.c_str(), xattr_data.c_str(), val.size())) + return false; + + return true; +} + +bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata) +{ + bufferlist bl; + + // If filter has expressed an interest in an xattr, load it. + if (!filter->get_xattr().empty()) { + int ret = pgbackend->objects_get_attr( + sobj, + filter->get_xattr(), + &bl); + dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl; + if (ret < 0) { + if (ret != -ENODATA || filter->reject_empty_xattr()) { + return false; + } + } + } + + return filter->filter(sobj, bl, outdata); +} + +int PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter, PGLSFilter **pfilter) +{ + string type; + PGLSFilter *filter; + + try { + decode(type, iter); + } + catch (buffer::error& e) { + return -EINVAL; + } + + if (type.compare("parent") == 0) { + filter = new PGLSParentFilter(cct); + } else if (type.compare("plain") == 0) { + filter = new PGLSPlainFilter(); + } else { + std::size_t dot = type.find("."); + if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) { + return -EINVAL; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = NULL; + int r = osd->class_handler->open_class(class_name, &cls); + if (r != 0) { + derr << "Error opening class '" << class_name << "': " + << cpp_strerror(r) << dendl; + if (r != -EPERM) // propogate permission error + r = -EINVAL; + return r; + } else { + ceph_assert(cls); + } + + ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name); + if (class_filter == NULL) { + derr << "Error finding filter '" << filter_name << "' in class " + << class_name << dendl; + return -EINVAL; + } + filter = class_filter->fn(); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + derr << "Buggy class " << class_name << " failed to construct " + "filter " << filter_name << dendl; + return -EINVAL; + } + } + + ceph_assert(filter); + int r = filter->init(iter); + if (r < 0) { + derr << "Error initializing filter " << type << ": " + << cpp_strerror(r) << dendl; + delete filter; + return -EINVAL; + } else { + // Successfully constructed and initialized, return it. 
+ *pfilter = filter; + return 0; + } +} + + +// ========================================================== + +int PrimaryLogPG::do_command( + cmdmap_t cmdmap, + ostream& ss, + bufferlist& idata, + bufferlist& odata, + ConnectionRef con, + ceph_tid_t tid) +{ + string prefix; + string format; + + cmd_getval(cct, cmdmap, "format", format); + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json")); + + string command; + cmd_getval(cct, cmdmap, "cmd", command); + if (command == "query") { + f->open_object_section("pg"); + f->dump_string("state", pg_state_string(get_state())); + f->dump_stream("snap_trimq") << snap_trimq; + f->dump_unsigned("snap_trimq_len", snap_trimq.size()); + f->dump_unsigned("epoch", get_osdmap_epoch()); + f->open_array_section("up"); + for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) + f->dump_unsigned("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) + f->dump_unsigned("osd", *p); + f->close_section(); + if (!backfill_targets.empty()) { + f->open_array_section("backfill_targets"); + for (set<pg_shard_t>::iterator p = backfill_targets.begin(); + p != backfill_targets.end(); + ++p) + f->dump_stream("shard") << *p; + f->close_section(); + } + if (!async_recovery_targets.empty()) { + f->open_array_section("async_recovery_targets"); + for (set<pg_shard_t>::iterator p = async_recovery_targets.begin(); + p != async_recovery_targets.end(); + ++p) + f->dump_stream("shard") << *p; + f->close_section(); + } + if (!acting_recovery_backfill.empty()) { + f->open_array_section("acting_recovery_backfill"); + for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin(); + p != acting_recovery_backfill.end(); + ++p) + f->dump_stream("shard") << *p; + f->close_section(); + } + f->open_object_section("info"); + _update_calc_stats(); + info.dump(f.get()); + f->close_section(); + + f->open_array_section("peer_info"); + for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin(); + p != peer_info.end(); + ++p) { + f->open_object_section("info"); + f->dump_stream("peer") << p->first; + p->second.dump(f.get()); + f->close_section(); + } + f->close_section(); + + f->open_array_section("recovery_state"); + handle_query_state(f.get()); + f->close_section(); + + f->open_object_section("agent_state"); + if (agent_state) + agent_state->dump(f.get()); + f->close_section(); + + f->close_section(); + f->flush(odata); + return 0; + } + else if (command == "mark_unfound_lost") { + string mulcmd; + cmd_getval(cct, cmdmap, "mulcmd", mulcmd); + int mode = -1; + if (mulcmd == "revert") { + if (pool.info.is_erasure()) { + ss << "mode must be 'delete' for ec pool"; + return -EINVAL; + } + mode = pg_log_entry_t::LOST_REVERT; + } else if (mulcmd == "delete") { + mode = pg_log_entry_t::LOST_DELETE; + } else { + ss << "mode must be 'revert' or 'delete'; mark not yet implemented"; + return -EINVAL; + } + ceph_assert(mode == pg_log_entry_t::LOST_REVERT || + mode == pg_log_entry_t::LOST_DELETE); + + if (!is_primary()) { + ss << "not primary"; + return -EROFS; + } + + uint64_t unfound = missing_loc.num_unfound(); + if (!unfound) { + ss << "pg has no unfound objects"; + return 0; // make command idempotent + } + + if (!all_unfound_are_queried_or_lost(get_osdmap())) { + ss << "pg has " << unfound + << " unfound objects but we haven't probed all sources, not marking lost"; + return -EINVAL; + } + + mark_all_unfound_lost(mode, con, tid); + return -EAGAIN; + } + else if (command == 
"list_unfound") { + hobject_t offset; + string offset_json; + bool show_offset = false; + if (cmd_getval(cct, cmdmap, "offset", offset_json)) { + json_spirit::Value v; + try { + if (!json_spirit::read(offset_json, v)) + throw std::runtime_error("bad json"); + offset.decode(v); + } catch (std::runtime_error& e) { + ss << "error parsing offset: " << e.what(); + return -EINVAL; + } + show_offset = true; + } + f->open_object_section("missing"); + if (show_offset) { + f->open_object_section("offset"); + offset.dump(f.get()); + f->close_section(); + } + auto &needs_recovery_map = missing_loc.get_needs_recovery(); + f->dump_int("num_missing", needs_recovery_map.size()); + f->dump_int("num_unfound", get_num_unfound()); + map<hobject_t, pg_missing_item>::const_iterator p = + needs_recovery_map.upper_bound(offset); + { + f->open_array_section("objects"); + int32_t num = 0; + for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) { + if (missing_loc.is_unfound(p->first)) { + f->open_object_section("object"); + { + f->open_object_section("oid"); + p->first.dump(f.get()); + f->close_section(); + } + p->second.dump(f.get()); // have, need keys + { + f->open_array_section("locations"); + for (set<pg_shard_t>::iterator r = + missing_loc.get_locations(p->first).begin(); + r != missing_loc.get_locations(p->first).end(); + ++r) + f->dump_stream("shard") << *r; + f->close_section(); + } + f->close_section(); + num++; + } + } + f->close_section(); + } + f->dump_bool("more", p != needs_recovery_map.end()); + f->close_section(); + f->flush(odata); + return 0; + } + + ss << "unknown pg command " << prefix; + return -EINVAL; +} + +// ========================================================== + +void PrimaryLogPG::do_pg_op(OpRequestRef op) +{ + // NOTE: this is non-const because we modify the OSDOp.outdata in + // place + MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + dout(10) << "do_pg_op " << *m << dendl; + + op->mark_started(); + + int result = 0; + string cname, mname; + PGLSFilter *filter = NULL; + bufferlist filter_out; + + snapid_t snapid = m->get_snapid(); + + vector<OSDOp> ops = m->ops; + + for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) { + OSDOp& osd_op = *p; + auto bp = p->indata.cbegin(); + switch (p->op.op) { + case CEPH_OSD_OP_PGNLS_FILTER: + try { + decode(cname, bp); + decode(mname, bp); + } + catch (const buffer::error& e) { + dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl; + result = -EINVAL; + break; + } + if (filter) { + delete filter; + filter = NULL; + } + result = get_pgls_filter(bp, &filter); + if (result < 0) + break; + + ceph_assert(filter); + + // fall through + + case CEPH_OSD_OP_PGNLS: + if (snapid != CEPH_NOSNAP) { + result = -EINVAL; + break; + } + if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) { + dout(10) << " pgnls pg=" << m->get_pg() + << " " << get_osdmap()->raw_pg_to_pg(m->get_pg()) + << " != " << info.pgid << dendl; + result = 0; // hmm? 
+ } else { + unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls, + p->op.pgls.count); + + dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size + << dendl; + // read into a buffer + vector<hobject_t> sentries; + pg_nls_response_t response; + try { + decode(response.handle, bp); + } + catch (const buffer::error& e) { + dout(0) << "unable to decode PGNLS handle in " << *m << dendl; + result = -EINVAL; + break; + } + + hobject_t next; + hobject_t lower_bound = response.handle; + hobject_t pg_start = info.pgid.pgid.get_hobj_start(); + hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + dout(10) << " pgnls lower_bound " << lower_bound + << " pg_end " << pg_end << dendl; + if (((!lower_bound.is_max() && lower_bound >= pg_end) || + (lower_bound != hobject_t() && lower_bound < pg_start))) { + // this should only happen with a buggy client. + dout(10) << "outside of PG bounds " << pg_start << " .. " + << pg_end << dendl; + result = -EINVAL; + break; + } + + hobject_t current = lower_bound; + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + &sentries, + &next); + if (r != 0) { + result = -EINVAL; + break; + } + + map<hobject_t, pg_missing_item>::const_iterator missing_iter = + pg_log.get_missing().get_items().lower_bound(current); + vector<hobject_t>::iterator ls_iter = sentries.begin(); + hobject_t _max = hobject_t::get_max(); + while (1) { + const hobject_t &mcand = + missing_iter == pg_log.get_missing().get_items().end() ? + _max : + missing_iter->first; + const hobject_t &lcand = + ls_iter == sentries.end() ? + _max : + *ls_iter; + + hobject_t candidate; + if (mcand == lcand) { + candidate = mcand; + if (!mcand.is_max()) { + ++ls_iter; + ++missing_iter; + } + } else if (mcand < lcand) { + candidate = mcand; + ceph_assert(!mcand.is_max()); + ++missing_iter; + } else { + candidate = lcand; + ceph_assert(!lcand.is_max()); + ++ls_iter; + } + + dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash() + << " vs lower bound 0x" << lower_bound.get_hash() + << std::dec << dendl; + + if (candidate >= next) { + break; + } + + if (response.entries.size() == list_size) { + next = candidate; + break; + } + + if (candidate.snap != CEPH_NOSNAP) + continue; + + // skip internal namespace + if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace) + continue; + + if (missing_loc.is_deleted(candidate)) + continue; + + // skip wrong namespace + if (m->get_hobj().nspace != librados::all_nspaces && + candidate.get_namespace() != m->get_hobj().nspace) + continue; + + if (filter && !pgls_filter(filter, candidate, filter_out)) + continue; + + dout(20) << "pgnls item 0x" << std::hex + << candidate.get_hash() + << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash()) + << std::dec << " " + << candidate.oid.name << dendl; + + librados::ListObjectImpl item; + item.nspace = candidate.get_namespace(); + item.oid = candidate.oid.name; + item.locator = candidate.get_key(); + response.entries.push_back(item); + } + + if (next.is_max() && + missing_iter == pg_log.get_missing().get_items().end() && + ls_iter == sentries.end()) { + result = 1; + + // Set response.handle to the start of the next PG according + // to the object sort order. 
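      // Editor's note: result == 1 together with a handle that points at the
      // end of this PG's hash range tells the caller that this PG has been
      // fully enumerated, so the next PGNLS can move on to the next PG.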
+ response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + } else { + response.handle = next; + } + dout(10) << "pgnls handle=" << response.handle << dendl; + encode(response, osd_op.outdata); + if (filter) + encode(filter_out, osd_op.outdata); + dout(10) << " pgnls result=" << result << " outdata.length()=" + << osd_op.outdata.length() << dendl; + } + break; + + case CEPH_OSD_OP_PGLS_FILTER: + try { + decode(cname, bp); + decode(mname, bp); + } + catch (const buffer::error& e) { + dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl; + result = -EINVAL; + break; + } + if (filter) { + delete filter; + filter = NULL; + } + result = get_pgls_filter(bp, &filter); + if (result < 0) + break; + + ceph_assert(filter); + + // fall through + + case CEPH_OSD_OP_PGLS: + if (snapid != CEPH_NOSNAP) { + result = -EINVAL; + break; + } + if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) { + dout(10) << " pgls pg=" << m->get_pg() + << " " << get_osdmap()->raw_pg_to_pg(m->get_pg()) + << " != " << info.pgid << dendl; + result = 0; // hmm? + } else { + unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls, + p->op.pgls.count); + + dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl; + // read into a buffer + vector<hobject_t> sentries; + pg_ls_response_t response; + try { + decode(response.handle, bp); + } + catch (const buffer::error& e) { + dout(0) << "unable to decode PGLS handle in " << *m << dendl; + result = -EINVAL; + break; + } + + hobject_t next; + hobject_t current = response.handle; + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + &sentries, + &next); + if (r != 0) { + result = -EINVAL; + break; + } + + ceph_assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty()); + + map<hobject_t, pg_missing_item>::const_iterator missing_iter = + pg_log.get_missing().get_items().lower_bound(current); + vector<hobject_t>::iterator ls_iter = sentries.begin(); + hobject_t _max = hobject_t::get_max(); + while (1) { + const hobject_t &mcand = + missing_iter == pg_log.get_missing().get_items().end() ? + _max : + missing_iter->first; + const hobject_t &lcand = + ls_iter == sentries.end() ? 
+ _max : + *ls_iter; + + hobject_t candidate; + if (mcand == lcand) { + candidate = mcand; + if (!mcand.is_max()) { + ++ls_iter; + ++missing_iter; + } + } else if (mcand < lcand) { + candidate = mcand; + ceph_assert(!mcand.is_max()); + ++missing_iter; + } else { + candidate = lcand; + ceph_assert(!lcand.is_max()); + ++ls_iter; + } + + if (candidate >= next) { + break; + } + + if (response.entries.size() == list_size) { + next = candidate; + break; + } + + if (candidate.snap != CEPH_NOSNAP) + continue; + + // skip wrong namespace + if (candidate.get_namespace() != m->get_hobj().nspace) + continue; + + if (missing_loc.is_deleted(candidate)) + continue; + + if (filter && !pgls_filter(filter, candidate, filter_out)) + continue; + + response.entries.push_back(make_pair(candidate.oid, + candidate.get_key())); + } + if (next.is_max() && + missing_iter == pg_log.get_missing().get_items().end() && + ls_iter == sentries.end()) { + result = 1; + } + response.handle = next; + encode(response, osd_op.outdata); + if (filter) + encode(filter_out, osd_op.outdata); + dout(10) << " pgls result=" << result << " outdata.length()=" + << osd_op.outdata.length() << dendl; + } + break; + + case CEPH_OSD_OP_PG_HITSET_LS: + { + list< pair<utime_t,utime_t> > ls; + for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) + ls.push_back(make_pair(p->begin, p->end)); + if (hit_set) + ls.push_back(make_pair(hit_set_start_stamp, utime_t())); + encode(ls, osd_op.outdata); + } + break; + + case CEPH_OSD_OP_PG_HITSET_GET: + { + utime_t stamp(osd_op.op.hit_set_get.stamp); + if (hit_set_start_stamp && stamp >= hit_set_start_stamp) { + // read the current in-memory HitSet, not the version we've + // checkpointed. + if (!hit_set) { + result= -ENOENT; + break; + } + encode(*hit_set, osd_op.outdata); + result = osd_op.outdata.length(); + } else { + // read an archived HitSet. + hobject_t oid; + for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + if (stamp >= p->begin && stamp <= p->end) { + oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + break; + } + } + if (oid == hobject_t()) { + result = -ENOENT; + break; + } + if (!pool.info.is_replicated()) { + // FIXME: EC not supported yet + result = -EOPNOTSUPP; + break; + } + if (is_unreadable_object(oid)) { + wait_for_unreadable_object(oid, op); + delete filter; + return; + } + result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata); + } + } + break; + + case CEPH_OSD_OP_SCRUBLS: + result = do_scrub_ls(m, &osd_op); + break; + + default: + result = -EINVAL; + break; + } + + if (result < 0) + break; + } + + // reply + MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + false); + reply->claim_op_out_data(ops); + reply->set_result(result); + reply->set_reply_versions(info.last_update, info.last_user_version); + osd->send_message_osd_client(reply, m->get_connection()); + delete filter; +} + +int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op) +{ + if (m->get_pg() != info.pgid.pgid) { + dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl; + return -EINVAL; // hmm? 
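    // Editor's note: unlike the PGLS/PGNLS handling in do_pg_op() above, a
    // scrub-ls aimed at the wrong PG is rejected outright rather than being
    // answered with an empty result.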
+ } + auto bp = osd_op->indata.cbegin(); + scrub_ls_arg_t arg; + try { + arg.decode(bp); + } catch (buffer::error&) { + dout(10) << " corrupted scrub_ls_arg_t" << dendl; + return -EINVAL; + } + int r = 0; + scrub_ls_result_t result = {.interval = info.history.same_interval_since}; + if (arg.interval != 0 && arg.interval != info.history.same_interval_since) { + r = -EAGAIN; + } else if (!scrubber.store) { + r = -ENOENT; + } else if (arg.get_snapsets) { + result.vals = scrubber.store->get_snap_errors(osd->store, + get_pgid().pool(), + arg.start_after, + arg.max_return); + } else { + result.vals = scrubber.store->get_object_errors(osd->store, + get_pgid().pool(), + arg.start_after, + arg.max_return); + } + encode(result, osd_op->outdata); + return r; +} + +void PrimaryLogPG::calc_trim_to() +{ + size_t target = cct->_conf->osd_min_pg_log_entries; + if (is_degraded() || + state_test(PG_STATE_RECOVERING | + PG_STATE_RECOVERY_WAIT | + PG_STATE_BACKFILLING | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILL_TOOFULL)) { + target = cct->_conf->osd_max_pg_log_entries; + } + + eversion_t limit = std::min( + min_last_complete_ondisk, + pg_log.get_can_rollback_to()); + if (limit != eversion_t() && + limit != pg_trim_to && + pg_log.get_log().approx_size() > target) { + size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target, + cct->_conf->osd_pg_log_trim_max); + if (num_to_trim < cct->_conf->osd_pg_log_trim_min && + cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) { + return; + } + list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin(); + eversion_t new_trim_to; + for (size_t i = 0; i < num_to_trim; ++i) { + new_trim_to = it->version; + ++it; + if (new_trim_to > limit) { + new_trim_to = limit; + dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl; + break; + } + } + dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl; + pg_trim_to = new_trim_to; + assert(pg_trim_to <= pg_log.get_head()); + assert(pg_trim_to <= min_last_complete_ondisk); + } +} + +void PrimaryLogPG::calc_trim_to_aggressive() +{ + size_t target = cct->_conf->osd_min_pg_log_entries; + if (is_degraded() || + state_test(PG_STATE_RECOVERING | + PG_STATE_RECOVERY_WAIT | + PG_STATE_BACKFILLING | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILL_TOOFULL)) { + target = cct->_conf->osd_max_pg_log_entries; + } + // limit pg log trimming up to the can_rollback_to value + eversion_t limit = std::min({ + pg_log.get_head(), + pg_log.get_can_rollback_to(), + last_update_ondisk}); + dout(10) << __func__ << " limit = " << limit << dendl; + + if (limit != eversion_t() && + limit != pg_trim_to && + pg_log.get_log().approx_size() > target) { + dout(10) << __func__ << " approx pg log length = " + << pg_log.get_log().approx_size() << dendl; + uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target, + cct->_conf->osd_pg_log_trim_max); + dout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl; + if (num_to_trim < cct->_conf->osd_pg_log_trim_min && + cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) { + return; + } + auto it = pg_log.get_log().log.begin(); // oldest log entry + auto rit = pg_log.get_log().log.rbegin(); + eversion_t by_n_to_keep; // start from tail + eversion_t by_n_to_trim = eversion_t::max(); // start from head + for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) { + i++; + if (i > target && by_n_to_keep == eversion_t()) { + by_n_to_keep = rit->version; + } + if (i >= num_to_trim && 
by_n_to_trim == eversion_t::max()) { + by_n_to_trim = it->version; + } + if (by_n_to_keep != eversion_t() && + by_n_to_trim != eversion_t::max()) { + break; + } + } + + if (by_n_to_keep == eversion_t()) { + return; + } + + pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit}); + dout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl; + ceph_assert(pg_trim_to <= pg_log.get_head()); + } +} + +PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap, + const PGPool &_pool, + const map<string,string>& ec_profile, spg_t p) : + PG(o, curmap, _pool, p), + pgbackend( + PGBackend::build_pg_backend( + _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)), + object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count), + snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"), + new_backfill(false), + temp_seq(0), + snap_trimmer_machine(this) +{ + missing_loc.set_backend_predicates( + pgbackend->get_is_readable_predicate(), + pgbackend->get_is_recoverable_predicate()); + snap_trimmer_machine.initiate(); +} + +void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc) +{ + src_oloc = oloc; + if (oloc.key.empty()) + src_oloc.key = oid.name; +} + +void PrimaryLogPG::handle_backoff(OpRequestRef& op) +{ + const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req()); + SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())}; + if (!session) + return; // drop it. + hobject_t begin = info.pgid.pgid.get_hobj_start(); + hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + if (begin < m->begin) { + begin = m->begin; + } + if (end > m->end) { + end = m->end; + } + dout(10) << __func__ << " backoff ack id " << m->id + << " [" << begin << "," << end << ")" << dendl; + session->ack_backoff(cct, m->pgid, m->id, begin, end); +} + +void PrimaryLogPG::do_request( + OpRequestRef& op, + ThreadPool::TPHandle &handle) +{ + if (op->osd_trace) { + op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace); + op->pg_trace.event("do request"); + } + // make sure we have a new enough map + auto p = waiting_for_map.find(op->get_source()); + if (p != waiting_for_map.end()) { + // preserve ordering + dout(20) << __func__ << " waiting_for_map " + << p->first << " not empty, queueing" << dendl; + p->second.push_back(op); + op->mark_delayed("waiting_for_map not empty"); + return; + } + if (!have_same_or_newer_map(op->min_epoch)) { + dout(20) << __func__ << " min " << op->min_epoch + << ", queue on waiting_for_map " << op->get_source() << dendl; + waiting_for_map[op->get_source()].push_back(op); + op->mark_delayed("op must wait for map"); + osd->request_osdmap_update(op->min_epoch); + return; + } + + if (can_discard_request(op)) { + return; + } + + // pg-wide backoffs + const Message *m = op->get_req(); + int msg_type = m->get_type(); + if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) { + SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())}; + if (!session) + return; // drop it. 
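    // Editor's note: what follows implements pg-wide backoffs. When the PG is
    // down, incomplete, or still peering, the client is told (via
    // add_pg_backoff()) to stop sending ops for this PG's whole hash range;
    // the matching acks come back through handle_backoff() above.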
+ + if (msg_type == CEPH_MSG_OSD_OP) { + if (session->check_backoff(cct, info.pgid, + info.pgid.pgid.get_hobj_start(), m)) { + return; + } + + bool backoff = + is_down() || + is_incomplete() || + (!is_active() && is_peered()); + if (g_conf()->osd_backoff_on_peering && !backoff) { + if (is_peering()) { + backoff = true; + } + } + if (backoff) { + add_pg_backoff(session); + return; + } + } + // pg backoff acks at pg-level + if (msg_type == CEPH_MSG_OSD_BACKOFF) { + const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m); + if (ba->begin != ba->end) { + handle_backoff(op); + return; + } + } + } + + if (!is_peered()) { + // Delay unless PGBackend says it's ok + if (pgbackend->can_handle_while_inactive(op)) { + bool handled = pgbackend->handle_message(op); + ceph_assert(handled); + return; + } else { + waiting_for_peered.push_back(op); + op->mark_delayed("waiting for peered"); + return; + } + } + + if (flushes_in_progress > 0) { + dout(20) << flushes_in_progress + << " flushes_in_progress pending " + << "waiting for flush on " << op << dendl; + waiting_for_flush.push_back(op); + op->mark_delayed("waiting for flush"); + return; + } + + ceph_assert(is_peered() && flushes_in_progress == 0); + if (pgbackend->handle_message(op)) + return; + + switch (msg_type) { + case CEPH_MSG_OSD_OP: + case CEPH_MSG_OSD_BACKOFF: + if (!is_active()) { + dout(20) << " peered, not active, waiting for active on " << op << dendl; + waiting_for_active.push_back(op); + op->mark_delayed("waiting for active"); + return; + } + switch (msg_type) { + case CEPH_MSG_OSD_OP: + // verify client features + if ((pool.info.has_tiers() || pool.info.is_tier()) && + !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) { + osd->reply_op_error(op, -EOPNOTSUPP); + return; + } + do_op(op); + break; + case CEPH_MSG_OSD_BACKOFF: + // object-level backoff acks handled in osdop context + handle_backoff(op); + break; + } + break; + + case MSG_OSD_PG_SCAN: + do_scan(op, handle); + break; + + case MSG_OSD_PG_BACKFILL: + do_backfill(op); + break; + + case MSG_OSD_PG_BACKFILL_REMOVE: + do_backfill_remove(op); + break; + + case MSG_OSD_SCRUB_RESERVE: + { + const MOSDScrubReserve *m = + static_cast<const MOSDScrubReserve*>(op->get_req()); + switch (m->type) { + case MOSDScrubReserve::REQUEST: + handle_scrub_reserve_request(op); + break; + case MOSDScrubReserve::GRANT: + handle_scrub_reserve_grant(op, m->from); + break; + case MOSDScrubReserve::REJECT: + handle_scrub_reserve_reject(op, m->from); + break; + case MOSDScrubReserve::RELEASE: + handle_scrub_reserve_release(op); + break; + } + } + break; + + case MSG_OSD_REP_SCRUB: + replica_scrub(op, handle); + break; + + case MSG_OSD_REP_SCRUBMAP: + do_replica_scrub_map(op); + break; + + case MSG_OSD_PG_UPDATE_LOG_MISSING: + do_update_log_missing(op); + break; + + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + do_update_log_missing_reply(op); + break; + + default: + ceph_abort_msg("bad message type in do_request"); + } +} + +hobject_t PrimaryLogPG::earliest_backfill() const +{ + hobject_t e = hobject_t::get_max(); + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt); + ceph_assert(iter != peer_info.end()); + if (iter->second.last_backfill < e) + e = iter->second.last_backfill; + } + return e; +} + +/** do_op - do an op + * pg lock will be held (if multithreaded) + * osd_lock NOT held. 
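 * Pg-op requests (PGLS and friends) are diverted to do_pg_op(); everything
 * else is validated here and wrapped in an OpContext for execute_ctx().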
+ */ +void PrimaryLogPG::do_op(OpRequestRef& op) +{ + FUNCTRACE(cct); + // NOTE: take a non-const pointer here; we must be careful not to + // change anything that will break other reads on m (operator<<). + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + if (m->finish_decode()) { + op->reset_desc(); // for TrackedOp + m->clear_payload(); + } + + dout(20) << __func__ << ": op " << *m << dendl; + + hobject_t head = m->get_hobj(); + head.snap = CEPH_NOSNAP; + + if (!info.pgid.pgid.contains( + info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) { + derr << __func__ << " " << info.pgid.pgid << " does not contain " + << head << " pg_num " << pool.info.get_pg_num() << " hash " + << std::hex << head.get_hash() << std::dec << dendl; + osd->clog->warn() << info.pgid.pgid << " does not contain " << head + << " op " << *m; + ceph_assert(!cct->_conf->osd_debug_misdirected_ops); + return; + } + + bool can_backoff = + m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF); + SessionRef session; + if (can_backoff) { + session = static_cast<Session*>(m->get_connection()->get_priv().get()); + if (!session.get()) { + dout(10) << __func__ << " no session" << dendl; + return; + } + + if (session->check_backoff(cct, info.pgid, head, m)) { + return; + } + } + + if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) { + // not implemented. + dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + if (op->rmw_flags == 0) { + int r = osd->osd->init_op_flags(op); + if (r) { + osd->reply_op_error(op, r); + return; + } + } + + if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + op->may_read() && + !(op->may_write() || op->may_cache())) { + // balanced reads; any replica will do + if (!(is_primary() || is_replica())) { + osd->handle_misdirected_op(this, op); + return; + } + } else { + // normal case; must be primary + if (!is_primary()) { + osd->handle_misdirected_op(this, op); + return; + } + } + + if (!op_has_sufficient_caps(op)) { + osd->reply_op_error(op, -EPERM); + return; + } + + if (op->includes_pg_op()) { + return do_pg_op(op); + } + + // object name too long? + if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) { + dout(4) << "do_op name is longer than " + << cct->_conf->osd_max_object_name_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) { + dout(4) << "do_op locator is longer than " + << cct->_conf->osd_max_object_name_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) { + dout(4) << "do_op namespace is longer than " + << cct->_conf->osd_max_object_namespace_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().oid.name.empty()) { + dout(4) << "do_op empty oid name is not allowed" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + if (int r = osd->store->validate_hobject_key(head)) { + dout(4) << "do_op object " << head << " invalid for backing store: " + << r << dendl; + osd->reply_op_error(op, r); + return; + } + + // blacklisted? 
+ if (get_osdmap()->is_blacklisted(m->get_source_addr())) { + dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl; + osd->reply_op_error(op, -EBLACKLISTED); + return; + } + + // order this op as a write? + bool write_ordered = op->rwordered(); + + // discard due to cluster full transition? (we discard any op that + // originates before the cluster or pool is marked full; the client + // will resend after the full flag is removed or if they expect the + // op to succeed despite being full). The except is FULL_FORCE and + // FULL_TRY ops, which there is no reason to discard because they + // bypass all full checks anyway. If this op isn't write or + // read-ordered, we skip. + // FIXME: we exclude mds writes for now. + if (write_ordered && !(m->get_source().is_mds() || + m->has_flag(CEPH_OSD_FLAG_FULL_TRY) || + m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) && + info.history.last_epoch_marked_full > m->get_map_epoch()) { + dout(10) << __func__ << " discarding op sent before full " << m << " " + << *m << dendl; + return; + } + // mds should have stopped writing before this point. + // We can't allow OSD to become non-startable even if mds + // could be writing as part of file removals. + if (write_ordered && osd->check_failsafe_full(get_dpp()) && + !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl; + return; + } + int64_t poolid = get_pgid().pool(); + if (op->may_write()) { + + const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid); + if (!pi) { + return; + } + + // invalid? + if (m->get_snapid() != CEPH_NOSNAP) { + dout(20) << __func__ << ": write to clone not valid " << *m << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + // too big? + if (cct->_conf->osd_max_write_size && + m->get_data_len() > cct->_conf->osd_max_write_size << 20) { + // journal can't hold commit! + derr << "do_op msg data len " << m->get_data_len() + << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20) + << " on " << *m << dendl; + osd->reply_op_error(op, -OSD_WRITETOOBIG); + return; + } + } + + dout(10) << "do_op " << *m + << (op->may_write() ? " may_write" : "") + << (op->may_read() ? " may_read" : "") + << (op->may_cache() ? " may_cache" : "") + << " -> " << (write_ordered ? "write-ordered" : "read-ordered") + << " flags " << ceph_osd_flag_string(m->get_flags()) + << dendl; + + // missing object? + if (is_unreadable_object(head)) { + if (!is_primary()) { + osd->reply_op_error(op, -EAGAIN); + return; + } + if (can_backoff && + (g_conf()->osd_backoff_on_degraded || + (g_conf()->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) { + add_backoff(session, head, head); + maybe_kick_recovery(head); + } else { + wait_for_unreadable_object(head, op); + } + return; + } + + if (write_ordered) { + // degraded object? + if (is_degraded_or_backfilling_object(head)) { + if (can_backoff && g_conf()->osd_backoff_on_degraded) { + add_backoff(session, head, head); + maybe_kick_recovery(head); + } else { + wait_for_degraded_object(head, op); + } + return; + } + + if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) { + dout(20) << __func__ << ": waiting for scrub" << dendl; + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + return; + } + + // blocked on snap? 
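      // Editor's note: a write to the head object may also have to wait for a
      // clone that is still degraded or being promoted; those objects were
      // registered earlier via block_write_on_degraded_snap() and
      // block_write_on_snap_rollback().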
+ if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head); + blocked_iter != std::end(objects_blocked_on_degraded_snap)) { + hobject_t to_wait_on(head); + to_wait_on.snap = blocked_iter->second; + wait_for_degraded_object(to_wait_on, op); + return; + } + if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head); + blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) { + wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op); + return; + } + if (objects_blocked_on_cache_full.count(head)) { + block_write_on_full_cache(head, op); + return; + } + } + + // dup/resent? + if (op->may_write() || op->may_cache()) { + // warning: we will get back *a* request for this reqid, but not + // necessarily the most recent. this happens with flush and + // promote ops, but we can't possible have both in our log where + // the original request is still not stable on disk, so for our + // purposes here it doesn't matter which one we get. + eversion_t version; + version_t user_version; + int return_code = 0; + bool got = check_in_progress_op( + m->get_reqid(), &version, &user_version, &return_code); + if (got) { + dout(3) << __func__ << " dup " << m->get_reqid() + << " version " << version << dendl; + if (already_complete(version)) { + osd->reply_op_error(op, return_code, version, user_version); + } else { + dout(10) << " waiting for " << version << " to commit" << dendl; + // always queue ondisk waiters, so that we can requeue if needed + waiting_for_ondisk[version].emplace_back(op, user_version, return_code); + op->mark_delayed("waiting for ondisk"); + } + return; + } + } + + ObjectContextRef obc; + bool can_create = op->may_write(); + hobject_t missing_oid; + + // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS + hobject_t _oid_head; + if (m->get_snapid() == CEPH_SNAPDIR) { + _oid_head = m->get_hobj().get_head(); + } + const hobject_t& oid = + m->get_snapid() == CEPH_SNAPDIR ? _oid_head : m->get_hobj(); + + // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else + for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) { + OSDOp& osd_op = *p; + + if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) { + if (m->get_snapid() != CEPH_SNAPDIR) { + dout(10) << "LIST_SNAPS with incorrect context" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + } else { + if (m->get_snapid() == CEPH_SNAPDIR) { + dout(10) << "non-LIST_SNAPS on snapdir" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + } + } + + // io blocked on obc? + if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) && + maybe_await_blocked_head(oid, op)) { + return; + } + + int r = find_object_context( + oid, &obc, can_create, + m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE), + &missing_oid); + + // LIST_SNAPS needs the ssc too + if (obc && + m->get_snapid() == CEPH_SNAPDIR && + !obc->ssc) { + obc->ssc = get_snapset_context(oid, true); + } + + if (r == -EAGAIN) { + // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise, + // we have to wait for the object. + if (is_primary()) { + // missing the specific snap we need; requeue and wait. + ceph_assert(!op->may_write()); // only happens on a read/cache + wait_for_unreadable_object(missing_oid, op); + return; + } + } else if (r == 0) { + if (is_unreadable_object(obc->obs.oi.soid)) { + dout(10) << __func__ << ": clone " << obc->obs.oi.soid + << " is unreadable, waiting" << dendl; + wait_for_unreadable_object(obc->obs.oi.soid, op); + return; + } + + // degraded object? 
(the check above was for head; this could be a clone) + if (write_ordered && + obc->obs.oi.soid.snap != CEPH_NOSNAP && + is_degraded_or_backfilling_object(obc->obs.oi.soid)) { + dout(10) << __func__ << ": clone " << obc->obs.oi.soid + << " is degraded, waiting" << dendl; + wait_for_degraded_object(obc->obs.oi.soid, op); + return; + } + } + + bool in_hit_set = false; + if (hit_set) { + if (obc.get()) { + if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid)) + in_hit_set = true; + } else { + if (missing_oid != hobject_t() && hit_set->contains(missing_oid)) + in_hit_set = true; + } + if (!op->hitset_inserted) { + hit_set->insert(oid); + op->hitset_inserted = true; + if (hit_set->is_full() || + hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) { + hit_set_persist(); + } + } + } + + if (agent_state) { + if (agent_choose_mode(false, op)) + return; + } + + if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) { + if (maybe_handle_manifest(op, + write_ordered, + obc)) + return; + } + + if (maybe_handle_cache(op, + write_ordered, + obc, + r, + missing_oid, + false, + in_hit_set)) + return; + + if (r && (r != -ENOENT || !obc)) { + // copy the reqids for copy get on ENOENT + if (r == -ENOENT && + (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) { + fill_in_copy_get_noent(op, oid, m->ops[0]); + return; + } + dout(20) << __func__ << ": find_object_context got error " << r << dendl; + if (op->may_write() && + get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) { + record_write_error(op, oid, nullptr, r); + } else { + osd->reply_op_error(op, r); + } + return; + } + + // make sure locator is consistent + object_locator_t oloc(obc->obs.oi.soid); + if (m->get_object_locator() != oloc) { + dout(10) << " provided locator " << m->get_object_locator() + << " != object's " << obc->obs.oi.soid << dendl; + osd->clog->warn() << "bad locator " << m->get_object_locator() + << " on object " << oloc + << " op " << *m; + } + + // io blocked on obc? + if (obc->is_blocked() && + !m->has_flag(CEPH_OSD_FLAG_FLUSH)) { + wait_for_blocked_object(obc->obs.oi.soid, op); + return; + } + + dout(25) << __func__ << " oi " << obc->obs.oi << dendl; + + OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this); + + if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) { + dout(20) << __func__ << ": skipping rw locks" << dendl; + } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) { + dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl; + + // verify there is in fact a flush in progress + // FIXME: we could make this a stronger test. + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush in progress, aborting" << dendl; + reply_ctx(ctx, -EINVAL); + return; + } + } else if (!get_rw_locks(write_ordered, ctx)) { + dout(20) << __func__ << " waiting for rw locks " << dendl; + op->mark_delayed("waiting for rw locks"); + close_op_ctx(ctx); + return; + } + dout(20) << __func__ << " obc " << *obc << dendl; + + if (r) { + dout(20) << __func__ << " returned an error: " << r << dendl; + close_op_ctx(ctx); + if (op->may_write() && + get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) { + record_write_error(op, oid, nullptr, r); + } else { + osd->reply_op_error(op, r); + } + return; + } + + if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) { + ctx->ignore_cache = true; + } + + if ((op->may_read()) && (obc->obs.oi.is_lost())) { + // This object is lost. Reading from it returns an error. 
+ dout(20) << __func__ << ": object " << obc->obs.oi.soid + << " is lost" << dendl; + reply_ctx(ctx, -ENFILE); + return; + } + if (!op->may_write() && + !op->may_cache() && + (!obc->obs.exists || + ((m->get_snapid() != CEPH_SNAPDIR) && + obc->obs.oi.is_whiteout()))) { + // copy the reqids for copy get on ENOENT + if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) { + fill_in_copy_get_noent(op, oid, m->ops[0]); + close_op_ctx(ctx); + return; + } + reply_ctx(ctx, -ENOENT); + return; + } + + op->mark_started(); + + execute_ctx(ctx); + utime_t prepare_latency = ceph_clock_now(); + prepare_latency -= op->get_dequeued_time(); + osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency); + if (op->may_read() && op->may_write()) { + osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency); + } else if (op->may_read()) { + osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency); + } else if (op->may_write() || op->may_cache()) { + osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency); + } + + // force recovery of the oldest missing object if too many logs + maybe_force_recovery(); +} + +PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail( + OpRequestRef op, + bool write_ordered, + ObjectContextRef obc) +{ + ceph_assert(obc); + if (static_cast<const MOSDOp *>(op->get_req())->get_flags() & + CEPH_OSD_FLAG_IGNORE_REDIRECT) { + dout(20) << __func__ << ": ignoring redirect due to flag" << dendl; + return cache_result_t::NOOP; + } + + // if it is write-ordered and blocked, stop now + if (obc->is_blocked() && write_ordered) { + // we're already doing something with this object + dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl; + return cache_result_t::NOOP; + } + + vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops; + for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) { + OSDOp& osd_op = *p; + ceph_osd_op& op = osd_op.op; + if (op.op == CEPH_OSD_OP_SET_REDIRECT || + op.op == CEPH_OSD_OP_SET_CHUNK || + op.op == CEPH_OSD_OP_TIER_PROMOTE || + op.op == CEPH_OSD_OP_UNSET_MANIFEST) { + return cache_result_t::NOOP; + } + } + + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + if (op->may_write() || write_ordered) { + do_proxy_write(op, obc); + } else { + // promoted object + if (obc->obs.oi.size != 0) { + return cache_result_t::NOOP; + } + do_proxy_read(op, obc); + } + return cache_result_t::HANDLED_PROXY; + case object_manifest_t::TYPE_CHUNKED: + { + if (can_proxy_chunked_read(op, obc)) { + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid); + if (p != flush_ops.end()) { + do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true); + return cache_result_t::HANDLED_PROXY; + } + do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered); + return cache_result_t::HANDLED_PROXY; + } + + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + hobject_t head = m->get_hobj(); + + if (is_degraded_or_backfilling_object(head)) { + dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl; + wait_for_degraded_object(head, op); + return cache_result_t::BLOCKED_RECOVERY; + } + + if (write_blocked_by_scrub(head)) { + dout(20) << __func__ << ": waiting for scrub" << dendl; + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + return cache_result_t::BLOCKED_RECOVERY; + } + + for (auto& p : obc->obs.oi.manifest.chunk_map) { + if (p.second.is_missing()) { + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); 
+ const object_locator_t oloc = m->get_object_locator(); + promote_object(obc, obc->obs.oi.soid, oloc, op, NULL); + return cache_result_t::BLOCKED_PROMOTE; + } + } + + bool all_dirty = true; + for (auto& p : obc->obs.oi.manifest.chunk_map) { + if (!p.second.is_dirty()) { + all_dirty = false; + } + } + if (all_dirty) { + start_flush(OpRequestRef(), obc, true, NULL, boost::none); + } + return cache_result_t::NOOP; + } + default: + ceph_abort_msg("unrecognized manifest type"); + } + + return cache_result_t::NOOP; +} + +struct C_ManifestFlush : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t lpr; + ceph_tid_t tid; + utime_t start; + uint64_t offset; + uint64_t last_offset; + C_ManifestFlush(PrimaryLogPG *p, hobject_t o, epoch_t e) + : pg(p), oid(o), lpr(e), + tid(0), start(ceph_clock_now()) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + pg->lock(); + pg->handle_manifest_flush(oid, tid, r, offset, last_offset, lpr); + pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start); + pg->unlock(); + } +}; + +void PrimaryLogPG::handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r, + uint64_t offset, uint64_t last_offset, + epoch_t lpr) +{ + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush_op found" << dendl; + return; + } + if (p->second->rval < 0) { + return; + } + p->second->io_results[offset] = r; + for (auto &ior: p->second->io_results) { + if (ior.second < 0) { + finish_manifest_flush(oid, tid, r, p->second->obc, last_offset); + p->second->rval = r; + return; + } + } + if (p->second->chunks == p->second->io_results.size()) { + if (lpr == get_last_peering_reset()) { + ceph_assert(p->second->obc); + finish_manifest_flush(oid, tid, r, p->second->obc, last_offset); + } + } +} + +int PrimaryLogPG::start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking, + boost::optional<std::function<void()>> &&on_flush) +{ + auto p = obc->obs.oi.manifest.chunk_map.begin(); + FlushOpRef manifest_fop(std::make_shared<FlushOp>()); + manifest_fop->op = op; + manifest_fop->obc = obc; + manifest_fop->flushed_version = obc->obs.oi.user_version; + manifest_fop->blocking = blocking; + manifest_fop->on_flush = std::move(on_flush); + int r = do_manifest_flush(op, obc, manifest_fop, p->first, blocking); + if (r < 0) { + return r; + } + + flush_ops[obc->obs.oi.soid] = manifest_fop; + return -EINPROGRESS; +} + +int PrimaryLogPG::do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop, + uint64_t start_offset, bool block) +{ + struct object_manifest_t &manifest = obc->obs.oi.manifest; + hobject_t soid = obc->obs.oi.soid; + ceph_tid_t tid; + SnapContext snapc; + uint64_t max_copy_size = 0, last_offset = 0; + + map<uint64_t, chunk_info_t>::iterator iter = manifest.chunk_map.find(start_offset); + ceph_assert(iter != manifest.chunk_map.end()); + for (;iter != manifest.chunk_map.end(); ++iter) { + if (iter->second.is_dirty()) { + last_offset = iter->first; + max_copy_size += iter->second.length; + } + if (get_copy_chunk_size() < max_copy_size) { + break; + } + } + + iter = manifest.chunk_map.find(start_offset); + for (;iter != manifest.chunk_map.end(); ++iter) { + if (!iter->second.is_dirty()) { + continue; + } + uint64_t tgt_length = iter->second.length; + uint64_t tgt_offset= iter->second.offset; + hobject_t tgt_soid = iter->second.oid; + object_locator_t oloc(tgt_soid); + ObjectOperation obj_op; + bufferlist chunk_data; + int r = 
pgbackend->objects_read_sync( + soid, iter->first, tgt_length, 0, &chunk_data); + if (r < 0) { + dout(0) << __func__ << " read fail " << " offset: " << tgt_offset + << " len: " << tgt_length << " r: " << r << dendl; + return r; + } + if (!chunk_data.length()) { + return -ENODATA; + } + + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_RWORDERED; + tgt_length = chunk_data.length(); + pg_pool_t::fingerprint_t fp_algo_t = pool.info.get_fingerprint_type(); + if (iter->second.has_reference() && + fp_algo_t != pg_pool_t::TYPE_FINGERPRINT_NONE) { + switch (fp_algo_t) { + case pg_pool_t::TYPE_FINGERPRINT_SHA1: + { + sha1_digest_t sha1r = chunk_data.sha1(); + object_t fp_oid = sha1r.to_str(); + bufferlist in; + if (fp_oid != tgt_soid.oid) { + // decrement old chunk's reference count + ObjectOperation dec_op; + cls_chunk_refcount_put_op put_call; + ::encode(put_call, in); + dec_op.call("refcount", "chunk_put", in); + // we don't care dec_op's completion. scrub for dedup will fix this. + tid = osd->objecter->mutate( + tgt_soid.oid, oloc, dec_op, snapc, + ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime), + flags, NULL); + in.clear(); + } + tgt_soid.oid = fp_oid; + iter->second.oid = tgt_soid; + // add data op + ceph_osd_op osd_op; + osd_op.extent.offset = 0; + osd_op.extent.length = chunk_data.length(); + encode(osd_op, in); + encode(soid, in); + in.append(chunk_data); + obj_op.call("cas", "cas_write_or_get", in); + break; + } + default: + assert(0 == "unrecognized fingerprint type"); + break; + } + } else { + obj_op.add_data(CEPH_OSD_OP_WRITE, tgt_offset, tgt_length, chunk_data); + } + + C_ManifestFlush *fin = new C_ManifestFlush(this, soid, get_last_peering_reset()); + fin->offset = iter->first; + fin->last_offset = last_offset; + manifest_fop->chunks++; + + unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers); + tid = osd->objecter->mutate( + tgt_soid.oid, oloc, obj_op, snapc, + ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime), + flags, new C_OnFinisher(fin, osd->objecter_finishers[n])); + fin->tid = tid; + manifest_fop->io_tids[iter->first] = tid; + + dout(20) << __func__ << " offset: " << tgt_offset << " len: " << tgt_length + << " oid: " << tgt_soid.oid << " ori oid: " << soid.oid.name + << " tid: " << tid << dendl; + if (last_offset < iter->first) { + break; + } + } + + return 0; +} + +void PrimaryLogPG::finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r, + ObjectContextRef obc, uint64_t last_offset) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << " last_offset: " << last_offset << dendl; + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush_op found" << dendl; + return; + } + map<uint64_t, chunk_info_t>::iterator iter = + obc->obs.oi.manifest.chunk_map.find(last_offset); + ceph_assert(iter != obc->obs.oi.manifest.chunk_map.end()); + for (;iter != obc->obs.oi.manifest.chunk_map.end(); ++iter) { + if (iter->second.is_dirty() && last_offset < iter->first) { + do_manifest_flush(p->second->op, obc, p->second, iter->first, p->second->blocking); + return; + } + } + finish_flush(oid, tid, r); +} + +void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid, + MOSDOpReply *orig_reply, int r) +{ + dout(20) << __func__ << " r=" << r << dendl; + ceph_assert(op->may_write()); + const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid(); + 
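  // Editor's note: the error is recorded as an ERROR entry in the pg log
  // under the op's reqid, so that a client retry of the same write can be
  // recognized as a duplicate (see the check_in_progress_op() path in do_op)
  // and receive the same error once the log entry has committed.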
mempool::osd_pglog::list<pg_log_entry_t> entries; + entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid, + get_next_version(), eversion_t(), 0, + reqid, utime_t(), r)); + + struct OnComplete { + PrimaryLogPG *pg; + OpRequestRef op; + boost::intrusive_ptr<MOSDOpReply> orig_reply; + int r; + OnComplete( + PrimaryLogPG *pg, + OpRequestRef op, + MOSDOpReply *orig_reply, + int r) + : pg(pg), op(op), + orig_reply(orig_reply, false /* take over ref */), r(r) + {} + void operator()() { + ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl; + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + MOSDOpReply *reply = orig_reply.detach(); + if (reply == nullptr) { + reply = new MOSDOpReply(m, r, pg->get_osdmap_epoch(), + flags, true); + } + ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl; + pg->osd->send_message_osd_client(reply, m->get_connection()); + } + }; + + ObcLockManager lock_manager; + submit_log_entries( + entries, + std::move(lock_manager), + boost::optional<std::function<void(void)> >( + OnComplete(this, op, orig_reply, r)), + op, + r); +} + +PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail( + OpRequestRef op, + bool write_ordered, + ObjectContextRef obc, + int r, hobject_t missing_oid, + bool must_promote, + bool in_hit_set, + ObjectContextRef *promote_obc) +{ + // return quickly if caching is not enabled + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) + return cache_result_t::NOOP; + + if (op && + op->get_req() && + op->get_req()->get_type() == CEPH_MSG_OSD_OP && + (static_cast<const MOSDOp *>(op->get_req())->get_flags() & + CEPH_OSD_FLAG_IGNORE_CACHE)) { + dout(20) << __func__ << ": ignoring cache due to flag" << dendl; + return cache_result_t::NOOP; + } + + must_promote = must_promote || op->need_promote(); + + if (obc) + dout(25) << __func__ << " " << obc->obs.oi << " " + << (obc->obs.exists ? 
"exists" : "DNE") + << " missing_oid " << missing_oid + << " must_promote " << (int)must_promote + << " in_hit_set " << (int)in_hit_set + << dendl; + else + dout(25) << __func__ << " (no obc)" + << " missing_oid " << missing_oid + << " must_promote " << (int)must_promote + << " in_hit_set " << (int)in_hit_set + << dendl; + + // if it is write-ordered and blocked, stop now + if (obc.get() && obc->is_blocked() && write_ordered) { + // we're already doing something with this object + dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl; + return cache_result_t::NOOP; + } + + if (r == -ENOENT && missing_oid == hobject_t()) { + // we know this object is logically absent (e.g., an undefined clone) + return cache_result_t::NOOP; + } + + if (obc.get() && obc->obs.exists) { + osd->logger->inc(l_osd_op_cache_hit); + return cache_result_t::NOOP; + } + if (!is_primary()) { + dout(20) << __func__ << " cache miss; ask the primary" << dendl; + osd->reply_op_error(op, -EAGAIN); + return cache_result_t::REPLIED_WITH_EAGAIN; + } + + if (missing_oid == hobject_t() && obc.get()) { + missing_oid = obc->obs.oi.soid; + } + + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + const object_locator_t oloc = m->get_object_locator(); + + if (op->need_skip_handle_cache()) { + return cache_result_t::NOOP; + } + + OpRequestRef promote_op; + + switch (pool.info.cache_mode) { + case pg_pool_t::CACHEMODE_WRITEBACK: + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + if (!op->may_write() && !op->may_cache() && + !write_ordered && !must_promote) { + dout(20) << __func__ << " cache pool full, proxying read" << dendl; + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + } + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + + if (must_promote || (!hit_set && !op->need_skip_promote())) { + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + + if (op->may_write() || op->may_cache()) { + do_proxy_write(op); + + // Promote too? + if (!op->need_skip_promote() && + maybe_promote(obc, missing_oid, oloc, in_hit_set, + pool.info.min_write_recency_for_promote, + OpRequestRef(), + promote_obc)) { + return cache_result_t::BLOCKED_PROMOTE; + } + return cache_result_t::HANDLED_PROXY; + } else { + do_proxy_read(op); + + // Avoid duplicate promotion + if (obc.get() && obc->is_blocked()) { + if (promote_obc) + *promote_obc = obc; + return cache_result_t::BLOCKED_PROMOTE; + } + + // Promote too? + if (!op->need_skip_promote()) { + (void)maybe_promote(obc, missing_oid, oloc, in_hit_set, + pool.info.min_read_recency_for_promote, + promote_op, promote_obc); + } + + return cache_result_t::HANDLED_PROXY; + } + ceph_abort_msg("unreachable"); + return cache_result_t::NOOP; + + case pg_pool_t::CACHEMODE_FORWARD: + // FIXME: this mode allows requests to be reordered. 
+ do_cache_redirect(op); + return cache_result_t::HANDLED_REDIRECT; + + case pg_pool_t::CACHEMODE_READONLY: + // TODO: clean this case up + if (!obc.get() && r == -ENOENT) { + // we don't have the object and op's a read + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + if (!r) { // it must be a write + do_cache_redirect(op); + return cache_result_t::HANDLED_REDIRECT; + } + // crap, there was a failure of some kind + return cache_result_t::NOOP; + + case pg_pool_t::CACHEMODE_READFORWARD: + // Do writeback to the cache tier for writes + if (op->may_write() || write_ordered || must_promote) { + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + + // If it is a read, we can read, we need to forward it + do_cache_redirect(op); + return cache_result_t::HANDLED_REDIRECT; + + case pg_pool_t::CACHEMODE_PROXY: + if (!must_promote) { + if (op->may_write() || op->may_cache() || write_ordered) { + do_proxy_write(op); + return cache_result_t::HANDLED_PROXY; + } else { + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + } + } + // ugh, we're forced to promote. + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + + case pg_pool_t::CACHEMODE_READPROXY: + // Do writeback to the cache tier for writes + if (op->may_write() || write_ordered || must_promote) { + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + + // If it is a read, we can read, we need to proxy it + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + + default: + ceph_abort_msg("unrecognized cache_mode"); + } + return cache_result_t::NOOP; +} + +bool PrimaryLogPG::maybe_promote(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + bool in_hit_set, + uint32_t recency, + OpRequestRef promote_op, + ObjectContextRef *promote_obc) +{ + dout(20) << __func__ << " missing_oid " << missing_oid + << " in_hit_set " << in_hit_set << dendl; + + switch (recency) { + case 0: + break; + case 1: + // Check if in the current hit set + if (in_hit_set) { + break; + } else { + // not promoting + return false; + } + break; + default: + { + unsigned count = (int)in_hit_set; + if (count) { + // Check if in other hit sets + const hobject_t& oid = obc.get() ? 
obc->obs.oi.soid : missing_oid; + for (map<time_t,HitSetRef>::reverse_iterator itor = + agent_state->hit_set_map.rbegin(); + itor != agent_state->hit_set_map.rend(); + ++itor) { + if (!itor->second->contains(oid)) { + break; + } + ++count; + if (count >= recency) { + break; + } + } + } + if (count >= recency) { + break; + } + return false; // not promoting + } + break; + } + + if (osd->promote_throttle()) { + dout(10) << __func__ << " promote throttled" << dendl; + return false; + } + promote_object(obc, missing_oid, oloc, promote_op, promote_obc); + return true; +} + +void PrimaryLogPG::do_cache_redirect(OpRequestRef op) +{ + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); + MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(), + flags, false); + request_redirect_t redir(m->get_object_locator(), pool.info.tier_of); + reply->set_redirect(redir); + dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op " + << op << dendl; + m->get_connection()->send_message(reply); + return; +} + +struct C_ProxyRead : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyReadOpRef prdop; + utime_t start; + C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyReadOpRef& prd) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), prdop(prd), start(ceph_clock_now()) + {} + void finish(int r) override { + if (prdop->canceled) + return; + pg->lock(); + if (prdop->canceled) { + pg->unlock(); + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_proxy_read(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start); + } + pg->unlock(); + } +}; + +struct C_ProxyChunkRead : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyReadOpRef prdop; + utime_t start; + ObjectOperation *obj_op; + int op_index = 0; + uint64_t req_offset = 0; + ObjectContextRef obc; + uint64_t req_total_len = 0; + C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyReadOpRef& prd) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL) + {} + void finish(int r) override { + if (prdop->canceled) + return; + pg->lock(); + if (prdop->canceled) { + pg->unlock(); + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + if (r >= 0) { + if (!prdop->ops[op_index].outdata.length()) { + ceph_assert(req_total_len); + bufferlist list; + bufferptr bptr(req_total_len); + list.push_back(std::move(bptr)); + prdop->ops[op_index].outdata.append(list); + } + ceph_assert(obj_op); + uint64_t copy_offset; + if (req_offset >= prdop->ops[op_index].op.extent.offset) { + copy_offset = req_offset - prdop->ops[op_index].op.extent.offset; + } else { + copy_offset = 0; + } + prdop->ops[op_index].outdata.copy_in(copy_offset, obj_op->ops[0].outdata.length(), + obj_op->ops[0].outdata.c_str()); + } + + pg->finish_proxy_read(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start); + if (obj_op) { + delete obj_op; + } + } + pg->unlock(); + } +}; + +void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc) +{ + // NOTE: non-const here because the ProxyReadOp needs mutable refs to + // stash the result in the request's OSDOp vector + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + 
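+ // Proxy the client's read to the backing object: either the manifest redirect
+ // target (extensible tier) or the same object in the base (tier_of) pool. The
+ // op is re-issued through the Objecter and parked in in_progress_proxy_ops;
+ // finish_proxy_read() (via C_ProxyRead above) replies to the client later.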
object_locator_t oloc; + hobject_t soid; + /* extensible tier */ + if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) { + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + oloc = object_locator_t(obc->obs.oi.manifest.redirect_target); + soid = obc->obs.oi.manifest.redirect_target; + break; + default: + ceph_abort_msg("unrecognized manifest type"); + } + } else { + /* proxy */ + soid = m->get_hobj(); + oloc = object_locator_t(m->get_object_locator()); + oloc.pool = pool.info.tier_of; + } + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + + // pass through some original flags that make sense. + // - leave out redirection and balancing flags since we are + // already proxying through the primary + // - leave off read/write/exec flags that are derived from the op + flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC | + CEPH_OSD_FLAG_MAP_SNAP_CLONE); + + dout(10) << __func__ << " Start proxy read for " << *m << dendl; + + ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops)); + + ObjectOperation obj_op; + obj_op.dup(prdop->ops); + + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK && + (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) { + for (unsigned i = 0; i < obj_op.ops.size(); i++) { + ceph_osd_op op = obj_op.ops[i].op; + switch (op.op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_CHECKSUM: + case CEPH_OSD_OP_CMPEXT: + op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) & + ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + } + } + } + + C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(), + prdop); + unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers); + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, obj_op, + m->get_snapid(), NULL, + flags, new C_OnFinisher(fin, osd->objecter_finishers[n]), + &prdop->user_version, + &prdop->data_offset, + m->get_features()); + fin->tid = tid; + prdop->objecter_tid = tid; + proxyread_ops[tid] = prdop; + in_progress_proxy_ops[soid].push_back(op); +} + +void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + + map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid); + if (p == proxyread_ops.end()) { + dout(10) << __func__ << " no proxyread_op found" << dendl; + return; + } + ProxyReadOpRef prdop = p->second; + if (tid != prdop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != prdop " << prdop + << " tid " << prdop->objecter_tid << dendl; + return; + } + if (oid != prdop->soid) { + dout(10) << __func__ << " oid " << oid << " != prdop " << prdop + << " soid " << prdop->soid << dendl; + return; + } + proxyread_ops.erase(tid); + + map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid); + if (q == in_progress_proxy_ops.end()) { + dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl; + return; + } + ceph_assert(q->second.size()); + list<OpRequestRef>::iterator it = std::find(q->second.begin(), + q->second.end(), + prdop->op); + ceph_assert(it != q->second.end()); + OpRequestRef op = *it; + q->second.erase(it); + if (q->second.size() == 0) { + in_progress_proxy_ops.erase(oid); + } else if (std::find(q->second.begin(), + q->second.end(), + prdop->op) != q->second.end()) { 
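+ // The same op can still be queued for this object (e.g. a chunked read that
+ // was split into several proxied sub-reads); defer the reply until the last
+ // outstanding instance completes.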
+ /* multiple read case */ + dout(20) << __func__ << " " << oid << " is not completed " << dendl; + return; + } + + osd->logger->inc(l_osd_tier_proxy_read); + + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this); + ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + ctx->user_at_version = prdop->user_version; + ctx->data_off = prdop->data_offset; + ctx->ignore_log_op_stats = true; + complete_read_ctx(r, ctx); +} + +void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid) +{ + map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid); + if (p == in_progress_proxy_ops.end()) + return; + + list<OpRequestRef>& ls = p->second; + dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl; + requeue_ops(ls); + in_progress_proxy_ops.erase(p); +} + +void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop, + vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << " " << prdop->soid << dendl; + prdop->canceled = true; + + // cancel objecter op, if we can + if (prdop->objecter_tid) { + tids->push_back(prdop->objecter_tid); + for (uint32_t i = 0; i < prdop->ops.size(); i++) { + prdop->ops[i].outdata.clear(); + } + proxyread_ops.erase(prdop->objecter_tid); + prdop->objecter_tid = 0; + } +} + +void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + + // cancel proxy reads + map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin(); + while (p != proxyread_ops.end()) { + cancel_proxy_read((p++)->second, tids); + } + + // cancel proxy writes + map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin(); + while (q != proxywrite_ops.end()) { + cancel_proxy_write((q++)->second, tids); + } + + if (requeue) { + map<hobject_t, list<OpRequestRef>>::iterator p = + in_progress_proxy_ops.begin(); + while (p != in_progress_proxy_ops.end()) { + list<OpRequestRef>& ls = p->second; + dout(10) << __func__ << " " << p->first << " requeuing " << ls.size() + << " requests" << dendl; + requeue_ops(ls); + in_progress_proxy_ops.erase(p++); + } + } else { + in_progress_proxy_ops.clear(); + } +} + +struct C_ProxyWrite_Commit : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyWriteOpRef pwop; + C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyWriteOpRef& pw) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), pwop(pw) + {} + void finish(int r) override { + if (pwop->canceled) + return; + pg->lock(); + if (pwop->canceled) { + pg->unlock(); + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_proxy_write(oid, tid, r); + } + pg->unlock(); + } +}; + +void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc) +{ + // NOTE: non-const because ProxyWriteOp takes a mutable ref + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + object_locator_t oloc; + SnapContext snapc(m->get_snap_seq(), m->get_snaps()); + hobject_t soid; + /* extensible tier */ + if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) { + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + oloc = object_locator_t(obc->obs.oi.manifest.redirect_target); + soid = obc->obs.oi.manifest.redirect_target; + break; + default: + ceph_abort_msg("unrecognized manifest type"); + } + } else { + /* proxy */ + soid = m->get_hobj(); + oloc = 
object_locator_t(m->get_object_locator()); + oloc.pool = pool.info.tier_of; + } + + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (!(op->may_write() || op->may_cache())) { + flags |= CEPH_OSD_FLAG_RWORDERED; + } + dout(10) << __func__ << " Start proxy write for " << *m << dendl; + + ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid())); + pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this); + pwop->mtime = m->get_mtime(); + + ObjectOperation obj_op; + obj_op.dup(pwop->ops); + + C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit( + this, soid, get_last_peering_reset(), pwop); + unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers); + ceph_tid_t tid = osd->objecter->mutate( + soid.oid, oloc, obj_op, snapc, + ceph::real_clock::from_ceph_timespec(pwop->mtime), + flags, new C_OnFinisher(fin, osd->objecter_finishers[n]), + &pwop->user_version, pwop->reqid); + fin->tid = tid; + pwop->objecter_tid = tid; + proxywrite_ops[tid] = pwop; + in_progress_proxy_ops[soid].push_back(op); +} + +void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid, + ObjectContextRef obc, bool write_ordered) +{ + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + OSDOp *osd_op = NULL; + for (unsigned int i = 0; i < m->ops.size(); i++) { + osd_op = &m->ops[i]; + uint64_t cursor = osd_op->op.extent.offset; + uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length; + uint64_t chunk_length = 0, chunk_index = 0, req_len = 0; + object_manifest_t *manifest = &obc->obs.oi.manifest; + map <uint64_t, map<uint64_t, uint64_t>> chunk_read; + + while (cursor < op_length) { + chunk_index = 0; + chunk_length = 0; + /* find the right chunk position for cursor */ + for (auto &p : manifest->chunk_map) { + if (p.first <= cursor && p.first + p.second.length > cursor) { + chunk_length = p.second.length; + chunk_index = p.first; + break; + } + } + /* no index */ + if (!chunk_index && !chunk_length) { + if (cursor == osd_op->op.extent.offset) { + OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this); + ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + ctx->data_off = osd_op->op.extent.offset; + ctx->ignore_log_op_stats = true; + complete_read_ctx(0, ctx); + } + break; + } + uint64_t next_length = chunk_length; + /* the size to read -> | op length | */ + /* | a chunk | */ + if (cursor + next_length > op_length) { + next_length = op_length - cursor; + } + /* the size to read -> | op length | */ + /* | a chunk | */ + if (cursor + next_length > chunk_index + chunk_length) { + next_length = chunk_index + chunk_length - cursor; + } + + chunk_read[cursor] = {{chunk_index, next_length}}; + cursor += next_length; + } + + req_len = cursor - osd_op->op.extent.offset; + for (auto &p : chunk_read) { + auto chunks = p.second.begin(); + dout(20) << __func__ << " chunk_index: " << chunks->first + << " next_length: " << chunks->second << " cursor: " + << p.first << dendl; + do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered); + } + } +} + +struct RefCountCallback : public Context { +public: + PrimaryLogPG *pg; + PrimaryLogPG::OpContext *ctx; + OSDOp& osd_op; + epoch_t last_peering_reset; + + RefCountCallback(PrimaryLogPG *pg, PrimaryLogPG::OpContext *ctx, + OSDOp &osd_op, epoch_t lpr) + : pg(pg), ctx(ctx), osd_op(osd_op), last_peering_reset(lpr) + {} + void finish(int r) override { + pg->lock(); + if (last_peering_reset == 
pg->get_last_peering_reset()) { + if (r >= 0) { + osd_op.rval = 0; + pg->execute_ctx(ctx); + } else { + if (ctx->op) { + pg->osd->reply_op_error(ctx->op, r); + } + pg->close_op_ctx(ctx); + } + } + pg->unlock(); + } +}; + +struct SetManifestFinisher : public PrimaryLogPG::OpFinisher { + OSDOp& osd_op; + + explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) { + } + + int execute() override { + return osd_op.rval; + } +}; + +void PrimaryLogPG::refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid, + SnapContext snapc, bool get, Context *cb, uint64_t offset) +{ + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_RWORDERED; + + dout(10) << __func__ << " Start refcount for " << soid << dendl; + + ObjectOperation obj_op; + bufferlist in; + if (get) { + cls_chunk_refcount_get_op call; + call.source = obc->obs.oi.soid; + ::encode(call, in); + obj_op.call("cas", "chunk_get", in); + } else { + cls_chunk_refcount_put_op call; + call.source = obc->obs.oi.soid; + ::encode(call, in); + obj_op.call("cas", "chunk_put", in); + } + + unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers); + Context *c; + if (cb) { + c = new C_OnFinisher(cb, osd->objecter_finishers[n]); + } else { + c = NULL; + } + + osd->objecter->mutate( + soid.oid, oloc, obj_op, snapc, + ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime), + flags, c); +} + +void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index, + uint64_t chunk_index, uint64_t req_offset, uint64_t req_length, + uint64_t req_total_len, bool write_ordered) +{ + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + object_manifest_t *manifest = &obc->obs.oi.manifest; + if (!manifest->chunk_map.count(chunk_index)) { + return; + } + uint64_t chunk_length = manifest->chunk_map[chunk_index].length; + hobject_t soid = manifest->chunk_map[chunk_index].oid; + hobject_t ori_soid = m->get_hobj(); + object_locator_t oloc(soid); + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (write_ordered) { + flags |= CEPH_OSD_FLAG_RWORDERED; + } + + if (!chunk_length || soid == hobject_t()) { + return; + } + + /* same as do_proxy_read() */ + flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC | + CEPH_OSD_FLAG_MAP_SNAP_CLONE); + + dout(10) << __func__ << " Start do chunk proxy read for " << *m + << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset + << " req_length: " << req_length << dendl; + + ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops)); + + ObjectOperation *pobj_op = new ObjectOperation; + OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op); + + if (chunk_index <= req_offset) { + osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index; + } else { + ceph_abort_msg("chunk_index > req_offset"); + } + osd_op.op.extent.length = req_length; + + ObjectOperation obj_op; + obj_op.dup(pobj_op->ops); + + C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(), + prdop); + fin->obj_op = pobj_op; + fin->op_index = op_index; + fin->req_offset = req_offset; + fin->obc = obc; + fin->req_total_len = req_total_len; + + unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers); + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, obj_op, + m->get_snapid(), NULL, + flags, new C_OnFinisher(fin, osd->objecter_finishers[n]), + &prdop->user_version, + 
&prdop->data_offset, + m->get_features()); + fin->tid = tid; + prdop->objecter_tid = tid; + proxyread_ops[tid] = prdop; + in_progress_proxy_ops[ori_soid].push_back(op); +} + +bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc) +{ + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + OSDOp *osd_op = NULL; + bool ret = true; + for (unsigned int i = 0; i < m->ops.size(); i++) { + osd_op = &m->ops[i]; + ceph_osd_op op = osd_op->op; + switch (op.op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: { + uint64_t cursor = osd_op->op.extent.offset; + uint64_t remain = osd_op->op.extent.length; + + /* requested chunks exist in chunk_map ? */ + for (auto &p : obc->obs.oi.manifest.chunk_map) { + if (p.first <= cursor && p.first + p.second.length > cursor) { + if (!p.second.is_missing()) { + return false; + } + if (p.second.length >= remain) { + remain = 0; + break; + } else { + remain = remain - p.second.length; + } + cursor += p.second.length; + } + } + + if (remain) { + dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl; + return false; + } + continue; + } + default: + return false; + } + } + return ret; +} + +void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + + map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid); + if (p == proxywrite_ops.end()) { + dout(10) << __func__ << " no proxywrite_op found" << dendl; + return; + } + ProxyWriteOpRef pwop = p->second; + ceph_assert(tid == pwop->objecter_tid); + ceph_assert(oid == pwop->soid); + + proxywrite_ops.erase(tid); + + map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid); + if (q == in_progress_proxy_ops.end()) { + dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl; + delete pwop->ctx; + pwop->ctx = NULL; + return; + } + list<OpRequestRef>& in_progress_op = q->second; + ceph_assert(in_progress_op.size()); + list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(), + in_progress_op.end(), + pwop->op); + ceph_assert(it != in_progress_op.end()); + in_progress_op.erase(it); + if (in_progress_op.size() == 0) { + in_progress_proxy_ops.erase(oid); + } else if (std::find(in_progress_op.begin(), + in_progress_op.end(), + pwop->op) != in_progress_op.end()) { + if (pwop->ctx) + delete pwop->ctx; + pwop->ctx = NULL; + dout(20) << __func__ << " " << oid << " tid " << tid + << " in_progress_op size: " + << in_progress_op.size() << dendl; + return; + } + + osd->logger->inc(l_osd_tier_proxy_write); + + const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req()); + ceph_assert(m != NULL); + + if (!pwop->sent_reply) { + // send commit. 
+ MOSDOpReply *reply = pwop->ctx->reply; + if (reply) + pwop->ctx->reply = NULL; + else { + reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0, true); + reply->set_reply_versions(eversion_t(), pwop->user_version); + } + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + dout(10) << " sending commit on " << pwop << " " << reply << dendl; + osd->send_message_osd_client(reply, m->get_connection()); + pwop->sent_reply = true; + pwop->ctx->op->mark_commit_sent(); + } + + delete pwop->ctx; + pwop->ctx = NULL; +} + +void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop, + vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << " " << pwop->soid << dendl; + pwop->canceled = true; + + // cancel objecter op, if we can + if (pwop->objecter_tid) { + tids->push_back(pwop->objecter_tid); + delete pwop->ctx; + pwop->ctx = NULL; + proxywrite_ops.erase(pwop->objecter_tid); + pwop->objecter_tid = 0; + } +} + +class PromoteCallback: public PrimaryLogPG::CopyCallback { + ObjectContextRef obc; + PrimaryLogPG *pg; + utime_t start; +public: + PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_) + : obc(obc_), + pg(pg_), + start(ceph_clock_now()) {} + + void finish(PrimaryLogPG::CopyCallbackResults results) override { + PrimaryLogPG::CopyResults *results_data = results.get<1>(); + int r = results.get<0>(); + pg->finish_promote(r, results_data, obc); + pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start); + } +}; + +class PromoteManifestCallback: public PrimaryLogPG::CopyCallback { + ObjectContextRef obc; + PrimaryLogPG *pg; + utime_t start; + PrimaryLogPG::OpContext *ctx; + PrimaryLogPG::CopyCallbackResults promote_results; +public: + PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL) + : obc(obc_), + pg(pg_), + start(ceph_clock_now()), ctx(ctx) {} + + void finish(PrimaryLogPG::CopyCallbackResults results) override { + PrimaryLogPG::CopyResults *results_data = results.get<1>(); + int r = results.get<0>(); + if (ctx) { + promote_results = results; + pg->execute_ctx(ctx); + } else { + pg->finish_promote_manifest(r, results_data, obc); + } + pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start); + } + friend struct PromoteFinisher; +}; + +struct PromoteFinisher : public PrimaryLogPG::OpFinisher { + PromoteManifestCallback *promote_callback; + + explicit PromoteFinisher(PromoteManifestCallback *promote_callback) + : promote_callback(promote_callback) { + } + + int execute() override { + if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) { + promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(), + promote_callback->promote_results.get<1>(), + promote_callback->obc); + } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) { + promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(), + promote_callback->promote_results.get<1>(), + promote_callback->obc); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + return 0; + } +}; + +void PrimaryLogPG::promote_object(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + OpRequestRef op, + ObjectContextRef *promote_obc) +{ + hobject_t hoid = obc ? 
obc->obs.oi.soid : missing_oid; + ceph_assert(hoid != hobject_t()); + if (write_blocked_by_scrub(hoid)) { + dout(10) << __func__ << " " << hoid + << " blocked by scrub" << dendl; + if (op) { + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + dout(10) << __func__ << " " << hoid + << " placing op in waiting_for_scrub" << dendl; + } else { + dout(10) << __func__ << " " << hoid + << " no op, dropping on the floor" << dendl; + } + return; + } + if (!obc) { // we need to create an ObjectContext + ceph_assert(missing_oid != hobject_t()); + obc = get_object_context(missing_oid, true); + } + if (promote_obc) + *promote_obc = obc; + + /* + * Before promote complete, if there are proxy-reads for the object, + * for this case we don't use DONTNEED. + */ + unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL; + map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid); + if (q == in_progress_proxy_ops.end()) { + src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED; + } + + CopyCallback *cb; + object_locator_t my_oloc; + hobject_t src_hoid; + if (!obc->obs.oi.has_manifest()) { + my_oloc = oloc; + my_oloc.pool = pool.info.tier_of; + src_hoid = obc->obs.oi.soid; + cb = new PromoteCallback(obc, this); + } else { + if (obc->obs.oi.manifest.is_chunked()) { + src_hoid = obc->obs.oi.soid; + cb = new PromoteManifestCallback(obc, this); + } else if (obc->obs.oi.manifest.is_redirect()) { + object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target); + my_oloc = src_oloc; + src_hoid = obc->obs.oi.manifest.redirect_target; + cb = new PromoteCallback(obc, this); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } + + unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | + CEPH_OSD_COPY_FROM_FLAG_RWORDERED; + start_copy(cb, obc, src_hoid, my_oloc, 0, flags, + obc->obs.oi.soid.snap == CEPH_NOSNAP, + src_fadvise_flags, 0); + + ceph_assert(obc->is_blocked()); + + if (op) + wait_for_blocked_object(obc->obs.oi.soid, op); + info.stats.stats.sum.num_promote++; +} + +void PrimaryLogPG::execute_ctx(OpContext *ctx) +{ + FUNCTRACE(cct); + dout(10) << __func__ << " " << ctx << dendl; + ctx->reset_obs(ctx->obc); + ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx + OpRequestRef op = ctx->op; + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + ObjectContextRef obc = ctx->obc; + const hobject_t& soid = obc->obs.oi.soid; + + // this method must be idempotent since we may call it several times + // before we finally apply the resulting transaction. 
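+ // Overall flow: start a fresh PGTransaction, pick the snap context and next
+ // version for writes, run prepare_transaction(), then either complete a read
+ // in place, record a log-only error, or build a RepGather and issue replica
+ // writes, replying from the on-commit callback.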
+ ctx->op_t.reset(new PGTransaction); + + if (op->may_write() || op->may_cache()) { + // snap + if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) && + pool.info.is_pool_snaps_mode()) { + // use pool's snapc + ctx->snapc = pool.snapc; + } else { + // client specified snapc + ctx->snapc.seq = m->get_snap_seq(); + ctx->snapc.snaps = m->get_snaps(); + filter_snapc(ctx->snapc.snaps); + } + if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) && + ctx->snapc.seq < obc->ssc->snapset.seq) { + dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq + << " < snapset seq " << obc->ssc->snapset.seq + << " on " << obc->obs.oi.soid << dendl; + reply_ctx(ctx, -EOLDSNAPC); + return; + } + + // version + ctx->at_version = get_next_version(); + ctx->mtime = m->get_mtime(); + + dout(10) << __func__ << " " << soid << " " << *ctx->ops + << " ov " << obc->obs.oi.version << " av " << ctx->at_version + << " snapc " << ctx->snapc + << " snapset " << obc->ssc->snapset + << dendl; + } else { + dout(10) << __func__ << " " << soid << " " << *ctx->ops + << " ov " << obc->obs.oi.version + << dendl; + } + + if (!ctx->user_at_version) + ctx->user_at_version = obc->obs.oi.user_version; + dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl; + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = ctx->op->get_reqid(); +#endif + tracepoint(osd, prepare_tx_enter, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + int result = prepare_transaction(ctx); + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = ctx->op->get_reqid(); +#endif + tracepoint(osd, prepare_tx_exit, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + bool pending_async_reads = !ctx->pending_async_reads.empty(); + if (result == -EINPROGRESS || pending_async_reads) { + // come back later. + if (pending_async_reads) { + ceph_assert(pool.info.is_erasure()); + in_progress_async_reads.push_back(make_pair(op, ctx)); + ctx->start_async_reads(this); + } + return; + } + + if (result == -EAGAIN) { + // clean up after the ctx + close_op_ctx(ctx); + return; + } + + bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0; + // prepare the reply + ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, + successful_write); + + // Write operations aren't allowed to return a data payload because + // we can't do so reliably. If the client has to resend the request + // and it has already been applied, we will return 0 with no + // payload. Non-deterministic behavior is no good. However, it is + // possible to construct an operation that does a read, does a guard + // check (e.g., CMPXATTR), and then a write. Then we either succeed + // with the write, or return a CMPXATTR and the read value. + if (successful_write) { + // write. normalize the result code. + dout(20) << " zeroing write result code " << result << dendl; + result = 0; + } + ctx->reply->set_result(result); + + // read or error? + if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) { + // finish side-effects + if (result >= 0) + do_osd_op_effects(ctx, m->get_connection()); + + complete_read_ctx(result, ctx); + return; + } + + ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version); + + ceph_assert(op->may_write() || op->may_cache()); + + // trim log? + if (hard_limit_pglog()) + calc_trim_to_aggressive(); + else + calc_trim_to(); + + // verify that we are doing this in order? 
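+ // (debug aid: with osd_debug_op_order set, remember the last tid seen from
+ // each client and abort if a tid ever goes backwards)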
+ if (cct->_conf->osd_debug_op_order && m->get_source().is_client() && + !pool.info.is_tier() && !pool.info.has_tiers()) { + map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid]; + ceph_tid_t t = m->get_tid(); + client_t n = m->get_source().num(); + map<client_t,ceph_tid_t>::iterator p = cm.find(n); + if (p == cm.end()) { + dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl; + cm[n] = t; + } else { + dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl; + if (p->second > t) { + derr << "bad op order, already applied " << p->second << " > this " << t << dendl; + ceph_abort_msg("out of order op"); + } + p->second = t; + } + } + + if (ctx->update_log_only) { + if (result >= 0) + do_osd_op_effects(ctx, m->get_connection()); + + dout(20) << __func__ << " update_log_only -- result=" << result << dendl; + // save just what we need from ctx + MOSDOpReply *reply = ctx->reply; + ctx->reply = nullptr; + reply->claim_op_out_data(*ctx->ops); + reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0); + close_op_ctx(ctx); + + if (result == -ENOENT) { + reply->set_enoent_reply_versions(info.last_update, + info.last_user_version); + } + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + // append to pg log for dup detection - don't save buffers for now + record_write_error(op, soid, reply, result); + return; + } + + // no need to capture PG ref, repop cancel will handle that + // Can capture the ctx by pointer, it's owned by the repop + ctx->register_on_commit( + [m, ctx, this](){ + if (ctx->op) + log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read); + + if (m && !ctx->sent_reply) { + MOSDOpReply *reply = ctx->reply; + if (reply) + ctx->reply = nullptr; + else { + reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, true); + reply->set_reply_versions(ctx->at_version, + ctx->user_at_version); + } + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + dout(10) << " sending reply on " << *m << " " << reply << dendl; + osd->send_message_osd_client(reply, m->get_connection()); + ctx->sent_reply = true; + ctx->op->mark_commit_sent(); + } + }); + ctx->register_on_success( + [ctx, this]() { + do_osd_op_effects( + ctx, + ctx->op ? 
ctx->op->get_req()->get_connection() : + ConnectionRef()); + }); + ctx->register_on_finish( + [ctx]() { + delete ctx; + }); + + // issue replica writes + ceph_tid_t rep_tid = osd->get_tid(); + + RepGather *repop = new_repop(ctx, obc, rep_tid); + + issue_repop(repop, ctx); + eval_repop(repop); + repop->put(); +} + +void PrimaryLogPG::close_op_ctx(OpContext *ctx) { + release_object_locks(ctx->lock_manager); + + ctx->op_t.reset(); + + for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end(); + ctx->on_finish.erase(p++)) { + (*p)(); + } + delete ctx; +} + +void PrimaryLogPG::reply_ctx(OpContext *ctx, int r) +{ + if (ctx->op) + osd->reply_op_error(ctx->op, r); + close_op_ctx(ctx); +} + +void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv) +{ + if (ctx->op) + osd->reply_op_error(ctx->op, r, v, uv); + close_op_ctx(ctx); +} + +void PrimaryLogPG::log_op_stats(const OpRequest& op, + const uint64_t inb, + const uint64_t outb) +{ + const MOSDOp* const m = static_cast<const MOSDOp*>(op.get_req()); + const utime_t now = ceph_clock_now(); + + const utime_t latency = now - m->get_recv_stamp(); + const utime_t process_latency = now - op.get_dequeued_time(); + + osd->logger->inc(l_osd_op); + + osd->logger->inc(l_osd_op_outb, outb); + osd->logger->inc(l_osd_op_inb, inb); + osd->logger->tinc(l_osd_op_lat, latency); + osd->logger->tinc(l_osd_op_process_lat, process_latency); + + if (op.may_read() && op.may_write()) { + osd->logger->inc(l_osd_op_rw); + osd->logger->inc(l_osd_op_rw_inb, inb); + osd->logger->inc(l_osd_op_rw_outb, outb); + osd->logger->tinc(l_osd_op_rw_lat, latency); + osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb); + osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb); + osd->logger->tinc(l_osd_op_rw_process_lat, process_latency); + } else if (op.may_read()) { + osd->logger->inc(l_osd_op_r); + osd->logger->inc(l_osd_op_r_outb, outb); + osd->logger->tinc(l_osd_op_r_lat, latency); + osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb); + osd->logger->tinc(l_osd_op_r_process_lat, process_latency); + } else if (op.may_write() || op.may_cache()) { + osd->logger->inc(l_osd_op_w); + osd->logger->inc(l_osd_op_w_inb, inb); + osd->logger->tinc(l_osd_op_w_lat, latency); + osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb); + osd->logger->tinc(l_osd_op_w_process_lat, process_latency); + } else { + ceph_abort(); + } + + dout(15) << "log_op_stats " << *m + << " inb " << inb + << " outb " << outb + << " lat " << latency << dendl; + + if (m_dynamic_perf_stats.is_enabled()) { + m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency); + } +} + +void PrimaryLogPG::set_dynamic_perf_stats_queries( + const std::list<OSDPerfMetricQuery> &queries) +{ + m_dynamic_perf_stats.set_queries(queries); +} + +void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats) +{ + std::swap(m_dynamic_perf_stats, *stats); +} + +void PrimaryLogPG::do_scan( + OpRequestRef op, + ThreadPool::TPHandle &handle) +{ + const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_SCAN); + dout(10) << "do_scan " << *m << dendl; + + op->mark_started(); + + switch (m->op) { + case MOSDPGScan::OP_SCAN_GET_DIGEST: + { + auto dpp = get_dpp(); + if (osd->check_backfill_full(dpp)) { + dout(1) << __func__ << ": Canceling backfill: Full." 
<< dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + BackfillTooFull()))); + return; + } + + BackfillInterval bi; + bi.begin = m->begin; + // No need to flush, there won't be any in progress writes occuring + // past m->begin + scan_range( + cct->_conf->osd_backfill_scan_min, + cct->_conf->osd_backfill_scan_max, + &bi, + handle); + MOSDPGScan *reply = new MOSDPGScan( + MOSDPGScan::OP_SCAN_DIGEST, + pg_whoami, + get_osdmap_epoch(), m->query_epoch, + spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end); + encode(bi.objects, reply->get_data()); + osd->send_message_osd_cluster(reply, m->get_connection()); + } + break; + + case MOSDPGScan::OP_SCAN_DIGEST: + { + pg_shard_t from = m->from; + + // Check that from is in backfill_targets vector + ceph_assert(is_backfill_targets(from)); + + BackfillInterval& bi = peer_backfill_info[from]; + bi.begin = m->begin; + bi.end = m->end; + auto p = m->get_data().cbegin(); + + // take care to preserve ordering! + bi.clear_objects(); + ::decode_noclear(bi.objects, p); + + if (waiting_on_backfill.erase(from)) { + if (waiting_on_backfill.empty()) { + ceph_assert(peer_backfill_info.size() == backfill_targets.size()); + finish_recovery_op(hobject_t::get_max()); + } + } else { + // we canceled backfill for a while due to a too full, and this + // is an extra response from a non-too-full peer + dout(20) << __func__ << " canceled backfill (too full?)" << dendl; + } + } + break; + } +} + +void PrimaryLogPG::do_backfill(OpRequestRef op) +{ + const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL); + dout(10) << "do_backfill " << *m << dendl; + + op->mark_started(); + + switch (m->op) { + case MOSDPGBackfill::OP_BACKFILL_FINISH: + { + ceph_assert(cct->_conf->osd_kill_backfill_at != 1); + + MOSDPGBackfill *reply = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + get_osdmap_epoch(), + m->query_epoch, + spg_t(info.pgid.pgid, get_primary().shard)); + reply->set_priority(get_recovery_op_priority()); + osd->send_message_osd_cluster(reply, m->get_connection()); + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + RecoveryDone()))); + } + // fall-thru + + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + { + ceph_assert(cct->_conf->osd_kill_backfill_at != 2); + + info.set_last_backfill(m->last_backfill); + // During backfill submit_push_data() tracks num_bytes which is needed in case + // backfill stops and starts again. We want to know how many bytes this + // pg is consuming on the disk in order to compute amount of new data + // reserved to hold backfill if it won't fit. 
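+ // For OP_BACKFILL_PROGRESS keep the locally tracked num_bytes and adopt the
+ // rest of the primary's stats; on OP_BACKFILL_FINISH the primary's stats
+ // replace the local ones wholesale.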
+ if (m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS) { + dout(25) << __func__ << " primary " << m->stats.stats.sum.num_bytes << " local " << info.stats.stats.sum.num_bytes << dendl; + int64_t bytes = info.stats.stats.sum.num_bytes; + info.stats = m->stats; + info.stats.stats.sum.num_bytes = bytes; + } else { + dout(20) << __func__ << " final " << m->stats.stats.sum.num_bytes << " replaces local " << info.stats.stats.sum.num_bytes << dendl; + info.stats = m->stats; + } + + ObjectStore::Transaction t; + dirty_info = true; + write_if_dirty(t); + int tr = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(tr == 0); + } + break; + + case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK: + { + ceph_assert(is_primary()); + ceph_assert(cct->_conf->osd_kill_backfill_at != 3); + finish_recovery_op(hobject_t::get_max()); + } + break; + } +} + +void PrimaryLogPG::do_backfill_remove(OpRequestRef op) +{ + const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>( + op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE); + dout(7) << __func__ << " " << m->ls << dendl; + + op->mark_started(); + + ObjectStore::Transaction t; + for (auto& p : m->ls) { + if (is_remote_backfilling()) { + struct stat st; + int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN, + pg_whoami.shard) , &st); + if (r == 0) { + sub_local_num_bytes(st.st_size); + int64_t usersize; + if (pool.info.is_erasure()) { + bufferlist bv; + int r = osd->store->getattr( + ch, + ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + usersize = oi.size * pgbackend->get_ec_data_chunk_count(); + } else { + dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) + << " can't get object info" << dendl; + usersize = 0; + } + } else { + usersize = st.st_size; + } + sub_num_bytes(usersize); + dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) + << " sub actual data by " << st.st_size + << " sub num_bytes by " << usersize + << dendl; + } + } + remove_snap_mapped_object(t, p.first); + } + int r = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(r == 0); +} + +int PrimaryLogPG::trim_object( + bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp) +{ + *ctxp = NULL; + + // load clone info + bufferlist bl; + ObjectContextRef obc = get_object_context(coid, false, NULL); + if (!obc || !obc->ssc || !obc->ssc->exists) { + osd->clog->error() << __func__ << ": Can not trim " << coid + << " repair needed " << (obc ? 
"(no obc->ssc or !exists)" : "(no obc)"); + return -ENOENT; + } + + hobject_t head_oid = coid.get_head(); + ObjectContextRef head_obc = get_object_context(head_oid, false); + if (!head_obc) { + osd->clog->error() << __func__ << ": Can not trim " << coid + << " repair needed, no snapset obc for " << head_oid; + return -ENOENT; + } + + SnapSet& snapset = obc->ssc->snapset; + + object_info_t &coi = obc->obs.oi; + auto citer = snapset.clone_snaps.find(coid.snap); + if (citer == snapset.clone_snaps.end()) { + osd->clog->error() << "No clone_snaps in snapset " << snapset + << " for object " << coid << "\n"; + return -ENOENT; + } + set<snapid_t> old_snaps(citer->second.begin(), citer->second.end()); + if (old_snaps.empty()) { + osd->clog->error() << "No object info snaps for object " << coid; + return -ENOENT; + } + + dout(10) << coid << " old_snaps " << old_snaps + << " old snapset " << snapset << dendl; + if (snapset.seq == 0) { + osd->clog->error() << "No snapset.seq for object " << coid; + return -ENOENT; + } + + set<snapid_t> new_snaps; + for (set<snapid_t>::iterator i = old_snaps.begin(); + i != old_snaps.end(); + ++i) { + if (!pool.info.is_removed_snap(*i)) + new_snaps.insert(*i); + } + + vector<snapid_t>::iterator p = snapset.clones.end(); + + if (new_snaps.empty()) { + p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap); + if (p == snapset.clones.end()) { + osd->clog->error() << "Snap " << coid.snap << " not in clones"; + return -ENOENT; + } + } + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->head_obc = head_obc; + + if (!ctx->lock_manager.get_snaptrimmer_write( + coid, + obc, + first)) { + close_op_ctx(ctx.release()); + dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl; + return -ENOLCK; + } + + if (!ctx->lock_manager.get_snaptrimmer_write( + head_oid, + head_obc, + first)) { + close_op_ctx(ctx.release()); + dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl; + return -ENOLCK; + } + + ctx->at_version = get_next_version(); + + PGTransaction *t = ctx->op_t.get(); + + if (new_snaps.empty()) { + // remove clone + dout(10) << coid << " snaps " << old_snaps << " -> " + << new_snaps << " ... deleting" << dendl; + + // ...from snapset + ceph_assert(p != snapset.clones.end()); + + snapid_t last = coid.snap; + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last); + + if (p != snapset.clones.begin()) { + // not the oldest... 
merge overlap into next older clone + vector<snapid_t>::iterator n = p - 1; + hobject_t prev_coid = coid; + prev_coid.snap = *n; + bool adjust_prev_bytes = is_present_clone(prev_coid); + + if (adjust_prev_bytes) + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n); + + snapset.clone_overlap[*n].intersection_of( + snapset.clone_overlap[*p]); + + if (adjust_prev_bytes) + ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n); + } + ctx->delta_stats.num_objects--; + if (coi.is_dirty()) + ctx->delta_stats.num_objects_dirty--; + if (coi.is_omap()) + ctx->delta_stats.num_objects_omap--; + if (coi.is_whiteout()) { + dout(20) << __func__ << " trimming whiteout on " << coid << dendl; + ctx->delta_stats.num_whiteouts--; + } + ctx->delta_stats.num_object_clones--; + if (coi.is_cache_pinned()) + ctx->delta_stats.num_objects_pinned--; + if (coi.has_manifest()) + ctx->delta_stats.num_objects_manifest--; + obc->obs.exists = false; + + snapset.clones.erase(p); + snapset.clone_overlap.erase(last); + snapset.clone_size.erase(last); + snapset.clone_snaps.erase(last); + + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::DELETE, + coid, + ctx->at_version, + ctx->obs->oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + t->remove(coid); + t->update_snaps( + coid, + old_snaps, + new_snaps); + + coi = object_info_t(coid); + + ctx->at_version.version++; + } else { + // save adjusted snaps for this object + dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl; + snapset.clone_snaps[coid.snap] = + vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend()); + // we still do a 'modify' event on this object just to trigger a + // snapmapper.update ... :( + + coi.prior_version = coi.version; + coi.version = ctx->at_version; + bl.clear(); + encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + t->setattr(coid, OI_ATTR, bl); + + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + coid, + coi.version, + coi.prior_version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + ctx->at_version.version++; + + t->update_snaps( + coid, + old_snaps, + new_snaps); + } + + // save head snapset + dout(10) << coid << " new snapset " << snapset << " on " + << head_obc->obs.oi << dendl; + if (snapset.clones.empty() && + (head_obc->obs.oi.is_whiteout() && + !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) && + !head_obc->obs.oi.is_cache_pinned())) { + // NOTE: this arguably constitutes minor interference with the + // tiering agent if this is a cache tier since a snap trim event + // is effectively evicting a whiteout we might otherwise want to + // keep around. 
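+ // No clones remain and the head is only a whiteout (and neither a dirty
+ // cache-tier object nor cache-pinned), so drop the head object as well and
+ // log a DELETE; otherwise (else branch below) the filtered SnapSet and
+ // object_info are rewritten on the head.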
+ dout(10) << coid << " removing " << head_oid << dendl; + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::DELETE, + head_oid, + ctx->at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + derr << "removing snap head" << dendl; + object_info_t& oi = head_obc->obs.oi; + ctx->delta_stats.num_objects--; + if (oi.is_dirty()) { + ctx->delta_stats.num_objects_dirty--; + } + if (oi.is_omap()) + ctx->delta_stats.num_objects_omap--; + if (oi.is_whiteout()) { + dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl; + ctx->delta_stats.num_whiteouts--; + } + if (oi.is_cache_pinned()) { + ctx->delta_stats.num_objects_pinned--; + } + if (coi.has_manifest()) + ctx->delta_stats.num_objects_manifest--; + head_obc->obs.exists = false; + head_obc->obs.oi = object_info_t(head_oid); + t->remove(head_oid); + } else { + dout(10) << coid << " filtering snapset on " << head_oid << dendl; + snapset.filter(pool.info); + dout(10) << coid << " writing updated snapset on " << head_oid + << ", snapset is " << snapset << dendl; + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + head_oid, + ctx->at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + + head_obc->obs.oi.prior_version = head_obc->obs.oi.version; + head_obc->obs.oi.version = ctx->at_version; + + map <string, bufferlist> attrs; + bl.clear(); + encode(snapset, bl); + attrs[SS_ATTR].claim(bl); + + bl.clear(); + encode(head_obc->obs.oi, bl, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR].claim(bl); + t->setattrs(head_oid, attrs); + } + + *ctxp = std::move(ctx); + return 0; +} + +void PrimaryLogPG::kick_snap_trim() +{ + ceph_assert(is_active()); + ceph_assert(is_primary()); + if (is_clean() && + !state_test(PG_STATE_PREMERGE) && + !snap_trimq.empty()) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) { + dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl; + } else { + dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl; + snap_trimmer_machine.process_event(KickTrim()); + } + } +} + +void PrimaryLogPG::snap_trimmer_scrub_complete() +{ + if (is_primary() && is_active() && is_clean()) { + ceph_assert(!snap_trimq.empty()); + snap_trimmer_machine.process_event(ScrubComplete()); + } +} + +void PrimaryLogPG::snap_trimmer(epoch_t queued) +{ + if (deleting || pg_has_reset_since(queued)) { + return; + } + + ceph_assert(is_primary()); + + dout(10) << "snap_trimmer posting" << dendl; + snap_trimmer_machine.process_event(DoSnapWork()); + dout(10) << "snap_trimmer complete" << dendl; + return; +} + +int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr) +{ + __u64 v2; + + string v2s(xattr.c_str(), xattr.length()); + if (v2s.length()) + v2 = strtoull(v2s.c_str(), NULL, 10); + else + v2 = 0; + + dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl; + + switch (op) { + case CEPH_OSD_CMPXATTR_OP_EQ: + return (v1 == v2); + case CEPH_OSD_CMPXATTR_OP_NE: + return (v1 != v2); + case CEPH_OSD_CMPXATTR_OP_GT: + return (v1 > v2); + case CEPH_OSD_CMPXATTR_OP_GTE: + return (v1 >= v2); + case CEPH_OSD_CMPXATTR_OP_LT: + return (v1 < v2); + case CEPH_OSD_CMPXATTR_OP_LTE: + return (v1 <= v2); + default: + return -EINVAL; + } +} + +int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr) +{ + string v2s(xattr.c_str(), xattr.length()); + + dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl; + + switch (op) { + case 
CEPH_OSD_CMPXATTR_OP_EQ: + return (v1s.compare(v2s) == 0); + case CEPH_OSD_CMPXATTR_OP_NE: + return (v1s.compare(v2s) != 0); + case CEPH_OSD_CMPXATTR_OP_GT: + return (v1s.compare(v2s) > 0); + case CEPH_OSD_CMPXATTR_OP_GTE: + return (v1s.compare(v2s) >= 0); + case CEPH_OSD_CMPXATTR_OP_LT: + return (v1s.compare(v2s) < 0); + case CEPH_OSD_CMPXATTR_OP_LTE: + return (v1s.compare(v2s) <= 0); + default: + return -EINVAL; + } +} + +int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op) +{ + ceph_osd_op& op = osd_op.op; + vector<OSDOp> write_ops(1); + OSDOp& write_op = write_ops[0]; + uint64_t write_length = op.writesame.length; + int result = 0; + + if (!write_length) + return 0; + + if (!op.writesame.data_length || write_length % op.writesame.data_length) + return -EINVAL; + + if (op.writesame.data_length != osd_op.indata.length()) { + derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl; + return -EINVAL; + } + + while (write_length) { + write_op.indata.append(osd_op.indata); + write_length -= op.writesame.data_length; + } + + write_op.op.op = CEPH_OSD_OP_WRITE; + write_op.op.extent.offset = op.writesame.offset; + write_op.op.extent.length = op.writesame.length; + result = do_osd_ops(ctx, write_ops); + if (result < 0) + derr << "do_writesame do_osd_ops failed " << result << dendl; + + return result; +} + +// ======================================================================== +// low level osd ops + +int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags) +{ + dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl; + bufferlist header, vals; + int r = _get_tmap(ctx, &header, &vals); + if (r < 0) { + if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK)) + r = 0; + return r; + } + + vector<OSDOp> ops(3); + + ops[0].op.op = CEPH_OSD_OP_TRUNCATE; + ops[0].op.extent.offset = 0; + ops[0].op.extent.length = 0; + + ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER; + ops[1].indata.claim(header); + + ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS; + ops[2].indata.claim(vals); + + return do_osd_ops(ctx, ops); +} + +int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp, + OSDOp& osd_op, bufferlist& bl) +{ + // decode + bufferlist header; + map<string, bufferlist> m; + if (bl.length()) { + auto p = bl.cbegin(); + decode(header, p); + decode(m, p); + ceph_assert(p.end()); + } + + // do the update(s) + while (!bp.end()) { + __u8 op; + string key; + decode(op, bp); + + switch (op) { + case CEPH_OSD_TMAP_SET: // insert key + { + decode(key, bp); + bufferlist data; + decode(data, bp); + m[key] = data; + } + break; + case CEPH_OSD_TMAP_RM: // remove key + decode(key, bp); + if (!m.count(key)) { + return -ENOENT; + } + m.erase(key); + break; + case CEPH_OSD_TMAP_RMSLOPPY: // remove key + decode(key, bp); + m.erase(key); + break; + case CEPH_OSD_TMAP_HDR: // update header + { + decode(header, bp); + } + break; + default: + return -EINVAL; + } + } + + // reencode + bufferlist obl; + encode(header, obl); + encode(m, obl); + + // write it out + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = obl.length(); + newop.indata = obl; + do_osd_ops(ctx, nops); + osd_op.outdata.claim(newop.outdata); + return 0; +} + +int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op) +{ + bufferlist::const_iterator orig_bp = bp; + int result = 0; + if (bp.end()) { + dout(10) << "tmapup is a 
no-op" << dendl; + } else { + // read the whole object + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_READ; + newop.op.extent.offset = 0; + newop.op.extent.length = 0; + result = do_osd_ops(ctx, nops); + + dout(10) << "tmapup read " << newop.outdata.length() << dendl; + + dout(30) << " starting is \n"; + newop.outdata.hexdump(*_dout); + *_dout << dendl; + + auto ip = newop.outdata.cbegin(); + bufferlist obl; + + dout(30) << "the update command is: \n"; + osd_op.indata.hexdump(*_dout); + *_dout << dendl; + + // header + bufferlist header; + __u32 nkeys = 0; + if (newop.outdata.length()) { + decode(header, ip); + decode(nkeys, ip); + } + dout(10) << "tmapup header " << header.length() << dendl; + + if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) { + ++bp; + decode(header, bp); + dout(10) << "tmapup new header " << header.length() << dendl; + } + + encode(header, obl); + + dout(20) << "tmapup initial nkeys " << nkeys << dendl; + + // update keys + bufferlist newkeydata; + string nextkey, last_in_key; + bufferlist nextval; + bool have_next = false; + if (!ip.end()) { + have_next = true; + decode(nextkey, ip); + decode(nextval, ip); + } + while (!bp.end() && !result) { + __u8 op; + string key; + try { + decode(op, bp); + decode(key, bp); + } + catch (buffer::error& e) { + return -EINVAL; + } + if (key < last_in_key) { + dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key + << "', falling back to an inefficient (unsorted) update" << dendl; + bp = orig_bp; + return do_tmapup_slow(ctx, bp, osd_op, newop.outdata); + } + last_in_key = key; + + dout(10) << "tmapup op " << (int)op << " key " << key << dendl; + + // skip existing intervening keys + bool key_exists = false; + while (have_next && !key_exists) { + dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl; + if (nextkey > key) + break; + if (nextkey < key) { + // copy untouched. + encode(nextkey, newkeydata); + encode(nextval, newkeydata); + dout(20) << " keep " << nextkey << " " << nextval.length() << dendl; + } else { + // don't copy; discard old value. and stop. + dout(20) << " drop " << nextkey << " " << nextval.length() << dendl; + key_exists = true; + nkeys--; + } + if (!ip.end()) { + decode(nextkey, ip); + decode(nextval, ip); + } else { + have_next = false; + } + } + + if (op == CEPH_OSD_TMAP_SET) { + bufferlist val; + try { + decode(val, bp); + } + catch (buffer::error& e) { + return -EINVAL; + } + encode(key, newkeydata); + encode(val, newkeydata); + dout(20) << " set " << key << " " << val.length() << dendl; + nkeys++; + } else if (op == CEPH_OSD_TMAP_CREATE) { + if (key_exists) { + return -EEXIST; + } + bufferlist val; + try { + decode(val, bp); + } + catch (buffer::error& e) { + return -EINVAL; + } + encode(key, newkeydata); + encode(val, newkeydata); + dout(20) << " create " << key << " " << val.length() << dendl; + nkeys++; + } else if (op == CEPH_OSD_TMAP_RM) { + // do nothing. 
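+ // nothing to write: the existing key (if any) was already dropped in the
+ // skip loop above; strict RM still fails with -ENOENT if the key never
+ // existed, while RMSLOPPY below silently ignores a missing key.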
+ if (!key_exists) { + return -ENOENT; + } + } else if (op == CEPH_OSD_TMAP_RMSLOPPY) { + // do nothing + } else { + dout(10) << " invalid tmap op " << (int)op << dendl; + return -EINVAL; + } + } + + // copy remaining + if (have_next) { + encode(nextkey, newkeydata); + encode(nextval, newkeydata); + dout(20) << " keep " << nextkey << " " << nextval.length() << dendl; + } + if (!ip.end()) { + bufferlist rest; + rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off()); + dout(20) << " keep trailing " << rest.length() + << " at " << newkeydata.length() << dendl; + newkeydata.claim_append(rest); + } + + // encode final key count + key data + dout(20) << "tmapup final nkeys " << nkeys << dendl; + encode(nkeys, obl); + obl.claim_append(newkeydata); + + if (0) { + dout(30) << " final is \n"; + obl.hexdump(*_dout); + *_dout << dendl; + + // sanity check + auto tp = obl.cbegin(); + bufferlist h; + decode(h, tp); + map<string,bufferlist> d; + decode(d, tp); + ceph_assert(tp.end()); + dout(0) << " **** debug sanity check, looks ok ****" << dendl; + } + + // write it out + if (!result) { + dout(20) << "tmapput write " << obl.length() << dendl; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = obl.length(); + newop.indata = obl; + do_osd_ops(ctx, nops); + osd_op.outdata.claim(newop.outdata); + } + } + return result; +} + +static int check_offset_and_length(uint64_t offset, uint64_t length, + uint64_t max, DoutPrefixProvider *dpp) +{ + if (offset >= max || + length > max || + offset + length > max) { + ldpp_dout(dpp, 10) << __func__ << " " + << "osd_max_object_size: " << max + << "; Hard limit of object size is 4GB." << dendl; + return -EFBIG; + } + + return 0; +} + +struct FillInVerifyExtent : public Context { + ceph_le64 *r; + int32_t *rval; + bufferlist *outdatap; + boost::optional<uint32_t> maybe_crc; + uint64_t size; + OSDService *osd; + hobject_t soid; + __le32 flags; + FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp, + boost::optional<uint32_t> mc, uint64_t size, + OSDService *osd, hobject_t soid, __le32 flags) : + r(r), rval(rv), outdatap(blp), maybe_crc(mc), + size(size), osd(osd), soid(soid), flags(flags) {} + void finish(int len) override { + *r = len; + if (len < 0) { + *rval = len; + return; + } + *rval = 0; + + // whole object? can we verify the checksum? 
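+ // callers only supply maybe_crc when the stored digest covers the whole object; a short read (len != size) is never compared against it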
+ if (maybe_crc && *r == size) { + uint32_t crc = outdatap->crc32c(-1); + if (maybe_crc != crc) { + osd->clog->error() << std::hex << " full-object read crc 0x" << crc + << " != expected 0x" << *maybe_crc + << std::dec << " on " << soid; + if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) { + *rval = -EIO; + *r = 0; + } + } + } + } +}; + +struct ToSparseReadResult : public Context { + int* result; + bufferlist* data_bl; + uint64_t data_offset; + ceph_le64* len; + ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset, + ceph_le64* len) + : result(result), data_bl(bl), data_offset(offset),len(len) {} + void finish(int r) override { + if (r < 0) { + *result = r; + return; + } + *result = 0; + *len = r; + bufferlist outdata; + map<uint64_t, uint64_t> extents = {{data_offset, r}}; + encode(extents, outdata); + ::encode_destructively(*data_bl, outdata); + data_bl->swap(outdata); + } +}; + +template<typename V> +static string list_keys(const map<string, V>& m) { + string s; + for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) { + if (!s.empty()) { + s.push_back(','); + } + s.append(itr->first); + } + return s; +} + +template<typename T> +static string list_entries(const T& m) { + string s; + for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) { + if (!s.empty()) { + s.push_back(','); + } + s.append(*itr); + } + return s; +} + +void PrimaryLogPG::maybe_create_new_object( + OpContext *ctx, + bool ignore_transaction) +{ + ObjectState& obs = ctx->new_obs; + if (!obs.exists) { + ctx->delta_stats.num_objects++; + obs.exists = true; + ceph_assert(!obs.oi.is_whiteout()); + obs.oi.new_object(); + if (!ignore_transaction) + ctx->op_t->create(obs.oi.soid); + } else if (obs.oi.is_whiteout()) { + dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + --ctx->delta_stats.num_whiteouts; + } +} + +struct ReadFinisher : public PrimaryLogPG::OpFinisher { + OSDOp& osd_op; + + explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) { + } + + int execute() override { + return osd_op.rval; + } +}; + +struct C_ChecksumRead : public Context { + PrimaryLogPG *primary_log_pg; + OSDOp &osd_op; + Checksummer::CSumType csum_type; + bufferlist init_value_bl; + ceph_le64 read_length; + bufferlist read_bl; + Context *fill_extent_ctx; + + C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op, + Checksummer::CSumType csum_type, bufferlist &&init_value_bl, + boost::optional<uint32_t> maybe_crc, uint64_t size, + OSDService *osd, hobject_t soid, __le32 flags) + : primary_log_pg(primary_log_pg), osd_op(osd_op), + csum_type(csum_type), init_value_bl(std::move(init_value_bl)), + fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval, + &read_bl, maybe_crc, size, + osd, soid, flags)) { + } + ~C_ChecksumRead() override { + delete fill_extent_ctx; + } + + void finish(int r) override { + fill_extent_ctx->complete(r); + fill_extent_ctx = nullptr; + + if (osd_op.rval >= 0) { + bufferlist::const_iterator init_value_bl_it = init_value_bl.begin(); + osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type, + &init_value_bl_it, read_bl); + } + } +}; + +int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op, + bufferlist::const_iterator *bl_it) +{ + dout(20) << __func__ << dendl; + + auto& op = osd_op.op; + if (op.checksum.chunk_size > 0) { + if (op.checksum.length == 0) { + dout(10) << __func__ << ": length required when chunk size provided" + << dendl; + return -EINVAL; + } + if 
(op.checksum.length % op.checksum.chunk_size != 0) { + dout(10) << __func__ << ": length not aligned to chunk size" << dendl; + return -EINVAL; + } + } + + auto& oi = ctx->new_obs.oi; + if (op.checksum.offset == 0 && op.checksum.length == 0) { + // zeroed offset+length implies checksum whole object + op.checksum.length = oi.size; + } else if (op.checksum.offset >= oi.size) { + // read size was trimmed to zero, do nothing + // see PrimaryLogPG::do_read + return 0; + } else if (op.extent.offset + op.extent.length > oi.size) { + op.extent.length = oi.size - op.extent.offset; + if (op.checksum.chunk_size > 0 && + op.checksum.length % op.checksum.chunk_size != 0) { + dout(10) << __func__ << ": length (trimmed to 0x" + << std::hex << op.checksum.length + << ") not aligned to chunk size 0x" + << op.checksum.chunk_size << std::dec + << dendl; + return -EINVAL; + } + } + + Checksummer::CSumType csum_type; + switch (op.checksum.type) { + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32: + csum_type = Checksummer::CSUM_XXHASH32; + break; + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64: + csum_type = Checksummer::CSUM_XXHASH64; + break; + case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C: + csum_type = Checksummer::CSUM_CRC32C; + break; + default: + dout(10) << __func__ << ": unknown crc type (" + << static_cast<uint32_t>(op.checksum.type) << ")" << dendl; + return -EINVAL; + } + + size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type); + if (bl_it->get_remaining() < csum_init_value_size) { + dout(10) << __func__ << ": init value not provided" << dendl; + return -EINVAL; + } + + bufferlist init_value_bl; + init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(), + csum_init_value_size); + bl_it->advance(csum_init_value_size); + + if (pool.info.is_erasure() && op.checksum.length > 0) { + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. 
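+ // FillInVerifyExtent only compares this crc when the bytes returned equal the object size, so a short read is not flagged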
+ boost::optional<uint32_t> maybe_crc; + if (oi.is_data_digest() && op.checksum.offset == 0 && + op.checksum.length >= oi.size) { + maybe_crc = oi.data_digest; + } + + // async read + auto& soid = oi.soid; + auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type, + std::move(init_value_bl), maybe_crc, + oi.size, osd, soid, op.flags); + + ctx->pending_async_reads.push_back({ + {op.checksum.offset, op.checksum.length, op.flags}, + {&checksum_ctx->read_bl, checksum_ctx}}); + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + return -EINPROGRESS; + } + + // sync read + std::vector<OSDOp> read_ops(1); + auto& read_op = read_ops[0]; + if (op.checksum.length > 0) { + read_op.op.op = CEPH_OSD_OP_READ; + read_op.op.flags = op.flags; + read_op.op.extent.offset = op.checksum.offset; + read_op.op.extent.length = op.checksum.length; + read_op.op.extent.truncate_size = 0; + read_op.op.extent.truncate_seq = 0; + + int r = do_osd_ops(ctx, read_ops); + if (r < 0) { + derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl; + return r; + } + } + + bufferlist::const_iterator init_value_bl_it = init_value_bl.begin(); + return finish_checksum(osd_op, csum_type, &init_value_bl_it, + read_op.outdata); +} + +int PrimaryLogPG::finish_checksum(OSDOp& osd_op, + Checksummer::CSumType csum_type, + bufferlist::const_iterator *init_value_bl_it, + const bufferlist &read_bl) { + dout(20) << __func__ << dendl; + + auto& op = osd_op.op; + + if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) { + derr << __func__ << ": bytes read " << read_bl.length() << " != " + << op.checksum.length << dendl; + return -EINVAL; + } + + size_t csum_chunk_size = (op.checksum.chunk_size != 0 ? + op.checksum.chunk_size : read_bl.length()); + uint32_t csum_count = (csum_chunk_size > 0 ? 
+ read_bl.length() / csum_chunk_size : 0); + + bufferlist csum; + bufferptr csum_data; + if (csum_count > 0) { + size_t csum_value_size = Checksummer::get_csum_value_size(csum_type); + csum_data = buffer::create(csum_value_size * csum_count); + csum_data.zero(); + csum.append(csum_data); + + switch (csum_type) { + case Checksummer::CSUM_XXHASH32: + { + Checksummer::xxhash32::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate<Checksummer::xxhash32>( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + case Checksummer::CSUM_XXHASH64: + { + Checksummer::xxhash64::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate<Checksummer::xxhash64>( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + case Checksummer::CSUM_CRC32C: + { + Checksummer::crc32c::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate<Checksummer::crc32c>( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + default: + break; + } + } + + encode(csum_count, osd_op.outdata); + osd_op.outdata.claim_append(csum); + return 0; +} + +struct C_ExtentCmpRead : public Context { + PrimaryLogPG *primary_log_pg; + OSDOp &osd_op; + ceph_le64 read_length{}; + bufferlist read_bl; + Context *fill_extent_ctx; + + C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op, + boost::optional<uint32_t> maybe_crc, uint64_t size, + OSDService *osd, hobject_t soid, __le32 flags) + : primary_log_pg(primary_log_pg), osd_op(osd_op), + fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval, + &read_bl, maybe_crc, size, + osd, soid, flags)) { + } + ~C_ExtentCmpRead() override { + delete fill_extent_ctx; + } + + void finish(int r) override { + if (r == -ENOENT) { + osd_op.rval = 0; + read_bl.clear(); + delete fill_extent_ctx; + } else { + fill_extent_ctx->complete(r); + } + fill_extent_ctx = nullptr; + + if (osd_op.rval >= 0) { + osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl); + } + } +}; + +int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op) +{ + dout(20) << __func__ << dendl; + ceph_osd_op& op = osd_op.op; + + auto& oi = ctx->new_obs.oi; + uint64_t size = oi.size; + if ((oi.truncate_seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size)) { + size = op.extent.truncate_size; + } + + if (op.extent.offset >= size) { + op.extent.length = 0; + } else if (op.extent.offset + op.extent.length > size) { + op.extent.length = size - op.extent.offset; + } + + if (op.extent.length == 0) { + dout(20) << __func__ << " zero length extent" << dendl; + return finish_extent_cmp(osd_op, bufferlist{}); + } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) { + dout(20) << __func__ << " object DNE" << dendl; + return finish_extent_cmp(osd_op, {}); + } else if (pool.info.is_erasure()) { + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. 
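+ // same whole-object digest optimization as do_checksum(): pass the crc only when the read can span the entire object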
+ boost::optional<uint32_t> maybe_crc; + if (oi.is_data_digest() && op.checksum.offset == 0 && + op.checksum.length >= oi.size) { + maybe_crc = oi.data_digest; + } + + // async read + auto& soid = oi.soid; + auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size, + osd, soid, op.flags); + ctx->pending_async_reads.push_back({ + {op.extent.offset, op.extent.length, op.flags}, + {&extent_cmp_ctx->read_bl, extent_cmp_ctx}}); + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + return -EINPROGRESS; + } + + // sync read + vector<OSDOp> read_ops(1); + OSDOp& read_op = read_ops[0]; + + read_op.op.op = CEPH_OSD_OP_SYNC_READ; + read_op.op.extent.offset = op.extent.offset; + read_op.op.extent.length = op.extent.length; + read_op.op.extent.truncate_seq = op.extent.truncate_seq; + read_op.op.extent.truncate_size = op.extent.truncate_size; + + int result = do_osd_ops(ctx, read_ops); + if (result < 0) { + derr << __func__ << " failed " << result << dendl; + return result; + } + return finish_extent_cmp(osd_op, read_op.outdata); +} + +int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl) +{ + for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) { + char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0); + if (osd_op.indata[idx] != read_byte) { + return (-MAX_ERRNO - idx); + } + } + + return 0; +} + +int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) { + dout(20) << __func__ << dendl; + auto& op = osd_op.op; + auto& oi = ctx->new_obs.oi; + auto& soid = oi.soid; + __u32 seq = oi.truncate_seq; + uint64_t size = oi.size; + bool trimmed_read = false; + + dout(30) << __func__ << " oi.size: " << oi.size << dendl; + dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl; + dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl; + dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl; + + // are we beyond truncate_size? + if ( (seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size) && + (size > op.extent.truncate_size) ) + size = op.extent.truncate_size; + + if (op.extent.length == 0) //length is zero mean read the whole object + op.extent.length = size; + + if (op.extent.offset >= size) { + op.extent.length = 0; + trimmed_read = true; + } else if (op.extent.offset + op.extent.length > size) { + op.extent.length = size - op.extent.offset; + trimmed_read = true; + } + + dout(30) << __func__ << "op.extent.length is now " << op.extent.length << dendl; + + // read into a buffer + int result = 0; + if (trimmed_read && op.extent.length == 0) { + // read size was trimmed to zero and it is expected to do nothing + // a read operation of 0 bytes does *not* do nothing, this is why + // the trimmed_read boolean is needed + } else if (pool.info.is_erasure()) { + // The initialisation below is required to silence a false positive + // -Wmaybe-uninitialized warning + boost::optional<uint32_t> maybe_crc = boost::make_optional(false, uint32_t()); + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. FillInVerifyExtent will + // will check the oi.size again. 
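+ // the read is queued on ctx->pending_async_reads; FillInVerifyExtent records length/rval on completion and ReadFinisher hands that rval back when do_osd_ops re-runs this op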
+ if (oi.is_data_digest() && op.extent.offset == 0 && + op.extent.length >= oi.size) + maybe_crc = oi.data_digest; + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(op.extent.offset, op.extent.length, op.flags), + make_pair(&osd_op.outdata, + new FillInVerifyExtent(&op.extent.length, &osd_op.rval, + &osd_op.outdata, maybe_crc, oi.size, + osd, soid, op.flags)))); + dout(10) << " async_read noted for " << soid << dendl; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + } else { + int r = pgbackend->objects_read_sync( + soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata); + // whole object? can we verify the checksum? + if (r >= 0 && op.extent.offset == 0 && + (uint64_t)r == oi.size && oi.is_data_digest()) { + uint32_t crc = osd_op.outdata.crc32c(-1); + if (oi.data_digest != crc) { + osd->clog->error() << info.pgid << std::hex + << " full-object read crc 0x" << crc + << " != expected 0x" << oi.data_digest + << std::dec << " on " << soid; + r = -EIO; // try repair later + } + } + if (r == -EIO) { + r = rep_repair_primary_object(soid, ctx); + } + if (r >= 0) + op.extent.length = r; + else if (r == -EAGAIN) { + result = -EAGAIN; + } else { + result = r; + op.extent.length = 0; + } + dout(10) << " read got " << r << " / " << op.extent.length + << " bytes from obj " << soid << dendl; + } + if (result >= 0) { + ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10); + ctx->delta_stats.num_rd++; + } + return result; +} + +int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { + dout(20) << __func__ << dendl; + auto& op = osd_op.op; + auto& oi = ctx->new_obs.oi; + auto& soid = oi.soid; + + if (op.extent.truncate_seq) { + dout(0) << "sparse_read does not support truncation sequence " << dendl; + return -EINVAL; + } + + ++ctx->num_read; + if (pool.info.is_erasure()) { + // translate sparse read to a normal one if not supported + uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + if (offset > oi.size) { + length = 0; + } else if (offset + length > oi.size) { + length = oi.size - offset; + } + + if (length > 0) { + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(offset, length, op.flags), + make_pair( + &osd_op.outdata, + new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset, + &op.extent.length)))); + dout(10) << " async_read (was sparse_read) noted for " << soid << dendl; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + } else { + dout(10) << " sparse read ended up empty for " << soid << dendl; + map<uint64_t, uint64_t> extents; + encode(extents, osd_op.outdata); + } + } else { + // read into a buffer + map<uint64_t, uint64_t> m; + uint32_t total_read = 0; + int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN, + info.pgid.shard), + op.extent.offset, op.extent.length, m); + if (r < 0) { + return r; + } + + map<uint64_t, uint64_t>::iterator miter; + bufferlist data_bl; + uint64_t last = op.extent.offset; + for (miter = m.begin(); miter != m.end(); ++miter) { + // verify hole? 
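+ // with osd_verify_sparse_read_holes set, read back each gap fiemap reported and log an error if it is not all zeros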
+ if (cct->_conf->osd_verify_sparse_read_holes && + last < miter->first) { + bufferlist t; + uint64_t len = miter->first - last; + r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t); + if (r < 0) { + osd->clog->error() << coll << " " << soid + << " sparse-read failed to read: " + << r; + } else if (!t.is_zero()) { + osd->clog->error() << coll << " " << soid + << " sparse-read found data in hole " + << last << "~" << len; + } + } + + bufferlist tmpbl; + r = pgbackend->objects_read_sync(soid, miter->first, miter->second, + op.flags, &tmpbl); + if (r == -EIO) { + r = rep_repair_primary_object(soid, ctx); + } + if (r < 0) { + return r; + } + + // this is usually happen when we get extent that exceeds the actual file + // size + if (r < (int)miter->second) + miter->second = r; + total_read += r; + dout(10) << "sparse-read " << miter->first << "@" << miter->second + << dendl; + data_bl.claim_append(tmpbl); + last = miter->first + r; + } + + // verify trailing hole? + if (cct->_conf->osd_verify_sparse_read_holes) { + uint64_t end = std::min<uint64_t>(op.extent.offset + op.extent.length, + oi.size); + if (last < end) { + bufferlist t; + uint64_t len = end - last; + r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t); + if (r < 0) { + osd->clog->error() << coll << " " << soid + << " sparse-read failed to read: " << r; + } else if (!t.is_zero()) { + osd->clog->error() << coll << " " << soid + << " sparse-read found data in hole " + << last << "~" << len; + } + } + } + + // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read. + // Maybe at first, there is no much whole objects. With continued use, more + // and more whole object exist. So from this point, for spare-read add + // checksum make sense. + if (total_read == oi.size && oi.is_data_digest()) { + uint32_t crc = data_bl.crc32c(-1); + if (oi.data_digest != crc) { + osd->clog->error() << info.pgid << std::hex + << " full-object read crc 0x" << crc + << " != expected 0x" << oi.data_digest + << std::dec << " on " << soid; + r = rep_repair_primary_object(soid, ctx); + if (r < 0) { + return r; + } + } + } + + op.extent.length = total_read; + + encode(m, osd_op.outdata); // re-encode since it might be modified + ::encode_destructively(data_bl, osd_op.outdata); + + dout(10) << " sparse_read got " << total_read << " bytes from object " + << soid << dendl; + } + + ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10); + ctx->delta_stats.num_rd++; + return 0; +} + +int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) +{ + int result = 0; + SnapSetContext *ssc = ctx->obc->ssc; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + const bool skip_data_digest = osd->store->has_builtin_csum() && + osd->osd_skip_data_digest; + + PGTransaction* t = ctx->op_t.get(); + + dout(10) << "do_osd_op " << soid << " " << ops << dendl; + + ctx->current_osd_subop_num = 0; + for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) { + OSDOp& osd_op = *p; + ceph_osd_op& op = osd_op.op; + + OpFinisher* op_finisher = nullptr; + { + auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num); + if (op_finisher_it != ctx->op_finishers.end()) { + op_finisher = op_finisher_it->second.get(); + } + } + + // TODO: check endianness (__le32 vs uint32_t, etc.) 
+ // The fields in ceph_osd_op are little-endian (according to the definition in rados.h), + // but the code in this function seems to treat them as native-endian. What should the + // tracepoints do? + tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags); + + dout(10) << "do_osd_op " << osd_op << dendl; + + auto bp = osd_op.indata.cbegin(); + + // user-visible modifcation? + switch (op.op) { + // non user-visible modifications + case CEPH_OSD_OP_WATCH: + case CEPH_OSD_OP_CACHE_EVICT: + case CEPH_OSD_OP_CACHE_FLUSH: + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + case CEPH_OSD_OP_UNDIRTY: + case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly + case CEPH_OSD_OP_CACHE_PIN: + case CEPH_OSD_OP_CACHE_UNPIN: + case CEPH_OSD_OP_SET_REDIRECT: + case CEPH_OSD_OP_TIER_PROMOTE: + break; + default: + if (op.op & CEPH_OSD_OP_MODE_WR) + ctx->user_modify = true; + } + + // munge -1 truncate to 0 truncate + if (ceph_osd_op_uses_extent(op.op) && + op.extent.truncate_seq == 1 && + op.extent.truncate_size == (-1ULL)) { + op.extent.truncate_size = 0; + op.extent.truncate_seq = 0; + } + + // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes) + if (op.op == CEPH_OSD_OP_ZERO && + obs.exists && + op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) && + op.extent.length >= 1 && + op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) && + op.extent.offset + op.extent.length >= oi.size) { + if (op.extent.offset >= oi.size) { + // no-op + goto fail; + } + dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length + << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl; + op.op = CEPH_OSD_OP_TRUNCATE; + } + + switch (op.op) { + + // --- READS --- + + case CEPH_OSD_OP_CMPEXT: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + + if (op_finisher == nullptr) { + result = do_extent_cmp(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_SYNC_READ: + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + // fall through + case CEPH_OSD_OP_READ: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + if (op_finisher == nullptr) { + if (!ctx->data_off) { + ctx->data_off = op.extent.offset; + } + result = do_read(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_CHECKSUM: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type, + op.checksum.offset, op.checksum.length, + op.checksum.chunk_size); + + if (op_finisher == nullptr) { + result = do_checksum(ctx, osd_op, &bp); + } else { + result = op_finisher->execute(); + } + } + break; + + /* map extents */ + case CEPH_OSD_OP_MAPEXT: + tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_read; + { + // read into a buffer + bufferlist bl; + int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN, + info.pgid.shard), + op.extent.offset, 
op.extent.length, bl); + osd_op.outdata.claim(bl); + if (r < 0) + result = r; + else + ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + ctx->delta_stats.num_rd++; + dout(10) << " map_extents done on object " << soid << dendl; + } + break; + + /* map extents */ + case CEPH_OSD_OP_SPARSE_READ: + tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + if (op_finisher == nullptr) { + result = do_sparse_read(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_CALL: + { + string cname, mname; + bufferlist indata; + try { + bp.copy(op.cls.class_len, cname); + bp.copy(op.cls.method_len, mname); + bp.copy(op.cls.indata_len, indata); + } catch (buffer::error& e) { + dout(10) << "call unable to decode class + method + indata" << dendl; + dout(30) << "in dump: "; + osd_op.indata.hexdump(*_dout); + *_dout << dendl; + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???"); + break; + } + tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str()); + + ClassHandler::ClassData *cls; + result = osd->class_handler->open_class(cname, &cls); + ceph_assert(result == 0); // init_op_flags() already verified this works. + + ClassHandler::ClassMethod *method = cls->get_method(mname.c_str()); + if (!method) { + dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl; + result = -EOPNOTSUPP; + break; + } + + int flags = method->get_flags(); + if (flags & CLS_METHOD_WR) + ctx->user_modify = true; + + bufferlist outdata; + dout(10) << "call method " << cname << "." << mname << dendl; + int prev_rd = ctx->num_read; + int prev_wr = ctx->num_write; + result = method->exec((cls_method_context_t)&ctx, indata, outdata); + + if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) { + derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl; + result = -EIO; + break; + } + if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) { + derr << "method " << cname << "." 
<< mname << " tried to update object but is not marked WR" << dendl; + result = -EIO; + break; + } + + dout(10) << "method called response length=" << outdata.length() << dendl; + op.extent.length = outdata.length(); + osd_op.outdata.claim_append(outdata); + dout(30) << "out dump: "; + osd_op.outdata.hexdump(*_dout); + *_dout << dendl; + } + break; + + case CEPH_OSD_OP_STAT: + // note: stat does not require RD + { + tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val); + + if (obs.exists && !oi.is_whiteout()) { + encode(oi.size, osd_op.outdata); + encode(oi.mtime, osd_op.outdata); + dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl; + } else { + result = -ENOENT; + dout(10) << "stat oi object does not exist" << dendl; + } + + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_ISDIRTY: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val); + bool is_dirty = obs.oi.is_dirty(); + encode(is_dirty, osd_op.outdata); + ctx->delta_stats.num_rd++; + result = 0; + } + break; + + case CEPH_OSD_OP_UNDIRTY: + ++ctx->num_write; + { + tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val); + if (oi.is_dirty()) { + ctx->undirty = true; // see make_writeable() + ctx->modify = true; + ctx->delta_stats.num_wr++; + } + result = 0; + } + break; + + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + ++ctx->num_write; + { + tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val); + if (ctx->lock_type != ObjectContext::RWState::RWNONE) { + dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl; + result = -EINVAL; + break; + } + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + if (oi.is_dirty()) { + result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + } + break; + + case CEPH_OSD_OP_CACHE_FLUSH: + ++ctx->num_write; + { + tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val); + if (ctx->lock_type == ObjectContext::RWState::RWNONE) { + dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl; + result = -EINVAL; + break; + } + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + hobject_t missing; + if (oi.is_dirty()) { + result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + // Check special return value which has set missing_return + if (result == -ENOENT) { + dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl; + ceph_assert(!missing.is_min()); + wait_for_unreadable_object(missing, ctx->op); + // Error code which is used elsewhere when wait_for_unreadable_object() is used + result = -EAGAIN; + } + } + break; + + case CEPH_OSD_OP_CACHE_EVICT: + ++ctx->num_write; + { + tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) { + result = -EINVAL; + break; + } + if (!obs.exists) { + 
result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + if (oi.is_dirty()) { + result = -EBUSY; + break; + } + if (!oi.watchers.empty()) { + result = -EBUSY; + break; + } + if (soid.snap == CEPH_NOSNAP) { + result = _verify_no_head_clones(soid, ssc->snapset); + if (result < 0) + break; + } + result = _delete_oid(ctx, true, false); + if (result >= 0) { + // mark that this is a cache eviction to avoid triggering normal + // make_writeable() clone creation in finish_ctx() + ctx->cache_evict = true; + } + osd->logger->inc(l_osd_tier_evict); + } + break; + + case CEPH_OSD_OP_GETXATTR: + ++ctx->num_read; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + int r = getattr_maybe_cache( + ctx->obc, + name, + &(osd_op.outdata)); + if (r >= 0) { + op.xattr.value_len = osd_op.outdata.length(); + result = 0; + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + } else + result = r; + + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_GETXATTRS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val); + map<string, bufferlist> out; + result = getattrs_maybe_cache( + ctx->obc, + &out); + + bufferlist bl; + encode(out, bl); + ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + ctx->delta_stats.num_rd++; + osd_op.outdata.claim_append(bl); + } + break; + + case CEPH_OSD_OP_CMPXATTR: + ++ctx->num_read; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + name[op.xattr.name_len + 1] = 0; + + bufferlist xattr; + result = getattr_maybe_cache( + ctx->obc, + name, + &xattr); + if (result < 0 && result != -EEXIST && result != -ENODATA) + break; + + ctx->delta_stats.num_rd++; + ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10); + + switch (op.xattr.cmp_mode) { + case CEPH_OSD_CMPXATTR_MODE_STRING: + { + string val; + bp.copy(op.xattr.value_len, val); + val[op.xattr.value_len] = 0; + dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val + << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl; + result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr); + } + break; + + case CEPH_OSD_CMPXATTR_MODE_U64: + { + uint64_t u64val; + try { + decode(u64val, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + goto fail; + } + dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val + << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl; + result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr); + } + break; + + default: + dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl; + result = -EINVAL; + } + + if (!result) { + dout(10) << "comparison returned false" << dendl; + result = -ECANCELED; + break; + } + if (result < 0) { + dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl; + break; + } + + dout(10) << "comparison returned true" << dendl; + } + break; + + case CEPH_OSD_OP_ASSERT_VER: + ++ctx->num_read; + { + uint64_t ver = op.assert_ver.ver; + tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver); + if (!ver) + result = -EINVAL; + else if (ver < oi.user_version) + result = 
-ERANGE; + else if (ver > oi.user_version) + result = -EOVERFLOW; + } + break; + + case CEPH_OSD_OP_LIST_WATCHERS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val); + obj_list_watch_response_t resp; + + map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter; + for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end(); + ++oi_iter) { + dout(20) << "key cookie=" << oi_iter->first.first + << " entity=" << oi_iter->first.second << " " + << oi_iter->second << dendl; + ceph_assert(oi_iter->first.first == oi_iter->second.cookie); + ceph_assert(oi_iter->first.second.is_client()); + + watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie, + oi_iter->second.timeout_seconds, oi_iter->second.addr); + resp.entries.push_back(wi); + } + + resp.encode(osd_op.outdata, ctx->get_features()); + result = 0; + + ctx->delta_stats.num_rd++; + break; + } + + case CEPH_OSD_OP_LIST_SNAPS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val); + obj_list_snap_response_t resp; + + if (!ssc) { + ssc = ctx->obc->ssc = get_snapset_context(soid, false); + } + ceph_assert(ssc); + dout(20) << " snapset " << ssc->snapset << dendl; + + int clonecount = ssc->snapset.clones.size(); + clonecount++; // for head + resp.clones.reserve(clonecount); + for (auto clone_iter = ssc->snapset.clones.begin(); + clone_iter != ssc->snapset.clones.end(); ++clone_iter) { + clone_info ci; + ci.cloneid = *clone_iter; + + hobject_t clone_oid = soid; + clone_oid.snap = *clone_iter; + + auto p = ssc->snapset.clone_snaps.find(*clone_iter); + if (p == ssc->snapset.clone_snaps.end()) { + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_snaps found for oid " + << soid << " clone " << *clone_iter + << " snapset " << ssc->snapset; + result = -EINVAL; + break; + } + for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) { + ci.snaps.push_back(*q); + } + + dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl; + + map<snapid_t, interval_set<uint64_t> >::const_iterator coi; + coi = ssc->snapset.clone_overlap.find(ci.cloneid); + if (coi == ssc->snapset.clone_overlap.end()) { + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_overlap found for oid " + << soid << " clone " << *clone_iter; + result = -EINVAL; + break; + } + const interval_set<uint64_t> &o = coi->second; + ci.overlap.reserve(o.num_intervals()); + for (interval_set<uint64_t>::const_iterator r = o.begin(); + r != o.end(); ++r) { + ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(), + r.get_len())); + } + + map<snapid_t, uint64_t>::const_iterator si; + si = ssc->snapset.clone_size.find(ci.cloneid); + if (si == ssc->snapset.clone_size.end()) { + osd->clog->error() << "osd." 
<< osd->whoami + << ": inconsistent clone_size found for oid " + << soid << " clone " << *clone_iter; + result = -EINVAL; + break; + } + ci.size = si->second; + + resp.clones.push_back(ci); + } + if (result < 0) { + break; + } + if (!ctx->obc->obs.oi.is_whiteout()) { + ceph_assert(obs.exists); + clone_info ci; + ci.cloneid = CEPH_NOSNAP; + + //Size for HEAD is oi.size + ci.size = oi.size; + + resp.clones.push_back(ci); + } + resp.seq = ssc->snapset.seq; + + resp.encode(osd_op.outdata); + result = 0; + + ctx->delta_stats.num_rd++; + break; + } + + case CEPH_OSD_OP_NOTIFY: + ++ctx->num_read; + { + uint32_t timeout; + bufferlist bl; + + try { + uint32_t ver; // obsolete + decode(ver, bp); + decode(timeout, bp); + decode(bl, bp); + } catch (const buffer::error &e) { + timeout = 0; + } + tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout); + if (!timeout) + timeout = cct->_conf->osd_default_notify_timeout; + + notify_info_t n; + n.timeout = timeout; + n.notify_id = osd->get_next_id(get_osdmap_epoch()); + n.cookie = op.watch.cookie; + n.bl = bl; + ctx->notifies.push_back(n); + + // return our unique notify id to the client + encode(n.notify_id, osd_op.outdata); + } + break; + + case CEPH_OSD_OP_NOTIFY_ACK: + ++ctx->num_read; + { + try { + uint64_t notify_id = 0; + uint64_t watch_cookie = 0; + decode(notify_id, bp); + decode(watch_cookie, bp); + bufferlist reply_bl; + if (!bp.end()) { + decode(reply_bl, bp); + } + tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y"); + OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl); + ctx->notify_acks.push_back(ack); + } catch (const buffer::error &e) { + tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N"); + OpContext::NotifyAck ack( + // op.watch.cookie is actually the notify_id for historical reasons + op.watch.cookie + ); + ctx->notify_acks.push_back(ack); + } + } + break; + + case CEPH_OSD_OP_SETALLOCHINT: + ++ctx->num_write; + { + tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size); + maybe_create_new_object(ctx); + oi.expected_object_size = op.alloc_hint.expected_object_size; + oi.expected_write_size = op.alloc_hint.expected_write_size; + oi.alloc_hint_flags = op.alloc_hint.flags; + t->set_alloc_hint(soid, op.alloc_hint.expected_object_size, + op.alloc_hint.expected_write_size, + op.alloc_hint.flags); + result = 0; + } + break; + + + // --- WRITES --- + + // -- object data -- + + case CEPH_OSD_OP_WRITE: + ++ctx->num_write; + { // write + __u32 seq = oi.truncate_seq; + tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + if (op.extent.length != osd_op.indata.length()) { + result = -EINVAL; + break; + } + + if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED)) + op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + if (pool.info.requires_aligned_append() && + (op.extent.offset % pool.info.required_alignment() != 0)) { + result = -EOPNOTSUPP; + break; + } + + if (!obs.exists) { + if (pool.info.requires_aligned_append() && op.extent.offset) { + result = -EOPNOTSUPP; + break; + } + } else if (op.extent.offset != oi.size && + pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + + if (seq && (seq > op.extent.truncate_seq) && + (op.extent.offset + 
op.extent.length > oi.size)) { + // old write, arrived after trimtrunc + op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset); + dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq + << ", adjusting write length to " << op.extent.length << dendl; + bufferlist t; + t.substr_of(osd_op.indata, 0, op.extent.length); + osd_op.indata.swap(t); + } + if (op.extent.truncate_seq > seq) { + // write arrives before trimtrunc + if (obs.exists && !oi.is_whiteout()) { + dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq + << ", truncating to " << op.extent.truncate_size << dendl; + t->truncate(soid, op.extent.truncate_size); + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + if (oi.size > op.extent.truncate_size) { + interval_set<uint64_t> trim; + trim.insert(op.extent.truncate_size, + oi.size - op.extent.truncate_size); + ctx->modified_ranges.union_of(trim); + } + if (op.extent.truncate_size != oi.size) { + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.truncate_size); + } + } else { + dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq + << ", but object is new" << dendl; + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + } + } + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + maybe_create_new_object(ctx); + + if (op.extent.length == 0) { + if (op.extent.offset > oi.size) { + t->truncate( + soid, op.extent.offset); + truncate_update_size_and_usage(ctx->delta_stats, oi, + op.extent.offset); + } else { + t->nop(soid); + } + } else { + t->write( + soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags); + } + + if (op.extent.offset == 0 && op.extent.length >= oi.size + && !skip_data_digest) { + obs.oi.set_data_digest(osd_op.indata.crc32c(-1)); + } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) { + if (skip_data_digest) { + obs.oi.clear_data_digest(); + } else { + obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest)); + } + } else { + obs.oi.clear_data_digest(); + } + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + op.extent.offset, op.extent.length); + + } + break; + + case CEPH_OSD_OP_WRITEFULL: + ++ctx->num_write; + { // write full object + tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length); + + if (op.extent.length != osd_op.indata.length()) { + result = -EINVAL; + break; + } + result = check_offset_and_length( + 0, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED)) + op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + maybe_create_new_object(ctx); + if (pool.info.is_erasure()) { + t->truncate(soid, 0); + } else if (obs.exists && op.extent.length < oi.size) { + t->truncate(soid, op.extent.length); + } + if (op.extent.length) { + t->write(soid, 0, op.extent.length, osd_op.indata, op.flags); + } + if (!skip_data_digest) { + obs.oi.set_data_digest(osd_op.indata.crc32c(-1)); + } else { + obs.oi.clear_data_digest(); + } + + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + 0, op.extent.length, true); + } + break; + + case CEPH_OSD_OP_WRITESAME: + ++ctx->num_write; + tracepoint(osd, 
do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length); + result = do_writesame(ctx, osd_op); + break; + + case CEPH_OSD_OP_ROLLBACK : + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val); + result = _rollback_to(ctx, op); + break; + + case CEPH_OSD_OP_ZERO: + tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length); + if (pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + { // zero + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + ceph_assert(op.extent.length); + if (obs.exists && !oi.is_whiteout()) { + t->zero(soid, op.extent.offset, op.extent.length); + interval_set<uint64_t> ch; + ch.insert(op.extent.offset, op.extent.length); + ctx->modified_ranges.union_of(ch); + ctx->delta_stats.num_wr++; + oi.clear_data_digest(); + } else { + // no-op + } + } + break; + case CEPH_OSD_OP_CREATE: + ++ctx->num_write; + { + tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val); + int flags = le32_to_cpu(op.flags); + if (obs.exists && !oi.is_whiteout() && + (flags & CEPH_OSD_OP_FLAG_EXCL)) { + result = -EEXIST; /* this is an exclusive create */ + } else { + if (osd_op.indata.length()) { + auto p = osd_op.indata.cbegin(); + string category; + try { + decode(category, p); + } + catch (buffer::error& e) { + result = -EINVAL; + goto fail; + } + // category is no longer implemented. + } + if (result >= 0) { + maybe_create_new_object(ctx); + t->nop(soid); + } + } + } + break; + + case CEPH_OSD_OP_TRIMTRUNC: + op.extent.offset = op.extent.truncate_size; + // falling through + + case CEPH_OSD_OP_TRUNCATE: + tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + if (pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + { + // truncate + if (!obs.exists || oi.is_whiteout()) { + dout(10) << " object dne, truncate is a no-op" << dendl; + break; + } + + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + if (op.extent.truncate_seq) { + ceph_assert(op.extent.offset == op.extent.truncate_size); + if (op.extent.truncate_seq <= oi.truncate_seq) { + dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq + << ", no-op" << dendl; + break; // old + } + dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq + << ", truncating" << dendl; + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + } + + maybe_create_new_object(ctx); + t->truncate(soid, op.extent.offset); + if (oi.size > op.extent.offset) { + interval_set<uint64_t> trim; + trim.insert(op.extent.offset, oi.size-op.extent.offset); + ctx->modified_ranges.union_of(trim); + } + if (op.extent.offset != oi.size) { + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.offset); + } + ctx->delta_stats.num_wr++; + // do no set exists, or we will break above DELETE -> TRUNCATE munging. 
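+ // truncation changes the object's contents, so the whole-object data digest is no longer valid and is cleared below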
+ + oi.clear_data_digest(); + } + break; + + case CEPH_OSD_OP_DELETE: + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val); + { + if (oi.has_manifest()) { + if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && oi.manifest.is_redirect()) { + ctx->register_on_commit( + [oi, ctx, this](){ + object_locator_t target_oloc(oi.manifest.redirect_target); + refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target, + SnapContext(), false, NULL, 0); + }); + } else if (oi.manifest.is_chunked()) { + ctx->register_on_commit( + [oi, ctx, this](){ + for (auto p : oi.manifest.chunk_map) { + if (p.second.has_reference()) { + object_locator_t target_oloc(p.second.oid); + refcount_manifest(ctx->obc, target_oloc, p.second.oid, + SnapContext(), false, NULL, p.first); + } + } + }); + } + } + result = _delete_oid(ctx, false, ctx->ignore_cache); + } + break; + + case CEPH_OSD_OP_WATCH: + ++ctx->num_write; + { + tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val, + op.watch.cookie, op.watch.op); + if (!obs.exists) { + result = -ENOENT; + break; + } + uint64_t cookie = op.watch.cookie; + entity_name_t entity = ctx->reqid.name; + ObjectContextRef obc = ctx->obc; + + dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op) + << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie + << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl; + dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl; + dout(10) << "watch: peer_addr=" + << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl; + + uint32_t timeout = cct->_conf->osd_client_watch_timeout; + if (op.watch.timeout != 0) { + timeout = op.watch.timeout; + } + + watch_info_t w(cookie, timeout, + ctx->op->get_req()->get_connection()->get_peer_addr()); + if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH || + op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) { + if (oi.watchers.count(make_pair(cookie, entity))) { + dout(10) << " found existing watch " << w << " by " << entity << dendl; + } else { + dout(10) << " registered new watch " << w << " by " << entity << dendl; + oi.watchers[make_pair(cookie, entity)] = w; + t->nop(soid); // make sure update the object_info on disk! 
+ } + bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH); + ctx->watch_connects.push_back(make_pair(w, will_ping)); + } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) { + if (!oi.watchers.count(make_pair(cookie, entity))) { + result = -ENOTCONN; + break; + } + dout(10) << " found existing watch " << w << " by " << entity << dendl; + ctx->watch_connects.push_back(make_pair(w, true)); + } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) { + /* Note: WATCH with PING doesn't cause may_write() to return true, + * so if there is nothing else in the transaction, this is going + * to run do_osd_op_effects, but not write out a log entry */ + if (!oi.watchers.count(make_pair(cookie, entity))) { + result = -ENOTCONN; + break; + } + map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p = + obc->watchers.find(make_pair(cookie, entity)); + if (p == obc->watchers.end() || + !p->second->is_connected()) { + // client needs to reconnect + result = -ETIMEDOUT; + break; + } + dout(10) << " found existing watch " << w << " by " << entity << dendl; + p->second->got_ping(ceph_clock_now()); + result = 0; + } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) { + map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter = + oi.watchers.find(make_pair(cookie, entity)); + if (oi_iter != oi.watchers.end()) { + dout(10) << " removed watch " << oi_iter->second << " by " + << entity << dendl; + oi.watchers.erase(oi_iter); + t->nop(soid); // update oi on disk + ctx->watch_disconnects.push_back( + watch_disconnect_t(cookie, entity, false)); + } else { + dout(10) << " can't remove: no watch by " << entity << dendl; + } + } + } + break; + + case CEPH_OSD_OP_CACHE_PIN: + tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val); + if ((!pool.info.is_tier() || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) { + result = -EINVAL; + dout(10) << " pin object is only allowed on the cache tier " << dendl; + break; + } + ++ctx->num_write; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + + if (!oi.is_cache_pinned()) { + oi.set_flag(object_info_t::FLAG_CACHE_PIN); + ctx->modify = true; + ctx->delta_stats.num_objects_pinned++; + ctx->delta_stats.num_wr++; + } + result = 0; + } + break; + + case CEPH_OSD_OP_CACHE_UNPIN: + tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val); + if ((!pool.info.is_tier() || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) { + result = -EINVAL; + dout(10) << " pin object is only allowed on the cache tier " << dendl; + break; + } + ++ctx->num_write; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + + if (oi.is_cache_pinned()) { + oi.clear_flag(object_info_t::FLAG_CACHE_PIN); + ctx->modify = true; + ctx->delta_stats.num_objects_pinned--; + ctx->delta_stats.num_wr++; + } + result = 0; + } + break; + + case CEPH_OSD_OP_SET_REDIRECT: + ++ctx->num_write; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) { + result = -EOPNOTSUPP; + break; + } + + object_t target_name; + object_locator_t target_oloc; + snapid_t target_snapid = (uint64_t)op.copy_from.snapid; + version_t target_version = op.copy_from.src_version; + try { + decode(target_name, bp); + decode(target_oloc, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + goto fail; + } + pg_t raw_pg; + get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg); + hobject_t 
target(target_name, target_oloc.key, target_snapid, + raw_pg.ps(), raw_pg.pool(), + target_oloc.nspace); + if (target == soid) { + dout(20) << " set-redirect self is invalid" << dendl; + result = -EINVAL; + break; + } + + bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE); + bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + if (has_reference) { + result = -EINVAL; + dout(5) << " the object is already a manifest " << dendl; + break; + } + if (op_finisher == nullptr && need_reference) { + // start + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new SetManifestFinisher(osd_op)); + RefCountCallback *fin = new RefCountCallback( + this, ctx, osd_op, get_last_peering_reset()); + refcount_manifest(ctx->obc, target_oloc, target, SnapContext(), + true, fin, 0); + result = -EINPROGRESS; + } else { + // finish + if (op_finisher) { + result = op_finisher->execute(); + ceph_assert(result == 0); + } + + if (!oi.has_manifest() && !oi.manifest.is_redirect()) + ctx->delta_stats.num_objects_manifest++; + + oi.set_flag(object_info_t::FLAG_MANIFEST); + oi.manifest.redirect_target = target; + oi.manifest.type = object_manifest_t::TYPE_REDIRECT; + t->truncate(soid, 0); + if (oi.is_omap() && pool.info.supports_omap()) { + t->omap_clear(soid); + obs.oi.clear_omap_digest(); + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + ctx->delta_stats.num_bytes -= oi.size; + oi.size = 0; + oi.new_object(); + oi.user_version = target_version; + ctx->user_at_version = target_version; + /* rm_attrs */ + map<string,bufferlist> rmattrs; + result = getattrs_maybe_cache(ctx->obc, &rmattrs); + if (result < 0) { + dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl; + return result; + } + map<string, bufferlist>::iterator iter; + for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) { + const string& name = iter->first; + t->rmattr(soid, name); + } + if (!has_reference && need_reference) { + oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + } + dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl; + if (op_finisher) { + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + } + + break; + + case CEPH_OSD_OP_SET_CHUNK: + ++ctx->num_write; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) { + result = -EOPNOTSUPP; + break; + } + + object_locator_t tgt_oloc; + uint64_t src_offset, src_length, tgt_offset; + object_t tgt_name; + try { + decode(src_offset, bp); + decode(src_length, bp); + decode(tgt_oloc, bp); + decode(tgt_name, bp); + decode(tgt_offset, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + goto fail; + } + + if (!src_length) { + result = -EINVAL; + goto fail; + } + + for (auto &p : oi.manifest.chunk_map) { + if ((p.first <= src_offset && p.first + p.second.length > src_offset) || + (p.first > src_offset && p.first <= src_offset + src_length)) { + dout(20) << __func__ << " overlapped !! 
offset: " << src_offset << " length: " << src_length + << " chunk_info: " << p << dendl; + result = -EOPNOTSUPP; + goto fail; + } + } + + if (!oi.manifest.is_chunked()) { + oi.manifest.clear(); + } + + pg_t raw_pg; + chunk_info_t chunk_info; + get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg); + hobject_t target(tgt_name, tgt_oloc.key, snapid_t(), + raw_pg.ps(), raw_pg.pool(), + tgt_oloc.nspace); + bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE); + bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) && + (oi.manifest.chunk_map[src_offset].flags & chunk_info_t::FLAG_HAS_REFERENCE); + if (has_reference) { + result = -EINVAL; + dout(5) << " the object is already a manifest " << dendl; + break; + } + if (op_finisher == nullptr && need_reference) { + // start + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new SetManifestFinisher(osd_op)); + RefCountCallback *fin = new RefCountCallback( + this, ctx, osd_op, get_last_peering_reset()); + refcount_manifest(ctx->obc, tgt_oloc, target, SnapContext(), + true, fin, src_offset); + result = -EINPROGRESS; + } else { + if (op_finisher) { + result = op_finisher->execute(); + ceph_assert(result == 0); + } + + chunk_info_t chunk_info; + chunk_info.set_flag(chunk_info_t::FLAG_MISSING); + chunk_info.oid = target; + chunk_info.offset = tgt_offset; + chunk_info.length= src_length; + oi.manifest.chunk_map[src_offset] = chunk_info; + if (!oi.has_manifest() && !oi.manifest.is_chunked()) + ctx->delta_stats.num_objects_manifest++; + oi.set_flag(object_info_t::FLAG_MANIFEST); + oi.manifest.type = object_manifest_t::TYPE_CHUNKED; + if (!has_reference && need_reference) { + oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE); + } + if (need_reference && pool.info.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE) { + oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT); + } + ctx->modify = true; + + dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version + << " chunk_info: " << chunk_info << dendl; + if (op_finisher) { + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + } + + break; + + case CEPH_OSD_OP_TIER_PROMOTE: + ++ctx->num_write; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) { + result = -EOPNOTSUPP; + break; + } + if (!obs.oi.has_manifest()) { + result = 0; + break; + } + + if (op_finisher == nullptr) { + PromoteManifestCallback *cb; + object_locator_t my_oloc; + hobject_t src_hoid; + + if (obs.oi.manifest.is_chunked()) { + src_hoid = obs.oi.soid; + cb = new PromoteManifestCallback(ctx->obc, this, ctx); + } else if (obs.oi.manifest.is_redirect()) { + object_locator_t src_oloc(obs.oi.manifest.redirect_target); + my_oloc = src_oloc; + src_hoid = obs.oi.manifest.redirect_target; + cb = new PromoteManifestCallback(ctx->obc, this, ctx); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new PromoteFinisher(cb)); + unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | + CEPH_OSD_COPY_FROM_FLAG_RWORDERED; + unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL; + start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags, + obs.oi.soid.snap == CEPH_NOSNAP, + src_fadvise_flags, 0); + + 
dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl; + result = -EINPROGRESS; + } else { + result = op_finisher->execute(); + ceph_assert(result == 0); + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + + break; + + case CEPH_OSD_OP_UNSET_MANIFEST: + ++ctx->num_write; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (!oi.has_manifest()) { + result = -EOPNOTSUPP; + break; + } + if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) { + result = -EOPNOTSUPP; + break; + } + + if (oi.manifest.is_redirect()) { + if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) { + ctx->register_on_commit( + [oi, ctx, this](){ + object_locator_t target_oloc(oi.manifest.redirect_target); + refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target, + SnapContext(), false, NULL, 0); + }); + } + } else if (oi.manifest.is_chunked()) { + ctx->register_on_commit( + [oi, ctx, this](){ + for (auto p : oi.manifest.chunk_map) { + if (p.second.flags & chunk_info_t::FLAG_HAS_REFERENCE) { + object_locator_t target_oloc(p.second.oid); + refcount_manifest(ctx->obc, target_oloc, p.second.oid, + SnapContext(), false, NULL, p.first); + } + } + }); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + + oi.clear_flag(object_info_t::FLAG_MANIFEST); + oi.manifest = object_manifest_t(); + ctx->delta_stats.num_objects_manifest--; + ctx->delta_stats.num_wr++; + ctx->modify = true; + } + + break; + + // -- object attrs -- + + case CEPH_OSD_OP_SETXATTR: + ++ctx->num_write; + { + if (cct->_conf->osd_max_attr_size > 0 && + op.xattr.value_len > cct->_conf->osd_max_attr_size) { + tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???"); + result = -EFBIG; + break; + } + unsigned max_name_len = + std::min<uint64_t>(osd->store->get_max_attr_name_length(), + cct->_conf->osd_max_attr_name_len); + if (op.xattr.name_len > max_name_len) { + result = -ENAMETOOLONG; + break; + } + maybe_create_new_object(ctx); + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + bufferlist bl; + bp.copy(op.xattr.value_len, bl); + t->setattr(soid, name, bl); + ctx->delta_stats.num_wr++; + } + break; + + case CEPH_OSD_OP_RMXATTR: + ++ctx->num_write; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + string name = "_" + aname; + t->rmattr(soid, name); + ctx->delta_stats.num_wr++; + } + break; + + + // -- fancy writers -- + case CEPH_OSD_OP_APPEND: + { + tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + // just do it inline; this works because we are happy to execute + // fancy op on replicas as well. 
+ vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITE; + newop.op.extent.offset = oi.size; + newop.op.extent.length = op.extent.length; + newop.op.extent.truncate_seq = oi.truncate_seq; + newop.indata = osd_op.indata; + result = do_osd_ops(ctx, nops); + osd_op.outdata.claim(newop.outdata); + } + break; + + case CEPH_OSD_OP_STARTSYNC: + t->nop(soid); + break; + + // -- trivial map -- + case CEPH_OSD_OP_TMAPGET: + tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + { + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_SYNC_READ; + newop.op.extent.offset = 0; + newop.op.extent.length = 0; + do_osd_ops(ctx, nops); + osd_op.outdata.claim(newop.outdata); + } + break; + + case CEPH_OSD_OP_TMAPPUT: + tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + { + //_dout_lock.Lock(); + //osd_op.data.hexdump(*_dout); + //_dout_lock.Unlock(); + + // verify sort order + bool unsorted = false; + if (true) { + bufferlist header; + decode(header, bp); + uint32_t n; + decode(n, bp); + string last_key; + while (n--) { + string key; + decode(key, bp); + dout(10) << "tmapput key " << key << dendl; + bufferlist val; + decode(val, bp); + if (key < last_key) { + dout(10) << "TMAPPUT is unordered; resorting" << dendl; + unsorted = true; + break; + } + last_key = key; + } + } + + // write it + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = osd_op.indata.length(); + newop.indata = osd_op.indata; + + if (unsorted) { + bp = osd_op.indata.begin(); + bufferlist header; + map<string, bufferlist> m; + decode(header, bp); + decode(m, bp); + ceph_assert(bp.end()); + bufferlist newbl; + encode(header, newbl); + encode(m, newbl); + newop.indata = newbl; + } + result = do_osd_ops(ctx, nops); + ceph_assert(result == 0); + } + break; + + case CEPH_OSD_OP_TMAPUP: + tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = do_tmapup(ctx, bp, osd_op); + break; + + case CEPH_OSD_OP_TMAP2OMAP: + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val); + result = do_tmap2omap(ctx, op.tmap2omap.flags); + break; + + // OMAP Read ops + case CEPH_OSD_OP_OMAPGETKEYS: + ++ctx->num_read; + { + string start_after; + uint64_t max_return; + try { + decode(start_after, bp); + decode(max_return, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0); + goto fail; + } + if (max_return > cct->_conf->osd_max_omap_entries_per_request) { + max_return = cct->_conf->osd_max_omap_entries_per_request; + } + tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return); + + bufferlist bl; + uint32_t num = 0; + bool truncated = false; + if (oi.is_omap()) { + ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( + ch, ghobject_t(soid) + ); + ceph_assert(iter); + iter->upper_bound(start_after); + for (num = 0; iter->valid(); ++num, iter->next()) { + if (num >= max_return || + bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(iter->key(), bl); + } + } 
// else return empty out_set + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(bl); + encode(truncated, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETVALS: + ++ctx->num_read; + { + string start_after; + uint64_t max_return; + string filter_prefix; + try { + decode(start_after, bp); + decode(max_return, bp); + decode(filter_prefix, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???"); + goto fail; + } + if (max_return > cct->_conf->osd_max_omap_entries_per_request) { + max_return = cct->_conf->osd_max_omap_entries_per_request; + } + tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str()); + + uint32_t num = 0; + bool truncated = false; + bufferlist bl; + if (oi.is_omap()) { + ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( + ch, ghobject_t(soid) + ); + if (!iter) { + result = -ENOENT; + goto fail; + } + iter->upper_bound(start_after); + if (filter_prefix > start_after) iter->lower_bound(filter_prefix); + for (num = 0; + iter->valid() && + iter->key().substr(0, filter_prefix.size()) == filter_prefix; + ++num, iter->next()) { + dout(20) << "Found key " << iter->key() << dendl; + if (num >= max_return || + bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(iter->key(), bl); + encode(iter->value(), bl); + } + } // else return empty out_set + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(bl); + encode(truncated, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETHEADER: + tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val); + if (!oi.is_omap()) { + // return empty header + break; + } + ++ctx->num_read; + { + osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + ++ctx->num_read; + { + set<string> keys_to_get; + try { + decode(keys_to_get, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???"); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str()); + map<string, bufferlist> out; + if (oi.is_omap()) { + osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out); + } // else return empty omap entries + encode(out, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAP_CMP: + ++ctx->num_read; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???"); + break; + } + map<string, pair<bufferlist, int> > assertions; + try { + decode(assertions, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???"); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, 
list_keys(assertions).c_str()); + + map<string, bufferlist> out; + + if (oi.is_omap()) { + set<string> to_get; + for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin(); + i != assertions.end(); + ++i) + to_get.insert(i->first); + int r = osd->store->omap_get_values(ch, ghobject_t(soid), + to_get, &out); + if (r < 0) { + result = r; + break; + } + } // else leave out empty + + //Should set num_rd_kb based on encode length of map + ctx->delta_stats.num_rd++; + + int r = 0; + bufferlist empty; + for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin(); + i != assertions.end(); + ++i) { + auto out_entry = out.find(i->first); + bufferlist &bl = (out_entry != out.end()) ? + out_entry->second : empty; + switch (i->second.second) { + case CEPH_OSD_CMPXATTR_OP_EQ: + if (!(bl == i->second.first)) { + r = -ECANCELED; + } + break; + case CEPH_OSD_CMPXATTR_OP_LT: + if (!(bl < i->second.first)) { + r = -ECANCELED; + } + break; + case CEPH_OSD_CMPXATTR_OP_GT: + if (!(bl > i->second.first)) { + r = -ECANCELED; + } + break; + default: + r = -EINVAL; + break; + } + if (r < 0) + break; + } + if (r < 0) { + result = r; + } + } + break; + + // OMAP Write ops + case CEPH_OSD_OP_OMAPSETVALS: + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + break; + } + ++ctx->num_write; + { + maybe_create_new_object(ctx); + bufferlist to_set_bl; + try { + decode_str_str_map_to_bl(bp, &to_set_bl); + } + catch (buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) { + dout(20) << "setting vals: " << dendl; + map<string,bufferlist> to_set; + bufferlist::const_iterator pt = to_set_bl.begin(); + decode(to_set, pt); + for (map<string, bufferlist>::iterator i = to_set.begin(); + i != to_set.end(); + ++i) { + dout(20) << "\t" << i->first << dendl; + } + } + t->omap_setkeys(soid, to_set_bl); + ctx->delta_stats.num_wr++; + ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10); + } + obs.oi.set_flag(object_info_t::FLAG_OMAP); + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPSETHEADER: + tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + { + maybe_create_new_object(ctx); + t->omap_setheader(soid, osd_op.indata); + ctx->delta_stats.num_wr++; + } + obs.oi.set_flag(object_info_t::FLAG_OMAP); + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPCLEAR: + tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + if (oi.is_omap()) { + t->omap_clear(soid); + ctx->delta_stats.num_wr++; + obs.oi.clear_omap_digest(); + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + } + break; + + case CEPH_OSD_OP_OMAPRMKEYS: + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + break; + } + ++ctx->num_write; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + break; 
+ } + bufferlist to_rm_bl; + try { + decode_str_set_to_bl(bp, &to_rm_bl); + } + catch (buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + t->omap_rmkeys(soid, to_rm_bl); + ctx->delta_stats.num_wr++; + } + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_COPY_GET: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(), + soid.snap.val); + if (op_finisher == nullptr) { + result = do_copy_get(ctx, bp, osd_op, ctx->obc); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_COPY_FROM: + ++ctx->num_write; + { + object_t src_name; + object_locator_t src_oloc; + snapid_t src_snapid = (uint64_t)op.copy_from.snapid; + version_t src_version = op.copy_from.src_version; + try { + decode(src_name, bp); + decode(src_oloc, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + tracepoint(osd, + do_osd_op_pre_copy_from, + soid.oid.name.c_str(), + soid.snap.val, + "???", + 0, + "???", + "???", + 0, + src_snapid, + src_version); + goto fail; + } + tracepoint(osd, + do_osd_op_pre_copy_from, + soid.oid.name.c_str(), + soid.snap.val, + src_name.name.c_str(), + src_oloc.pool, + src_oloc.key.c_str(), + src_oloc.nspace.c_str(), + src_oloc.hash, + src_snapid, + src_version); + if (op_finisher == nullptr) { + // start + pg_t raw_pg; + get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg); + hobject_t src(src_name, src_oloc.key, src_snapid, + raw_pg.ps(), raw_pg.pool(), + src_oloc.nspace); + if (src == soid) { + dout(20) << " copy from self is invalid" << dendl; + result = -EINVAL; + break; + } + CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op); + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new CopyFromFinisher(cb)); + start_copy(cb, ctx->obc, src, src_oloc, src_version, + op.copy_from.flags, + false, + op.copy_from.src_fadvise_flags, + op.flags); + result = -EINPROGRESS; + } else { + // finish + result = op_finisher->execute(); + ceph_assert(result == 0); + + // COPY_FROM cannot be executed multiple times -- it must restart + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + break; + + default: + tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op)); + dout(1) << "unrecognized osd op " << op.op + << " " << ceph_osd_op_name(op.op) + << dendl; + result = -EOPNOTSUPP; + } + + fail: + osd_op.rval = result; + tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result); + if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) && + result != -EAGAIN && result != -EINPROGRESS) + result = 0; + + if (result < 0) + break; + } + if (result < 0) { + dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl; + } + return result; +} + +int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals) +{ + if (ctx->new_obs.oi.size == 0) { + dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl; + return -ENODATA; + } + vector<OSDOp> nops(1); + OSDOp &newop = nops[0]; + newop.op.op = CEPH_OSD_OP_TMAPGET; + do_osd_ops(ctx, nops); + try { + bufferlist::const_iterator i = newop.outdata.begin(); + decode(*header, i); + (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining()); + } catch (...) 
{ + dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid + << dendl; + return -EINVAL; + } + dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid + << dendl; + return 0; +} + +int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid, + const SnapSet& ss) +{ + // verify that all clones have been evicted + dout(20) << __func__ << " verifying clones are absent " + << ss << dendl; + for (vector<snapid_t>::const_iterator p = ss.clones.begin(); + p != ss.clones.end(); + ++p) { + hobject_t clone_oid = soid; + clone_oid.snap = *p; + if (is_missing_object(clone_oid)) + return -EBUSY; + ObjectContextRef clone_obc = get_object_context(clone_oid, false); + if (clone_obc && clone_obc->obs.exists) { + dout(10) << __func__ << " cannot evict head before clone " + << clone_oid << dendl; + return -EBUSY; + } + if (copy_ops.count(clone_oid)) { + dout(10) << __func__ << " cannot evict head, pending promote on clone " + << clone_oid << dendl; + return -EBUSY; + } + } + return 0; +} + +inline int PrimaryLogPG::_delete_oid( + OpContext *ctx, + bool no_whiteout, // no whiteouts, no matter what. + bool try_no_whiteout) // try not to whiteout +{ + SnapSet& snapset = ctx->new_snapset; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + PGTransaction* t = ctx->op_t.get(); + + // cache: cache: set whiteout on delete? + bool whiteout = false; + if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE + && !no_whiteout + && !try_no_whiteout) { + whiteout = true; + } + + // in luminous or later, we can't delete the head if there are + // clones. we trust the caller passing no_whiteout has already + // verified they don't exist. + if (!snapset.clones.empty() || + (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) { + if (no_whiteout) { + dout(20) << __func__ << " has or will have clones but no_whiteout=1" + << dendl; + } else { + dout(20) << __func__ << " has or will have clones; will whiteout" + << dendl; + whiteout = true; + } + } + dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout + << " no_whiteout=" << (int)no_whiteout + << " try_no_whiteout=" << (int)try_no_whiteout + << dendl; + if (!obs.exists || (obs.oi.is_whiteout() && whiteout)) + return -ENOENT; + + t->remove(soid); + + if (oi.size > 0) { + interval_set<uint64_t> ch; + ch.insert(0, oi.size); + ctx->modified_ranges.union_of(ch); + } + + ctx->delta_stats.num_wr++; + if (soid.is_snap()) { + ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap)); + ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap); + } else { + ctx->delta_stats.num_bytes -= oi.size; + } + oi.size = 0; + oi.new_object(); + + // disconnect all watchers + for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p = + oi.watchers.begin(); + p != oi.watchers.end(); + ++p) { + dout(20) << __func__ << " will disconnect watcher " << p->first << dendl; + ctx->watch_disconnects.push_back( + watch_disconnect_t(p->first.first, p->first.second, true)); + } + oi.watchers.clear(); + + if (whiteout) { + dout(20) << __func__ << " setting whiteout on " << soid << dendl; + oi.set_flag(object_info_t::FLAG_WHITEOUT); + ctx->delta_stats.num_whiteouts++; + t->create(soid); + osd->logger->inc(l_osd_tier_whiteout); + return 0; + } + + // delete the head + ctx->delta_stats.num_objects--; + if (soid.is_snap()) + ctx->delta_stats.num_object_clones--; + if (oi.is_whiteout()) { + dout(20) << __func__ << " deleting whiteout on " << soid << dendl; + 
ctx->delta_stats.num_whiteouts--; + oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + if (oi.is_cache_pinned()) { + ctx->delta_stats.num_objects_pinned--; + } + if (oi.has_manifest()) { + ctx->delta_stats.num_objects_manifest--; + } + obs.exists = false; + return 0; +} + +int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) +{ + SnapSet& snapset = ctx->new_snapset; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + PGTransaction* t = ctx->op_t.get(); + snapid_t snapid = (uint64_t)op.snap.snapid; + hobject_t missing_oid; + + dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl; + + ObjectContextRef rollback_to; + + int ret = find_object_context( + hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(), + soid.get_namespace()), + &rollback_to, false, false, &missing_oid); + if (ret == -EAGAIN) { + /* clone must be missing */ + ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid)); + dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone " + << missing_oid << " (requested snapid: ) " << snapid << dendl; + block_write_on_degraded_snap(missing_oid, ctx->op); + return ret; + } + { + ObjectContextRef promote_obc; + cache_result_t tier_mode_result; + if (obs.exists && obs.oi.has_manifest()) { + tier_mode_result = + maybe_handle_manifest_detail( + ctx->op, + true, + rollback_to); + } else { + tier_mode_result = + maybe_handle_cache_detail( + ctx->op, + true, + rollback_to, + ret, + missing_oid, + true, + false, + &promote_obc); + } + switch (tier_mode_result) { + case cache_result_t::NOOP: + break; + case cache_result_t::BLOCKED_PROMOTE: + ceph_assert(promote_obc); + block_write_on_snap_rollback(soid, promote_obc, ctx->op); + return -EAGAIN; + case cache_result_t::BLOCKED_FULL: + block_write_on_full_cache(soid, ctx->op); + return -EAGAIN; + case cache_result_t::REPLIED_WITH_EAGAIN: + ceph_abort_msg("this can't happen, no rollback on replica"); + default: + ceph_abort_msg("must promote was set, other values are not valid"); + return -EAGAIN; + } + } + + if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) { + // there's no snapshot here, or there's no object. + // if there's no snapshot, we delete the object; otherwise, do nothing. + dout(20) << "_rollback_to deleting head on " << soid.oid + << " because got ENOENT|whiteout on find_object_context" << dendl; + if (ctx->obc->obs.oi.watchers.size()) { + // Cannot delete an object with watchers + ret = -EBUSY; + } else { + _delete_oid(ctx, false, false); + ret = 0; + } + } else if (ret) { + // ummm....huh? It *can't* return anything else at time of writing. + ceph_abort_msg("unexpected error code in _rollback_to"); + } else { //we got our context, let's use it to do the rollback! + hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid; + if (is_degraded_or_backfilling_object(rollback_to_sobject) || + is_degraded_on_async_recovery_target(rollback_to_sobject)) { + dout(20) << "_rollback_to attempted to roll back to a degraded object " + << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl; + block_write_on_degraded_snap(rollback_to_sobject, ctx->op); + ret = -EAGAIN; + } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) { + // rolling back to the head; we just need to clone it. 
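+      // the head already holds the desired contents; marking the ctx modified
+      // is enough for the normal write path to log the op (and to clone the
+      // head first if the snap context requires it).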
+ ctx->modify = true; + } else { + /* 1) Delete current head + * 2) Clone correct snapshot into head + * 3) Calculate clone_overlaps by following overlaps + * forward from rollback snapshot */ + dout(10) << "_rollback_to deleting " << soid.oid + << " and rolling back to old snap" << dendl; + + if (obs.exists) { + t->remove(soid); + } + t->clone(soid, rollback_to_sobject); + t->add_obc(rollback_to); + + map<snapid_t, interval_set<uint64_t> >::iterator iter = + snapset.clone_overlap.lower_bound(snapid); + ceph_assert(iter != snapset.clone_overlap.end()); + interval_set<uint64_t> overlaps = iter->second; + for ( ; + iter != snapset.clone_overlap.end(); + ++iter) + overlaps.intersection_of(iter->second); + + if (obs.oi.size > 0) { + interval_set<uint64_t> modified; + modified.insert(0, obs.oi.size); + overlaps.intersection_of(modified); + modified.subtract(overlaps); + ctx->modified_ranges.union_of(modified); + } + + // Adjust the cached objectcontext + maybe_create_new_object(ctx, true); + ctx->delta_stats.num_bytes -= obs.oi.size; + ctx->delta_stats.num_bytes += rollback_to->obs.oi.size; + obs.oi.size = rollback_to->obs.oi.size; + if (rollback_to->obs.oi.is_data_digest()) + obs.oi.set_data_digest(rollback_to->obs.oi.data_digest); + else + obs.oi.clear_data_digest(); + if (rollback_to->obs.oi.is_omap_digest()) + obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest); + else + obs.oi.clear_omap_digest(); + + if (rollback_to->obs.oi.is_omap()) { + dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl; + obs.oi.set_flag(object_info_t::FLAG_OMAP); + } else { + dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + } + } + return ret; +} + +void PrimaryLogPG::_make_clone( + OpContext *ctx, + PGTransaction* t, + ObjectContextRef obc, + const hobject_t& head, const hobject_t& coid, + object_info_t *poi) +{ + bufferlist bv; + encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + t->clone(coid, head); + setattr_maybe_cache(obc, t, OI_ATTR, bv); + rmattr_maybe_cache(obc, t, SS_ATTR); +} + +void PrimaryLogPG::make_writeable(OpContext *ctx) +{ + const hobject_t& soid = ctx->obs->oi.soid; + SnapContext& snapc = ctx->snapc; + + // clone? 
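+  // only heads reach this point; if the snap context carries snaps newer
+  // than our snapset, the current head is cloned into a snap object below
+  // before the write applies, and the dirty/omap/clone stats are adjusted.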
+ ceph_assert(soid.snap == CEPH_NOSNAP); + dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset + << " snapc=" << snapc << dendl; + + bool was_dirty = ctx->obc->obs.oi.is_dirty(); + if (ctx->new_obs.exists) { + // we will mark the object dirty + if (ctx->undirty && was_dirty) { + dout(20) << " clearing DIRTY flag" << dendl; + ceph_assert(ctx->new_obs.oi.is_dirty()); + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + osd->logger->inc(l_osd_tier_clean); + } else if (!was_dirty && !ctx->undirty) { + dout(20) << " setting DIRTY flag" << dendl; + ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY); + ++ctx->delta_stats.num_objects_dirty; + osd->logger->inc(l_osd_tier_dirty); + } + } else { + if (was_dirty) { + dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + } + } + + if ((ctx->new_obs.exists && + ctx->new_obs.oi.is_omap()) && + (!ctx->obc->obs.exists || + !ctx->obc->obs.oi.is_omap())) { + ++ctx->delta_stats.num_objects_omap; + } + if ((!ctx->new_obs.exists || + !ctx->new_obs.oi.is_omap()) && + (ctx->obc->obs.exists && + ctx->obc->obs.oi.is_omap())) { + --ctx->delta_stats.num_objects_omap; + } + + if (ctx->new_snapset.seq > snapc.seq) { + dout(10) << " op snapset is old" << dendl; + } + + if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed) + snapc.snaps.size() && // there are snaps + !ctx->cache_evict && + snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old + // clone + hobject_t coid = soid; + coid.snap = snapc.seq; + + unsigned l; + for (l = 1; + l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; + l++) ; + + vector<snapid_t> snaps(l); + for (unsigned i=0; i<l; i++) + snaps[i] = snapc.snaps[i]; + + // prepare clone + object_info_t static_snap_oi(coid); + object_info_t *snap_oi; + if (is_primary()) { + ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid); + ctx->clone_obc->destructor_callback = + new C_PG_ObjectContext(this, ctx->clone_obc.get()); + ctx->clone_obc->obs.oi = static_snap_oi; + ctx->clone_obc->obs.exists = true; + ctx->clone_obc->ssc = ctx->obc->ssc; + ctx->clone_obc->ssc->ref++; + if (pool.info.is_erasure()) + ctx->clone_obc->attr_cache = ctx->obc->attr_cache; + snap_oi = &ctx->clone_obc->obs.oi; + bool got = ctx->lock_manager.get_write_greedy( + coid, + ctx->clone_obc, + ctx->op); + ceph_assert(got); + dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl; + } else { + snap_oi = &static_snap_oi; + } + snap_oi->version = ctx->at_version; + snap_oi->prior_version = ctx->obs->oi.version; + snap_oi->copy_user_bits(ctx->obs->oi); + + _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi); + + ctx->delta_stats.num_objects++; + if (snap_oi->is_dirty()) { + ctx->delta_stats.num_objects_dirty++; + osd->logger->inc(l_osd_tier_dirty); + } + if (snap_oi->is_omap()) + ctx->delta_stats.num_objects_omap++; + if (snap_oi->is_cache_pinned()) + ctx->delta_stats.num_objects_pinned++; + if (snap_oi->has_manifest()) + ctx->delta_stats.num_objects_manifest++; + ctx->delta_stats.num_object_clones++; + ctx->new_snapset.clones.push_back(coid.snap); + ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size; + ctx->new_snapset.clone_snaps[coid.snap] = snaps; + + // clone_overlap should contain an entry for each clone + // (an empty interval_set if there is no overlap) + 
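+    // the bare subscript below is intentional: operator[] default-constructs
+    // an empty interval_set for this clone when no overlap entry exists yet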
ctx->new_snapset.clone_overlap[coid.snap]; + if (ctx->obs->oi.size) + ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size); + + // log clone + dout(10) << " cloning v " << ctx->obs->oi.version + << " to " << coid << " v " << ctx->at_version + << " snaps=" << snaps + << " snapset=" << ctx->new_snapset << dendl; + ctx->log.push_back(pg_log_entry_t( + pg_log_entry_t::CLONE, coid, ctx->at_version, + ctx->obs->oi.version, + ctx->obs->oi.user_version, + osd_reqid_t(), ctx->new_obs.oi.mtime, 0)); + encode(snaps, ctx->log.back().snaps); + + ctx->at_version.version++; + } + + // update most recent clone_overlap and usage stats + if (ctx->new_snapset.clones.size() > 0) { + // the clone_overlap is difference of range between head and clones. + // we need to check whether the most recent clone exists, if it's + // been evicted, it's not included in the stats, but the clone_overlap + // is still exist in the snapset, so we should update the + // clone_overlap to make it sense. + hobject_t last_clone_oid = soid; + last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first; + interval_set<uint64_t> &newest_overlap = + ctx->new_snapset.clone_overlap.rbegin()->second; + ctx->modified_ranges.intersection_of(newest_overlap); + if (is_present_clone(last_clone_oid)) { + // modified_ranges is still in use by the clone + ctx->delta_stats.num_bytes += ctx->modified_ranges.size(); + } + newest_overlap.subtract(ctx->modified_ranges); + } + + if (snapc.seq > ctx->new_snapset.seq) { + // update snapset with latest snap context + ctx->new_snapset.seq = snapc.seq; + ctx->new_snapset.snaps = snapc.snaps; + } + dout(20) << "make_writeable " << soid + << " done, snapset=" << ctx->new_snapset << dendl; +} + + +void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi, + interval_set<uint64_t>& modified, uint64_t offset, + uint64_t length, bool write_full) +{ + interval_set<uint64_t> ch; + if (write_full) { + if (oi.size) + ch.insert(0, oi.size); + } else if (length) + ch.insert(offset, length); + modified.union_of(ch); + if (write_full || + (offset + length > oi.size && length)) { + uint64_t new_size = offset + length; + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += new_size; + oi.size = new_size; + } + + if (oi.has_manifest() && oi.manifest.is_chunked()) { + for (auto &p : oi.manifest.chunk_map) { + if ((p.first <= offset && p.first + p.second.length > offset) || + (p.first > offset && p.first <= offset + length)) { + p.second.clear_flag(chunk_info_t::FLAG_MISSING); + p.second.set_flag(chunk_info_t::FLAG_DIRTY); + } + } + } + delta_stats.num_wr++; + delta_stats.num_wr_kb += shift_round_up(length, 10); +} + +void PrimaryLogPG::truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size) +{ + if (oi.size != truncate_size) { + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += truncate_size; + oi.size = truncate_size; + } +} + +void PrimaryLogPG::complete_disconnect_watches( + ObjectContextRef obc, + const list<watch_disconnect_t> &to_disconnect) +{ + for (list<watch_disconnect_t>::const_iterator i = + to_disconnect.begin(); + i != to_disconnect.end(); + ++i) { + pair<uint64_t, entity_name_t> watcher(i->cookie, i->name); + auto watchers_entry = obc->watchers.find(watcher); + if (watchers_entry != obc->watchers.end()) { + WatchRef watch = watchers_entry->second; + dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl; + obc->watchers.erase(watcher); + 
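+      // the local WatchRef keeps the Watch alive across the erase; remove()
+      // then tears it down, honouring the send_disconnect flag recorded for
+      // this disconnect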
watch->remove(i->send_disconnect); + } else { + dout(10) << "do_osd_op_effects disconnect failed to find watcher " + << watcher << dendl; + } + } +} + +void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn) +{ + entity_name_t entity = ctx->reqid.name; + dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl; + + // disconnects first + complete_disconnect_watches(ctx->obc, ctx->watch_disconnects); + + ceph_assert(conn); + + auto session = conn->get_priv(); + if (!session) + return; + + for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin(); + i != ctx->watch_connects.end(); + ++i) { + pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity); + dout(15) << "do_osd_op_effects applying watch connect on session " + << session.get() << " watcher " << watcher << dendl; + WatchRef watch; + if (ctx->obc->watchers.count(watcher)) { + dout(15) << "do_osd_op_effects found existing watch watcher " << watcher + << dendl; + watch = ctx->obc->watchers[watcher]; + } else { + dout(15) << "do_osd_op_effects new watcher " << watcher + << dendl; + watch = Watch::makeWatchRef( + this, osd, ctx->obc, i->first.timeout_seconds, + i->first.cookie, entity, conn->get_peer_addr()); + ctx->obc->watchers.insert( + make_pair( + watcher, + watch)); + } + watch->connect(conn, i->second); + } + + for (list<notify_info_t>::iterator p = ctx->notifies.begin(); + p != ctx->notifies.end(); + ++p) { + dout(10) << "do_osd_op_effects, notify " << *p << dendl; + ConnectionRef conn(ctx->op->get_req()->get_connection()); + NotifyRef notif( + Notify::makeNotifyRef( + conn, + ctx->reqid.name.num(), + p->bl, + p->timeout, + p->cookie, + p->notify_id, + ctx->obc->obs.oi.user_version, + osd)); + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i = + ctx->obc->watchers.begin(); + i != ctx->obc->watchers.end(); + ++i) { + dout(10) << "starting notify on watch " << i->first << dendl; + i->second->start_notify(notif); + } + notif->init(); + } + + for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin(); + p != ctx->notify_acks.end(); + ++p) { + if (p->watch_cookie) + dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl; + else + dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl; + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i = + ctx->obc->watchers.begin(); + i != ctx->obc->watchers.end(); + ++i) { + if (i->first.second != entity) continue; + if (p->watch_cookie && + p->watch_cookie.get() != i->first.first) continue; + dout(10) << "acking notify on watch " << i->first << dendl; + i->second->notify_ack(p->notify_id, p->reply_bl); + } + } +} + +hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target) +{ + ostringstream ss; + ss << "temp_" << info.pgid << "_" << get_role() + << "_" << osd->monc->get_global_id() << "_" << (++temp_seq); + hobject_t hoid = target.make_temp_hobject(ss.str()); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +hobject_t PrimaryLogPG::get_temp_recovery_object( + const hobject_t& target, + eversion_t version) +{ + ostringstream ss; + ss << "temp_recovering_" << info.pgid // (note this includes the shardid) + << "_" << version + << "_" << info.history.same_interval_since + << "_" << target.snap; + // pgid + version + interval + snapid is unique, and short + hobject_t hoid = target.make_temp_hobject(ss.str()); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +int PrimaryLogPG::prepare_transaction(OpContext 
*ctx) +{ + ceph_assert(!ctx->ops->empty()); + + // valid snap context? + if (!ctx->snapc.is_valid()) { + dout(10) << " invalid snapc " << ctx->snapc << dendl; + return -EINVAL; + } + + // prepare the actual mutation + int result = do_osd_ops(ctx, *ctx->ops); + if (result < 0) { + if (ctx->op->may_write() && + get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) { + // need to save the error code in the pg log, to detect dup ops, + // but do nothing else + ctx->update_log_only = true; + } + return result; + } + + // read-op? write-op noop? done? + if (ctx->op_t->empty() && !ctx->modify) { + if (ctx->pending_async_reads.empty()) + unstable_stats.add(ctx->delta_stats); + if (ctx->op->may_write() && + get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) { + ctx->update_log_only = true; + } + return result; + } + + // check for full + if ((ctx->delta_stats.num_bytes > 0 || + ctx->delta_stats.num_objects > 0) && // FIXME: keys? + (pool.info.has_flag(pg_pool_t::FLAG_FULL) || + get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) { + const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req()); + if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now + m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS" + << dendl; + } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. + dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl; + return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC; + } else { + // drop request + dout(20) << __func__ << " full, dropping request (bad client)" << dendl; + return -EAGAIN; + } + } + + const hobject_t& soid = ctx->obs->oi.soid; + // clone, if necessary + if (soid.snap == CEPH_NOSNAP) + make_writeable(ctx); + + finish_ctx(ctx, + ctx->new_obs.exists ? pg_log_entry_t::MODIFY : + pg_log_entry_t::DELETE); + + return result; +} + +void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type) +{ + const hobject_t& soid = ctx->obs->oi.soid; + dout(20) << __func__ << " " << soid << " " << ctx + << " op " << pg_log_entry_t::get_op_name(log_op_type) + << dendl; + utime_t now = ceph_clock_now(); + + // finish and log the op. + if (ctx->user_modify) { + // update the user_version for any modify ops, except for the watch op + ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1; + /* In order for new clients and old clients to interoperate properly + * when exchanging versions, we need to lower bound the user_version + * (which our new clients pay proper attention to) + * by the at_version (which is all the old clients can ever see). 
*/ + if (ctx->at_version.version > ctx->user_at_version) + ctx->user_at_version = ctx->at_version.version; + ctx->new_obs.oi.user_version = ctx->user_at_version; + } + ctx->bytes_written = ctx->op_t->get_bytes_written(); + + if (ctx->new_obs.exists) { + ctx->new_obs.oi.version = ctx->at_version; + ctx->new_obs.oi.prior_version = ctx->obs->oi.version; + ctx->new_obs.oi.last_reqid = ctx->reqid; + if (ctx->mtime != utime_t()) { + ctx->new_obs.oi.mtime = ctx->mtime; + dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl; + ctx->new_obs.oi.local_mtime = now; + } else { + dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl; + } + + // object_info_t + map <string, bufferlist> attrs; + bufferlist bv(sizeof(ctx->new_obs.oi)); + encode(ctx->new_obs.oi, bv, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR].claim(bv); + + // snapset + if (soid.snap == CEPH_NOSNAP) { + dout(10) << " final snapset " << ctx->new_snapset + << " in " << soid << dendl; + bufferlist bss; + encode(ctx->new_snapset, bss); + attrs[SS_ATTR].claim(bss); + } else { + dout(10) << " no snapset (this is a clone)" << dendl; + } + ctx->op_t->setattrs(soid, attrs); + } else { + // reset cached oi + ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid); + } + + // append to log + ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version, + ctx->obs->oi.version, + ctx->user_at_version, ctx->reqid, + ctx->mtime, 0)); + if (soid.snap < CEPH_NOSNAP) { + switch (log_op_type) { + case pg_log_entry_t::MODIFY: + case pg_log_entry_t::PROMOTE: + case pg_log_entry_t::CLEAN: + dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset + << dendl; + encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps); + break; + default: + break; + } + } + + if (!ctx->extra_reqids.empty()) { + dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " " + << ctx->extra_reqid_return_codes << dendl; + ctx->log.back().extra_reqids.swap(ctx->extra_reqids); + ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes); + } + + // apply new object state. 
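+  // (updates the cached ObjectContext so subsequent ops on this PG see the
+  //  state this transaction will commit)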
+ ctx->obc->obs = ctx->new_obs; + + if (soid.is_head() && !ctx->obc->obs.exists) { + ctx->obc->ssc->exists = false; + ctx->obc->ssc->snapset = SnapSet(); + } else { + ctx->obc->ssc->exists = true; + ctx->obc->ssc->snapset = ctx->new_snapset; + } +} + +void PrimaryLogPG::apply_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) { + + info.stats.stats.add(delta_stats); + info.stats.stats.floor(0); + + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + pg_info_t& pinfo = peer_info[bt]; + if (soid <= pinfo.last_backfill) + pinfo.stats.stats.add(delta_stats); + else if (soid <= last_backfill_started) + pending_backfill_updates[soid].stats.add(delta_stats); + } + + if (is_primary() && scrubber.active) { + if (soid < scrubber.start) { + dout(20) << __func__ << " " << soid << " < [" << scrubber.start + << "," << scrubber.end << ")" << dendl; + scrub_cstat.add(delta_stats); + } else { + dout(20) << __func__ << " " << soid << " >= [" << scrubber.start + << "," << scrubber.end << ")" << dendl; + } + } +} + +void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx) +{ + const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req()); + ceph_assert(ctx->async_reads_complete()); + + for (vector<OSDOp>::iterator p = ctx->ops->begin(); + p != ctx->ops->end() && result >= 0; ++p) { + if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = p->rval; + break; + } + ctx->bytes_read += p->outdata.length(); + } + ctx->reply->claim_op_out_data(*ctx->ops); + ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0); + + MOSDOpReply *reply = ctx->reply; + ctx->reply = nullptr; + + if (result >= 0) { + if (!ctx->ignore_log_op_stats) { + log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read); + + publish_stats_to_osd(); + } + + // on read, return the current object version + if (ctx->obs) { + reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version); + } else { + reply->set_reply_versions(eversion_t(), ctx->user_at_version); + } + } else if (result == -ENOENT) { + // on ENOENT, set a floor for what the next user version will be. 
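+    // (last_update/last_user_version act as a lower bound for the client, so
+    //  a later recreate of the object cannot appear to move backwards in
+    //  version)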
+ reply->set_enoent_reply_versions(info.last_update, info.last_user_version); + } + + reply->set_result(result); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + osd->send_message_osd_client(reply, m->get_connection()); + close_op_ctx(ctx); +} + +// ======================================================================== +// copyfrom + +struct C_Copyfrom : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive + C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::CopyOpRef& c) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), cop(c) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + pg->lock(); + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->process_copy_chunk(oid, tid, r); + cop.reset(); + } + pg->unlock(); + } +}; + +struct C_CopyFrom_AsyncReadCb : public Context { + OSDOp *osd_op; + object_copy_data_t reply_obj; + uint64_t features; + size_t len; + C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) : + osd_op(osd_op), features(features), len(0) {} + void finish(int r) override { + osd_op->rval = r; + if (r < 0) { + return; + } + + ceph_assert(len > 0); + ceph_assert(len <= reply_obj.data.length()); + bufferlist bl; + bl.substr_of(reply_obj.data, 0, len); + reply_obj.data.swap(bl); + encode(reply_obj, osd_op->outdata, features); + } +}; + +struct C_CopyChunk : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive + uint64_t offset = 0; + C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::CopyOpRef& c) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), cop(c) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + pg->lock(); + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->process_copy_chunk_manifest(oid, tid, r, offset); + cop.reset(); + } + pg->unlock(); + } +}; + +int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp, + OSDOp& osd_op, ObjectContextRef &obc) +{ + object_info_t& oi = obc->obs.oi; + hobject_t& soid = oi.soid; + int result = 0; + object_copy_cursor_t cursor; + uint64_t out_max; + try { + decode(cursor, bp); + decode(out_max, bp); + } + catch (buffer::error& e) { + result = -EINVAL; + return result; + } + + const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req()); + uint64_t features = op->get_features(); + + bool async_read_started = false; + object_copy_data_t _reply_obj; + C_CopyFrom_AsyncReadCb *cb = nullptr; + if (pool.info.is_erasure()) { + cb = new C_CopyFrom_AsyncReadCb(&osd_op, features); + } + object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj; + // size, mtime + reply_obj.size = oi.size; + reply_obj.mtime = oi.mtime; + ceph_assert(obc->ssc); + if (soid.snap < CEPH_NOSNAP) { + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn? 
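+    // clones report their full snap list; the head (else branch) reports the
+    // snapset's seq instead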
+ reply_obj.snaps = p->second; + } else { + reply_obj.snap_seq = obc->ssc->snapset.seq; + } + if (oi.is_data_digest()) { + reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST; + reply_obj.data_digest = oi.data_digest; + } + if (oi.is_omap_digest()) { + reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST; + reply_obj.omap_digest = oi.omap_digest; + } + reply_obj.truncate_seq = oi.truncate_seq; + reply_obj.truncate_size = oi.truncate_size; + + // attrs + map<string,bufferlist>& out_attrs = reply_obj.attrs; + if (!cursor.attr_complete) { + result = getattrs_maybe_cache( + ctx->obc, + &out_attrs); + if (result < 0) { + if (cb) { + delete cb; + } + return result; + } + cursor.attr_complete = true; + dout(20) << " got attrs" << dendl; + } + + int64_t left = out_max - osd_op.outdata.length(); + + // data + bufferlist& bl = reply_obj.data; + if (left > 0 && !cursor.data_complete) { + if (cursor.data_offset < oi.size) { + uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left); + if (cb) { + async_read_started = true; + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags), + make_pair(&bl, cb))); + cb->len = max_read; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + result = -EINPROGRESS; + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + } else { + result = pgbackend->objects_read_sync( + oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl); + if (result < 0) + return result; + } + left -= max_read; + cursor.data_offset += max_read; + } + if (cursor.data_offset == oi.size) { + cursor.data_complete = true; + dout(20) << " got data" << dendl; + } + ceph_assert(cursor.data_offset <= oi.size); + } + + // omap + uint32_t omap_keys = 0; + if (!pool.info.supports_omap() || !oi.is_omap()) { + cursor.omap_complete = true; + } else { + if (left > 0 && !cursor.omap_complete) { + ceph_assert(cursor.data_complete); + if (cursor.omap_offset.empty()) { + osd->store->omap_get_header(ch, ghobject_t(oi.soid), + &reply_obj.omap_header); + } + bufferlist omap_data; + ObjectMap::ObjectMapIterator iter = + osd->store->get_omap_iterator(ch, ghobject_t(oi.soid)); + ceph_assert(iter); + iter->upper_bound(cursor.omap_offset); + for (; iter->valid(); iter->next()) { + ++omap_keys; + encode(iter->key(), omap_data); + encode(iter->value(), omap_data); + left -= iter->key().length() + 4 + iter->value().length() + 4; + if (left <= 0) + break; + } + if (omap_keys) { + encode(omap_keys, reply_obj.omap_data); + reply_obj.omap_data.claim_append(omap_data); + } + if (iter->valid()) { + cursor.omap_offset = iter->key(); + } else { + cursor.omap_complete = true; + dout(20) << " got omap" << dendl; + } + } + } + + if (cursor.is_complete()) { + // include reqids only in the final step. this is a bit fragile + // but it works... 
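+    // the recent reqids travel with the final chunk so the destination can
+    // record them in its pg log and detect dup client ops against the copied
+    // object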
+ pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, + &reply_obj.reqids, + &reply_obj.reqid_return_codes); + dout(20) << " got reqids" << dendl; + } + + dout(20) << " cursor.is_complete=" << cursor.is_complete() + << " " << out_attrs.size() << " attrs" + << " " << bl.length() << " bytes" + << " " << reply_obj.omap_header.length() << " omap header bytes" + << " " << reply_obj.omap_data.length() << " omap data bytes in " + << omap_keys << " keys" + << " " << reply_obj.reqids.size() << " reqids" + << dendl; + reply_obj.cursor = cursor; + if (!async_read_started) { + encode(reply_obj, osd_op.outdata, features); + } + if (cb && !async_read_started) { + delete cb; + } + + if (result > 0) { + result = 0; + } + return result; +} + +void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid, + OSDOp& osd_op) +{ + // NOTE: we take non-const ref here for claim_op_out_data below; we must + // be careful not to modify anything else that will upset a racing + // operator<< + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + uint64_t features = m->get_features(); + object_copy_data_t reply_obj; + + pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids, + &reply_obj.reqid_return_codes); + dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl; + encode(reply_obj, osd_op.outdata, features); + osd_op.rval = -ENOENT; + MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + reply->claim_op_out_data(m->ops); + reply->set_result(-ENOENT); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + osd->send_message_osd_client(reply, m->get_connection()); +} + +void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc, + hobject_t src, object_locator_t oloc, + version_t version, unsigned flags, + bool mirror_snapset, + unsigned src_obj_fadvise_flags, + unsigned dest_obj_fadvise_flags) +{ + const hobject_t& dest = obc->obs.oi.soid; + dout(10) << __func__ << " " << dest + << " from " << src << " " << oloc << " v" << version + << " flags " << flags + << (mirror_snapset ? " mirror_snapset" : "") + << dendl; + + ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP); + + // cancel a previous in-progress copy? + if (copy_ops.count(dest)) { + // FIXME: if the src etc match, we could avoid restarting from the + // beginning. 
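+    // collect the objecter tids of the in-flight copy so they can be
+    // cancelled right after cancel_copy() drops our bookkeeping for it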
+ CopyOpRef cop = copy_ops[dest]; + vector<ceph_tid_t> tids; + cancel_copy(cop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + } + + CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags, + mirror_snapset, src_obj_fadvise_flags, + dest_obj_fadvise_flags)); + copy_ops[dest] = cop; + obc->start_block(); + + if (!obc->obs.oi.has_manifest()) { + _copy_some(obc, cop); + } else { + if (obc->obs.oi.manifest.is_redirect()) { + _copy_some(obc, cop); + } else if (obc->obs.oi.manifest.is_chunked()) { + auto p = obc->obs.oi.manifest.chunk_map.begin(); + _copy_some_manifest(obc, cop, p->first); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } +} + +void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop) +{ + dout(10) << __func__ << " " << *obc << " " << cop << dendl; + + unsigned flags = 0; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH) + flags |= CEPH_OSD_FLAG_FLUSH; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE) + flags |= CEPH_OSD_FLAG_IGNORE_CACHE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY) + flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE) + flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED) + flags |= CEPH_OSD_FLAG_RWORDERED; + + C_GatherBuilder gather(cct); + + if (cop->cursor.is_initial() && cop->mirror_snapset) { + // list snaps too. + ceph_assert(cop->src.snap == CEPH_NOSNAP); + ObjectOperation op; + op.list_snaps(&cop->results.snapset, NULL); + ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, + CEPH_SNAPDIR, NULL, + flags, gather.new_sub(), NULL); + cop->objecter_tid2 = tid; + } + + ObjectOperation op; + if (cop->results.user_version) { + op.assert_version(cop->results.user_version); + } else { + // we should learn the version after the first chunk, if we didn't know + // it already! + ceph_assert(cop->cursor.is_initial()); + } + op.copy_get(&cop->cursor, get_copy_chunk_size(), + &cop->results.object_size, &cop->results.mtime, + &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data, + &cop->results.snaps, &cop->results.snap_seq, + &cop->results.flags, + &cop->results.source_data_digest, + &cop->results.source_omap_digest, + &cop->results.reqids, + &cop->results.reqid_return_codes, + &cop->results.truncate_seq, + &cop->results.truncate_size, + &cop->rval); + op.set_last_op_flags(cop->src_obj_fadvise_flags); + + C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid, + get_last_peering_reset(), cop); + unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers); + gather.set_finisher(new C_OnFinisher(fin, + osd->objecter_finishers[n])); + + ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, + cop->src.snap, NULL, + flags, + gather.new_sub(), + // discover the object version if we don't know it yet + cop->results.user_version ? 
NULL : &cop->results.user_version); + fin->tid = tid; + cop->objecter_tid = tid; + gather.activate(); +} + +void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset) +{ + dout(10) << __func__ << " " << *obc << " " << cop << dendl; + + unsigned flags = 0; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH) + flags |= CEPH_OSD_FLAG_FLUSH; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE) + flags |= CEPH_OSD_FLAG_IGNORE_CACHE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY) + flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE) + flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED) + flags |= CEPH_OSD_FLAG_RWORDERED; + + int num_chunks = 0; + uint64_t last_offset = 0, chunks_size = 0; + object_manifest_t *manifest = &obc->obs.oi.manifest; + map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset); + for (;iter != manifest->chunk_map.end(); ++iter) { + num_chunks++; + chunks_size += iter->second.length; + last_offset = iter->first; + if (get_copy_chunk_size() < chunks_size) { + break; + } + } + + cop->num_chunk = num_chunks; + cop->start_offset = start_offset; + cop->last_offset = last_offset; + dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks + << " start_offset: " << start_offset << " chunks_size: " << chunks_size + << " last_offset: " << last_offset << dendl; + + iter = manifest->chunk_map.find(start_offset); + for (;iter != manifest->chunk_map.end(); ++iter) { + uint64_t obj_offset = iter->first; + uint64_t length = manifest->chunk_map[iter->first].length; + hobject_t soid = manifest->chunk_map[iter->first].oid; + object_locator_t oloc(soid); + CopyCallback * cb = NULL; + CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc, + cop->results.user_version, cop->flags, cop->mirror_snapset, + cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags)); + sub_cop->cursor.data_offset = obj_offset; + cop->chunk_cops[obj_offset] = sub_cop; + + int s = sub_cop->chunk_ops.size(); + sub_cop->chunk_ops.resize(s+1); + sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ; + sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset; + sub_cop->chunk_ops[s].op.extent.length = length; + + ObjectOperation op; + op.dup(sub_cop->chunk_ops); + + dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: " + << manifest->chunk_map[iter->first].offset + << " length: " << length << " pool id: " << oloc.pool << dendl; + + if (cop->results.user_version) { + op.assert_version(cop->results.user_version); + } else { + // we should learn the version after the first chunk, if we didn't know + // it already! + ceph_assert(cop->cursor.is_initial()); + } + op.set_last_op_flags(cop->src_obj_fadvise_flags); + + C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid, + get_last_peering_reset(), cop); + fin->offset = obj_offset; + unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers); + + ceph_tid_t tid = osd->objecter->read(soid.oid, oloc, op, + sub_cop->src.snap, NULL, + flags, + new C_OnFinisher(fin, osd->objecter_finishers[n]), + // discover the object version if we don't know it yet + sub_cop->results.user_version ? 
NULL : &sub_cop->results.user_version); + fin->tid = tid; + sub_cop->objecter_tid = tid; + if (last_offset < iter->first) { + break; + } + } +} + +void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid); + if (p == copy_ops.end()) { + dout(10) << __func__ << " no copy_op found" << dendl; + return; + } + CopyOpRef cop = p->second; + if (tid != cop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != cop " << cop + << " tid " << cop->objecter_tid << dendl; + return; + } + + if (cop->omap_data.length() || cop->omap_header.length()) + cop->results.has_omap = true; + + if (r >= 0 && !pool.info.supports_omap() && + (cop->omap_data.length() || cop->omap_header.length())) { + r = -EOPNOTSUPP; + } + cop->objecter_tid = 0; + cop->objecter_tid2 = 0; // assume this ordered before us (if it happened) + ObjectContextRef& cobc = cop->obc; + + if (r < 0) + goto out; + + ceph_assert(cop->rval >= 0); + + if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) { + // verify snap hasn't been deleted + vector<snapid_t>::iterator p = cop->results.snaps.begin(); + while (p != cop->results.snaps.end()) { + if (pool.info.is_removed_snap(*p)) { + dout(10) << __func__ << " clone snap " << *p << " has been deleted" + << dendl; + for (vector<snapid_t>::iterator q = p + 1; + q != cop->results.snaps.end(); + ++q) + *(q - 1) = *q; + cop->results.snaps.resize(cop->results.snaps.size() - 1); + } else { + ++p; + } + } + if (cop->results.snaps.empty()) { + dout(10) << __func__ << " no more snaps for " << oid << dendl; + r = -ENOENT; + goto out; + } + } + + ceph_assert(cop->rval >= 0); + + if (!cop->temp_cursor.data_complete) { + cop->results.data_digest = cop->data.crc32c(cop->results.data_digest); + } + if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) { + if (cop->omap_header.length()) { + cop->results.omap_digest = + cop->omap_header.crc32c(cop->results.omap_digest); + } + if (cop->omap_data.length()) { + bufferlist keys; + keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4); + cop->results.omap_digest = keys.crc32c(cop->results.omap_digest); + } + } + + if (!cop->temp_cursor.attr_complete) { + for (map<string,bufferlist>::iterator p = cop->attrs.begin(); + p != cop->attrs.end(); + ++p) { + cop->results.attrs[string("_") + p->first] = p->second; + } + cop->attrs.clear(); + } + + if (!cop->cursor.is_complete()) { + // write out what we have so far + if (cop->temp_cursor.is_initial()) { + ceph_assert(!cop->results.started_temp_obj); + cop->results.started_temp_obj = true; + cop->results.temp_oid = generate_temp_object(oid); + dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl; + } + ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true); + OpContextUPtr ctx = simple_opc_create(tempobc); + if (cop->temp_cursor.is_initial()) { + ctx->new_temp_oid = cop->results.temp_oid; + } + _write_copy_chunk(cop, ctx->op_t.get()); + simple_opc_submit(std::move(ctx)); + dout(10) << __func__ << " fetching more" << dendl; + _copy_some(cobc, cop); + return; + } + + // verify digests? 
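+  // Both digests were accumulated incrementally above, one copy-get chunk at
+  // a time, e.g. results.data_digest = data_chunk.crc32c(results.data_digest).
+  // They are only compared when the source actually recorded a digest
+  // (is_data_digest()/is_omap_digest()); any mismatch fails the copy with
+  // -EIO and is reported to the cluster log.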
+ if (cop->results.is_data_digest() || cop->results.is_omap_digest()) { + dout(20) << __func__ << std::hex + << " got digest: rx data 0x" << cop->results.data_digest + << " omap 0x" << cop->results.omap_digest + << ", source: data 0x" << cop->results.source_data_digest + << " omap 0x" << cop->results.source_omap_digest + << std::dec + << " flags " << cop->results.flags + << dendl; + } + if (cop->results.is_data_digest() && + cop->results.data_digest != cop->results.source_data_digest) { + derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest + << " != source 0x" << cop->results.source_data_digest << std::dec + << dendl; + osd->clog->error() << info.pgid << " copy from " << cop->src + << " to " << cop->obc->obs.oi.soid << std::hex + << " data digest 0x" << cop->results.data_digest + << " != source 0x" << cop->results.source_data_digest + << std::dec; + r = -EIO; + goto out; + } + if (cop->results.is_omap_digest() && + cop->results.omap_digest != cop->results.source_omap_digest) { + derr << __func__ << std::hex + << " omap digest 0x" << cop->results.omap_digest + << " != source 0x" << cop->results.source_omap_digest + << std::dec << dendl; + osd->clog->error() << info.pgid << " copy from " << cop->src + << " to " << cop->obc->obs.oi.soid << std::hex + << " omap digest 0x" << cop->results.omap_digest + << " != source 0x" << cop->results.source_omap_digest + << std::dec; + r = -EIO; + goto out; + } + if (cct->_conf->osd_debug_inject_copyfrom_error) { + derr << __func__ << " injecting copyfrom failure" << dendl; + r = -EIO; + goto out; + } + + cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>( + [this, &cop /* avoid ref cycle */](PGTransaction *t) { + ObjectState& obs = cop->obc->obs; + if (cop->temp_cursor.is_initial()) { + dout(20) << "fill_in_final_tx: writing " + << "directly to final object" << dendl; + // write directly to final object + cop->results.temp_oid = obs.oi.soid; + _write_copy_chunk(cop, t); + } else { + // finish writing to temp object, then move into place + dout(20) << "fill_in_final_tx: writing to temp object" << dendl; + _write_copy_chunk(cop, t); + t->rename(obs.oi.soid, cop->results.temp_oid); + } + t->setattrs(obs.oi.soid, cop->results.attrs); + }); + + dout(20) << __func__ << " success; committing" << dendl; + + out: + dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl; + CopyCallbackResults results(r, &cop->results); + cop->cb->complete(results); + + copy_ops.erase(cobc->obs.oi.soid); + cobc->stop_block(); + + if (r < 0 && cop->results.started_temp_obj) { + dout(10) << __func__ << " deleting partial temp object " + << cop->results.temp_oid << dendl; + ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true); + OpContextUPtr ctx = simple_opc_create(tempobc); + ctx->op_t->remove(cop->results.temp_oid); + ctx->discard_temp_oid = cop->results.temp_oid; + simple_opc_submit(std::move(ctx)); + } + + // cancel and requeue proxy ops on this object + if (!r) { + cancel_and_requeue_proxy_ops(cobc->obs.oi.soid); + } + + kick_object_context_blocked(cobc); +} + +void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid); + if (p == copy_ops.end()) { + dout(10) << __func__ << " no copy_op found" << dendl; + return; + } + CopyOpRef obj_cop = p->second; + CopyOpRef chunk_cop = obj_cop->chunk_cops[offset]; + + if 
(tid != chunk_cop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop + << " tid " << chunk_cop->objecter_tid << dendl; + return; + } + + if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) { + r = -EOPNOTSUPP; + } + + chunk_cop->objecter_tid = 0; + chunk_cop->objecter_tid2 = 0; // assume this ordered before us (if it happened) + ObjectContextRef& cobc = obj_cop->obc; + OSDOp &chunk_data = chunk_cop->chunk_ops[0]; + + if (r < 0) { + obj_cop->failed = true; + goto out; + } + + if (obj_cop->failed) { + return; + } + if (!chunk_data.outdata.length()) { + r = -EIO; + obj_cop->failed = true; + goto out; + } + + obj_cop->num_chunk--; + + /* check all of the copyop are completed */ + if (obj_cop->num_chunk) { + dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl; + return; + } + + { + OpContextUPtr ctx = simple_opc_create(obj_cop->obc); + if (!ctx->lock_manager.take_write_lock( + obj_cop->obc->obs.oi.soid, + obj_cop->obc)) { + // recovery op can take read lock. + // so need to wait for recovery completion + r = -EAGAIN; + obj_cop->failed = true; + close_op_ctx(ctx.release()); + goto out; + } + dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl; + + PGTransaction *t = ctx->op_t.get(); + ObjectState& obs = ctx->new_obs; + for (auto p : obj_cop->chunk_cops) { + OSDOp &sub_chunk = p.second->chunk_ops[0]; + t->write(cobc->obs.oi.soid, + p.second->cursor.data_offset, + sub_chunk.outdata.length(), + sub_chunk.outdata, + p.second->dest_obj_fadvise_flags); + dout(20) << __func__ << " offset: " << p.second->cursor.data_offset + << " length: " << sub_chunk.outdata.length() << dendl; + write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges, + p.second->cursor.data_offset, sub_chunk.outdata.length()); + obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_DIRTY); + obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING); + sub_chunk.outdata.clear(); + } + obs.oi.clear_data_digest(); + ctx->at_version = get_next_version(); + finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE); + simple_opc_submit(std::move(ctx)); + + auto p = cobc->obs.oi.manifest.chunk_map.rbegin(); + /* check remaining work */ + if (p != cobc->obs.oi.manifest.chunk_map.rend()) { + if (obj_cop->last_offset >= p->first + p->second.length) { + for (auto &en : cobc->obs.oi.manifest.chunk_map) { + if (obj_cop->last_offset < en.first) { + _copy_some_manifest(cobc, obj_cop, en.first); + return; + } + } + } + } + } + + out: + dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl; + CopyCallbackResults results(r, &obj_cop->results); + obj_cop->cb->complete(results); + + copy_ops.erase(cobc->obs.oi.soid); + cobc->stop_block(); + + // cancel and requeue proxy ops on this object + if (!r) { + cancel_and_requeue_proxy_ops(cobc->obs.oi.soid); + } + + kick_object_context_blocked(cobc); +} + +void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) { + vector<ceph_tid_t> tids; + for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin(); + it != proxyread_ops.end();) { + if (it->second->soid == oid) { + cancel_proxy_read((it++)->second, &tids); + } else { + ++it; + } + } + for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin(); + it != proxywrite_ops.end();) { + if (it->second->soid == oid) { + cancel_proxy_write((it++)->second, &tids); + } else { + ++it; + } + } + osd->objecter->op_cancel(tids, -ECANCELED); + 
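+  // with the in-flight proxy reads/writes cancelled, requeue any client
+  // requests that were waiting on proxied ops for this object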
kick_proxy_ops_blocked(oid); +} + +void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t) +{ + dout(20) << __func__ << " " << cop + << " " << cop->attrs.size() << " attrs" + << " " << cop->data.length() << " bytes" + << " " << cop->omap_header.length() << " omap header bytes" + << " " << cop->omap_data.length() << " omap data bytes" + << dendl; + if (!cop->temp_cursor.attr_complete) { + t->create(cop->results.temp_oid); + } + if (!cop->temp_cursor.data_complete) { + ceph_assert(cop->data.length() + cop->temp_cursor.data_offset == + cop->cursor.data_offset); + if (pool.info.required_alignment() && + !cop->cursor.data_complete) { + /** + * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset + * to pick it up on the next pass. + */ + ceph_assert(cop->temp_cursor.data_offset % + pool.info.required_alignment() == 0); + if (cop->data.length() % pool.info.required_alignment() != 0) { + uint64_t to_trim = + cop->data.length() % pool.info.required_alignment(); + bufferlist bl; + bl.substr_of(cop->data, 0, cop->data.length() - to_trim); + cop->data.swap(bl); + cop->cursor.data_offset -= to_trim; + ceph_assert(cop->data.length() + cop->temp_cursor.data_offset == + cop->cursor.data_offset); + } + } + if (cop->data.length()) { + t->write( + cop->results.temp_oid, + cop->temp_cursor.data_offset, + cop->data.length(), + cop->data, + cop->dest_obj_fadvise_flags); + } + cop->data.clear(); + } + if (pool.info.supports_omap()) { + if (!cop->temp_cursor.omap_complete) { + if (cop->omap_header.length()) { + t->omap_setheader( + cop->results.temp_oid, + cop->omap_header); + cop->omap_header.clear(); + } + if (cop->omap_data.length()) { + map<string,bufferlist> omap; + bufferlist::const_iterator p = cop->omap_data.begin(); + decode(omap, p); + t->omap_setkeys(cop->results.temp_oid, omap); + cop->omap_data.clear(); + } + } + } else { + ceph_assert(cop->omap_header.length() == 0); + ceph_assert(cop->omap_data.length() == 0); + } + cop->temp_cursor = cop->cursor; +} + +void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb) +{ + OpContext *ctx = cb->ctx; + dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl; + + ObjectState& obs = ctx->new_obs; + if (obs.exists) { + dout(20) << __func__ << ": exists, removing" << dendl; + ctx->op_t->remove(obs.oi.soid); + } else { + ctx->delta_stats.num_objects++; + obs.exists = true; + } + if (cb->is_temp_obj_used()) { + ctx->discard_temp_oid = cb->results->temp_oid; + } + cb->results->fill_in_final_tx(ctx->op_t.get()); + + // CopyFromCallback fills this in for us + obs.oi.user_version = ctx->user_at_version; + + if (cb->results->is_data_digest()) { + obs.oi.set_data_digest(cb->results->data_digest); + } else { + obs.oi.clear_data_digest(); + } + if (cb->results->is_omap_digest()) { + obs.oi.set_omap_digest(cb->results->omap_digest); + } else { + obs.oi.clear_omap_digest(); + } + + obs.oi.truncate_seq = cb->results->truncate_seq; + obs.oi.truncate_size = cb->results->truncate_size; + + ctx->extra_reqids = cb->results->reqids; + ctx->extra_reqid_return_codes = cb->results->reqid_return_codes; + + // cache: clear whiteout? 
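+  // a successful copy-from leaves real data in this cache object, so it can
+  // no longer be a whiteout: drop the flag and the whiteout count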
+ if (obs.oi.is_whiteout()) { + dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + --ctx->delta_stats.num_whiteouts; + } + + if (cb->results->has_omap) { + dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl; + obs.oi.set_flag(object_info_t::FLAG_OMAP); + } else { + dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + + interval_set<uint64_t> ch; + if (obs.oi.size > 0) + ch.insert(0, obs.oi.size); + ctx->modified_ranges.union_of(ch); + + if (cb->get_data_size() != obs.oi.size) { + ctx->delta_stats.num_bytes -= obs.oi.size; + obs.oi.size = cb->get_data_size(); + ctx->delta_stats.num_bytes += obs.oi.size; + } + ctx->delta_stats.num_wr++; + ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10); + + osd->logger->inc(l_osd_copyfrom); +} + +void PrimaryLogPG::finish_promote(int r, CopyResults *results, + ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + dout(10) << __func__ << " " << soid << " r=" << r + << " uv" << results->user_version << dendl; + + if (r == -ECANCELED) { + return; + } + + if (r != -ENOENT && soid.is_snap()) { + if (results->snaps.empty()) { + // we must have read "snap" content from the head object in + // the base pool. use snap_seq to construct what snaps should + // be for this clone (what is was before we evicted the clean + // clone from this pool, and what it will be when we flush and + // the clone eventually happens in the base pool). + SnapSet& snapset = obc->ssc->snapset; + vector<snapid_t>::iterator p = snapset.snaps.begin(); + while (p != snapset.snaps.end() && *p > soid.snap) + ++p; + while (p != snapset.snaps.end() && *p > results->snap_seq) { + results->snaps.push_back(*p); + ++p; + } + } + + dout(20) << __func__ << " snaps " << results->snaps << dendl; + filter_snapc(results->snaps); + + dout(20) << __func__ << " filtered snaps " << results->snaps << dendl; + if (results->snaps.empty()) { + dout(20) << __func__ + << " snaps are empty, clone is invalid," + << " setting r to ENOENT" << dendl; + r = -ENOENT; + } + } + + if (r < 0 && results->started_temp_obj) { + dout(10) << __func__ << " abort; will clean up partial work" << dendl; + ObjectContextRef tempobc = get_object_context(results->temp_oid, false); + ceph_assert(tempobc); + OpContextUPtr ctx = simple_opc_create(tempobc); + ctx->op_t->remove(results->temp_oid); + simple_opc_submit(std::move(ctx)); + results->started_temp_obj = false; + } + + if (r == -ENOENT && soid.is_snap()) { + dout(10) << __func__ + << ": enoent while trying to promote clone, " << soid + << " must have been trimmed, removing from snapset" + << dendl; + hobject_t head(soid.get_head()); + ObjectContextRef obc = get_object_context(head, false); + ceph_assert(obc); + + OpContextUPtr tctx = simple_opc_create(obc); + tctx->at_version = get_next_version(); + filter_snapc(tctx->new_snapset.snaps); + vector<snapid_t> new_clones; + map<snapid_t, vector<snapid_t>> new_clone_snaps; + for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin(); + i != tctx->new_snapset.clones.end(); + ++i) { + if (*i != soid.snap) { + new_clones.push_back(*i); + auto p = tctx->new_snapset.clone_snaps.find(*i); + if (p != tctx->new_snapset.clone_snaps.end()) { + new_clone_snaps[*i] = p->second; + } + } + } + tctx->new_snapset.clones.swap(new_clones); + tctx->new_snapset.clone_overlap.erase(soid.snap); + tctx->new_snapset.clone_size.erase(soid.snap); + 
tctx->new_snapset.clone_snaps.swap(new_clone_snaps); + + // take RWWRITE lock for duration of our local write. ignore starvation. + if (!tctx->lock_manager.take_write_lock( + head, + obc)) { + ceph_abort_msg("problem!"); + } + dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl; + + finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE); + + simple_opc_submit(std::move(tctx)); + return; + } + + bool whiteout = false; + if (r == -ENOENT) { + ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above + dout(10) << __func__ << " whiteout " << soid << dendl; + whiteout = true; + } + + if (r < 0 && !whiteout) { + derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl; + // pass error to everyone blocked on this object + // FIXME: this is pretty sloppy, but at this point we got + // something unexpected and don't have many other options. + map<hobject_t,list<OpRequestRef>>::iterator blocked_iter = + waiting_for_blocked_object.find(soid); + if (blocked_iter != waiting_for_blocked_object.end()) { + while (!blocked_iter->second.empty()) { + osd->reply_op_error(blocked_iter->second.front(), r); + blocked_iter->second.pop_front(); + } + waiting_for_blocked_object.erase(blocked_iter); + } + return; + } + + osd->promote_finish(results->object_size); + + OpContextUPtr tctx = simple_opc_create(obc); + tctx->at_version = get_next_version(); + + if (!obc->obs.oi.has_manifest()) { + ++tctx->delta_stats.num_objects; + } + if (soid.snap < CEPH_NOSNAP) + ++tctx->delta_stats.num_object_clones; + tctx->new_obs.exists = true; + + tctx->extra_reqids = results->reqids; + tctx->extra_reqid_return_codes = results->reqid_return_codes; + + if (whiteout) { + // create a whiteout + tctx->op_t->create(soid); + tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT); + ++tctx->delta_stats.num_whiteouts; + dout(20) << __func__ << " creating whiteout on " << soid << dendl; + osd->logger->inc(l_osd_tier_whiteout); + } else { + if (results->has_omap) { + dout(10) << __func__ << " setting omap flag on " << soid << dendl; + tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP); + ++tctx->delta_stats.num_objects_omap; + } + + results->fill_in_final_tx(tctx->op_t.get()); + if (results->started_temp_obj) { + tctx->discard_temp_oid = results->temp_oid; + } + tctx->new_obs.oi.size = results->object_size; + tctx->new_obs.oi.user_version = results->user_version; + if (results->is_data_digest()) { + tctx->new_obs.oi.set_data_digest(results->data_digest); + } else { + tctx->new_obs.oi.clear_data_digest(); + } + if (results->is_omap_digest()) { + tctx->new_obs.oi.set_omap_digest(results->omap_digest); + } else { + tctx->new_obs.oi.clear_omap_digest(); + } + tctx->new_obs.oi.truncate_seq = results->truncate_seq; + tctx->new_obs.oi.truncate_size = results->truncate_size; + + if (soid.snap != CEPH_NOSNAP) { + ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap)); + ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap)); + ceph_assert(obc->ssc->snapset.clone_size[soid.snap] == + results->object_size); + ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap)); + + tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap); + } else { + tctx->delta_stats.num_bytes += results->object_size; + } + } + + if (results->mirror_snapset) { + ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP); + tctx->new_snapset.from_snap_set( + results->snapset, + get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS); + } + dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl; 
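+  // the promote transaction is fully built at this point; after it is
+  // submitted below, poke the tier agent if it is idle so it can re-evaluate
+  // flush/evict mode for the newly promoted object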
+ + // take RWWRITE lock for duration of our local write. ignore starvation. + if (!tctx->lock_manager.take_write_lock( + obc->obs.oi.soid, + obc)) { + ceph_abort_msg("problem!"); + } + dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl; + + finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE); + + simple_opc_submit(std::move(tctx)); + + osd->logger->inc(l_osd_tier_promote); + + if (agent_state && + agent_state->is_idle()) + agent_choose_mode(); +} + +void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results, + ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + dout(10) << __func__ << " " << soid << " r=" << r + << " uv" << results->user_version << dendl; + + if (r == -ECANCELED || r == -EAGAIN) { + return; + } + + if (r < 0) { + derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl; + // pass error to everyone blocked on this object + // FIXME: this is pretty sloppy, but at this point we got + // something unexpected and don't have many other options. + map<hobject_t,list<OpRequestRef>>::iterator blocked_iter = + waiting_for_blocked_object.find(soid); + if (blocked_iter != waiting_for_blocked_object.end()) { + while (!blocked_iter->second.empty()) { + osd->reply_op_error(blocked_iter->second.front(), r); + blocked_iter->second.pop_front(); + } + waiting_for_blocked_object.erase(blocked_iter); + } + return; + } + + osd->promote_finish(results->object_size); + osd->logger->inc(l_osd_tier_promote); + + if (agent_state && + agent_state->is_idle()) + agent_choose_mode(); +} + +void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue, + vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << " " << cop->obc->obs.oi.soid + << " from " << cop->src << " " << cop->oloc + << " v" << cop->results.user_version << dendl; + + // cancel objecter op, if we can + if (cop->objecter_tid) { + tids->push_back(cop->objecter_tid); + cop->objecter_tid = 0; + if (cop->objecter_tid2) { + tids->push_back(cop->objecter_tid2); + cop->objecter_tid2 = 0; + } + } + + copy_ops.erase(cop->obc->obs.oi.soid); + cop->obc->stop_block(); + + kick_object_context_blocked(cop->obc); + cop->results.should_requeue = requeue; + CopyCallbackResults result(-ECANCELED, &cop->results); + cop->cb->complete(result); + + // There may still be an objecter callback referencing this copy op. + // That callback will not need the obc since it's been canceled, and + // we need the obc reference to go away prior to flush. + cop->obc = ObjectContextRef(); +} + +void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin(); + while (p != copy_ops.end()) { + // requeue this op? can I queue up all of them? + cancel_copy((p++)->second, requeue, tids); + } +} + + +// ======================================================================== +// flush +// +// Flush a dirty object in the cache tier by writing it back to the +// base tier. The sequence looks like: +// +// * send a copy-from operation to the base tier to copy the current +// version of the object +// * base tier will pull the object via (perhaps multiple) copy-get(s) +// * on completion, we check if the object has been modified. if so, +// just reply with -EAGAIN. +// * try to take a write lock so we can clear the dirty flag. if this +// fails, wait and retry +// * start a repop that clears the bit. +// +// If we have to wait, we will retry by coming back through the +// start_flush method. 
We check if a flush is already in progress +// and, if so, try to finish it by rechecking the version and trying +// to clear the dirty bit. +// +// In order for the cache-flush (a write op) to not block the copy-get +// from reading the object, the client *must* set the SKIPRWLOCKS +// flag. +// +// NOTE: normally writes are strictly ordered for the client, but +// flushes are special in that they can be reordered with respect to +// other writes. In particular, we can't have a flush request block +// an update to the cache pool object! + +struct C_Flush : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + utime_t start; + C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), start(ceph_clock_now()) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + pg->lock(); + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_flush(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start); + } + pg->unlock(); + } +}; + +int PrimaryLogPG::start_flush( + OpRequestRef op, ObjectContextRef obc, + bool blocking, hobject_t *pmissing, + boost::optional<std::function<void()>> &&on_flush) +{ + const object_info_t& oi = obc->obs.oi; + const hobject_t& soid = oi.soid; + dout(10) << __func__ << " " << soid + << " v" << oi.version + << " uv" << oi.user_version + << " " << (blocking ? "blocking" : "non-blocking/best-effort") + << dendl; + + // get a filtered snapset, need to remove removed snaps + SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info); + + // verify there are no (older) check for dirty clones + { + dout(20) << " snapset " << snapset << dendl; + vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin(); + while (p != snapset.clones.rend() && *p >= soid.snap) + ++p; + if (p != snapset.clones.rend()) { + hobject_t next = soid; + next.snap = *p; + ceph_assert(next.snap < soid.snap); + if (pg_log.get_missing().is_missing(next)) { + dout(10) << __func__ << " missing clone is " << next << dendl; + if (pmissing) + *pmissing = next; + return -ENOENT; + } + ObjectContextRef older_obc = get_object_context(next, false); + if (older_obc) { + dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi + << dendl; + if (older_obc->obs.oi.is_dirty()) { + dout(10) << __func__ << " next oldest clone is dirty: " + << older_obc->obs.oi << dendl; + return -EBUSY; + } + } else { + dout(20) << __func__ << " next oldest clone " << next + << " is not present; implicitly clean" << dendl; + } + } else { + dout(20) << __func__ << " no older clones" << dendl; + } + } + + if (blocking) + obc->start_block(); + + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid); + if (p != flush_ops.end()) { + FlushOpRef fop = p->second; + if (fop->op == op) { + // we couldn't take the write lock on a cache-try-flush before; + // now we are trying again for the lock. + return try_flush_mark_clean(fop); + } + if (fop->flushed_version == obc->obs.oi.user_version && + (fop->blocking || !blocking)) { + // nonblocking can join anything + // blocking can only join a blocking flush + dout(20) << __func__ << " piggybacking on existing flush " << dendl; + if (op) + fop->dup_ops.push_back(op); + return -EAGAIN; // clean up this ctx; op will retry later + } + + // cancel current flush since it will fail anyway, or because we + // are blocking and the existing flush is nonblocking. 
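+    // the superseded flush and anything piggybacked on it get -EBUSY, its
+    // objecter ops are cancelled, and we fall through to start a fresh flush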
+ dout(20) << __func__ << " canceling previous flush; it will fail" << dendl; + if (fop->op) + osd->reply_op_error(fop->op, -EBUSY); + while (!fop->dup_ops.empty()) { + osd->reply_op_error(fop->dup_ops.front(), -EBUSY); + fop->dup_ops.pop_front(); + } + vector<ceph_tid_t> tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + } + + if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) { + int r = start_manifest_flush(op, obc, blocking, std::move(on_flush)); + if (r != -EINPROGRESS) { + if (blocking) + obc->stop_block(); + } + return r; + } + + /** + * In general, we need to send a delete and a copyfrom. + * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)] + * where 4 is marked as clean. To flush 10, we have to: + * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4 + * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8 + * + * There is a complicating case. Supposed there had been a clone 7 + * for snaps [7, 6] which has been trimmed since they no longer exist. + * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit + * the delete, the snap will be promoted to 5, and the head will become + * a whiteout. When the copy-from goes through, we'll end up with + * 8:[8,4,3,2]:[4(4,3,2)]+head. + * + * Another complication is the case where there is an interval change + * after doing the delete and the flush but before marking the object + * clean. We'll happily delete head and then recreate it at the same + * sequence number, which works out ok. + */ + + SnapContext snapc, dsnapc; + if (snapset.seq != 0) { + if (soid.snap == CEPH_NOSNAP) { + snapc.seq = snapset.seq; + snapc.snaps = snapset.snaps; + } else { + snapid_t min_included_snap; + auto p = snapset.clone_snaps.find(soid.snap); + ceph_assert(p != snapset.clone_snaps.end()); + min_included_snap = p->second.back(); + snapc = snapset.get_ssc_as_of(min_included_snap - 1); + } + + snapid_t prev_snapc = 0; + for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin(); + citer != snapset.clones.rend(); + ++citer) { + if (*citer < soid.snap) { + prev_snapc = *citer; + break; + } + } + + dsnapc = snapset.get_ssc_as_of(prev_snapc); + } + + object_locator_t base_oloc(soid); + base_oloc.pool = pool.info.tier_of; + + if (dsnapc.seq < snapc.seq) { + ObjectOperation o; + o.remove(); + osd->objecter->mutate( + soid.oid, + base_oloc, + o, + dsnapc, + ceph::real_clock::from_ceph_timespec(oi.mtime), + (CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_ENFORCE_SNAPC), + NULL /* no callback, we'll rely on the ordering w.r.t the next op */); + } + + FlushOpRef fop(std::make_shared<FlushOp>()); + fop->obc = obc; + fop->flushed_version = oi.user_version; + fop->blocking = blocking; + fop->on_flush = std::move(on_flush); + fop->op = op; + + ObjectOperation o; + if (oi.is_whiteout()) { + fop->removal = true; + o.remove(); + } else { + object_locator_t oloc(soid); + o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version, + CEPH_OSD_COPY_FROM_FLAG_FLUSH | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE, + LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + + //mean the base tier don't cache data after this + if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) + o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED); + } + C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset()); + + unsigned n = 
info.pgid.hash_to_shard(osd->m_objecter_finishers); + ceph_tid_t tid = osd->objecter->mutate( + soid.oid, base_oloc, o, snapc, + ceph::real_clock::from_ceph_timespec(oi.mtime), + CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC, + new C_OnFinisher(fin, + osd->objecter_finishers[n])); + /* we're under the pg lock and fin->finish() is grabbing that */ + fin->tid = tid; + fop->objecter_tid = tid; + + flush_ops[soid] = fop; + info.stats.stats.sum.num_flush++; + info.stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10); + return -EINPROGRESS; +} + +void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush_op found" << dendl; + return; + } + FlushOpRef fop = p->second; + if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) { + dout(10) << __func__ << " tid " << tid << " != fop " << fop + << " tid " << fop->objecter_tid << dendl; + return; + } + ObjectContextRef obc = fop->obc; + fop->objecter_tid = 0; + + if (r < 0 && !(r == -ENOENT && fop->removal)) { + if (fop->op) + osd->reply_op_error(fop->op, -EBUSY); + if (fop->blocking) { + obc->stop_block(); + kick_object_context_blocked(obc); + } + + if (!fop->dup_ops.empty()) { + dout(20) << __func__ << " requeueing dups" << dendl; + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = boost::none; + } + flush_ops.erase(oid); + return; + } + + r = try_flush_mark_clean(fop); + if (r == -EBUSY && fop->op) { + osd->reply_op_error(fop->op, r); + } +} + +int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop) +{ + ObjectContextRef obc = fop->obc; + const hobject_t& oid = obc->obs.oi.soid; + + if (fop->blocking) { + obc->stop_block(); + kick_object_context_blocked(obc); + } + + if (fop->flushed_version != obc->obs.oi.user_version || + !obc->obs.exists) { + if (obc->obs.exists) + dout(10) << __func__ << " flushed_version " << fop->flushed_version + << " != current " << obc->obs.oi.user_version + << dendl; + else + dout(10) << __func__ << " object no longer exists" << dendl; + + if (!fop->dup_ops.empty()) { + dout(20) << __func__ << " requeueing dups" << dendl; + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = boost::none; + } + flush_ops.erase(oid); + if (fop->blocking) + osd->logger->inc(l_osd_tier_flush_fail); + else + osd->logger->inc(l_osd_tier_try_flush_fail); + return -EBUSY; + } + + if (!fop->blocking && + write_blocked_by_scrub(oid)) { + if (fop->op) { + dout(10) << __func__ << " blocked by scrub" << dendl; + requeue_op(fop->op); + requeue_ops(fop->dup_ops); + return -EAGAIN; // will retry + } else { + osd->logger->inc(l_osd_tier_try_flush_fail); + vector<ceph_tid_t> tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + return -ECANCELED; + } + } + + // successfully flushed, can we evict this object? 
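+  // fast path (non-manifest object, no client op attached): if the tier agent
+  // is in an evict mode, try to evict the now-clean object right away;
+  // otherwise fall through and clear the DIRTY flag with its own transaction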
+ if (!obc->obs.oi.has_manifest() && !fop->op && + agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && + agent_maybe_evict(obc, true)) { + osd->logger->inc(l_osd_tier_clean); + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = boost::none; + } + flush_ops.erase(oid); + return 0; + } + + dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl; + OpContextUPtr ctx = simple_opc_create(fop->obc); + + // successfully flushed; can we clear the dirty bit? + // try to take the lock manually, since we don't + // have a ctx yet. + if (ctx->lock_manager.get_lock_type( + ObjectContext::RWState::RWWRITE, + oid, + obc, + fop->op)) { + dout(20) << __func__ << " took write lock" << dendl; + } else if (fop->op) { + dout(10) << __func__ << " waiting on write lock " << fop->op << " " + << fop->dup_ops << dendl; + // fop->op is now waiting on the lock; get fop->dup_ops to wait too. + for (auto op : fop->dup_ops) { + bool locked = ctx->lock_manager.get_lock_type( + ObjectContext::RWState::RWWRITE, + oid, + obc, + op); + ceph_assert(!locked); + } + close_op_ctx(ctx.release()); + return -EAGAIN; // will retry + } else { + dout(10) << __func__ << " failed write lock, no op; failing" << dendl; + close_op_ctx(ctx.release()); + osd->logger->inc(l_osd_tier_try_flush_fail); + vector<ceph_tid_t> tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + return -ECANCELED; + } + + if (fop->on_flush) { + ctx->register_on_finish(*(fop->on_flush)); + fop->on_flush = boost::none; + } + + ctx->at_version = get_next_version(); + + ctx->new_obs = obc->obs; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + if (fop->obc->obs.oi.has_manifest()) { + ceph_assert(obc->obs.oi.manifest.is_chunked()); + PGTransaction* t = ctx->op_t.get(); + uint64_t chunks_size = 0; + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + chunks_size += p.second.length; + } + if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) { + t->omap_clear(oid); + ctx->new_obs.oi.clear_omap_digest(); + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + if (obc->obs.oi.size == chunks_size) { + t->truncate(oid, 0); + interval_set<uint64_t> trim; + trim.insert(0, ctx->new_obs.oi.size); + ctx->modified_ranges.union_of(trim); + truncate_update_size_and_usage(ctx->delta_stats, + ctx->new_obs.oi, + 0); + ctx->new_obs.oi.new_object(); + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + p.second.clear_flag(chunk_info_t::FLAG_DIRTY); + p.second.set_flag(chunk_info_t::FLAG_MISSING); + } + } else { + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + if (p.second.is_dirty()) { + dout(20) << __func__ << " offset: " << p.second.offset + << " length: " << p.second.length << dendl; + p.second.clear_flag(chunk_info_t::FLAG_DIRTY); + p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN + } + } + } + } + + finish_ctx(ctx.get(), pg_log_entry_t::CLEAN); + + osd->logger->inc(l_osd_tier_clean); + + if (!fop->dup_ops.empty() || fop->op) { + dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl; + list<OpRequestRef> ls; + if (fop->op) + ls.push_back(fop->op); + ls.splice(ls.end(), fop->dup_ops); + requeue_ops(ls); + } + + simple_opc_submit(std::move(ctx)); + + flush_ops.erase(oid); + + if (fop->blocking) + osd->logger->inc(l_osd_tier_flush); + else + osd->logger->inc(l_osd_tier_try_flush); + + return -EINPROGRESS; +} + +void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue, + vector<ceph_tid_t> *tids) +{ + 
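+  // gather every objecter tid this flush still has in flight (the copy-from
+  // or delete, plus any per-chunk manifest io_tids) for the caller to cancel,
+  // unblock the obc if we were blocking, optionally requeue the client op and
+  // its dups, and run the on_flush hook before dropping the FlushOp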
dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid " + << fop->objecter_tid << dendl; + if (fop->objecter_tid) { + tids->push_back(fop->objecter_tid); + fop->objecter_tid = 0; + } + if (fop->io_tids.size()) { + for (auto &p : fop->io_tids) { + tids->push_back(p.second); + p.second = 0; + } + } + if (fop->blocking && fop->obc->is_blocked()) { + fop->obc->stop_block(); + kick_object_context_blocked(fop->obc); + } + if (requeue) { + if (fop->op) + requeue_op(fop->op); + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = boost::none; + } + flush_ops.erase(fop->obc->obs.oi.soid); +} + +void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin(); + while (p != flush_ops.end()) { + cancel_flush((p++)->second, requeue, tids); + } +} + +bool PrimaryLogPG::is_present_clone(hobject_t coid) +{ + if (!pool.info.allow_incomplete_clones()) + return true; + if (is_missing_object(coid)) + return true; + ObjectContextRef obc = get_object_context(coid, false); + return obc && obc->obs.exists; +} + +// ======================================================================== +// rep op gather + +class C_OSD_RepopCommit : public Context { + PrimaryLogPGRef pg; + boost::intrusive_ptr<PrimaryLogPG::RepGather> repop; +public: + C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop) + : pg(pg), repop(repop) {} + void finish(int) override { + pg->repop_all_committed(repop.get()); + } +}; + +void PrimaryLogPG::repop_all_committed(RepGather *repop) +{ + dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed " + << dendl; + repop->all_committed = true; + if (!repop->rep_aborted) { + if (repop->v != eversion_t()) { + last_update_ondisk = repop->v; + last_complete_ondisk = repop->pg_local_last_complete; + } + eval_repop(repop); + } +} + +void PrimaryLogPG::op_applied(const eversion_t &applied_version) +{ + dout(10) << "op_applied version " << applied_version << dendl; + ceph_assert(applied_version != eversion_t()); + ceph_assert(applied_version <= info.last_update); + last_update_applied = applied_version; + if (is_primary()) { + if (scrubber.active) { + if (last_update_applied >= scrubber.subset_last_update) { + requeue_scrub(ops_blocked_by_scrub()); + } + } else { + ceph_assert(scrubber.start == scrubber.end); + } + } +} + +void PrimaryLogPG::eval_repop(RepGather *repop) +{ + const MOSDOp *m = NULL; + if (repop->op) + m = static_cast<const MOSDOp *>(repop->op->get_req()); + + if (m) + dout(10) << "eval_repop " << *repop << dendl; + else + dout(10) << "eval_repop " << *repop << " (no op)" << dendl; + + // ondisk? 
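+  // once every shard has committed: run the on_committed callbacks, answer
+  // dup ops parked in waiting_for_ondisk for this version (in submission
+  // order), and pop fully committed repops off the front of repop_queue so
+  // completions are delivered in order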
+ if (repop->all_committed) { + dout(10) << " commit: " << *repop << dendl; + for (auto p = repop->on_committed.begin(); + p != repop->on_committed.end(); + repop->on_committed.erase(p++)) { + (*p)(); + } + // send dup commits, in order + auto it = waiting_for_ondisk.find(repop->v); + if (it != waiting_for_ondisk.end()) { + ceph_assert(waiting_for_ondisk.begin()->first == repop->v); + for (auto& i : it->second) { + int return_code = repop->r; + if (return_code >= 0) { + return_code = std::get<2>(i); + } + osd->reply_op_error(std::get<0>(i), return_code, repop->v, + std::get<1>(i)); + } + waiting_for_ondisk.erase(it); + } + + publish_stats_to_osd(); + calc_min_last_complete_ondisk(); + + dout(10) << " removing " << *repop << dendl; + ceph_assert(!repop_queue.empty()); + dout(20) << " q front is " << *repop_queue.front() << dendl; + if (repop_queue.front() == repop) { + RepGather *to_remove = nullptr; + while (!repop_queue.empty() && + (to_remove = repop_queue.front())->all_committed) { + repop_queue.pop_front(); + for (auto p = to_remove->on_success.begin(); + p != to_remove->on_success.end(); + to_remove->on_success.erase(p++)) { + (*p)(); + } + remove_repop(to_remove); + } + } + } +} + +void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx) +{ + FUNCTRACE(cct); + const hobject_t& soid = ctx->obs->oi.soid; + dout(7) << "issue_repop rep_tid " << repop->rep_tid + << " o " << soid + << dendl; + + repop->v = ctx->at_version; + if (ctx->at_version > eversion_t()) { + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_info_t &pinfo = peer_info[*i]; + // keep peer_info up to date + if (pinfo.last_complete == pinfo.last_update) + pinfo.last_complete = ctx->at_version; + pinfo.last_update = ctx->at_version; + } + } + + ctx->op_t->add_obc(ctx->obc); + if (ctx->clone_obc) { + ctx->op_t->add_obc(ctx->clone_obc); + } + if (ctx->head_obc) { + ctx->op_t->add_obc(ctx->head_obc); + } + + Context *on_all_commit = new C_OSD_RepopCommit(this, repop); + if (!(ctx->log.empty())) { + ceph_assert(ctx->at_version >= projected_last_update); + projected_last_update = ctx->at_version; + } + for (auto &&entry: ctx->log) { + projected_log.add(entry); + } + + bool requires_missing_loc = false; + for (set<pg_shard_t>::iterator i = async_recovery_targets.begin(); + i != async_recovery_targets.end(); + ++i) { + if (*i == get_primary() || !peer_missing[*i].is_missing(soid)) continue; + requires_missing_loc = true; + for (auto &&entry: ctx->log) { + peer_missing[*i].add_next_event(entry); + } + } + + if (requires_missing_loc) { + for (auto &&entry: ctx->log) { + dout(30) << __func__ << " missing_loc before: " + << missing_loc.get_locations(entry.soid) << dendl; + missing_loc.add_missing(entry.soid, entry.version, + eversion_t(), entry.is_delete()); + // clear out missing_loc + missing_loc.clear_location(entry.soid); + for (auto &i: actingset) { + if (!peer_missing[i].is_missing(entry.soid)) + missing_loc.add_location(entry.soid, i); + } + dout(30) << __func__ << " missing_loc after: " + << missing_loc.get_locations(entry.soid) << dendl; + } + } + + pgbackend->submit_transaction( + soid, + ctx->delta_stats, + ctx->at_version, + std::move(ctx->op_t), + pg_trim_to, + min_last_complete_ondisk, + ctx->log, + ctx->updated_hset_history, + on_all_commit, + repop->rep_tid, + ctx->reqid, + ctx->op); +} + +PrimaryLogPG::RepGather *PrimaryLogPG::new_repop( + OpContext *ctx, ObjectContextRef obc, + ceph_tid_t rep_tid) +{ 
+ if (ctx->op) + dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl; + else + dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl; + + RepGather *repop = new RepGather( + ctx, rep_tid, info.last_complete); + + repop->start = ceph_clock_now(); + + repop_queue.push_back(&repop->queue_item); + repop->get(); + + osd->logger->inc(l_osd_op_wip); + + dout(10) << __func__ << ": " << *repop << dendl; + return repop; +} + +boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop( + eversion_t version, + int r, + ObcLockManager &&manager, + OpRequestRef &&op, + boost::optional<std::function<void(void)> > &&on_complete) +{ + RepGather *repop = new RepGather( + std::move(manager), + std::move(op), + std::move(on_complete), + osd->get_tid(), + info.last_complete, + r); + repop->v = version; + + repop->start = ceph_clock_now(); + + repop_queue.push_back(&repop->queue_item); + + osd->logger->inc(l_osd_op_wip); + + dout(10) << __func__ << ": " << *repop << dendl; + return boost::intrusive_ptr<RepGather>(repop); +} + +void PrimaryLogPG::remove_repop(RepGather *repop) +{ + dout(20) << __func__ << " " << *repop << dendl; + + for (auto p = repop->on_finish.begin(); + p != repop->on_finish.end(); + repop->on_finish.erase(p++)) { + (*p)(); + } + + release_object_locks( + repop->lock_manager); + repop->put(); + + osd->logger->dec(l_osd_op_wip); +} + +PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc) +{ + dout(20) << __func__ << " " << obc->obs.oi.soid << dendl; + ceph_tid_t rep_tid = osd->get_tid(); + osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); + OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this)); + ctx->op_t.reset(new PGTransaction()); + ctx->mtime = ceph_clock_now(); + return ctx; +} + +void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx) +{ + RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid); + dout(20) << __func__ << " " << repop << dendl; + issue_repop(repop, ctx.get()); + eval_repop(repop); + if (hard_limit_pglog()) + calc_trim_to_aggressive(); + else + calc_trim_to(); + repop->put(); +} + + +void PrimaryLogPG::submit_log_entries( + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + ObcLockManager &&manager, + boost::optional<std::function<void(void)> > &&_on_complete, + OpRequestRef op, + int r) +{ + dout(10) << __func__ << " " << entries << dendl; + ceph_assert(is_primary()); + + eversion_t version; + if (!entries.empty()) { + ceph_assert(entries.rbegin()->version >= projected_last_update); + version = projected_last_update = entries.rbegin()->version; + } + + boost::intrusive_ptr<RepGather> repop; + boost::optional<std::function<void(void)> > on_complete; + if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) { + repop = new_repop( + version, + r, + std::move(manager), + std::move(op), + std::move(_on_complete)); + } else { + on_complete = std::move(_on_complete); + } + + pgbackend->call_write_ordered( + [this, entries, repop, on_complete]() { + ObjectStore::Transaction t; + eversion_t old_last_update = info.last_update; + merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk); + + + set<pg_shard_t> waiting_on; + for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + pg_shard_t peer(*i); + if (peer == pg_whoami) continue; + ceph_assert(peer_missing.count(peer)); + ceph_assert(peer_info.count(peer)); + if (get_osdmap()->require_osd_release >= 
CEPH_RELEASE_JEWEL) { + ceph_assert(repop); + MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing( + entries, + spg_t(info.pgid.pgid, i->shard), + pg_whoami.shard, + get_osdmap_epoch(), + last_peering_reset, + repop->rep_tid, + pg_trim_to, + min_last_complete_ondisk); + osd->send_message_osd_cluster( + peer.osd, m, get_osdmap_epoch()); + waiting_on.insert(peer); + } else { + MOSDPGLog *m = new MOSDPGLog( + peer.shard, pg_whoami.shard, + info.last_update.epoch, + info, last_peering_reset); + m->log.log = entries; + m->log.tail = old_last_update; + m->log.head = info.last_update; + osd->send_message_osd_cluster( + peer.osd, m, get_osdmap_epoch()); + } + } + ceph_tid_t rep_tid = repop->rep_tid; + waiting_on.insert(pg_whoami); + log_entry_update_waiting_on.insert( + make_pair( + rep_tid, + LogUpdateCtx{std::move(repop), std::move(waiting_on)} + )); + struct OnComplete : public Context { + PrimaryLogPGRef pg; + ceph_tid_t rep_tid; + epoch_t epoch; + OnComplete( + PrimaryLogPGRef pg, + ceph_tid_t rep_tid, + epoch_t epoch) + : pg(pg), rep_tid(rep_tid), epoch(epoch) {} + void finish(int) override { + pg->lock(); + if (!pg->pg_has_reset_since(epoch)) { + auto it = pg->log_entry_update_waiting_on.find(rep_tid); + ceph_assert(it != pg->log_entry_update_waiting_on.end()); + auto it2 = it->second.waiting_on.find(pg->pg_whoami); + ceph_assert(it2 != it->second.waiting_on.end()); + it->second.waiting_on.erase(it2); + if (it->second.waiting_on.empty()) { + pg->repop_all_committed(it->second.repop.get()); + pg->log_entry_update_waiting_on.erase(it); + } + } + pg->unlock(); + } + }; + t.register_on_commit( + new OnComplete{this, rep_tid, get_osdmap_epoch()}); + int r = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(r == 0); + op_applied(info.last_update); + }); + + if (hard_limit_pglog()) + calc_trim_to_aggressive(); + else + calc_trim_to(); +} + +void PrimaryLogPG::cancel_log_updates() +{ + // get rid of all the LogUpdateCtx so their references to repops are + // dropped + log_entry_update_waiting_on.clear(); +} + +// ------------------------------------------------------- + +void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls) +{ + lock(); + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) { + ObjectContextRef obc(i.second); + get_obc_watchers(obc, *ls); + } + unlock(); +} + +void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers) +{ + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = + obc->watchers.begin(); + j != obc->watchers.end(); + ++j) { + obj_watch_item_t owi; + + owi.obj = obc->obs.oi.soid; + owi.wi.addr = j->second->get_peer_addr(); + owi.wi.name = j->second->get_entity(); + owi.wi.cookie = j->second->get_cookie(); + owi.wi.timeout_seconds = j->second->get_timeout(); + + dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr + << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl; + + pg_watchers.push_back(owi); + } +} + +void PrimaryLogPG::check_blacklisted_watchers() +{ + dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl; + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) + check_blacklisted_obc_watchers(i.second); +} + +void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc) +{ + dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl; + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k = + 
obc->watchers.begin(); + k != obc->watchers.end(); + ) { + //Advance iterator now so handle_watch_timeout() can erase element + map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++; + dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl; + entity_addr_t ea = j->second->get_peer_addr(); + dout(30) << "watch: Check entity_addr_t " << ea << dendl; + if (get_osdmap()->is_blacklisted(ea)) { + dout(10) << "watch: Found blacklisted watcher for " << ea << dendl; + ceph_assert(j->second->get_pg() == this); + j->second->unregister_cb(); + handle_watch_timeout(j->second); + } + } +} + +void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc) +{ + ceph_assert(is_active()); + auto it_objects = pg_log.get_log().objects.find(obc->obs.oi.soid); + ceph_assert((recovering.count(obc->obs.oi.soid) || + !is_missing_object(obc->obs.oi.soid)) || + (it_objects != pg_log.get_log().objects.end() && // or this is a revert... see recover_primary() + it_objects->second->op == + pg_log_entry_t::LOST_REVERT && + it_objects->second->reverting_to == + obc->obs.oi.version)); + + dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl; + ceph_assert(obc->watchers.empty()); + // populate unconnected_watchers + for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p = + obc->obs.oi.watchers.begin(); + p != obc->obs.oi.watchers.end(); + ++p) { + utime_t expire = info.stats.last_became_active; + expire += p->second.timeout_seconds; + dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl; + WatchRef watch( + Watch::makeWatchRef( + this, osd, obc, p->second.timeout_seconds, p->first.first, + p->first.second, p->second.addr)); + watch->disconnect(); + obc->watchers.insert( + make_pair( + make_pair(p->first.first, p->first.second), + watch)); + } + // Look for watchers from blacklisted clients and drop + check_blacklisted_obc_watchers(obc); +} + +void PrimaryLogPG::handle_watch_timeout(WatchRef watch) +{ + ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref + dout(10) << "handle_watch_timeout obc " << obc << dendl; + + if (!is_active()) { + dout(10) << "handle_watch_timeout not active, no-op" << dendl; + return; + } + if (!obc->obs.exists) { + dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl; + return; + } + if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) { + callbacks_for_degraded_object[obc->obs.oi.soid].push_back( + watch->get_delayed_cb() + ); + dout(10) << "handle_watch_timeout waiting for degraded on obj " + << obc->obs.oi.soid + << dendl; + return; + } + + if (write_blocked_by_scrub(obc->obs.oi.soid)) { + dout(10) << "handle_watch_timeout waiting for scrub on obj " + << obc->obs.oi.soid + << dendl; + scrubber.add_callback( + watch->get_delayed_cb() // This callback! 
+ ); + return; + } + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->at_version = get_next_version(); + + object_info_t& oi = ctx->new_obs.oi; + oi.watchers.erase(make_pair(watch->get_cookie(), + watch->get_entity())); + + list<watch_disconnect_t> watch_disconnects = { + watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true) + }; + ctx->register_on_success( + [this, obc, watch_disconnects]() { + complete_disconnect_watches(obc, watch_disconnects); + }); + + + PGTransaction *t = ctx->op_t.get(); + ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid, + ctx->at_version, + oi.version, + 0, + osd_reqid_t(), ctx->mtime, 0)); + + oi.prior_version = obc->obs.oi.version; + oi.version = ctx->at_version; + bufferlist bl; + encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + t->setattr(obc->obs.oi.soid, OI_ATTR, bl); + + // apply new object state. + ctx->obc->obs = ctx->new_obs; + + // no ctx->delta_stats + simple_opc_submit(std::move(ctx)); +} + +ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi, + SnapSetContext *ssc) +{ + ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid)); + ceph_assert(obc->destructor_callback == NULL); + obc->destructor_callback = new C_PG_ObjectContext(this, obc.get()); + obc->obs.oi = oi; + obc->obs.exists = false; + obc->ssc = ssc; + if (ssc) + register_snapset_context(ssc); + dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl; + if (is_active()) + populate_obc_watchers(obc); + return obc; +} + +ObjectContextRef PrimaryLogPG::get_object_context( + const hobject_t& soid, + bool can_create, + const map<string, bufferlist> *attrs) +{ + auto it_objects = pg_log.get_log().objects.find(soid); + ceph_assert( + attrs || !pg_log.get_missing().is_missing(soid) || + // or this is a revert... see recover_primary() + (it_objects != pg_log.get_log().objects.end() && + it_objects->second->op == + pg_log_entry_t::LOST_REVERT)); + ObjectContextRef obc = object_contexts.lookup(soid); + osd->logger->inc(l_osd_object_ctx_cache_total); + if (obc) { + osd->logger->inc(l_osd_object_ctx_cache_hit); + dout(10) << __func__ << ": found obc in cache: " << obc + << dendl; + } else { + dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl; + // check disk + bufferlist bv; + if (attrs) { + auto it_oi = attrs->find(OI_ATTR); + ceph_assert(it_oi != attrs->end()); + bv = it_oi->second; + } else { + int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv); + if (r < 0) { + if (!can_create) { + dout(10) << __func__ << ": no obc for soid " + << soid << " and !can_create" + << dendl; + return ObjectContextRef(); // -ENOENT! + } + + dout(10) << __func__ << ": no obc for soid " + << soid << " but can_create" + << dendl; + // new object. + object_info_t oi(soid); + SnapSetContext *ssc = get_snapset_context( + soid, true, 0, false); + ceph_assert(ssc); + obc = create_object_context(oi, ssc); + dout(10) << __func__ << ": " << obc << " " << soid + << " " << obc->rwstate + << " oi: " << obc->obs.oi + << " ssc: " << obc->ssc + << " snapset: " << obc->ssc->snapset << dendl; + return obc; + } + } + + object_info_t oi; + try { + bufferlist::const_iterator bliter = bv.begin(); + decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": obc corrupt: " << soid << dendl; + return ObjectContextRef(); // -ENOENT! 
+ } + + ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool()); + + obc = object_contexts.lookup_or_create(oi.soid); + obc->destructor_callback = new C_PG_ObjectContext(this, obc.get()); + obc->obs.oi = oi; + obc->obs.exists = true; + + obc->ssc = get_snapset_context( + soid, true, + soid.has_snapset() ? attrs : 0); + + if (is_active()) + populate_obc_watchers(obc); + + if (pool.info.is_erasure()) { + if (attrs) { + obc->attr_cache = *attrs; + } else { + int r = pgbackend->objects_get_attrs( + soid, + &obc->attr_cache); + ceph_assert(r == 0); + } + } + + dout(10) << __func__ << ": creating obc from disk: " << obc + << dendl; + } + + // XXX: Caller doesn't expect this + if (obc->ssc == NULL) { + derr << __func__ << ": obc->ssc not available, not returning context" << dendl; + return ObjectContextRef(); // -ENOENT! + } + + dout(10) << __func__ << ": " << obc << " " << soid + << " " << obc->rwstate + << " oi: " << obc->obs.oi + << " exists: " << (int)obc->obs.exists + << " ssc: " << obc->ssc + << " snapset: " << obc->ssc->snapset << dendl; + return obc; +} + +void PrimaryLogPG::context_registry_on_change() +{ + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) { + ObjectContextRef obc(i.second); + if (obc) { + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = + obc->watchers.begin(); + j != obc->watchers.end(); + obc->watchers.erase(j++)) { + j->second->discard(); + } + } + } +} + + +/* + * If we return an error, and set *pmissing, then promoting that + * object may help. + * + * If we return -EAGAIN, we will always set *pmissing to the missing + * object to wait for. + * + * If we return an error but do not set *pmissing, then we know the + * object does not exist. + */ +int PrimaryLogPG::find_object_context(const hobject_t& oid, + ObjectContextRef *pobc, + bool can_create, + bool map_snapid_to_clone, + hobject_t *pmissing) +{ + FUNCTRACE(cct); + ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool())); + // want the head? + if (oid.snap == CEPH_NOSNAP) { + ObjectContextRef obc = get_object_context(oid, can_create); + if (!obc) { + if (pmissing) + *pmissing = oid; + return -ENOENT; + } + dout(10) << __func__ << " " << oid + << " @" << oid.snap + << " oi=" << obc->obs.oi + << dendl; + *pobc = obc; + + return 0; + } + + hobject_t head = oid.get_head(); + + // we want a snap + if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) { + dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl; + return -ENOENT; + } + + SnapSetContext *ssc = get_snapset_context(oid, can_create); + if (!ssc || !(ssc->exists || can_create)) { + dout(20) << __func__ << " " << oid << " no snapset" << dendl; + if (pmissing) + *pmissing = head; // start by getting the head + if (ssc) + put_snapset_context(ssc); + return -ENOENT; + } + + if (map_snapid_to_clone) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " map_snapid_to_clone=true" << dendl; + if (oid.snap > ssc->snapset.seq) { + // already must be readable + ObjectContextRef obc = get_object_context(head, false); + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to head" << dendl; + *pobc = obc; + put_snapset_context(ssc); + return (obc && obc->obs.exists) ? 
0 : -ENOENT; + } else { + vector<snapid_t>::const_iterator citer = std::find( + ssc->snapset.clones.begin(), + ssc->snapset.clones.end(), + oid.snap); + if (citer == ssc->snapset.clones.end()) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to nothing" << dendl; + put_snapset_context(ssc); + return -ENOENT; + } + + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to " << oid << dendl; + + if (pg_log.get_missing().is_missing(oid)) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " is missing" << dendl; + if (pmissing) + *pmissing = oid; + put_snapset_context(ssc); + return -EAGAIN; + } + + ObjectContextRef obc = get_object_context(oid, false); + if (!obc || !obc->obs.exists) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " is not present" << dendl; + if (pmissing) + *pmissing = oid; + put_snapset_context(ssc); + return -ENOENT; + } + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " HIT" << dendl; + *pobc = obc; + put_snapset_context(ssc); + return 0; + } + ceph_abort(); //unreachable + } + + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset << dendl; + + // head? + if (oid.snap > ssc->snapset.seq) { + ObjectContextRef obc = get_object_context(head, false); + dout(10) << __func__ << " " << head + << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq + << " -- HIT " << obc->obs + << dendl; + if (!obc->ssc) + obc->ssc = ssc; + else { + ceph_assert(ssc == obc->ssc); + put_snapset_context(ssc); + } + *pobc = obc; + return 0; + } + + // which clone would it be? 
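// [Editor's note -- illustrative example, not part of the upstream source]
// The clones vector is sorted oldest-to-newest, so the loop below selects the
// first clone whose id is >= oid.snap; that clone's clone_snaps entry then
// decides whether it actually covers oid.snap. For example, with
// clones = {2, 5} and clone 5 carrying snaps [5,4,3] (stored newest-first, so
// first = back() = 3), a read at snap 4 selects clone 5 and is a HIT because
// 3 <= 4; a read at snap 1 selects clone 2, and if clone 2 only carries
// snap 2 the result is DNE because first (2) > 1.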
+ unsigned k = 0; + while (k < ssc->snapset.clones.size() && + ssc->snapset.clones[k] < oid.snap) + k++; + if (k == ssc->snapset.clones.size()) { + dout(10) << __func__ << " no clones with last >= oid.snap " + << oid.snap << " -- DNE" << dendl; + put_snapset_context(ssc); + return -ENOENT; + } + hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(), + info.pgid.pool(), oid.get_namespace()); + + if (pg_log.get_missing().is_missing(soid)) { + dout(20) << __func__ << " " << soid << " missing, try again later" + << dendl; + if (pmissing) + *pmissing = soid; + put_snapset_context(ssc); + return -EAGAIN; + } + + ObjectContextRef obc = get_object_context(soid, false); + if (!obc || !obc->obs.exists) { + if (pmissing) + *pmissing = soid; + put_snapset_context(ssc); + if (is_degraded_or_backfilling_object(soid)) { + dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl; + return -EAGAIN; + } else if (is_degraded_on_async_recovery_target(soid)) { + dout(20) << __func__ << " clone is recovering " << soid << dendl; + return -EAGAIN; + } else { + dout(20) << __func__ << " missing clone " << soid << dendl; + return -ENOENT; + } + } + + if (!obc->ssc) { + obc->ssc = ssc; + } else { + ceph_assert(obc->ssc == ssc); + put_snapset_context(ssc); + } + ssc = 0; + + // clone + dout(20) << __func__ << " " << soid + << " snapset " << obc->ssc->snapset + << dendl; + snapid_t first, last; + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); + if (p->second.empty()) { + dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl; + ceph_assert(!cct->_conf->osd_debug_verify_snaps); + return -ENOENT; + } + first = p->second.back(); + last = p->second.front(); + if (first <= oid.snap) { + dout(20) << __func__ << " " << soid << " [" << first << "," << last + << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl; + *pobc = obc; + return 0; + } else { + dout(20) << __func__ << " " << soid << " [" << first << "," << last + << "] does not contain " << oid.snap << " -- DNE" << dendl; + return -ENOENT; + } +} + +void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc) +{ + if (obc->ssc) + put_snapset_context(obc->ssc); +} + +void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat) +{ + object_info_t& oi = obc->obs.oi; + + dout(10) << __func__ << " " << oi.soid << dendl; + ceph_assert(!oi.soid.is_snapdir()); + + object_stat_sum_t stat; + stat.num_objects++; + if (oi.is_dirty()) + stat.num_objects_dirty++; + if (oi.is_whiteout()) + stat.num_whiteouts++; + if (oi.is_omap()) + stat.num_objects_omap++; + if (oi.is_cache_pinned()) + stat.num_objects_pinned++; + if (oi.has_manifest()) + stat.num_objects_manifest++; + + if (oi.soid.is_snap()) { + stat.num_object_clones++; + + if (!obc->ssc) + obc->ssc = get_snapset_context(oi.soid, false); + ceph_assert(obc->ssc); + stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap); + } else { + stat.num_bytes += oi.size; + } + + // add it in + pgstat->stats.sum.add(stat); +} + +void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + if (obc->is_blocked()) { + dout(10) << __func__ << " " << soid << " still blocked" << dendl; + return; + } + + map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid); + if (p != waiting_for_blocked_object.end()) { + list<OpRequestRef>& ls = p->second; + dout(10) << __func__ << " " << 
soid << " requeuing " << ls.size() << " requests" << dendl; + requeue_ops(ls); + waiting_for_blocked_object.erase(p); + } + + map<hobject_t, ObjectContextRef>::iterator i = + objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head()); + if (i != objects_blocked_on_snap_promotion.end()) { + ceph_assert(i->second == obc); + objects_blocked_on_snap_promotion.erase(i); + } + + if (obc->requeue_scrub_on_unblock) { + obc->requeue_scrub_on_unblock = false; + // only requeue if we are still active: we may be unblocking + // because we are resetting for a new peering interval + if (is_active()) { + requeue_scrub(); + } + } +} + +SnapSetContext *PrimaryLogPG::get_snapset_context( + const hobject_t& oid, + bool can_create, + const map<string, bufferlist> *attrs, + bool oid_existed) +{ + std::lock_guard l(snapset_contexts_lock); + SnapSetContext *ssc; + map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find( + oid.get_snapdir()); + if (p != snapset_contexts.end()) { + if (can_create || p->second->exists) { + ssc = p->second; + } else { + return NULL; + } + } else { + bufferlist bv; + if (!attrs) { + int r = -ENOENT; + if (!(oid.is_head() && !oid_existed)) { + r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv); + } + if (r < 0 && !can_create) + return NULL; + } else { + auto it_ss = attrs->find(SS_ATTR); + ceph_assert(it_ss != attrs->end()); + bv = it_ss->second; + } + ssc = new SnapSetContext(oid.get_snapdir()); + _register_snapset_context(ssc); + if (bv.length()) { + bufferlist::const_iterator bvp = bv.begin(); + try { + ssc->snapset.decode(bvp); + } catch (buffer::error& e) { + dout(0) << __func__ << " Can't decode snapset: " << e << dendl; + return NULL; + } + ssc->exists = true; + } else { + ssc->exists = false; + } + } + ceph_assert(ssc); + ssc->ref++; + return ssc; +} + +void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc) +{ + std::lock_guard l(snapset_contexts_lock); + --ssc->ref; + if (ssc->ref == 0) { + if (ssc->registered) + snapset_contexts.erase(ssc->oid); + delete ssc; + } +} + +/* + * Return values: + * NONE - didn't pull anything + * YES - pulled what the caller wanted + * HEAD - needed to pull head first + */ +enum { PULL_NONE, PULL_HEAD, PULL_YES }; + +int PrimaryLogPG::recover_missing( + const hobject_t &soid, eversion_t v, + int priority, + PGBackend::RecoveryHandle *h) +{ + if (missing_loc.is_unfound(soid)) { + dout(7) << __func__ << " " << soid + << " v " << v + << " but it is unfound" << dendl; + return PULL_NONE; + } + + if (missing_loc.is_deleted(soid)) { + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, ObjectContextRef())); + epoch_t cur_epoch = get_osdmap_epoch(); + remove_missing_object(soid, v, new FunctionContext( + [=](int) { + lock(); + if (!pg_has_reset_since(cur_epoch)) { + bool object_missing = false; + for (const auto& shard : acting_recovery_backfill) { + if (shard == pg_whoami) + continue; + if (peer_missing[shard].is_missing(soid)) { + dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl; + object_missing = true; + break; + } + } + if (!object_missing) { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + if (scrub_after_recovery) + stat_diff.num_objects_repaired = 1; + on_global_recover(soid, stat_diff, true); + } else { + auto recovery_handle = pgbackend->open_recovery_op(); + pgbackend->recover_delete_object(soid, v, recovery_handle); + pgbackend->run_recovery_op(recovery_handle, priority); + } + } + 
unlock(); + })); + return PULL_YES; + } + + // is this a snapped object? if so, consult the snapset.. we may not need the entire object! + ObjectContextRef obc; + ObjectContextRef head_obc; + if (soid.snap && soid.snap < CEPH_NOSNAP) { + // do we have the head? + hobject_t head = soid.get_head(); + if (pg_log.get_missing().is_missing(head)) { + if (recovering.count(head)) { + dout(10) << " missing but already recovering head " << head << dendl; + return PULL_NONE; + } else { + int r = recover_missing( + head, pg_log.get_missing().get_items().find(head)->second.need, priority, + h); + if (r != PULL_NONE) + return PULL_HEAD; + return PULL_NONE; + } + } + head_obc = get_object_context( + head, + false, + 0); + ceph_assert(head_obc); + } + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, obc)); + int r = pgbackend->recover_object( + soid, + v, + head_obc, + obc, + h); + // This is only a pull which shouldn't return an error + ceph_assert(r >= 0); + return PULL_YES; +} + +void PrimaryLogPG::remove_missing_object(const hobject_t &soid, + eversion_t v, Context *on_complete) +{ + dout(20) << __func__ << " " << soid << " " << v << dendl; + ceph_assert(on_complete != nullptr); + // delete locally + ObjectStore::Transaction t; + remove_snap_mapped_object(t, soid); + + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = v; + + epoch_t cur_epoch = get_osdmap_epoch(); + t.register_on_complete(new FunctionContext( + [=](int) { + lock(); + if (!pg_has_reset_since(cur_epoch)) { + ObjectStore::Transaction t2; + on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2); + t2.register_on_complete(on_complete); + int r = osd->store->queue_transaction(ch, std::move(t2), nullptr); + ceph_assert(r == 0); + unlock(); + } else { + unlock(); + on_complete->complete(-EAGAIN); + } + })); + int r = osd->store->queue_transaction(ch, std::move(t), nullptr); + ceph_assert(r == 0); +} + +void PrimaryLogPG::finish_degraded_object(const hobject_t oid) +{ + dout(10) << __func__ << " " << oid << dendl; + if (callbacks_for_degraded_object.count(oid)) { + list<Context*> contexts; + contexts.swap(callbacks_for_degraded_object[oid]); + callbacks_for_degraded_object.erase(oid); + for (list<Context*>::iterator i = contexts.begin(); + i != contexts.end(); + ++i) { + (*i)->complete(0); + } + } + map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find( + oid.get_head()); + if (i != objects_blocked_on_degraded_snap.end() && + i->second == oid.snap) + objects_blocked_on_degraded_snap.erase(i); +} + +void PrimaryLogPG::_committed_pushed_object( + epoch_t epoch, eversion_t last_complete) +{ + lock(); + if (!pg_has_reset_since(epoch)) { + dout(10) << __func__ << " last_complete " << last_complete << " now ondisk" << dendl; + last_complete_ondisk = last_complete; + + if (last_complete_ondisk == info.last_update) { + if (!is_primary()) { + // Either we are a replica or backfill target. + // we are fully up to date. tell the primary! 
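// [Editor's note] The MOSDPGTrim sent below is how a replica or backfill
// target reports its last_complete_ondisk back to the primary; the primary
// folds these reports into min_last_complete_ondisk (the primary's own case
// is handled by calc_min_last_complete_ondisk() in the else branch), which in
// turn bounds how far the shared PG log may be trimmed.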
+ osd->send_message_osd_cluster( + get_primary().osd, + new MOSDPGTrim( + get_osdmap_epoch(), + spg_t(info.pgid.pgid, get_primary().shard), + last_complete_ondisk), + get_osdmap_epoch()); + } else { + calc_min_last_complete_ondisk(); + } + } + + } else { + dout(10) << __func__ << " pg has changed, not touching last_complete_ondisk" << dendl; + } + + unlock(); +} + +void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc) +{ + dout(20) << __func__ << dendl; + if (obc) { + dout(20) << "obc = " << *obc << dendl; + } + ceph_assert(active_pushes >= 1); + --active_pushes; + + // requeue an active chunky scrub waiting on recovery ops + if (!deleting && active_pushes == 0 + && scrubber.is_chunky_scrub_active()) { + requeue_scrub(ops_blocked_by_scrub()); + } +} + +void PrimaryLogPG::_applied_recovered_object_replica() +{ + dout(20) << __func__ << dendl; + ceph_assert(active_pushes >= 1); + --active_pushes; + + // requeue an active chunky scrub waiting on recovery ops + if (!deleting && active_pushes == 0 && + scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>( + scrubber.active_rep_scrub->get_req())->chunky) { + auto& op = scrubber.active_rep_scrub; + osd->enqueue_back( + OpQueueItem( + unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)), + op->get_req()->get_cost(), + op->get_req()->get_priority(), + op->get_req()->get_recv_stamp(), + op->get_req()->get_source().num(), + get_osdmap_epoch())); + scrubber.active_rep_scrub.reset(); + } +} + +void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v) +{ + dout(10) << "got missing " << oid << " v " << v << dendl; + pg_log.recover_got(oid, v, info); + if (pg_log.get_log().log.empty()) { + dout(10) << "last_complete now " << info.last_complete + << " while log is empty" << dendl; + } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) { + dout(10) << "last_complete now " << info.last_complete + << " log.complete_to " << pg_log.get_log().complete_to->version + << dendl; + } else { + dout(10) << "last_complete now " << info.last_complete + << " log.complete_to at end" << dendl; + //below is not true in the repair case. + //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong. + ceph_assert(info.last_complete == info.last_update); + } +} + +void PrimaryLogPG::primary_failed(const hobject_t &soid) +{ + list<pg_shard_t> fl = { pg_whoami }; + failed_push(fl, soid); +} + +void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, + const hobject_t &soid, const eversion_t &need) +{ + dout(20) << __func__ << ": " << soid << dendl; + ceph_assert(recovering.count(soid)); + auto obc = recovering[soid]; + if (obc) { + list<OpRequestRef> blocked_ops; + obc->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + recovering.erase(soid); + for (auto&& i : from) { + missing_loc.remove_location(soid, i); + if (need != eversion_t()) { + dout(0) << __func__ << " adding " << soid << " to shard " << i + << "'s missing set too" << dendl; + auto pm = peer_missing.find(i); + if (pm != peer_missing.end()) + pm->second.add(soid, need, eversion_t(), false); + } + } + dout(0) << __func__ << " " << soid << " from shard " << from + << ", reps on " << missing_loc.get_locations(soid) + << " unfound? 
" << missing_loc.is_unfound(soid) << dendl; + finish_recovery_op(soid); // close out this attempt, +} + +eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid) +{ + eversion_t v; + pg_missing_item pmi; + bool is_missing = pg_log.get_missing().is_missing(oid, &pmi); + ceph_assert(is_missing); + v = pmi.have; + dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl; + + ceph_assert(!acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + if (!peer_missing[peer].is_missing(oid)) { + continue; + } + eversion_t h = peer_missing[peer].get_items().at(oid).have; + dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl; + if (h > v) + v = h; + } + + dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl; + return v; +} + +void PrimaryLogPG::do_update_log_missing(OpRequestRef &op) +{ + const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>( + op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING); + ObjectStore::Transaction t; + boost::optional<eversion_t> op_trim_to, op_roll_forward_to; + if (m->pg_trim_to != eversion_t()) + op_trim_to = m->pg_trim_to; + if (m->pg_roll_forward_to != eversion_t()) + op_roll_forward_to = m->pg_roll_forward_to; + + dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl; + + append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to); + eversion_t new_lcod = info.last_complete; + + Context *complete = new FunctionContext( + [=](int) { + const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>( + op->get_req()); + lock(); + if (!pg_has_reset_since(msg->get_epoch())) { + update_last_complete_ondisk(new_lcod); + MOSDPGUpdateLogMissingReply *reply = + new MOSDPGUpdateLogMissingReply( + spg_t(info.pgid.pgid, primary_shard().shard), + pg_whoami.shard, + msg->get_epoch(), + msg->min_epoch, + msg->get_tid(), + new_lcod); + reply->set_priority(CEPH_MSG_PRIO_HIGH); + msg->get_connection()->send_message(reply); + } + unlock(); + }); + + if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) { + t.register_on_commit(complete); + } else { + /* Hack to work around the fact that ReplicatedBackend sends + * ack+commit if commit happens first + * + * This behavior is no longer necessary, but we preserve it so old + * primaries can keep their repops in order */ + if (pool.info.is_erasure()) { + t.register_on_complete(complete); + } else { + t.register_on_commit(complete); + } + } + int tr = osd->store->queue_transaction( + ch, + std::move(t), + nullptr); + ceph_assert(tr == 0); + op_applied(info.last_update); +} + +void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op) +{ + const MOSDPGUpdateLogMissingReply *m = + static_cast<const MOSDPGUpdateLogMissingReply*>( + op->get_req()); + dout(20) << __func__ << " got reply from " + << m->get_from() << dendl; + + auto it = log_entry_update_waiting_on.find(m->get_tid()); + if (it != log_entry_update_waiting_on.end()) { + if (it->second.waiting_on.count(m->get_from())) { + it->second.waiting_on.erase(m->get_from()); + if (m->last_complete_ondisk != eversion_t()) { + update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk); + } + } else { + osd->clog->error() + << info.pgid << " got reply " 
+ << *m << " from shard we are not waiting for " + << m->get_from(); + } + + if (it->second.waiting_on.empty()) { + repop_all_committed(it->second.repop.get()); + log_entry_update_waiting_on.erase(it); + } + } else { + osd->clog->error() + << info.pgid << " got reply " + << *m << " on unknown tid " << m->get_tid(); + } +} + +/* Mark all unfound objects as lost. + */ +void PrimaryLogPG::mark_all_unfound_lost( + int what, + ConnectionRef con, + ceph_tid_t tid) +{ + dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl; + list<hobject_t> oids; + + dout(30) << __func__ << ": log before:\n"; + pg_log.get_log().print(*_dout); + *_dout << dendl; + + mempool::osd_pglog::list<pg_log_entry_t> log_entries; + + utime_t mtime = ceph_clock_now(); + map<hobject_t, pg_missing_item>::const_iterator m = + missing_loc.get_needs_recovery().begin(); + map<hobject_t, pg_missing_item>::const_iterator mend = + missing_loc.get_needs_recovery().end(); + + ObcLockManager manager; + eversion_t v = get_next_version(); + v.epoch = get_osdmap_epoch(); + uint64_t num_unfound = missing_loc.num_unfound(); + while (m != mend) { + const hobject_t &oid(m->first); + if (!missing_loc.is_unfound(oid)) { + // We only care about unfound objects + ++m; + continue; + } + + ObjectContextRef obc; + eversion_t prev; + + switch (what) { + case pg_log_entry_t::LOST_MARK: + ceph_abort_msg("actually, not implemented yet!"); + break; + + case pg_log_entry_t::LOST_REVERT: + prev = pick_newest_available(oid); + if (prev > eversion_t()) { + // log it + pg_log_entry_t e( + pg_log_entry_t::LOST_REVERT, oid, v, + m->second.need, 0, osd_reqid_t(), mtime, 0); + e.reverting_to = prev; + e.mark_unrollbackable(); + log_entries.push_back(e); + dout(10) << e << dendl; + + // we are now missing the new version; recovery code will sort it out. + ++v.version; + ++m; + break; + } + + case pg_log_entry_t::LOST_DELETE: + { + pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need, + 0, osd_reqid_t(), mtime, 0); + if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) { + if (pool.info.require_rollback()) { + e.mod_desc.try_rmobject(v.version); + } else { + e.mark_unrollbackable(); + } + } // otherwise, just do what we used to do + dout(10) << e << dendl; + log_entries.push_back(e); + oids.push_back(oid); + + // If context found mark object as deleted in case + // of racing with new creation. This can happen if + // object lost and EIO at primary. 
+ obc = object_contexts.lookup(oid); + if (obc) + obc->obs.exists = false; + + ++v.version; + ++m; + } + break; + + default: + ceph_abort(); + } + } + + info.stats.stats_invalid = true; + + submit_log_entries( + log_entries, + std::move(manager), + boost::optional<std::function<void(void)> >( + [this, oids, con, num_unfound, tid]() { + if (perform_deletes_during_peering()) { + for (auto oid : oids) { + // clear old locations - merge_new_log_entries will have + // handled rebuilding missing_loc for each of these + // objects if we have the RECOVERY_DELETES flag + missing_loc.recovered(oid); + } + } + + if (is_recovery_unfound()) { + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + DoRecovery()))); + } else if (is_backfill_unfound()) { + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + RequestBackfill()))); + } else { + queue_recovery(); + } + + stringstream ss; + ss << "pg has " << num_unfound + << " objects unfound and apparently lost marking"; + string rs = ss.str(); + dout(0) << "do_command r=" << 0 << " " << rs << dendl; + osd->clog->info() << rs; + if (con) { + MCommandReply *reply = new MCommandReply(0, rs); + reply->set_tid(tid); + con->send_message(reply); + } + }), + OpRequestRef()); +} + +void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits) +{ + ceph_assert(repop_queue.empty()); +} + +/* + * pg status change notification + */ + +void PrimaryLogPG::apply_and_flush_repops(bool requeue) +{ + list<OpRequestRef> rq; + + // apply all repops + while (!repop_queue.empty()) { + RepGather *repop = repop_queue.front(); + repop_queue.pop_front(); + dout(10) << " canceling repop tid " << repop->rep_tid << dendl; + repop->rep_aborted = true; + repop->on_committed.clear(); + repop->on_success.clear(); + + if (requeue) { + if (repop->op) { + dout(10) << " requeuing " << *repop->op->get_req() << dendl; + rq.push_back(repop->op); + repop->op = OpRequestRef(); + } + + // also requeue any dups, interleaved into position + auto p = waiting_for_ondisk.find(repop->v); + if (p != waiting_for_ondisk.end()) { + dout(10) << " also requeuing ondisk waiters " << p->second << dendl; + for (auto& i : p->second) { + rq.push_back(std::get<0>(i)); + } + waiting_for_ondisk.erase(p); + } + } + + remove_repop(repop); + } + + ceph_assert(repop_queue.empty()); + + if (requeue) { + requeue_ops(rq); + if (!waiting_for_ondisk.empty()) { + for (auto& i : waiting_for_ondisk) { + for (auto& j : i.second) { + derr << __func__ << ": op " << *(std::get<0>(j)->get_req()) + << " waiting on " << i.first << dendl; + } + } + ceph_assert(waiting_for_ondisk.empty()); + } + } + + waiting_for_ondisk.clear(); +} + +void PrimaryLogPG::on_flushed() +{ + ceph_assert(flushes_in_progress > 0); + flushes_in_progress--; + if (flushes_in_progress == 0) { + requeue_ops(waiting_for_flush); + } + if (!is_peered() || !is_primary()) { + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) { + derr << __func__ << ": object " << i.first << " obc still alive" << dendl; + } + ceph_assert(object_contexts.empty()); + } +} + +void PrimaryLogPG::on_removal(ObjectStore::Transaction *t) +{ + dout(10) << __func__ << dendl; + + // adjust info to backfill + info.set_last_backfill(hobject_t()); + pg_log.reset_backfill(); + dirty_info = true; + + // clear log + PGLogEntryHandler rollbacker{this, t}; + pg_log.roll_forward(&rollbacker); + + on_shutdown(); +} + 
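// [Editor's note] Minimal standalone sketch, not upstream code, of the
// completion idiom used repeatedly in this file (remove_missing_object(),
// _committed_pushed_object(), do_update_log_missing()): capture the epoch at
// submit time and, when the callback fires, take the PG lock and bail out if
// the PG has been reset since, so a stale completion never touches state that
// belongs to a newer peering interval. ToyPG and make_guarded_completion are
// made-up names used purely for illustration.

#include <functional>
#include <mutex>

struct ToyPG {
  std::mutex pg_lock;
  unsigned last_reset_epoch = 0;               // bumped on every interval change
  bool has_reset_since(unsigned e) const { return e < last_reset_epoch; }
};

// Build a completion bound to the epoch that was current at submit time.
inline std::function<void(int)> make_guarded_completion(
    ToyPG& pg, unsigned cur_epoch, std::function<void()> body) {
  return [&pg, cur_epoch, body](int /*result*/) {
    std::lock_guard<std::mutex> l(pg.pg_lock);
    if (!pg.has_reset_since(cur_epoch))
      body();                                  // same interval: safe to apply
    // otherwise the completion is dropped, as the real callbacks above do
  };
}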
+void PrimaryLogPG::clear_async_reads() +{ + dout(10) << __func__ << dendl; + for(auto& i : in_progress_async_reads) { + dout(10) << "clear ctx: " + << "OpRequestRef " << i.first + << " OpContext " << i.second + << dendl; + close_op_ctx(i.second); + } +} + +void PrimaryLogPG::clear_cache() +{ + object_contexts.clear(); +} + +void PrimaryLogPG::on_shutdown() +{ + dout(10) << __func__ << dendl; + + // handles queue races + deleting = true; + + if (recovery_queued) { + recovery_queued = false; + osd->clear_queued_recovery(this); + } + + clear_scrub_reserved(); + scrub_clear_state(); + + unreg_next_scrub(); + + vector<ceph_tid_t> tids; + cancel_copy_ops(false, &tids); + cancel_flush_ops(false, &tids); + cancel_proxy_ops(false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + + apply_and_flush_repops(false); + cancel_log_updates(); + // we must remove PGRefs, so do this this prior to release_backoffs() callers + clear_backoffs(); + // clean up snap trim references + snap_trimmer_machine.process_event(Reset()); + + pgbackend->on_change(); + + context_registry_on_change(); + object_contexts.clear(); + + clear_async_reads(); + + osd->remote_reserver.cancel_reservation(info.pgid); + osd->local_reserver.cancel_reservation(info.pgid); + + clear_primary_state(); + cancel_recovery(); + + if (is_primary()) { + osd->clear_ready_to_merge(this); + } +} + +void PrimaryLogPG::on_activate() +{ + // all clean? + if (needs_recovery()) { + dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + DoRecovery()))); + } else if (needs_backfill()) { + dout(10) << "activate queueing backfill" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + RequestBackfill()))); + } else { + dout(10) << "activate all replicas clean, no recovery" << dendl; + eio_errors_to_process = false; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + AllReplicasRecovered()))); + } + + publish_stats_to_osd(); + + if (!backfill_targets.empty()) { + last_backfill_started = earliest_backfill(); + new_backfill = true; + ceph_assert(!last_backfill_started.is_max()); + dout(5) << __func__ << ": bft=" << backfill_targets + << " from " << last_backfill_started << dendl; + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + dout(5) << "target shard " << *i + << " from " << peer_info[*i].last_backfill + << dendl; + } + } + + hit_set_setup(); + agent_setup(); +} + +void PrimaryLogPG::_on_new_interval() +{ + dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl; + if (!pg_log.get_missing().may_include_deletes && + get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) { + pg_log.rebuild_missing_set_with_deletes(osd->store, ch, info); + } + ceph_assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)); +} + +void PrimaryLogPG::on_change(ObjectStore::Transaction *t) +{ + dout(10) << __func__ << dendl; + + if (hit_set && hit_set->insert_count() == 0) { + dout(20) << " discarding empty hit_set" << dendl; + hit_set_clear(); + } + + if (recovery_queued) { + recovery_queued = false; + osd->clear_queued_recovery(this); + } + + // requeue everything in the reverse order they should be + // reexamined. 
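// [Editor's note] requeue_ops() re-inserts a list at the front of the op
// queue, so the list requeued last is the one examined first. Requeueing
// waiting_for_peered, then waiting_for_flush, then waiting_for_active below
// therefore re-examines ops in the order active -> flush -> peered: ops that
// had made it furthest through the gates keep their place ahead of ops that
// were blocked earlier.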
+ requeue_ops(waiting_for_peered); + requeue_ops(waiting_for_flush); + requeue_ops(waiting_for_active); + + clear_scrub_reserved(); + + vector<ceph_tid_t> tids; + cancel_copy_ops(is_primary(), &tids); + cancel_flush_ops(is_primary(), &tids); + cancel_proxy_ops(is_primary(), &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + + // requeue object waiters + for (auto& p : waiting_for_unreadable_object) { + release_backoffs(p.first); + } + if (is_primary()) { + requeue_object_waiters(waiting_for_unreadable_object); + } else { + waiting_for_unreadable_object.clear(); + } + for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin(); + p != waiting_for_degraded_object.end(); + waiting_for_degraded_object.erase(p++)) { + release_backoffs(p->first); + if (is_primary()) + requeue_ops(p->second); + else + p->second.clear(); + finish_degraded_object(p->first); + } + + // requeues waiting_for_scrub + scrub_clear_state(); + + for (auto p = waiting_for_blocked_object.begin(); + p != waiting_for_blocked_object.end(); + waiting_for_blocked_object.erase(p++)) { + if (is_primary()) + requeue_ops(p->second); + else + p->second.clear(); + } + for (auto i = callbacks_for_degraded_object.begin(); + i != callbacks_for_degraded_object.end(); + ) { + finish_degraded_object((i++)->first); + } + ceph_assert(callbacks_for_degraded_object.empty()); + + if (is_primary()) { + requeue_ops(waiting_for_cache_not_full); + } else { + waiting_for_cache_not_full.clear(); + } + objects_blocked_on_cache_full.clear(); + + for (list<pair<OpRequestRef, OpContext*> >::iterator i = + in_progress_async_reads.begin(); + i != in_progress_async_reads.end(); + in_progress_async_reads.erase(i++)) { + close_op_ctx(i->second); + if (is_primary()) + requeue_op(i->first); + } + + // this will requeue ops we were working on but didn't finish, and + // any dups + apply_and_flush_repops(is_primary()); + cancel_log_updates(); + + // do this *after* apply_and_flush_repops so that we catch any newly + // registered watches. + context_registry_on_change(); + + pgbackend->on_change_cleanup(t); + scrubber.cleanup_store(t); + pgbackend->on_change(); + + // clear snap_trimmer state + snap_trimmer_machine.process_event(Reset()); + + debug_op_order.clear(); + unstable_stats.clear(); + + // we don't want to cache object_contexts through the interval change + // NOTE: we actually assert that all currently live references are dead + // by the time the flush for the next interval completes. + object_contexts.clear(); + + // should have been cleared above by finishing all of the degraded objects + ceph_assert(objects_blocked_on_degraded_snap.empty()); +} + +void PrimaryLogPG::on_role_change() +{ + dout(10) << __func__ << dendl; + if (get_role() != 0 && hit_set) { + dout(10) << " clearing hit set" << dendl; + hit_set_clear(); + } +} + +void PrimaryLogPG::on_pool_change() +{ + dout(10) << __func__ << dendl; + // requeue cache full waiters just in case the cache_mode is + // changing away from writeback mode. note that if we are not + // active the normal requeuing machinery is sufficient (and properly + // ordered). + if (is_active() && + pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK && + !waiting_for_cache_not_full.empty()) { + dout(10) << __func__ << " requeuing full waiters (not in writeback) " + << dendl; + requeue_ops(waiting_for_cache_not_full); + objects_blocked_on_cache_full.clear(); + } + hit_set_setup(); + agent_setup(); +} + +// clear state. called on recovery completion AND cancellation. 
+void PrimaryLogPG::_clear_recovery_state() +{ + missing_loc.clear(); +#ifdef DEBUG_RECOVERY_OIDS + recovering_oids.clear(); +#endif + last_backfill_started = hobject_t(); + set<hobject_t>::iterator i = backfills_in_flight.begin(); + while (i != backfills_in_flight.end()) { + ceph_assert(recovering.count(*i)); + backfills_in_flight.erase(i++); + } + + list<OpRequestRef> blocked_ops; + for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin(); + i != recovering.end(); + recovering.erase(i++)) { + if (i->second) { + i->second->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + } + ceph_assert(backfills_in_flight.empty()); + pending_backfill_updates.clear(); + ceph_assert(recovering.empty()); + pgbackend->clear_recovery_state(); +} + +void PrimaryLogPG::cancel_pull(const hobject_t &soid) +{ + dout(20) << __func__ << ": " << soid << dendl; + ceph_assert(recovering.count(soid)); + ObjectContextRef obc = recovering[soid]; + if (obc) { + list<OpRequestRef> blocked_ops; + obc->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + recovering.erase(soid); + finish_recovery_op(soid); + release_backoffs(soid); + if (waiting_for_degraded_object.count(soid)) { + dout(20) << " kicking degraded waiters on " << soid << dendl; + requeue_ops(waiting_for_degraded_object[soid]); + waiting_for_degraded_object.erase(soid); + } + if (waiting_for_unreadable_object.count(soid)) { + dout(20) << " kicking unreadable waiters on " << soid << dendl; + requeue_ops(waiting_for_unreadable_object[soid]); + waiting_for_unreadable_object.erase(soid); + } + if (is_missing_object(soid)) + pg_log.set_last_requested(0); // get recover_primary to start over + finish_degraded_object(soid); +} + +void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap) +{ + /* + * check that any peers we are planning to (or currently) pulling + * objects from are dealt with. + */ + missing_loc.check_recovery_sources(osdmap); + pgbackend->check_recovery_sources(osdmap); + + for (set<pg_shard_t>::iterator i = peer_log_requested.begin(); + i != peer_log_requested.end(); + ) { + if (!osdmap->is_up(i->osd)) { + dout(10) << "peer_log_requested removing " << *i << dendl; + peer_log_requested.erase(i++); + } else { + ++i; + } + } + + for (set<pg_shard_t>::iterator i = peer_missing_requested.begin(); + i != peer_missing_requested.end(); + ) { + if (!osdmap->is_up(i->osd)) { + dout(10) << "peer_missing_requested removing " << *i << dendl; + peer_missing_requested.erase(i++); + } else { + ++i; + } + } +} + +bool PrimaryLogPG::start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, + uint64_t *ops_started) +{ + uint64_t& started = *ops_started; + started = 0; + bool work_in_progress = false; + bool recovery_started = false; + ceph_assert(is_primary()); + ceph_assert(is_peered()); + ceph_assert(!is_deleting()); + + ceph_assert(recovery_queued); + recovery_queued = false; + + if (!state_test(PG_STATE_RECOVERING) && + !state_test(PG_STATE_BACKFILLING)) { + /* TODO: I think this case is broken and will make do_recovery() + * unhappy since we're returning false */ + dout(10) << "recovery raced and were queued twice, ignoring!" << dendl; + return have_unfound(); + } + + const auto &missing = pg_log.get_missing(); + + uint64_t num_unfound = get_num_unfound(); + + if (!missing.have_missing()) { + info.last_complete = info.last_update; + } + + if (!missing.have_missing() || // Primary does not have missing + all_missing_unfound()) { // or all of the missing objects are unfound. + // Recover the replicas. 
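// [Editor's note] The sequence below: recover_replicas() runs first only when
// the primary has nothing missing (or all of its missing objects are unfound);
// if that started no ops, recover_primary() pulls the primary's own missing
// objects; and if still nothing started but the unfound count changed in the
// process, recover_replicas() is retried once.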
+ started = recover_replicas(max, handle, &recovery_started); + } + if (!started) { + // We still have missing objects that we should grab from replicas. + started += recover_primary(max, handle); + } + if (!started && num_unfound != get_num_unfound()) { + // second chance to recovery replicas + started = recover_replicas(max, handle, &recovery_started); + } + + if (started || recovery_started) + work_in_progress = true; + + bool deferred_backfill = false; + if (recovering.empty() && + state_test(PG_STATE_BACKFILLING) && + !backfill_targets.empty() && started < max && + missing.num_missing() == 0 && + waiting_on_backfill.empty()) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) { + dout(10) << "deferring backfill due to NOBACKFILL" << dendl; + deferred_backfill = true; + } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) && + !is_degraded()) { + dout(10) << "deferring backfill due to NOREBALANCE" << dendl; + deferred_backfill = true; + } else if (!backfill_reserved) { + dout(10) << "deferring backfill due to !backfill_reserved" << dendl; + if (!backfill_reserving) { + dout(10) << "queueing RequestBackfill" << dendl; + backfill_reserving = true; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + RequestBackfill()))); + } + deferred_backfill = true; + } else { + started += recover_backfill(max - started, handle, &work_in_progress); + } + } + + dout(10) << " started " << started << dendl; + osd->logger->inc(l_osd_rop, started); + + if (!recovering.empty() || + work_in_progress || recovery_ops_active > 0 || deferred_backfill) + return !work_in_progress && have_unfound(); + + ceph_assert(recovering.empty()); + ceph_assert(recovery_ops_active == 0); + + dout(10) << __func__ << " needs_recovery: " + << missing_loc.get_needs_recovery() + << dendl; + dout(10) << __func__ << " missing_loc: " + << missing_loc.get_missing_locs() + << dendl; + int unfound = get_num_unfound(); + if (unfound) { + dout(10) << " still have " << unfound << " unfound" << dendl; + return true; + } + + if (missing.num_missing() > 0) { + // this shouldn't happen! + osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with " + << missing.num_missing() << ": " << missing.get_items(); + return false; + } + + if (needs_recovery()) { + // this shouldn't happen! 
+ // We already checked num_missing() so we must have missing replicas + osd->clog->error() << info.pgid + << " Unexpected Error: recovery ending with missing replicas"; + return false; + } + + if (state_test(PG_STATE_RECOVERING)) { + state_clear(PG_STATE_RECOVERING); + state_clear(PG_STATE_FORCED_RECOVERY); + if (needs_backfill()) { + dout(10) << "recovery done, queuing backfill" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + RequestBackfill()))); + } else { + dout(10) << "recovery done, no backfill" << dendl; + eio_errors_to_process = false; + state_clear(PG_STATE_FORCED_BACKFILL); + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + AllReplicasRecovered()))); + } + } else { // backfilling + state_clear(PG_STATE_BACKFILLING); + state_clear(PG_STATE_FORCED_BACKFILL); + state_clear(PG_STATE_FORCED_RECOVERY); + dout(10) << "recovery done, backfill done" << dendl; + eio_errors_to_process = false; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + Backfilled()))); + } + + return false; +} + +/** + * do one recovery op. + * return true if done, false if nothing left to do. + */ +uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle) +{ + ceph_assert(is_primary()); + + const auto &missing = pg_log.get_missing(); + + dout(10) << __func__ << " recovering " << recovering.size() + << " in pg," + << " missing " << missing << dendl; + + dout(25) << __func__ << " " << missing.get_items() << dendl; + + // look at log! + pg_log_entry_t *latest = 0; + unsigned started = 0; + int skipped = 0; + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + map<version_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(pg_log.get_log().last_requested); + while (p != missing.get_rmissing().end()) { + handle.reset_tp_timeout(); + hobject_t soid; + version_t v = p->first; + + auto it_objects = pg_log.get_log().objects.find(p->second); + if (it_objects != pg_log.get_log().objects.end()) { + latest = it_objects->second; + ceph_assert(latest->is_update() || latest->is_delete()); + soid = latest->soid; + } else { + latest = 0; + soid = p->second; + } + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + + hobject_t head = soid.get_head(); + + eversion_t need = item.need; + + dout(10) << __func__ << " " + << soid << " " << item.need + << (missing.is_missing(soid) ? " (missing)":"") + << (missing.is_missing(head) ? " (missing head)":"") + << (recovering.count(soid) ? " (recovering)":"") + << (recovering.count(head) ? " (recovering head)":"") + << dendl; + + if (latest) { + switch (latest->op) { + case pg_log_entry_t::CLONE: + /* + * Handling for this special case removed for now, until we + * can correctly construct an accurate SnapSet from the old + * one. 
+ */ + break; + + case pg_log_entry_t::LOST_REVERT: + { + if (item.have == latest->reverting_to) { + ObjectContextRef obc = get_object_context(soid, true); + + if (obc->obs.oi.version == latest->version) { + // I'm already reverting + dout(10) << " already reverting " << soid << dendl; + } else { + dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl; + obc->obs.oi.version = latest->version; + + ObjectStore::Transaction t; + bufferlist b2; + obc->obs.oi.encode( + b2, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + ceph_assert(!pool.info.require_rollback()); + t.setattr(coll, ghobject_t(soid), OI_ATTR, b2); + + recover_got(soid, latest->version); + missing_loc.add_location(soid, pg_whoami); + + ++active_pushes; + + t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc)); + t.register_on_commit(new C_OSD_CommittedPushedObject( + this, + get_osdmap_epoch(), + info.last_complete)); + osd->store->queue_transaction(ch, std::move(t)); + continue; + } + } else { + /* + * Pull the old version of the object. Update missing_loc here to have the location + * of the version we want. + * + * This doesn't use the usual missing_loc paths, but that's okay: + * - if we have it locally, we hit the case above, and go from there. + * - if we don't, we always pass through this case during recovery and set up the location + * properly. + * - this way we don't need to mangle the missing code to be general about needing an old + * version... + */ + eversion_t alternate_need = latest->reverting_to; + dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl; + + for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin(); + p != peer_missing.end(); + ++p) + if (p->second.is_missing(soid, need) && + p->second.get_items().at(soid).have == alternate_need) { + missing_loc.add_location(soid, p->first); + } + dout(10) << " will pull " << alternate_need << " or " << need + << " from one of " << missing_loc.get_locations(soid) + << dendl; + } + } + break; + } + } + + if (!recovering.count(soid)) { + if (recovering.count(head)) { + ++skipped; + } else { + int r = recover_missing( + soid, need, get_recovery_op_priority(), h); + switch (r) { + case PULL_YES: + ++started; + break; + case PULL_HEAD: + ++started; + case PULL_NONE: + ++skipped; + break; + default: + ceph_abort(); + } + if (started >= max) + break; + } + } + + // only advance last_requested if we haven't skipped anything + if (!skipped) + pg_log.set_last_requested(v); + } + + pgbackend->run_recovery_op(h, get_recovery_op_priority()); + return started; +} + +bool PrimaryLogPG::primary_error( + const hobject_t& soid, eversion_t v) +{ + pg_log.missing_add(soid, v, eversion_t()); + pg_log.set_last_requested(0); + missing_loc.remove_location(soid, pg_whoami); + bool uhoh = true; + ceph_assert(!acting_recovery_backfill.empty()); + for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); + i != acting_recovery_backfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + if (!peer_missing[peer].is_missing(soid, v)) { + missing_loc.add_location(soid, peer); + dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v + << ", there should be a copy on shard " << peer << dendl; + uhoh = false; + } + } + if (uhoh) + osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound"; + else + osd->clog->error() << info.pgid << " missing primary copy of " << soid + << ", will try copies on " << 
missing_loc.get_locations(soid); + return uhoh; +} + +int PrimaryLogPG::prep_object_replica_deletes( + const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started) +{ + ceph_assert(is_primary()); + dout(10) << __func__ << ": on " << soid << dendl; + + ObjectContextRef obc = get_object_context(soid, false); + if (obc) { + if (!obc->get_recovery_read()) { + dout(20) << "replica delete delayed on " << soid + << "; could not get rw_manager lock" << dendl; + *work_started = true; + return 0; + } else { + dout(20) << "replica delete got recovery read lock on " << soid + << dendl; + } + } + + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + if (!obc) + recovering.insert(make_pair(soid, ObjectContextRef())); + else + recovering.insert(make_pair(soid, obc)); + + pgbackend->recover_delete_object(soid, v, h); + return 1; +} + +int PrimaryLogPG::prep_object_replica_pushes( + const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started) +{ + ceph_assert(is_primary()); + dout(10) << __func__ << ": on " << soid << dendl; + + // NOTE: we know we will get a valid oloc off of disk here. + ObjectContextRef obc = get_object_context(soid, false); + if (!obc) { + primary_error(soid, v); + return 0; + } + + if (!obc->get_recovery_read()) { + dout(20) << "recovery delayed on " << soid + << "; could not get rw_manager lock" << dendl; + *work_started = true; + return 0; + } else { + dout(20) << "recovery got recovery read lock on " << soid + << dendl; + } + + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, obc)); + + /* We need this in case there is an in progress write on the object. In fact, + * the only possible write is an update to the xattr due to a lost_revert -- + * a client write would be blocked since the object is degraded. + * In almost all cases, therefore, this lock should be uncontended. + */ + int r = pgbackend->recover_object( + soid, + v, + ObjectContextRef(), + obc, // has snapset context + h); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl; + primary_failed(soid); + primary_error(soid, v); + return 0; + } + return 1; +} + +uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle, + bool *work_started) +{ + dout(10) << __func__ << "(" << max << ")" << dendl; + uint64_t started = 0; + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + + // this is FAR from an optimal recovery order. pretty lame, really. + ceph_assert(!acting_recovery_backfill.empty()); + // choose replicas to recover, replica has the shortest missing list first + // so we can bring it back to normal ASAP + std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing, + async_by_num_missing; + replicas_by_num_missing.reserve(acting_recovery_backfill.size() - 1); + for (auto &p: acting_recovery_backfill) { + if (p == get_primary()) { + continue; + } + auto pm = peer_missing.find(p); + ceph_assert(pm != peer_missing.end()); + auto nm = pm->second.num_missing(); + if (nm != 0) { + if (async_recovery_targets.count(p)) { + async_by_num_missing.push_back(make_pair(nm, p)); + } else { + replicas_by_num_missing.push_back(make_pair(nm, p)); + } + } + } + // sort by number of missing objects, in ascending order. 
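// [Editor's note] Both vectors are sorted with the comparator below (fewest
// missing objects first) and then concatenated with the acting replicas ahead
// of the async_recovery_targets, so regular acting-set members with the
// shortest missing lists are brought back to a clean state first.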
+ auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs, + const std::pair<unsigned int, pg_shard_t> &rhs) { + return lhs.first < rhs.first; + }; + // acting goes first + std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func); + // then async_recovery_targets + std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func); + replicas_by_num_missing.insert(replicas_by_num_missing.end(), + async_by_num_missing.begin(), async_by_num_missing.end()); + for (auto &replica: replicas_by_num_missing) { + pg_shard_t &peer = replica.second; + ceph_assert(peer != get_primary()); + map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer); + ceph_assert(pm != peer_missing.end()); + map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer); + ceph_assert(pi != peer_info.end()); + size_t m_sz = pm->second.num_missing(); + + dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl; + dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl; + + // oldest first! + const pg_missing_t &m(pm->second); + for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin(); + p != m.get_rmissing().end() && started < max; + ++p) { + handle.reset_tp_timeout(); + const hobject_t soid(p->second); + + if (missing_loc.is_unfound(soid)) { + dout(10) << __func__ << ": " << soid << " still unfound" << dendl; + continue; + } + + if (soid > pi->second.last_backfill) { + if (!recovering.count(soid)) { + derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl; + derr << __func__ << ": object added to missing set for backfill, but " + << "is not in recovering, error!" << dendl; + ceph_abort(); + } + continue; + } + + if (recovering.count(soid)) { + dout(10) << __func__ << ": already recovering " << soid << dendl; + continue; + } + + if (missing_loc.is_deleted(soid)) { + dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl; + map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid); + started += prep_object_replica_deletes(soid, r->second.need, h, work_started); + continue; + } + + if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) { + dout(10) << __func__ << ": " << soid.get_head() + << " still missing on primary" << dendl; + continue; + } + + if (pg_log.get_missing().is_missing(soid)) { + dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl; + continue; + } + + dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl; + map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid); + started += prep_object_replica_pushes(soid, r->second.need, h, work_started); + } + } + + pgbackend->run_recovery_op(h, get_recovery_op_priority()); + return started; +} + +hobject_t PrimaryLogPG::earliest_peer_backfill() const +{ + hobject_t e = hobject_t::get_max(); + for (set<pg_shard_t>::const_iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t peer = *i; + map<pg_shard_t, BackfillInterval>::const_iterator iter = + peer_backfill_info.find(peer); + ceph_assert(iter != peer_backfill_info.end()); + if (iter->second.begin < e) + e = iter->second.begin; + } + return e; +} + +bool PrimaryLogPG::all_peer_done() const +{ + // Primary hasn't got any more objects + ceph_assert(backfill_info.empty()); + + for (set<pg_shard_t>::const_iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = 
*i; + map<pg_shard_t, BackfillInterval>::const_iterator piter = + peer_backfill_info.find(bt); + ceph_assert(piter != peer_backfill_info.end()); + const BackfillInterval& pbi = piter->second; + // See if peer has more to process + if (!pbi.extends_to_end() || !pbi.empty()) + return false; + } + return true; +} + +/** + * recover_backfill + * + * Invariants: + * + * backfilled: fully pushed to replica or present in replica's missing set (both + * our copy and theirs). + * + * All objects on a backfill_target in + * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed + * objects have been actually deleted and all logically-valid objects are replicated. + * There may be PG objects in this interval yet to be backfilled. + * + * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all + * backfill_targets. There may be objects on backfill_target(s) yet to be deleted. + * + * For a backfill target, all objects < std::min(peer_backfill_info[target].begin, + * backfill_info.begin) in PG are backfilled. No deleted objects in this + * interval remain on the backfill target. + * + * For a backfill target, all objects <= peer_info[target].last_backfill + * have been backfilled to target + * + * There *MAY* be missing/outdated objects between last_backfill_started and + * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client + * io created objects since the last scan. For this reason, we call + * update_range() again before continuing backfill. + */ +uint64_t PrimaryLogPG::recover_backfill( + uint64_t max, + ThreadPool::TPHandle &handle, bool *work_started) +{ + dout(10) << __func__ << " (" << max << ")" + << " bft=" << backfill_targets + << " last_backfill_started " << last_backfill_started + << (new_backfill ? " new_backfill":"") + << dendl; + ceph_assert(!backfill_targets.empty()); + + // Initialize from prior backfill state + if (new_backfill) { + // on_activate() was called prior to getting here + ceph_assert(last_backfill_started == earliest_backfill()); + new_backfill = false; + + // initialize BackfillIntervals + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + peer_backfill_info[*i].reset(peer_info[*i].last_backfill); + } + backfill_info.reset(last_backfill_started); + + backfills_in_flight.clear(); + pending_backfill_updates.clear(); + } + + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + dout(10) << "peer osd." 
<< *i + << " info " << peer_info[*i] + << " interval " << peer_backfill_info[*i].begin + << "-" << peer_backfill_info[*i].end + << " " << peer_backfill_info[*i].objects.size() << " objects" + << dendl; + } + + // update our local interval to cope with recent changes + backfill_info.begin = last_backfill_started; + update_range(&backfill_info, handle); + + unsigned ops = 0; + vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove; + set<hobject_t> add_to_stat; + + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + peer_backfill_info[*i].trim_to( + std::max(peer_info[*i].last_backfill, last_backfill_started)); + } + backfill_info.trim_to(last_backfill_started); + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + while (ops < max) { + if (backfill_info.begin <= earliest_peer_backfill() && + !backfill_info.extends_to_end() && backfill_info.empty()) { + hobject_t next = backfill_info.end; + backfill_info.reset(next); + backfill_info.end = hobject_t::get_max(); + update_range(&backfill_info, handle); + backfill_info.trim(); + } + + dout(20) << " my backfill interval " << backfill_info << dendl; + + bool sent_scan = false; + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + + dout(20) << " peer shard " << bt << " backfill " << pbi << dendl; + if (pbi.begin <= backfill_info.begin && + !pbi.extends_to_end() && pbi.empty()) { + dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl; + epoch_t e = get_osdmap_epoch(); + MOSDPGScan *m = new MOSDPGScan( + MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset, + spg_t(info.pgid.pgid, bt.shard), + pbi.end, hobject_t()); + osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch()); + ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end()); + waiting_on_backfill.insert(bt); + sent_scan = true; + } + } + + // Count simultaneous scans as a single op and let those complete + if (sent_scan) { + ops++; + start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end + break; + } + + if (backfill_info.empty() && all_peer_done()) { + dout(10) << " reached end for both local and all peers" << dendl; + break; + } + + // Get object within set of peers to operate on and + // the set of targets for which that object applies. + hobject_t check = earliest_peer_backfill(); + + if (check < backfill_info.begin) { + + set<pg_shard_t> check_targets; + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + if (pbi.begin == check) + check_targets.insert(bt); + } + ceph_assert(!check_targets.empty()); + + dout(20) << " BACKFILL removing " << check + << " from peers " << check_targets << dendl; + for (set<pg_shard_t>::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + ceph_assert(pbi.begin == check); + + to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt)); + pbi.pop_front(); + } + + last_backfill_started = check; + + // Don't increment ops here because deletions + // are cheap and not replied to unlike real recovery_ops, + // and we can't increment ops without requeueing ourself + // for recovery. 
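+ // The deletions queued in to_remove above are sent as MOSDPGBackfillRemove
+ // messages after this loop finishes.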
+ } else { + eversion_t& obj_v = backfill_info.objects.begin()->second; + + vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs; + for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + // Find all check peers that have the wrong version + if (check == backfill_info.begin && check == pbi.begin) { + if (pbi.objects.begin()->second != obj_v) { + need_ver_targs.push_back(bt); + } else { + keep_ver_targs.push_back(bt); + } + } else { + pg_info_t& pinfo = peer_info[bt]; + + // Only include peers that we've caught up to their backfill line + // otherwise, they only appear to be missing this object + // because their pbi.begin > backfill_info.begin. + if (backfill_info.begin > pinfo.last_backfill) + missing_targs.push_back(bt); + else + skip_targs.push_back(bt); + } + } + + if (!keep_ver_targs.empty()) { + // These peers have version obj_v + dout(20) << " BACKFILL keeping " << check + << " with ver " << obj_v + << " on peers " << keep_ver_targs << dendl; + //assert(!waiting_for_degraded_object.count(check)); + } + if (!need_ver_targs.empty() || !missing_targs.empty()) { + ObjectContextRef obc = get_object_context(backfill_info.begin, false); + ceph_assert(obc); + if (obc->get_recovery_read()) { + if (!need_ver_targs.empty()) { + dout(20) << " BACKFILL replacing " << check + << " with ver " << obj_v + << " to peers " << need_ver_targs << dendl; + } + if (!missing_targs.empty()) { + dout(20) << " BACKFILL pushing " << backfill_info.begin + << " with ver " << obj_v + << " to peers " << missing_targs << dendl; + } + vector<pg_shard_t> all_push = need_ver_targs; + all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end()); + + handle.reset_tp_timeout(); + int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h); + if (r < 0) { + *work_started = true; + dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl; + break; + } + ops++; + } else { + *work_started = true; + dout(20) << "backfill blocking on " << backfill_info.begin + << "; could not get rw_manager lock" << dendl; + break; + } + } + dout(20) << "need_ver_targs=" << need_ver_targs + << " keep_ver_targs=" << keep_ver_targs << dendl; + dout(20) << "backfill_targets=" << backfill_targets + << " missing_targs=" << missing_targs + << " skip_targs=" << skip_targs << dendl; + + last_backfill_started = backfill_info.begin; + add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes? 
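+ // Advance the local interval and the intervals of the peers that already
+ // had this object at some version (need_ver_targs and keep_ver_targs).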
+ backfill_info.pop_front(); + vector<pg_shard_t> check_targets = need_ver_targs; + check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end()); + for (vector<pg_shard_t>::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + pbi.pop_front(); + } + } + } + + hobject_t backfill_pos = + std::min(backfill_info.begin, earliest_peer_backfill()); + + for (set<hobject_t>::iterator i = add_to_stat.begin(); + i != add_to_stat.end(); + ++i) { + ObjectContextRef obc = get_object_context(*i, false); + ceph_assert(obc); + pg_stat_t stat; + add_object_context_to_pg_stat(obc, &stat); + pending_backfill_updates[*i] = stat; + } + map<pg_shard_t,MOSDPGBackfillRemove*> reqs; + for (unsigned i = 0; i < to_remove.size(); ++i) { + handle.reset_tp_timeout(); + const hobject_t& oid = to_remove[i].get<0>(); + eversion_t v = to_remove[i].get<1>(); + pg_shard_t peer = to_remove[i].get<2>(); + MOSDPGBackfillRemove *m; + auto it = reqs.find(peer); + if (it != reqs.end()) { + m = it->second; + } else { + m = reqs[peer] = new MOSDPGBackfillRemove( + spg_t(info.pgid.pgid, peer.shard), + get_osdmap_epoch()); + } + m->ls.push_back(make_pair(oid, v)); + + if (oid <= last_backfill_started) + pending_backfill_updates[oid]; // add empty stat! + } + for (auto p : reqs) { + osd->send_message_osd_cluster(p.first.osd, p.second, + get_osdmap_epoch()); + } + + pgbackend->run_recovery_op(h, get_recovery_op_priority()); + + dout(5) << "backfill_pos is " << backfill_pos << dendl; + for (set<hobject_t>::iterator i = backfills_in_flight.begin(); + i != backfills_in_flight.end(); + ++i) { + dout(20) << *i << " is still in flight" << dendl; + } + + hobject_t next_backfill_to_complete = backfills_in_flight.empty() ? + backfill_pos : *(backfills_in_flight.begin()); + hobject_t new_last_backfill = earliest_backfill(); + dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl; + for (map<hobject_t, pg_stat_t>::iterator i = + pending_backfill_updates.begin(); + i != pending_backfill_updates.end() && + i->first < next_backfill_to_complete; + pending_backfill_updates.erase(i++)) { + dout(20) << " pending_backfill_update " << i->first << dendl; + ceph_assert(i->first > new_last_backfill); + for (set<pg_shard_t>::iterator j = backfill_targets.begin(); + j != backfill_targets.end(); + ++j) { + pg_shard_t bt = *j; + pg_info_t& pinfo = peer_info[bt]; + //Add stats to all peers that were missing object + if (i->first > pinfo.last_backfill) + pinfo.stats.add(i->second); + } + new_last_backfill = i->first; + } + dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl; + + ceph_assert(!pending_backfill_updates.empty() || + new_last_backfill == last_backfill_started); + if (pending_backfill_updates.empty() && + backfill_pos.is_max()) { + ceph_assert(backfills_in_flight.empty()); + new_last_backfill = backfill_pos; + last_backfill_started = backfill_pos; + } + dout(10) << "final new_last_backfill at " << new_last_backfill << dendl; + + // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to + // all the backfill targets. Otherwise, we will move last_backfill up on + // those targets need it and send OP_BACKFILL_PROGRESS to them. 
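+ // Each message below carries the peer's updated last_backfill and stats so
+ // the target can record its backfill progress.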
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + pg_info_t& pinfo = peer_info[bt]; + + if (new_last_backfill > pinfo.last_backfill) { + pinfo.set_last_backfill(new_last_backfill); + epoch_t e = get_osdmap_epoch(); + MOSDPGBackfill *m = NULL; + if (pinfo.last_backfill.is_max()) { + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH, + e, + last_peering_reset, + spg_t(info.pgid.pgid, bt.shard)); + // Use default priority here, must match sub_op priority + /* pinfo.stats might be wrong if we did log-based recovery on the + * backfilled portion in addition to continuing backfill. + */ + pinfo.stats = info.stats; + start_recovery_op(hobject_t::get_max()); + } else { + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_PROGRESS, + e, + last_peering_reset, + spg_t(info.pgid.pgid, bt.shard)); + // Use default priority here, must match sub_op priority + } + m->last_backfill = pinfo.last_backfill; + m->stats = pinfo.stats; + osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch()); + dout(10) << " peer " << bt + << " num_objects now " << pinfo.stats.stats.sum.num_objects + << " / " << info.stats.stats.sum.num_objects << dendl; + } + } + + if (ops) + *work_started = true; + return ops; +} + +int PrimaryLogPG::prep_backfill_object_push( + hobject_t oid, eversion_t v, + ObjectContextRef obc, + vector<pg_shard_t> peers, + PGBackend::RecoveryHandle *h) +{ + dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl; + ceph_assert(!peers.empty()); + + backfills_in_flight.insert(oid); + for (unsigned int i = 0 ; i < peers.size(); ++i) { + map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]); + ceph_assert(bpm != peer_missing.end()); + bpm->second.add(oid, eversion_t(), eversion_t(), false); + } + + ceph_assert(!recovering.count(oid)); + + start_recovery_op(oid); + recovering.insert(make_pair(oid, obc)); + + // We need to take the read_lock here in order to flush in-progress writes + int r = pgbackend->recover_object( + oid, + v, + ObjectContextRef(), + obc, + h); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl; + primary_failed(oid); + primary_error(oid, v); + backfills_in_flight.erase(oid); + missing_loc.add_missing(oid, v, eversion_t()); + } + return r; +} + +void PrimaryLogPG::update_range( + BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; + + if (bi->version < info.log_tail) { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + bi->version = info.last_update; + scan_range(local_min, local_max, bi, handle); + } + + if (bi->version >= projected_last_update) { + dout(10) << __func__<< ": bi is current " << dendl; + ceph_assert(bi->version == projected_last_update); + } else if (bi->version >= info.log_tail) { + if (pg_log.get_log().empty() && projected_log.empty()) { + /* Because we don't move log_tail on split, the log might be + * empty even if log_tail != last_update. However, the only + * way to get here with an empty log is if log_tail is actually + * eversion_t(), because otherwise the entry which changed + * last_update since the last scan would have to be present. 
+ */ + ceph_assert(bi->version == eversion_t()); + return; + } + + dout(10) << __func__<< ": bi is old, (" << bi->version + << ") can be updated with log to projected_last_update " + << projected_last_update << dendl; + + auto func = [&](const pg_log_entry_t &e) { + dout(10) << __func__ << ": updating from version " << e.version + << dendl; + const hobject_t &soid = e.soid; + if (soid >= bi->begin && + soid < bi->end) { + if (e.is_update()) { + dout(10) << __func__ << ": " << e.soid << " updated to version " + << e.version << dendl; + bi->objects.erase(e.soid); + bi->objects.insert( + make_pair( + e.soid, + e.version)); + } else if (e.is_delete()) { + dout(10) << __func__ << ": " << e.soid << " removed" << dendl; + bi->objects.erase(e.soid); + } + } + }; + dout(10) << "scanning pg log first" << dendl; + pg_log.get_log().scan_log_after(bi->version, func); + dout(10) << "scanning projected log" << dendl; + projected_log.scan_log_after(bi->version, func); + bi->version = projected_last_update; + } else { + ceph_abort_msg("scan_range should have raised bi->version past log_tail"); + } +} + +void PrimaryLogPG::scan_range( + int min, int max, BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + ceph_assert(is_locked()); + dout(10) << "scan_range from " << bi->begin << dendl; + bi->clear_objects(); + + vector<hobject_t> ls; + ls.reserve(max); + int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end); + ceph_assert(r >= 0); + dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; + dout(20) << ls << dendl; + + for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) { + handle.reset_tp_timeout(); + ObjectContextRef obc; + if (is_primary()) + obc = object_contexts.lookup(*p); + if (obc) { + if (!obc->obs.exists) { + /* If the object does not exist here, it must have been removed + * between the collection_list_partial and here. This can happen + * for the first item in the range, which is usually last_backfill. + */ + continue; + } + bi->objects[*p] = obc->obs.oi.version; + dout(20) << " " << *p << " " << obc->obs.oi.version << dendl; + } else { + bufferlist bl; + int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); + /* If the object does not exist here, it must have been removed + * between the collection_list_partial and here. This can happen + * for the first item in the range, which is usually last_backfill. + */ + if (r == -ENOENT) + continue; + + ceph_assert(r >= 0); + object_info_t oi(bl); + bi->objects[*p] = oi.version; + dout(20) << " " << *p << " " << oi.version << dendl; + } + } +} + + +/** check_local + * + * verifies that stray objects have been deleted + */ +void PrimaryLogPG::check_local() +{ + dout(10) << __func__ << dendl; + + ceph_assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help! + + if (!cct->_conf->osd_debug_verify_stray_on_activate) + return; + + // just scan the log. 
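+ // Walk the log newest to oldest; 'did' ensures that only the most recent
+ // entry for each object is considered.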
+ set<hobject_t> did; + for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin(); + p != pg_log.get_log().log.rend(); + ++p) { + if (did.count(p->soid)) + continue; + did.insert(p->soid); + + if (p->is_delete() && !is_missing_object(p->soid)) { + dout(10) << " checking " << p->soid + << " at " << p->version << dendl; + struct stat st; + int r = osd->store->stat( + ch, + ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard), + &st); + if (r != -ENOENT) { + derr << __func__ << " " << p->soid << " exists, but should have been " + << "deleted" << dendl; + ceph_abort_msg("erroneously present object"); + } + } else { + // ignore old(+missing) objects + } + } +} + + + +// =========================== +// hit sets + +hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp) +{ + ostringstream ss; + ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp; + hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", + info.pgid.ps(), info.pgid.pool(), + cct->_conf->osd_hit_set_namespace); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt) +{ + ostringstream ss; + ss << "hit_set_" << info.pgid.pgid << "_archive_"; + if (using_gmt) { + start.gmtime(ss) << "_"; + end.gmtime(ss); + } else { + start.localtime(ss) << "_"; + end.localtime(ss); + } + hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", + info.pgid.ps(), info.pgid.pool(), + cct->_conf->osd_hit_set_namespace); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +void PrimaryLogPG::hit_set_clear() +{ + dout(20) << __func__ << dendl; + hit_set.reset(); + hit_set_start_stamp = utime_t(); +} + +void PrimaryLogPG::hit_set_setup() +{ + if (!is_active() || + !is_primary()) { + hit_set_clear(); + return; + } + + if (is_active() && is_primary() && + (!pool.info.hit_set_count || + !pool.info.hit_set_period || + pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) { + hit_set_clear(); + + // only primary is allowed to remove all the hit set objects + hit_set_remove_all(); + return; + } + + // FIXME: discard any previous data for now + hit_set_create(); + + // include any writes we know about from the pg log. this doesn't + // capture reads, but it is better than nothing! 
+ hit_set_apply_log(); +} + +void PrimaryLogPG::hit_set_remove_all() +{ + // If any archives are degraded we skip this + for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + // Once we hit a degraded object just skip + if (is_degraded_or_backfilling_object(aoid)) + return; + if (write_blocked_by_scrub(aoid)) + return; + } + + if (!info.hit_set.history.empty()) { + list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin(); + ceph_assert(p != info.hit_set.history.rend()); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + ceph_assert(!is_degraded_or_backfilling_object(oid)); + ObjectContextRef obc = get_object_context(oid, false); + ceph_assert(obc); + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->at_version = get_next_version(); + ctx->updated_hset_history = info.hit_set; + utime_t now = ceph_clock_now(); + ctx->mtime = now; + hit_set_trim(ctx, 0); + simple_opc_submit(std::move(ctx)); + } + + info.hit_set = pg_hit_set_history_t(); + if (agent_state) { + agent_state->discard_hit_sets(); + } +} + +void PrimaryLogPG::hit_set_create() +{ + utime_t now = ceph_clock_now(); + // make a copy of the params to modify + HitSet::Params params(pool.info.hit_set_params); + + dout(20) << __func__ << " " << params << dendl; + if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) { + BloomHitSet::Params *p = + static_cast<BloomHitSet::Params*>(params.impl.get()); + + // convert false positive rate so it holds up across the full period + p->set_fpp(p->get_fpp() / pool.info.hit_set_count); + if (p->get_fpp() <= 0.0) + p->set_fpp(.01); // fpp cannot be zero! + + // if we don't have specified size, estimate target size based on the + // previous bin! + if (p->target_size == 0 && hit_set) { + utime_t dur = now - hit_set_start_stamp; + unsigned unique = hit_set->approx_unique_insert_count(); + dout(20) << __func__ << " previous set had approx " << unique + << " unique items over " << dur << " seconds" << dendl; + p->target_size = (double)unique * (double)pool.info.hit_set_period + / (double)dur; + } + if (p->target_size < + static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size)) + p->target_size = cct->_conf->osd_hit_set_min_size; + + if (p->target_size + > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size)) + p->target_size = cct->_conf->osd_hit_set_max_size; + + p->seed = now.sec(); + + dout(10) << __func__ << " target_size " << p->target_size + << " fpp " << p->get_fpp() << dendl; + } + hit_set.reset(new HitSet(params)); + hit_set_start_stamp = now; +} + +/** + * apply log entries to set + * + * this would only happen after peering, to at least capture writes + * during an interval that was potentially lost. + */ +bool PrimaryLogPG::hit_set_apply_log() +{ + if (!hit_set) + return false; + + eversion_t to = info.last_update; + eversion_t from = info.hit_set.current_last_update; + if (to <= from) { + dout(20) << __func__ << " no update" << dendl; + return false; + } + + dout(20) << __func__ << " " << to << " .. 
" << info.last_update << dendl; + list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin(); + while (p != pg_log.get_log().log.rend() && p->version > to) + ++p; + while (p != pg_log.get_log().log.rend() && p->version > from) { + hit_set->insert(p->soid); + ++p; + } + + return true; +} + +void PrimaryLogPG::hit_set_persist() +{ + dout(10) << __func__ << dendl; + bufferlist bl; + unsigned max = pool.info.hit_set_count; + + utime_t now = ceph_clock_now(); + hobject_t oid; + + // If any archives are degraded we skip this persist request + // account for the additional entry being added below + for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + // Once we hit a degraded object just skip further trim + if (is_degraded_or_backfilling_object(aoid)) + return; + if (write_blocked_by_scrub(aoid)) + return; + } + + // If backfill is in progress and we could possibly overlap with the + // hit_set_* objects, back off. Since these all have + // hobject_t::hash set to pgid.ps(), and those sort first, we can + // look just at that. This is necessary because our transactions + // may include a modify of the new hit_set *and* a delete of the + // old one, and this may span the backfill boundary. + for (set<pg_shard_t>::iterator p = backfill_targets.begin(); + p != backfill_targets.end(); + ++p) { + ceph_assert(peer_info.count(*p)); + const pg_info_t& pi = peer_info[*p]; + if (pi.last_backfill == hobject_t() || + pi.last_backfill.get_hash() == info.pgid.ps()) { + dout(10) << __func__ << " backfill target osd." << *p + << " last_backfill has not progressed past pgid ps" + << dendl; + return; + } + } + + + pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset); + new_hset.begin = hit_set_start_stamp; + new_hset.end = now; + oid = get_hit_set_archive_object( + new_hset.begin, + new_hset.end, + new_hset.using_gmt); + + // If the current object is degraded we skip this persist request + if (write_blocked_by_scrub(oid)) + return; + + hit_set->seal(); + encode(*hit_set, bl); + dout(20) << __func__ << " archive " << oid << dendl; + + if (agent_state) { + agent_state->add_hit_set(new_hset.begin, hit_set); + uint32_t size = agent_state->hit_set_map.size(); + if (size >= pool.info.hit_set_count) { + size = pool.info.hit_set_count > 0 ? 
pool.info.hit_set_count - 1: 0; + } + hit_set_in_memory_trim(size); + } + + ObjectContextRef obc = get_object_context(oid, true); + OpContextUPtr ctx = simple_opc_create(obc); + + ctx->at_version = get_next_version(); + ctx->updated_hset_history = info.hit_set; + pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history); + + updated_hit_set_hist.current_last_update = info.last_update; + new_hset.version = ctx->at_version; + + updated_hit_set_hist.history.push_back(new_hset); + hit_set_create(); + + // fabricate an object_info_t and SnapSet + obc->obs.oi.version = ctx->at_version; + obc->obs.oi.mtime = now; + obc->obs.oi.size = bl.length(); + obc->obs.exists = true; + obc->obs.oi.set_data_digest(bl.crc32c(-1)); + + ctx->new_obs = obc->obs; + + ctx->new_snapset = obc->ssc->snapset; + + ctx->delta_stats.num_objects++; + ctx->delta_stats.num_objects_hit_set_archive++; + + ctx->delta_stats.num_bytes += bl.length(); + ctx->delta_stats.num_bytes_hit_set_archive += bl.length(); + + bufferlist bss; + encode(ctx->new_snapset, bss); + bufferlist boi(sizeof(ctx->new_obs.oi)); + encode(ctx->new_obs.oi, boi, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + ctx->op_t->create(oid); + if (bl.length()) { + ctx->op_t->write(oid, 0, bl.length(), bl, 0); + } + map <string, bufferlist> attrs; + attrs[OI_ATTR].claim(boi); + attrs[SS_ATTR].claim(bss); + setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs); + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + oid, + ctx->at_version, + eversion_t(), + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + + hit_set_trim(ctx, max); + + simple_opc_submit(std::move(ctx)); +} + +void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max) +{ + ceph_assert(ctx->updated_hset_history); + pg_hit_set_history_t &updated_hit_set_hist = + *(ctx->updated_hset_history); + for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) { + list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin(); + ceph_assert(p != updated_hit_set_hist.history.end()); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + ceph_assert(!is_degraded_or_backfilling_object(oid)); + + dout(20) << __func__ << " removing " << oid << dendl; + ++ctx->at_version.version; + ctx->log.push_back( + pg_log_entry_t(pg_log_entry_t::DELETE, + oid, + ctx->at_version, + p->version, + 0, + osd_reqid_t(), + ctx->mtime, + 0)); + + ctx->op_t->remove(oid); + updated_hit_set_hist.history.pop_front(); + + ObjectContextRef obc = get_object_context(oid, false); + ceph_assert(obc); + --ctx->delta_stats.num_objects; + --ctx->delta_stats.num_objects_hit_set_archive; + ctx->delta_stats.num_bytes -= obc->obs.oi.size; + ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size; + } +} + +void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory) +{ + while (agent_state->hit_set_map.size() > max_in_memory) { + agent_state->remove_oldest_hit_set(); + } +} + + +// ======================================= +// cache agent + +void PrimaryLogPG::agent_setup() +{ + ceph_assert(is_locked()); + if (!is_active() || + !is_primary() || + state_test(PG_STATE_PREMERGE) || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || + pool.info.tier_of < 0 || + !get_osdmap()->have_pg_pool(pool.info.tier_of)) { + agent_clear(); + return; + } + if (!agent_state) { + agent_state.reset(new TierAgentState); + + // choose random starting position + agent_state->position = hobject_t(); + agent_state->position.pool = info.pgid.pool(); + 
agent_state->position.set_hash(pool.info.get_random_pg_position( + info.pgid.pgid, + rand())); + agent_state->start = agent_state->position; + + dout(10) << __func__ << " allocated new state, position " + << agent_state->position << dendl; + } else { + dout(10) << __func__ << " keeping existing state" << dendl; + } + + if (info.stats.stats_invalid) { + osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate"; + } + + agent_choose_mode(); +} + +void PrimaryLogPG::agent_clear() +{ + agent_stop(); + agent_state.reset(NULL); +} + +// Return false if no objects operated on since start of object hash space +bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota) +{ + lock(); + if (!agent_state) { + dout(10) << __func__ << " no agent state, stopping" << dendl; + unlock(); + return true; + } + + ceph_assert(!deleting); + + if (agent_state->is_idle()) { + dout(10) << __func__ << " idle, stopping" << dendl; + unlock(); + return true; + } + + osd->logger->inc(l_osd_agent_wake); + + dout(10) << __func__ + << " max " << start_max + << ", flush " << agent_state->get_flush_mode_name() + << ", evict " << agent_state->get_evict_mode_name() + << ", pos " << agent_state->position + << dendl; + ceph_assert(is_primary()); + ceph_assert(is_active()); + + agent_load_hit_sets(); + + const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of); + ceph_assert(base_pool); + + int ls_min = 1; + int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size; + + // list some objects. this conveniently lists clones (oldest to + // newest) before heads... the same order we want to flush in. + // + // NOTE: do not flush the Sequencer. we will assume that the + // listing we get back is imprecise. + vector<hobject_t> ls; + hobject_t next; + int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max, + &ls, &next); + ceph_assert(r >= 0); + dout(20) << __func__ << " got " << ls.size() << " objects" << dendl; + int started = 0; + for (vector<hobject_t>::iterator p = ls.begin(); + p != ls.end(); + ++p) { + if (p->nspace == cct->_conf->osd_hit_set_namespace) { + dout(20) << __func__ << " skip (hit set) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (is_degraded_or_backfilling_object(*p)) { + dout(20) << __func__ << " skip (degraded) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (is_missing_object(p->get_head())) { + dout(20) << __func__ << " skip (missing head) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + ObjectContextRef obc = get_object_context(*p, false, NULL); + if (!obc) { + // we didn't flush; we may miss something here. 
+ dout(20) << __func__ << " skip (no obc) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (!obc->obs.exists) { + dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (range_intersects_scrub(obc->obs.oi.soid, + obc->obs.oi.soid.get_head())) { + dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (obc->is_blocked()) { + dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (obc->is_request_pending()) { + dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + + // be careful flushing omap to an EC pool. + if (!base_pool->supports_omap() && + obc->obs.oi.is_omap()) { + dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + + if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && + agent_maybe_evict(obc, false)) + ++started; + else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE && + agent_flush_quota > 0 && agent_maybe_flush(obc)) { + ++started; + --agent_flush_quota; + } + if (started >= start_max) { + // If finishing early, set "next" to the next object + if (++p != ls.end()) + next = *p; + break; + } + } + + if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) { + dout(20) << __func__ << " resetting atime and temp histograms" << dendl; + agent_state->hist_age = 0; + agent_state->temp_hist.decay(); + } + + // Total objects operated on so far + int total_started = agent_state->started + started; + bool need_delay = false; + + dout(20) << __func__ << " start pos " << agent_state->position + << " next start pos " << next + << " started " << total_started << dendl; + + // See if we've made a full pass over the object hash space + // This might check at most ls_max objects a second time to notice that + // we've checked every objects at least once. 
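+ // A wrap is detected when the listing crosses agent_state->start; if no
+ // work was started during the whole pass, the agent is delayed.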
+ if (agent_state->position < agent_state->start && + next >= agent_state->start) { + dout(20) << __func__ << " wrap around " << agent_state->start << dendl; + if (total_started == 0) + need_delay = true; + else + total_started = 0; + agent_state->start = next; + } + agent_state->started = total_started; + + // See if we are starting from beginning + if (next.is_max()) + agent_state->position = hobject_t(); + else + agent_state->position = next; + + // Discard old in memory HitSets + hit_set_in_memory_trim(pool.info.hit_set_count); + + if (need_delay) { + ceph_assert(agent_state->delaying == false); + agent_delay(); + unlock(); + return false; + } + agent_choose_mode(); + unlock(); + return true; +} + +void PrimaryLogPG::agent_load_hit_sets() +{ + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) { + return; + } + + if (agent_state->hit_set_map.size() < info.hit_set.history.size()) { + dout(10) << __func__ << dendl; + for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); ++p) { + if (agent_state->hit_set_map.count(p->begin.sec()) == 0) { + dout(10) << __func__ << " loading " << p->begin << "-" + << p->end << dendl; + if (!pool.info.is_replicated()) { + // FIXME: EC not supported here yet + derr << __func__ << " on non-replicated pool" << dendl; + break; + } + + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + if (is_unreadable_object(oid)) { + dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl; + break; + } + + ObjectContextRef obc = get_object_context(oid, false); + if (!obc) { + derr << __func__ << ": could not load hitset " << oid << dendl; + break; + } + + bufferlist bl; + { + int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl); + ceph_assert(r >= 0); + } + HitSetRef hs(new HitSet); + bufferlist::const_iterator pbl = bl.begin(); + decode(*hs, pbl); + agent_state->add_hit_set(p->begin.sec(), hs); + } + } + } +} + +bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc) +{ + if (!obc->obs.oi.is_dirty()) { + dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + if (obc->obs.oi.is_cache_pinned()) { + dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + utime_t now = ceph_clock_now(); + utime_t ob_local_mtime; + if (obc->obs.oi.local_mtime != utime_t()) { + ob_local_mtime = obc->obs.oi.local_mtime; + } else { + ob_local_mtime = obc->obs.oi.mtime; + } + bool evict_mode_full = + (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL); + if (!evict_mode_full && + obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay + (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) { + dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + if (osd->agent_is_active_oid(obc->obs.oi.soid)) { + dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + dout(10) << __func__ << " flushing " << obc->obs.oi << dendl; + + // FIXME: flush anything dirty, regardless of what distribution of + // ages we expect. 
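+ // Register the in-flight flush with the OSD so the same object is not
+ // flushed twice; on_flush clears that state when the flush completes or
+ // fails to start.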
+
+ hobject_t oid = obc->obs.oi.soid;
+ osd->agent_start_op(oid);
+ // no need to capture a pg ref, can't outlive fop or ctx
+ std::function<void()> on_flush = [this, oid]() {
+ osd->agent_finish_op(oid);
+ };
+
+ int result = start_flush(
+ OpRequestRef(), obc, false, NULL,
+ on_flush);
+ if (result != -EINPROGRESS) {
+ on_flush();
+ dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
+ << " with " << result << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ osd->logger->inc(l_osd_agent_flush);
+ return true;
+}
+
+bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ if (!after_flush && obc->obs.oi.is_dirty()) {
+ dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
+ return false;
+ }
+ // This is already checked by agent_work() which passes after_flush = false
+ if (after_flush && range_intersects_scrub(soid, soid.get_head())) {
+ dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (!obc->obs.oi.watchers.empty()) {
+ dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (obc->is_blocked()) {
+ dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (obc->obs.oi.is_cache_pinned()) {
+ dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
+ return false;
+ }
+
+ if (soid.snap == CEPH_NOSNAP) {
+ int result = _verify_no_head_clones(soid, obc->ssc->snapset);
+ if (result < 0) {
+ dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
+ return false;
+ }
+ }
+
+ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
+ // is this object older than cache_min_evict_age?
+ utime_t now = ceph_clock_now();
+ utime_t ob_local_mtime;
+ if (obc->obs.oi.local_mtime != utime_t()) {
+ ob_local_mtime = obc->obs.oi.local_mtime;
+ } else {
+ ob_local_mtime = obc->obs.oi.mtime;
+ }
+ if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
+ dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+ // is this object old and/or cold enough? 
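+ // Estimate the object's temperature from the in-memory hit sets and compare
+ // its position in the temperature histogram against evict_effort before
+ // deciding to evict.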
+ int temp = 0; + uint64_t temp_upper = 0, temp_lower = 0; + if (hit_set) + agent_estimate_temp(soid, &temp); + agent_state->temp_hist.add(temp); + agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper); + + dout(20) << __func__ + << " temp " << temp + << " pos " << temp_lower << "-" << temp_upper + << ", evict_effort " << agent_state->evict_effort + << dendl; + dout(30) << "agent_state:\n"; + Formatter *f = Formatter::create(""); + f->open_object_section("agent_state"); + agent_state->dump(f); + f->close_section(); + f->flush(*_dout); + delete f; + *_dout << dendl; + + if (1000000 - temp_upper >= agent_state->evict_effort) + return false; + } + + dout(10) << __func__ << " evicting " << obc->obs.oi << dendl; + OpContextUPtr ctx = simple_opc_create(obc); + + auto null_op_req = OpRequestRef(); + if (!ctx->lock_manager.get_lock_type( + ObjectContext::RWState::RWWRITE, + obc->obs.oi.soid, + obc, + null_op_req)) { + close_op_ctx(ctx.release()); + dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl; + return false; + } + + osd->agent_start_evict_op(); + ctx->register_on_finish( + [this]() { + osd->agent_finish_evict_op(); + }); + + ctx->at_version = get_next_version(); + ceph_assert(ctx->new_obs.exists); + int r = _delete_oid(ctx.get(), true, false); + if (obc->obs.oi.is_omap()) + ctx->delta_stats.num_objects_omap--; + ctx->delta_stats.num_evict++; + ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10); + if (obc->obs.oi.is_dirty()) + --ctx->delta_stats.num_objects_dirty; + ceph_assert(r == 0); + finish_ctx(ctx.get(), pg_log_entry_t::DELETE); + simple_opc_submit(std::move(ctx)); + osd->logger->inc(l_osd_tier_evict); + osd->logger->inc(l_osd_agent_evict); + return true; +} + +void PrimaryLogPG::agent_stop() +{ + dout(20) << __func__ << dendl; + if (agent_state && !agent_state->is_idle()) { + agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE; + agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE; + osd->agent_disable_pg(this, agent_state->evict_effort); + } +} + +void PrimaryLogPG::agent_delay() +{ + dout(20) << __func__ << dendl; + if (agent_state && !agent_state->is_idle()) { + ceph_assert(agent_state->delaying == false); + agent_state->delaying = true; + osd->agent_disable_pg(this, agent_state->evict_effort); + } +} + +void PrimaryLogPG::agent_choose_mode_restart() +{ + dout(20) << __func__ << dendl; + lock(); + if (agent_state && agent_state->delaying) { + agent_state->delaying = false; + agent_choose_mode(true); + } + unlock(); +} + +bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op) +{ + bool requeued = false; + // Let delay play out + if (agent_state->delaying) { + dout(20) << __func__ << " " << this << " delaying, ignored" << dendl; + return requeued; + } + + TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE; + TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE; + unsigned evict_effort = 0; + + if (info.stats.stats_invalid) { + // idle; stats can't be trusted until we scrub. + dout(20) << __func__ << " stats invalid (post-split), idle" << dendl; + goto skip_calc; + } + + { + uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid); + ceph_assert(divisor > 0); + + // adjust (effective) user objects down based on the number + // of HitSet objects, which should not count toward our total since + // they cannot be flushed. 
+ uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive; + + // also exclude omap objects if ec backing pool + const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of); + ceph_assert(base_pool); + if (!base_pool->supports_omap()) + unflushable += info.stats.stats.sum.num_objects_omap; + + uint64_t num_user_objects = info.stats.stats.sum.num_objects; + if (num_user_objects > unflushable) + num_user_objects -= unflushable; + else + num_user_objects = 0; + + uint64_t num_user_bytes = info.stats.stats.sum.num_bytes; + uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive; + num_user_bytes -= unflushable_bytes; + uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects); + num_user_bytes += num_overhead_bytes; + + // also reduce the num_dirty by num_objects_omap + int64_t num_dirty = info.stats.stats.sum.num_objects_dirty; + if (!base_pool->supports_omap()) { + if (num_dirty > info.stats.stats.sum.num_objects_omap) + num_dirty -= info.stats.stats.sum.num_objects_omap; + else + num_dirty = 0; + } + + dout(10) << __func__ + << " flush_mode: " + << TierAgentState::get_flush_mode_name(agent_state->flush_mode) + << " evict_mode: " + << TierAgentState::get_evict_mode_name(agent_state->evict_mode) + << " num_objects: " << info.stats.stats.sum.num_objects + << " num_bytes: " << info.stats.stats.sum.num_bytes + << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty + << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap + << " num_dirty: " << num_dirty + << " num_user_objects: " << num_user_objects + << " num_user_bytes: " << num_user_bytes + << " num_overhead_bytes: " << num_overhead_bytes + << " pool.info.target_max_bytes: " << pool.info.target_max_bytes + << " pool.info.target_max_objects: " << pool.info.target_max_objects + << dendl; + + // get dirty, full ratios + uint64_t dirty_micro = 0; + uint64_t full_micro = 0; + if (pool.info.target_max_bytes && num_user_objects > 0) { + uint64_t avg_size = num_user_bytes / num_user_objects; + dirty_micro = + num_dirty * avg_size * 1000000 / + std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1); + full_micro = + num_user_objects * avg_size * 1000000 / + std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1); + } + if (pool.info.target_max_objects > 0) { + uint64_t dirty_objects_micro = + num_dirty * 1000000 / + std::max<uint64_t>(pool.info.target_max_objects / divisor, 1); + if (dirty_objects_micro > dirty_micro) + dirty_micro = dirty_objects_micro; + uint64_t full_objects_micro = + num_user_objects * 1000000 / + std::max<uint64_t>(pool.info.target_max_objects / divisor, 1); + if (full_objects_micro > full_micro) + full_micro = full_objects_micro; + } + dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0) + << " full " << ((float)full_micro / 1000000.0) + << dendl; + + // flush mode + uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro; + uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro; + uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop; + if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) { + flush_target += flush_slop; + flush_high_target += flush_slop; + } else { + flush_target -= std::min(flush_target, flush_slop); + flush_high_target -= std::min(flush_high_target, flush_slop); + } + + if (dirty_micro > flush_high_target) { + flush_mode = TierAgentState::FLUSH_MODE_HIGH; + } else if (dirty_micro > flush_target || (!flush_target && 
num_dirty > 0)) { + flush_mode = TierAgentState::FLUSH_MODE_LOW; + } + + // evict mode + uint64_t evict_target = pool.info.cache_target_full_ratio_micro; + uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop; + if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) + evict_target += evict_slop; + else + evict_target -= std::min(evict_target, evict_slop); + + if (full_micro > 1000000) { + // evict anything clean + evict_mode = TierAgentState::EVICT_MODE_FULL; + evict_effort = 1000000; + } else if (full_micro > evict_target) { + // set effort in [0..1] range based on where we are between + evict_mode = TierAgentState::EVICT_MODE_SOME; + uint64_t over = full_micro - evict_target; + uint64_t span = 1000000 - evict_target; + evict_effort = std::max(over * 1000000 / span, + uint64_t(1000000.0 * + cct->_conf->osd_agent_min_evict_effort)); + + // quantize effort to avoid too much reordering in the agent_queue. + uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000; + ceph_assert(inc > 0); + uint64_t was = evict_effort; + evict_effort -= evict_effort % inc; + if (evict_effort < inc) + evict_effort = inc; + ceph_assert(evict_effort >= inc && evict_effort <= 1000000); + dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl; + } + } + + skip_calc: + bool old_idle = agent_state->is_idle(); + if (flush_mode != agent_state->flush_mode) { + dout(5) << __func__ << " flush_mode " + << TierAgentState::get_flush_mode_name(agent_state->flush_mode) + << " -> " + << TierAgentState::get_flush_mode_name(flush_mode) + << dendl; + if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) { + osd->agent_inc_high_count(); + info.stats.stats.sum.num_flush_mode_high = 1; + } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) { + info.stats.stats.sum.num_flush_mode_low = 1; + } + if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) { + osd->agent_dec_high_count(); + info.stats.stats.sum.num_flush_mode_high = 0; + } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) { + info.stats.stats.sum.num_flush_mode_low = 0; + } + agent_state->flush_mode = flush_mode; + } + if (evict_mode != agent_state->evict_mode) { + dout(5) << __func__ << " evict_mode " + << TierAgentState::get_evict_mode_name(agent_state->evict_mode) + << " -> " + << TierAgentState::get_evict_mode_name(evict_mode) + << dendl; + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL && + is_active()) { + if (op) + requeue_op(op); + requeue_ops(waiting_for_flush); + requeue_ops(waiting_for_active); + requeue_ops(waiting_for_scrub); + requeue_ops(waiting_for_cache_not_full); + objects_blocked_on_cache_full.clear(); + requeued = true; + } + if (evict_mode == TierAgentState::EVICT_MODE_SOME) { + info.stats.stats.sum.num_evict_mode_some = 1; + } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) { + info.stats.stats.sum.num_evict_mode_full = 1; + } + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) { + info.stats.stats.sum.num_evict_mode_some = 0; + } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + info.stats.stats.sum.num_evict_mode_full = 0; + } + agent_state->evict_mode = evict_mode; + } + uint64_t old_effort = agent_state->evict_effort; + if (evict_effort != agent_state->evict_effort) { + dout(5) << __func__ << " evict_effort " + << ((float)agent_state->evict_effort / 1000000.0) + << " -> " + << ((float)evict_effort / 1000000.0) + << dendl; + agent_state->evict_effort = evict_effort; 
+ } + + // NOTE: we are using evict_effort as a proxy for *all* agent effort + // (including flush). This is probably fine (they should be + // correlated) but it is not precisely correct. + if (agent_state->is_idle()) { + if (!restart && !old_idle) { + osd->agent_disable_pg(this, old_effort); + } + } else { + if (restart || old_idle) { + osd->agent_enable_pg(this, agent_state->evict_effort); + } else if (old_effort != agent_state->evict_effort) { + osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort); + } + } + return requeued; +} + +void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp) +{ + ceph_assert(hit_set); + ceph_assert(temp); + *temp = 0; + if (hit_set->contains(oid)) + *temp = 1000000; + unsigned i = 0; + int last_n = pool.info.hit_set_search_last_n; + for (map<time_t,HitSetRef>::reverse_iterator p = + agent_state->hit_set_map.rbegin(); last_n > 0 && + p != agent_state->hit_set_map.rend(); ++p, ++i) { + if (p->second->contains(oid)) { + *temp += pool.info.get_grade(i); + --last_n; + } + } +} + +// Dup op detection + +bool PrimaryLogPG::already_complete(eversion_t v) +{ + dout(20) << __func__ << ": " << v << dendl; + for (xlist<RepGather*>::iterator i = repop_queue.begin(); + !i.end(); + ++i) { + dout(20) << __func__ << ": " << **i << dendl; + // skip copy from temp object ops + if ((*i)->v == eversion_t()) { + dout(20) << __func__ << ": " << **i + << " version is empty" << dendl; + continue; + } + if ((*i)->v > v) { + dout(20) << __func__ << ": " << **i + << " (*i)->v past v" << dendl; + break; + } + if (!(*i)->all_committed) { + dout(20) << __func__ << ": " << **i + << " not committed, returning false" + << dendl; + return false; + } + } + dout(20) << __func__ << ": returning true" << dendl; + return true; +} + +bool PrimaryLogPG::already_ack(eversion_t v) +{ + dout(20) << __func__ << ": " << v << dendl; + for (xlist<RepGather*>::iterator i = repop_queue.begin(); + !i.end(); + ++i) { + // skip copy from temp object ops + if ((*i)->v == eversion_t()) { + dout(20) << __func__ << ": " << **i + << " version is empty" << dendl; + continue; + } + if ((*i)->v > v) { + dout(20) << __func__ << ": " << **i + << " (*i)->v past v" << dendl; + break; + } + } + dout(20) << __func__ << ": returning true" << dendl; + return true; +} + + +// ========================================================================================== +// SCRUB + + +bool PrimaryLogPG::_range_available_for_scrub( + const hobject_t &begin, const hobject_t &end) +{ + pair<hobject_t, ObjectContextRef> next; + next.second = object_contexts.lookup(begin); + next.first = begin; + bool more = true; + while (more && next.first < end) { + if (next.second && next.second->is_blocked()) { + next.second->requeue_scrub_on_unblock = true; + dout(10) << __func__ << ": scrub delayed, " + << next.first << " is blocked" + << dendl; + return false; + } + more = object_contexts.get_next(next.first, &next); + } + return true; +} + +static bool doing_clones(const boost::optional<SnapSet> &snapset, + const vector<snapid_t>::reverse_iterator &curclone) { + return snapset && curclone != snapset.get().clones.rend(); +} + +void PrimaryLogPG::log_missing(unsigned missing, + const boost::optional<hobject_t> &head, + LogChannelRef clog, + const spg_t &pgid, + const char *func, + const char *mode, + bool allow_incomplete_clones) +{ + ceph_assert(head); + if (allow_incomplete_clones) { + dout(20) << func << " " << mode << " " << pgid << " " << head.get() + << " skipped " << missing << " clone(s) in cache tier" << 
dendl; + } else { + clog->info() << mode << " " << pgid << " " << head.get() + << " : " << missing << " missing clone(s)"; + } +} + +unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head, + const boost::optional<SnapSet> &snapset, + LogChannelRef clog, + const spg_t &pgid, + const char *mode, + bool allow_incomplete_clones, + boost::optional<snapid_t> target, + vector<snapid_t>::reverse_iterator *curclone, + inconsistent_snapset_wrapper &e) +{ + ceph_assert(head); + ceph_assert(snapset); + unsigned missing = 0; + + // NOTE: clones are in descending order, thus **curclone > target test here + hobject_t next_clone(head.get()); + while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) { + ++missing; + // it is okay to be missing one or more clones in a cache tier. + // skip higher-numbered clones in the list. + if (!allow_incomplete_clones) { + next_clone.snap = **curclone; + clog->error() << mode << " " << pgid << " " << head.get() + << " : expected clone " << next_clone << " " << missing + << " missing"; + ++scrubber.shallow_errors; + e.set_clone_missing(next_clone.snap); + } + // Clones are descending + ++(*curclone); + } + return missing; +} + +/* + * Validate consistency of the object info and snap sets. + * + * We are sort of comparing 2 lists. The main loop is on objmap.objects. But + * the comparison of the objects is against multiple snapset.clones. There are + * multiple clone lists and in between lists we expect head. + * + * Example + * + * objects expected + * ======= ======= + * obj1 snap 1 head, unexpected obj1 snap 1 + * obj2 head head, match + * [SnapSet clones 6 4 2 1] + * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7 + * obj2 snap 6 obj2 snap 6, match + * obj2 snap 4 obj2 snap 4, match + * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match + * [Snapset clones 3 1] + * obj3 snap 3 obj3 snap 3 match + * obj3 snap 1 obj3 snap 1 match + * obj4 head head, match + * [Snapset clones 4] + * EOL obj4 snap 4, (expected) + */ +void PrimaryLogPG::scrub_snapshot_metadata( + ScrubMap &scrubmap, + const map<hobject_t, + pair<boost::optional<uint32_t>, + boost::optional<uint32_t>>> &missing_digest) +{ + dout(10) << __func__ << dendl; + + bool repair = state_test(PG_STATE_REPAIR); + bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); + const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub")); + boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none + + // traverse in reverse order. + boost::optional<hobject_t> head; + boost::optional<SnapSet> snapset; // If initialized so will head (above) + vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized + unsigned missing = 0; + inconsistent_snapset_wrapper soid_error, head_error; + unsigned soid_error_count = 0; + + for (map<hobject_t,ScrubMap::object>::reverse_iterator + p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) { + const hobject_t& soid = p->first; + ceph_assert(!soid.is_snapdir()); + soid_error = inconsistent_snapset_wrapper{soid}; + object_stat_sum_t stat; + boost::optional<object_info_t> oi; + + stat.num_objects++; + + if (soid.nspace == cct->_conf->osd_hit_set_namespace) + stat.num_objects_hit_set_archive++; + + if (soid.is_snap()) { + // it's a clone + stat.num_object_clones++; + } + + // basic checks. 
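+ // Decode object_info_t from the OI_ATTR xattr; a missing or undecodable
+ // attr is recorded as a shallow error and leaves 'oi' unset for the checks
+ // below.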
+ if (p->second.attrs.count(OI_ATTR) == 0) { + oi = boost::none; + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : no '" << OI_ATTR << "' attr"; + ++scrubber.shallow_errors; + soid_error.set_info_missing(); + } else { + bufferlist bv; + bv.push_back(p->second.attrs[OI_ATTR]); + try { + oi = object_info_t(); // Initialize optional<> before decode into it + oi.get().decode(bv); + } catch (buffer::error& e) { + oi = boost::none; + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : can't decode '" << OI_ATTR << "' attr " << e.what(); + ++scrubber.shallow_errors; + soid_error.set_info_corrupted(); + soid_error.set_info_missing(); // Not available too + } + } + + if (oi) { + if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : on disk size (" << p->second.size + << ") does not match object info size (" + << oi->size << ") adjusted for ondisk to (" + << pgbackend->be_get_ondisk_size(oi->size) + << ")"; + soid_error.set_size_mismatch(); + ++scrubber.shallow_errors; + } + + dout(20) << mode << " " << soid << " " << oi.get() << dendl; + + // A clone num_bytes will be added later when we have snapset + if (!soid.is_snap()) { + stat.num_bytes += oi->size; + } + if (soid.nspace == cct->_conf->osd_hit_set_namespace) + stat.num_bytes_hit_set_archive += oi->size; + + if (oi->is_dirty()) + ++stat.num_objects_dirty; + if (oi->is_whiteout()) + ++stat.num_whiteouts; + if (oi->is_omap()) + ++stat.num_objects_omap; + if (oi->is_cache_pinned()) + ++stat.num_objects_pinned; + if (oi->has_manifest()) + ++stat.num_objects_manifest; + } + + // Check for any problems while processing clones + if (doing_clones(snapset, curclone)) { + boost::optional<snapid_t> target; + // Expecting an object with snap for current head + if (soid.has_snapset() || soid.get_head() != head->get_head()) { + + dout(10) << __func__ << " " << mode << " " << info.pgid << " new object " + << soid << " while processing " << head.get() << dendl; + + target = all_clones; + } else { + ceph_assert(soid.is_snap()); + target = soid.snap; + } + + // Log any clones we were expecting to be there up to target + // This will set missing, but will be a no-op if snap.soid == *curclone. + missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode, + pool.info.allow_incomplete_clones(), target, &curclone, + head_error); + } + bool expected; + // Check doing_clones() again in case we ran process_clones_to() + if (doing_clones(snapset, curclone)) { + // A head would have processed all clones above + // or all greater than *curclone. + ceph_assert(soid.is_snap() && *curclone <= soid.snap); + + // After processing above clone snap should match the expected curclone + expected = (*curclone == soid.snap); + } else { + // If we aren't doing clones any longer, then expecting head + expected = soid.has_snapset(); + } + if (!expected) { + // If we couldn't read the head's snapset, just ignore clones + if (head && !snapset) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : clone ignored due to missing snapset"; + } else { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : is an unexpected clone"; + } + ++scrubber.shallow_errors; + soid_error.set_headless(); + scrubber.store->add_snap_error(pool.id, soid_error); + ++soid_error_count; + if (head && soid.get_head() == head->get_head()) + head_error.set_clone(soid.snap); + continue; + } + + // new snapset? 
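+ // A head object (has_snapset()) starts a new pass: log anything still
+ // missing for the previous head, stash its accumulated errors, and reset
+ // the per-head state before decoding the new SnapSet from SS_ATTR.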
+ if (soid.has_snapset()) { + + if (missing) { + log_missing(missing, head, osd->clog, info.pgid, __func__, mode, + pool.info.allow_incomplete_clones()); + } + + // Save previous head error information + if (head && (head_error.errors || soid_error_count)) + scrubber.store->add_snap_error(pool.id, head_error); + // Set this as a new head object + head = soid; + missing = 0; + head_error = soid_error; + soid_error_count = 0; + + dout(20) << __func__ << " " << mode << " new head " << head << dendl; + + if (p->second.attrs.count(SS_ATTR) == 0) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : no '" << SS_ATTR << "' attr"; + ++scrubber.shallow_errors; + snapset = boost::none; + head_error.set_snapset_missing(); + } else { + bufferlist bl; + bl.push_back(p->second.attrs[SS_ATTR]); + auto blp = bl.cbegin(); + try { + snapset = SnapSet(); // Initialize optional<> before decoding into it + decode(snapset.get(), blp); + head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]); + } catch (buffer::error& e) { + snapset = boost::none; + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : can't decode '" << SS_ATTR << "' attr " << e.what(); + ++scrubber.shallow_errors; + head_error.set_snapset_corrupted(); + } + } + + if (snapset) { + // what will be next? + curclone = snapset->clones.rbegin(); + + if (!snapset->clones.empty()) { + dout(20) << " snapset " << snapset.get() << dendl; + if (snapset->seq == 0) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : snaps.seq not set"; + ++scrubber.shallow_errors; + head_error.set_snapset_error(); + } + } + } + } else { + ceph_assert(soid.is_snap()); + ceph_assert(head); + ceph_assert(snapset); + ceph_assert(soid.snap == *curclone); + + dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl; + + if (snapset->clone_size.count(soid.snap) == 0) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : is missing in clone_size"; + ++scrubber.shallow_errors; + soid_error.set_size_mismatch(); + } else { + if (oi && oi->size != snapset->clone_size[soid.snap]) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : size " << oi->size << " != clone_size " + << snapset->clone_size[*curclone]; + ++scrubber.shallow_errors; + soid_error.set_size_mismatch(); + } + + if (snapset->clone_overlap.count(soid.snap) == 0) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : is missing in clone_overlap"; + ++scrubber.shallow_errors; + soid_error.set_size_mismatch(); + } else { + // This checking is based on get_clone_bytes(). The first 2 asserts + // can't happen because we know we have a clone_size and + // a clone_overlap. Now we check that the interval_set won't + // cause the last assert. + uint64_t size = snapset->clone_size.find(soid.snap)->second; + const interval_set<uint64_t> &overlap = + snapset->clone_overlap.find(soid.snap)->second; + bool bad_interval_set = false; + for (interval_set<uint64_t>::const_iterator i = overlap.begin(); + i != overlap.end(); ++i) { + if (size < i.get_len()) { + bad_interval_set = true; + break; + } + size -= i.get_len(); + } + + if (bad_interval_set) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " : bad interval_set in clone_overlap"; + ++scrubber.shallow_errors; + soid_error.set_size_mismatch(); + } else { + stat.num_bytes += snapset->get_clone_bytes(soid.snap); + } + } + } + + // what's next? 
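+ // Matched the expected clone: advance curclone and record any per-clone
+ // errors found above before moving on to the next scrub map entry.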
+ ++curclone; + if (soid_error.errors) { + scrubber.store->add_snap_error(pool.id, soid_error); + ++soid_error_count; + } + } + + scrub_cstat.add(stat); + } + + if (doing_clones(snapset, curclone)) { + dout(10) << __func__ << " " << mode << " " << info.pgid + << " No more objects while processing " << head.get() << dendl; + + missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode, + pool.info.allow_incomplete_clones(), all_clones, &curclone, + head_error); + } + // There could be missing found by the test above or even + // before dropping out of the loop for the last head. + if (missing) { + log_missing(missing, head, osd->clog, info.pgid, __func__, + mode, pool.info.allow_incomplete_clones()); + } + if (head && (head_error.errors || soid_error_count)) + scrubber.store->add_snap_error(pool.id, head_error); + + for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) { + ceph_assert(!p->first.is_snapdir()); + dout(10) << __func__ << " recording digests for " << p->first << dendl; + ObjectContextRef obc = get_object_context(p->first, false); + if (!obc) { + osd->clog->error() << info.pgid << " " << mode + << " cannot get object context for object " + << p->first; + continue; + } else if (obc->obs.oi.soid != p->first) { + osd->clog->error() << info.pgid << " " << mode + << " " << p->first + << " : object has a valid oi attr with a mismatched name, " + << " obc->obs.oi.soid: " << obc->obs.oi.soid; + continue; + } + OpContextUPtr ctx = simple_opc_create(obc); + ctx->at_version = get_next_version(); + ctx->mtime = utime_t(); // do not update mtime + if (p->second.first) { + ctx->new_obs.oi.set_data_digest(*p->second.first); + } else { + ctx->new_obs.oi.clear_data_digest(); + } + if (p->second.second) { + ctx->new_obs.oi.set_omap_digest(*p->second.second); + } else { + ctx->new_obs.oi.clear_omap_digest(); + } + finish_ctx(ctx.get(), pg_log_entry_t::MODIFY); + + ctx->register_on_success( + [this]() { + dout(20) << "updating scrub digest" << dendl; + if (--scrubber.num_digest_updates_pending == 0) { + requeue_scrub(); + } + }); + + simple_opc_submit(std::move(ctx)); + ++scrubber.num_digest_updates_pending; + } + + dout(10) << __func__ << " (" << mode << ") finish" << dendl; +} + +void PrimaryLogPG::_scrub_clear_state() +{ + scrub_cstat = object_stat_collection_t(); +} + +void PrimaryLogPG::_scrub_finish() +{ + bool repair = state_test(PG_STATE_REPAIR); + bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); + const char *mode = (repair ? "repair": (deep_scrub ? 
"deep-scrub" : "scrub")); + + if (info.stats.stats_invalid) { + info.stats.stats = scrub_cstat; + info.stats.stats_invalid = false; + + if (agent_state) + agent_choose_mode(); + } + + dout(10) << mode << " got " + << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, " + << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, " + << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, " + << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, " + << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, " + << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, " + << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, " + << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, " + << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes." + << dendl; + + if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects || + scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones || + (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty && + !info.stats.dirty_stats_invalid) || + (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap && + !info.stats.omap_stats_invalid) || + (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned && + !info.stats.pin_stats_invalid) || + (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive && + !info.stats.hitset_stats_invalid) || + (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive && + !info.stats.hitset_bytes_stats_invalid) || + (scrub_cstat.sum.num_objects_manifest != info.stats.stats.sum.num_objects_manifest && + !info.stats.manifest_stats_invalid) || + scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts || + scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) { + osd->clog->error() << info.pgid << " " << mode + << " : stat mismatch, got " + << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, " + << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, " + << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, " + << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, " + << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, " + << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, " + << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, " + << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, " + << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, " + << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."; + ++scrubber.shallow_errors; + + if (repair) { + ++scrubber.fixed; + info.stats.stats = scrub_cstat; + info.stats.dirty_stats_invalid = false; + 
info.stats.omap_stats_invalid = false; + info.stats.hitset_stats_invalid = false; + info.stats.hitset_bytes_stats_invalid = false; + info.stats.pin_stats_invalid = false; + info.stats.manifest_stats_invalid = false; + publish_stats_to_osd(); + share_pg_info(); + } + } + // Clear object context cache to get repair information + if (repair) + object_contexts.clear(); +} + +bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on) +{ + return osd->check_osdmap_full(missing_on); +} + +int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx) +{ + OpRequestRef op = ctx->op; + // Only supports replicated pools + ceph_assert(!pool.info.is_erasure()); + ceph_assert(is_primary()); + + dout(10) << __func__ << " " << soid + << " peers osd.{" << acting_recovery_backfill << "}" << dendl; + + if (!is_clean()) { + block_for_clean(soid, op); + return -EAGAIN; + } + + ceph_assert(!pg_log.get_missing().is_missing(soid)); + auto& oi = ctx->new_obs.oi; + eversion_t v = oi.version; + + missing_loc.add_missing(soid, v, eversion_t()); + if (primary_error(soid, v)) { + dout(0) << __func__ << " No other replicas available for " << soid << dendl; + // XXX: If we knew that there is no down osd which could include this + // object, it would be nice if we could return EIO here. + // If a "never fail" flag was available, that could be used + // for rbd to NOT return EIO until object marked lost. + + // Drop through to save this op in case an osd comes up with the object. + } + + // Restart the op after object becomes readable again + waiting_for_unreadable_object[soid].push_back(op); + op->mark_delayed("waiting for missing object"); + + if (!eio_errors_to_process) { + eio_errors_to_process = true; + ceph_assert(is_clean()); + state_set(PG_STATE_REPAIR); + state_clear(PG_STATE_CLEAN); + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + DoRecovery()))); + } else { + // A prior error must have already cleared clean state and queued recovery + // or a map change has triggered re-peering. 
+ // Not inlining the recovery by calling maybe_kick_recovery(soid); + dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl; + } + + return -EAGAIN; +} + +/*---SnapTrimmer Logging---*/ +#undef dout_prefix +#define dout_prefix pg->gen_prefix(*_dout) + +void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name) +{ + ldout(pg->cct, 20) << "enter " << state_name << dendl; +} + +void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time) +{ + ldout(pg->cct, 20) << "exit " << state_name << dendl; +} + +/*---SnapTrimmer states---*/ +#undef dout_prefix +#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \ + << "SnapTrimmer state<" << get_state_name() << ">: ") + +/* NotTrimming */ +PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "NotTrimming") +{ + context< SnapTrimmer >().log_enter(state_name); +} + +void PrimaryLogPG::NotTrimming::exit() +{ + context< SnapTrimmer >().log_exit(state_name, enter_time); +} + +boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&) +{ + PrimaryLogPG *pg = context< SnapTrimmer >().pg; + ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl; + + if (!(pg->is_primary() && pg->is_active())) { + ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl; + return discard_event(); + } + if (!pg->is_clean() || + pg->snap_trimq.empty()) { + ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl; + return discard_event(); + } + if (pg->scrubber.active) { + ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl; + return transit< WaitScrub >(); + } else { + return transit< Trimming >(); + } +} + +boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&) +{ + PrimaryLogPG *pg = context< SnapTrimmer >().pg; + ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl; + + pending = nullptr; + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } + + context<Trimming>().snap_to_trim = pg->snap_trimq.range_start(); + ldout(pg->cct, 10) << "NotTrimming: trimming " + << pg->snap_trimq.range_start() + << dendl; + return transit< AwaitAsyncWork >(); +} + +/* AwaitAsyncWork */ +PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork") +{ + auto *pg = context< SnapTrimmer >().pg; + context< SnapTrimmer >().log_enter(state_name); + context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg); + pg->state_set(PG_STATE_SNAPTRIM); + pg->state_clear(PG_STATE_SNAPTRIM_ERROR); + pg->publish_stats_to_osd(); +} + +boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&) +{ + PrimaryLogPGRef pg = context< SnapTrimmer >().pg; + snapid_t snap_to_trim = context<Trimming>().snap_to_trim; + auto &in_flight = context<Trimming>().in_flight; + ceph_assert(in_flight.empty()); + + ceph_assert(pg->is_primary() && pg->is_active()); + if (!context< SnapTrimmer >().can_trim()) { + ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl; + post_event(KickTrim()); + return transit< NotTrimming >(); + } + + ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl; + + vector<hobject_t> to_trim; + unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims; + to_trim.reserve(max); + int r = pg->snap_mapper.get_next_objects_to_trim( + 
snap_to_trim, + max, + &to_trim); + if (r != 0 && r != -ENOENT) { + lderr(pg->cct) << "get_next_objects_to_trim returned " + << cpp_strerror(r) << dendl; + ceph_abort_msg("get_next_objects_to_trim returned an invalid code"); + } else if (r == -ENOENT) { + // Done! + ldout(pg->cct, 10) << "got ENOENT" << dendl; + + ldout(pg->cct, 10) << "adding snap " << snap_to_trim + << " to purged_snaps" + << dendl; + pg->info.purged_snaps.insert(snap_to_trim); + pg->snap_trimq.erase(snap_to_trim); + ldout(pg->cct, 10) << "purged_snaps now " + << pg->info.purged_snaps << ", snap_trimq now " + << pg->snap_trimq << dendl; + + ObjectStore::Transaction t; + pg->dirty_big_info = true; + pg->write_if_dirty(t); + int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL); + ceph_assert(tr == 0); + + pg->share_pg_info(); + post_event(KickTrim()); + return transit< NotTrimming >(); + } + ceph_assert(!to_trim.empty()); + + for (auto &&object: to_trim) { + // Get next + ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl; + OpContextUPtr ctx; + int error = pg->trim_object(in_flight.empty(), object, &ctx); + if (error) { + if (error == -ENOLCK) { + ldout(pg->cct, 10) << "could not get write lock on obj " + << object << dendl; + } else { + pg->state_set(PG_STATE_SNAPTRIM_ERROR); + ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl; + } + if (!in_flight.empty()) { + ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl; + return transit< WaitRepops >(); + } + if (error == -ENOLCK) { + ldout(pg->cct, 10) << "waiting for it to clear" + << dendl; + return transit< WaitRWLock >(); + } else { + return transit< NotTrimming >(); + } + } + + in_flight.insert(object); + ctx->register_on_success( + [pg, object, &in_flight]() { + ceph_assert(in_flight.find(object) != in_flight.end()); + in_flight.erase(object); + if (in_flight.empty()) { + if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) { + pg->snap_trimmer_machine.process_event(Reset()); + } else { + pg->snap_trimmer_machine.process_event(RepopsComplete()); + } + } + }); + + pg->simple_opc_submit(std::move(ctx)); + } + + return transit< WaitRepops >(); +} + +void PrimaryLogPG::setattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key, + bufferlist &val) +{ + t->setattr(obc->obs.oi.soid, key, val); +} + +void PrimaryLogPG::setattrs_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + map<string, bufferlist> &attrs) +{ + t->setattrs(obc->obs.oi.soid, attrs); +} + +void PrimaryLogPG::rmattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key) +{ + t->rmattr(obc->obs.oi.soid, key); +} + +int PrimaryLogPG::getattr_maybe_cache( + ObjectContextRef obc, + const string &key, + bufferlist *val) +{ + if (pool.info.is_erasure()) { + map<string, bufferlist>::iterator i = obc->attr_cache.find(key); + if (i != obc->attr_cache.end()) { + if (val) + *val = i->second; + return 0; + } else { + return -ENODATA; + } + } + return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val); +} + +int PrimaryLogPG::getattrs_maybe_cache( + ObjectContextRef obc, + map<string, bufferlist> *out) +{ + int r = 0; + ceph_assert(out); + if (pool.info.is_erasure()) { + *out = obc->attr_cache; + } else { + r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out); + } + map<string, bufferlist> tmp; + for (map<string, bufferlist>::iterator i = out->begin(); + i != out->end(); + ++i) { + if (i->first.size() > 1 && i->first[0] == '_') + tmp[i->first.substr(1, i->first.size())].claim(i->second); + } + 
tmp.swap(*out); + return r; +} + +bool PrimaryLogPG::check_failsafe_full() { + return osd->check_failsafe_full(get_dpp()); +} + +void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); } +void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); } + +#ifdef PG_DEBUG_REFS +uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); } +void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); } +#endif + +void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); } +void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); } diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h new file mode 100644 index 00000000..ba5a5123 --- /dev/null +++ b/src/osd/PrimaryLogPG.h @@ -0,0 +1,1949 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_REPLICATEDPG_H +#define CEPH_REPLICATEDPG_H + +#include <boost/tuple/tuple.hpp> +#include "include/ceph_assert.h" +#include "DynamicPerfStats.h" +#include "OSD.h" +#include "PG.h" +#include "Watch.h" +#include "TierAgentState.h" +#include "messages/MOSDOpReply.h" +#include "common/Checksummer.h" +#include "common/sharedptr_registry.hpp" +#include "common/shared_cache.hpp" +#include "ReplicatedBackend.h" +#include "PGTransaction.h" +#include "cls/cas/cls_cas_ops.h" + +class CopyFromCallback; +class PromoteCallback; + +class PrimaryLogPG; +class PGLSFilter; +class HitSet; +struct TierAgentState; +class MOSDOp; +class MOSDOpReply; +class OSDService; + +void intrusive_ptr_add_ref(PrimaryLogPG *pg); +void intrusive_ptr_release(PrimaryLogPG *pg); +uint64_t get_with_id(PrimaryLogPG *pg); +void put_with_id(PrimaryLogPG *pg, uint64_t id); + +#ifdef PG_DEBUG_REFS + typedef TrackedIntPtr<PrimaryLogPG> PrimaryLogPGRef; +#else + typedef boost::intrusive_ptr<PrimaryLogPG> PrimaryLogPGRef; +#endif + +struct inconsistent_snapset_wrapper; + +class PrimaryLogPG : public PG, public PGBackend::Listener { + friend class OSD; + friend class Watch; + +public: + MEMPOOL_CLASS_HELPERS(); + + /* + * state associated with a copy operation + */ + struct OpContext; + class CopyCallback; + + /** + * CopyResults stores the object metadata of interest to a copy initiator. + */ + struct CopyResults { + ceph::real_time mtime; ///< the copy source's mtime + uint64_t object_size; ///< the copied object's size + bool started_temp_obj; ///< true if the callback needs to delete temp object + hobject_t temp_oid; ///< temp object (if any) + + /** + * Function to fill in transaction; if non-empty the callback + * must execute it before any other accesses to the object + * (in order to complete the copy). 
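+ * (The initiator typically applies it to the PGTransaction it is building
+ * for the destination object before adding its own operations.)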
+ */ + std::function<void(PGTransaction *)> fill_in_final_tx; + + version_t user_version; ///< The copy source's user version + bool should_requeue; ///< op should be requeued on cancel + vector<snapid_t> snaps; ///< src's snaps (if clone) + snapid_t snap_seq; ///< src's snap_seq (if head) + librados::snap_set_t snapset; ///< src snapset (if head) + bool mirror_snapset; + bool has_omap; + uint32_t flags; // object_copy_data_t::FLAG_* + uint32_t source_data_digest, source_omap_digest; + uint32_t data_digest, omap_digest; + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids; // [(reqid, user_version)] + mempool::osd_pglog::map<uint32_t, int> reqid_return_codes; // map reqids by index to error code + map<string, bufferlist> attrs; // xattrs + uint64_t truncate_seq; + uint64_t truncate_size; + bool is_data_digest() { + return flags & object_copy_data_t::FLAG_DATA_DIGEST; + } + bool is_omap_digest() { + return flags & object_copy_data_t::FLAG_OMAP_DIGEST; + } + CopyResults() + : object_size(0), started_temp_obj(false), + user_version(0), + should_requeue(false), mirror_snapset(false), + has_omap(false), + flags(0), + source_data_digest(-1), source_omap_digest(-1), + data_digest(-1), omap_digest(-1), + truncate_seq(0), truncate_size(0) + {} + }; + + struct CopyOp; + typedef std::shared_ptr<CopyOp> CopyOpRef; + + struct CopyOp { + CopyCallback *cb; + ObjectContextRef obc; + hobject_t src; + object_locator_t oloc; + unsigned flags; + bool mirror_snapset; + + CopyResults results; + + ceph_tid_t objecter_tid; + ceph_tid_t objecter_tid2; + + object_copy_cursor_t cursor; + map<string,bufferlist> attrs; + bufferlist data; + bufferlist omap_header; + bufferlist omap_data; + int rval; + + object_copy_cursor_t temp_cursor; + + /* + * For CopyOp the process is: + * step1: read the data(attr/omap/data) from the source object + * step2: handle those data(w/ those data create a new object) + * src_obj_fadvise_flags used in step1; + * dest_obj_fadvise_flags used in step2 + */ + unsigned src_obj_fadvise_flags; + unsigned dest_obj_fadvise_flags; + + map<uint64_t, CopyOpRef> chunk_cops; + int num_chunk; + bool failed; + uint64_t start_offset = 0; + uint64_t last_offset = 0; + vector<OSDOp> chunk_ops; + + CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, + object_locator_t l, + version_t v, + unsigned f, + bool ms, + unsigned src_obj_fadvise_flags, + unsigned dest_obj_fadvise_flags) + : cb(cb_), obc(_obc), src(s), oloc(l), flags(f), + mirror_snapset(ms), + objecter_tid(0), + objecter_tid2(0), + rval(-1), + src_obj_fadvise_flags(src_obj_fadvise_flags), + dest_obj_fadvise_flags(dest_obj_fadvise_flags), + num_chunk(0), + failed(false) + { + results.user_version = v; + results.mirror_snapset = mirror_snapset; + } + }; + + /** + * The CopyCallback class defines an interface for completions to the + * copy_start code. Users of the copy infrastructure must implement + * one and give an instance of the class to start_copy. + * + * The implementer is responsible for making sure that the CopyCallback + * can associate itself with the correct copy operation. 
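+ * One way to do this is to capture the destination object (or its
+ * ObjectContext) when the copy is started and look the operation up in the
+ * copy_ops map when the callback fires.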
+ */ + typedef boost::tuple<int, CopyResults*> CopyCallbackResults; + + friend class CopyFromCallback; + friend class CopyFromFinisher; + friend class PromoteCallback; + friend class PromoteFinisher; + + struct ProxyReadOp { + OpRequestRef op; + hobject_t soid; + ceph_tid_t objecter_tid; + vector<OSDOp> &ops; + version_t user_version; + int data_offset; + bool canceled; ///< true if canceled + + ProxyReadOp(OpRequestRef _op, hobject_t oid, vector<OSDOp>& _ops) + : op(_op), soid(oid), + objecter_tid(0), ops(_ops), + user_version(0), data_offset(0), + canceled(false) { } + }; + typedef std::shared_ptr<ProxyReadOp> ProxyReadOpRef; + + struct ProxyWriteOp { + OpContext *ctx; + OpRequestRef op; + hobject_t soid; + ceph_tid_t objecter_tid; + vector<OSDOp> &ops; + version_t user_version; + bool sent_reply; + utime_t mtime; + bool canceled; + osd_reqid_t reqid; + + ProxyWriteOp(OpRequestRef _op, hobject_t oid, vector<OSDOp>& _ops, osd_reqid_t _reqid) + : ctx(NULL), op(_op), soid(oid), + objecter_tid(0), ops(_ops), + user_version(0), sent_reply(false), + canceled(false), + reqid(_reqid) { } + }; + typedef std::shared_ptr<ProxyWriteOp> ProxyWriteOpRef; + + struct FlushOp { + ObjectContextRef obc; ///< obc we are flushing + OpRequestRef op; ///< initiating op + list<OpRequestRef> dup_ops; ///< bandwagon jumpers + version_t flushed_version; ///< user version we are flushing + ceph_tid_t objecter_tid; ///< copy-from request tid + int rval; ///< copy-from result + bool blocking; ///< whether we are blocking updates + bool removal; ///< we are removing the backend object + boost::optional<std::function<void()>> on_flush; ///< callback, may be null + // for chunked object + map<uint64_t, int> io_results; + map<uint64_t, ceph_tid_t> io_tids; + uint64_t chunks; + + FlushOp() + : flushed_version(0), objecter_tid(0), rval(0), + blocking(false), removal(false), chunks(0) {} + ~FlushOp() { ceph_assert(!on_flush); } + }; + typedef std::shared_ptr<FlushOp> FlushOpRef; + + boost::scoped_ptr<PGBackend> pgbackend; + PGBackend *get_pgbackend() override { + return pgbackend.get(); + } + + const PGBackend *get_pgbackend() const override { + return pgbackend.get(); + } + + /// Listener methods + DoutPrefixProvider *get_dpp() override { + return this; + } + + void on_local_recover( + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info, + ObjectContextRef obc, + bool is_delete, + ObjectStore::Transaction *t + ) override; + void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info + ) override; + void begin_peer_recover( + pg_shard_t peer, + const hobject_t oid) override; + void on_global_recover( + const hobject_t &oid, + const object_stat_sum_t &stat_diff, + bool is_delete) override; + void failed_push(const list<pg_shard_t> &from, + const hobject_t &soid, + const eversion_t &need = eversion_t()) override; + void primary_failed(const hobject_t &soid) override; + bool primary_error(const hobject_t& soid, eversion_t v) override; + void cancel_pull(const hobject_t &soid) override; + void apply_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) override; + void on_primary_error(const hobject_t &oid, eversion_t v) override; + void backfill_add_missing(const hobject_t &oid, eversion_t v) override; + void remove_missing_object(const hobject_t &oid, + eversion_t v, + Context *on_complete) override; + + template<class T> class BlessedGenContext; + template<class T> class UnlockedBlessedGenContext; + class BlessedContext; + Context 
*bless_context(Context *c) override; + + GenContext<ThreadPool::TPHandle&> *bless_gencontext( + GenContext<ThreadPool::TPHandle&> *c) override; + GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext( + GenContext<ThreadPool::TPHandle&> *c) override; + + void send_message(int to_osd, Message *m) override { + osd->send_message_osd_cluster(to_osd, m, get_osdmap_epoch()); + } + void queue_transaction(ObjectStore::Transaction&& t, + OpRequestRef op) override { + osd->store->queue_transaction(ch, std::move(t), op); + } + void queue_transactions(vector<ObjectStore::Transaction>& tls, + OpRequestRef op) override { + osd->store->queue_transactions(ch, tls, op, NULL); + } + epoch_t get_interval_start_epoch() const override { + return info.history.same_interval_since; + } + epoch_t get_last_peering_reset_epoch() const override { + return get_last_peering_reset(); + } + const set<pg_shard_t> &get_acting_recovery_backfill_shards() const override { + return acting_recovery_backfill; + } + const set<pg_shard_t> &get_acting_shards() const override { + return actingset; + } + const set<pg_shard_t> &get_backfill_shards() const override { + return backfill_targets; + } + + std::ostream& gen_dbg_prefix(std::ostream& out) const override { + return gen_prefix(out); + } + + const map<hobject_t, set<pg_shard_t>> + &get_missing_loc_shards() const override { + return missing_loc.get_missing_locs(); + } + const map<pg_shard_t, pg_missing_t> &get_shard_missing() const override { + return peer_missing; + } + using PGBackend::Listener::get_shard_missing; + const map<pg_shard_t, pg_info_t> &get_shard_info() const override { + return peer_info; + } + using PGBackend::Listener::get_shard_info; + const pg_missing_tracker_t &get_local_missing() const override { + return pg_log.get_missing(); + } + const PGLog &get_log() const override { + return pg_log; + } + void add_local_next_event(const pg_log_entry_t& e) override { + pg_log.missing_add_next_entry(e); + } + bool pgb_is_primary() const override { + return is_primary(); + } + const OSDMapRef& pgb_get_osdmap() const override final { + return get_osdmap(); + } + epoch_t pgb_get_osdmap_epoch() const override final { + return get_osdmap_epoch(); + } + const pg_info_t &get_info() const override { + return info; + } + const pg_pool_t &get_pool() const override { + return pool.info; + } + + ObjectContextRef get_obc( + const hobject_t &hoid, + const map<string, bufferlist> &attrs) override { + return get_object_context(hoid, true, &attrs); + } + + bool try_lock_for_read( + const hobject_t &hoid, + ObcLockManager &manager) override { + if (is_missing_object(hoid)) + return false; + auto obc = get_object_context(hoid, false, nullptr); + if (!obc) + return false; + return manager.try_get_read_lock(hoid, obc); + } + + void release_locks(ObcLockManager &manager) override { + release_object_locks(manager); + } + + bool pg_is_repair() override { + return is_repair(); + } + void inc_osd_stat_repaired() override { + osd->inc_osd_stat_repaired(); + } + void set_osd_stat_repaired(int64_t count) override { + osd->set_osd_stat_repaired(count); + } + bool pg_is_remote_backfilling() override { + return is_remote_backfilling(); + } + void pg_add_local_num_bytes(int64_t num_bytes) override { + add_local_num_bytes(num_bytes); + } + void pg_sub_local_num_bytes(int64_t num_bytes) override { + sub_local_num_bytes(num_bytes); + } + void pg_add_num_bytes(int64_t num_bytes) override { + add_num_bytes(num_bytes); + } + void pg_sub_num_bytes(int64_t num_bytes) override { + 
sub_num_bytes(num_bytes); + } + + void pgb_set_object_snap_mapping( + const hobject_t &soid, + const set<snapid_t> &snaps, + ObjectStore::Transaction *t) override { + return update_object_snap_mapping(t, soid, snaps); + } + void pgb_clear_object_snap_mapping( + const hobject_t &soid, + ObjectStore::Transaction *t) override { + return clear_object_snap_mapping(t, soid); + } + + void log_operation( + const vector<pg_log_entry_t> &logv, + const boost::optional<pg_hit_set_history_t> &hset_history, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + bool transaction_applied, + ObjectStore::Transaction &t, + bool async = false) override { + if (is_primary()) { + ceph_assert(trim_to <= last_update_ondisk); + } + if (hset_history) { + info.hit_set = *hset_history; + } + append_log(logv, trim_to, roll_forward_to, t, transaction_applied, async); + } + + void op_applied(const eversion_t &applied_version) override; + + bool should_send_op( + pg_shard_t peer, + const hobject_t &hoid) override; + + bool pg_is_undersized() const override { + return is_undersized(); + } + + bool pg_is_repair() const override { + return is_repair(); + } + + void update_peer_last_complete_ondisk( + pg_shard_t fromosd, + eversion_t lcod) override { + peer_last_complete_ondisk[fromosd] = lcod; + } + + void update_last_complete_ondisk( + eversion_t lcod) override { + last_complete_ondisk = lcod; + } + + void update_stats( + const pg_stat_t &stat) override { + info.stats = stat; + } + + void schedule_recovery_work( + GenContext<ThreadPool::TPHandle&> *c) override; + + pg_shard_t whoami_shard() const override { + return pg_whoami; + } + spg_t primary_spg_t() const override { + return spg_t(info.pgid.pgid, primary.shard); + } + pg_shard_t primary_shard() const override { + return primary; + } + uint64_t min_upacting_features() const override { + return get_min_upacting_features(); + } + + void send_message_osd_cluster( + int peer, Message *m, epoch_t from_epoch) override; + void send_message_osd_cluster( + Message *m, Connection *con) override; + void send_message_osd_cluster( + Message *m, const ConnectionRef& con) override; + ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override; + entity_name_t get_cluster_msgr_name() override { + return osd->get_cluster_msgr_name(); + } + + PerfCounters *get_logger() override; + + ceph_tid_t get_tid() override { return osd->get_tid(); } + + LogClientTemp clog_error() override { return osd->clog->error(); } + LogClientTemp clog_warn() override { return osd->clog->warn(); } + + struct watch_disconnect_t { + uint64_t cookie; + entity_name_t name; + bool send_disconnect; + watch_disconnect_t(uint64_t c, entity_name_t n, bool sd) + : cookie(c), name(n), send_disconnect(sd) {} + }; + void complete_disconnect_watches( + ObjectContextRef obc, + const list<watch_disconnect_t> &to_disconnect); + + struct OpFinisher { + virtual ~OpFinisher() { + } + + virtual int execute() = 0; + }; + + /* + * Capture all object state associated with an in-progress read or write. 
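+ * It owns the pending transaction (op_t), the projected object state
+ * (new_obs / new_snapset), the log entries that will be generated for the
+ * update, and the completion callbacks registered against it.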
+ */ + struct OpContext { + OpRequestRef op; + osd_reqid_t reqid; + vector<OSDOp> *ops; + + const ObjectState *obs; // Old objectstate + const SnapSet *snapset; // Old snapset + + ObjectState new_obs; // resulting ObjectState + SnapSet new_snapset; // resulting SnapSet (in case of a write) + //pg_stat_t new_stats; // resulting Stats + object_stat_sum_t delta_stats; + + bool modify; // (force) modification (even if op_t is empty) + bool user_modify; // user-visible modification + bool undirty; // user explicitly un-dirtying this object + bool cache_evict; ///< true if this is a cache eviction + bool ignore_cache; ///< true if IGNORE_CACHE flag is set + bool ignore_log_op_stats; // don't log op stats + bool update_log_only; ///< this is a write that returned an error - just record in pg log for dup detection + + // side effects + list<pair<watch_info_t,bool> > watch_connects; ///< new watch + will_ping flag + list<watch_disconnect_t> watch_disconnects; ///< old watch + send_discon + list<notify_info_t> notifies; + struct NotifyAck { + boost::optional<uint64_t> watch_cookie; + uint64_t notify_id; + bufferlist reply_bl; + explicit NotifyAck(uint64_t notify_id) : notify_id(notify_id) {} + NotifyAck(uint64_t notify_id, uint64_t cookie, bufferlist& rbl) + : watch_cookie(cookie), notify_id(notify_id) { + reply_bl.claim(rbl); + } + }; + list<NotifyAck> notify_acks; + + uint64_t bytes_written, bytes_read; + + utime_t mtime; + SnapContext snapc; // writer snap context + eversion_t at_version; // pg's current version pointer + version_t user_at_version; // pg's current user version pointer + + /// index of the current subop - only valid inside of do_osd_ops() + int current_osd_subop_num; + /// total number of subops processed in this context for cls_cxx_subop_version() + int processed_subop_count = 0; + + PGTransactionUPtr op_t; + vector<pg_log_entry_t> log; + boost::optional<pg_hit_set_history_t> updated_hset_history; + + interval_set<uint64_t> modified_ranges; + ObjectContextRef obc; + ObjectContextRef clone_obc; // if we created a clone + ObjectContextRef head_obc; // if we also update snapset (see trim_object) + + // FIXME: we may want to kill this msgr hint off at some point! 
+ boost::optional<int> data_off = boost::none; + + MOSDOpReply *reply; + + PrimaryLogPG *pg; + + int num_read; ///< count read ops + int num_write; ///< count update ops + + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids; + mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes; + + hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking + + list<std::function<void()>> on_applied; + list<std::function<void()>> on_committed; + list<std::function<void()>> on_finish; + list<std::function<void()>> on_success; + template <typename F> + void register_on_finish(F &&f) { + on_finish.emplace_back(std::forward<F>(f)); + } + template <typename F> + void register_on_success(F &&f) { + on_success.emplace_back(std::forward<F>(f)); + } + template <typename F> + void register_on_applied(F &&f) { + on_applied.emplace_back(std::forward<F>(f)); + } + template <typename F> + void register_on_commit(F &&f) { + on_committed.emplace_back(std::forward<F>(f)); + } + + bool sent_reply = false; + + // pending async reads <off, len, op_flags> -> <outbl, outr> + list<pair<boost::tuple<uint64_t, uint64_t, unsigned>, + pair<bufferlist*, Context*> > > pending_async_reads; + int inflightreads; + friend struct OnReadComplete; + void start_async_reads(PrimaryLogPG *pg); + void finish_read(PrimaryLogPG *pg); + bool async_reads_complete() { + return inflightreads == 0; + } + + ObjectContext::RWState::State lock_type; + ObcLockManager lock_manager; + + std::map<int, std::unique_ptr<OpFinisher>> op_finishers; + + OpContext(const OpContext& other); + const OpContext& operator=(const OpContext& other); + + OpContext(OpRequestRef _op, osd_reqid_t _reqid, vector<OSDOp>* _ops, + ObjectContextRef& obc, + PrimaryLogPG *_pg) : + op(_op), reqid(_reqid), ops(_ops), + obs(&obc->obs), + snapset(0), + new_obs(obs->oi, obs->exists), + modify(false), user_modify(false), undirty(false), cache_evict(false), + ignore_cache(false), ignore_log_op_stats(false), update_log_only(false), + bytes_written(0), bytes_read(0), user_at_version(0), + current_osd_subop_num(0), + obc(obc), + reply(NULL), pg(_pg), + num_read(0), + num_write(0), + sent_reply(false), + inflightreads(0), + lock_type(ObjectContext::RWState::RWNONE) { + if (obc->ssc) { + new_snapset = obc->ssc->snapset; + snapset = &obc->ssc->snapset; + } + } + OpContext(OpRequestRef _op, osd_reqid_t _reqid, + vector<OSDOp>* _ops, PrimaryLogPG *_pg) : + op(_op), reqid(_reqid), ops(_ops), obs(NULL), snapset(0), + modify(false), user_modify(false), undirty(false), cache_evict(false), + ignore_cache(false), ignore_log_op_stats(false), update_log_only(false), + bytes_written(0), bytes_read(0), user_at_version(0), + current_osd_subop_num(0), + reply(NULL), pg(_pg), + num_read(0), + num_write(0), + inflightreads(0), + lock_type(ObjectContext::RWState::RWNONE) {} + void reset_obs(ObjectContextRef obc) { + new_obs = ObjectState(obc->obs.oi, obc->obs.exists); + if (obc->ssc) { + new_snapset = obc->ssc->snapset; + snapset = &obc->ssc->snapset; + } + } + ~OpContext() { + ceph_assert(!op_t); + if (reply) + reply->put(); + for (list<pair<boost::tuple<uint64_t, uint64_t, unsigned>, + pair<bufferlist*, Context*> > >::iterator i = + pending_async_reads.begin(); + i != pending_async_reads.end(); + pending_async_reads.erase(i++)) { + delete i->second.second; + } + } + uint64_t get_features() { + if (op && op->get_req()) { + return op->get_req()->get_connection()->get_features(); + } + return -1ull; + } + }; + using OpContextUPtr = 
std::unique_ptr<OpContext>; + friend struct OpContext; + + /* + * State on the PG primary associated with the replicated mutation + */ + class RepGather { + public: + hobject_t hoid; + OpRequestRef op; + xlist<RepGather*>::item queue_item; + int nref; + + eversion_t v; + int r = 0; + + ceph_tid_t rep_tid; + + bool rep_aborted; + bool all_committed; + + utime_t start; + + eversion_t pg_local_last_complete; + + ObcLockManager lock_manager; + + list<std::function<void()>> on_committed; + list<std::function<void()>> on_success; + list<std::function<void()>> on_finish; + + RepGather( + OpContext *c, ceph_tid_t rt, + eversion_t lc) : + hoid(c->obc->obs.oi.soid), + op(c->op), + queue_item(this), + nref(1), + rep_tid(rt), + rep_aborted(false), + all_committed(false), + pg_local_last_complete(lc), + lock_manager(std::move(c->lock_manager)), + on_committed(std::move(c->on_committed)), + on_success(std::move(c->on_success)), + on_finish(std::move(c->on_finish)) {} + + RepGather( + ObcLockManager &&manager, + OpRequestRef &&o, + boost::optional<std::function<void(void)> > &&on_complete, + ceph_tid_t rt, + eversion_t lc, + int r) : + op(o), + queue_item(this), + nref(1), + r(r), + rep_tid(rt), + rep_aborted(false), + all_committed(false), + pg_local_last_complete(lc), + lock_manager(std::move(manager)) { + if (on_complete) { + on_success.push_back(std::move(*on_complete)); + } + } + + RepGather *get() { + nref++; + return this; + } + void put() { + ceph_assert(nref > 0); + if (--nref == 0) { + delete this; + //generic_dout(0) << "deleting " << this << dendl; + } + } + }; + + +protected: + + /** + * Grabs locks for OpContext, should be cleaned up in close_op_ctx + * + * @param ctx [in,out] ctx to get locks for + * @return true on success, false if we are queued + */ + bool get_rw_locks(bool write_ordered, OpContext *ctx) { + /* If head_obc, !obc->obs->exists and we will always take the + * snapdir lock *before* the head lock. Since all callers will do + * this (read or write) if we get the first we will be guaranteed + * to get the second. 
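+ * Because every path acquires the two locks in the same fixed order, no two
+ * ops can ever hold them in opposite orders, so the pair cannot deadlock.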
+ */ + if (write_ordered && ctx->op->may_read()) { + ctx->lock_type = ObjectContext::RWState::RWEXCL; + } else if (write_ordered) { + ctx->lock_type = ObjectContext::RWState::RWWRITE; + } else { + ceph_assert(ctx->op->may_read()); + ctx->lock_type = ObjectContext::RWState::RWREAD; + } + + if (ctx->head_obc) { + ceph_assert(!ctx->obc->obs.exists); + if (!ctx->lock_manager.get_lock_type( + ctx->lock_type, + ctx->head_obc->obs.oi.soid, + ctx->head_obc, + ctx->op)) { + ctx->lock_type = ObjectContext::RWState::RWNONE; + return false; + } + } + if (ctx->lock_manager.get_lock_type( + ctx->lock_type, + ctx->obc->obs.oi.soid, + ctx->obc, + ctx->op)) { + return true; + } else { + ceph_assert(!ctx->head_obc); + ctx->lock_type = ObjectContext::RWState::RWNONE; + return false; + } + } + + /** + * Cleans up OpContext + * + * @param ctx [in] ctx to clean up + */ + void close_op_ctx(OpContext *ctx); + + /** + * Releases locks + * + * @param manager [in] manager with locks to release + */ + void release_object_locks( + ObcLockManager &lock_manager) { + list<pair<ObjectContextRef, list<OpRequestRef> > > to_req; + bool requeue_recovery = false; + bool requeue_snaptrim = false; + lock_manager.put_locks( + &to_req, + &requeue_recovery, + &requeue_snaptrim); + if (requeue_recovery) + queue_recovery(); + if (requeue_snaptrim) + snap_trimmer_machine.process_event(TrimWriteUnblocked()); + + if (!to_req.empty()) { + // requeue at front of scrub blocking queue if we are blocked by scrub + for (auto &&p: to_req) { + if (write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) { + for (auto& op : p.second) { + op->mark_delayed("waiting for scrub"); + } + + waiting_for_scrub.splice( + waiting_for_scrub.begin(), + p.second, + p.second.begin(), + p.second.end()); + } else { + requeue_ops(p.second); + } + } + } + } + + // replica ops + // [primary|tail] + xlist<RepGather*> repop_queue; + + friend class C_OSD_RepopCommit; + void repop_all_committed(RepGather *repop); + void eval_repop(RepGather*); + void issue_repop(RepGather *repop, OpContext *ctx); + RepGather *new_repop( + OpContext *ctx, + ObjectContextRef obc, + ceph_tid_t rep_tid); + boost::intrusive_ptr<RepGather> new_repop( + eversion_t version, + int r, + ObcLockManager &&manager, + OpRequestRef &&op, + boost::optional<std::function<void(void)> > &&on_complete); + void remove_repop(RepGather *repop); + + OpContextUPtr simple_opc_create(ObjectContextRef obc); + void simple_opc_submit(OpContextUPtr ctx); + + /** + * Merge entries atomically into all acting_recovery_backfill osds + * adjusting missing and recovery state as necessary. + * + * Also used to store error log entries for dup detection. 
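+ * (Writes that are only recorded for dup op detection, cf.
+ * OpContext::update_log_only, reach the log through this path.)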
+ */ + void submit_log_entries( + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + ObcLockManager &&manager, + boost::optional<std::function<void(void)> > &&on_complete, + OpRequestRef op = OpRequestRef(), + int r = 0); + struct LogUpdateCtx { + boost::intrusive_ptr<RepGather> repop; + set<pg_shard_t> waiting_on; + }; + void cancel_log_updates(); + map<ceph_tid_t, LogUpdateCtx> log_entry_update_waiting_on; + + + // hot/cold tracking + HitSetRef hit_set; ///< currently accumulating HitSet + utime_t hit_set_start_stamp; ///< time the current HitSet started recording + + + void hit_set_clear(); ///< discard any HitSet state + void hit_set_setup(); ///< initialize HitSet state + void hit_set_create(); ///< create a new HitSet + void hit_set_persist(); ///< persist hit info + bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet + void hit_set_trim(OpContextUPtr &ctx, unsigned max); ///< discard old HitSets + void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets + void hit_set_remove_all(); + + hobject_t get_hit_set_current_object(utime_t stamp); + hobject_t get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt); + + // agent + boost::scoped_ptr<TierAgentState> agent_state; + + void agent_setup(); ///< initialize agent state + bool agent_work(int max) override ///< entry point to do some agent work + { + return agent_work(max, max); + } + bool agent_work(int max, int agent_flush_quota) override; + bool agent_maybe_flush(ObjectContextRef& obc); ///< maybe flush + bool agent_maybe_evict(ObjectContextRef& obc, bool after_flush); ///< maybe evict + + void agent_load_hit_sets(); ///< load HitSets, if needed + + /// estimate object atime and temperature + /// + /// @param oid [in] object name + /// @param temperature [out] relative temperature (# consider both access time and frequency) + void agent_estimate_temp(const hobject_t& oid, int *temperature); + + /// stop the agent + void agent_stop() override; + void agent_delay() override; + + /// clear agent state + void agent_clear() override; + + /// choose (new) agent mode(s), returns true if op is requeued + bool agent_choose_mode(bool restart = false, OpRequestRef op = OpRequestRef()); + void agent_choose_mode_restart() override; + + /// true if we can send an ondisk/commit for v + bool already_complete(eversion_t v); + /// true if we can send an ack for v + bool already_ack(eversion_t v); + + // projected object info + SharedLRU<hobject_t, ObjectContext> object_contexts; + // map from oid.snapdir() to SnapSetContext * + map<hobject_t, SnapSetContext*> snapset_contexts; + Mutex snapset_contexts_lock; + + // debug order that client ops are applied + map<hobject_t, map<client_t, ceph_tid_t>> debug_op_order; + + void populate_obc_watchers(ObjectContextRef obc); + void check_blacklisted_obc_watchers(ObjectContextRef obc); + void check_blacklisted_watchers() override; + void get_watchers(list<obj_watch_item_t> *ls) override; + void get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers); +public: + void handle_watch_timeout(WatchRef watch); +protected: + + ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc); + ObjectContextRef get_object_context( + const hobject_t& soid, + bool can_create, + const map<string, bufferlist> *attrs = 0 + ); + + void context_registry_on_change(); + void object_context_destructor_callback(ObjectContext *obc); + class C_PG_ObjectContext; + + int find_object_context(const hobject_t& oid, + 
ObjectContextRef *pobc, + bool can_create, + bool map_snapid_to_clone=false, + hobject_t *missing_oid=NULL); + + void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat); + + void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc); + + SnapSetContext *get_snapset_context( + const hobject_t& oid, + bool can_create, + const map<string, bufferlist> *attrs = 0, + bool oid_existed = true //indicate this oid whether exsited in backend + ); + void register_snapset_context(SnapSetContext *ssc) { + std::lock_guard l(snapset_contexts_lock); + _register_snapset_context(ssc); + } + void _register_snapset_context(SnapSetContext *ssc) { + ceph_assert(snapset_contexts_lock.is_locked()); + if (!ssc->registered) { + ceph_assert(snapset_contexts.count(ssc->oid) == 0); + ssc->registered = true; + snapset_contexts[ssc->oid] = ssc; + } + } + void put_snapset_context(SnapSetContext *ssc); + + map<hobject_t, ObjectContextRef> recovering; + + /* + * Backfill + * + * peer_info[backfill_target].last_backfill == info.last_backfill on the peer. + * + * objects prior to peer_info[backfill_target].last_backfill + * - are on the peer + * - are included in the peer stats + * + * objects \in (last_backfill, last_backfill_started] + * - are on the peer or are in backfills_in_flight + * - are not included in pg stats (yet) + * - have their stats in pending_backfill_updates on the primary + */ + set<hobject_t> backfills_in_flight; + map<hobject_t, pg_stat_t> pending_backfill_updates; + + void dump_recovery_info(Formatter *f) const override { + f->open_array_section("backfill_targets"); + for (set<pg_shard_t>::const_iterator p = backfill_targets.begin(); + p != backfill_targets.end(); ++p) + f->dump_stream("replica") << *p; + f->close_section(); + f->open_array_section("waiting_on_backfill"); + for (set<pg_shard_t>::const_iterator p = waiting_on_backfill.begin(); + p != waiting_on_backfill.end(); ++p) + f->dump_stream("osd") << *p; + f->close_section(); + f->dump_stream("last_backfill_started") << last_backfill_started; + { + f->open_object_section("backfill_info"); + backfill_info.dump(f); + f->close_section(); + } + { + f->open_array_section("peer_backfill_info"); + for (map<pg_shard_t, BackfillInterval>::const_iterator pbi = + peer_backfill_info.begin(); + pbi != peer_backfill_info.end(); ++pbi) { + f->dump_stream("osd") << pbi->first; + f->open_object_section("BackfillInterval"); + pbi->second.dump(f); + f->close_section(); + } + f->close_section(); + } + { + f->open_array_section("backfills_in_flight"); + for (set<hobject_t>::const_iterator i = backfills_in_flight.begin(); + i != backfills_in_flight.end(); + ++i) { + f->dump_stream("object") << *i; + } + f->close_section(); + } + { + f->open_array_section("recovering"); + for (map<hobject_t, ObjectContextRef>::const_iterator i = recovering.begin(); + i != recovering.end(); + ++i) { + f->dump_stream("object") << i->first; + } + f->close_section(); + } + { + f->open_object_section("pg_backend"); + pgbackend->dump_recovery_info(f); + f->close_section(); + } + } + + /// last backfill operation started + hobject_t last_backfill_started; + bool new_backfill; + + int prep_object_replica_pushes(const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started); + int prep_object_replica_deletes(const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started); + + void finish_degraded_object(const hobject_t oid) override; + + // Cancels/resets pulls from peer + void 
check_recovery_sources(const OSDMapRef& map) override ; + + int recover_missing( + const hobject_t& oid, + eversion_t v, + int priority, + PGBackend::RecoveryHandle *h); + + // low level ops + + void _make_clone( + OpContext *ctx, + PGTransaction* t, + ObjectContextRef obc, + const hobject_t& head, const hobject_t& coid, + object_info_t *poi); + void execute_ctx(OpContext *ctx); + void finish_ctx(OpContext *ctx, int log_op_type); + void reply_ctx(OpContext *ctx, int err); + void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv); + void make_writeable(OpContext *ctx); + void log_op_stats(const OpRequest& op, uint64_t inb, uint64_t outb); + + void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi, + interval_set<uint64_t>& modified, uint64_t offset, + uint64_t length, bool write_full=false); + inline void truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size); + + enum class cache_result_t { + NOOP, + BLOCKED_FULL, + BLOCKED_PROMOTE, + HANDLED_PROXY, + HANDLED_REDIRECT, + REPLIED_WITH_EAGAIN, + BLOCKED_RECOVERY, + }; + cache_result_t maybe_handle_cache_detail(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc, int r, + hobject_t missing_oid, + bool must_promote, + bool in_hit_set, + ObjectContextRef *promote_obc); + cache_result_t maybe_handle_manifest_detail(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc); + bool maybe_handle_manifest(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc) { + return cache_result_t::NOOP != maybe_handle_manifest_detail( + op, + write_ordered, + obc); + } + + /** + * This helper function is called from do_op if the ObjectContext lookup fails. + * @returns true if the caching code is handling the Op, false otherwise. + */ + bool maybe_handle_cache(OpRequestRef op, + bool write_ordered, + ObjectContextRef obc, int r, + const hobject_t& missing_oid, + bool must_promote, + bool in_hit_set = false) { + return cache_result_t::NOOP != maybe_handle_cache_detail( + op, + write_ordered, + obc, + r, + missing_oid, + must_promote, + in_hit_set, + nullptr); + } + + /** + * This helper function checks if a promotion is needed. + */ + bool maybe_promote(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + bool in_hit_set, + uint32_t recency, + OpRequestRef promote_op, + ObjectContextRef *promote_obc = nullptr); + /** + * This helper function tells the client to redirect their request elsewhere. + */ + void do_cache_redirect(OpRequestRef op); + /** + * This function attempts to start a promote. Either it succeeds, + * or places op on a wait list. If op is null, failure means that + * this is a noop. If a future user wants to be able to distinguish + * these cases, a return value should be added. 
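+ * Callers are expected to reach this via the cache helpers above
+ * (maybe_handle_cache_detail() / maybe_promote()) once they decide a
+ * promotion is required.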
+ */ + void promote_object( + ObjectContextRef obc, ///< [optional] obc + const hobject_t& missing_object, ///< oid (if !obc) + const object_locator_t& oloc, ///< locator for obc|oid + OpRequestRef op, ///< [optional] client op + ObjectContextRef *promote_obc = nullptr ///< [optional] new obc for object + ); + + int prepare_transaction(OpContext *ctx); + list<pair<OpRequestRef, OpContext*> > in_progress_async_reads; + void complete_read_ctx(int result, OpContext *ctx); + + // pg on-disk content + void check_local() override; + + void _clear_recovery_state() override; + + bool start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, uint64_t *started) override; + + uint64_t recover_primary(uint64_t max, ThreadPool::TPHandle &handle); + uint64_t recover_replicas(uint64_t max, ThreadPool::TPHandle &handle, + bool *recovery_started); + hobject_t earliest_peer_backfill() const; + bool all_peer_done() const; + /** + * @param work_started will be set to true if recover_backfill got anywhere + * @returns the number of operations started + */ + uint64_t recover_backfill(uint64_t max, ThreadPool::TPHandle &handle, + bool *work_started); + + /** + * scan a (hash) range of objects in the current pg + * + * @begin first item should be >= this value + * @min return at least this many items, unless we are done + * @max return no more than this many items + * @bi [out] resulting map of objects to eversion_t's + */ + void scan_range( + int min, int max, BackfillInterval *bi, + ThreadPool::TPHandle &handle + ); + + /// Update a hash range to reflect changes since the last scan + void update_range( + BackfillInterval *bi, ///< [in,out] interval to update + ThreadPool::TPHandle &handle ///< [in] tp handle + ); + + int prep_backfill_object_push( + hobject_t oid, eversion_t v, ObjectContextRef obc, + vector<pg_shard_t> peers, + PGBackend::RecoveryHandle *h); + void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer); + + + class C_OSD_AppliedRecoveredObject; + class C_OSD_CommittedPushedObject; + class C_OSD_AppliedRecoveredObjectReplica; + + void _applied_recovered_object(ObjectContextRef obc); + void _applied_recovered_object_replica(); + void _committed_pushed_object(epoch_t epoch, eversion_t lc); + void recover_got(hobject_t oid, eversion_t v); + + // -- copyfrom -- + map<hobject_t, CopyOpRef> copy_ops; + + int do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& op, + ObjectContextRef& obc); + int finish_copy_get(); + + void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid, + OSDOp& osd_op); + + /** + * To copy an object, call start_copy. 
+ * + * @param cb: The CopyCallback to be activated when the copy is complete + * @param obc: The ObjectContext we are copying into + * @param src: The source object + * @param oloc: the source object locator + * @param version: the version of the source object to copy (0 for any) + */ + void start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src, + object_locator_t oloc, version_t version, unsigned flags, + bool mirror_snapset, unsigned src_obj_fadvise_flags, + unsigned dest_obj_fadvise_flags); + void process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r); + void _write_copy_chunk(CopyOpRef cop, PGTransaction *t); + uint64_t get_copy_chunk_size() const { + uint64_t size = cct->_conf->osd_copyfrom_max_chunk; + if (pool.info.required_alignment()) { + uint64_t alignment = pool.info.required_alignment(); + if (size % alignment) { + size += alignment - (size % alignment); + } + } + return size; + } + void _copy_some(ObjectContextRef obc, CopyOpRef cop); + void finish_copyfrom(CopyFromCallback *cb); + void finish_promote(int r, CopyResults *results, ObjectContextRef obc); + void cancel_copy(CopyOpRef cop, bool requeue, vector<ceph_tid_t> *tids); + void cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids); + + friend struct C_Copyfrom; + + // -- flush -- + map<hobject_t, FlushOpRef> flush_ops; + + /// start_flush takes ownership of on_flush iff ret == -EINPROGRESS + int start_flush( + OpRequestRef op, ObjectContextRef obc, + bool blocking, hobject_t *pmissing, + boost::optional<std::function<void()>> &&on_flush); + void finish_flush(hobject_t oid, ceph_tid_t tid, int r); + int try_flush_mark_clean(FlushOpRef fop); + void cancel_flush(FlushOpRef fop, bool requeue, vector<ceph_tid_t> *tids); + void cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids); + + /// @return false if clone is has been evicted + bool is_present_clone(hobject_t coid); + + friend struct C_Flush; + + // -- scrub -- + bool _range_available_for_scrub( + const hobject_t &begin, const hobject_t &end) override; + void scrub_snapshot_metadata( + ScrubMap &map, + const std::map<hobject_t, + pair<boost::optional<uint32_t>, + boost::optional<uint32_t>>> &missing_digest) override; + void _scrub_clear_state() override; + void _scrub_finish() override; + object_stat_collection_t scrub_cstat; + + void _split_into(pg_t child_pgid, PG *child, + unsigned split_bits) override; + void apply_and_flush_repops(bool requeue); + + void calc_trim_to() override; + void calc_trim_to_aggressive() override; + int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr); + int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr); + + // -- checksum -- + int do_checksum(OpContext *ctx, OSDOp& osd_op, bufferlist::const_iterator *bl_it); + int finish_checksum(OSDOp& osd_op, Checksummer::CSumType csum_type, + bufferlist::const_iterator *init_value_bl_it, + const bufferlist &read_bl); + + friend class C_ChecksumRead; + + int do_extent_cmp(OpContext *ctx, OSDOp& osd_op); + int finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl); + + friend class C_ExtentCmpRead; + + int do_read(OpContext *ctx, OSDOp& osd_op); + int do_sparse_read(OpContext *ctx, OSDOp& osd_op); + int do_writesame(OpContext *ctx, OSDOp& osd_op); + + bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata); + int get_pgls_filter(bufferlist::const_iterator& iter, PGLSFilter **pfilter); + + map<hobject_t, list<OpRequestRef>> in_progress_proxy_ops; + void kick_proxy_ops_blocked(hobject_t& soid); + void cancel_proxy_ops(bool requeue, vector<ceph_tid_t> 
*tids); + + // -- proxyread -- + map<ceph_tid_t, ProxyReadOpRef> proxyread_ops; + + void do_proxy_read(OpRequestRef op, ObjectContextRef obc = NULL); + void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r); + void cancel_proxy_read(ProxyReadOpRef prdop, vector<ceph_tid_t> *tids); + + friend struct C_ProxyRead; + + // -- proxywrite -- + map<ceph_tid_t, ProxyWriteOpRef> proxywrite_ops; + + void do_proxy_write(OpRequestRef op, ObjectContextRef obc = NULL); + void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r); + void cancel_proxy_write(ProxyWriteOpRef pwop, vector<ceph_tid_t> *tids); + + friend struct C_ProxyWrite_Commit; + + // -- chunkop -- + void do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid, + ObjectContextRef obc, bool write_ordered); + void do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index, + uint64_t chunk_index, uint64_t req_offset, uint64_t req_length, + uint64_t req_total_len, bool write_ordered); + bool can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc); + void _copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset); + void process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset); + void finish_promote_manifest(int r, CopyResults *results, ObjectContextRef obc); + void cancel_and_requeue_proxy_ops(hobject_t oid); + int do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop, + uint64_t start_offset, bool block); + int start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking, + boost::optional<std::function<void()>> &&on_flush); + void finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r, ObjectContextRef obc, + uint64_t last_offset); + void handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r, + uint64_t offset, uint64_t last_offset, epoch_t lpr); + void refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid, + SnapContext snapc, bool get, Context *cb, uint64_t offset); + + friend struct C_ProxyChunkRead; + friend class PromoteManifestCallback; + friend class C_CopyChunk; + friend struct C_ManifestFlush; + friend struct RefCountCallback; + +public: + PrimaryLogPG(OSDService *o, OSDMapRef curmap, + const PGPool &_pool, + const map<string,string>& ec_profile, + spg_t p); + ~PrimaryLogPG() override {} + + int do_command( + cmdmap_t cmdmap, + ostream& ss, + bufferlist& idata, + bufferlist& odata, + ConnectionRef conn, + ceph_tid_t tid) override; + + void clear_cache(); + int get_cache_obj_count() { + return object_contexts.get_count(); + } + void do_request( + OpRequestRef& op, + ThreadPool::TPHandle &handle) override; + void do_op(OpRequestRef& op); + void record_write_error(OpRequestRef op, const hobject_t &soid, + MOSDOpReply *orig_reply, int r); + void do_pg_op(OpRequestRef op); + void do_scan( + OpRequestRef op, + ThreadPool::TPHandle &handle); + void do_backfill(OpRequestRef op); + void do_backfill_remove(OpRequestRef op); + + void handle_backoff(OpRequestRef& op); + + int trim_object(bool first, const hobject_t &coid, OpContextUPtr *ctxp); + void snap_trimmer(epoch_t e) override; + void kick_snap_trim() override; + void snap_trimmer_scrub_complete() override; + int do_osd_ops(OpContext *ctx, vector<OSDOp>& ops); + + int _get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals); + int do_tmap2omap(OpContext *ctx, unsigned flags); + int do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op); + int do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp, 
OSDOp& osd_op, bufferlist& bl); + + void do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn); +private: + int do_scrub_ls(MOSDOp *op, OSDOp *osd_op); + hobject_t earliest_backfill() const; + bool check_src_targ(const hobject_t& soid, const hobject_t& toid) const; + + uint64_t temp_seq; ///< last id for naming temp objects + /// generate a new temp object name + hobject_t generate_temp_object(const hobject_t& target); + /// generate a new temp object name (for recovery) + hobject_t get_temp_recovery_object(const hobject_t& target, + eversion_t version) override; + int get_recovery_op_priority() const { + int64_t pri = 0; + pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); + return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority; + } + void log_missing(unsigned missing, + const boost::optional<hobject_t> &head, + LogChannelRef clog, + const spg_t &pgid, + const char *func, + const char *mode, + bool allow_incomplete_clones); + unsigned process_clones_to(const boost::optional<hobject_t> &head, + const boost::optional<SnapSet> &snapset, + LogChannelRef clog, + const spg_t &pgid, + const char *mode, + bool allow_incomplete_clones, + boost::optional<snapid_t> target, + vector<snapid_t>::reverse_iterator *curclone, + inconsistent_snapset_wrapper &snap_error); + +public: + coll_t get_coll() { + return coll; + } + void split_colls( + spg_t child, + int split_bits, + int seed, + const pg_pool_t *pool, + ObjectStore::Transaction *t) override { + coll_t target = coll_t(child); + PG::_create(*t, child, split_bits); + t->split_collection( + coll, + split_bits, + seed, + target); + PG::_init(*t, child, pool); + } +private: + + struct DoSnapWork : boost::statechart::event< DoSnapWork > { + DoSnapWork() : boost::statechart::event < DoSnapWork >() {} + }; + struct KickTrim : boost::statechart::event< KickTrim > { + KickTrim() : boost::statechart::event < KickTrim >() {} + }; + struct RepopsComplete : boost::statechart::event< RepopsComplete > { + RepopsComplete() : boost::statechart::event < RepopsComplete >() {} + }; + struct ScrubComplete : boost::statechart::event< ScrubComplete > { + ScrubComplete() : boost::statechart::event < ScrubComplete >() {} + }; + struct TrimWriteUnblocked : boost::statechart::event< TrimWriteUnblocked > { + TrimWriteUnblocked() : boost::statechart::event < TrimWriteUnblocked >() {} + }; + struct Reset : boost::statechart::event< Reset > { + Reset() : boost::statechart::event< Reset >() {} + }; + struct SnapTrimReserved : boost::statechart::event< SnapTrimReserved > { + SnapTrimReserved() : boost::statechart::event< SnapTrimReserved >() {} + }; + struct SnapTrimTimerReady : boost::statechart::event< SnapTrimTimerReady > { + SnapTrimTimerReady() : boost::statechart::event< SnapTrimTimerReady >() {} + }; + + struct NotTrimming; + struct SnapTrimmer : public boost::statechart::state_machine< SnapTrimmer, NotTrimming > { + PrimaryLogPG *pg; + explicit SnapTrimmer(PrimaryLogPG *pg) : pg(pg) {} + void log_enter(const char *state_name); + void log_exit(const char *state_name, utime_t duration); + bool permit_trim() { + return + pg->is_clean() && + !pg->scrubber.active && + !pg->snap_trimq.empty(); + } + bool can_trim() { + return + permit_trim() && + !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM); + } + } snap_trimmer_machine; + + struct WaitReservation; + struct Trimming : boost::statechart::state< Trimming, SnapTrimmer, WaitReservation >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< KickTrim >, + 
boost::statechart::transition< Reset, NotTrimming > + > reactions; + + set<hobject_t> in_flight; + snapid_t snap_to_trim; + + explicit Trimming(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "Trimming") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context< SnapTrimmer >().permit_trim()); + ceph_assert(in_flight.empty()); + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + auto *pg = context< SnapTrimmer >().pg; + pg->osd->snap_reserver.cancel_reservation(pg->get_pgid()); + pg->state_clear(PG_STATE_SNAPTRIM); + pg->publish_stats_to_osd(); + } + boost::statechart::result react(const KickTrim&) { + return discard_event(); + } + }; + + /* SnapTrimmerStates */ + struct WaitTrimTimer : boost::statechart::state< WaitTrimTimer, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< SnapTrimTimerReady > + > reactions; + Context *wakeup = nullptr; + explicit WaitTrimTimer(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "Trimming/WaitTrimTimer") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context<Trimming>().in_flight.empty()); + struct OnTimer : Context { + PrimaryLogPGRef pg; + epoch_t epoch; + OnTimer(PrimaryLogPGRef pg, epoch_t epoch) : pg(pg), epoch(epoch) {} + void finish(int) override { + pg->lock(); + if (!pg->pg_has_reset_since(epoch)) + pg->snap_trimmer_machine.process_event(SnapTrimTimerReady()); + pg->unlock(); + } + }; + auto *pg = context< SnapTrimmer >().pg; + float osd_snap_trim_sleep = pg->osd->osd->get_osd_snap_trim_sleep(); + if (osd_snap_trim_sleep > 0) { + std::lock_guard l(pg->osd->sleep_lock); + wakeup = pg->osd->sleep_timer.add_event_after( + osd_snap_trim_sleep, + new OnTimer{pg, pg->get_osdmap_epoch()}); + } else { + post_event(SnapTrimTimerReady()); + } + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + auto *pg = context< SnapTrimmer >().pg; + if (wakeup) { + std::lock_guard l(pg->osd->sleep_lock); + pg->osd->sleep_timer.cancel_event(wakeup); + wakeup = nullptr; + } + } + boost::statechart::result react(const SnapTrimTimerReady &) { + wakeup = nullptr; + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } else { + return transit< AwaitAsyncWork >(); + } + } + }; + + struct WaitRWLock : boost::statechart::state< WaitRWLock, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< TrimWriteUnblocked > + > reactions; + explicit WaitRWLock(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRWLock") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context<Trimming>().in_flight.empty()); + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + } + boost::statechart::result react(const TrimWriteUnblocked&) { + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } else { + return transit< AwaitAsyncWork >(); + } + } + }; + + struct WaitRepops : boost::statechart::state< WaitRepops, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< RepopsComplete > + > reactions; + explicit WaitRepops(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRepops") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(!context<Trimming>().in_flight.empty()); + } + void exit() { + context< 
SnapTrimmer >().log_exit(state_name, enter_time); + } + boost::statechart::result react(const RepopsComplete&) { + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } else { + return transit< WaitTrimTimer >(); + } + } + }; + + struct AwaitAsyncWork : boost::statechart::state< AwaitAsyncWork, Trimming >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< DoSnapWork > + > reactions; + explicit AwaitAsyncWork(my_context ctx); + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + } + boost::statechart::result react(const DoSnapWork&); + }; + + struct WaitReservation : boost::statechart::state< WaitReservation, Trimming >, NamedState { + /* WaitReservation is a sub-state of trimming simply so that exiting Trimming + * always cancels the reservation */ + typedef boost::mpl::list < + boost::statechart::custom_reaction< SnapTrimReserved > + > reactions; + struct ReservationCB : public Context { + PrimaryLogPGRef pg; + bool canceled; + explicit ReservationCB(PrimaryLogPG *pg) : pg(pg), canceled(false) {} + void finish(int) override { + pg->lock(); + if (!canceled) + pg->snap_trimmer_machine.process_event(SnapTrimReserved()); + pg->unlock(); + } + void cancel() { + ceph_assert(pg->is_locked()); + ceph_assert(!canceled); + canceled = true; + } + }; + ReservationCB *pending = nullptr; + + explicit WaitReservation(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "Trimming/WaitReservation") { + context< SnapTrimmer >().log_enter(state_name); + ceph_assert(context<Trimming>().in_flight.empty()); + auto *pg = context< SnapTrimmer >().pg; + pending = new ReservationCB(pg); + pg->osd->snap_reserver.request_reservation( + pg->get_pgid(), + pending, + 0); + pg->state_set(PG_STATE_SNAPTRIM_WAIT); + pg->publish_stats_to_osd(); + } + boost::statechart::result react(const SnapTrimReserved&); + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + if (pending) + pending->cancel(); + pending = nullptr; + auto *pg = context< SnapTrimmer >().pg; + pg->state_clear(PG_STATE_SNAPTRIM_WAIT); + pg->state_clear(PG_STATE_SNAPTRIM_ERROR); + pg->publish_stats_to_osd(); + } + }; + + struct WaitScrub : boost::statechart::state< WaitScrub, SnapTrimmer >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< ScrubComplete >, + boost::statechart::custom_reaction< KickTrim >, + boost::statechart::transition< Reset, NotTrimming > + > reactions; + explicit WaitScrub(my_context ctx) + : my_base(ctx), + NamedState(context< SnapTrimmer >().pg, "Trimming/WaitScrub") { + context< SnapTrimmer >().log_enter(state_name); + } + void exit() { + context< SnapTrimmer >().log_exit(state_name, enter_time); + } + boost::statechart::result react(const ScrubComplete&) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } + boost::statechart::result react(const KickTrim&) { + return discard_event(); + } + }; + + struct NotTrimming : boost::statechart::state< NotTrimming, SnapTrimmer >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< KickTrim >, + boost::statechart::transition< Reset, NotTrimming > + > reactions; + explicit NotTrimming(my_context ctx); + void exit(); + boost::statechart::result react(const KickTrim&); + }; + + int _verify_no_head_clones(const hobject_t& soid, + const SnapSet& ss); + // return true if we're creating a local object, false for a + // whiteout or no change. 
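// The snap-trim logic above is a boost::statechart machine: states such as
// NotTrimming and Trimming (with its substates) react to events like KickTrim
// and Reset. Below is a minimal standalone sketch of that shape -- toy code
// with made-up behaviour, not the PG state machine itself:

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <iostream>

namespace sc = boost::statechart;

struct KickTrim : sc::event<KickTrim> {};

struct NotTrimming;
struct Trimming;
struct Trimmer : sc::state_machine<Trimmer, NotTrimming> {};

struct NotTrimming : sc::simple_state<NotTrimming, Trimmer> {
  typedef sc::transition<KickTrim, Trimming> reactions;   // a kick starts trimming
  NotTrimming() { std::cout << "enter NotTrimming\n"; }
};

struct Trimming : sc::simple_state<Trimming, Trimmer> {
  typedef sc::custom_reaction<KickTrim> reactions;
  sc::result react(const KickTrim&) { return discard_event(); }  // already busy
  Trimming() { std::cout << "enter Trimming\n"; }
};

int main() {
  Trimmer m;
  m.initiate();                  // enters the initial state, NotTrimming
  m.process_event(KickTrim());   // NotTrimming -> Trimming
  m.process_event(KickTrim());   // discarded while Trimming, as above
}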
+ void maybe_create_new_object(OpContext *ctx, bool ignore_transaction=false); + int _delete_oid(OpContext *ctx, bool no_whiteout, bool try_no_whiteout); + int _rollback_to(OpContext *ctx, ceph_osd_op& op); +public: + bool is_missing_object(const hobject_t& oid) const; + bool is_unreadable_object(const hobject_t &oid) const { + return is_missing_object(oid) || + !missing_loc.readable_with_acting(oid, actingset); + } + void maybe_kick_recovery(const hobject_t &soid); + void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op); + void wait_for_all_missing(OpRequestRef op); + + bool is_degraded_or_backfilling_object(const hobject_t& oid); + bool is_degraded_on_async_recovery_target(const hobject_t& soid); + void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op); + + void block_write_on_full_cache( + const hobject_t& oid, OpRequestRef op); + void block_for_clean( + const hobject_t& oid, OpRequestRef op); + void block_write_on_snap_rollback( + const hobject_t& oid, ObjectContextRef obc, OpRequestRef op); + void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op); + + bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op); + void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op); + void kick_object_context_blocked(ObjectContextRef obc); + + void maybe_force_recovery(); + + void mark_all_unfound_lost( + int what, + ConnectionRef con, + ceph_tid_t tid); + eversion_t pick_newest_available(const hobject_t& oid); + + void do_update_log_missing( + OpRequestRef &op); + + void do_update_log_missing_reply( + OpRequestRef &op); + + void on_role_change() override; + void on_pool_change() override; + void _on_new_interval() override; + void clear_async_reads(); + void on_change(ObjectStore::Transaction *t) override; + void on_activate() override; + void on_flushed() override; + void on_removal(ObjectStore::Transaction *t) override; + void on_shutdown() override; + bool check_failsafe_full() override; + bool check_osdmap_full(const set<pg_shard_t> &missing_on) override; + bool maybe_preempt_replica_scrub(const hobject_t& oid) override { + return write_blocked_by_scrub(oid); + } + int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx); + + // attr cache handling + void setattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key, + bufferlist &val); + void setattrs_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + map<string, bufferlist> &attrs); + void rmattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key); + int getattr_maybe_cache( + ObjectContextRef obc, + const string &key, + bufferlist *val); + int getattrs_maybe_cache( + ObjectContextRef obc, + map<string, bufferlist> *out); + +public: + void set_dynamic_perf_stats_queries( + const std::list<OSDPerfMetricQuery> &queries) override; + void get_dynamic_perf_stats(DynamicPerfStats *stats) override; + +private: + DynamicPerfStats m_dynamic_perf_stats; +}; + +inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop) +{ + out << "repgather(" << &repop + << " " << repop.v + << " rep_tid=" << repop.rep_tid + << " committed?=" << repop.all_committed + << " r=" << repop.r + << ")"; + return out; +} + +inline ostream& operator<<(ostream& out, + const PrimaryLogPG::ProxyWriteOpRef& pwop) +{ + out << "proxywrite(" << &pwop + << " " << pwop->user_version + << " pwop_tid=" << pwop->objecter_tid; + if (pwop->ctx->op) + out << " op=" << *(pwop->ctx->op->get_req()); + out << ")"; + return out; +} + +void 
intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop); +void intrusive_ptr_release(PrimaryLogPG::RepGather *repop); + + +#endif diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc new file mode 100644 index 00000000..f8d67af3 --- /dev/null +++ b/src/osd/ReplicatedBackend.cc @@ -0,0 +1,2270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "common/errno.h" +#include "ReplicatedBackend.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPushReply.h" +#include "common/EventTrace.h" +#include "include/random.h" +#include "include/util.h" +#include "OSD.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) { + return pgb->get_parent()->gen_dbg_prefix(*_dout); +} + +namespace { +class PG_SendMessageOnConn: public Context { + PGBackend::Listener *pg; + Message *reply; + ConnectionRef conn; + public: + PG_SendMessageOnConn( + PGBackend::Listener *pg, + Message *reply, + ConnectionRef conn) : pg(pg), reply(reply), conn(conn) {} + void finish(int) override { + pg->send_message_osd_cluster(reply, conn.get()); + } +}; + +class PG_RecoveryQueueAsync : public Context { + PGBackend::Listener *pg; + unique_ptr<GenContext<ThreadPool::TPHandle&>> c; + public: + PG_RecoveryQueueAsync( + PGBackend::Listener *pg, + GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {} + void finish(int) override { + pg->schedule_recovery_work(c.release()); + } +}; +} + +struct ReplicatedBackend::C_OSD_RepModifyCommit : public Context { + ReplicatedBackend *pg; + RepModifyRef rm; + C_OSD_RepModifyCommit(ReplicatedBackend *pg, RepModifyRef r) + : pg(pg), rm(r) {} + void finish(int r) override { + pg->repop_commit(rm); + } +}; + +static void log_subop_stats( + PerfCounters *logger, + OpRequestRef op, int subop) +{ + utime_t now = ceph_clock_now(); + utime_t latency = now; + latency -= op->get_req()->get_recv_stamp(); + + + logger->inc(l_osd_sop); + logger->tinc(l_osd_sop_lat, latency); + logger->inc(subop); + + if (subop != l_osd_sop_pull) { + uint64_t inb = op->get_req()->get_data().length(); + logger->inc(l_osd_sop_inb, inb); + if (subop == l_osd_sop_w) { + logger->inc(l_osd_sop_w_inb, inb); + logger->tinc(l_osd_sop_w_lat, latency); + } else if (subop == l_osd_sop_push) { + logger->inc(l_osd_sop_push_inb, inb); + logger->tinc(l_osd_sop_push_lat, latency); + } else + ceph_abort_msg("no support subop"); + } else { + logger->tinc(l_osd_sop_pull_lat, latency); + } +} + +ReplicatedBackend::ReplicatedBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &c, + ObjectStore *store, + CephContext *cct) : + PGBackend(cct, pg, store, coll, c) {} + +void ReplicatedBackend::run_recovery_op( + PGBackend::RecoveryHandle *_h, + int priority) +{ + RPGHandle *h = static_cast<RPGHandle *>(_h); + send_pushes(priority, h->pushes); + send_pulls(priority, h->pulls); + 
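// PG_SendMessageOnConn, PG_RecoveryQueueAsync and C_OSD_RepModifyCommit above
// are all instances of one pattern: a small Context object that carries "what
// to do once this transaction (or op) completes". A standalone sketch of that
// pattern -- the Context and Transaction types here are simplified stand-ins,
// not the Ceph classes:

#include <iostream>
#include <string>
#include <vector>

struct Context {
  virtual ~Context() = default;
  virtual void finish(int r) = 0;
  void complete(int r) { finish(r); delete this; }   // one-shot, self-deleting
};

struct SendReplyOnCommit : Context {
  std::string reply;
  explicit SendReplyOnCommit(std::string r) : reply(std::move(r)) {}
  void finish(int) override { std::cout << "send " << reply << "\n"; }
};

struct Transaction {
  std::vector<Context*> on_commit;
  void register_on_commit(Context* c) { on_commit.push_back(c); }
  void commit() {                        // called once the data is durable
    for (auto* c : on_commit) c->complete(0);
    on_commit.clear();
  }
};

int main() {
  Transaction t;
  t.register_on_commit(new SendReplyOnCommit("push_reply to peer"));
  t.commit();   // each callback fires exactly once, after commit
}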
send_recovery_deletes(priority, h->deletes); + delete h; +} + +int ReplicatedBackend::recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *_h + ) +{ + dout(10) << __func__ << ": " << hoid << dendl; + RPGHandle *h = static_cast<RPGHandle *>(_h); + if (get_parent()->get_local_missing().is_missing(hoid)) { + ceph_assert(!obc); + // pull + prepare_pull( + v, + hoid, + head, + h); + } else { + ceph_assert(obc); + int started = start_pushes( + hoid, + obc, + h); + if (started < 0) { + pushing[hoid].clear(); + return started; + } + } + return 0; +} + +void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap) +{ + for(map<pg_shard_t, set<hobject_t> >::iterator i = pull_from_peer.begin(); + i != pull_from_peer.end(); + ) { + if (osdmap->is_down(i->first.osd)) { + dout(10) << "check_recovery_sources resetting pulls from osd." << i->first + << ", osdmap has it marked down" << dendl; + for (set<hobject_t>::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + get_parent()->cancel_pull(*j); + clear_pull(pulling.find(*j), false); + } + pull_from_peer.erase(i++); + } else { + ++i; + } + } +} + +bool ReplicatedBackend::can_handle_while_inactive(OpRequestRef op) +{ + dout(10) << __func__ << ": " << op << dendl; + switch (op->get_req()->get_type()) { + case MSG_OSD_PG_PULL: + return true; + default: + return false; + } +} + +bool ReplicatedBackend::_handle_message( + OpRequestRef op + ) +{ + dout(10) << __func__ << ": " << op << dendl; + switch (op->get_req()->get_type()) { + case MSG_OSD_PG_PUSH: + do_push(op); + return true; + + case MSG_OSD_PG_PULL: + do_pull(op); + return true; + + case MSG_OSD_PG_PUSH_REPLY: + do_push_reply(op); + return true; + + case MSG_OSD_REPOP: { + do_repop(op); + return true; + } + + case MSG_OSD_REPOPREPLY: { + do_repop_reply(op); + return true; + } + + default: + break; + } + return false; +} + +void ReplicatedBackend::clear_recovery_state() +{ + // clear pushing/pulling maps + for (auto &&i: pushing) { + for (auto &&j: i.second) { + get_parent()->release_locks(j.second.lock_manager); + } + } + pushing.clear(); + + for (auto &&i: pulling) { + get_parent()->release_locks(i.second.lock_manager); + } + pulling.clear(); + pull_from_peer.clear(); +} + +void ReplicatedBackend::on_change() +{ + dout(10) << __func__ << dendl; + for (auto& op : in_progress_ops) { + delete op.second->on_commit; + op.second->on_commit = nullptr; + } + in_progress_ops.clear(); + clear_recovery_state(); +} + +int ReplicatedBackend::objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) +{ + return store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags); +} + +void ReplicatedBackend::objects_read_async( + const hobject_t &hoid, + const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>, + pair<bufferlist*, Context*> > > &to_read, + Context *on_complete, + bool fast_read) +{ + ceph_abort_msg("async read is not used by replica pool"); +} + +class C_OSD_OnOpCommit : public Context { + ReplicatedBackend *pg; + ReplicatedBackend::InProgressOpRef op; +public: + C_OSD_OnOpCommit(ReplicatedBackend *pg, ReplicatedBackend::InProgressOp *op) + : pg(pg), op(op) {} + void finish(int) override { + pg->op_commit(op); + } +}; + +void generate_transaction( + PGTransactionUPtr &pgt, + const coll_t &coll, + vector<pg_log_entry_t> &log_entries, + ObjectStore::Transaction *t, + set<hobject_t> *added, + set<hobject_t> *removed) +{ + ceph_assert(t); + 
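// check_recovery_sources() above walks pull_from_peer and drops every entry
// whose peer the new OSDMap marks down, cancelling the affected pulls as it
// goes. It relies on the classic "erase(i++) vs ++i" idiom so the iterator
// stays valid across erasure. A standalone sketch of that idiom (the peer ids
// and object names are made up):

#include <iostream>
#include <map>
#include <set>
#include <string>

int main() {
  std::map<int, std::set<std::string>> pull_from_peer = {
    {1, {"objA"}}, {2, {"objB", "objC"}}, {3, {"objD"}}};
  std::set<int> down_osds = {2};

  for (auto i = pull_from_peer.begin(); i != pull_from_peer.end(); ) {
    if (down_osds.count(i->first)) {
      // a real caller would cancel each pull targeting this peer here;
      // post-increment hands erase() the old iterator and keeps ours valid
      pull_from_peer.erase(i++);
    } else {
      ++i;
    }
  }
  std::cout << "peers still usable as pull sources: "
            << pull_from_peer.size() << "\n";   // prints 2
}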
ceph_assert(added); + ceph_assert(removed); + + for (auto &&le: log_entries) { + le.mark_unrollbackable(); + auto oiter = pgt->op_map.find(le.soid); + if (oiter != pgt->op_map.end() && oiter->second.updated_snaps) { + bufferlist bl(oiter->second.updated_snaps->second.size() * 8 + 8); + encode(oiter->second.updated_snaps->second, bl); + le.snaps.swap(bl); + le.snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + } + + pgt->safe_create_traverse( + [&](pair<const hobject_t, PGTransaction::ObjectOperation> &obj_op) { + const hobject_t &oid = obj_op.first; + const ghobject_t goid = + ghobject_t(oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD); + const PGTransaction::ObjectOperation &op = obj_op.second; + + if (oid.is_temp()) { + if (op.is_fresh_object()) { + added->insert(oid); + } else if (op.is_delete()) { + removed->insert(oid); + } + } + + if (op.delete_first) { + t->remove(coll, goid); + } + + match( + op.init_type, + [&](const PGTransaction::ObjectOperation::Init::None &) { + }, + [&](const PGTransaction::ObjectOperation::Init::Create &op) { + t->touch(coll, goid); + }, + [&](const PGTransaction::ObjectOperation::Init::Clone &op) { + t->clone( + coll, + ghobject_t( + op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + goid); + }, + [&](const PGTransaction::ObjectOperation::Init::Rename &op) { + ceph_assert(op.source.is_temp()); + t->collection_move_rename( + coll, + ghobject_t( + op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + coll, + goid); + }); + + if (op.truncate) { + t->truncate(coll, goid, op.truncate->first); + if (op.truncate->first != op.truncate->second) + t->truncate(coll, goid, op.truncate->second); + } + + if (!op.attr_updates.empty()) { + map<string, bufferlist> attrs; + for (auto &&p: op.attr_updates) { + if (p.second) + attrs[p.first] = *(p.second); + else + t->rmattr(coll, goid, p.first); + } + t->setattrs(coll, goid, attrs); + } + + if (op.clear_omap) + t->omap_clear(coll, goid); + if (op.omap_header) + t->omap_setheader(coll, goid, *(op.omap_header)); + + for (auto &&up: op.omap_updates) { + using UpdateType = PGTransaction::ObjectOperation::OmapUpdateType; + switch (up.first) { + case UpdateType::Remove: + t->omap_rmkeys(coll, goid, up.second); + break; + case UpdateType::Insert: + t->omap_setkeys(coll, goid, up.second); + break; + } + } + + // updated_snaps doesn't matter since we marked unrollbackable + + if (op.alloc_hint) { + auto &hint = *(op.alloc_hint); + t->set_alloc_hint( + coll, + goid, + hint.expected_object_size, + hint.expected_write_size, + hint.flags); + } + + for (auto &&extent: op.buffer_updates) { + using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; + match( + extent.get_val(), + [&](const BufferUpdate::Write &op) { + t->write( + coll, + goid, + extent.get_off(), + extent.get_len(), + op.buffer); + }, + [&](const BufferUpdate::Zero &op) { + t->zero( + coll, + goid, + extent.get_off(), + extent.get_len()); + }, + [&](const BufferUpdate::CloneRange &op) { + ceph_assert(op.len == extent.get_len()); + t->clone_range( + coll, + ghobject_t(op.from, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + goid, + op.offset, + extent.get_len(), + extent.get_off()); + }); + } + }); +} + +void ReplicatedBackend::submit_transaction( + const hobject_t &soid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&_t, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const vector<pg_log_entry_t> &_log_entries, + boost::optional<pg_hit_set_history_t> &hset_history, + Context 
*on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef orig_op) +{ + parent->apply_stats( + soid, + delta_stats); + + vector<pg_log_entry_t> log_entries(_log_entries); + ObjectStore::Transaction op_t; + PGTransactionUPtr t(std::move(_t)); + set<hobject_t> added, removed; + generate_transaction( + t, + coll, + log_entries, + &op_t, + &added, + &removed); + ceph_assert(added.size() <= 1); + ceph_assert(removed.size() <= 1); + + auto insert_res = in_progress_ops.insert( + make_pair( + tid, + new InProgressOp( + tid, on_all_commit, + orig_op, at_version) + ) + ); + ceph_assert(insert_res.second); + InProgressOp &op = *insert_res.first->second; + + op.waiting_for_commit.insert( + parent->get_acting_recovery_backfill_shards().begin(), + parent->get_acting_recovery_backfill_shards().end()); + + issue_op( + soid, + at_version, + tid, + reqid, + trim_to, + at_version, + added.size() ? *(added.begin()) : hobject_t(), + removed.size() ? *(removed.begin()) : hobject_t(), + log_entries, + hset_history, + &op, + op_t); + + add_temp_objs(added); + clear_temp_objs(removed); + + parent->log_operation( + log_entries, + hset_history, + trim_to, + at_version, + true, + op_t); + + op_t.register_on_commit( + parent->bless_context( + new C_OSD_OnOpCommit(this, &op))); + + vector<ObjectStore::Transaction> tls; + tls.push_back(std::move(op_t)); + + parent->queue_transactions(tls, op.op); + if (at_version != eversion_t()) { + parent->op_applied(at_version); + } +} + +void ReplicatedBackend::op_commit( + InProgressOpRef& op) +{ + if (op->on_commit == nullptr) { + // aborted + return; + } + + FUNCTRACE(cct); + OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_COMMIT_BEGIN", true); + dout(10) << __func__ << ": " << op->tid << dendl; + if (op->op) { + op->op->mark_event("op_commit"); + op->op->pg_trace.event("op commit"); + } + + op->waiting_for_commit.erase(get_parent()->whoami_shard()); + + if (op->waiting_for_commit.empty()) { + op->on_commit->complete(0); + op->on_commit = 0; + in_progress_ops.erase(op->tid); + } +} + +void ReplicatedBackend::do_repop_reply(OpRequestRef op) +{ + static_cast<MOSDRepOpReply*>(op->get_nonconst_req())->finish_decode(); + const MOSDRepOpReply *r = static_cast<const MOSDRepOpReply *>(op->get_req()); + ceph_assert(r->get_header().type == MSG_OSD_REPOPREPLY); + + op->mark_started(); + + // must be replication. + ceph_tid_t rep_tid = r->get_tid(); + pg_shard_t from = r->from; + + auto iter = in_progress_ops.find(rep_tid); + if (iter != in_progress_ops.end()) { + InProgressOp &ip_op = *iter->second; + const MOSDOp *m = NULL; + if (ip_op.op) + m = static_cast<const MOSDOp *>(ip_op.op->get_req()); + + if (m) + dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m + << " ack_type " << (int)r->ack_type + << " from " << from + << dendl; + else + dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) " + << " ack_type " << (int)r->ack_type + << " from " << from + << dendl; + + // oh, good. 
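// submit_transaction(), op_commit() and do_repop_reply() above share one
// acknowledgement scheme: waiting_for_commit starts as the full set of acting
// shards, every local or replica commit erases one member, and on_commit is
// completed exactly once when the set drains. A standalone sketch of that
// scheme, with plain ints for shard ids and a lambda for the completion:

#include <functional>
#include <iostream>
#include <set>

struct InProgressOp {
  std::set<int> waiting_for_commit;        // shard ids still outstanding
  std::function<void()> on_commit;

  void ack(int shard) {
    waiting_for_commit.erase(shard);
    if (waiting_for_commit.empty() && on_commit) {
      on_commit();
      on_commit = nullptr;                 // keep the completion one-shot
    }
  }
};

int main() {
  InProgressOp op;
  op.waiting_for_commit = {0, 1, 2};       // primary plus two replicas
  op.on_commit = [] { std::cout << "all shards committed, reply to client\n"; };
  op.ack(1);   // replica ack
  op.ack(0);   // local commit
  op.ack(2);   // last ack fires the reply
}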
+ + if (r->ack_type & CEPH_OSD_FLAG_ONDISK) { + ceph_assert(ip_op.waiting_for_commit.count(from)); + ip_op.waiting_for_commit.erase(from); + if (ip_op.op) { + ip_op.op->mark_event("sub_op_commit_rec"); + ip_op.op->pg_trace.event("sub_op_commit_rec"); + } + } else { + // legacy peer; ignore + } + + parent->update_peer_last_complete_ondisk( + from, + r->get_last_complete_ondisk()); + + if (ip_op.waiting_for_commit.empty() && + ip_op.on_commit) { + ip_op.on_commit->complete(0); + ip_op.on_commit = 0; + in_progress_ops.erase(iter); + } + } +} + +int ReplicatedBackend::be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) +{ + dout(10) << __func__ << " " << poid << " pos " << pos << dendl; + int r; + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE; + + utime_t sleeptime; + sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep); + if (sleeptime != utime_t()) { + lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl; + sleeptime.sleep(); + } + + ceph_assert(poid == pos.ls[pos.pos]); + if (!pos.data_done()) { + if (pos.data_pos == 0) { + pos.data_hash = bufferhash(-1); + } + + bufferlist bl; + r = store->read( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + pos.data_pos, + cct->_conf->osd_deep_scrub_stride, bl, + fadvise_flags); + if (r < 0) { + dout(20) << __func__ << " " << poid << " got " + << r << " on read, read_error" << dendl; + o.read_error = true; + return 0; + } + if (r > 0) { + pos.data_hash << bl; + } + pos.data_pos += r; + if (r == cct->_conf->osd_deep_scrub_stride) { + dout(20) << __func__ << " " << poid << " more data, digest so far 0x" + << std::hex << pos.data_hash.digest() << std::dec << dendl; + return -EINPROGRESS; + } + // done with bytes + pos.data_pos = -1; + o.digest = pos.data_hash.digest(); + o.digest_present = true; + dout(20) << __func__ << " " << poid << " done with data, digest 0x" + << std::hex << o.digest << std::dec << dendl; + } + + // omap header + if (pos.omap_pos.empty()) { + pos.omap_hash = bufferhash(-1); + + bufferlist hdrbl; + r = store->omap_get_header( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + &hdrbl, true); + if (r == -EIO) { + dout(20) << __func__ << " " << poid << " got " + << r << " on omap header read, read_error" << dendl; + o.read_error = true; + return 0; + } + if (r == 0 && hdrbl.length()) { + bool encoded = false; + dout(25) << "CRC header " << cleanbin(hdrbl, encoded, true) << dendl; + pos.omap_hash << hdrbl; + } + } + + // omap + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator( + ch, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + ceph_assert(iter); + if (pos.omap_pos.length()) { + iter->lower_bound(pos.omap_pos); + } else { + iter->seek_to_first(); + } + int max = g_conf()->osd_deep_scrub_keys; + while (iter->status() == 0 && iter->valid()) { + pos.omap_bytes += iter->value().length(); + ++pos.omap_keys; + --max; + // fixme: we can do this more efficiently. 
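// be_deep_scrub() above hashes object data in osd_deep_scrub_stride-sized
// reads and returns -EINPROGRESS after each full stride, so the scrub can be
// preempted and later resumed from pos.data_pos; the omap scan below resumes
// the same way from pos.omap_pos. A standalone sketch of that resumable
// cursor (std::hash stands in for Ceph's bufferhash, and the mixing is a toy):

#include <cstddef>
#include <functional>
#include <string>

struct ScrubPos {
  size_t data_pos = 0;
  size_t digest = 0;            // running hash of everything scanned so far
};

// Returns true when the whole object has been hashed, false meaning
// "not done yet, call again later from the saved position".
bool deep_scrub_step(const std::string& object, size_t stride, ScrubPos& pos) {
  std::string chunk = object.substr(pos.data_pos, stride);
  pos.digest ^= std::hash<std::string>{}(chunk) + pos.data_pos;
  pos.data_pos += chunk.size();
  return chunk.size() < stride;    // a short read means we reached the end
}

int main() {
  std::string obj(10000, 'x');
  ScrubPos pos;
  while (!deep_scrub_step(obj, 4096, pos)) {
    // a real caller would yield the thread-pool handle here and reschedule
  }
}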
+ bufferlist bl; + encode(iter->key(), bl); + encode(iter->value(), bl); + pos.omap_hash << bl; + + iter->next(); + + if (iter->valid() && max == 0) { + pos.omap_pos = iter->key(); + return -EINPROGRESS; + } + if (iter->status() < 0) { + dout(25) << __func__ << " " << poid + << " on omap scan, db status error" << dendl; + o.read_error = true; + return 0; + } + } + + if (pos.omap_keys > cct->_conf-> + osd_deep_scrub_large_omap_object_key_threshold || + pos.omap_bytes > cct->_conf-> + osd_deep_scrub_large_omap_object_value_sum_threshold) { + dout(25) << __func__ << " " << poid + << " large omap object detected. Object has " << pos.omap_keys + << " keys and size " << pos.omap_bytes << " bytes" << dendl; + o.large_omap_object_found = true; + o.large_omap_object_key_count = pos.omap_keys; + o.large_omap_object_value_size = pos.omap_bytes; + map.has_large_omap_object_errors = true; + } + + o.omap_digest = pos.omap_hash.digest(); + o.omap_digest_present = true; + dout(20) << __func__ << " done with " << poid << " omap_digest " + << std::hex << o.omap_digest << std::dec << dendl; + + // Sum up omap usage + if (pos.omap_keys > 0 || pos.omap_bytes > 0) { + dout(25) << __func__ << " adding " << pos.omap_keys << " keys and " + << pos.omap_bytes << " bytes to pg_stats sums" << dendl; + map.has_omap_keys = true; + o.object_omap_bytes = pos.omap_bytes; + o.object_omap_keys = pos.omap_keys; + } + + // done! + return 0; +} + +void ReplicatedBackend::_do_push(OpRequestRef op) +{ + const MOSDPGPush *m = static_cast<const MOSDPGPush *>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_PUSH); + pg_shard_t from = m->from; + + op->mark_started(); + + vector<PushReplyOp> replies; + ObjectStore::Transaction t; + if (get_parent()->check_failsafe_full()) { + dout(10) << __func__ << " Out of space (failsafe) processing push request." 
<< dendl; + ceph_abort(); + } + for (vector<PushOp>::const_iterator i = m->pushes.begin(); + i != m->pushes.end(); + ++i) { + replies.push_back(PushReplyOp()); + handle_push(from, *i, &(replies.back()), &t, m->is_repair); + } + + MOSDPGPushReply *reply = new MOSDPGPushReply; + reply->from = get_parent()->whoami_shard(); + reply->set_priority(m->get_priority()); + reply->pgid = get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->replies.swap(replies); + reply->compute_cost(cct); + + t.register_on_complete( + new PG_SendMessageOnConn( + get_parent(), reply, m->get_connection())); + + get_parent()->queue_transaction(std::move(t)); +} + +struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> { + ReplicatedBackend *bc; + list<ReplicatedBackend::pull_complete_info> to_continue; + int priority; + C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority) + : bc(bc), priority(priority) {} + + void finish(ThreadPool::TPHandle &handle) override { + ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op(); + for (auto &&i: to_continue) { + auto j = bc->pulling.find(i.hoid); + ceph_assert(j != bc->pulling.end()); + ObjectContextRef obc = j->second.obc; + bc->clear_pull(j, false /* already did it */); + int started = bc->start_pushes(i.hoid, obc, h); + if (started < 0) { + bc->pushing[i.hoid].clear(); + bc->get_parent()->primary_failed(i.hoid); + bc->get_parent()->primary_error(i.hoid, obc->obs.oi.version); + } else if (!started) { + bc->get_parent()->on_global_recover( + i.hoid, i.stat, false); + } + handle.reset_tp_timeout(); + } + bc->run_recovery_op(h, priority); + } +}; + +void ReplicatedBackend::_do_pull_response(OpRequestRef op) +{ + const MOSDPGPush *m = static_cast<const MOSDPGPush *>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_PUSH); + pg_shard_t from = m->from; + + op->mark_started(); + + vector<PullOp> replies(1); + if (get_parent()->check_failsafe_full()) { + dout(10) << __func__ << " Out of space (failsafe) processing pull response (push)." 
<< dendl; + ceph_abort(); + } + + ObjectStore::Transaction t; + list<pull_complete_info> to_continue; + for (vector<PushOp>::const_iterator i = m->pushes.begin(); + i != m->pushes.end(); + ++i) { + bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, &t); + if (more) + replies.push_back(PullOp()); + } + if (!to_continue.empty()) { + C_ReplicatedBackend_OnPullComplete *c = + new C_ReplicatedBackend_OnPullComplete( + this, + m->get_priority()); + c->to_continue.swap(to_continue); + t.register_on_complete( + new PG_RecoveryQueueAsync( + get_parent(), + get_parent()->bless_unlocked_gencontext(c))); + } + replies.erase(replies.end() - 1); + + if (replies.size()) { + MOSDPGPull *reply = new MOSDPGPull; + reply->from = parent->whoami_shard(); + reply->set_priority(m->get_priority()); + reply->pgid = get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->set_pulls(&replies); + reply->compute_cost(cct); + + t.register_on_complete( + new PG_SendMessageOnConn( + get_parent(), reply, m->get_connection())); + } + + get_parent()->queue_transaction(std::move(t)); +} + +void ReplicatedBackend::do_pull(OpRequestRef op) +{ + MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_nonconst_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_PULL); + pg_shard_t from = m->from; + + map<pg_shard_t, vector<PushOp> > replies; + vector<PullOp> pulls; + m->take_pulls(&pulls); + for (auto& i : pulls) { + replies[from].push_back(PushOp()); + handle_pull(from, i, &(replies[from].back())); + } + send_pushes(m->get_priority(), replies); +} + +void ReplicatedBackend::do_push_reply(OpRequestRef op) +{ + const MOSDPGPushReply *m = static_cast<const MOSDPGPushReply *>(op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_PUSH_REPLY); + pg_shard_t from = m->from; + + vector<PushOp> replies(1); + for (vector<PushReplyOp>::const_iterator i = m->replies.begin(); + i != m->replies.end(); + ++i) { + bool more = handle_push_reply(from, *i, &(replies.back())); + if (more) + replies.push_back(PushOp()); + } + replies.erase(replies.end() - 1); + + map<pg_shard_t, vector<PushOp> > _replies; + _replies[from].swap(replies); + send_pushes(m->get_priority(), _replies); +} + +Message * ReplicatedBackend::generate_subop( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t pg_roll_forward_to, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const bufferlist &log_entries, + boost::optional<pg_hit_set_history_t> &hset_hist, + ObjectStore::Transaction &op_t, + pg_shard_t peer, + const pg_info_t &pinfo) +{ + int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + // forward the write/update/whatever + MOSDRepOp *wr = new MOSDRepOp( + reqid, parent->whoami_shard(), + spg_t(get_info().pgid.pgid, peer.shard), + soid, acks_wanted, + get_osdmap_epoch(), + parent->get_last_peering_reset_epoch(), + tid, at_version); + + // ship resulting transaction, log entries, and pg_stats + if (!parent->should_send_op(peer, soid)) { + ObjectStore::Transaction t; + encode(t, wr->get_data()); + } else { + encode(op_t, wr->get_data()); + wr->get_header().data_off = op_t.get_data_alignment(); + } + + wr->logbl = log_entries; + + if (pinfo.is_incomplete()) + wr->pg_stats = pinfo.stats; // reflects backfill progress + else + wr->pg_stats = get_info().stats; + + wr->pg_trim_to = pg_trim_to; + wr->pg_roll_forward_to = pg_roll_forward_to; + + wr->new_temp_oid = new_temp_oid; + wr->discard_temp_oid = discard_temp_oid; + 
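// _do_pull_response() and do_push_reply() above share a small allocation
// trick: pre-size the reply vector with one slot, let each handler write into
// replies.back() and report whether that reply should be kept (opening a new
// slot), then erase the trailing slot, which never holds a kept reply. A
// standalone sketch of the trick (handle_one is a hypothetical handler):

#include <iostream>
#include <string>
#include <vector>

// Writes a candidate reply into *out; returns true if it must be sent,
// false if the slot can simply be reused for the next item.
bool handle_one(int item, std::string* out) {
  *out = "reply-for-" + std::to_string(item);
  return item % 2 == 0;                          // toy "needs a reply" rule
}

int main() {
  std::vector<int> incoming = {1, 2, 3};
  std::vector<std::string> replies(1);           // one open slot up front
  for (int item : incoming) {
    if (handle_one(item, &replies.back()))
      replies.push_back({});                     // commit it, open the next slot
  }
  replies.erase(replies.end() - 1);              // trailing slot is never kept

  std::cout << replies.size() << " reply sent\n";   // only reply-for-2 survives
}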
wr->updated_hit_set_history = hset_hist; + return wr; +} + +void ReplicatedBackend::issue_op( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t pg_roll_forward_to, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const vector<pg_log_entry_t> &log_entries, + boost::optional<pg_hit_set_history_t> &hset_hist, + InProgressOp *op, + ObjectStore::Transaction &op_t) +{ + if (parent->get_acting_recovery_backfill_shards().size() > 1) { + if (op->op) { + op->op->pg_trace.event("issue replication ops"); + ostringstream ss; + set<pg_shard_t> replicas = parent->get_acting_recovery_backfill_shards(); + replicas.erase(parent->whoami_shard()); + ss << "waiting for subops from " << replicas; + op->op->mark_sub_op_sent(ss.str()); + } + + // avoid doing the same work in generate_subop + bufferlist logs; + encode(log_entries, logs); + + for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { + if (shard == parent->whoami_shard()) continue; + const pg_info_t &pinfo = parent->get_shard_info().find(shard)->second; + + Message *wr; + wr = generate_subop( + soid, + at_version, + tid, + reqid, + pg_trim_to, + pg_roll_forward_to, + new_temp_oid, + discard_temp_oid, + logs, + hset_hist, + op_t, + shard, + pinfo); + if (op->op && op->op->pg_trace) + wr->trace.init("replicated op", nullptr, &op->op->pg_trace); + get_parent()->send_message_osd_cluster( + shard.osd, wr, get_osdmap_epoch()); + } + } +} + +// sub op modify +void ReplicatedBackend::do_repop(OpRequestRef op) +{ + static_cast<MOSDRepOp*>(op->get_nonconst_req())->finish_decode(); + const MOSDRepOp *m = static_cast<const MOSDRepOp *>(op->get_req()); + int msg_type = m->get_type(); + ceph_assert(MSG_OSD_REPOP == msg_type); + + const hobject_t& soid = m->poid; + + dout(10) << __func__ << " " << soid + << " v " << m->version + << (m->logbl.length() ? 
" (transaction)" : " (parallel exec") + << " " << m->logbl.length() + << dendl; + + // sanity checks + ceph_assert(m->map_epoch >= get_info().history.same_interval_since); + + dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl; + parent->maybe_preempt_replica_scrub(soid); + + int ackerosd = m->get_source().num(); + + op->mark_started(); + + RepModifyRef rm(std::make_shared<RepModify>()); + rm->op = op; + rm->ackerosd = ackerosd; + rm->last_complete = get_info().last_complete; + rm->epoch_started = get_osdmap_epoch(); + + ceph_assert(m->logbl.length()); + // shipped transaction and log entries + vector<pg_log_entry_t> log; + + auto p = const_cast<bufferlist&>(m->get_data()).cbegin(); + decode(rm->opt, p); + + if (m->new_temp_oid != hobject_t()) { + dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl; + add_temp_obj(m->new_temp_oid); + } + if (m->discard_temp_oid != hobject_t()) { + dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl; + if (rm->opt.empty()) { + dout(10) << __func__ << ": removing object " << m->discard_temp_oid + << " since we won't get the transaction" << dendl; + rm->localt.remove(coll, ghobject_t(m->discard_temp_oid)); + } + clear_temp_obj(m->discard_temp_oid); + } + + p = const_cast<bufferlist&>(m->logbl).begin(); + decode(log, p); + rm->opt.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + + bool update_snaps = false; + if (!rm->opt.empty()) { + // If the opt is non-empty, we infer we are before + // last_backfill (according to the primary, not our + // not-quite-accurate value), and should update the + // collections now. Otherwise, we do it later on push. + update_snaps = true; + } + + // flag set to true during async recovery + bool async = false; + pg_missing_tracker_t pmissing = get_parent()->get_local_missing(); + if (pmissing.is_missing(soid)) { + async = true; + dout(30) << __func__ << " is_missing " << pmissing.is_missing(soid) << dendl; + for (auto &&e: log) { + dout(30) << " add_next_event entry " << e << dendl; + get_parent()->add_local_next_event(e); + dout(30) << " entry is_delete " << e.is_delete() << dendl; + } + } + + parent->update_stats(m->pg_stats); + parent->log_operation( + log, + m->updated_hit_set_history, + m->pg_trim_to, + m->pg_roll_forward_to, + update_snaps, + rm->localt, + async); + + rm->opt.register_on_commit( + parent->bless_context( + new C_OSD_RepModifyCommit(this, rm))); + vector<ObjectStore::Transaction> tls; + tls.reserve(2); + tls.push_back(std::move(rm->localt)); + tls.push_back(std::move(rm->opt)); + parent->queue_transactions(tls, op); + // op is cleaned up by oncommit/onapply when both are executed + dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl; +} + +void ReplicatedBackend::repop_commit(RepModifyRef rm) +{ + rm->op->mark_commit_sent(); + rm->op->pg_trace.event("sup_op_commit"); + rm->committed = true; + + // send commit. + const MOSDRepOp *m = static_cast<const MOSDRepOp*>(rm->op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_REPOP); + dout(10) << __func__ << " on op " << *m + << ", sending commit to osd." 
<< rm->ackerosd + << dendl; + ceph_assert(get_osdmap()->is_up(rm->ackerosd)); + + get_parent()->update_last_complete_ondisk(rm->last_complete); + + MOSDRepOpReply *reply = new MOSDRepOpReply( + m, + get_parent()->whoami_shard(), + 0, get_osdmap_epoch(), m->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); + reply->set_last_complete_ondisk(rm->last_complete); + reply->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority! + reply->trace = rm->op->pg_trace; + get_parent()->send_message_osd_cluster( + rm->ackerosd, reply, get_osdmap_epoch()); + + log_subop_stats(get_parent()->get_logger(), rm->op, l_osd_sop_w); +} + + +// =========================================================== + +void ReplicatedBackend::calc_head_subsets( + ObjectContextRef obc, SnapSet& snapset, const hobject_t& head, + const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set<uint64_t>& data_subset, + map<hobject_t, interval_set<uint64_t>>& clone_subsets, + ObcLockManager &manager) +{ + dout(10) << "calc_head_subsets " << head + << " clone_overlap " << snapset.clone_overlap << dendl; + + uint64_t size = obc->obs.oi.size; + if (size) + data_subset.insert(0, size); + + if (get_parent()->get_pool().allow_incomplete_clones()) { + dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl; + return; + } + + if (!cct->_conf->osd_recover_clone_overlap) { + dout(10) << "calc_head_subsets " << head << " -- osd_recover_clone_overlap disabled" << dendl; + return; + } + + + interval_set<uint64_t> cloning; + interval_set<uint64_t> prev; + if (size) + prev.insert(0, size); + + for (int j=snapset.clones.size()-1; j>=0; j--) { + hobject_t c = head; + c.snap = snapset.clones[j]; + prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); + if (!missing.is_missing(c) && + c < last_backfill && + get_parent()->try_lock_for_read(c, manager)) { + dout(10) << "calc_head_subsets " << head << " has prev " << c + << " overlap " << prev << dendl; + clone_subsets[c] = prev; + cloning.union_of(prev); + break; + } + dout(10) << "calc_head_subsets " << head << " does not have prev " << c + << " overlap " << prev << dendl; + } + + + if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) { + dout(10) << "skipping clone, too many holes" << dendl; + get_parent()->release_locks(manager); + clone_subsets.clear(); + cloning.clear(); + } + + // what's left for us to push? + data_subset.subtract(cloning); + + dout(10) << "calc_head_subsets " << head + << " data_subset " << data_subset + << " clone_subsets " << clone_subsets << dendl; +} + +void ReplicatedBackend::calc_clone_subsets( + SnapSet& snapset, const hobject_t& soid, + const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set<uint64_t>& data_subset, + map<hobject_t, interval_set<uint64_t>>& clone_subsets, + ObcLockManager &manager) +{ + dout(10) << "calc_clone_subsets " << soid + << " clone_overlap " << snapset.clone_overlap << dendl; + + uint64_t size = snapset.clone_size[soid.snap]; + if (size) + data_subset.insert(0, size); + + if (get_parent()->get_pool().allow_incomplete_clones()) { + dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl; + return; + } + + if (!cct->_conf->osd_recover_clone_overlap) { + dout(10) << "calc_clone_subsets " << soid << " -- osd_recover_clone_overlap disabled" << dendl; + return; + } + + unsigned i; + for (i=0; i < snapset.clones.size(); i++) + if (snapset.clones[i] == soid.snap) + break; + + // any overlap with next older clone? 
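// calc_head_subsets()/calc_clone_subsets() above decide how much of an object
// a peer can recover by cloning byte ranges it already holds in an adjacent
// clone (clone_overlap) and how much must actually be pushed: the pushed
// data_subset is the object minus the cloned ranges. A standalone sketch with
// a single overlap interval (Ceph uses interval_set<uint64_t> for the general
// case; Interval here is a simplified stand-in):

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Interval = std::pair<uint64_t, uint64_t>;   // [begin, end)

// Subtract one cloned range from [0, size) -- enough for this illustration.
std::vector<Interval> subtract(uint64_t size, Interval cloned) {
  std::vector<Interval> out;
  if (cloned.first > 0)
    out.push_back({0, cloned.first});
  if (cloned.second < size)
    out.push_back({cloned.second, size});
  return out;
}

int main() {
  uint64_t head_size = 4 << 20;                   // 4 MiB head object
  Interval overlap = {0, 3 << 20};                // first 3 MiB unchanged since the clone
  for (auto [b, e] : subtract(head_size, overlap))
    std::cout << "push bytes [" << b << ", " << e << ")\n";   // only the last 1 MiB
}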
+ interval_set<uint64_t> cloning; + interval_set<uint64_t> prev; + if (size) + prev.insert(0, size); + for (int j=i-1; j>=0; j--) { + hobject_t c = soid; + c.snap = snapset.clones[j]; + prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); + if (!missing.is_missing(c) && + c < last_backfill && + get_parent()->try_lock_for_read(c, manager)) { + dout(10) << "calc_clone_subsets " << soid << " has prev " << c + << " overlap " << prev << dendl; + clone_subsets[c] = prev; + cloning.union_of(prev); + break; + } + dout(10) << "calc_clone_subsets " << soid << " does not have prev " << c + << " overlap " << prev << dendl; + } + + // overlap with next newest? + interval_set<uint64_t> next; + if (size) + next.insert(0, size); + for (unsigned j=i+1; j<snapset.clones.size(); j++) { + hobject_t c = soid; + c.snap = snapset.clones[j]; + next.intersection_of(snapset.clone_overlap[snapset.clones[j-1]]); + if (!missing.is_missing(c) && + c < last_backfill && + get_parent()->try_lock_for_read(c, manager)) { + dout(10) << "calc_clone_subsets " << soid << " has next " << c + << " overlap " << next << dendl; + clone_subsets[c] = next; + cloning.union_of(next); + break; + } + dout(10) << "calc_clone_subsets " << soid << " does not have next " << c + << " overlap " << next << dendl; + } + + if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) { + dout(10) << "skipping clone, too many holes" << dendl; + get_parent()->release_locks(manager); + clone_subsets.clear(); + cloning.clear(); + } + + + // what's left for us to push? + data_subset.subtract(cloning); + + dout(10) << "calc_clone_subsets " << soid + << " data_subset " << data_subset + << " clone_subsets " << clone_subsets << dendl; +} + +void ReplicatedBackend::prepare_pull( + eversion_t v, + const hobject_t& soid, + ObjectContextRef headctx, + RPGHandle *h) +{ + ceph_assert(get_parent()->get_local_missing().get_items().count(soid)); + eversion_t _v = get_parent()->get_local_missing().get_items().find( + soid)->second.need; + ceph_assert(_v == v); + const map<hobject_t, set<pg_shard_t>> &missing_loc( + get_parent()->get_missing_loc_shards()); + const map<pg_shard_t, pg_missing_t > &peer_missing( + get_parent()->get_shard_missing()); + map<hobject_t, set<pg_shard_t>>::const_iterator q = missing_loc.find(soid); + ceph_assert(q != missing_loc.end()); + ceph_assert(!q->second.empty()); + + // pick a pullee + auto p = q->second.end(); + if (cct->_conf->osd_debug_feed_pullee >= 0) { + for (auto it = q->second.begin(); it != q->second.end(); it++) { + if (it->osd == cct->_conf->osd_debug_feed_pullee) { + p = it; + break; + } + } + } + if (p == q->second.end()) { + // probably because user feed a wrong pullee + p = q->second.begin(); + std::advance(p, + util::generate_random_number<int>(0, + q->second.size() - 1)); + } + ceph_assert(get_osdmap()->is_up(p->osd)); + pg_shard_t fromshard = *p; + + dout(7) << "pull " << soid + << " v " << v + << " on osds " << q->second + << " from osd." 
<< fromshard + << dendl; + + ceph_assert(peer_missing.count(fromshard)); + const pg_missing_t &pmissing = peer_missing.find(fromshard)->second; + if (pmissing.is_missing(soid, v)) { + ceph_assert(pmissing.get_items().find(soid)->second.have != v); + dout(10) << "pulling soid " << soid << " from osd " << fromshard + << " at version " << pmissing.get_items().find(soid)->second.have + << " rather than at version " << v << dendl; + v = pmissing.get_items().find(soid)->second.have; + ceph_assert(get_parent()->get_log().get_log().objects.count(soid) && + (get_parent()->get_log().get_log().objects.find(soid)->second->op == + pg_log_entry_t::LOST_REVERT) && + (get_parent()->get_log().get_log().objects.find( + soid)->second->reverting_to == + v)); + } + + ObjectRecoveryInfo recovery_info; + ObcLockManager lock_manager; + + if (soid.is_snap()) { + ceph_assert(!get_parent()->get_local_missing().is_missing(soid.get_head())); + ceph_assert(headctx); + // check snapset + SnapSetContext *ssc = headctx->ssc; + ceph_assert(ssc); + dout(10) << " snapset " << ssc->snapset << dendl; + recovery_info.ss = ssc->snapset; + calc_clone_subsets( + ssc->snapset, soid, get_parent()->get_local_missing(), + get_info().last_backfill, + recovery_info.copy_subset, + recovery_info.clone_subset, + lock_manager); + // FIXME: this may overestimate if we are pulling multiple clones in parallel... + dout(10) << " pulling " << recovery_info << dendl; + + ceph_assert(ssc->snapset.clone_size.count(soid.snap)); + recovery_info.size = ssc->snapset.clone_size[soid.snap]; + } else { + // pulling head or unversioned object. + // always pull the whole thing. + recovery_info.copy_subset.insert(0, (uint64_t)-1); + recovery_info.size = ((uint64_t)-1); + } + + h->pulls[fromshard].push_back(PullOp()); + PullOp &op = h->pulls[fromshard].back(); + op.soid = soid; + + op.recovery_info = recovery_info; + op.recovery_info.soid = soid; + op.recovery_info.version = v; + op.recovery_progress.data_complete = false; + op.recovery_progress.omap_complete = false; + op.recovery_progress.data_recovered_to = 0; + op.recovery_progress.first = true; + + ceph_assert(!pulling.count(soid)); + pull_from_peer[fromshard].insert(soid); + PullInfo &pi = pulling[soid]; + pi.from = fromshard; + pi.soid = soid; + pi.head_ctx = headctx; + pi.recovery_info = op.recovery_info; + pi.recovery_progress = op.recovery_progress; + pi.cache_dont_need = h->cache_dont_need; + pi.lock_manager = std::move(lock_manager); +} + +/* + * intelligently push an object to a replica. make use of existing + * clones/heads and dup data ranges where possible. + */ +int ReplicatedBackend::prep_push_to_replica( + ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, + PushOp *pop, bool cache_dont_need) +{ + const object_info_t& oi = obc->obs.oi; + uint64_t size = obc->obs.oi.size; + + dout(10) << __func__ << ": " << soid << " v" << oi.version + << " size " << size << " to osd." << peer << dendl; + + map<hobject_t, interval_set<uint64_t>> clone_subsets; + interval_set<uint64_t> data_subset; + + ObcLockManager lock_manager; + // are we doing a clone on the replica? + if (soid.snap && soid.snap < CEPH_NOSNAP) { + hobject_t head = soid; + head.snap = CEPH_NOSNAP; + + // try to base push off of clones that succeed/preceed poid + // we need the head (and current SnapSet) locally to do that. 
+ if (get_parent()->get_local_missing().is_missing(head)) { + dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl; + return prep_push(obc, soid, peer, pop, cache_dont_need); + } + + SnapSetContext *ssc = obc->ssc; + ceph_assert(ssc); + dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; + pop->recovery_info.ss = ssc->snapset; + map<pg_shard_t, pg_missing_t>::const_iterator pm = + get_parent()->get_shard_missing().find(peer); + ceph_assert(pm != get_parent()->get_shard_missing().end()); + map<pg_shard_t, pg_info_t>::const_iterator pi = + get_parent()->get_shard_info().find(peer); + ceph_assert(pi != get_parent()->get_shard_info().end()); + calc_clone_subsets( + ssc->snapset, soid, + pm->second, + pi->second.last_backfill, + data_subset, clone_subsets, + lock_manager); + } else if (soid.snap == CEPH_NOSNAP) { + // pushing head or unversioned object. + // base this on partially on replica's clones? + SnapSetContext *ssc = obc->ssc; + ceph_assert(ssc); + dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; + calc_head_subsets( + obc, + ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second, + get_parent()->get_shard_info().find(peer)->second.last_backfill, + data_subset, clone_subsets, + lock_manager); + } + + return prep_push( + obc, + soid, + peer, + oi.version, + data_subset, + clone_subsets, + pop, + cache_dont_need, + std::move(lock_manager)); +} + +int ReplicatedBackend::prep_push(ObjectContextRef obc, + const hobject_t& soid, pg_shard_t peer, + PushOp *pop, bool cache_dont_need) +{ + interval_set<uint64_t> data_subset; + if (obc->obs.oi.size) + data_subset.insert(0, obc->obs.oi.size); + map<hobject_t, interval_set<uint64_t>> clone_subsets; + + return prep_push(obc, soid, peer, + obc->obs.oi.version, data_subset, clone_subsets, + pop, cache_dont_need, ObcLockManager()); +} + +int ReplicatedBackend::prep_push( + ObjectContextRef obc, + const hobject_t& soid, pg_shard_t peer, + eversion_t version, + interval_set<uint64_t> &data_subset, + map<hobject_t, interval_set<uint64_t>>& clone_subsets, + PushOp *pop, + bool cache_dont_need, + ObcLockManager &&lock_manager) +{ + get_parent()->begin_peer_recover(peer, soid); + // take note. 
+ PushInfo &pi = pushing[soid][peer]; + pi.obc = obc; + pi.recovery_info.size = obc->obs.oi.size; + pi.recovery_info.copy_subset = data_subset; + pi.recovery_info.clone_subset = clone_subsets; + pi.recovery_info.soid = soid; + pi.recovery_info.oi = obc->obs.oi; + pi.recovery_info.ss = pop->recovery_info.ss; + pi.recovery_info.version = version; + pi.lock_manager = std::move(lock_manager); + + ObjectRecoveryProgress new_progress; + int r = build_push_op(pi.recovery_info, + pi.recovery_progress, + &new_progress, + pop, + &(pi.stat), cache_dont_need); + if (r < 0) + return r; + pi.recovery_progress = new_progress; + return 0; +} + +void ReplicatedBackend::submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool cache_dont_need, + const interval_set<uint64_t> &intervals_included, + bufferlist data_included, + bufferlist omap_header, + const map<string, bufferlist> &attrs, + const map<string, bufferlist> &omap_entries, + ObjectStore::Transaction *t) +{ + hobject_t target_oid; + if (first && complete) { + target_oid = recovery_info.soid; + } else { + target_oid = get_parent()->get_temp_recovery_object(recovery_info.soid, + recovery_info.version); + if (first) { + dout(10) << __func__ << ": Adding oid " + << target_oid << " in the temp collection" << dendl; + add_temp_obj(target_oid); + } + } + + if (first) { + t->remove(coll, ghobject_t(target_oid)); + t->touch(coll, ghobject_t(target_oid)); + t->truncate(coll, ghobject_t(target_oid), recovery_info.size); + if (omap_header.length()) + t->omap_setheader(coll, ghobject_t(target_oid), omap_header); + + bufferlist bv = attrs.at(OI_ATTR); + object_info_t oi(bv); + t->set_alloc_hint(coll, ghobject_t(target_oid), + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + if (get_parent()->pg_is_remote_backfilling()) { + struct stat st; + uint64_t size = 0; + int r = store->stat(ch, ghobject_t(recovery_info.soid), &st); + if (r == 0) { + size = st.st_size; + } + // Don't need to do anything if object is still the same size + if (size != recovery_info.oi.size) { + get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size); + get_parent()->pg_add_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size); + dout(10) << __func__ << " " << recovery_info.soid + << " backfill size " << recovery_info.oi.size + << " previous size " << size + << " net size " << recovery_info.oi.size - size + << dendl; + } + } + } + uint64_t off = 0; + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL; + if (cache_dont_need) + fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + for (interval_set<uint64_t>::const_iterator p = intervals_included.begin(); + p != intervals_included.end(); + ++p) { + bufferlist bit; + bit.substr_of(data_included, off, p.get_len()); + t->write(coll, ghobject_t(target_oid), + p.get_start(), p.get_len(), bit, fadvise_flags); + off += p.get_len(); + } + + if (!omap_entries.empty()) + t->omap_setkeys(coll, ghobject_t(target_oid), omap_entries); + if (!attrs.empty()) + t->setattrs(coll, ghobject_t(target_oid), attrs); + + if (complete) { + if (!first) { + dout(10) << __func__ << ": Removing oid " + << target_oid << " from the temp collection" << dendl; + clear_temp_obj(target_oid); + t->remove(coll, ghobject_t(recovery_info.soid)); + t->collection_move_rename(coll, ghobject_t(target_oid), + coll, ghobject_t(recovery_info.soid)); + } + + submit_push_complete(recovery_info, t); + } +} + +void ReplicatedBackend::submit_push_complete( + const ObjectRecoveryInfo 
&recovery_info, + ObjectStore::Transaction *t) +{ + for (map<hobject_t, interval_set<uint64_t>>::const_iterator p = + recovery_info.clone_subset.begin(); + p != recovery_info.clone_subset.end(); + ++p) { + for (interval_set<uint64_t>::const_iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + dout(15) << " clone_range " << p->first << " " + << q.get_start() << "~" << q.get_len() << dendl; + t->clone_range(coll, ghobject_t(p->first), ghobject_t(recovery_info.soid), + q.get_start(), q.get_len(), q.get_start()); + } + } +} + +ObjectRecoveryInfo ReplicatedBackend::recalc_subsets( + const ObjectRecoveryInfo& recovery_info, + SnapSetContext *ssc, + ObcLockManager &manager) +{ + if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP) + return recovery_info; + ObjectRecoveryInfo new_info = recovery_info; + new_info.copy_subset.clear(); + new_info.clone_subset.clear(); + ceph_assert(ssc); + get_parent()->release_locks(manager); // might already have locks + calc_clone_subsets( + ssc->snapset, new_info.soid, get_parent()->get_local_missing(), + get_info().last_backfill, + new_info.copy_subset, new_info.clone_subset, + manager); + return new_info; +} + +bool ReplicatedBackend::handle_pull_response( + pg_shard_t from, const PushOp &pop, PullOp *response, + list<pull_complete_info> *to_continue, + ObjectStore::Transaction *t) +{ + interval_set<uint64_t> data_included = pop.data_included; + bufferlist data; + data = pop.data; + dout(10) << "handle_pull_response " + << pop.recovery_info + << pop.after_progress + << " data.size() is " << data.length() + << " data_included: " << data_included + << dendl; + if (pop.version == eversion_t()) { + // replica doesn't have it! + _failed_pull(from, pop.soid); + return false; + } + + const hobject_t &hoid = pop.soid; + ceph_assert((data_included.empty() && data.length() == 0) || + (!data_included.empty() && data.length() > 0)); + + auto piter = pulling.find(hoid); + if (piter == pulling.end()) { + return false; + } + + PullInfo &pi = piter->second; + if (pi.recovery_info.size == (uint64_t(-1))) { + pi.recovery_info.size = pop.recovery_info.size; + pi.recovery_info.copy_subset.intersection_of( + pop.recovery_info.copy_subset); + } + // If primary doesn't have object info and didn't know version + if (pi.recovery_info.version == eversion_t()) { + pi.recovery_info.version = pop.version; + } + + bool first = pi.recovery_progress.first; + if (first) { + // attrs only reference the origin bufferlist (decode from + // MOSDPGPush message) whose size is much greater than attrs in + // recovery. If obc cache it (get_obc maybe cache the attr), this + // causes the whole origin bufferlist would not be free until obc + // is evicted from obc cache. So rebuild the bufferlists before + // cache it. 
+ auto attrset = pop.attrset; + for (auto& a : attrset) { + a.second.rebuild(); + } + pi.obc = get_parent()->get_obc(pi.recovery_info.soid, attrset); + pi.recovery_info.oi = pi.obc->obs.oi; + pi.recovery_info = recalc_subsets( + pi.recovery_info, + pi.obc->ssc, + pi.lock_manager); + } + + + interval_set<uint64_t> usable_intervals; + bufferlist usable_data; + trim_pushed_data(pi.recovery_info.copy_subset, + data_included, + data, + &usable_intervals, + &usable_data); + data_included = usable_intervals; + data.claim(usable_data); + + + pi.recovery_progress = pop.after_progress; + + dout(10) << "new recovery_info " << pi.recovery_info + << ", new progress " << pi.recovery_progress + << dendl; + + bool complete = pi.is_complete(); + + submit_push_data(pi.recovery_info, first, + complete, pi.cache_dont_need, + data_included, data, + pop.omap_header, + pop.attrset, + pop.omap_entries, + t); + + pi.stat.num_keys_recovered += pop.omap_entries.size(); + pi.stat.num_bytes_recovered += data.length(); + get_parent()->get_logger()->inc(l_osd_rbytes, pop.omap_entries.size() + data.length()); + + if (complete) { + pi.stat.num_objects_recovered++; + // XXX: This could overcount if regular recovery is needed right after a repair + if (get_parent()->pg_is_repair()) { + pi.stat.num_objects_repaired++; + get_parent()->inc_osd_stat_repaired(); + } + clear_pull_from(piter); + to_continue->push_back({hoid, pi.stat}); + get_parent()->on_local_recover( + hoid, pi.recovery_info, pi.obc, false, t); + return false; + } else { + response->soid = pop.soid; + response->recovery_info = pi.recovery_info; + response->recovery_progress = pi.recovery_progress; + return true; + } +} + +void ReplicatedBackend::handle_push( + pg_shard_t from, const PushOp &pop, PushReplyOp *response, + ObjectStore::Transaction *t, bool is_repair) +{ + dout(10) << "handle_push " + << pop.recovery_info + << pop.after_progress + << dendl; + bufferlist data; + data = pop.data; + bool first = pop.before_progress.first; + bool complete = pop.after_progress.data_complete && + pop.after_progress.omap_complete; + + response->soid = pop.recovery_info.soid; + submit_push_data(pop.recovery_info, + first, + complete, + true, // must be replicate + pop.data_included, + data, + pop.omap_header, + pop.attrset, + pop.omap_entries, + t); + + if (complete) { + if (is_repair) { + get_parent()->inc_osd_stat_repaired(); + dout(20) << __func__ << " repair complete" << dendl; + } + get_parent()->on_local_recover( + pop.recovery_info.soid, + pop.recovery_info, + ObjectContextRef(), // ok, is replica + false, + t); + } +} + +void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes) +{ + for (map<pg_shard_t, vector<PushOp> >::iterator i = pushes.begin(); + i != pushes.end(); + ++i) { + ConnectionRef con = get_parent()->get_con_osd_cluster( + i->first.osd, + get_osdmap_epoch()); + if (!con) + continue; + vector<PushOp>::iterator j = i->second.begin(); + while (j != i->second.end()) { + uint64_t cost = 0; + uint64_t pushes = 0; + MOSDPGPush *msg = new MOSDPGPush(); + msg->from = get_parent()->whoami_shard(); + msg->pgid = get_parent()->primary_spg_t(); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->set_priority(prio); + msg->is_repair = get_parent()->pg_is_repair(); + for (; + (j != i->second.end() && + cost < cct->_conf->osd_max_push_cost && + pushes < cct->_conf->osd_max_push_objects) ; + ++j) { + dout(20) << __func__ << ": sending push " << *j + << " to osd." 
<< i->first << dendl; + cost += j->cost(cct); + pushes += 1; + msg->pushes.push_back(*j); + } + msg->set_cost(cost); + get_parent()->send_message_osd_cluster(msg, con); + } + } +} + +void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &pulls) +{ + for (map<pg_shard_t, vector<PullOp> >::iterator i = pulls.begin(); + i != pulls.end(); + ++i) { + ConnectionRef con = get_parent()->get_con_osd_cluster( + i->first.osd, + get_osdmap_epoch()); + if (!con) + continue; + dout(20) << __func__ << ": sending pulls " << i->second + << " to osd." << i->first << dendl; + MOSDPGPull *msg = new MOSDPGPull(); + msg->from = parent->whoami_shard(); + msg->set_priority(prio); + msg->pgid = get_parent()->primary_spg_t(); + msg->map_epoch = get_osdmap_epoch(); + msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); + msg->set_pulls(&i->second); + msg->compute_cost(cct); + get_parent()->send_message_osd_cluster(msg, con); + } +} + +int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info, + const ObjectRecoveryProgress &progress, + ObjectRecoveryProgress *out_progress, + PushOp *out_op, + object_stat_sum_t *stat, + bool cache_dont_need) +{ + ObjectRecoveryProgress _new_progress; + if (!out_progress) + out_progress = &_new_progress; + ObjectRecoveryProgress &new_progress = *out_progress; + new_progress = progress; + + dout(7) << __func__ << " " << recovery_info.soid + << " v " << recovery_info.version + << " size " << recovery_info.size + << " recovery_info: " << recovery_info + << dendl; + + eversion_t v = recovery_info.version; + object_info_t oi; + if (progress.first) { + int r = store->omap_get_header(ch, ghobject_t(recovery_info.soid), &out_op->omap_header); + if(r < 0) { + dout(1) << __func__ << " get omap header failed: " << cpp_strerror(-r) << dendl; + return r; + } + r = store->getattrs(ch, ghobject_t(recovery_info.soid), out_op->attrset); + if(r < 0) { + dout(1) << __func__ << " getattrs failed: " << cpp_strerror(-r) << dendl; + return r; + } + + // Debug + bufferlist bv = out_op->attrset[OI_ATTR]; + try { + auto bliter = bv.cbegin(); + decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": bad object_info_t: " << recovery_info.soid << dendl; + return -EINVAL; + } + + // If requestor didn't know the version, use ours + if (v == eversion_t()) { + v = oi.version; + } else if (oi.version != v) { + get_parent()->clog_error() << get_info().pgid << " push " + << recovery_info.soid << " v " + << recovery_info.version + << " failed because local copy is " + << oi.version; + return -EINVAL; + } + + new_progress.first = false; + } + // Once we provide the version subsequent requests will have it, so + // at this point it must be known. 
+ ceph_assert(v != eversion_t()); + + uint64_t available = cct->_conf->osd_recovery_max_chunk; + if (!progress.omap_complete) { + ObjectMap::ObjectMapIterator iter = + store->get_omap_iterator(ch, + ghobject_t(recovery_info.soid)); + ceph_assert(iter); + for (iter->lower_bound(progress.omap_recovered_to); + iter->valid(); + iter->next()) { + if (!out_op->omap_entries.empty() && + ((cct->_conf->osd_recovery_max_omap_entries_per_chunk > 0 && + out_op->omap_entries.size() >= cct->_conf->osd_recovery_max_omap_entries_per_chunk) || + available <= iter->key().size() + iter->value().length())) + break; + out_op->omap_entries.insert(make_pair(iter->key(), iter->value())); + + if ((iter->key().size() + iter->value().length()) <= available) + available -= (iter->key().size() + iter->value().length()); + else + available = 0; + } + if (!iter->valid()) + new_progress.omap_complete = true; + else + new_progress.omap_recovered_to = iter->key(); + } + + if (available > 0) { + if (!recovery_info.copy_subset.empty()) { + interval_set<uint64_t> copy_subset = recovery_info.copy_subset; + map<uint64_t, uint64_t> m; + int r = store->fiemap(ch, ghobject_t(recovery_info.soid), 0, + copy_subset.range_end(), m); + if (r >= 0) { + interval_set<uint64_t> fiemap_included(m); + copy_subset.intersection_of(fiemap_included); + } else { + // intersection of copy_subset and empty interval_set would be empty anyway + copy_subset.clear(); + } + + out_op->data_included.span_of(copy_subset, progress.data_recovered_to, + available); + if (out_op->data_included.empty()) // zero filled section, skip to end! + new_progress.data_recovered_to = recovery_info.copy_subset.range_end(); + else + new_progress.data_recovered_to = out_op->data_included.range_end(); + } + } else { + out_op->data_included.clear(); + } + + for (interval_set<uint64_t>::iterator p = out_op->data_included.begin(); + p != out_op->data_included.end(); + ++p) { + bufferlist bit; + int r = store->read(ch, ghobject_t(recovery_info.soid), + p.get_start(), p.get_len(), bit, + cache_dont_need ? 
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0); + if (cct->_conf->osd_debug_random_push_read_error && + (rand() % (int)(cct->_conf->osd_debug_random_push_read_error * 100.0)) == 0) { + dout(0) << __func__ << ": inject EIO " << recovery_info.soid << dendl; + r = -EIO; + } + if (r < 0) { + return r; + } + if (p.get_len() != bit.length()) { + dout(10) << " extent " << p.get_start() << "~" << p.get_len() + << " is actually " << p.get_start() << "~" << bit.length() + << dendl; + interval_set<uint64_t>::iterator save = p++; + if (bit.length() == 0) + out_op->data_included.erase(save); //Remove this empty interval + else + save.set_len(bit.length()); + // Remove any other intervals present + while (p != out_op->data_included.end()) { + interval_set<uint64_t>::iterator save = p++; + out_op->data_included.erase(save); + } + new_progress.data_complete = true; + out_op->data.claim_append(bit); + break; + } + out_op->data.claim_append(bit); + } + if (progress.first && !out_op->data_included.empty() && + out_op->data_included.begin().get_start() == 0 && + out_op->data.length() == oi.size && oi.is_data_digest()) { + uint32_t crc = out_op->data.crc32c(-1); + if (oi.data_digest != crc) { + dout(0) << __func__ << " " << coll << std::hex + << " full-object read crc 0x" << crc + << " != expected 0x" << oi.data_digest + << std::dec << " on " << recovery_info.soid << dendl; + return -EIO; + } + } + + if (new_progress.is_complete(recovery_info)) { + new_progress.data_complete = true; + if (stat) { + stat->num_objects_recovered++; + if (get_parent()->pg_is_repair()) + stat->num_objects_repaired++; + } + } + + if (stat) { + stat->num_keys_recovered += out_op->omap_entries.size(); + stat->num_bytes_recovered += out_op->data.length(); + get_parent()->get_logger()->inc(l_osd_rbytes, out_op->omap_entries.size() + out_op->data.length()); + } + + get_parent()->get_logger()->inc(l_osd_push); + get_parent()->get_logger()->inc(l_osd_push_outb, out_op->data.length()); + + // send + out_op->version = v; + out_op->soid = recovery_info.soid; + out_op->recovery_info = recovery_info; + out_op->after_progress = new_progress; + out_op->before_progress = progress; + return 0; +} + +void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op) +{ + op->recovery_info.version = eversion_t(); + op->version = eversion_t(); + op->soid = soid; +} + +bool ReplicatedBackend::handle_push_reply( + pg_shard_t peer, const PushReplyOp &op, PushOp *reply) +{ + const hobject_t &soid = op.soid; + if (pushing.count(soid) == 0) { + dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer + << ", or anybody else" + << dendl; + return false; + } else if (pushing[soid].count(peer) == 0) { + dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer + << dendl; + return false; + } else { + PushInfo *pi = &pushing[soid][peer]; + bool error = pushing[soid].begin()->second.recovery_progress.error; + + if (!pi->recovery_progress.data_complete && !error) { + dout(10) << " pushing more from, " + << pi->recovery_progress.data_recovered_to + << " of " << pi->recovery_info.copy_subset << dendl; + ObjectRecoveryProgress new_progress; + int r = build_push_op( + pi->recovery_info, + pi->recovery_progress, &new_progress, reply, + &(pi->stat)); + // Handle the case of a read error right after we wrote, which is + // hopefully extremely rare. + if (r < 0) { + dout(5) << __func__ << ": oid " << soid << " error " << r << dendl; + + error = true; + goto done; + } + pi->recovery_progress = new_progress; + return true; + } else { + // done! 
+done: + if (!error) + get_parent()->on_peer_recover( peer, soid, pi->recovery_info); + + get_parent()->release_locks(pi->lock_manager); + object_stat_sum_t stat = pi->stat; + eversion_t v = pi->recovery_info.version; + pushing[soid].erase(peer); + pi = NULL; + + if (pushing[soid].empty()) { + if (!error) + get_parent()->on_global_recover(soid, stat, false); + else + get_parent()->on_primary_error(soid, v); + pushing.erase(soid); + } else { + // This looks weird, but we erased the current peer and need to remember + // the error on any other one, while getting more acks. + if (error) + pushing[soid].begin()->second.recovery_progress.error = true; + dout(10) << "pushed " << soid << ", still waiting for push ack from " + << pushing[soid].size() << " others" << dendl; + } + return false; + } + } +} + +void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply) +{ + const hobject_t &soid = op.soid; + struct stat st; + int r = store->stat(ch, ghobject_t(soid), &st); + if (r != 0) { + get_parent()->clog_error() << get_info().pgid << " " + << peer << " tried to pull " << soid + << " but got " << cpp_strerror(-r); + prep_push_op_blank(soid, reply); + } else { + ObjectRecoveryInfo &recovery_info = op.recovery_info; + ObjectRecoveryProgress &progress = op.recovery_progress; + if (progress.first && recovery_info.size == ((uint64_t)-1)) { + // Adjust size and copy_subset + recovery_info.size = st.st_size; + recovery_info.copy_subset.clear(); + if (st.st_size) + recovery_info.copy_subset.insert(0, st.st_size); + ceph_assert(recovery_info.clone_subset.empty()); + } + + r = build_push_op(recovery_info, progress, 0, reply); + if (r < 0) + prep_push_op_blank(soid, reply); + } +} + +/** + * trim received data to remove what we don't want + * + * @param copy_subset intervals we want + * @param data_included intervals we got + * @param data_recieved data we got + * @param intervals_usable intervals we want to keep + * @param data_usable matching data we want to keep + */ +void ReplicatedBackend::trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + bufferlist data_received, + interval_set<uint64_t> *intervals_usable, + bufferlist *data_usable) +{ + if (intervals_received.subset_of(copy_subset)) { + *intervals_usable = intervals_received; + *data_usable = data_received; + return; + } + + intervals_usable->intersection_of(copy_subset, + intervals_received); + + uint64_t off = 0; + for (interval_set<uint64_t>::const_iterator p = intervals_received.begin(); + p != intervals_received.end(); + ++p) { + interval_set<uint64_t> x; + x.insert(p.get_start(), p.get_len()); + x.intersection_of(copy_subset); + for (interval_set<uint64_t>::const_iterator q = x.begin(); + q != x.end(); + ++q) { + bufferlist sub; + uint64_t data_off = off + (q.get_start() - p.get_start()); + sub.substr_of(data_received, data_off, q.get_len()); + data_usable->claim_append(sub); + } + off += p.get_len(); + } +} + +void ReplicatedBackend::_failed_pull(pg_shard_t from, const hobject_t &soid) +{ + dout(20) << __func__ << ": " << soid << " from " << from << dendl; + list<pg_shard_t> fl = { from }; + auto it = pulling.find(soid); + assert(it != pulling.end()); + get_parent()->failed_push(fl, soid, it->second.recovery_info.version); + + clear_pull(it); +} + +void ReplicatedBackend::clear_pull_from( + map<hobject_t, PullInfo>::iterator piter) +{ + auto from = piter->second.from; + pull_from_peer[from].erase(piter->second.soid); + if (pull_from_peer[from].empty()) + 
pull_from_peer.erase(from); +} + +void ReplicatedBackend::clear_pull( + map<hobject_t, PullInfo>::iterator piter, + bool clear_pull_from_peer) +{ + if (clear_pull_from_peer) { + clear_pull_from(piter); + } + get_parent()->release_locks(piter->second.lock_manager); + pulling.erase(piter); +} + +int ReplicatedBackend::start_pushes( + const hobject_t &soid, + ObjectContextRef obc, + RPGHandle *h) +{ + list< map<pg_shard_t, pg_missing_t>::const_iterator > shards; + + dout(20) << __func__ << " soid " << soid << dendl; + // who needs it? + ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0); + for (set<pg_shard_t>::iterator i = + get_parent()->get_acting_recovery_backfill_shards().begin(); + i != get_parent()->get_acting_recovery_backfill_shards().end(); + ++i) { + if (*i == get_parent()->whoami_shard()) continue; + pg_shard_t peer = *i; + map<pg_shard_t, pg_missing_t>::const_iterator j = + get_parent()->get_shard_missing().find(peer); + ceph_assert(j != get_parent()->get_shard_missing().end()); + if (j->second.is_missing(soid)) { + shards.push_back(j); + } + } + + // If more than 1 read will occur ignore possible request to not cache + bool cache = shards.size() == 1 ? h->cache_dont_need : false; + + for (auto j : shards) { + pg_shard_t peer = j->first; + h->pushes[peer].push_back(PushOp()); + int r = prep_push_to_replica(obc, soid, peer, + &(h->pushes[peer].back()), cache); + if (r < 0) { + // Back out all failed reads + for (auto k : shards) { + pg_shard_t p = k->first; + dout(10) << __func__ << " clean up peer " << p << dendl; + h->pushes[p].pop_back(); + if (p == peer) break; + } + return r; + } + } + return shards.size(); +} diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h new file mode 100644 index 00000000..8f447495 --- /dev/null +++ b/src/osd/ReplicatedBackend.h @@ -0,0 +1,430 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef REPBACKEND_H +#define REPBACKEND_H + +#include "PGBackend.h" + +struct C_ReplicatedBackend_OnPullComplete; +class ReplicatedBackend : public PGBackend { + struct RPGHandle : public PGBackend::RecoveryHandle { + map<pg_shard_t, vector<PushOp> > pushes; + map<pg_shard_t, vector<PullOp> > pulls; + }; + friend struct C_ReplicatedBackend_OnPullComplete; +public: + ReplicatedBackend( + PGBackend::Listener *pg, + const coll_t &coll, + ObjectStore::CollectionHandle &ch, + ObjectStore *store, + CephContext *cct); + + /// @see PGBackend::open_recovery_op + RPGHandle *_open_recovery_op() { + return new RPGHandle(); + } + PGBackend::RecoveryHandle *open_recovery_op() override { + return _open_recovery_op(); + } + + /// @see PGBackend::run_recovery_op + void run_recovery_op( + PGBackend::RecoveryHandle *h, + int priority) override; + + /// @see PGBackend::recover_object + int recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + RecoveryHandle *h + ) override; + + void check_recovery_sources(const OSDMapRef& osdmap) override; + + bool can_handle_while_inactive(OpRequestRef op) override; + + /// @see PGBackend::handle_message + bool _handle_message( + OpRequestRef op + ) override; + + void on_change() override; + void clear_recovery_state() override; + + class RPCRecPred : public IsPGRecoverablePredicate { + public: + bool operator()(const set<pg_shard_t> &have) const override { + return !have.empty(); + } + }; + IsPGRecoverablePredicate *get_is_recoverable_predicate() const override { + return new RPCRecPred; + } + + class RPCReadPred : public IsPGReadablePredicate { + pg_shard_t whoami; + public: + explicit RPCReadPred(pg_shard_t whoami) : whoami(whoami) {} + bool operator()(const set<pg_shard_t> &have) const override { + return have.count(whoami); + } + }; + IsPGReadablePredicate *get_is_readable_predicate() const override { + return new RPCReadPred(get_parent()->whoami_shard()); + } + + void dump_recovery_info(Formatter *f) const override { + { + f->open_array_section("pull_from_peer"); + for (map<pg_shard_t, set<hobject_t> >::const_iterator i = pull_from_peer.begin(); + i != pull_from_peer.end(); + ++i) { + f->open_object_section("pulling_from"); + f->dump_stream("pull_from") << i->first; + { + f->open_array_section("pulls"); + for (set<hobject_t>::const_iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + f->open_object_section("pull_info"); + ceph_assert(pulling.count(*j)); + pulling.find(*j)->second.dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + { + f->open_array_section("pushing"); + for (map<hobject_t, map<pg_shard_t, PushInfo>>::const_iterator i = + pushing.begin(); + i != pushing.end(); + ++i) { + f->open_object_section("object"); + f->dump_stream("pushing") << i->first; + { + f->open_array_section("pushing_to"); + for (map<pg_shard_t, PushInfo>::const_iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + f->open_object_section("push_progress"); + f->dump_stream("pushing_to") << j->first; + { + f->open_object_section("push_info"); + j->second.dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + } + + int objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) override; + + void objects_read_async( + const hobject_t &hoid, + const list<pair<boost::tuple<uint64_t, uint64_t, 
uint32_t>, + pair<bufferlist*, Context*> > > &to_read, + Context *on_complete, + bool fast_read = false) override; + +private: + // push + struct PushInfo { + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + ObjectContextRef obc; + object_stat_sum_t stat; + ObcLockManager lock_manager; + + void dump(Formatter *f) const { + { + f->open_object_section("recovery_progress"); + recovery_progress.dump(f); + f->close_section(); + } + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + } + }; + map<hobject_t, map<pg_shard_t, PushInfo>> pushing; + + // pull + struct PullInfo { + pg_shard_t from; + hobject_t soid; + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + ObjectContextRef head_ctx; + ObjectContextRef obc; + object_stat_sum_t stat; + bool cache_dont_need; + ObcLockManager lock_manager; + + void dump(Formatter *f) const { + { + f->open_object_section("recovery_progress"); + recovery_progress.dump(f); + f->close_section(); + } + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + } + + bool is_complete() const { + return recovery_progress.is_complete(recovery_info); + } + }; + + map<hobject_t, PullInfo> pulling; + + // Reverse mapping from osd peer to objects being pulled from that peer + map<pg_shard_t, set<hobject_t> > pull_from_peer; + void clear_pull( + map<hobject_t, PullInfo>::iterator piter, + bool clear_pull_from_peer = true); + void clear_pull_from( + map<hobject_t, PullInfo>::iterator piter); + + void _do_push(OpRequestRef op); + void _do_pull_response(OpRequestRef op); + void do_push(OpRequestRef op) { + if (is_primary()) { + _do_pull_response(op); + } else { + _do_push(op); + } + } + void do_pull(OpRequestRef op); + void do_push_reply(OpRequestRef op); + + bool handle_push_reply(pg_shard_t peer, const PushReplyOp &op, PushOp *reply); + void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply); + + struct pull_complete_info { + hobject_t hoid; + object_stat_sum_t stat; + }; + bool handle_pull_response( + pg_shard_t from, const PushOp &op, PullOp *response, + list<pull_complete_info> *to_continue, + ObjectStore::Transaction *t); + void handle_push(pg_shard_t from, const PushOp &op, PushReplyOp *response, + ObjectStore::Transaction *t, bool is_repair); + + static void trim_pushed_data(const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + bufferlist data_received, + interval_set<uint64_t> *intervals_usable, + bufferlist *data_usable); + void _failed_pull(pg_shard_t from, const hobject_t &soid); + + void send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes); + void prep_push_op_blank(const hobject_t& soid, PushOp *op); + void send_pulls( + int priority, + map<pg_shard_t, vector<PullOp> > &pulls); + + int build_push_op(const ObjectRecoveryInfo &recovery_info, + const ObjectRecoveryProgress &progress, + ObjectRecoveryProgress *out_progress, + PushOp *out_op, + object_stat_sum_t *stat = 0, + bool cache_dont_need = true); + void submit_push_data(const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool cache_dont_need, + const interval_set<uint64_t> &intervals_included, + bufferlist data_included, + bufferlist omap_header, + const map<string, bufferlist> &attrs, + const map<string, bufferlist> &omap_entries, + ObjectStore::Transaction *t); + void submit_push_complete(const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t); + + void calc_clone_subsets( 
+ SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set<uint64_t>& data_subset, + map<hobject_t, interval_set<uint64_t>>& clone_subsets, + ObcLockManager &lock_manager); + void prepare_pull( + eversion_t v, + const hobject_t& soid, + ObjectContextRef headctx, + RPGHandle *h); + int start_pushes( + const hobject_t &soid, + ObjectContextRef obj, + RPGHandle *h); + int prep_push_to_replica( + ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, + PushOp *pop, bool cache_dont_need = true); + int prep_push( + ObjectContextRef obc, + const hobject_t& oid, pg_shard_t dest, + PushOp *op, + bool cache_dont_need); + int prep_push( + ObjectContextRef obc, + const hobject_t& soid, pg_shard_t peer, + eversion_t version, + interval_set<uint64_t> &data_subset, + map<hobject_t, interval_set<uint64_t>>& clone_subsets, + PushOp *op, + bool cache, + ObcLockManager &&lock_manager); + void calc_head_subsets( + ObjectContextRef obc, SnapSet& snapset, const hobject_t& head, + const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set<uint64_t>& data_subset, + map<hobject_t, interval_set<uint64_t>>& clone_subsets, + ObcLockManager &lock_manager); + ObjectRecoveryInfo recalc_subsets( + const ObjectRecoveryInfo& recovery_info, + SnapSetContext *ssc, + ObcLockManager &lock_manager); + + /** + * Client IO + */ + struct InProgressOp : public RefCountedObject { + ceph_tid_t tid; + set<pg_shard_t> waiting_for_commit; + Context *on_commit; + OpRequestRef op; + eversion_t v; + InProgressOp( + ceph_tid_t tid, Context *on_commit, + OpRequestRef op, eversion_t v) + : RefCountedObject(nullptr, 0), + tid(tid), on_commit(on_commit), + op(op), v(v) {} + bool done() const { + return waiting_for_commit.empty(); + } + }; + typedef boost::intrusive_ptr<InProgressOp> InProgressOpRef; + map<ceph_tid_t, InProgressOpRef> in_progress_ops; +public: + friend class C_OSD_OnOpCommit; + + void call_write_ordered(std::function<void(void)> &&cb) override { + // ReplicatedBackend submits writes inline in submit_transaction, so + // we can just call the callback. 
+ cb(); + } + + void submit_transaction( + const hobject_t &hoid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&t, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const vector<pg_log_entry_t> &log_entries, + boost::optional<pg_hit_set_history_t> &hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef op + ) override; + +private: + Message * generate_subop( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t pg_roll_forward_to, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const bufferlist &log_entries, + boost::optional<pg_hit_set_history_t> &hset_history, + ObjectStore::Transaction &op_t, + pg_shard_t peer, + const pg_info_t &pinfo); + void issue_op( + const hobject_t &soid, + const eversion_t &at_version, + ceph_tid_t tid, + osd_reqid_t reqid, + eversion_t pg_trim_to, + eversion_t pg_roll_forward_to, + hobject_t new_temp_oid, + hobject_t discard_temp_oid, + const vector<pg_log_entry_t> &log_entries, + boost::optional<pg_hit_set_history_t> &hset_history, + InProgressOp *op, + ObjectStore::Transaction &op_t); + void op_commit(InProgressOpRef& op); + void do_repop_reply(OpRequestRef op); + void do_repop(OpRequestRef op); + + struct RepModify { + OpRequestRef op; + bool committed; + int ackerosd; + eversion_t last_complete; + epoch_t epoch_started; + + ObjectStore::Transaction opt, localt; + + RepModify() : committed(false), ackerosd(-1), + epoch_started(0) {} + }; + typedef std::shared_ptr<RepModify> RepModifyRef; + + struct C_OSD_RepModifyCommit; + + void repop_commit(RepModifyRef rm); + bool auto_repair_supported() const override { return store->has_builtin_csum(); } + + + int be_deep_scrub( + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o) override; + uint64_t be_get_ondisk_size(uint64_t logical_size) override { return logical_size; } +}; + +#endif diff --git a/src/osd/ScrubStore.cc b/src/osd/ScrubStore.cc new file mode 100644 index 00000000..036f4a88 --- /dev/null +++ b/src/osd/ScrubStore.cc @@ -0,0 +1,195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ScrubStore.h" +#include "osd_types.h" +#include "common/scrub_types.h" +#include "include/rados/rados_types.hpp" + +namespace { +ghobject_t make_scrub_object(const spg_t& pgid) +{ + ostringstream ss; + ss << "scrub_" << pgid; + return pgid.make_temp_ghobject(ss.str()); +} + +string first_object_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0x00000000, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +// the object_key should be unique across pools +string to_object_key(int64_t pool, const librados::object_id_t& oid) +{ + auto hoid = hobject_t(object_t(oid.name), + oid.locator, // key + oid.snap, + 0, // hash + pool, + oid.nspace); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +string last_object_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0xffffffff, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +string first_snap_key(int64_t pool) +{ + // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for + // the representing the minimal and maximum keys. and this relies on how + // hobject_t::to_str() works: hex(pool).hex(revhash). 
+ auto hoid = hobject_t(object_t(), + "", + 0, + 0x00000000, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} + +string to_snap_key(int64_t pool, const librados::object_id_t& oid) +{ + auto hoid = hobject_t(object_t(oid.name), + oid.locator, // key + oid.snap, + 0x77777777, // hash + pool, + oid.nspace); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} + +string last_snap_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0xffffffff, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} +} + +namespace Scrub { + +Store* +Store::create(ObjectStore* store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll) +{ + ceph_assert(store); + ceph_assert(t); + ghobject_t oid = make_scrub_object(pgid); + t->touch(coll, oid); + return new Store{coll, oid, store}; +} + +Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) + : coll(coll), + hoid(oid), + driver(store, coll, hoid), + backend(&driver) +{} + +Store::~Store() +{ + ceph_assert(results.empty()); +} + +void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) +{ + bufferlist bl; + e.encode(bl); + results[to_object_key(pool, e.object)] = bl; +} + +void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) +{ + bufferlist bl; + e.encode(bl); + results[to_snap_key(pool, e.object)] = bl; +} + +bool Store::empty() const +{ + return results.empty(); +} + +void Store::flush(ObjectStore::Transaction* t) +{ + if (t) { + OSDriver::OSTransaction txn = driver.get_transaction(t); + backend.set_keys(results, &txn); + } + results.clear(); +} + +void Store::cleanup(ObjectStore::Transaction* t) +{ + t->remove(coll, hoid); +} + +std::vector<bufferlist> +Store::get_snap_errors(ObjectStore* store, + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) +{ + const string begin = (start.name.empty() ? + first_snap_key(pool) : to_snap_key(pool, start)); + const string end = last_snap_key(pool); + return get_errors(store, begin, end, max_return); +} + +std::vector<bufferlist> +Store::get_object_errors(ObjectStore* store, + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) +{ + const string begin = (start.name.empty() ? 
+ first_object_key(pool) : to_object_key(pool, start)); + const string end = last_object_key(pool); + return get_errors(store, begin, end, max_return); +} + +std::vector<bufferlist> +Store::get_errors(ObjectStore* store, + const string& begin, + const string& end, + uint64_t max_return) +{ + vector<bufferlist> errors; + auto next = std::make_pair(begin, bufferlist{}); + while (max_return && !backend.get_next(next.first, &next)) { + if (next.first >= end) + break; + errors.push_back(next.second); + max_return--; + } + return errors; +} + +} // namespace Scrub diff --git a/src/osd/ScrubStore.h b/src/osd/ScrubStore.h new file mode 100644 index 00000000..39c7da67 --- /dev/null +++ b/src/osd/ScrubStore.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_SCRUB_RESULT_H +#define CEPH_SCRUB_RESULT_H + +#include "SnapMapper.h" // for OSDriver +#include "common/map_cacher.hpp" + +namespace librados { + struct object_id_t; +} + +struct inconsistent_obj_wrapper; +struct inconsistent_snapset_wrapper; + +namespace Scrub { + +class Store { +public: + ~Store(); + static Store* create(ObjectStore* store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); + void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); + bool empty() const; + void flush(ObjectStore::Transaction *); + void cleanup(ObjectStore::Transaction *); + std::vector<bufferlist> get_snap_errors(ObjectStore* store, + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return); + std::vector<bufferlist> get_object_errors(ObjectStore* store, + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return); +private: + Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); + std::vector<bufferlist> get_errors(ObjectStore* store, + const string& start, const string& end, + uint64_t max_return); +private: + const coll_t coll; + const ghobject_t hoid; + // a temp object holding mappings from seq-id to inconsistencies found in + // scrubbing + OSDriver driver; + MapCacher::MapCacher<std::string, bufferlist> backend; + map<string, bufferlist> results; +}; +} + +#endif // CEPH_SCRUB_RESULT_H diff --git a/src/osd/Session.cc b/src/osd/Session.cc new file mode 100644 index 00000000..44b5817a --- /dev/null +++ b/src/osd/Session.cc @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PG.h" +#include "Session.h" + +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd + +void Session::clear_backoffs() +{ + map<spg_t,map<hobject_t,set<BackoffRef>>> ls; + { + std::lock_guard l(backoff_lock); + ls.swap(backoffs); + backoff_count = 0; + } + for (auto& i : ls) { + for (auto& p : i.second) { + for (auto& b : p.second) { + std::lock_guard l(b->lock); + if (b->pg) { + ceph_assert(b->session == this); + ceph_assert(b->is_new() || b->is_acked()); + b->pg->rm_backoff(b); + b->pg.reset(); + b->session.reset(); + } else if (b->session) { + ceph_assert(b->session == this); + ceph_assert(b->is_deleting()); + b->session.reset(); + } + } + } + } +} + +void Session::ack_backoff( + CephContext *cct, + spg_t pgid, + uint64_t id, + const hobject_t& begin, + const hobject_t& end) +{ + std::lock_guard l(backoff_lock); + auto p = backoffs.find(pgid); + if (p == backoffs.end()) { + dout(20) << __func__ << " " << pgid << " " << id << 
" [" << begin << "," + << end << ") pg not found" << dendl; + return; + } + auto q = p->second.find(begin); + if (q == p->second.end()) { + dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << "," + << end << ") begin not found" << dendl; + return; + } + for (auto i = q->second.begin(); i != q->second.end(); ++i) { + Backoff *b = (*i).get(); + if (b->id == id) { + if (b->is_new()) { + b->state = Backoff::STATE_ACKED; + dout(20) << __func__ << " now " << *b << dendl; + } else if (b->is_deleting()) { + dout(20) << __func__ << " deleting " << *b << dendl; + q->second.erase(i); + --backoff_count; + } + break; + } + } + if (q->second.empty()) { + dout(20) << __func__ << " clearing begin bin " << q->first << dendl; + p->second.erase(q); + if (p->second.empty()) { + dout(20) << __func__ << " clearing pg bin " << p->first << dendl; + backoffs.erase(p); + } + } + ceph_assert(!backoff_count == backoffs.empty()); +} + +bool Session::check_backoff( + CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m) +{ + BackoffRef b(have_backoff(pgid, oid)); + if (b) { + dout(10) << __func__ << " session " << this << " has backoff " << *b + << " for " << *m << dendl; + ceph_assert(!b->is_acked() || !g_conf()->osd_debug_crash_on_ignored_backoff); + return true; + } + // we may race with ms_handle_reset. it clears session->con before removing + // backoffs, so if we see con is cleared here we have to abort this + // request. + if (!con) { + dout(10) << __func__ << " session " << this << " disconnected" << dendl; + return true; + } + return false; +} diff --git a/src/osd/Session.h b/src/osd/Session.h new file mode 100644 index 00000000..e391200d --- /dev/null +++ b/src/osd/Session.h @@ -0,0 +1,238 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_SESSION_H +#define CEPH_OSD_SESSION_H + +#include "common/RefCountedObj.h" +#include "common/Mutex.h" +#include "global/global_context.h" +#include "include/spinlock.h" +#include "OSDCap.h" +#include "Watch.h" +#include "OSDMap.h" + +//#define PG_DEBUG_REFS + +struct Session; +typedef boost::intrusive_ptr<Session> SessionRef; +struct Backoff; +typedef boost::intrusive_ptr<Backoff> BackoffRef; +class PG; +#ifdef PG_DEBUG_REFS +#include "common/tracked_int_ptr.hpp" +typedef TrackedIntPtr<PG> PGRef; +#else +typedef boost::intrusive_ptr<PG> PGRef; +#endif + +/* + * A Backoff represents one instance of either a PG or an OID + * being plugged at the client. It's refcounted and linked from + * the PG {pg_oid}_backoffs map and from the client Session + * object. + * + * The Backoff has a lock that protects it's internal fields. + * + * The PG has a backoff_lock that protects it's maps to Backoffs. + * This lock is *inside* of Backoff::lock. + * + * The Session has a backoff_lock that protects it's map of pg and + * oid backoffs. This lock is *inside* the Backoff::lock *and* + * PG::backoff_lock. + * + * That's + * + * Backoff::lock + * PG::backoff_lock + * Session::backoff_lock + * + * When the Session goes away, we move our backoff lists aside, + * then we lock each of the Backoffs we + * previously referenced and clear the Session* pointer. 
If the PG + * is still linked, we unlink it, too. + * + * When the PG clears the backoff, it will send an unblock message + * if the Session* is still non-null, and unlink the session. + * + */ + +struct Backoff : public RefCountedObject { + enum { + STATE_NEW = 1, ///< backoff in flight to client + STATE_ACKED = 2, ///< backoff acked + STATE_DELETING = 3 ///< backoff deleted, but un-acked + }; + std::atomic<int> state = {STATE_NEW}; + spg_t pgid; ///< owning pgid + uint64_t id = 0; ///< unique id (within the Session) + + bool is_new() const { + return state.load() == STATE_NEW; + } + bool is_acked() const { + return state.load() == STATE_ACKED; + } + bool is_deleting() const { + return state.load() == STATE_DELETING; + } + const char *get_state_name() const { + switch (state.load()) { + case STATE_NEW: return "new"; + case STATE_ACKED: return "acked"; + case STATE_DELETING: return "deleting"; + default: return "???"; + } + } + + Mutex lock; + // NOTE: the owning PG and session are either + // - *both* set, or + // - both null (teardown), or + // - only session is set (and state == DELETING) + PGRef pg; ///< owning pg + SessionRef session; ///< owning session + hobject_t begin, end; ///< [) range to block, unless ==, then single obj + + Backoff(spg_t pgid, PGRef pg, SessionRef s, + uint64_t i, + const hobject_t& b, const hobject_t& e) + : RefCountedObject(g_ceph_context, 0), + pgid(pgid), + id(i), + lock("Backoff::lock"), + pg(pg), + session(s), + begin(b), + end(e) {} + + friend ostream& operator<<(ostream& out, const Backoff& b) { + return out << "Backoff(" << &b << " " << b.pgid << " " << b.id + << " " << b.get_state_name() + << " [" << b.begin << "," << b.end << ") " + << " session " << b.session + << " pg " << b.pg << ")"; + } +}; + + + +struct Session : public RefCountedObject { + EntityName entity_name; + OSDCap caps; + ConnectionRef con; + entity_addr_t socket_addr; + WatchConState wstate; + + Mutex session_dispatch_lock; + boost::intrusive::list<OpRequest> waiting_on_map; + + ceph::spinlock sent_epoch_lock; + epoch_t last_sent_epoch; + + /// protects backoffs; orders inside Backoff::lock *and* PG::backoff_lock + Mutex backoff_lock; + std::atomic<int> backoff_count= {0}; ///< simple count of backoffs + map<spg_t,map<hobject_t,set<BackoffRef>>> backoffs; + + std::atomic<uint64_t> backoff_seq = {0}; + + explicit Session(CephContext *cct, Connection *con_) : + RefCountedObject(cct), + con(con_), + socket_addr(con_->get_peer_socket_addr()), + wstate(cct), + session_dispatch_lock("Session::session_dispatch_lock"), + last_sent_epoch(0), + backoff_lock("Session::backoff_lock") + {} + + entity_addr_t& get_peer_socket_addr() { + return socket_addr; + } + + void ack_backoff( + CephContext *cct, + spg_t pgid, + uint64_t id, + const hobject_t& start, + const hobject_t& end); + + BackoffRef have_backoff(spg_t pgid, const hobject_t& oid) { + if (!backoff_count.load()) { + return nullptr; + } + std::lock_guard l(backoff_lock); + ceph_assert(!backoff_count == backoffs.empty()); + auto i = backoffs.find(pgid); + if (i == backoffs.end()) { + return nullptr; + } + auto p = i->second.lower_bound(oid); + if (p != i->second.begin() && + (p == i->second.end() || p->first > oid)) { + --p; + } + if (p != i->second.end()) { + int r = cmp(oid, p->first); + if (r == 0 || r > 0) { + for (auto& q : p->second) { + if (r == 0 || oid < q->end) { + return &(*q); + } + } + } + } + return nullptr; + } + + bool check_backoff( + CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m); + + void 
add_backoff(BackoffRef b) { + std::lock_guard l(backoff_lock); + ceph_assert(!backoff_count == backoffs.empty()); + backoffs[b->pgid][b->begin].insert(b); + ++backoff_count; + } + + // called by PG::release_*_backoffs and PG::clear_backoffs() + void rm_backoff(BackoffRef b) { + std::lock_guard l(backoff_lock); + ceph_assert(b->lock.is_locked_by_me()); + ceph_assert(b->session == this); + auto i = backoffs.find(b->pgid); + if (i != backoffs.end()) { + // may race with clear_backoffs() + auto p = i->second.find(b->begin); + if (p != i->second.end()) { + auto q = p->second.find(b); + if (q != p->second.end()) { + p->second.erase(q); + --backoff_count; + if (p->second.empty()) { + i->second.erase(p); + if (i->second.empty()) { + backoffs.erase(i); + } + } + } + } + } + ceph_assert(!backoff_count == backoffs.empty()); + } + void clear_backoffs(); +}; + +#endif diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc new file mode 100644 index 00000000..4c82d5a3 --- /dev/null +++ b/src/osd/SnapMapper.cc @@ -0,0 +1,385 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "SnapMapper.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout << "snap_mapper." + +using std::string; + +const string SnapMapper::MAPPING_PREFIX = "MAP_"; +const string SnapMapper::OBJECT_PREFIX = "OBJ_"; + +int OSDriver::get_keys( + const std::set<std::string> &keys, + std::map<std::string, bufferlist> *out) +{ + return os->omap_get_values(ch, hoid, keys, out); +} + +int OSDriver::get_next( + const std::string &key, + pair<std::string, bufferlist> *next) +{ + ObjectMap::ObjectMapIterator iter = + os->get_omap_iterator(ch, hoid); + if (!iter) { + ceph_abort(); + return -EINVAL; + } + iter->upper_bound(key); + if (iter->valid()) { + if (next) + *next = make_pair(iter->key(), iter->value()); + return 0; + } else { + return -ENOENT; + } +} + +struct Mapping { + snapid_t snap; + hobject_t hoid; + explicit Mapping(const pair<snapid_t, hobject_t> &in) + : snap(in.first), hoid(in.second) {} + Mapping() : snap(0) {} + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + encode(snap, bl); + encode(hoid, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &bl) { + DECODE_START(1, bl); + decode(snap, bl); + decode(hoid, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(Mapping) + +string SnapMapper::get_prefix(snapid_t snap) +{ + char buf[100]; + int len = snprintf( + buf, sizeof(buf), + "%.*X_", (int)(sizeof(snap)*2), + static_cast<unsigned>(snap)); + return MAPPING_PREFIX + string(buf, len); +} + +string SnapMapper::to_raw_key( + const pair<snapid_t, hobject_t> &in) +{ + return get_prefix(in.first) + shard_prefix + in.second.to_str(); +} + +pair<string, bufferlist> SnapMapper::to_raw( + const pair<snapid_t, hobject_t> &in) +{ + bufferlist bl; + encode(Mapping(in), bl); + return make_pair( + to_raw_key(in), + bl); +} + +pair<snapid_t, hobject_t> SnapMapper::from_raw( + const pair<std::string, bufferlist> &image) +{ + Mapping map; + bufferlist bl(image.second); + auto bp = bl.cbegin(); + decode(map, bp); + return make_pair(map.snap, map.hoid); +} + 
+bool SnapMapper::is_mapping(const string &to_test) +{ + return to_test.substr(0, MAPPING_PREFIX.size()) == MAPPING_PREFIX; +} + +string SnapMapper::to_object_key(const hobject_t &hoid) +{ + return OBJECT_PREFIX + shard_prefix + hoid.to_str(); +} + +void SnapMapper::object_snaps::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(oid, bl); + encode(snaps, bl); + ENCODE_FINISH(bl); +} + +void SnapMapper::object_snaps::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(oid, bl); + decode(snaps, bl); + DECODE_FINISH(bl); +} + +bool SnapMapper::check(const hobject_t &hoid) const +{ + if (hoid.match(mask_bits, match)) { + return true; + } + derr << __func__ << " " << hoid << " mask_bits " << mask_bits + << " match 0x" << std::hex << match << std::dec << " is false" + << dendl; + return false; +} + +int SnapMapper::get_snaps( + const hobject_t &oid, + object_snaps *out) +{ + ceph_assert(check(oid)); + set<string> keys; + map<string, bufferlist> got; + keys.insert(to_object_key(oid)); + int r = backend.get_keys(keys, &got); + if (r < 0) { + dout(20) << __func__ << " " << oid << " got err " << r << dendl; + return r; + } + if (got.empty()) { + dout(20) << __func__ << " " << oid << " got.empty()" << dendl; + return -ENOENT; + } + if (out) { + auto bp = got.begin()->second.cbegin(); + decode(*out, bp); + dout(20) << __func__ << " " << oid << " " << out->snaps << dendl; + if (out->snaps.empty()) { + dout(1) << __func__ << " " << oid << " empty snapset" << dendl; + ceph_assert(!cct->_conf->osd_debug_verify_snaps); + } + } else { + dout(20) << __func__ << " " << oid << " (out == NULL)" << dendl; + } + return 0; +} + +void SnapMapper::clear_snaps( + const hobject_t &oid, + MapCacher::Transaction<std::string, bufferlist> *t) +{ + dout(20) << __func__ << " " << oid << dendl; + ceph_assert(check(oid)); + set<string> to_remove; + to_remove.insert(to_object_key(oid)); + if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) { + for (auto& i : to_remove) { + dout(20) << __func__ << " rm " << i << dendl; + } + } + backend.remove_keys(to_remove, t); +} + +void SnapMapper::set_snaps( + const hobject_t &oid, + const object_snaps &in, + MapCacher::Transaction<std::string, bufferlist> *t) +{ + ceph_assert(check(oid)); + map<string, bufferlist> to_set; + bufferlist bl; + encode(in, bl); + to_set[to_object_key(oid)] = bl; + dout(20) << __func__ << " " << oid << " " << in.snaps << dendl; + if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) { + for (auto& i : to_set) { + dout(20) << __func__ << " set " << i.first << dendl; + } + } + backend.set_keys(to_set, t); +} + +int SnapMapper::update_snaps( + const hobject_t &oid, + const set<snapid_t> &new_snaps, + const set<snapid_t> *old_snaps_check, + MapCacher::Transaction<std::string, bufferlist> *t) +{ + dout(20) << __func__ << " " << oid << " " << new_snaps + << " was " << (old_snaps_check ? 
*old_snaps_check : set<snapid_t>()) + << dendl; + ceph_assert(check(oid)); + if (new_snaps.empty()) + return remove_oid(oid, t); + + object_snaps out; + int r = get_snaps(oid, &out); + // Tolerate missing keys but not disk errors + if (r < 0 && r != -ENOENT) + return r; + if (old_snaps_check) + ceph_assert(out.snaps == *old_snaps_check); + + object_snaps in(oid, new_snaps); + set_snaps(oid, in, t); + + set<string> to_remove; + for (set<snapid_t>::iterator i = out.snaps.begin(); + i != out.snaps.end(); + ++i) { + if (!new_snaps.count(*i)) { + to_remove.insert(to_raw_key(make_pair(*i, oid))); + } + } + if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) { + for (auto& i : to_remove) { + dout(20) << __func__ << " rm " << i << dendl; + } + } + backend.remove_keys(to_remove, t); + return 0; +} + +void SnapMapper::add_oid( + const hobject_t &oid, + const set<snapid_t>& snaps, + MapCacher::Transaction<std::string, bufferlist> *t) +{ + dout(20) << __func__ << " " << oid << " " << snaps << dendl; + ceph_assert(!snaps.empty()); + ceph_assert(check(oid)); + { + object_snaps out; + int r = get_snaps(oid, &out); + if (r != -ENOENT) { + derr << __func__ << " found existing snaps mapped on " << oid + << ", removing" << dendl; + ceph_assert(!cct->_conf->osd_debug_verify_snaps); + remove_oid(oid, t); + } + } + + object_snaps _snaps(oid, snaps); + set_snaps(oid, _snaps, t); + + map<string, bufferlist> to_add; + for (set<snapid_t>::iterator i = snaps.begin(); + i != snaps.end(); + ++i) { + to_add.insert(to_raw(make_pair(*i, oid))); + } + if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) { + for (auto& i : to_add) { + dout(20) << __func__ << " set " << i.first << dendl; + } + } + backend.set_keys(to_add, t); +} + +int SnapMapper::get_next_objects_to_trim( + snapid_t snap, + unsigned max, + vector<hobject_t> *out) +{ + ceph_assert(out); + ceph_assert(out->empty()); + int r = 0; + for (set<string>::iterator i = prefixes.begin(); + i != prefixes.end() && out->size() < max && r == 0; + ++i) { + string prefix(get_prefix(snap) + *i); + string pos = prefix; + while (out->size() < max) { + pair<string, bufferlist> next; + r = backend.get_next(pos, &next); + dout(20) << __func__ << " get_next(" << pos << ") returns " << r + << " " << next << dendl; + if (r != 0) { + break; // Done + } + + if (next.first.substr(0, prefix.size()) != + prefix) { + break; // Done with this prefix + } + + ceph_assert(is_mapping(next.first)); + + dout(20) << __func__ << " " << next.first << dendl; + pair<snapid_t, hobject_t> next_decoded(from_raw(next)); + ceph_assert(next_decoded.first == snap); + ceph_assert(check(next_decoded.second)); + + out->push_back(next_decoded.second); + pos = next.first; + } + } + if (out->size() == 0) { + return -ENOENT; + } else { + return 0; + } +} + + +int SnapMapper::remove_oid( + const hobject_t &oid, + MapCacher::Transaction<std::string, bufferlist> *t) +{ + dout(20) << __func__ << " " << oid << dendl; + ceph_assert(check(oid)); + return _remove_oid(oid, t); +} + +int SnapMapper::_remove_oid( + const hobject_t &oid, + MapCacher::Transaction<std::string, bufferlist> *t) +{ + dout(20) << __func__ << " " << oid << dendl; + object_snaps out; + int r = get_snaps(oid, &out); + if (r < 0) + return r; + + clear_snaps(oid, t); + + set<string> to_remove; + for (set<snapid_t>::iterator i = out.snaps.begin(); + i != out.snaps.end(); + ++i) { + to_remove.insert(to_raw_key(make_pair(*i, oid))); + } + if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) { + for (auto& i : to_remove) { + dout(20) << 
__func__ << " rm " << i << dendl; + } + } + backend.remove_keys(to_remove, t); + return 0; +} + +int SnapMapper::get_snaps( + const hobject_t &oid, + std::set<snapid_t> *snaps) +{ + ceph_assert(check(oid)); + object_snaps out; + int r = get_snaps(oid, &out); + if (r < 0) + return r; + if (snaps) + snaps->swap(out.snaps); + return 0; +} diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h new file mode 100644 index 00000000..21157ef2 --- /dev/null +++ b/src/osd/SnapMapper.h @@ -0,0 +1,236 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef SNAPMAPPER_H +#define SNAPMAPPER_H + +#include <string> +#include <set> +#include <utility> +#include <string.h> + +#include "common/map_cacher.hpp" +#include "common/hobject.h" +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/object.h" +#include "os/ObjectStore.h" + +class OSDriver : public MapCacher::StoreDriver<std::string, bufferlist> { + ObjectStore *os; + ObjectStore::CollectionHandle ch; + ghobject_t hoid; + +public: + class OSTransaction : public MapCacher::Transaction<std::string, bufferlist> { + friend class OSDriver; + coll_t cid; + ghobject_t hoid; + ObjectStore::Transaction *t; + OSTransaction( + const coll_t &cid, + const ghobject_t &hoid, + ObjectStore::Transaction *t) + : cid(cid), hoid(hoid), t(t) {} + public: + void set_keys( + const std::map<std::string, bufferlist> &to_set) override { + t->omap_setkeys(cid, hoid, to_set); + } + void remove_keys( + const std::set<std::string> &to_remove) override { + t->omap_rmkeys(cid, hoid, to_remove); + } + void add_callback( + Context *c) override { + t->register_on_applied(c); + } + }; + + OSTransaction get_transaction( + ObjectStore::Transaction *t) { + return OSTransaction(ch->cid, hoid, t); + } + + OSDriver(ObjectStore *os, const coll_t& cid, const ghobject_t &hoid) : + os(os), + hoid(hoid) { + ch = os->open_collection(cid); + } + int get_keys( + const std::set<std::string> &keys, + std::map<std::string, bufferlist> *out) override; + int get_next( + const std::string &key, + pair<std::string, bufferlist> *next) override; +}; + +/** + * SnapMapper + * + * Manages two mappings: + * 1) hobject_t -> {snapid} + * 2) snapid -> {hobject_t} + * + * We accomplish this using two sets of keys: + * 1) OBJECT_PREFIX + obj.str() -> encoding of object_snaps + * 2) MAPPING_PREFIX + snapid_t + obj.str() -> encoding of pair<snapid_t, obj> + * + * The on disk strings and encodings are implemented in to_raw, to_raw_key, + * from_raw, to_object_key. + * + * The object -> {snapid} mapping is primarily included so that the + * SnapMapper state can be verified against the external PG state during + * scrub etc. + * + * The 2) mapping is arranged such that all objects in a particular + * snap will sort together, and so that all objects in a pg for a + * particular snap will group under up to 8 prefixes. 
+ */ +class SnapMapper { +public: + CephContext* cct; + struct object_snaps { + hobject_t oid; + std::set<snapid_t> snaps; + object_snaps(hobject_t oid, const std::set<snapid_t> &snaps) + : oid(oid), snaps(snaps) {} + object_snaps() {} + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bp); + }; + +private: + MapCacher::MapCacher<std::string, bufferlist> backend; + + static const std::string MAPPING_PREFIX; + static const std::string OBJECT_PREFIX; + + static std::string get_prefix(snapid_t snap); + + std::string to_raw_key( + const std::pair<snapid_t, hobject_t> &to_map); + + std::pair<std::string, bufferlist> to_raw( + const std::pair<snapid_t, hobject_t> &to_map); + + static bool is_mapping(const std::string &to_test); + + std::pair<snapid_t, hobject_t> from_raw( + const std::pair<std::string, bufferlist> &image); + + std::string to_object_key(const hobject_t &hoid); + + int get_snaps(const hobject_t &oid, object_snaps *out); + + void set_snaps( + const hobject_t &oid, + const object_snaps &out, + MapCacher::Transaction<std::string, bufferlist> *t); + + void clear_snaps( + const hobject_t &oid, + MapCacher::Transaction<std::string, bufferlist> *t); + + // True if hoid belongs in this mapping based on mask_bits and match + bool check(const hobject_t &hoid) const; + + int _remove_oid( + const hobject_t &oid, ///< [in] oid to remove + MapCacher::Transaction<std::string, bufferlist> *t ///< [out] transaction + ); + +public: + static string make_shard_prefix(shard_id_t shard) { + if (shard == shard_id_t::NO_SHARD) + return string(); + char buf[20]; + int r = snprintf(buf, sizeof(buf), ".%x", (int)shard); + ceph_assert(r < (int)sizeof(buf)); + return string(buf, r) + '_'; + } + uint32_t mask_bits; + const uint32_t match; + string last_key_checked; + const int64_t pool; + const shard_id_t shard; + const string shard_prefix; + SnapMapper( + CephContext* cct, + MapCacher::StoreDriver<std::string, bufferlist> *driver, + uint32_t match, ///< [in] pgid + uint32_t bits, ///< [in] current split bits + int64_t pool, ///< [in] pool + shard_id_t shard ///< [in] shard + ) + : cct(cct), backend(driver), mask_bits(bits), match(match), pool(pool), + shard(shard), shard_prefix(make_shard_prefix(shard)) { + update_bits(mask_bits); + } + + set<string> prefixes; + /// Update bits in case of pg split or merge + void update_bits( + uint32_t new_bits ///< [in] new split bits + ) { + mask_bits = new_bits; + set<string> _prefixes = hobject_t::get_prefixes( + mask_bits, + match, + pool); + prefixes.clear(); + for (set<string>::iterator i = _prefixes.begin(); + i != _prefixes.end(); + ++i) { + prefixes.insert(shard_prefix + *i); + } + } + + /// Update snaps for oid, empty new_snaps removes the mapping + int update_snaps( + const hobject_t &oid, ///< [in] oid to update + const std::set<snapid_t> &new_snaps, ///< [in] new snap set + const std::set<snapid_t> *old_snaps, ///< [in] old snaps (for debugging) + MapCacher::Transaction<std::string, bufferlist> *t ///< [out] transaction + ); ///@ return error, 0 on success + + /// Add mapping for oid, must not already be mapped + void add_oid( + const hobject_t &oid, ///< [in] oid to add + const std::set<snapid_t>& new_snaps, ///< [in] snaps + MapCacher::Transaction<std::string, bufferlist> *t ///< [out] transaction + ); + + /// Returns first object with snap as a snap + int get_next_objects_to_trim( + snapid_t snap, ///< [in] snap to check + unsigned max, ///< [in] max to get + vector<hobject_t> *out ///< [out] next objects to trim (must be empty) + 
); ///< @return error, -ENOENT if no more objects + + /// Remove mapping for oid + int remove_oid( + const hobject_t &oid, ///< [in] oid to remove + MapCacher::Transaction<std::string, bufferlist> *t ///< [out] transaction + ); ///< @return error, -ENOENT if the object is not mapped + + /// Get snaps for oid + int get_snaps( + const hobject_t &oid, ///< [in] oid to get snaps for + std::set<snapid_t> *snaps ///< [out] snaps + ); ///< @return error, -ENOENT if oid is not recorded +}; +WRITE_CLASS_ENCODER(SnapMapper::object_snaps) + +#endif diff --git a/src/osd/TierAgentState.h b/src/osd/TierAgentState.h new file mode 100644 index 00000000..2c58534b --- /dev/null +++ b/src/osd/TierAgentState.h @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Sage Weil <sage@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_TIERAGENT_H +#define CEPH_OSD_TIERAGENT_H + +struct TierAgentState { + /// current position iterating across pool + hobject_t position; + /// Count of agent_work since "start" position of object hash space + int started; + hobject_t start; + bool delaying; + + /// histogram of ages we've encountered + pow2_hist_t temp_hist; + int hist_age; + + /// past HitSet(s) (not current) + map<time_t,HitSetRef> hit_set_map; + + /// a few recent things we've seen that are clean + list<hobject_t> recent_clean; + + enum flush_mode_t { + FLUSH_MODE_IDLE, // nothing to flush + FLUSH_MODE_LOW, // flush dirty objects with a low speed + FLUSH_MODE_HIGH, //flush dirty objects with a high speed + } flush_mode; ///< current flush behavior + static const char *get_flush_mode_name(flush_mode_t m) { + switch (m) { + case FLUSH_MODE_IDLE: return "idle"; + case FLUSH_MODE_LOW: return "low"; + case FLUSH_MODE_HIGH: return "high"; + default: ceph_abort_msg("bad flush mode"); + } + } + const char *get_flush_mode_name() const { + return get_flush_mode_name(flush_mode); + } + + enum evict_mode_t { + EVICT_MODE_IDLE, // no need to evict anything + EVICT_MODE_SOME, // evict some things as we are near the target + EVICT_MODE_FULL, // evict anything + } evict_mode; ///< current evict behavior + static const char *get_evict_mode_name(evict_mode_t m) { + switch (m) { + case EVICT_MODE_IDLE: return "idle"; + case EVICT_MODE_SOME: return "some"; + case EVICT_MODE_FULL: return "full"; + default: ceph_abort_msg("bad evict mode"); + } + } + const char *get_evict_mode_name() const { + return get_evict_mode_name(evict_mode); + } + + /// approximate ratio of objects (assuming they are uniformly + /// distributed) that i should aim to evict. 
+ unsigned evict_effort; + + TierAgentState() + : started(0), + delaying(false), + hist_age(0), + flush_mode(FLUSH_MODE_IDLE), + evict_mode(EVICT_MODE_IDLE), + evict_effort(0) + {} + + /// false if we have any work to do + bool is_idle() const { + return + delaying || + (flush_mode == FLUSH_MODE_IDLE && + evict_mode == EVICT_MODE_IDLE); + } + + /// add archived HitSet + void add_hit_set(time_t start, HitSetRef hs) { + hit_set_map.insert(make_pair(start, hs)); + } + + /// remove old/trimmed HitSet + void remove_oldest_hit_set() { + if (!hit_set_map.empty()) + hit_set_map.erase(hit_set_map.begin()); + } + + /// discard all open hit sets + void discard_hit_sets() { + hit_set_map.clear(); + } + + void dump(Formatter *f) const { + f->dump_string("flush_mode", get_flush_mode_name()); + f->dump_string("evict_mode", get_evict_mode_name()); + f->dump_unsigned("evict_effort", evict_effort); + f->dump_stream("position") << position; + f->open_object_section("temp_hist"); + temp_hist.dump(f); + f->close_section(); + } +}; + +#endif diff --git a/src/osd/Watch.cc b/src/osd/Watch.cc new file mode 100644 index 00000000..bb25b448 --- /dev/null +++ b/src/osd/Watch.cc @@ -0,0 +1,538 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +#include "PG.h" + +#include "include/types.h" +#include "messages/MWatchNotify.h" + +#include <map> + +#include "OSD.h" +#include "PrimaryLogPG.h" +#include "Watch.h" +#include "Session.h" + +#include "common/config.h" + +struct CancelableContext : public Context { + virtual void cancel() = 0; +}; + +#define dout_context osd->cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +static ostream& _prefix( + std::ostream* _dout, + Notify *notify) { + return notify->gen_dbg_prefix(*_dout); +} + +Notify::Notify( + ConnectionRef client, + uint64_t client_gid, + bufferlist &payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd) + : client(client), client_gid(client_gid), + complete(false), + discarded(false), + timed_out(false), + payload(payload), + timeout(timeout), + cookie(cookie), + notify_id(notify_id), + version(version), + osd(osd), + cb(NULL), + lock("Notify::lock") {} + +NotifyRef Notify::makeNotifyRef( + ConnectionRef client, + uint64_t client_gid, + bufferlist &payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd) { + NotifyRef ret( + new Notify( + client, client_gid, + payload, timeout, + cookie, notify_id, + version, osd)); + ret->set_self(ret); + return ret; +} + +class NotifyTimeoutCB : public CancelableContext { + NotifyRef notif; + bool canceled; // protected by notif lock +public: + explicit NotifyTimeoutCB(NotifyRef notif) : notif(notif), canceled(false) {} + void finish(int) override { + notif->osd->watch_lock.Unlock(); + notif->lock.Lock(); + if (!canceled) + notif->do_timeout(); // drops lock + else + notif->lock.Unlock(); + notif->osd->watch_lock.Lock(); + } + void cancel() override { + ceph_assert(notif->lock.is_locked_by_me()); + canceled = true; + } +}; + +void Notify::do_timeout() +{ + ceph_assert(lock.is_locked_by_me()); + dout(10) << "timeout" << dendl; + cb = nullptr; + if (is_discarded()) { + lock.Unlock(); + return; + } + + timed_out = true; // we will send the client an error code + maybe_complete_notify(); + ceph_assert(complete); + set<WatchRef> _watchers; + _watchers.swap(watchers); + lock.Unlock(); + + for (set<WatchRef>::iterator i = _watchers.begin(); + i != 
_watchers.end(); + ++i) { + boost::intrusive_ptr<PrimaryLogPG> pg((*i)->get_pg()); + pg->lock(); + if (!(*i)->is_discarded()) { + (*i)->cancel_notify(self.lock()); + } + pg->unlock(); + } +} + +void Notify::register_cb() +{ + ceph_assert(lock.is_locked_by_me()); + { + osd->watch_lock.Lock(); + cb = new NotifyTimeoutCB(self.lock()); + if (!osd->watch_timer.add_event_after(timeout, cb)) { + cb = nullptr; + } + osd->watch_lock.Unlock(); + } +} + +void Notify::unregister_cb() +{ + ceph_assert(lock.is_locked_by_me()); + if (!cb) + return; + cb->cancel(); + { + osd->watch_lock.Lock(); + osd->watch_timer.cancel_event(cb); + cb = nullptr; + osd->watch_lock.Unlock(); + } +} + +void Notify::start_watcher(WatchRef watch) +{ + std::lock_guard l(lock); + dout(10) << "start_watcher" << dendl; + watchers.insert(watch); +} + +void Notify::complete_watcher(WatchRef watch, bufferlist& reply_bl) +{ + std::lock_guard l(lock); + dout(10) << "complete_watcher" << dendl; + if (is_discarded()) + return; + ceph_assert(watchers.count(watch)); + watchers.erase(watch); + notify_replies.insert(make_pair(make_pair(watch->get_watcher_gid(), + watch->get_cookie()), + reply_bl)); + maybe_complete_notify(); +} + +void Notify::complete_watcher_remove(WatchRef watch) +{ + std::lock_guard l(lock); + dout(10) << __func__ << dendl; + if (is_discarded()) + return; + ceph_assert(watchers.count(watch)); + watchers.erase(watch); + maybe_complete_notify(); +} + +void Notify::maybe_complete_notify() +{ + dout(10) << "maybe_complete_notify -- " + << watchers.size() + << " in progress watchers " << dendl; + if (watchers.empty() || timed_out) { + // prepare reply + bufferlist bl; + encode(notify_replies, bl); + list<pair<uint64_t,uint64_t> > missed; + for (set<WatchRef>::iterator p = watchers.begin(); p != watchers.end(); ++p) { + missed.push_back(make_pair((*p)->get_watcher_gid(), + (*p)->get_cookie())); + } + encode(missed, bl); + + bufferlist empty; + MWatchNotify *reply(new MWatchNotify(cookie, version, notify_id, + CEPH_WATCH_EVENT_NOTIFY_COMPLETE, empty)); + reply->notifier_gid = client_gid; + reply->set_data(bl); + if (timed_out) + reply->return_code = -ETIMEDOUT; + client->send_message(reply); + unregister_cb(); + + complete = true; + } +} + +void Notify::discard() +{ + std::lock_guard l(lock); + discarded = true; + unregister_cb(); + watchers.clear(); +} + +void Notify::init() +{ + std::lock_guard l(lock); + register_cb(); + maybe_complete_notify(); +} + +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, watch.get()) + +static ostream& _prefix( + std::ostream* _dout, + Watch *watch) { + return watch->gen_dbg_prefix(*_dout); +} + +class HandleWatchTimeout : public CancelableContext { + WatchRef watch; +public: + bool canceled; // protected by watch->pg->lock + explicit HandleWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {} + void cancel() override { + canceled = true; + } + void finish(int) override { ceph_abort(); /* not used */ } + void complete(int) override { + OSDService *osd(watch->osd); + ldout(osd->cct, 10) << "HandleWatchTimeout" << dendl; + boost::intrusive_ptr<PrimaryLogPG> pg(watch->pg); + osd->watch_lock.Unlock(); + pg->lock(); + watch->cb = nullptr; + if (!watch->is_discarded() && !canceled) + watch->pg->handle_watch_timeout(watch); + delete this; // ~Watch requires pg lock! 
+ pg->unlock(); + osd->watch_lock.Lock(); + } +}; + +class HandleDelayedWatchTimeout : public CancelableContext { + WatchRef watch; +public: + bool canceled; + explicit HandleDelayedWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {} + void cancel() override { + canceled = true; + } + void finish(int) override { + OSDService *osd(watch->osd); + dout(10) << "HandleWatchTimeoutDelayed" << dendl; + ceph_assert(watch->pg->is_locked()); + watch->cb = nullptr; + if (!watch->is_discarded() && !canceled) + watch->pg->handle_watch_timeout(watch); + } +}; + +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +std::ostream& Watch::gen_dbg_prefix(std::ostream& out) { + return pg->gen_prefix(out) << " -- Watch(" + << make_pair(cookie, entity) << ") "; +} + +Watch::Watch( + PrimaryLogPG *pg, + OSDService *osd, + ObjectContextRef obc, + uint32_t timeout, + uint64_t cookie, + entity_name_t entity, + const entity_addr_t &addr) + : cb(NULL), + osd(osd), + pg(pg), + obc(obc), + timeout(timeout), + cookie(cookie), + addr(addr), + will_ping(false), + entity(entity), + discarded(false) { + dout(10) << "Watch()" << dendl; +} + +Watch::~Watch() { + dout(10) << "~Watch" << dendl; + // users must have called remove() or discard() prior to this point + ceph_assert(!obc); + ceph_assert(!conn); +} + +bool Watch::connected() { return !!conn; } + +Context *Watch::get_delayed_cb() +{ + ceph_assert(!cb); + cb = new HandleDelayedWatchTimeout(self.lock()); + return cb; +} + +void Watch::register_cb() +{ + std::lock_guard l(osd->watch_lock); + if (cb) { + dout(15) << "re-registering callback, timeout: " << timeout << dendl; + cb->cancel(); + osd->watch_timer.cancel_event(cb); + } else { + dout(15) << "registering callback, timeout: " << timeout << dendl; + } + cb = new HandleWatchTimeout(self.lock()); + if (!osd->watch_timer.add_event_after(timeout, cb)) { + cb = nullptr; + } +} + +void Watch::unregister_cb() +{ + dout(15) << "unregister_cb" << dendl; + if (!cb) + return; + dout(15) << "actually registered, cancelling" << dendl; + cb->cancel(); + { + std::lock_guard l(osd->watch_lock); + osd->watch_timer.cancel_event(cb); // harmless if not registered with timer + } + cb = nullptr; +} + +void Watch::got_ping(utime_t t) +{ + last_ping = t; + if (conn) { + register_cb(); + } +} + +void Watch::connect(ConnectionRef con, bool _will_ping) +{ + if (conn == con) { + dout(10) << __func__ << " con " << con << " - already connected" << dendl; + return; + } + dout(10) << __func__ << " con " << con << dendl; + conn = con; + will_ping = _will_ping; + auto priv = con->get_priv(); + if (priv) { + auto sessionref = static_cast<Session*>(priv.get()); + sessionref->wstate.addWatch(self.lock()); + priv.reset(); + for (map<uint64_t, NotifyRef>::iterator i = in_progress_notifies.begin(); + i != in_progress_notifies.end(); + ++i) { + send_notify(i->second); + } + } + if (will_ping) { + last_ping = ceph_clock_now(); + register_cb(); + } else { + unregister_cb(); + } +} + +void Watch::disconnect() +{ + dout(10) << "disconnect (con was " << conn << ")" << dendl; + conn = ConnectionRef(); + if (!will_ping) + register_cb(); +} + +void Watch::discard() +{ + dout(10) << "discard" << dendl; + for (map<uint64_t, NotifyRef>::iterator i = in_progress_notifies.begin(); + i != in_progress_notifies.end(); + ++i) { + i->second->discard(); + } + discard_state(); +} + +void Watch::discard_state() +{ + ceph_assert(pg->is_locked()); + ceph_assert(!discarded); + ceph_assert(obc); + 
in_progress_notifies.clear(); + unregister_cb(); + discarded = true; + if (conn) { + if (auto priv = conn->get_priv(); priv) { + auto session = static_cast<Session*>(priv.get()); + session->wstate.removeWatch(self.lock()); + } + conn = ConnectionRef(); + } + obc = ObjectContextRef(); +} + +bool Watch::is_discarded() const +{ + return discarded; +} + +void Watch::remove(bool send_disconnect) +{ + dout(10) << "remove" << dendl; + if (send_disconnect && conn) { + bufferlist empty; + MWatchNotify *reply(new MWatchNotify(cookie, 0, 0, + CEPH_WATCH_EVENT_DISCONNECT, empty)); + conn->send_message(reply); + } + for (map<uint64_t, NotifyRef>::iterator i = in_progress_notifies.begin(); + i != in_progress_notifies.end(); + ++i) { + i->second->complete_watcher_remove(self.lock()); + } + discard_state(); +} + +void Watch::start_notify(NotifyRef notif) +{ + ceph_assert(in_progress_notifies.find(notif->notify_id) == + in_progress_notifies.end()); + if (will_ping) { + utime_t cutoff = ceph_clock_now(); + cutoff.sec_ref() -= timeout; + if (last_ping < cutoff) { + dout(10) << __func__ << " " << notif->notify_id + << " last_ping " << last_ping << " < cutoff " << cutoff + << ", disconnecting" << dendl; + disconnect(); + return; + } + } + dout(10) << "start_notify " << notif->notify_id << dendl; + in_progress_notifies[notif->notify_id] = notif; + notif->start_watcher(self.lock()); + if (connected()) + send_notify(notif); +} + +void Watch::cancel_notify(NotifyRef notif) +{ + dout(10) << "cancel_notify " << notif->notify_id << dendl; + in_progress_notifies.erase(notif->notify_id); +} + +void Watch::send_notify(NotifyRef notif) +{ + dout(10) << "send_notify" << dendl; + MWatchNotify *notify_msg = new MWatchNotify( + cookie, notif->version, notif->notify_id, + CEPH_WATCH_EVENT_NOTIFY, notif->payload); + notify_msg->notifier_gid = notif->client_gid; + conn->send_message(notify_msg); +} + +void Watch::notify_ack(uint64_t notify_id, bufferlist& reply_bl) +{ + dout(10) << "notify_ack" << dendl; + map<uint64_t, NotifyRef>::iterator i = in_progress_notifies.find(notify_id); + if (i != in_progress_notifies.end()) { + i->second->complete_watcher(self.lock(), reply_bl); + in_progress_notifies.erase(i); + } +} + +WatchRef Watch::makeWatchRef( + PrimaryLogPG *pg, OSDService *osd, + ObjectContextRef obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t& addr) +{ + WatchRef ret(new Watch(pg, osd, obc, timeout, cookie, entity, addr)); + ret->set_self(ret); + return ret; +} + +void WatchConState::addWatch(WatchRef watch) +{ + std::lock_guard l(lock); + watches.insert(watch); +} + +void WatchConState::removeWatch(WatchRef watch) +{ + std::lock_guard l(lock); + watches.erase(watch); +} + +void WatchConState::reset(Connection *con) +{ + set<WatchRef> _watches; + { + std::lock_guard l(lock); + _watches.swap(watches); + } + for (set<WatchRef>::iterator i = _watches.begin(); + i != _watches.end(); + ++i) { + boost::intrusive_ptr<PrimaryLogPG> pg((*i)->get_pg()); + pg->lock(); + if (!(*i)->is_discarded()) { + if ((*i)->is_connected(con)) { + (*i)->disconnect(); + } else { + lgeneric_derr(cct) << __func__ << " not still connected to " << (*i) << dendl; + } + } + pg->unlock(); + } +} diff --git a/src/osd/Watch.h b/src/osd/Watch.h new file mode 100644 index 00000000..65408c27 --- /dev/null +++ b/src/osd/Watch.h @@ -0,0 +1,293 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage 
Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_WATCH_H +#define CEPH_WATCH_H + +#include <set> +#include "msg/Connection.h" +#include "include/Context.h" + +enum WatcherState { + WATCHER_PENDING, + WATCHER_NOTIFIED, +}; + +class OSDService; +class PrimaryLogPG; +void intrusive_ptr_add_ref(PrimaryLogPG *pg); +void intrusive_ptr_release(PrimaryLogPG *pg); +struct ObjectContext; +class MWatchNotify; + +class Watch; +typedef std::shared_ptr<Watch> WatchRef; +typedef std::weak_ptr<Watch> WWatchRef; + +class Notify; +typedef std::shared_ptr<Notify> NotifyRef; +typedef std::weak_ptr<Notify> WNotifyRef; + +struct CancelableContext; + +/** + * Notify tracks the progress of a particular notify + * + * References are held by Watch and the timeout callback. + */ +class Notify { + friend class NotifyTimeoutCB; + friend class Watch; + WNotifyRef self; + ConnectionRef client; + uint64_t client_gid; + bool complete; + bool discarded; + bool timed_out; ///< true if the notify timed out + set<WatchRef> watchers; + + bufferlist payload; + uint32_t timeout; + uint64_t cookie; + uint64_t notify_id; + uint64_t version; + + OSDService *osd; + CancelableContext *cb; + Mutex lock; + + /// (gid,cookie) -> reply_bl for everyone who acked the notify + multimap<pair<uint64_t,uint64_t>,bufferlist> notify_replies; + + /// true if this notify is being discarded + bool is_discarded() { + return discarded || complete; + } + + /// Sends notify completion if watchers.empty() or timeout + void maybe_complete_notify(); + + /// Called on Notify timeout + void do_timeout(); + + Notify( + ConnectionRef client, + uint64_t client_gid, + bufferlist &payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd); + + /// registers a timeout callback with the watch_timer + void register_cb(); + + /// removes the timeout callback, called on completion or cancellation + void unregister_cb(); +public: + + std::ostream& gen_dbg_prefix(std::ostream& out) { + return out << "Notify(" << make_pair(cookie, notify_id) << " " + << " watchers=" << watchers.size() + << ") "; + } + void set_self(NotifyRef _self) { + self = _self; + } + static NotifyRef makeNotifyRef( + ConnectionRef client, + uint64_t client_gid, + bufferlist &payload, + uint32_t timeout, + uint64_t cookie, + uint64_t notify_id, + uint64_t version, + OSDService *osd); + + /// Call after creation to initialize + void init(); + + /// Called once per watcher prior to init() + void start_watcher( + WatchRef watcher ///< [in] watcher to complete + ); + + /// Called once per NotifyAck + void complete_watcher( + WatchRef watcher, ///< [in] watcher to complete + bufferlist& reply_bl ///< [in] reply buffer from the notified watcher + ); + /// Called when a watcher unregisters or times out + void complete_watcher_remove( + WatchRef watcher ///< [in] watcher to complete + ); + + /// Called when the notify is canceled due to a new peering interval + void discard(); +}; + +/** + * Watch is a mapping between a Connection and an ObjectContext + * + * References are held by ObjectContext and the timeout callback + */ +class HandleWatchTimeout; +class HandleDelayedWatchTimeout; +class Watch { + WWatchRef self; + friend class HandleWatchTimeout; + friend class HandleDelayedWatchTimeout; + ConnectionRef conn; + CancelableContext *cb; + + 
OSDService *osd; + boost::intrusive_ptr<PrimaryLogPG> pg; + std::shared_ptr<ObjectContext> obc; + + std::map<uint64_t, NotifyRef> in_progress_notifies; + + // Could have watch_info_t here, but this file includes osd_types.h + uint32_t timeout; ///< timeout in seconds + uint64_t cookie; + entity_addr_t addr; + + bool will_ping; ///< is client new enough to ping the watch + utime_t last_ping; ///< last client ping + + entity_name_t entity; + bool discarded; + + Watch( + PrimaryLogPG *pg, OSDService *osd, + std::shared_ptr<ObjectContext> obc, uint32_t timeout, + uint64_t cookie, entity_name_t entity, + const entity_addr_t& addr); + + /// Registers the timeout callback with watch_timer + void register_cb(); + + /// send a Notify message when connected for notif + void send_notify(NotifyRef notif); + + /// Cleans up state on discard or remove (including Connection state, obc) + void discard_state(); +public: + /// Unregisters the timeout callback + void unregister_cb(); + + /// note receipt of a ping + void got_ping(utime_t t); + utime_t get_last_ping() const { + return last_ping; + } + + bool is_connected() const { + return conn.get() != NULL; + } + bool is_connected(Connection *con) const { + return conn.get() == con; + } + + /// NOTE: must be called with pg lock held + ~Watch(); + + uint64_t get_watcher_gid() const { + return entity.num(); + } + + std::ostream& gen_dbg_prefix(std::ostream& out); + static WatchRef makeWatchRef( + PrimaryLogPG *pg, OSDService *osd, + std::shared_ptr<ObjectContext> obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t &addr); + void set_self(WatchRef _self) { + self = _self; + } + + /// Does not grant a ref count! + boost::intrusive_ptr<PrimaryLogPG> get_pg() { return pg; } + + std::shared_ptr<ObjectContext> get_obc() { return obc; } + + uint64_t get_cookie() const { return cookie; } + entity_name_t get_entity() const { return entity; } + entity_addr_t get_peer_addr() const { return addr; } + uint32_t get_timeout() const { return timeout; } + + /// Generates context for use if watch timeout is delayed by scrub or recovery + Context *get_delayed_cb(); + + /// True if currently connected + bool connected(); + + /// Transitions Watch to connected, unregister_cb, resends pending Notifies + void connect( + ConnectionRef con, ///< [in] Reference to new connection + bool will_ping ///< [in] client is new and will send pings + ); + + /// Transitions watch to disconnected, register_cb + void disconnect(); + + /// Called if Watch state is discarded due to new peering interval + void discard(); + + /// True if removed or discarded + bool is_discarded() const; + + /// Called on unwatch + void remove(bool send_disconnect); + + /// Adds notif as in-progress notify + void start_notify( + NotifyRef notif ///< [in] Reference to new in-progress notify + ); + + /// Removes timed out notify + void cancel_notify( + NotifyRef notif ///< [in] notify which timed out + ); + + /// Call when notify_ack received on notify_id + void notify_ack( + uint64_t notify_id, ///< [in] id of acked notify + bufferlist& reply_bl ///< [in] notify reply buffer + ); +}; + +/** + * Holds weak refs to Watch structures corresponding to a connection + * Lives in the Session object of an OSD connection + */ +class WatchConState { + Mutex lock; + std::set<WatchRef> watches; +public: + CephContext* cct; + explicit WatchConState(CephContext* cct) : lock("WatchConState"), cct(cct) {} + + /// Add a watch + void addWatch( + WatchRef watch ///< [in] Ref to new watch object + ); + + /// 
Remove a watch + void removeWatch( + WatchRef watch ///< [in] Ref to watch object to remove + ); + + /// Called on session reset, disconnects watchers + void reset(Connection *con); +}; + +#endif diff --git a/src/osd/mClockClientQueue.cc b/src/osd/mClockClientQueue.cc new file mode 100644 index 00000000..ae6985fe --- /dev/null +++ b/src/osd/mClockClientQueue.cc @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include <memory> + +#include "osd/mClockClientQueue.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +namespace ceph { + + /* + * class mClockClientQueue + */ + + mClockClientQueue::mClockClientQueue(CephContext *cct) : + queue(std::bind(&mClockClientQueue::op_class_client_info_f, this, _1), + cct->_conf->osd_op_queue_mclock_anticipation_timeout), + client_info_mgr(cct) + { + // empty + } + + const dmc::ClientInfo* mClockClientQueue::op_class_client_info_f( + const mClockClientQueue::InnerClient& client) + { + return client_info_mgr.get_client_info(client.second); + } + + mClockClientQueue::InnerClient + inline mClockClientQueue::get_inner_client(const Client& cl, + const Request& request) { + return InnerClient(cl, client_info_mgr.osd_op_type(request)); + } + + // Formatted output of the queue + inline void mClockClientQueue::dump(ceph::Formatter *f) const { + queue.dump(f); + } + + inline void mClockClientQueue::enqueue_strict(Client cl, + unsigned priority, + Request&& item) { + queue.enqueue_strict(get_inner_client(cl, item), priority, + std::move(item)); + } + + // Enqueue op in the front of the strict queue + inline void mClockClientQueue::enqueue_strict_front(Client cl, + unsigned priority, + Request&& item) { + queue.enqueue_strict_front(get_inner_client(cl, item), priority, + std::move(item)); + } + + // Enqueue op in the back of the regular queue + inline void mClockClientQueue::enqueue(Client cl, + unsigned priority, + unsigned cost, + Request&& item) { + queue.enqueue(get_inner_client(cl, item), priority, 1u, std::move(item)); + } + + // Enqueue the op in the front of the regular queue + inline void mClockClientQueue::enqueue_front(Client cl, + unsigned priority, + unsigned cost, + Request&& item) { + queue.enqueue_front(get_inner_client(cl, item), priority, 1u, + std::move(item)); + } + + // Return an op to be dispatched + inline Request mClockClientQueue::dequeue() { + return queue.dequeue(); + } +} // namespace ceph diff --git a/src/osd/mClockClientQueue.h b/src/osd/mClockClientQueue.h new file mode 100644 index 00000000..84454ff6 --- /dev/null +++ b/src/osd/mClockClientQueue.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include <ostream> + +#include "boost/variant.hpp" + +#include "common/config.h" +#include "common/ceph_context.h" +#include "common/mClockPriorityQueue.h" +#include "osd/OpQueueItem.h" +#include "osd/mClockOpClassSupport.h" + + +namespace ceph { + + using Request = OpQueueItem; + using Client = uint64_t; + + // This class exists to bridge the ceph code, which treats the class + // as the client, and the queue, where the class is + // osd_op_type_t. So this adapter class will transform calls + // appropriately. + class mClockClientQueue : public OpQueue<Request, Client> { + + using osd_op_type_t = ceph::mclock::osd_op_type_t; + + using InnerClient = std::pair<uint64_t,osd_op_type_t>; + + using queue_t = mClockQueue<Request, InnerClient>; + + queue_t queue; + + ceph::mclock::OpClassClientInfoMgr client_info_mgr; + + public: + + mClockClientQueue(CephContext *cct); + + const crimson::dmclock::ClientInfo* op_class_client_info_f(const InnerClient& client); + + inline unsigned get_size_slow() const { + return queue.get_size_slow(); + } + + // Ops of this priority should be deleted immediately + inline void remove_by_class(Client cl, + std::list<Request> *out) override final { + queue.remove_by_filter( + [&cl, out] (Request&& r) -> bool { + if (cl == r.get_owner()) { + out->push_front(std::move(r)); + return true; + } else { + return false; + } + }); + } + + void enqueue_strict(Client cl, + unsigned priority, + Request&& item) override final; + + // Enqueue op in the front of the strict queue + void enqueue_strict_front(Client cl, + unsigned priority, + Request&& item) override final; + + // Enqueue op in the back of the regular queue + void enqueue(Client cl, + unsigned priority, + unsigned cost, + Request&& item) override final; + + // Enqueue the op in the front of the regular queue + void enqueue_front(Client cl, + unsigned priority, + unsigned cost, + Request&& item) override final; + + // Return an op to be dispatch + Request dequeue() override final; + + // Returns if the queue is empty + inline bool empty() const override final { + return queue.empty(); + } + + // Formatted output of the queue + void dump(ceph::Formatter *f) const override final; + + protected: + + InnerClient get_inner_client(const Client& cl, const Request& request); + }; // class mClockClientAdapter + +} // namespace ceph diff --git a/src/osd/mClockOpClassQueue.cc b/src/osd/mClockOpClassQueue.cc new file mode 100644 index 00000000..ccaf98f0 --- /dev/null +++ b/src/osd/mClockOpClassQueue.cc @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include <memory> + +#include "osd/mClockOpClassQueue.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +namespace ceph { + + /* + * class mClockOpClassQueue + */ + + mClockOpClassQueue::mClockOpClassQueue(CephContext *cct) : + queue(std::bind(&mClockOpClassQueue::op_class_client_info_f, this, _1), + cct->_conf->osd_op_queue_mclock_anticipation_timeout), + client_info_mgr(cct) + { + // empty + } + + const dmc::ClientInfo* mClockOpClassQueue::op_class_client_info_f( + const osd_op_type_t& op_type) + { + return client_info_mgr.get_client_info(op_type); + } + + // Formatted output of the queue + void mClockOpClassQueue::dump(ceph::Formatter *f) const { + queue.dump(f); + } +} // namespace ceph diff --git a/src/osd/mClockOpClassQueue.h b/src/osd/mClockOpClassQueue.h new file mode 100644 index 00000000..3ad7f719 --- /dev/null +++ b/src/osd/mClockOpClassQueue.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include <ostream> + +#include "boost/variant.hpp" +#include "boost/container/flat_set.hpp" + +#include "common/config.h" +#include "common/ceph_context.h" +#include "common/mClockPriorityQueue.h" +#include "osd/OpQueueItem.h" +#include "osd/mClockOpClassSupport.h" + + +namespace ceph { + + using Request = OpQueueItem; + using Client = uint64_t; + + // This class exists to bridge the ceph code, which treats the class + // as the client, and the queue, where the class is + // osd_op_type_t. So this adapter class will transform calls + // appropriately. 
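+ // For example (editorial note, based on OpClassClientInfoMgr in
+ // mClockOpClassSupport.h/.cc): a replication message such as MSG_OSD_REPOP is
+ // classified as osd_op_type_t::osd_rep_op and queued under that class's
+ // dmclock reservation/weight/limit, while ordinary client I/O is queued as
+ // osd_op_type_t::client_op.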
+ class mClockOpClassQueue : public OpQueue<Request, Client> { + + using osd_op_type_t = ceph::mclock::osd_op_type_t; + + using queue_t = mClockQueue<Request, osd_op_type_t>; + queue_t queue; + + ceph::mclock::OpClassClientInfoMgr client_info_mgr; + + public: + + mClockOpClassQueue(CephContext *cct); + + const crimson::dmclock::ClientInfo* + op_class_client_info_f(const osd_op_type_t& op_type); + + inline unsigned get_size_slow() const { + return queue.get_size_slow(); + } + + // Ops of this priority should be deleted immediately + inline void remove_by_class(Client cl, + std::list<Request> *out) override final { + queue.remove_by_filter( + [&cl, out] (Request&& r) -> bool { + if (cl == r.get_owner()) { + out->push_front(std::move(r)); + return true; + } else { + return false; + } + }); + } + + inline void enqueue_strict(Client cl, + unsigned priority, + Request&& item) override final { + queue.enqueue_strict(client_info_mgr.osd_op_type(item), + priority, + std::move(item)); + } + + // Enqueue op in the front of the strict queue + inline void enqueue_strict_front(Client cl, + unsigned priority, + Request&& item) override final { + queue.enqueue_strict_front(client_info_mgr.osd_op_type(item), + priority, + std::move(item)); + } + + // Enqueue op in the back of the regular queue + inline void enqueue(Client cl, + unsigned priority, + unsigned cost, + Request&& item) override final { + queue.enqueue(client_info_mgr.osd_op_type(item), + priority, + 1u, + std::move(item)); + } + + // Enqueue the op in the front of the regular queue + inline void enqueue_front(Client cl, + unsigned priority, + unsigned cost, + Request&& item) override final { + queue.enqueue_front(client_info_mgr.osd_op_type(item), + priority, + 1u, + std::move(item)); + } + + // Returns if the queue is empty + inline bool empty() const override final { + return queue.empty(); + } + + // Return an op to be dispatch + inline Request dequeue() override final { + return queue.dequeue(); + } + + // Formatted output of the queue + void dump(ceph::Formatter *f) const override final; + }; // class mClockOpClassAdapter +} // namespace ceph diff --git a/src/osd/mClockOpClassSupport.cc b/src/osd/mClockOpClassSupport.cc new file mode 100644 index 00000000..d35c2cbe --- /dev/null +++ b/src/osd/mClockOpClassSupport.cc @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "common/dout.h" +#include "osd/mClockOpClassSupport.h" +#include "osd/OpQueueItem.h" + +#include "include/ceph_assert.h" + +namespace ceph { + + namespace mclock { + + OpClassClientInfoMgr::OpClassClientInfoMgr(CephContext *cct) : + client_op(cct->_conf->osd_op_queue_mclock_client_op_res, + cct->_conf->osd_op_queue_mclock_client_op_wgt, + cct->_conf->osd_op_queue_mclock_client_op_lim), + osd_rep_op(cct->_conf->osd_op_queue_mclock_osd_rep_op_res, + cct->_conf->osd_op_queue_mclock_osd_rep_op_wgt, + cct->_conf->osd_op_queue_mclock_osd_rep_op_lim), + snaptrim(cct->_conf->osd_op_queue_mclock_snap_res, + cct->_conf->osd_op_queue_mclock_snap_wgt, + cct->_conf->osd_op_queue_mclock_snap_lim), + recov(cct->_conf->osd_op_queue_mclock_recov_res, + cct->_conf->osd_op_queue_mclock_recov_wgt, + cct->_conf->osd_op_queue_mclock_recov_lim), + scrub(cct->_conf->osd_op_queue_mclock_scrub_res, + cct->_conf->osd_op_queue_mclock_scrub_wgt, + cct->_conf->osd_op_queue_mclock_scrub_lim), + pg_delete(cct->_conf->osd_op_queue_mclock_pg_delete_res, + cct->_conf->osd_op_queue_mclock_pg_delete_wgt, + cct->_conf->osd_op_queue_mclock_pg_delete_lim), + peering_event(cct->_conf->osd_op_queue_mclock_peering_event_res, + cct->_conf->osd_op_queue_mclock_peering_event_wgt, + cct->_conf->osd_op_queue_mclock_peering_event_lim) + { + constexpr int rep_ops[] = { + MSG_OSD_REPOP, + MSG_OSD_REPOPREPLY, + MSG_OSD_PG_UPDATE_LOG_MISSING, + MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY, + MSG_OSD_EC_WRITE, + MSG_OSD_EC_WRITE_REPLY, + MSG_OSD_EC_READ, + MSG_OSD_EC_READ_REPLY + }; + for (auto op : rep_ops) { + add_rep_op_msg(op); + } + + lgeneric_subdout(cct, osd, 20) << + "mClock OpClass settings:: " << + "client_op:" << client_op << + "; osd_rep_op:" << osd_rep_op << + "; snaptrim:" << snaptrim << + "; recov:" << recov << + "; scrub:" << scrub << + dendl; + + lgeneric_subdout(cct, osd, 30) << + "mClock OpClass message bit set:: " << + rep_op_msg_bitset.to_string() << dendl; + } + + void OpClassClientInfoMgr::add_rep_op_msg(int message_code) { + ceph_assert(message_code >= 0 && message_code < int(rep_op_msg_bitset_size)); + rep_op_msg_bitset.set(message_code); + } + + osd_op_type_t + OpClassClientInfoMgr::osd_op_type(const OpQueueItem& op) const { + osd_op_type_t type = convert_op_type(op.get_op_type()); + if (osd_op_type_t::client_op != type) { + return type; + } else { + // get_header returns ceph_msg_header type, ceph_msg_header + // stores type as unsigned little endian, so be sure to + // convert to CPU byte ordering + boost::optional<OpRequestRef> op_ref_maybe = op.maybe_get_op(); + ceph_assert(op_ref_maybe); + __le16 mtype_le = (*op_ref_maybe)->get_req()->get_header().type; + __u16 mtype = le16_to_cpu(mtype_le); + if (rep_op_msg_bitset.test(mtype)) { + return osd_op_type_t::osd_rep_op; + } else { + return osd_op_type_t::client_op; + } + } + } + + // used for debugging since faster implementation can be done + // with rep_op_msg_bitmap + bool OpClassClientInfoMgr::is_rep_op(uint16_t mtype) { + return + MSG_OSD_REPOP == mtype || + MSG_OSD_REPOPREPLY == mtype || + MSG_OSD_PG_UPDATE_LOG_MISSING == mtype || + MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY == mtype || + MSG_OSD_EC_WRITE == mtype || + MSG_OSD_EC_WRITE_REPLY == mtype || + MSG_OSD_EC_READ == mtype || + MSG_OSD_EC_READ_REPLY == mtype; + } + } // namespace mclock +} // namespace ceph diff --git a/src/osd/mClockOpClassSupport.h b/src/osd/mClockOpClassSupport.h new file mode 100644 index 00000000..1ea1043e --- /dev/null +++ b/src/osd/mClockOpClassSupport.h @@ -0,0 +1,103 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include <bitset> + +#include "dmclock/src/dmclock_server.h" +#include "osd/OpRequest.h" +#include "osd/OpQueueItem.h" + + +namespace ceph { + namespace mclock { + + using op_item_type_t = OpQueueItem::OpQueueable::op_type_t; + + enum class osd_op_type_t { + client_op, osd_rep_op, bg_snaptrim, bg_recovery, bg_scrub, bg_pg_delete, + peering_event + }; + + class OpClassClientInfoMgr { + crimson::dmclock::ClientInfo client_op; + crimson::dmclock::ClientInfo osd_rep_op; + crimson::dmclock::ClientInfo snaptrim; + crimson::dmclock::ClientInfo recov; + crimson::dmclock::ClientInfo scrub; + crimson::dmclock::ClientInfo pg_delete; + crimson::dmclock::ClientInfo peering_event; + + static constexpr std::size_t rep_op_msg_bitset_size = 128; + std::bitset<rep_op_msg_bitset_size> rep_op_msg_bitset; + void add_rep_op_msg(int message_code); + + public: + + OpClassClientInfoMgr(CephContext *cct); + + inline const crimson::dmclock::ClientInfo* + get_client_info(osd_op_type_t type) { + switch(type) { + case osd_op_type_t::client_op: + return &client_op; + case osd_op_type_t::osd_rep_op: + return &osd_rep_op; + case osd_op_type_t::bg_snaptrim: + return &snaptrim; + case osd_op_type_t::bg_recovery: + return &recov; + case osd_op_type_t::bg_scrub: + return &scrub; + case osd_op_type_t::bg_pg_delete: + return &pg_delete; + case osd_op_type_t::peering_event: + return &peering_event; + default: + ceph_abort(); + return nullptr; + } + } + + // converts operation type from op queue internal to mclock + // equivalent + inline static osd_op_type_t convert_op_type(op_item_type_t t) { + switch(t) { + case op_item_type_t::client_op: + return osd_op_type_t::client_op; + case op_item_type_t::bg_snaptrim: + return osd_op_type_t::bg_snaptrim; + case op_item_type_t::bg_recovery: + return osd_op_type_t::bg_recovery; + case op_item_type_t::bg_scrub: + return osd_op_type_t::bg_scrub; + case op_item_type_t::bg_pg_delete: + return osd_op_type_t::bg_pg_delete; + case op_item_type_t::peering_event: + return osd_op_type_t::peering_event; + default: + ceph_abort(); + } + } + + osd_op_type_t osd_op_type(const OpQueueItem&) const; + + // used for debugging since faster implementation can be done + // with rep_op_msg_bitmap + static bool is_rep_op(uint16_t); + }; // OpClassClientInfoMgr + } // namespace mclock +} // namespace ceph diff --git a/src/osd/osd_internal_types.h b/src/osd/osd_internal_types.h new file mode 100644 index 00000000..97d444e7 --- /dev/null +++ b/src/osd/osd_internal_types.h @@ -0,0 +1,464 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OSD_INTERNAL_TYPES_H +#define CEPH_OSD_INTERNAL_TYPES_H + +#include "osd_types.h" +#include "OpRequest.h" + +/* + * keep tabs on object modifications that are in flight. + * we need to know the projected existence, size, snapset, + * etc., because we don't send writes down to disk until after + * replicas ack. 
+ */ + +struct SnapSetContext { + hobject_t oid; + SnapSet snapset; + int ref; + bool registered : 1; + bool exists : 1; + + explicit SnapSetContext(const hobject_t& o) : + oid(o), ref(0), registered(false), exists(true) { } +}; + +struct ObjectContext; + +struct ObjectState { + object_info_t oi; + bool exists; ///< the stored object exists (i.e., we will remember the object_info_t) + + ObjectState() : exists(false) {} + + ObjectState(const object_info_t &oi_, bool exists_) + : oi(oi_), exists(exists_) {} +}; + +typedef std::shared_ptr<ObjectContext> ObjectContextRef; + +struct ObjectContext { + ObjectState obs; + + SnapSetContext *ssc; // may be null + + Context *destructor_callback; + +public: + + // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers. + map<pair<uint64_t, entity_name_t>, WatchRef> watchers; + + // attr cache + map<string, bufferlist> attr_cache; + + struct RWState { + enum State { + RWNONE, + RWREAD, + RWWRITE, + RWEXCL, + }; + static const char *get_state_name(State s) { + switch (s) { + case RWNONE: return "none"; + case RWREAD: return "read"; + case RWWRITE: return "write"; + case RWEXCL: return "excl"; + default: return "???"; + } + } + const char *get_state_name() const { + return get_state_name(state); + } + + std::list<OpRequestRef> waiters; ///< ops waiting on state change + int count; ///< number of readers or writers + + State state:4; ///< rw state + /// if set, restart backfill when we can get a read lock + bool recovery_read_marker:1; + /// if set, requeue snaptrim on lock release + bool snaptrimmer_write_marker:1; + + RWState() + : count(0), + state(RWNONE), + recovery_read_marker(false), + snaptrimmer_write_marker(false) + {} + bool get_read(OpRequestRef& op) { + if (get_read_lock()) { + return true; + } // else + // Now we really need to bump up the ref-counter. + waiters.emplace_back(op); + return false; + } + /// this function adjusts the counts if necessary + bool get_read_lock() { + // don't starve anybody! + if (!waiters.empty()) { + return false; + } + switch (state) { + case RWNONE: + ceph_assert(count == 0); + state = RWREAD; + // fall through + case RWREAD: + count++; + return true; + case RWWRITE: + return false; + case RWEXCL: + return false; + default: + ceph_abort_msg("unhandled case"); + return false; + } + } + + bool get_write(OpRequestRef& op, bool greedy=false) { + if (get_write_lock(greedy)) { + return true; + } // else + if (op) + waiters.emplace_back(op); + return false; + } + bool get_write_lock(bool greedy=false) { + if (!greedy) { + // don't starve anybody! 
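+      // (editorial note: non-greedy writers also yield to a pending recovery
+      // read, flagged by recovery_read_marker, so backfill can take its read lock)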
+ if (!waiters.empty() || + recovery_read_marker) { + return false; + } + } + switch (state) { + case RWNONE: + ceph_assert(count == 0); + state = RWWRITE; + // fall through + case RWWRITE: + count++; + return true; + case RWREAD: + return false; + case RWEXCL: + return false; + default: + ceph_abort_msg("unhandled case"); + return false; + } + } + bool get_excl_lock() { + switch (state) { + case RWNONE: + ceph_assert(count == 0); + state = RWEXCL; + count = 1; + return true; + case RWWRITE: + return false; + case RWREAD: + return false; + case RWEXCL: + return false; + default: + ceph_abort_msg("unhandled case"); + return false; + } + } + bool get_excl(OpRequestRef& op) { + if (get_excl_lock()) { + return true; + } // else + if (op) + waiters.emplace_back(op); + return false; + } + /// same as get_write_lock, but ignore starvation + bool take_write_lock() { + if (state == RWWRITE) { + count++; + return true; + } + return get_write_lock(); + } + void dec(list<OpRequestRef> *requeue) { + ceph_assert(count > 0); + ceph_assert(requeue); + count--; + if (count == 0) { + state = RWNONE; + requeue->splice(requeue->end(), waiters); + } + } + void put_read(list<OpRequestRef> *requeue) { + ceph_assert(state == RWREAD); + dec(requeue); + } + void put_write(list<OpRequestRef> *requeue) { + ceph_assert(state == RWWRITE); + dec(requeue); + } + void put_excl(list<OpRequestRef> *requeue) { + ceph_assert(state == RWEXCL); + dec(requeue); + } + bool empty() const { return state == RWNONE; } + } rwstate; + + bool get_read(OpRequestRef& op) { + return rwstate.get_read(op); + } + bool get_write(OpRequestRef& op) { + return rwstate.get_write(op, false); + } + bool get_excl(OpRequestRef op) { + return rwstate.get_excl(op); + } + bool get_lock_type(OpRequestRef& op, RWState::State type) { + switch (type) { + case RWState::RWWRITE: + return get_write(op); + case RWState::RWREAD: + return get_read(op); + case RWState::RWEXCL: + return get_excl(op); + default: + ceph_abort_msg("invalid lock type"); + return true; + } + } + bool get_write_greedy(OpRequestRef& op) { + return rwstate.get_write(op, true); + } + bool get_snaptrimmer_write(bool mark_if_unsuccessful) { + if (rwstate.get_write_lock()) { + return true; + } else { + if (mark_if_unsuccessful) + rwstate.snaptrimmer_write_marker = true; + return false; + } + } + bool get_recovery_read() { + rwstate.recovery_read_marker = true; + if (rwstate.get_read_lock()) { + return true; + } + return false; + } + bool try_get_read_lock() { + return rwstate.get_read_lock(); + } + void drop_recovery_read(list<OpRequestRef> *ls) { + ceph_assert(rwstate.recovery_read_marker); + rwstate.put_read(ls); + rwstate.recovery_read_marker = false; + } + void put_lock_type( + ObjectContext::RWState::State type, + list<OpRequestRef> *to_wake, + bool *requeue_recovery, + bool *requeue_snaptrimmer) { + switch (type) { + case ObjectContext::RWState::RWWRITE: + rwstate.put_write(to_wake); + break; + case ObjectContext::RWState::RWREAD: + rwstate.put_read(to_wake); + break; + case ObjectContext::RWState::RWEXCL: + rwstate.put_excl(to_wake); + break; + default: + ceph_abort_msg("invalid lock type"); + } + if (rwstate.empty() && rwstate.recovery_read_marker) { + rwstate.recovery_read_marker = false; + *requeue_recovery = true; + } + if (rwstate.empty() && rwstate.snaptrimmer_write_marker) { + rwstate.snaptrimmer_write_marker = false; + *requeue_snaptrimmer = true; + } + } + bool is_request_pending() { + return (rwstate.count > 0); + } + + ObjectContext() + : ssc(NULL), + destructor_callback(0), + 
blocked(false), requeue_scrub_on_unblock(false) {} + + ~ObjectContext() { + ceph_assert(rwstate.empty()); + if (destructor_callback) + destructor_callback->complete(0); + } + + void start_block() { + ceph_assert(!blocked); + blocked = true; + } + void stop_block() { + ceph_assert(blocked); + blocked = false; + } + bool is_blocked() const { + return blocked; + } + + /// in-progress copyfrom ops for this object + bool blocked:1; + bool requeue_scrub_on_unblock:1; // true if we need to requeue scrub on unblock + +}; + +inline ostream& operator<<(ostream& out, const ObjectState& obs) +{ + out << obs.oi.soid; + if (!obs.exists) + out << "(dne)"; + return out; +} + +inline ostream& operator<<(ostream& out, const ObjectContext::RWState& rw) +{ + return out << "rwstate(" << rw.get_state_name() + << " n=" << rw.count + << " w=" << rw.waiters.size() + << ")"; +} + +inline ostream& operator<<(ostream& out, const ObjectContext& obc) +{ + return out << "obc(" << obc.obs << " " << obc.rwstate << ")"; +} + +class ObcLockManager { + struct ObjectLockState { + ObjectContextRef obc; + ObjectContext::RWState::State type; + ObjectLockState( + ObjectContextRef obc, + ObjectContext::RWState::State type) + : obc(std::move(obc)), type(type) {} + }; + map<hobject_t, ObjectLockState> locks; +public: + ObcLockManager() = default; + ObcLockManager(ObcLockManager &&) = default; + ObcLockManager(const ObcLockManager &) = delete; + ObcLockManager &operator=(ObcLockManager &&) = default; + bool empty() const { + return locks.empty(); + } + bool get_lock_type( + ObjectContext::RWState::State type, + const hobject_t &hoid, + ObjectContextRef& obc, + OpRequestRef& op) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->get_lock_type(op, type)) { + locks.insert(make_pair(hoid, ObjectLockState(obc, type))); + return true; + } else { + return false; + } + } + /// Get write lock, ignore starvation + bool take_write_lock( + const hobject_t &hoid, + ObjectContextRef obc) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->rwstate.take_write_lock()) { + locks.insert( + make_pair( + hoid, ObjectLockState(obc, ObjectContext::RWState::RWWRITE))); + return true; + } else { + return false; + } + } + /// Get write lock for snap trim + bool get_snaptrimmer_write( + const hobject_t &hoid, + ObjectContextRef obc, + bool mark_if_unsuccessful) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->get_snaptrimmer_write(mark_if_unsuccessful)) { + locks.insert( + make_pair( + hoid, ObjectLockState(obc, ObjectContext::RWState::RWWRITE))); + return true; + } else { + return false; + } + } + /// Get write lock greedy + bool get_write_greedy( + const hobject_t &hoid, + ObjectContextRef obc, + OpRequestRef op) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->get_write_greedy(op)) { + locks.insert( + make_pair( + hoid, ObjectLockState(obc, ObjectContext::RWState::RWWRITE))); + return true; + } else { + return false; + } + } + + /// try get read lock + bool try_get_read_lock( + const hobject_t &hoid, + ObjectContextRef obc) { + ceph_assert(locks.find(hoid) == locks.end()); + if (obc->try_get_read_lock()) { + locks.insert( + make_pair( + hoid, + ObjectLockState(obc, ObjectContext::RWState::RWREAD))); + return true; + } else { + return false; + } + } + + void put_locks( + list<pair<ObjectContextRef, list<OpRequestRef> > > *to_requeue, + bool *requeue_recovery, + bool *requeue_snaptrimmer) { + for (auto& p: locks) { + list<OpRequestRef> _to_requeue; + p.second.obc->put_lock_type( + p.second.type, + &_to_requeue, + 
requeue_recovery, + requeue_snaptrimmer); + if (to_requeue) { + // We can safely std::move here as the whole `locks` is going + // to die just after the loop. + to_requeue->emplace_back(std::move(p.second.obc), + std::move(_to_requeue)); + } + } + locks.clear(); + } + ~ObcLockManager() { + ceph_assert(locks.empty()); + } +}; + + + +#endif diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc new file mode 100644 index 00000000..baf76be1 --- /dev/null +++ b/src/osd/osd_types.cc @@ -0,0 +1,6597 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <boost/assign/list_of.hpp> + +#include "osd_types.h" +#include "include/ceph_features.h" +#include "include/stringify.h" +extern "C" { +#include "crush/hash.h" +} +#include "OSDMap.h" + +const char *ceph_osd_flag_name(unsigned flag) +{ + switch (flag) { + case CEPH_OSD_FLAG_ACK: return "ack"; + case CEPH_OSD_FLAG_ONNVRAM: return "onnvram"; + case CEPH_OSD_FLAG_ONDISK: return "ondisk"; + case CEPH_OSD_FLAG_RETRY: return "retry"; + case CEPH_OSD_FLAG_READ: return "read"; + case CEPH_OSD_FLAG_WRITE: return "write"; + case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap"; + case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old"; + case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads"; + case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec"; + case CEPH_OSD_FLAG_PGOP: return "pgop"; + case CEPH_OSD_FLAG_EXEC: return "exec"; + case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public"; + case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads"; + case CEPH_OSD_FLAG_RWORDERED: return "rwordered"; + case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache"; + case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks"; + case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay"; + case CEPH_OSD_FLAG_FLUSH: return "flush"; + case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone"; + case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc"; + case CEPH_OSD_FLAG_REDIRECTED: return "redirected"; + case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected"; + case CEPH_OSD_FLAG_FULL_TRY: return "full_try"; + case CEPH_OSD_FLAG_FULL_FORCE: return "full_force"; + case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect"; + default: return "???"; + } +} + +string ceph_osd_flag_string(unsigned flags) +{ + string s; + for (unsigned i=0; i<32; ++i) { + if (flags & (1u<<i)) { + if (s.length()) + s += "+"; + s += ceph_osd_flag_name(1u << i); + } + } + if (s.length()) + return s; + return string("-"); +} + +const char * ceph_osd_op_flag_name(unsigned flag) +{ + const char *name; + + switch(flag) { + case CEPH_OSD_OP_FLAG_EXCL: + name = "excl"; + break; + case CEPH_OSD_OP_FLAG_FAILOK: + name = "failok"; + break; + case CEPH_OSD_OP_FLAG_FADVISE_RANDOM: + name = "fadvise_random"; + break; + case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL: + name = "fadvise_sequential"; + break; + case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED: + name = "favise_willneed"; + break; + case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: + name = "fadvise_dontneed"; + break; + case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE: + name = 
"fadvise_nocache"; + break; + case CEPH_OSD_OP_FLAG_WITH_REFERENCE: + name = "with_reference"; + break; + case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE: + name = "bypass_clean_cache"; + break; + default: + name = "???"; + }; + + return name; +} + +string ceph_osd_op_flag_string(unsigned flags) +{ + string s; + for (unsigned i=0; i<32; ++i) { + if (flags & (1u<<i)) { + if (s.length()) + s += "+"; + s += ceph_osd_op_flag_name(1u << i); + } + } + if (s.length()) + return s; + return string("-"); +} + +string ceph_osd_alloc_hint_flag_string(unsigned flags) +{ + string s; + for (unsigned i=0; i<32; ++i) { + if (flags & (1u<<i)) { + if (s.length()) + s += "+"; + s += ceph_osd_alloc_hint_flag_name(1u << i); + } + } + if (s.length()) + return s; + return string("-"); +} + +void pg_shard_t::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(osd, bl); + encode(shard, bl); + ENCODE_FINISH(bl); +} +void pg_shard_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(osd, bl); + decode(shard, bl); + DECODE_FINISH(bl); +} + +ostream &operator<<(ostream &lhs, const pg_shard_t &rhs) +{ + if (rhs.is_undefined()) + return lhs << "?"; + if (rhs.shard == shard_id_t::NO_SHARD) + return lhs << rhs.get_osd(); + return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')'; +} + +void dump(Formatter* f, const osd_alerts_t& alerts) +{ + for (auto& a : alerts) { + string s0 = " osd: "; + s0 += stringify(a.first); + string s; + for (auto& aa : a.second) { + s = s0; + s += " "; + s += aa.first; + s += ":"; + s += aa.second; + f->dump_string("alert", s); + } + } +} + +// -- osd_reqid_t -- +void osd_reqid_t::dump(Formatter *f) const +{ + f->dump_stream("name") << name; + f->dump_int("inc", inc); + f->dump_unsigned("tid", tid); +} + +void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o) +{ + o.push_back(new osd_reqid_t); + o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678)); +} + +// -- object_locator_t -- + +void object_locator_t::encode(bufferlist& bl) const +{ + // verify that nobody's corrupted the locator + ceph_assert(hash == -1 || key.empty()); + __u8 encode_compat = 3; + ENCODE_START(6, encode_compat, bl); + encode(pool, bl); + int32_t preferred = -1; // tell old code there is no preferred osd (-1). 
+ encode(preferred, bl); + encode(key, bl); + encode(nspace, bl); + encode(hash, bl); + if (hash != -1) + encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash + ENCODE_FINISH_NEW_COMPAT(bl, encode_compat); +} + +void object_locator_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p); + if (struct_v < 2) { + int32_t op; + decode(op, p); + pool = op; + int16_t pref; + decode(pref, p); + } else { + decode(pool, p); + int32_t preferred; + decode(preferred, p); + } + decode(key, p); + if (struct_v >= 5) + decode(nspace, p); + if (struct_v >= 6) + decode(hash, p); + else + hash = -1; + DECODE_FINISH(p); + // verify that nobody's corrupted the locator + ceph_assert(hash == -1 || key.empty()); +} + +void object_locator_t::dump(Formatter *f) const +{ + f->dump_int("pool", pool); + f->dump_string("key", key); + f->dump_string("namespace", nspace); + f->dump_int("hash", hash); +} + +void object_locator_t::generate_test_instances(list<object_locator_t*>& o) +{ + o.push_back(new object_locator_t); + o.push_back(new object_locator_t(123)); + o.push_back(new object_locator_t(123, 876)); + o.push_back(new object_locator_t(1, "n2")); + o.push_back(new object_locator_t(1234, "", "key")); + o.push_back(new object_locator_t(12, "n1", "key2")); +} + +// -- request_redirect_t -- +void request_redirect_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(redirect_locator, bl); + encode(redirect_object, bl); + // legacy of the removed osd_instructions member + encode((uint32_t)0, bl); + ENCODE_FINISH(bl); +} + +void request_redirect_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + uint32_t legacy_osd_instructions_len; + decode(redirect_locator, bl); + decode(redirect_object, bl); + decode(legacy_osd_instructions_len, bl); + if (legacy_osd_instructions_len) { + bl.advance(legacy_osd_instructions_len); + } + DECODE_FINISH(bl); +} + +void request_redirect_t::dump(Formatter *f) const +{ + f->dump_string("object", redirect_object); + f->open_object_section("locator"); + redirect_locator.dump(f); + f->close_section(); // locator +} + +void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o) +{ + object_locator_t loc(1, "redir_obj"); + o.push_back(new request_redirect_t()); + o.push_back(new request_redirect_t(loc, 0)); + o.push_back(new request_redirect_t(loc, "redir_obj")); + o.push_back(new request_redirect_t(loc)); +} + +void objectstore_perf_stat_t::dump(Formatter *f) const +{ + // *_ms values just for compatibility. + f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0); + f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0); + f->dump_unsigned("commit_latency_ns", os_commit_latency_ns); + f->dump_unsigned("apply_latency_ns", os_apply_latency_ns); +} + +void objectstore_perf_stat_t::encode(bufferlist &bl, uint64_t features) const +{ + uint8_t target_v = 2; + if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) { + target_v = 1; + } + ENCODE_START(target_v, target_v, bl); + if (target_v >= 2) { + encode(os_commit_latency_ns, bl); + encode(os_apply_latency_ns, bl); + } else { + constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count(); + uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS; + uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS; + encode(commit_latency_ms, bl); // for compatibility with older monitor. + encode(apply_latency_ms, bl); // for compatibility with older monitor. 
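objectstore_perf_stat_t keeps latencies in nanoseconds internally but, as encoded above, downgrades them to milliseconds for peers that lack the OS_PERF_STAT_NS feature. A minimal sketch of that feature-gated encode, with a hypothetical feature bit and a plain vector standing in for bufferlist:

// Sketch of feature-gated encoding: new peers get ns, old peers get ms.
#include <cstdint>
#include <iostream>
#include <vector>

constexpr uint64_t FEATURE_PERF_NS = 1ull << 10;      // hypothetical feature bit

struct PerfStat {
  uint64_t commit_latency_ns = 0;

  void encode(std::vector<uint64_t>* out, uint64_t peer_features) const {
    if (peer_features & FEATURE_PERF_NS) {
      out->push_back(2);                              // struct version
      out->push_back(commit_latency_ns);              // full ns resolution
    } else {
      out->push_back(1);
      out->push_back(commit_latency_ns / 1000000);    // ns -> ms for old peers
    }
  }
};

int main() {
  PerfStat p;
  p.commit_latency_ns = 25000000;                     // 25 ms
  std::vector<uint64_t> v2, v1;
  p.encode(&v2, FEATURE_PERF_NS);
  p.encode(&v1, 0);
  std::cout << v2[1] << " " << v1[1] << "\n";         // 25000000 25
}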
+ } + ENCODE_FINISH(bl); +} + +void objectstore_perf_stat_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(2, bl); + if (struct_v >= 2) { + decode(os_commit_latency_ns, bl); + decode(os_apply_latency_ns, bl); + } else { + uint32_t commit_latency_ms; + uint32_t apply_latency_ms; + decode(commit_latency_ms, bl); + decode(apply_latency_ms, bl); + constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count(); + os_commit_latency_ns = commit_latency_ms * NS_PER_MS; + os_apply_latency_ns = apply_latency_ms * NS_PER_MS; + } + DECODE_FINISH(bl); +} + +void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o) +{ + o.push_back(new objectstore_perf_stat_t()); + o.push_back(new objectstore_perf_stat_t()); + o.back()->os_commit_latency_ns = 20000000; + o.back()->os_apply_latency_ns = 30000000; +} + +// -- osd_stat_t -- +void osd_stat_t::dump(Formatter *f, bool with_net) const +{ + f->dump_unsigned("up_from", up_from); + f->dump_unsigned("seq", seq); + f->dump_unsigned("num_pgs", num_pgs); + f->dump_unsigned("num_osds", num_osds); + f->dump_unsigned("num_per_pool_osds", num_per_pool_osds); + + /// dump legacy stats fields to ensure backward compatibility. + f->dump_unsigned("kb", statfs.kb()); + f->dump_unsigned("kb_used", statfs.kb_used_raw()); + f->dump_unsigned("kb_used_data", statfs.kb_used_data()); + f->dump_unsigned("kb_used_omap", statfs.kb_used_omap()); + f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata()); + f->dump_unsigned("kb_avail", statfs.kb_avail()); + //////////////////// + + f->open_object_section("statfs"); + statfs.dump(f); + f->close_section(); + f->open_array_section("hb_peers"); + for (auto p : hb_peers) + f->dump_int("osd", p); + f->close_section(); + f->dump_int("snap_trim_queue_len", snap_trim_queue_len); + f->dump_int("num_snap_trimming", num_snap_trimming); + f->dump_int("num_shards_repaired", num_shards_repaired); + f->open_object_section("op_queue_age_hist"); + op_queue_age_hist.dump(f); + f->close_section(); + f->open_object_section("perf_stat"); + os_perf_stat.dump(f); + f->close_section(); + f->open_array_section("alerts"); + ::dump(f, os_alerts); + f->close_section(); + if (with_net) { + dump_ping_time(f); + } +} + +void osd_stat_t::dump_ping_time(Formatter *f) const +{ + f->open_array_section("network_ping_times"); + for (auto &i : hb_pingtime) { + f->open_object_section("entry"); + f->dump_int("osd", i.first); + const time_t lu(i.second.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + f->dump_string("last update", lustr); + f->open_array_section("interfaces"); + f->open_object_section("interface"); + f->dump_string("interface", "back"); + f->open_object_section("average"); + f->dump_float("1min", i.second.back_pingtime[0]/1000.0); + f->dump_float("5min", i.second.back_pingtime[1]/1000.0); + f->dump_float("15min", i.second.back_pingtime[2]/1000.0); + f->close_section(); // average + f->open_object_section("min"); + f->dump_float("1min", i.second.back_min[0]/1000.0); + f->dump_float("5min", i.second.back_min[1]/1000.0); + f->dump_float("15min", i.second.back_min[2]/1000.0); + f->close_section(); // min + f->open_object_section("max"); + f->dump_float("1min", i.second.back_max[0]/1000.0); + f->dump_float("5min", i.second.back_max[1]/1000.0); + f->dump_float("15min", i.second.back_max[2]/1000.0); + f->close_section(); // max + f->dump_float("last", i.second.back_last/1000.0); + f->close_section(); // interface + + if 
(i.second.front_pingtime[0] != 0) { + f->open_object_section("interface"); + f->dump_string("interface", "front"); + f->open_object_section("average"); + f->dump_float("1min", i.second.front_pingtime[0]/1000.0); + f->dump_float("5min", i.second.front_pingtime[1]/1000.0); + f->dump_float("15min", i.second.front_pingtime[2]/1000.0); + f->close_section(); // average + f->open_object_section("min"); + f->dump_float("1min", i.second.front_min[0]/1000.0); + f->dump_float("5min", i.second.front_min[1]/1000.0); + f->dump_float("15min", i.second.front_min[2]/1000.0); + f->close_section(); // min + f->open_object_section("max"); + f->dump_float("1min", i.second.front_max[0]/1000.0); + f->dump_float("5min", i.second.front_max[1]/1000.0); + f->dump_float("15min", i.second.front_max[2]/1000.0); + f->close_section(); // max + f->dump_float("last", i.second.front_last/1000.0); + f->close_section(); // interface + } + f->close_section(); // interfaces + f->close_section(); // entry + } + f->close_section(); // network_ping_time +} + +void osd_stat_t::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(14, 2, bl); + + //////// for compatibility //////// + int64_t kb = statfs.kb(); + int64_t kb_used = statfs.kb_used_raw(); + int64_t kb_avail = statfs.kb_avail(); + encode(kb, bl); + encode(kb_used, bl); + encode(kb_avail, bl); + /////////////////////////////////// + + encode(snap_trim_queue_len, bl); + encode(num_snap_trimming, bl); + encode(hb_peers, bl); + encode((uint32_t)0, bl); + encode(op_queue_age_hist, bl); + encode(os_perf_stat, bl, features); + encode(up_from, bl); + encode(seq, bl); + encode(num_pgs, bl); + + //////// for compatibility //////// + int64_t kb_used_data = statfs.kb_used_data(); + int64_t kb_used_omap = statfs.kb_used_omap(); + int64_t kb_used_meta = statfs.kb_used_internal_metadata(); + encode(kb_used_data, bl); + encode(kb_used_omap, bl); + encode(kb_used_meta, bl); + encode(statfs, bl); + /////////////////////////////////// + encode(os_alerts, bl); + encode(num_shards_repaired, bl); + encode(num_osds, bl); + encode(num_per_pool_osds, bl); + + encode((uint32_t)0, bl); // compatibility + + // hb_pingtime map + encode((int)hb_pingtime.size(), bl); + for (auto i : hb_pingtime) { + encode(i.first, bl); // osd + encode(i.second.last_update, bl); + encode(i.second.back_pingtime[0], bl); + encode(i.second.back_pingtime[1], bl); + encode(i.second.back_pingtime[2], bl); + encode(i.second.back_min[0], bl); + encode(i.second.back_min[1], bl); + encode(i.second.back_min[2], bl); + encode(i.second.back_max[0], bl); + encode(i.second.back_max[1], bl); + encode(i.second.back_max[2], bl); + encode(i.second.back_last, bl); + encode(i.second.front_pingtime[0], bl); + encode(i.second.front_pingtime[1], bl); + encode(i.second.front_pingtime[2], bl); + encode(i.second.front_min[0], bl); + encode(i.second.front_min[1], bl); + encode(i.second.front_min[2], bl); + encode(i.second.front_max[0], bl); + encode(i.second.front_max[1], bl); + encode(i.second.front_max[2], bl); + encode(i.second.front_last, bl); + } + ENCODE_FINISH(bl); +} + +void osd_stat_t::decode(bufferlist::const_iterator &bl) +{ + int64_t kb, kb_used,kb_avail; + int64_t kb_used_data, kb_used_omap, kb_used_meta; + DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl); + decode(kb, bl); + decode(kb_used, bl); + decode(kb_avail, bl); + decode(snap_trim_queue_len, bl); + decode(num_snap_trimming, bl); + decode(hb_peers, bl); + vector<int> num_hb_out; + decode(num_hb_out, bl); + if (struct_v >= 3) + decode(op_queue_age_hist, bl); + if 
(struct_v >= 4) + decode(os_perf_stat, bl); + if (struct_v >= 6) { + decode(up_from, bl); + decode(seq, bl); + } + if (struct_v >= 7) { + decode(num_pgs, bl); + } + if (struct_v >= 8) { + decode(kb_used_data, bl); + decode(kb_used_omap, bl); + decode(kb_used_meta, bl); + } else { + kb_used_data = kb_used; + kb_used_omap = 0; + kb_used_meta = 0; + } + if (struct_v >= 9) { + decode(statfs, bl); + } else { + statfs.reset(); + statfs.total = kb << 10; + statfs.available = kb_avail << 10; + // actually it's totally unexpected to have ststfs.total < statfs.available + // here but unfortunately legacy generate_test_instances produced such a + // case hence inserting some handling rather than assert + statfs.internally_reserved = + statfs.total > statfs.available ? statfs.total - statfs.available : 0; + kb_used <<= 10; + if ((int64_t)statfs.internally_reserved > kb_used) { + statfs.internally_reserved -= kb_used; + } else { + statfs.internally_reserved = 0; + } + statfs.allocated = kb_used_data << 10; + statfs.omap_allocated = kb_used_omap << 10; + statfs.internal_metadata = kb_used_meta << 10; + } + if (struct_v >= 10) { + decode(os_alerts, bl); + } else { + os_alerts.clear(); + } + if (struct_v >= 11) { + decode(num_shards_repaired, bl); + } else { + num_shards_repaired = 0; + } + if (struct_v >= 12) { + decode(num_osds, bl); + decode(num_per_pool_osds, bl); + } else { + num_osds = 0; + num_per_pool_osds = 0; + } + // Compatibility num_per_pool_omap_osds + if (struct_v >= 13) { + uint32_t dummy; + decode(dummy, bl); + } + hb_pingtime.clear(); + if (struct_v >= 14) { + int count; + decode(count, bl); + for (int i = 0 ; i < count ; i++) { + int osd; + decode(osd, bl); + struct Interfaces ifs; + decode(ifs.last_update, bl); + decode(ifs.back_pingtime[0],bl); + decode(ifs.back_pingtime[1], bl); + decode(ifs.back_pingtime[2], bl); + decode(ifs.back_min[0],bl); + decode(ifs.back_min[1], bl); + decode(ifs.back_min[2], bl); + decode(ifs.back_max[0],bl); + decode(ifs.back_max[1], bl); + decode(ifs.back_max[2], bl); + decode(ifs.back_last, bl); + decode(ifs.front_pingtime[0], bl); + decode(ifs.front_pingtime[1], bl); + decode(ifs.front_pingtime[2], bl); + decode(ifs.front_min[0], bl); + decode(ifs.front_min[1], bl); + decode(ifs.front_min[2], bl); + decode(ifs.front_max[0], bl); + decode(ifs.front_max[1], bl); + decode(ifs.front_max[2], bl); + decode(ifs.front_last, bl); + hb_pingtime[osd] = ifs; + } + } + DECODE_FINISH(bl); +} + +void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o) +{ + o.push_back(new osd_stat_t); + + o.push_back(new osd_stat_t); + list<store_statfs_t*> ll; + store_statfs_t::generate_test_instances(ll); + o.back()->statfs = *ll.back(); + o.back()->hb_peers.push_back(7); + o.back()->snap_trim_queue_len = 8; + o.back()->num_snap_trimming = 99; + o.back()->num_shards_repaired = 101; + o.back()->os_alerts[0].emplace( + "some alert", "some alert details"); + o.back()->os_alerts[1].emplace( + "some alert2", "some alert2 details"); + struct Interfaces gen_interfaces = { + 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, + { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 }; + o.back()->hb_pingtime[20] = gen_interfaces; + gen_interfaces = { + 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; + o.back()->hb_pingtime[30] = gen_interfaces; +} + +// -- pg_t -- + +int pg_t::print(char *o, int maxlen) const +{ + return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps()); +} + +bool pg_t::parse(const char 
*s) +{ + uint64_t ppool; + uint32_t pseed; + int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed); + if (r < 2) + return false; + m_pool = ppool; + m_seed = pseed; + return true; +} + +bool spg_t::parse(const char *s) +{ + shard = shard_id_t::NO_SHARD; + uint64_t ppool; + uint32_t pseed; + uint32_t pshard; + int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed); + if (r < 2) + return false; + pgid.set_pool(ppool); + pgid.set_ps(pseed); + + const char *p = strchr(s, 's'); + if (p) { + r = sscanf(p, "s%u", &pshard); + if (r == 1) { + shard = shard_id_t(pshard); + } else { + return false; + } + } + return true; +} + +char *spg_t::calc_name(char *buf, const char *suffix_backwords) const +{ + while (*suffix_backwords) + *--buf = *suffix_backwords++; + + if (!is_no_shard()) { + buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf); + *--buf = 's'; + } + + return pgid.calc_name(buf, ""); +} + +ostream& operator<<(ostream& out, const spg_t &pg) +{ + char buf[spg_t::calc_name_buf_size]; + buf[spg_t::calc_name_buf_size - 1] = '\0'; + out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, ""); + return out; +} + +pg_t pg_t::get_ancestor(unsigned old_pg_num) const +{ + int old_bits = cbits(old_pg_num); + int old_mask = (1 << old_bits) - 1; + pg_t ret = *this; + ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask); + return ret; +} + +bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const +{ + //ceph_assert(m_seed < old_pg_num); + if (m_seed >= old_pg_num) { + // degenerate case + return false; + } + if (new_pg_num <= old_pg_num) + return false; + + bool split = false; + if (true) { + unsigned old_bits = cbits(old_pg_num); + unsigned old_mask = (1 << old_bits) - 1; + for (unsigned n = 1; ; n++) { + unsigned next_bit = (n << (old_bits-1)); + unsigned s = next_bit | m_seed; + + if (s < old_pg_num || s == m_seed) + continue; + if (s >= new_pg_num) + break; + if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) { + split = true; + if (children) + children->insert(pg_t(s, m_pool)); + } + } + } + if (false) { + // brute force + int old_bits = cbits(old_pg_num); + int old_mask = (1 << old_bits) - 1; + for (unsigned x = old_pg_num; x < new_pg_num; ++x) { + unsigned o = ceph_stable_mod(x, old_pg_num, old_mask); + if (o == m_seed) { + split = true; + children->insert(pg_t(x, m_pool)); + } + } + } + return split; +} + +unsigned pg_t::get_split_bits(unsigned pg_num) const { + if (pg_num == 1) + return 0; + ceph_assert(pg_num > 1); + + // Find unique p such that pg_num \in [2^(p-1), 2^p) + unsigned p = cbits(pg_num); + ceph_assert(p); // silence coverity #751330 + + if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1)))) + return p; + else + return p - 1; +} + +bool pg_t::is_merge_source( + unsigned old_pg_num, + unsigned new_pg_num, + pg_t *parent) const +{ + if (m_seed < old_pg_num && + m_seed >= new_pg_num) { + if (parent) { + pg_t t = *this; + while (t.m_seed >= new_pg_num) { + t = t.get_parent(); + } + *parent = t; + } + return true; + } + return false; +} + +pg_t pg_t::get_parent() const +{ + unsigned bits = cbits(m_seed); + ceph_assert(bits); + pg_t retval = *this; + retval.m_seed &= ~((~0)<<(bits - 1)); + return retval; +} + +hobject_t pg_t::get_hobj_start() const +{ + return hobject_t(object_t(), string(), 0, m_seed, m_pool, + string()); +} + +hobject_t pg_t::get_hobj_end(unsigned pg_num) const +{ + // note: this assumes a bitwise sort; with the legacy nibblewise + // sort a PG did not always cover a single contiguous range of the + // 
(bit-reversed) hash range. + unsigned bits = get_split_bits(pg_num); + uint64_t rev_start = hobject_t::_reverse_bits(m_seed); + uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1; + if (rev_end >= 0x100000000) { + ceph_assert(rev_end == 0x100000000); + return hobject_t::get_max(); + } else { + return hobject_t(object_t(), string(), CEPH_NOSNAP, + hobject_t::_reverse_bits(rev_end), m_pool, + string()); + } +} + +void pg_t::dump(Formatter *f) const +{ + f->dump_unsigned("pool", m_pool); + f->dump_unsigned("seed", m_seed); +} + +void pg_t::generate_test_instances(list<pg_t*>& o) +{ + o.push_back(new pg_t); + o.push_back(new pg_t(1, 2)); + o.push_back(new pg_t(13123, 3)); + o.push_back(new pg_t(131223, 4)); +} + +char *pg_t::calc_name(char *buf, const char *suffix_backwords) const +{ + while (*suffix_backwords) + *--buf = *suffix_backwords++; + + buf = ritoa<uint32_t, 16>(m_seed, buf); + + *--buf = '.'; + + return ritoa<uint64_t, 10>(m_pool, buf); +} + +ostream& operator<<(ostream& out, const pg_t &pg) +{ + char buf[pg_t::calc_name_buf_size]; + buf[pg_t::calc_name_buf_size - 1] = '\0'; + out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, ""); + return out; +} + + +// -- coll_t -- + +void coll_t::calc_str() +{ + switch (type) { + case TYPE_META: + strcpy(_str_buff, "meta"); + _str = _str_buff; + break; + case TYPE_PG: + _str_buff[spg_t::calc_name_buf_size - 1] = '\0'; + _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_"); + break; + case TYPE_PG_TEMP: + _str_buff[spg_t::calc_name_buf_size - 1] = '\0'; + _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_"); + break; + default: + ceph_abort_msg("unknown collection type"); + } +} + +bool coll_t::parse(const std::string& s) +{ + if (s == "meta") { + type = TYPE_META; + pgid = spg_t(); + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + if (s.find("_head") == s.length() - 5 && + pgid.parse(s.substr(0, s.length() - 5))) { + type = TYPE_PG; + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + if (s.find("_TEMP") == s.length() - 5 && + pgid.parse(s.substr(0, s.length() - 5))) { + type = TYPE_PG_TEMP; + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + return false; +} + +void coll_t::encode(bufferlist& bl) const +{ + using ceph::encode; + // when changing this, remember to update encoded_size() too. + if (is_temp()) { + // can't express this as v2... + __u8 struct_v = 3; + encode(struct_v, bl); + encode(to_str(), bl); + } else { + __u8 struct_v = 2; + encode(struct_v, bl); + encode((__u8)type, bl); + encode(pgid, bl); + snapid_t snap = CEPH_NOSNAP; + encode(snap, bl); + } +} + +size_t coll_t::encoded_size() const +{ + size_t r = sizeof(__u8); + if (is_temp()) { + // v3 + r += sizeof(__u32); + if (_str) { + r += strlen(_str); + } + } else { + // v2 + // 1. type + r += sizeof(__u8); + // 2. pgid + // - encoding header + r += sizeof(ceph_le32) + 2 * sizeof(__u8); + // - pg_t + r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t); + // - shard_id_t + r += sizeof(int8_t); + // 3. 
snapid_t + r += sizeof(uint64_t); + } + + return r; +} + +void coll_t::decode(bufferlist::const_iterator& bl) +{ + using ceph::decode; + __u8 struct_v; + decode(struct_v, bl); + switch (struct_v) { + case 1: + { + snapid_t snap; + decode(pgid, bl); + decode(snap, bl); + + // infer the type + if (pgid == spg_t() && snap == 0) { + type = TYPE_META; + } else { + type = TYPE_PG; + } + removal_seq = 0; + } + break; + + case 2: + { + __u8 _type; + snapid_t snap; + decode(_type, bl); + decode(pgid, bl); + decode(snap, bl); + type = (type_t)_type; + removal_seq = 0; + } + break; + + case 3: + { + string str; + decode(str, bl); + bool ok = parse(str); + if (!ok) + throw std::domain_error(std::string("unable to parse pg ") + str); + } + break; + + default: + { + ostringstream oss; + oss << "coll_t::decode(): don't know how to decode version " + << struct_v; + throw std::domain_error(oss.str()); + } + } +} + +void coll_t::dump(Formatter *f) const +{ + f->dump_unsigned("type_id", (unsigned)type); + if (type != TYPE_META) + f->dump_stream("pgid") << pgid; + f->dump_string("name", to_str()); +} + +void coll_t::generate_test_instances(list<coll_t*>& o) +{ + o.push_back(new coll_t()); + o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD))); + o.push_back(new coll_t(o.back()->get_temp())); + o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12)))); + o.push_back(new coll_t(o.back()->get_temp())); + o.push_back(new coll_t()); +} + +// --- + +std::string pg_vector_string(const vector<int32_t> &a) +{ + ostringstream oss; + oss << "["; + for (vector<int32_t>::const_iterator i = a.begin(); i != a.end(); ++i) { + if (i != a.begin()) + oss << ","; + if (*i != CRUSH_ITEM_NONE) + oss << *i; + else + oss << "NONE"; + } + oss << "]"; + return oss.str(); +} + +std::string pg_state_string(uint64_t state) +{ + ostringstream oss; + if (state & PG_STATE_STALE) + oss << "stale+"; + if (state & PG_STATE_CREATING) + oss << "creating+"; + if (state & PG_STATE_ACTIVE) + oss << "active+"; + if (state & PG_STATE_ACTIVATING) + oss << "activating+"; + if (state & PG_STATE_CLEAN) + oss << "clean+"; + if (state & PG_STATE_RECOVERY_WAIT) + oss << "recovery_wait+"; + if (state & PG_STATE_RECOVERY_TOOFULL) + oss << "recovery_toofull+"; + if (state & PG_STATE_RECOVERING) + oss << "recovering+"; + if (state & PG_STATE_FORCED_RECOVERY) + oss << "forced_recovery+"; + if (state & PG_STATE_DOWN) + oss << "down+"; + if (state & PG_STATE_RECOVERY_UNFOUND) + oss << "recovery_unfound+"; + if (state & PG_STATE_BACKFILL_UNFOUND) + oss << "backfill_unfound+"; + if (state & PG_STATE_UNDERSIZED) + oss << "undersized+"; + if (state & PG_STATE_DEGRADED) + oss << "degraded+"; + if (state & PG_STATE_REMAPPED) + oss << "remapped+"; + if (state & PG_STATE_PREMERGE) + oss << "premerge+"; + if (state & PG_STATE_SCRUBBING) + oss << "scrubbing+"; + if (state & PG_STATE_DEEP_SCRUB) + oss << "deep+"; + if (state & PG_STATE_INCONSISTENT) + oss << "inconsistent+"; + if (state & PG_STATE_PEERING) + oss << "peering+"; + if (state & PG_STATE_REPAIR) + oss << "repair+"; + if (state & PG_STATE_BACKFILL_WAIT) + oss << "backfill_wait+"; + if (state & PG_STATE_BACKFILLING) + oss << "backfilling+"; + if (state & PG_STATE_FORCED_BACKFILL) + oss << "forced_backfill+"; + if (state & PG_STATE_BACKFILL_TOOFULL) + oss << "backfill_toofull+"; + if (state & PG_STATE_INCOMPLETE) + oss << "incomplete+"; + if (state & PG_STATE_PEERED) + oss << "peered+"; + if (state & PG_STATE_SNAPTRIM) + oss << "snaptrim+"; + if (state & PG_STATE_SNAPTRIM_WAIT) + oss << 
"snaptrim_wait+"; + if (state & PG_STATE_SNAPTRIM_ERROR) + oss << "snaptrim_error+"; + if (state & PG_STATE_FAILED_REPAIR) + oss << "failed_repair+"; + string ret(oss.str()); + if (ret.length() > 0) + ret.resize(ret.length() - 1); + else + ret = "unknown"; + return ret; +} + +boost::optional<uint64_t> pg_string_state(const std::string& state) +{ + boost::optional<uint64_t> type; + if (state == "active") + type = PG_STATE_ACTIVE; + else if (state == "clean") + type = PG_STATE_CLEAN; + else if (state == "down") + type = PG_STATE_DOWN; + else if (state == "recovery_unfound") + type = PG_STATE_RECOVERY_UNFOUND; + else if (state == "backfill_unfound") + type = PG_STATE_BACKFILL_UNFOUND; + else if (state == "premerge") + type = PG_STATE_PREMERGE; + else if (state == "scrubbing") + type = PG_STATE_SCRUBBING; + else if (state == "degraded") + type = PG_STATE_DEGRADED; + else if (state == "inconsistent") + type = PG_STATE_INCONSISTENT; + else if (state == "peering") + type = PG_STATE_PEERING; + else if (state == "repair") + type = PG_STATE_REPAIR; + else if (state == "recovering") + type = PG_STATE_RECOVERING; + else if (state == "forced_recovery") + type = PG_STATE_FORCED_RECOVERY; + else if (state == "backfill_wait") + type = PG_STATE_BACKFILL_WAIT; + else if (state == "incomplete") + type = PG_STATE_INCOMPLETE; + else if (state == "stale") + type = PG_STATE_STALE; + else if (state == "remapped") + type = PG_STATE_REMAPPED; + else if (state == "deep") + type = PG_STATE_DEEP_SCRUB; + else if (state == "backfilling") + type = PG_STATE_BACKFILLING; + else if (state == "forced_backfill") + type = PG_STATE_FORCED_BACKFILL; + else if (state == "backfill_toofull") + type = PG_STATE_BACKFILL_TOOFULL; + else if (state == "recovery_wait") + type = PG_STATE_RECOVERY_WAIT; + else if (state == "recovery_toofull") + type = PG_STATE_RECOVERY_TOOFULL; + else if (state == "undersized") + type = PG_STATE_UNDERSIZED; + else if (state == "activating") + type = PG_STATE_ACTIVATING; + else if (state == "peered") + type = PG_STATE_PEERED; + else if (state == "snaptrim") + type = PG_STATE_SNAPTRIM; + else if (state == "snaptrim_wait") + type = PG_STATE_SNAPTRIM_WAIT; + else if (state == "snaptrim_error") + type = PG_STATE_SNAPTRIM_ERROR; + else if (state == "creating") + type = PG_STATE_CREATING; + else if (state == "failed_repair") + type = PG_STATE_FAILED_REPAIR; + else if (state == "unknown") + type = 0; + else + type = boost::none; + return type; +} + +// -- eversion_t -- +string eversion_t::get_key_name() const +{ + std::string key(32, ' '); + get_key_name(&key[0]); + key.resize(31); // remove the null terminator + return key; +} + +// -- pool_snap_info_t -- +void pool_snap_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("snapid", snapid); + f->dump_stream("stamp") << stamp; + f->dump_string("name", name); +} + +void pool_snap_info_t::encode(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGPOOL3) == 0) { + __u8 struct_v = 1; + encode(struct_v, bl); + encode(snapid, bl); + encode(stamp, bl); + encode(name, bl); + return; + } + ENCODE_START(2, 2, bl); + encode(snapid, bl); + encode(stamp, bl); + encode(name, bl); + ENCODE_FINISH(bl); +} + +void pool_snap_info_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(snapid, bl); + decode(stamp, bl); + decode(name, bl); + DECODE_FINISH(bl); +} + +void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o) +{ + o.push_back(new pool_snap_info_t); + 
o.push_back(new pool_snap_info_t); + o.back()->snapid = 1; + o.back()->stamp = utime_t(1, 2); + o.back()->name = "foo"; +} + +// -- pool_opts_t -- + +typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t; +static opt_mapping_t opt_mapping = boost::assign::map_list_of + ("scrub_min_interval", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE)) + ("scrub_max_interval", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE)) + ("deep_scrub_interval", pool_opts_t::opt_desc_t( + pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE)) + ("recovery_priority", pool_opts_t::opt_desc_t( + pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT)) + ("recovery_op_priority", pool_opts_t::opt_desc_t( + pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT)) + ("scrub_priority", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT)) + ("compression_mode", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR)) + ("compression_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR)) + ("compression_required_ratio", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE)) + ("compression_max_blob_size", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT)) + ("compression_min_blob_size", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT)) + ("csum_type", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_TYPE, pool_opts_t::INT)) + ("csum_max_block", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT)) + ("csum_min_block", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT)) + ("fingerprint_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR)) + ("pg_num_min", pool_opts_t::opt_desc_t( + pool_opts_t::PG_NUM_MIN, pool_opts_t::INT)) + ("target_size_bytes", pool_opts_t::opt_desc_t( + pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT)) + ("target_size_ratio", pool_opts_t::opt_desc_t( + pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE)) + ("pg_autoscale_bias", pool_opts_t::opt_desc_t( + pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE)); + +bool pool_opts_t::is_opt_name(const std::string& name) +{ + return opt_mapping.count(name); +} + +pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) +{ + opt_mapping_t::iterator i = opt_mapping.find(name); + ceph_assert(i != opt_mapping.end()); + return i->second; +} + +bool pool_opts_t::is_set(pool_opts_t::key_t key) const +{ + return opts.count(key); +} + +const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const +{ + opts_t::const_iterator i = opts.find(key); + ceph_assert(i != opts.end()); + return i->second; +} + +bool pool_opts_t::unset(pool_opts_t::key_t key) { + return opts.erase(key) > 0; +} + +class pool_opts_dumper_t : public boost::static_visitor<> { +public: + pool_opts_dumper_t(const std::string& name_, Formatter* f_) : + name(name_.c_str()), f(f_) {} + + void operator()(std::string s) const { + f->dump_string(name, s); + } + void operator()(int64_t i) const { + f->dump_int(name, i); + } + void operator()(double d) const { + f->dump_float(name, d); + } + +private: + const char* name; + Formatter* f; +}; + +void pool_opts_t::dump(const std::string& name, Formatter* f) const +{ + const opt_desc_t& desc = get_opt_desc(name); + opts_t::const_iterator i = opts.find(desc.key); + if (i == opts.end()) { + return; + 
} + boost::apply_visitor(pool_opts_dumper_t(name, f), i->second); +} + +void pool_opts_t::dump(Formatter* f) const +{ + for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end(); + ++i) { + const std::string& name = i->first; + const opt_desc_t& desc = i->second; + opts_t::const_iterator j = opts.find(desc.key); + if (j == opts.end()) { + continue; + } + boost::apply_visitor(pool_opts_dumper_t(name, f), j->second); + } +} + +class pool_opts_encoder_t : public boost::static_visitor<> { +public: + explicit pool_opts_encoder_t(bufferlist& bl_, uint64_t features) + : bl(bl_), + features(features) {} + + void operator()(const std::string &s) const { + encode(static_cast<int32_t>(pool_opts_t::STR), bl); + encode(s, bl); + } + void operator()(int64_t i) const { + encode(static_cast<int32_t>(pool_opts_t::INT), bl); + if (HAVE_FEATURE(features, SERVER_NAUTILUS)) { + encode(i, bl); + } else { + encode(static_cast<int32_t>(i), bl); + } + } + void operator()(double d) const { + encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl); + encode(d, bl); + } + +private: + bufferlist& bl; + uint64_t features; +}; + +void pool_opts_t::encode(bufferlist& bl, uint64_t features) const +{ + unsigned v = 2; + if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 1; + } + ENCODE_START(v, 1, bl); + uint32_t n = static_cast<uint32_t>(opts.size()); + encode(n, bl); + for (opts_t::const_iterator i = opts.begin(); i != opts.end(); ++i) { + encode(static_cast<int32_t>(i->first), bl); + boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second); + } + ENCODE_FINISH(bl); +} + +void pool_opts_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + __u32 n; + decode(n, bl); + opts.clear(); + while (n--) { + int32_t k, t; + decode(k, bl); + decode(t, bl); + if (t == STR) { + std::string s; + decode(s, bl); + opts[static_cast<key_t>(k)] = s; + } else if (t == INT) { + int64_t i; + if (struct_v >= 2) { + decode(i, bl); + } else { + int ii; + decode(ii, bl); + i = ii; + } + opts[static_cast<key_t>(k)] = i; + } else if (t == DOUBLE) { + double d; + decode(d, bl); + opts[static_cast<key_t>(k)] = d; + } else { + ceph_assert(!"invalid type"); + } + } + DECODE_FINISH(bl); +} + +ostream& operator<<(ostream& out, const pool_opts_t& opts) +{ + for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end(); + ++i) { + const std::string& name = i->first; + const pool_opts_t::opt_desc_t& desc = i->second; + pool_opts_t::opts_t::const_iterator j = opts.opts.find(desc.key); + if (j == opts.opts.end()) { + continue; + } + out << " " << name << " " << j->second; + } + return out; +} + +// -- pg_pool_t -- + +const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs"); +const char *pg_pool_t::APPLICATION_NAME_RBD("rbd"); +const char *pg_pool_t::APPLICATION_NAME_RGW("rgw"); + +void pg_pool_t::dump(Formatter *f) const +{ + f->dump_stream("create_time") << get_create_time(); + f->dump_unsigned("flags", get_flags()); + f->dump_string("flags_names", get_flags_string()); + f->dump_int("type", get_type()); + f->dump_int("size", get_size()); + f->dump_int("min_size", get_min_size()); + f->dump_int("crush_rule", get_crush_rule()); + f->dump_int("object_hash", get_object_hash()); + f->dump_string("pg_autoscale_mode", + get_pg_autoscale_mode_name(pg_autoscale_mode)); + f->dump_unsigned("pg_num", get_pg_num()); + f->dump_unsigned("pg_placement_num", get_pgp_num()); + f->dump_unsigned("pg_placement_num_target", get_pgp_num_target()); + f->dump_unsigned("pg_num_target", get_pg_num_target()); + 
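pool_opts_t above stores each option as a boost::variant of string, int64_t, or double keyed by an enum, with opt_mapping translating the user-visible option names into (key, type) descriptors. A compact sketch of the same idea using std::variant, covering only a few of the real option names:

// Sketch of the pool_opts_t pattern: name -> typed key, value held in a variant.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <variant>

enum class key_t { SCRUB_MIN_INTERVAL, RECOVERY_PRIORITY, COMPRESSION_MODE };
using value_t = std::variant<std::string, int64_t, double>;

static const std::map<std::string, key_t> opt_mapping = {
  {"scrub_min_interval", key_t::SCRUB_MIN_INTERVAL},   // DOUBLE in the real table
  {"recovery_priority",  key_t::RECOVERY_PRIORITY},    // INT
  {"compression_mode",   key_t::COMPRESSION_MODE},     // STR
};

struct PoolOpts {
  std::map<key_t, value_t> opts;
  void set(key_t k, value_t v) { opts[k] = std::move(v); }
  bool is_set(key_t k) const { return opts.count(k) != 0; }
};

int main() {
  PoolOpts o;
  o.set(opt_mapping.at("recovery_priority"), int64_t{5});
  o.set(opt_mapping.at("compression_mode"), std::string("aggressive"));
  std::cout << std::get<int64_t>(o.opts.at(key_t::RECOVERY_PRIORITY)) << "\n";  // 5
}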
f->dump_unsigned("pg_num_pending", get_pg_num_pending()); + f->dump_object("last_pg_merge_meta", last_pg_merge_meta); + f->dump_stream("last_change") << get_last_change(); + f->dump_stream("last_force_op_resend") << get_last_force_op_resend(); + f->dump_stream("last_force_op_resend_prenautilus") + << get_last_force_op_resend_prenautilus(); + f->dump_stream("last_force_op_resend_preluminous") + << get_last_force_op_resend_preluminous(); + f->dump_unsigned("auid", get_auid()); + f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged"); + f->dump_unsigned("snap_seq", get_snap_seq()); + f->dump_unsigned("snap_epoch", get_snap_epoch()); + f->open_array_section("pool_snaps"); + for (map<snapid_t, pool_snap_info_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) { + f->open_object_section("pool_snap_info"); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_stream("removed_snaps") << removed_snaps; + f->dump_unsigned("quota_max_bytes", quota_max_bytes); + f->dump_unsigned("quota_max_objects", quota_max_objects); + f->open_array_section("tiers"); + for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p) + f->dump_unsigned("pool_id", *p); + f->close_section(); + f->dump_int("tier_of", tier_of); + f->dump_int("read_tier", read_tier); + f->dump_int("write_tier", write_tier); + f->dump_string("cache_mode", get_cache_mode_name()); + f->dump_unsigned("target_max_bytes", target_max_bytes); + f->dump_unsigned("target_max_objects", target_max_objects); + f->dump_unsigned("cache_target_dirty_ratio_micro", + cache_target_dirty_ratio_micro); + f->dump_unsigned("cache_target_dirty_high_ratio_micro", + cache_target_dirty_high_ratio_micro); + f->dump_unsigned("cache_target_full_ratio_micro", + cache_target_full_ratio_micro); + f->dump_unsigned("cache_min_flush_age", cache_min_flush_age); + f->dump_unsigned("cache_min_evict_age", cache_min_evict_age); + f->dump_string("erasure_code_profile", erasure_code_profile); + f->open_object_section("hit_set_params"); + hit_set_params.dump(f); + f->close_section(); // hit_set_params + f->dump_unsigned("hit_set_period", hit_set_period); + f->dump_unsigned("hit_set_count", hit_set_count); + f->dump_bool("use_gmt_hitset", use_gmt_hitset); + f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); + f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote); + f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate); + f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n); + f->open_array_section("grade_table"); + for (unsigned i = 0; i < hit_set_count; ++i) + f->dump_unsigned("value", get_grade(i)); + f->close_section(); + f->dump_unsigned("stripe_width", get_stripe_width()); + f->dump_unsigned("expected_num_objects", expected_num_objects); + f->dump_bool("fast_read", fast_read); + f->open_object_section("options"); + opts.dump(f); + f->close_section(); // options + f->open_object_section("application_metadata"); + for (auto &app_pair : application_metadata) { + f->open_object_section(app_pair.first.c_str()); + for (auto &kv_pair : app_pair.second) { + f->dump_string(kv_pair.first.c_str(), kv_pair.second); + } + f->close_section(); // application + } + f->close_section(); // application_metadata +} + +void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const { + for (size_t i = 0; i < from.size(); ++i) { + if (from[i] != CRUSH_ITEM_NONE) { + to->insert( + pg_shard_t( + from[i], + is_erasure() ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + } +} + +void pg_pool_t::calc_pg_masks() +{ + pg_num_mask = (1 << cbits(pg_num-1)) - 1; + pgp_num_mask = (1 << cbits(pgp_num-1)) - 1; +} + +unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const +{ + if (pg_num == pg_num_mask + 1) + return pg_num; // power-of-2 split + unsigned mask = pg_num_mask >> 1; + if ((pgid.ps() & mask) < (pg_num & mask)) + return pg_num_mask + 1; // smaller bin size (already split) + else + return (pg_num_mask + 1) >> 1; // bigger bin (not yet split) +} + +bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const +{ + if (pg_num_pending >= pg_num) { + return false; + } + if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) { + if (target) { + *target = false; + } + return true; + } + for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) { + if (pg_t(ps, pgid.pool()).get_parent() == pgid) { + if (target) { + *target = true; + } + return true; + } + } + return false; +} + +/* + * we have two snap modes: + * - pool snaps + * - snap existence/non-existence defined by snaps[] and snap_seq + * - user managed snaps + * - existence tracked by librados user + */ +bool pg_pool_t::is_pool_snaps_mode() const +{ + return has_flag(FLAG_POOL_SNAPS); +} + +bool pg_pool_t::is_unmanaged_snaps_mode() const +{ + return has_flag(FLAG_SELFMANAGED_SNAPS); +} + +bool pg_pool_t::is_removed_snap(snapid_t s) const +{ + if (is_pool_snaps_mode()) + return s <= get_snap_seq() && snaps.count(s) == 0; + else + return removed_snaps.contains(s); +} + +/* + * build set of known-removed sets from either pool snaps or + * explicit removed_snaps set. + */ +void pg_pool_t::build_removed_snaps(interval_set<snapid_t>& rs) const +{ + if (is_pool_snaps_mode()) { + rs.clear(); + for (snapid_t s = 1; s <= get_snap_seq(); s = s + 1) + if (snaps.count(s) == 0) + rs.insert(s); + } else { + rs = removed_snaps; + } +} + +bool pg_pool_t::maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const +{ + if (is_unmanaged_snaps_mode()) { // remove_unmanaged_snap increments range_end + if (removed_snaps.empty() || cached.empty()) // range_end is undefined + return removed_snaps.empty() != cached.empty(); + return removed_snaps.range_end() != cached.range_end(); + } + return true; +} + +snapid_t pg_pool_t::snap_exists(const char *s) const +{ + for (map<snapid_t,pool_snap_info_t>::const_iterator p = snaps.begin(); + p != snaps.end(); + ++p) + if (p->second.name == s) + return p->second.snapid; + return 0; +} + +void pg_pool_t::add_snap(const char *n, utime_t stamp) +{ + ceph_assert(!is_unmanaged_snaps_mode()); + flags |= FLAG_POOL_SNAPS; + snapid_t s = get_snap_seq() + 1; + snap_seq = s; + snaps[s].snapid = s; + snaps[s].name = n; + snaps[s].stamp = stamp; +} + +void pg_pool_t::add_unmanaged_snap(uint64_t& snapid) +{ + ceph_assert(!is_pool_snaps_mode()); + if (snap_seq == 0) { + // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after + // mimic this field is not decoded but our flag is set; pre-mimic, we + // have a non-empty removed_snaps to signifiy a non-pool-snaps pool. 
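The comment above separates pool snapshots, whose existence is implied by snaps[] plus snap_seq, from self-managed snapshots, whose removal is tracked explicitly in removed_snaps. A simplified illustration of the is_removed_snap() distinction, with a std::set standing in for interval_set:

// Sketch: how "is this snap removed?" differs between the two snap modes.
#include <iostream>
#include <map>
#include <set>
#include <string>

using snapid_t = unsigned long long;

struct PoolSnaps {
  bool pool_snaps_mode = true;               // FLAG_POOL_SNAPS vs FLAG_SELFMANAGED_SNAPS
  snapid_t snap_seq = 0;
  std::map<snapid_t, std::string> snaps;     // pool snaps that still exist
  std::set<snapid_t> removed_snaps;          // self-managed mode: explicit removals

  bool is_removed(snapid_t s) const {
    if (pool_snaps_mode)
      return s <= snap_seq && snaps.count(s) == 0;   // allocated once, now absent
    return removed_snaps.count(s) != 0;
  }
};

int main() {
  PoolSnaps p;
  p.snap_seq = 3;
  p.snaps = {{1, "daily"}, {3, "weekly"}};   // snap 2 was deleted
  std::cout << p.is_removed(2) << p.is_removed(3) << p.is_removed(4) << "\n";  // 100
}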
+ removed_snaps.insert(snapid_t(1)); + snap_seq = 1; + } + flags |= FLAG_SELFMANAGED_SNAPS; + snapid = snap_seq = snap_seq + 1; +} + +void pg_pool_t::remove_snap(snapid_t s) +{ + ceph_assert(snaps.count(s)); + snaps.erase(s); + snap_seq = snap_seq + 1; +} + +void pg_pool_t::remove_unmanaged_snap(snapid_t s) +{ + ceph_assert(is_unmanaged_snaps_mode()); + removed_snaps.insert(s); + snap_seq = snap_seq + 1; + // try to add in the new seq, just to try to keep the interval_set contiguous + if (!removed_snaps.contains(get_snap_seq())) { + removed_snaps.insert(get_snap_seq()); + } +} + +SnapContext pg_pool_t::get_snap_context() const +{ + vector<snapid_t> s(snaps.size()); + unsigned i = 0; + for (map<snapid_t, pool_snap_info_t>::const_reverse_iterator p = snaps.rbegin(); + p != snaps.rend(); + ++p) + s[i++] = p->first; + return SnapContext(get_snap_seq(), s); +} + +uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const +{ + if (ns.empty()) + return ceph_str_hash(object_hash, key.data(), key.length()); + int nsl = ns.length(); + int len = key.length() + nsl + 1; + char buf[len]; + memcpy(&buf[0], ns.data(), nsl); + buf[nsl] = '\037'; + memcpy(&buf[nsl+1], key.data(), key.length()); + return ceph_str_hash(object_hash, &buf[0], len); +} + +uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const +{ + return ceph_stable_mod(v, pg_num, pg_num_mask); +} + +/* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ +pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const +{ + pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask)); + return pg; +} + +/* + * map raw pg (full precision ps) into a placement seed. include + * pool id in that value so that different pools don't use the same + * seeds. + */ +ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const +{ + if (flags & FLAG_HASHPSPOOL) { + // Hash the pool id so that pool PGs do not overlap. + return + crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask), + pg.pool()); + } else { + // Legacy behavior; add ps and pool together. This is not a great + // idea because the PGs from each pool will essentially overlap on + // top of each other: 0.5 == 1.4 == 2.3 == ... + return + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + + pg.pool(); + } +} + +uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const +{ + uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123); + if (pg_num == pg_num_mask + 1) { + r &= ~pg_num_mask; + } else { + unsigned smaller_mask = pg_num_mask >> 1; + if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) { + r &= ~pg_num_mask; + } else { + r &= ~smaller_mask; + } + } + r |= pg.ps(); + return r; +} + +void pg_pool_t::encode(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGPOOL3) == 0) { + // this encoding matches the old struct ceph_pg_pool + __u8 struct_v = 2; + encode(struct_v, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
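raw_pg_to_pps() above mixes the pool id into the placement seed when FLAG_HASHPSPOOL is set, so seeds from different pools no longer collide the way the legacy ps-plus-pool sum does (0.5 == 1.4 == 2.3 == ...). A sketch of the two behaviours with a stand-in mixing function (the real code uses crush_hash32_2 with RJENKINS1):

// Stand-in 2-input mix; not crush_hash32_2, just an illustration of the idea.
#include <cstdint>
#include <iostream>

static uint32_t mix2(uint32_t a, uint32_t b) {
  uint64_t x = (uint64_t(a) << 32) | b;
  x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33;
  return uint32_t(x);
}

// Placement seed for a (pool, ps) pair, with and without "hashpspool".
uint32_t raw_pg_to_pps(uint64_t pool, uint32_t ps_mod_pgp, bool hashpspool) {
  if (hashpspool)
    return mix2(ps_mod_pgp, uint32_t(pool));   // pools get disjoint seed spaces
  return ps_mod_pgp + uint32_t(pool);          // legacy: 0.5 == 1.4 == 2.3 == ...
}

int main() {
  std::cout << raw_pg_to_pps(0, 5, false) << " "    // 5
            << raw_pg_to_pps(1, 4, false) << "\n";  // 5 again: legacy overlap
  std::cout << (raw_pg_to_pps(0, 5, true) ==
                raw_pg_to_pps(1, 4, true)) << "\n"; // almost surely 0
}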
+ encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + + __u32 n = snaps.size(); + encode(n, bl); + n = removed_snaps.num_intervals(); + encode(n, bl); + + encode(auid, bl); + + encode_nohead(snaps, bl, features); + encode_nohead(removed_snaps, bl); + return; + } + + if ((features & CEPH_FEATURE_OSDENC) == 0) { + __u8 struct_v = 4; + encode(struct_v, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. + encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + encode(flags, bl); + encode((uint32_t)0, bl); // crash_replay_interval + return; + } + + if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) { + // we simply added last_force_op_resend here, which is a fully + // backward compatible change. however, encoding the same map + // differently between monitors triggers scrub noise (even though + // they are decodable without the feature), so let's be pendantic + // about it. + ENCODE_START(14, 5, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. + encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + encode(flags, bl); + encode((uint32_t)0, bl); // crash_replay_interval + encode(min_size, bl); + encode(quota_max_bytes, bl); + encode(quota_max_objects, bl); + encode(tiers, bl); + encode(tier_of, bl); + __u8 c = cache_mode; + encode(c, bl); + encode(read_tier, bl); + encode(write_tier, bl); + encode(properties, bl); + encode(hit_set_params, bl); + encode(hit_set_period, bl); + encode(hit_set_count, bl); + encode(stripe_width, bl); + encode(target_max_bytes, bl); + encode(target_max_objects, bl); + encode(cache_target_dirty_ratio_micro, bl); + encode(cache_target_full_ratio_micro, bl); + encode(cache_min_flush_age, bl); + encode(cache_min_evict_age, bl); + encode(erasure_code_profile, bl); + ENCODE_FINISH(bl); + return; + } + + uint8_t v = 29; + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) { + // this was the first post-hammer thing we added; if it's missing, encode + // like hammer. + v = 21; + } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 24; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 26; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 27; + } + + ENCODE_START(v, 5, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
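+  // To illustrate the struct_v downgrade above with one case: a peer that
+  // lacks SERVER_NAUTILUS gets v = 27, so the pg_num_target/pg_num_pending
+  // block guarded by "v >= 28" below is simply not encoded for it.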
+ encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + if (v >= 27) { + encode(flags, bl); + } else { + auto tmp = flags; + tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING); + encode(tmp, bl); + } + encode((uint32_t)0, bl); // crash_replay_interval + encode(min_size, bl); + encode(quota_max_bytes, bl); + encode(quota_max_objects, bl); + encode(tiers, bl); + encode(tier_of, bl); + __u8 c = cache_mode; + encode(c, bl); + encode(read_tier, bl); + encode(write_tier, bl); + encode(properties, bl); + encode(hit_set_params, bl); + encode(hit_set_period, bl); + encode(hit_set_count, bl); + encode(stripe_width, bl); + encode(target_max_bytes, bl); + encode(target_max_objects, bl); + encode(cache_target_dirty_ratio_micro, bl); + encode(cache_target_full_ratio_micro, bl); + encode(cache_min_flush_age, bl); + encode(cache_min_evict_age, bl); + encode(erasure_code_profile, bl); + encode(last_force_op_resend_preluminous, bl); + encode(min_read_recency_for_promote, bl); + encode(expected_num_objects, bl); + if (v >= 19) { + encode(cache_target_dirty_high_ratio_micro, bl); + } + if (v >= 20) { + encode(min_write_recency_for_promote, bl); + } + if (v >= 21) { + encode(use_gmt_hitset, bl); + } + if (v >= 22) { + encode(fast_read, bl); + } + if (v >= 23) { + encode(hit_set_grade_decay_rate, bl); + encode(hit_set_search_last_n, bl); + } + if (v >= 24) { + encode(opts, bl, features); + } + if (v >= 25) { + encode(last_force_op_resend_prenautilus, bl); + } + if (v >= 26) { + encode(application_metadata, bl); + } + if (v >= 27) { + encode(create_time, bl); + } + if (v >= 28) { + encode(pg_num_target, bl); + encode(pgp_num_target, bl); + encode(pg_num_pending, bl); + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01] + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01] + encode(last_force_op_resend, bl); + encode(pg_autoscale_mode, bl); + } + if (v >= 29) { + encode(last_pg_merge_meta, bl); + } + ENCODE_FINISH(bl); +} + +void pg_pool_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(29, 5, 5, bl); + decode(type, bl); + decode(size, bl); + decode(crush_rule, bl); + decode(object_hash, bl); + decode(pg_num, bl); + decode(pgp_num, bl); + { + __u32 lpg_num, lpgp_num; + decode(lpg_num, bl); + decode(lpgp_num, bl); + } + decode(last_change, bl); + decode(snap_seq, bl); + decode(snap_epoch, bl); + + if (struct_v >= 3) { + decode(snaps, bl); + decode(removed_snaps, bl); + decode(auid, bl); + } else { + __u32 n, m; + decode(n, bl); + decode(m, bl); + decode(auid, bl); + decode_nohead(n, snaps, bl); + decode_nohead(m, removed_snaps, bl); + } + + if (struct_v >= 4) { + decode(flags, bl); + uint32_t crash_replay_interval; + decode(crash_replay_interval, bl); + } else { + flags = 0; + } + // upgrade path for selfmanaged vs pool snaps + if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) { + if (!removed_snaps.empty()) { + flags |= FLAG_SELFMANAGED_SNAPS; + } else { + flags |= FLAG_POOL_SNAPS; + } + } + if (struct_v >= 7) { + decode(min_size, bl); + } else { + min_size = size - size/2; + } + if (struct_v >= 8) { + decode(quota_max_bytes, bl); + decode(quota_max_objects, bl); + } + if (struct_v >= 9) { + decode(tiers, bl); + decode(tier_of, bl); + __u8 v; + decode(v, bl); + cache_mode = (cache_mode_t)v; + decode(read_tier, bl); + decode(write_tier, bl); + } + if 
(struct_v >= 10) { + decode(properties, bl); + } + if (struct_v >= 11) { + decode(hit_set_params, bl); + decode(hit_set_period, bl); + decode(hit_set_count, bl); + } else { + pg_pool_t def; + hit_set_period = def.hit_set_period; + hit_set_count = def.hit_set_count; + } + if (struct_v >= 12) { + decode(stripe_width, bl); + } else { + set_stripe_width(0); + } + if (struct_v >= 13) { + decode(target_max_bytes, bl); + decode(target_max_objects, bl); + decode(cache_target_dirty_ratio_micro, bl); + decode(cache_target_full_ratio_micro, bl); + decode(cache_min_flush_age, bl); + decode(cache_min_evict_age, bl); + } else { + target_max_bytes = 0; + target_max_objects = 0; + cache_target_dirty_ratio_micro = 0; + cache_target_full_ratio_micro = 0; + cache_min_flush_age = 0; + cache_min_evict_age = 0; + } + if (struct_v >= 14) { + decode(erasure_code_profile, bl); + } + if (struct_v >= 15) { + decode(last_force_op_resend_preluminous, bl); + } else { + last_force_op_resend_preluminous = 0; + } + if (struct_v >= 16) { + decode(min_read_recency_for_promote, bl); + } else { + min_read_recency_for_promote = 1; + } + if (struct_v >= 17) { + decode(expected_num_objects, bl); + } else { + expected_num_objects = 0; + } + if (struct_v >= 19) { + decode(cache_target_dirty_high_ratio_micro, bl); + } else { + cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro; + } + if (struct_v >= 20) { + decode(min_write_recency_for_promote, bl); + } else { + min_write_recency_for_promote = 1; + } + if (struct_v >= 21) { + decode(use_gmt_hitset, bl); + } else { + use_gmt_hitset = false; + } + if (struct_v >= 22) { + decode(fast_read, bl); + } else { + fast_read = false; + } + if (struct_v >= 23) { + decode(hit_set_grade_decay_rate, bl); + decode(hit_set_search_last_n, bl); + } else { + hit_set_grade_decay_rate = 0; + hit_set_search_last_n = 1; + } + if (struct_v >= 24) { + decode(opts, bl); + } + if (struct_v >= 25) { + decode(last_force_op_resend_prenautilus, bl); + } else { + last_force_op_resend_prenautilus = last_force_op_resend_preluminous; + } + if (struct_v >= 26) { + decode(application_metadata, bl); + } + if (struct_v >= 27) { + decode(create_time, bl); + } + if (struct_v >= 28) { + decode(pg_num_target, bl); + decode(pgp_num_target, bl); + decode(pg_num_pending, bl); + epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started; + decode(old_merge_last_epoch_started, bl); + decode(old_merge_last_epoch_clean, bl); + decode(last_force_op_resend, bl); + decode(pg_autoscale_mode, bl); + if (struct_v >= 29) { + decode(last_pg_merge_meta, bl); + } else { + last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean; + last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started; + } + } else { + pg_num_target = pg_num; + pgp_num_target = pgp_num; + pg_num_pending = pg_num; + last_force_op_resend = last_force_op_resend_prenautilus; + pg_autoscale_mode = PG_AUTOSCALE_MODE_WARN; // default to warn on upgrade + } + DECODE_FINISH(bl); + calc_pg_masks(); + calc_grade_table(); +} + +void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o) +{ + pg_pool_t a; + o.push_back(new pg_pool_t(a)); + + a.create_time = utime_t(4,5); + a.type = TYPE_REPLICATED; + a.size = 2; + a.crush_rule = 3; + a.object_hash = 4; + a.pg_num = 6; + a.pgp_num = 4; + a.pgp_num_target = 4; + a.pg_num_target = 5; + a.pg_num_pending = 5; + a.last_pg_merge_meta.last_epoch_started = 2; + a.last_pg_merge_meta.last_epoch_clean = 2; + a.last_change = 9; + a.last_force_op_resend = 123823; + a.last_force_op_resend_preluminous = 
123824; + a.snap_seq = 10; + a.snap_epoch = 11; + a.flags = FLAG_POOL_SNAPS; + a.auid = 12; + a.quota_max_bytes = 473; + a.quota_max_objects = 474; + o.push_back(new pg_pool_t(a)); + + a.snaps[3].name = "asdf"; + a.snaps[3].snapid = 3; + a.snaps[3].stamp = utime_t(123, 4); + a.snaps[6].name = "qwer"; + a.snaps[6].snapid = 6; + a.snaps[6].stamp = utime_t(23423, 4); + o.push_back(new pg_pool_t(a)); + + a.flags = FLAG_SELFMANAGED_SNAPS; + a.snaps.clear(); + a.removed_snaps.insert(2); + a.quota_max_bytes = 2473; + a.quota_max_objects = 4374; + a.tiers.insert(0); + a.tiers.insert(1); + a.tier_of = 2; + a.cache_mode = CACHEMODE_WRITEBACK; + a.read_tier = 1; + a.write_tier = 1; + a.hit_set_params = HitSet::Params(new BloomHitSet::Params); + a.hit_set_period = 3600; + a.hit_set_count = 8; + a.min_read_recency_for_promote = 1; + a.min_write_recency_for_promote = 1; + a.hit_set_grade_decay_rate = 50; + a.hit_set_search_last_n = 1; + a.calc_grade_table(); + a.set_stripe_width(12345); + a.target_max_bytes = 1238132132; + a.target_max_objects = 1232132; + a.cache_target_dirty_ratio_micro = 187232; + a.cache_target_dirty_high_ratio_micro = 309856; + a.cache_target_full_ratio_micro = 987222; + a.cache_min_flush_age = 231; + a.cache_min_evict_age = 2321; + a.erasure_code_profile = "profile in osdmap"; + a.expected_num_objects = 123456; + a.fast_read = false; + a.application_metadata = {{"rbd", {{"key", "value"}}}}; + o.push_back(new pg_pool_t(a)); +} + +ostream& operator<<(ostream& out, const pg_pool_t& p) +{ + out << p.get_type_name() + << " size " << p.get_size() + << " min_size " << p.get_min_size() + << " crush_rule " << p.get_crush_rule() + << " object_hash " << p.get_object_hash_name() + << " pg_num " << p.get_pg_num() + << " pgp_num " << p.get_pgp_num(); + if (p.get_pg_num_target() != p.get_pg_num()) { + out << " pg_num_target " << p.get_pg_num_target(); + } + if (p.get_pgp_num_target() != p.get_pgp_num()) { + out << " pgp_num_target " << p.get_pgp_num_target(); + } + if (p.get_pg_num_pending() != p.get_pg_num()) { + out << " pg_num_pending " << p.get_pg_num_pending(); + } + if (p.pg_autoscale_mode) { + out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode); + } + out << " last_change " << p.get_last_change(); + if (p.get_last_force_op_resend() || + p.get_last_force_op_resend_prenautilus() || + p.get_last_force_op_resend_preluminous()) + out << " lfor " << p.get_last_force_op_resend() << "/" + << p.get_last_force_op_resend_prenautilus() << "/" + << p.get_last_force_op_resend_preluminous(); + if (p.get_auid()) + out << " owner " << p.get_auid(); + if (p.flags) + out << " flags " << p.get_flags_string(); + if (p.quota_max_bytes) + out << " max_bytes " << p.quota_max_bytes; + if (p.quota_max_objects) + out << " max_objects " << p.quota_max_objects; + if (!p.tiers.empty()) + out << " tiers " << p.tiers; + if (p.is_tier()) + out << " tier_of " << p.tier_of; + if (p.has_read_tier()) + out << " read_tier " << p.read_tier; + if (p.has_write_tier()) + out << " write_tier " << p.write_tier; + if (p.cache_mode) + out << " cache_mode " << p.get_cache_mode_name(); + if (p.target_max_bytes) + out << " target_bytes " << p.target_max_bytes; + if (p.target_max_objects) + out << " target_objects " << p.target_max_objects; + if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) { + out << " hit_set " << p.hit_set_params + << " " << p.hit_set_period << "s" + << " x" << p.hit_set_count << " decay_rate " + << p.hit_set_grade_decay_rate + << " search_last_n " << p.hit_set_search_last_n; + } + if 
(p.min_read_recency_for_promote) + out << " min_read_recency_for_promote " << p.min_read_recency_for_promote; + if (p.min_write_recency_for_promote) + out << " min_write_recency_for_promote " << p.min_write_recency_for_promote; + out << " stripe_width " << p.get_stripe_width(); + if (p.expected_num_objects) + out << " expected_num_objects " << p.expected_num_objects; + if (p.fast_read) + out << " fast_read " << p.fast_read; + out << p.opts; + if (!p.application_metadata.empty()) { + out << " application "; + for (auto it = p.application_metadata.begin(); + it != p.application_metadata.end(); ++it) { + if (it != p.application_metadata.begin()) + out << ","; + out << it->first; + } + } + return out; +} + + +// -- object_stat_sum_t -- + +void object_stat_sum_t::dump(Formatter *f) const +{ + f->dump_int("num_bytes", num_bytes); + f->dump_int("num_objects", num_objects); + f->dump_int("num_object_clones", num_object_clones); + f->dump_int("num_object_copies", num_object_copies); + f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary); + f->dump_int("num_objects_missing", num_objects_missing); + f->dump_int("num_objects_degraded", num_objects_degraded); + f->dump_int("num_objects_misplaced", num_objects_misplaced); + f->dump_int("num_objects_unfound", num_objects_unfound); + f->dump_int("num_objects_dirty", num_objects_dirty); + f->dump_int("num_whiteouts", num_whiteouts); + f->dump_int("num_read", num_rd); + f->dump_int("num_read_kb", num_rd_kb); + f->dump_int("num_write", num_wr); + f->dump_int("num_write_kb", num_wr_kb); + f->dump_int("num_scrub_errors", num_scrub_errors); + f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors); + f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors); + f->dump_int("num_objects_recovered", num_objects_recovered); + f->dump_int("num_bytes_recovered", num_bytes_recovered); + f->dump_int("num_keys_recovered", num_keys_recovered); + f->dump_int("num_objects_omap", num_objects_omap); + f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive); + f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive); + f->dump_int("num_flush", num_flush); + f->dump_int("num_flush_kb", num_flush_kb); + f->dump_int("num_evict", num_evict); + f->dump_int("num_evict_kb", num_evict_kb); + f->dump_int("num_promote", num_promote); + f->dump_int("num_flush_mode_high", num_flush_mode_high); + f->dump_int("num_flush_mode_low", num_flush_mode_low); + f->dump_int("num_evict_mode_some", num_evict_mode_some); + f->dump_int("num_evict_mode_full", num_evict_mode_full); + f->dump_int("num_objects_pinned", num_objects_pinned); + f->dump_int("num_legacy_snapsets", num_legacy_snapsets); + f->dump_int("num_large_omap_objects", num_large_omap_objects); + f->dump_int("num_objects_manifest", num_objects_manifest); + f->dump_int("num_omap_bytes", num_omap_bytes); + f->dump_int("num_omap_keys", num_omap_keys); + f->dump_int("num_objects_repaired", num_objects_repaired); +} + +void object_stat_sum_t::encode(bufferlist& bl) const +{ + ENCODE_START(20, 14, bl); +#if defined(CEPH_LITTLE_ENDIAN) + bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t)); +#else + encode(num_bytes, bl); + encode(num_objects, bl); + encode(num_object_clones, bl); + encode(num_object_copies, bl); + encode(num_objects_missing_on_primary, bl); + encode(num_objects_degraded, bl); + encode(num_objects_unfound, bl); + encode(num_rd, bl); + encode(num_rd_kb, bl); + encode(num_wr, bl); + encode(num_wr_kb, bl); + encode(num_scrub_errors, bl); + 
encode(num_objects_recovered, bl); + encode(num_bytes_recovered, bl); + encode(num_keys_recovered, bl); + encode(num_shallow_scrub_errors, bl); + encode(num_deep_scrub_errors, bl); + encode(num_objects_dirty, bl); + encode(num_whiteouts, bl); + encode(num_objects_omap, bl); + encode(num_objects_hit_set_archive, bl); + encode(num_objects_misplaced, bl); + encode(num_bytes_hit_set_archive, bl); + encode(num_flush, bl); + encode(num_flush_kb, bl); + encode(num_evict, bl); + encode(num_evict_kb, bl); + encode(num_promote, bl); + encode(num_flush_mode_high, bl); + encode(num_flush_mode_low, bl); + encode(num_evict_mode_some, bl); + encode(num_evict_mode_full, bl); + encode(num_objects_pinned, bl); + encode(num_objects_missing, bl); + encode(num_legacy_snapsets, bl); + encode(num_large_omap_objects, bl); + encode(num_objects_manifest, bl); + encode(num_omap_bytes, bl); + encode(num_omap_keys, bl); + encode(num_objects_repaired, bl); +#endif + ENCODE_FINISH(bl); +} + +void object_stat_sum_t::decode(bufferlist::const_iterator& bl) +{ + bool decode_finish = false; + static const int STAT_SUM_DECODE_VERSION = 20; + DECODE_START(STAT_SUM_DECODE_VERSION, bl); +#if defined(CEPH_LITTLE_ENDIAN) + if (struct_v == STAT_SUM_DECODE_VERSION) { + bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes)); + decode_finish = true; + } +#endif + if (!decode_finish) { + decode(num_bytes, bl); + decode(num_objects, bl); + decode(num_object_clones, bl); + decode(num_object_copies, bl); + decode(num_objects_missing_on_primary, bl); + decode(num_objects_degraded, bl); + decode(num_objects_unfound, bl); + decode(num_rd, bl); + decode(num_rd_kb, bl); + decode(num_wr, bl); + decode(num_wr_kb, bl); + decode(num_scrub_errors, bl); + decode(num_objects_recovered, bl); + decode(num_bytes_recovered, bl); + decode(num_keys_recovered, bl); + decode(num_shallow_scrub_errors, bl); + decode(num_deep_scrub_errors, bl); + decode(num_objects_dirty, bl); + decode(num_whiteouts, bl); + decode(num_objects_omap, bl); + decode(num_objects_hit_set_archive, bl); + decode(num_objects_misplaced, bl); + decode(num_bytes_hit_set_archive, bl); + decode(num_flush, bl); + decode(num_flush_kb, bl); + decode(num_evict, bl); + decode(num_evict_kb, bl); + decode(num_promote, bl); + decode(num_flush_mode_high, bl); + decode(num_flush_mode_low, bl); + decode(num_evict_mode_some, bl); + decode(num_evict_mode_full, bl); + decode(num_objects_pinned, bl); + decode(num_objects_missing, bl); + if (struct_v >= 16) { + decode(num_legacy_snapsets, bl); + } else { + num_legacy_snapsets = num_object_clones; // upper bound + } + if (struct_v >= 17) { + decode(num_large_omap_objects, bl); + } + if (struct_v >= 18) { + decode(num_objects_manifest, bl); + } + if (struct_v >= 19) { + decode(num_omap_bytes, bl); + decode(num_omap_keys, bl); + } + if (struct_v >= 20) { + decode(num_objects_repaired, bl); + } + } + DECODE_FINISH(bl); +} + +void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o) +{ + object_stat_sum_t a; + + a.num_bytes = 1; + a.num_objects = 3; + a.num_object_clones = 4; + a.num_object_copies = 5; + a.num_objects_missing_on_primary = 6; + a.num_objects_missing = 123; + a.num_objects_degraded = 7; + a.num_objects_unfound = 8; + a.num_rd = 9; a.num_rd_kb = 10; + a.num_wr = 11; a.num_wr_kb = 12; + a.num_objects_recovered = 14; + a.num_bytes_recovered = 15; + a.num_keys_recovered = 16; + a.num_deep_scrub_errors = 17; + a.num_shallow_scrub_errors = 18; + a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors; + 
a.num_objects_dirty = 21; + a.num_whiteouts = 22; + a.num_objects_misplaced = 1232; + a.num_objects_hit_set_archive = 2; + a.num_bytes_hit_set_archive = 27; + a.num_flush = 5; + a.num_flush_kb = 6; + a.num_evict = 7; + a.num_evict_kb = 8; + a.num_promote = 9; + a.num_flush_mode_high = 0; + a.num_flush_mode_low = 1; + a.num_evict_mode_some = 1; + a.num_evict_mode_full = 0; + a.num_objects_pinned = 20; + a.num_large_omap_objects = 5; + a.num_objects_manifest = 2; + a.num_omap_bytes = 20000; + a.num_omap_keys = 200; + a.num_objects_repaired = 300; + o.push_back(new object_stat_sum_t(a)); +} + +void object_stat_sum_t::add(const object_stat_sum_t& o) +{ + num_bytes += o.num_bytes; + num_objects += o.num_objects; + num_object_clones += o.num_object_clones; + num_object_copies += o.num_object_copies; + num_objects_missing_on_primary += o.num_objects_missing_on_primary; + num_objects_missing += o.num_objects_missing; + num_objects_degraded += o.num_objects_degraded; + num_objects_misplaced += o.num_objects_misplaced; + num_rd += o.num_rd; + num_rd_kb += o.num_rd_kb; + num_wr += o.num_wr; + num_wr_kb += o.num_wr_kb; + num_objects_unfound += o.num_objects_unfound; + num_scrub_errors += o.num_scrub_errors; + num_shallow_scrub_errors += o.num_shallow_scrub_errors; + num_deep_scrub_errors += o.num_deep_scrub_errors; + num_objects_recovered += o.num_objects_recovered; + num_bytes_recovered += o.num_bytes_recovered; + num_keys_recovered += o.num_keys_recovered; + num_objects_dirty += o.num_objects_dirty; + num_whiteouts += o.num_whiteouts; + num_objects_omap += o.num_objects_omap; + num_objects_hit_set_archive += o.num_objects_hit_set_archive; + num_bytes_hit_set_archive += o.num_bytes_hit_set_archive; + num_flush += o.num_flush; + num_flush_kb += o.num_flush_kb; + num_evict += o.num_evict; + num_evict_kb += o.num_evict_kb; + num_promote += o.num_promote; + num_flush_mode_high += o.num_flush_mode_high; + num_flush_mode_low += o.num_flush_mode_low; + num_evict_mode_some += o.num_evict_mode_some; + num_evict_mode_full += o.num_evict_mode_full; + num_objects_pinned += o.num_objects_pinned; + num_legacy_snapsets += o.num_legacy_snapsets; + num_large_omap_objects += o.num_large_omap_objects; + num_objects_manifest += o.num_objects_manifest; + num_omap_bytes += o.num_omap_bytes; + num_omap_keys += o.num_omap_keys; + num_objects_repaired += o.num_objects_repaired; +} + +void object_stat_sum_t::sub(const object_stat_sum_t& o) +{ + num_bytes -= o.num_bytes; + num_objects -= o.num_objects; + num_object_clones -= o.num_object_clones; + num_object_copies -= o.num_object_copies; + num_objects_missing_on_primary -= o.num_objects_missing_on_primary; + num_objects_missing -= o.num_objects_missing; + num_objects_degraded -= o.num_objects_degraded; + num_objects_misplaced -= o.num_objects_misplaced; + num_rd -= o.num_rd; + num_rd_kb -= o.num_rd_kb; + num_wr -= o.num_wr; + num_wr_kb -= o.num_wr_kb; + num_objects_unfound -= o.num_objects_unfound; + num_scrub_errors -= o.num_scrub_errors; + num_shallow_scrub_errors -= o.num_shallow_scrub_errors; + num_deep_scrub_errors -= o.num_deep_scrub_errors; + num_objects_recovered -= o.num_objects_recovered; + num_bytes_recovered -= o.num_bytes_recovered; + num_keys_recovered -= o.num_keys_recovered; + num_objects_dirty -= o.num_objects_dirty; + num_whiteouts -= o.num_whiteouts; + num_objects_omap -= o.num_objects_omap; + num_objects_hit_set_archive -= o.num_objects_hit_set_archive; + num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive; + num_flush -= o.num_flush; + num_flush_kb 
-= o.num_flush_kb; + num_evict -= o.num_evict; + num_evict_kb -= o.num_evict_kb; + num_promote -= o.num_promote; + num_flush_mode_high -= o.num_flush_mode_high; + num_flush_mode_low -= o.num_flush_mode_low; + num_evict_mode_some -= o.num_evict_mode_some; + num_evict_mode_full -= o.num_evict_mode_full; + num_objects_pinned -= o.num_objects_pinned; + num_legacy_snapsets -= o.num_legacy_snapsets; + num_large_omap_objects -= o.num_large_omap_objects; + num_objects_manifest -= o.num_objects_manifest; + num_omap_bytes -= o.num_omap_bytes; + num_omap_keys -= o.num_omap_keys; + num_objects_repaired -= o.num_objects_repaired; +} + +bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r) +{ + return + l.num_bytes == r.num_bytes && + l.num_objects == r.num_objects && + l.num_object_clones == r.num_object_clones && + l.num_object_copies == r.num_object_copies && + l.num_objects_missing_on_primary == r.num_objects_missing_on_primary && + l.num_objects_missing == r.num_objects_missing && + l.num_objects_degraded == r.num_objects_degraded && + l.num_objects_misplaced == r.num_objects_misplaced && + l.num_objects_unfound == r.num_objects_unfound && + l.num_rd == r.num_rd && + l.num_rd_kb == r.num_rd_kb && + l.num_wr == r.num_wr && + l.num_wr_kb == r.num_wr_kb && + l.num_scrub_errors == r.num_scrub_errors && + l.num_shallow_scrub_errors == r.num_shallow_scrub_errors && + l.num_deep_scrub_errors == r.num_deep_scrub_errors && + l.num_objects_recovered == r.num_objects_recovered && + l.num_bytes_recovered == r.num_bytes_recovered && + l.num_keys_recovered == r.num_keys_recovered && + l.num_objects_dirty == r.num_objects_dirty && + l.num_whiteouts == r.num_whiteouts && + l.num_objects_omap == r.num_objects_omap && + l.num_objects_hit_set_archive == r.num_objects_hit_set_archive && + l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive && + l.num_flush == r.num_flush && + l.num_flush_kb == r.num_flush_kb && + l.num_evict == r.num_evict && + l.num_evict_kb == r.num_evict_kb && + l.num_promote == r.num_promote && + l.num_flush_mode_high == r.num_flush_mode_high && + l.num_flush_mode_low == r.num_flush_mode_low && + l.num_evict_mode_some == r.num_evict_mode_some && + l.num_evict_mode_full == r.num_evict_mode_full && + l.num_objects_pinned == r.num_objects_pinned && + l.num_legacy_snapsets == r.num_legacy_snapsets && + l.num_large_omap_objects == r.num_large_omap_objects && + l.num_objects_manifest == r.num_objects_manifest && + l.num_omap_bytes == r.num_omap_bytes && + l.num_omap_keys == r.num_omap_keys && + l.num_objects_repaired == r.num_objects_repaired; +} + +// -- object_stat_collection_t -- + +void object_stat_collection_t::dump(Formatter *f) const +{ + f->open_object_section("stat_sum"); + sum.dump(f); + f->close_section(); +} + +void object_stat_collection_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(sum, bl); + encode((__u32)0, bl); + ENCODE_FINISH(bl); +} + +void object_stat_collection_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(sum, bl); + { + map<string,object_stat_sum_t> cat_sum; + decode(cat_sum, bl); + } + DECODE_FINISH(bl); +} + +void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o) +{ + object_stat_collection_t a; + o.push_back(new object_stat_collection_t(a)); + list<object_stat_sum_t*> l; + object_stat_sum_t::generate_test_instances(l); + for (list<object_stat_sum_t*>::iterator p = l.begin(); p != l.end(); ++p) { + a.add(**p); + o.push_back(new 
object_stat_collection_t(a)); + } +} + + +// -- pg_stat_t -- + +bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const +{ + if (primary && osd == acting_primary) { + return true; + } else if (!primary) { + for(vector<int32_t>::const_iterator it = acting.begin(); + it != acting.end(); ++it) + { + if (*it == osd) + return true; + } + } + return false; +} + +void pg_stat_t::dump(Formatter *f) const +{ + f->dump_stream("version") << version; + f->dump_stream("reported_seq") << reported_seq; + f->dump_stream("reported_epoch") << reported_epoch; + f->dump_string("state", pg_state_string(state)); + f->dump_stream("last_fresh") << last_fresh; + f->dump_stream("last_change") << last_change; + f->dump_stream("last_active") << last_active; + f->dump_stream("last_peered") << last_peered; + f->dump_stream("last_clean") << last_clean; + f->dump_stream("last_became_active") << last_became_active; + f->dump_stream("last_became_peered") << last_became_peered; + f->dump_stream("last_unstale") << last_unstale; + f->dump_stream("last_undegraded") << last_undegraded; + f->dump_stream("last_fullsized") << last_fullsized; + f->dump_unsigned("mapping_epoch", mapping_epoch); + f->dump_stream("log_start") << log_start; + f->dump_stream("ondisk_log_start") << ondisk_log_start; + f->dump_unsigned("created", created); + f->dump_unsigned("last_epoch_clean", last_epoch_clean); + f->dump_stream("parent") << parent; + f->dump_unsigned("parent_split_bits", parent_split_bits); + f->dump_stream("last_scrub") << last_scrub; + f->dump_stream("last_scrub_stamp") << last_scrub_stamp; + f->dump_stream("last_deep_scrub") << last_deep_scrub; + f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp; + f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp; + f->dump_int("log_size", log_size); + f->dump_int("ondisk_log_size", ondisk_log_size); + f->dump_bool("stats_invalid", stats_invalid); + f->dump_bool("dirty_stats_invalid", dirty_stats_invalid); + f->dump_bool("omap_stats_invalid", omap_stats_invalid); + f->dump_bool("hitset_stats_invalid", hitset_stats_invalid); + f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid); + f->dump_bool("pin_stats_invalid", pin_stats_invalid); + f->dump_bool("manifest_stats_invalid", manifest_stats_invalid); + f->dump_unsigned("snaptrimq_len", snaptrimq_len); + stats.dump(f); + f->open_array_section("up"); + for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("avail_no_missing"); + for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p) + f->dump_stream("shard") << *p; + f->close_section(); + f->open_array_section("object_location_counts"); + for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) { + f->open_object_section("entry"); + f->dump_stream("shards") << p->first; + f->dump_int("objects", p->second); + f->close_section(); + } + f->close_section(); + f->open_array_section("blocked_by"); + for (vector<int32_t>::const_iterator p = blocked_by.begin(); + p != blocked_by.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("up_primary", up_primary); + f->dump_int("acting_primary", acting_primary); + f->open_array_section("purged_snaps"); + for (interval_set<snapid_t>::const_iterator i = purged_snaps.begin(); + i != 
purged_snaps.end(); + ++i) { + f->open_object_section("interval"); + f->dump_stream("start") << i.get_start(); + f->dump_stream("length") << i.get_len(); + f->close_section(); + } + f->close_section(); +} + +void pg_stat_t::dump_brief(Formatter *f) const +{ + f->dump_string("state", pg_state_string(state)); + f->open_array_section("up"); + for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("up_primary", up_primary); + f->dump_int("acting_primary", acting_primary); +} + +void pg_stat_t::encode(bufferlist &bl) const +{ + ENCODE_START(26, 22, bl); + encode(version, bl); + encode(reported_seq, bl); + encode(reported_epoch, bl); + encode((__u32)state, bl); // for older peers + encode(log_start, bl); + encode(ondisk_log_start, bl); + encode(created, bl); + encode(last_epoch_clean, bl); + encode(parent, bl); + encode(parent_split_bits, bl); + encode(last_scrub, bl); + encode(last_scrub_stamp, bl); + encode(stats, bl); + encode(log_size, bl); + encode(ondisk_log_size, bl); + encode(up, bl); + encode(acting, bl); + encode(last_fresh, bl); + encode(last_change, bl); + encode(last_active, bl); + encode(last_clean, bl); + encode(last_unstale, bl); + encode(mapping_epoch, bl); + encode(last_deep_scrub, bl); + encode(last_deep_scrub_stamp, bl); + encode(stats_invalid, bl); + encode(last_clean_scrub_stamp, bl); + encode(last_became_active, bl); + encode(dirty_stats_invalid, bl); + encode(up_primary, bl); + encode(acting_primary, bl); + encode(omap_stats_invalid, bl); + encode(hitset_stats_invalid, bl); + encode(blocked_by, bl); + encode(last_undegraded, bl); + encode(last_fullsized, bl); + encode(hitset_bytes_stats_invalid, bl); + encode(last_peered, bl); + encode(last_became_peered, bl); + encode(pin_stats_invalid, bl); + encode(snaptrimq_len, bl); + __u32 top_state = (state >> 32); + encode(top_state, bl); + encode(purged_snaps, bl); + encode(manifest_stats_invalid, bl); + encode(avail_no_missing, bl); + encode(object_location_counts, bl); + ENCODE_FINISH(bl); +} + +void pg_stat_t::decode(bufferlist::const_iterator &bl) +{ + bool tmp; + uint32_t old_state; + DECODE_START(26, bl); + decode(version, bl); + decode(reported_seq, bl); + decode(reported_epoch, bl); + decode(old_state, bl); + decode(log_start, bl); + decode(ondisk_log_start, bl); + decode(created, bl); + decode(last_epoch_clean, bl); + decode(parent, bl); + decode(parent_split_bits, bl); + decode(last_scrub, bl); + decode(last_scrub_stamp, bl); + decode(stats, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + decode(up, bl); + decode(acting, bl); + decode(last_fresh, bl); + decode(last_change, bl); + decode(last_active, bl); + decode(last_clean, bl); + decode(last_unstale, bl); + decode(mapping_epoch, bl); + decode(last_deep_scrub, bl); + decode(last_deep_scrub_stamp, bl); + decode(tmp, bl); + stats_invalid = tmp; + decode(last_clean_scrub_stamp, bl); + decode(last_became_active, bl); + decode(tmp, bl); + dirty_stats_invalid = tmp; + decode(up_primary, bl); + decode(acting_primary, bl); + decode(tmp, bl); + omap_stats_invalid = tmp; + decode(tmp, bl); + hitset_stats_invalid = tmp; + decode(blocked_by, bl); + decode(last_undegraded, bl); + decode(last_fullsized, bl); + decode(tmp, bl); + hitset_bytes_stats_invalid = tmp; + decode(last_peered, bl); + decode(last_became_peered, bl); + 
decode(tmp, bl); + pin_stats_invalid = tmp; + if (struct_v >= 23) { + decode(snaptrimq_len, bl); + if (struct_v >= 24) { + __u32 top_state; + decode(top_state, bl); + state = (uint64_t)old_state | ((uint64_t)top_state << 32); + decode(purged_snaps, bl); + } else { + state = old_state; + } + if (struct_v >= 25) { + decode(tmp, bl); + manifest_stats_invalid = tmp; + } else { + manifest_stats_invalid = true; + } + if (struct_v >= 26) { + decode(avail_no_missing, bl); + decode(object_location_counts, bl); + } + } + DECODE_FINISH(bl); +} + +void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o) +{ + pg_stat_t a; + o.push_back(new pg_stat_t(a)); + + a.version = eversion_t(1, 3); + a.reported_epoch = 1; + a.reported_seq = 2; + a.state = 123; + a.mapping_epoch = 998; + a.last_fresh = utime_t(1002, 1); + a.last_change = utime_t(1002, 2); + a.last_active = utime_t(1002, 3); + a.last_clean = utime_t(1002, 4); + a.last_unstale = utime_t(1002, 5); + a.last_undegraded = utime_t(1002, 7); + a.last_fullsized = utime_t(1002, 8); + a.log_start = eversion_t(1, 4); + a.ondisk_log_start = eversion_t(1, 5); + a.created = 6; + a.last_epoch_clean = 7; + a.parent = pg_t(1, 2); + a.parent_split_bits = 12; + a.last_scrub = eversion_t(9, 10); + a.last_scrub_stamp = utime_t(11, 12); + a.last_deep_scrub = eversion_t(13, 14); + a.last_deep_scrub_stamp = utime_t(15, 16); + a.last_clean_scrub_stamp = utime_t(17, 18); + a.snaptrimq_len = 1048576; + list<object_stat_collection_t*> l; + object_stat_collection_t::generate_test_instances(l); + a.stats = *l.back(); + a.log_size = 99; + a.ondisk_log_size = 88; + a.up.push_back(123); + a.up_primary = 123; + a.acting.push_back(456); + a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD)); + set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) }; + a.object_location_counts.insert(make_pair(sset, 10)); + sset.insert(pg_shard_t(2)); + a.object_location_counts.insert(make_pair(sset, 5)); + a.acting_primary = 456; + o.push_back(new pg_stat_t(a)); + + a.up.push_back(124); + a.up_primary = 124; + a.acting.push_back(124); + a.acting_primary = 124; + a.blocked_by.push_back(155); + a.blocked_by.push_back(156); + o.push_back(new pg_stat_t(a)); +} + +bool operator==(const pg_stat_t& l, const pg_stat_t& r) +{ + return + l.version == r.version && + l.reported_seq == r.reported_seq && + l.reported_epoch == r.reported_epoch && + l.state == r.state && + l.last_fresh == r.last_fresh && + l.last_change == r.last_change && + l.last_active == r.last_active && + l.last_peered == r.last_peered && + l.last_clean == r.last_clean && + l.last_unstale == r.last_unstale && + l.last_undegraded == r.last_undegraded && + l.last_fullsized == r.last_fullsized && + l.log_start == r.log_start && + l.ondisk_log_start == r.ondisk_log_start && + l.created == r.created && + l.last_epoch_clean == r.last_epoch_clean && + l.parent == r.parent && + l.parent_split_bits == r.parent_split_bits && + l.last_scrub == r.last_scrub && + l.last_deep_scrub == r.last_deep_scrub && + l.last_scrub_stamp == r.last_scrub_stamp && + l.last_deep_scrub_stamp == r.last_deep_scrub_stamp && + l.last_clean_scrub_stamp == r.last_clean_scrub_stamp && + l.stats == r.stats && + l.stats_invalid == r.stats_invalid && + l.log_size == r.log_size && + l.ondisk_log_size == r.ondisk_log_size && + l.up == r.up && + l.acting == r.acting && + l.avail_no_missing == r.avail_no_missing && + l.object_location_counts == r.object_location_counts && + l.mapping_epoch == r.mapping_epoch && + l.blocked_by == r.blocked_by && + l.last_became_active 
== r.last_became_active && + l.last_became_peered == r.last_became_peered && + l.dirty_stats_invalid == r.dirty_stats_invalid && + l.omap_stats_invalid == r.omap_stats_invalid && + l.hitset_stats_invalid == r.hitset_stats_invalid && + l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid && + l.up_primary == r.up_primary && + l.acting_primary == r.acting_primary && + l.pin_stats_invalid == r.pin_stats_invalid && + l.manifest_stats_invalid == r.manifest_stats_invalid && + l.purged_snaps == r.purged_snaps && + l.snaptrimq_len == r.snaptrimq_len; +} + +// -- store_statfs_t -- + +bool store_statfs_t::operator==(const store_statfs_t& other) const +{ + return total == other.total + && available == other.available + && allocated == other.allocated + && internally_reserved == other.internally_reserved + && data_stored == other.data_stored + && data_compressed == other.data_compressed + && data_compressed_allocated == other.data_compressed_allocated + && data_compressed_original == other.data_compressed_original + && omap_allocated == other.omap_allocated + && internal_metadata == other.internal_metadata; +} + +void store_statfs_t::dump(Formatter *f) const +{ + f->dump_int("total", total); + f->dump_int("available", available); + f->dump_int("internally_reserved", internally_reserved); + f->dump_int("allocated", allocated); + f->dump_int("data_stored", data_stored); + f->dump_int("data_compressed", data_compressed); + f->dump_int("data_compressed_allocated", data_compressed_allocated); + f->dump_int("data_compressed_original", data_compressed_original); + f->dump_int("omap_allocated", omap_allocated); + f->dump_int("internal_metadata", internal_metadata); +} + +ostream& operator<<(ostream& out, const store_statfs_t &s) +{ + out << std::hex + << "store_statfs(0x" << s.available + << "/0x" << s.internally_reserved + << "/0x" << s.total + << ", data 0x" << s.data_stored + << "/0x" << s.allocated + << ", compress 0x" << s.data_compressed + << "/0x" << s.data_compressed_allocated + << "/0x" << s.data_compressed_original + << ", omap 0x" << s.omap_allocated + << ", meta 0x" << s.internal_metadata + << std::dec + << ")"; + return out; +} + +void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o) +{ + store_statfs_t a; + o.push_back(new store_statfs_t(a)); + a.total = 234; + a.available = 123; + a.internally_reserved = 33; + a.allocated = 32; + a.data_stored = 44; + a.data_compressed = 21; + a.data_compressed_allocated = 12; + a.data_compressed_original = 13; + a.omap_allocated = 14; + a.internal_metadata = 15; + o.push_back(new store_statfs_t(a)); +} + +// -- pool_stat_t -- + +void pool_stat_t::dump(Formatter *f) const +{ + stats.dump(f); + f->open_object_section("store_stats"); + store_stats.dump(f); + f->close_section(); + f->dump_int("log_size", log_size); + f->dump_int("ondisk_log_size", ondisk_log_size); + f->dump_int("up", up); + f->dump_int("acting", acting); + f->dump_int("num_store_stats", num_store_stats); +} + +void pool_stat_t::encode(bufferlist &bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDENC) == 0) { + __u8 v = 4; + encode(v, bl); + encode(stats, bl); + encode(log_size, bl); + encode(ondisk_log_size, bl); + return; + } + + ENCODE_START(7, 5, bl); + encode(stats, bl); + encode(log_size, bl); + encode(ondisk_log_size, bl); + encode(up, bl); + encode(acting, bl); + encode(store_stats, bl); + encode(num_store_stats, bl); + ENCODE_FINISH(bl); +} + +void pool_stat_t::decode(bufferlist::const_iterator &bl) +{ + 
DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); + if (struct_v >= 4) { + decode(stats, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + if (struct_v >= 6) { + decode(up, bl); + decode(acting, bl); + } else { + up = 0; + acting = 0; + } + if (struct_v >= 7) { + decode(store_stats, bl); + decode(num_store_stats, bl); + } else { + store_stats.reset(); + num_store_stats = 0; + } + + } else { + decode(stats.sum.num_bytes, bl); + uint64_t num_kb; + decode(num_kb, bl); + decode(stats.sum.num_objects, bl); + decode(stats.sum.num_object_clones, bl); + decode(stats.sum.num_object_copies, bl); + decode(stats.sum.num_objects_missing_on_primary, bl); + decode(stats.sum.num_objects_degraded, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + if (struct_v >= 2) { + decode(stats.sum.num_rd, bl); + decode(stats.sum.num_rd_kb, bl); + decode(stats.sum.num_wr, bl); + decode(stats.sum.num_wr_kb, bl); + } + if (struct_v >= 3) { + decode(stats.sum.num_objects_unfound, bl); + } + } + DECODE_FINISH(bl); +} + +void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o) +{ + pool_stat_t a; + o.push_back(new pool_stat_t(a)); + + list<object_stat_collection_t*> l; + object_stat_collection_t::generate_test_instances(l); + list<store_statfs_t*> ll; + store_statfs_t::generate_test_instances(ll); + a.stats = *l.back(); + a.store_stats = *ll.back(); + a.log_size = 123; + a.ondisk_log_size = 456; + a.acting = 3; + a.up = 4; + a.num_store_stats = 1; + o.push_back(new pool_stat_t(a)); +} + + +// -- pg_history_t -- + +void pg_history_t::encode(bufferlist &bl) const +{ + ENCODE_START(9, 4, bl); + encode(epoch_created, bl); + encode(last_epoch_started, bl); + encode(last_epoch_clean, bl); + encode(last_epoch_split, bl); + encode(same_interval_since, bl); + encode(same_up_since, bl); + encode(same_primary_since, bl); + encode(last_scrub, bl); + encode(last_scrub_stamp, bl); + encode(last_deep_scrub, bl); + encode(last_deep_scrub_stamp, bl); + encode(last_clean_scrub_stamp, bl); + encode(last_epoch_marked_full, bl); + encode(last_interval_started, bl); + encode(last_interval_clean, bl); + encode(epoch_pool_created, bl); + ENCODE_FINISH(bl); +} + +void pg_history_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl); + decode(epoch_created, bl); + decode(last_epoch_started, bl); + if (struct_v >= 3) + decode(last_epoch_clean, bl); + else + last_epoch_clean = last_epoch_started; // careful, it's a lie! 
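+    // ("a lie" because pre-v3 encodings never carried last_epoch_clean at
+    //  all, so last_epoch_started is the closest available stand-in and may
+    //  overstate how recently the PG was actually clean)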
+ decode(last_epoch_split, bl); + decode(same_interval_since, bl); + decode(same_up_since, bl); + decode(same_primary_since, bl); + if (struct_v >= 2) { + decode(last_scrub, bl); + decode(last_scrub_stamp, bl); + } + if (struct_v >= 5) { + decode(last_deep_scrub, bl); + decode(last_deep_scrub_stamp, bl); + } + if (struct_v >= 6) { + decode(last_clean_scrub_stamp, bl); + } + if (struct_v >= 7) { + decode(last_epoch_marked_full, bl); + } + if (struct_v >= 8) { + decode(last_interval_started, bl); + decode(last_interval_clean, bl); + } else { + if (last_epoch_started >= same_interval_since) { + last_interval_started = same_interval_since; + } else { + last_interval_started = last_epoch_started; // best guess + } + if (last_epoch_clean >= same_interval_since) { + last_interval_clean = same_interval_since; + } else { + last_interval_clean = last_epoch_clean; // best guess + } + } + if (struct_v >= 9) { + decode(epoch_pool_created, bl); + } else { + epoch_pool_created = epoch_created; + } + DECODE_FINISH(bl); +} + +void pg_history_t::dump(Formatter *f) const +{ + f->dump_int("epoch_created", epoch_created); + f->dump_int("epoch_pool_created", epoch_pool_created); + f->dump_int("last_epoch_started", last_epoch_started); + f->dump_int("last_interval_started", last_interval_started); + f->dump_int("last_epoch_clean", last_epoch_clean); + f->dump_int("last_interval_clean", last_interval_clean); + f->dump_int("last_epoch_split", last_epoch_split); + f->dump_int("last_epoch_marked_full", last_epoch_marked_full); + f->dump_int("same_up_since", same_up_since); + f->dump_int("same_interval_since", same_interval_since); + f->dump_int("same_primary_since", same_primary_since); + f->dump_stream("last_scrub") << last_scrub; + f->dump_stream("last_scrub_stamp") << last_scrub_stamp; + f->dump_stream("last_deep_scrub") << last_deep_scrub; + f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp; + f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp; +} + +void pg_history_t::generate_test_instances(list<pg_history_t*>& o) +{ + o.push_back(new pg_history_t); + o.push_back(new pg_history_t); + o.back()->epoch_created = 1; + o.back()->epoch_pool_created = 1; + o.back()->last_epoch_started = 2; + o.back()->last_interval_started = 2; + o.back()->last_epoch_clean = 3; + o.back()->last_interval_clean = 2; + o.back()->last_epoch_split = 4; + o.back()->same_up_since = 5; + o.back()->same_interval_since = 6; + o.back()->same_primary_since = 7; + o.back()->last_scrub = eversion_t(8, 9); + o.back()->last_scrub_stamp = utime_t(10, 11); + o.back()->last_deep_scrub = eversion_t(12, 13); + o.back()->last_deep_scrub_stamp = utime_t(14, 15); + o.back()->last_clean_scrub_stamp = utime_t(16, 17); + o.back()->last_epoch_marked_full = 18; +} + + +// -- pg_info_t -- + +void pg_info_t::encode(bufferlist &bl) const +{ + ENCODE_START(32, 26, bl); + encode(pgid.pgid, bl); + encode(last_update, bl); + encode(last_complete, bl); + encode(log_tail, bl); + if (last_backfill_bitwise && !last_backfill.is_max()) { + encode(hobject_t(), bl); + } else { + encode(last_backfill, bl); + } + encode(stats, bl); + history.encode(bl); + encode(purged_snaps, bl); + encode(last_epoch_started, bl); + encode(last_user_version, bl); + encode(hit_set, bl); + encode(pgid.shard, bl); + encode(last_backfill, bl); + encode(last_backfill_bitwise, bl); + encode(last_interval_started, bl); + ENCODE_FINISH(bl); +} + +void pg_info_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(32, bl); + decode(pgid.pgid, bl); + decode(last_update, 
bl); + decode(last_complete, bl); + decode(log_tail, bl); + { + hobject_t old_last_backfill; + decode(old_last_backfill, bl); + } + decode(stats, bl); + history.decode(bl); + decode(purged_snaps, bl); + decode(last_epoch_started, bl); + decode(last_user_version, bl); + decode(hit_set, bl); + decode(pgid.shard, bl); + decode(last_backfill, bl); + decode(last_backfill_bitwise, bl); + if (struct_v >= 32) { + decode(last_interval_started, bl); + } else { + last_interval_started = last_epoch_started; + } + DECODE_FINISH(bl); +} + +// -- pg_info_t -- + +void pg_info_t::dump(Formatter *f) const +{ + f->dump_stream("pgid") << pgid; + f->dump_stream("last_update") << last_update; + f->dump_stream("last_complete") << last_complete; + f->dump_stream("log_tail") << log_tail; + f->dump_int("last_user_version", last_user_version); + f->dump_stream("last_backfill") << last_backfill; + f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise); + f->open_array_section("purged_snaps"); + for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin(); + i != purged_snaps.end(); + ++i) { + f->open_object_section("purged_snap_interval"); + f->dump_stream("start") << i.get_start(); + f->dump_stream("length") << i.get_len(); + f->close_section(); + } + f->close_section(); + f->open_object_section("history"); + history.dump(f); + f->close_section(); + f->open_object_section("stats"); + stats.dump(f); + f->close_section(); + + f->dump_int("empty", is_empty()); + f->dump_int("dne", dne()); + f->dump_int("incomplete", is_incomplete()); + f->dump_int("last_epoch_started", last_epoch_started); + + f->open_object_section("hit_set_history"); + hit_set.dump(f); + f->close_section(); +} + +void pg_info_t::generate_test_instances(list<pg_info_t*>& o) +{ + o.push_back(new pg_info_t); + o.push_back(new pg_info_t); + list<pg_history_t*> h; + pg_history_t::generate_test_instances(h); + o.back()->history = *h.back(); + o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD); + o.back()->last_update = eversion_t(3, 4); + o.back()->last_complete = eversion_t(5, 6); + o.back()->last_user_version = 2; + o.back()->log_tail = eversion_t(7, 8); + o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, ""); + o.back()->last_backfill_bitwise = true; + { + list<pg_stat_t*> s; + pg_stat_t::generate_test_instances(s); + o.back()->stats = *s.back(); + } + { + list<pg_hit_set_history_t*> s; + pg_hit_set_history_t::generate_test_instances(s); + o.back()->hit_set = *s.back(); + } +} + +// -- pg_notify_t -- +void pg_notify_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + encode(query_epoch, bl); + encode(epoch_sent, bl); + encode(info, bl); + encode(to, bl); + encode(from, bl); + ENCODE_FINISH(bl); +} + +void pg_notify_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(2, bl); + decode(query_epoch, bl); + decode(epoch_sent, bl); + decode(info, bl); + decode(to, bl); + decode(from, bl); + DECODE_FINISH(bl); +} + +void pg_notify_t::dump(Formatter *f) const +{ + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_unsigned("query_epoch", query_epoch); + f->dump_unsigned("epoch_sent", epoch_sent); + { + f->open_object_section("info"); + info.dump(f); + f->close_section(); + } +} + +void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o) +{ + o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, pg_info_t())); + o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t())); +} + +ostream &operator<<(ostream &lhs, const pg_notify_t ¬ify) +{ + 
lhs << "(query:" << notify.query_epoch + << " sent:" << notify.epoch_sent + << " " << notify.info; + if (notify.from != shard_id_t::NO_SHARD || + notify.to != shard_id_t::NO_SHARD) + lhs << " " << (unsigned)notify.from + << "->" << (unsigned)notify.to; + return lhs << ")"; +} + +// -- pg_interval_t -- + +void PastIntervals::pg_interval_t::encode(bufferlist& bl) const +{ + ENCODE_START(4, 2, bl); + encode(first, bl); + encode(last, bl); + encode(up, bl); + encode(acting, bl); + encode(maybe_went_rw, bl); + encode(primary, bl); + encode(up_primary, bl); + ENCODE_FINISH(bl); +} + +void PastIntervals::pg_interval_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl); + decode(first, bl); + decode(last, bl); + decode(up, bl); + decode(acting, bl); + decode(maybe_went_rw, bl); + if (struct_v >= 3) { + decode(primary, bl); + } else { + if (acting.size()) + primary = acting[0]; + } + if (struct_v >= 4) { + decode(up_primary, bl); + } else { + if (up.size()) + up_primary = up[0]; + } + DECODE_FINISH(bl); +} + +void PastIntervals::pg_interval_t::dump(Formatter *f) const +{ + f->dump_unsigned("first", first); + f->dump_unsigned("last", last); + f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0); + f->open_array_section("up"); + for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("primary", primary); + f->dump_int("up_primary", up_primary); +} + +void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o) +{ + o.push_back(new pg_interval_t); + o.push_back(new pg_interval_t); + o.back()->up.push_back(1); + o.back()->acting.push_back(2); + o.back()->acting.push_back(3); + o.back()->first = 4; + o.back()->last = 5; + o.back()->maybe_went_rw = true; +} + +WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t) + + +/** + * pi_compact_rep + * + * PastIntervals only needs to be able to answer two questions: + * 1) Where should the primary look for unfound objects? + * 2) List a set of subsets of the OSDs such that contacting at least + * one from each subset guarantees we speak to at least one witness + * of any completed write. + * + * Crucially, 2) does not require keeping *all* past intervals. Certainly, + * we don't need to keep any where maybe_went_rw would be false. We also + * needn't keep two intervals where the actingset in one is a subset + * of the other (only need to keep the smaller of the two sets). In order + * to accurately trim the set of intervals as last_epoch_started changes + * without rebuilding the set from scratch, we'll retain the larger set + * if it in an older interval. 
+ */ +struct compact_interval_t { + epoch_t first; + epoch_t last; + set<pg_shard_t> acting; + bool supersedes(const compact_interval_t &other) { + for (auto &&i: acting) { + if (!other.acting.count(i)) + return false; + } + return true; + } + void dump(Formatter *f) const { + f->open_object_section("compact_interval_t"); + f->dump_stream("first") << first; + f->dump_stream("last") << last; + f->dump_stream("acting") << acting; + f->close_section(); + } + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(last, bl); + encode(acting, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &bl) { + DECODE_START(1, bl); + decode(first, bl); + decode(last, bl); + decode(acting, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list<compact_interval_t*> & o) { + /* Not going to be used, we'll generate pi_compact_rep directly */ + } +}; +ostream &operator<<(ostream &o, const compact_interval_t &rhs) +{ + return o << "([" << rhs.first << "," << rhs.last + << "] acting " << rhs.acting << ")"; +} +WRITE_CLASS_ENCODER(compact_interval_t) + +class pi_compact_rep : public PastIntervals::interval_rep { + epoch_t first = 0; + epoch_t last = 0; // inclusive + set<pg_shard_t> all_participants; + list<compact_interval_t> intervals; + pi_compact_rep( + bool ec_pool, + std::list<PastIntervals::pg_interval_t> &&intervals) { + for (auto &&i: intervals) + add_interval(ec_pool, i); + } +public: + pi_compact_rep() = default; + pi_compact_rep(const pi_compact_rep &) = default; + pi_compact_rep(pi_compact_rep &&) = default; + pi_compact_rep &operator=(const pi_compact_rep &) = default; + pi_compact_rep &operator=(pi_compact_rep &&) = default; + + size_t size() const override { return intervals.size(); } + bool empty() const override { + return first > last || (first == 0 && last == 0); + } + void clear() override { + *this = pi_compact_rep(); + } + pair<epoch_t, epoch_t> get_bounds() const override { + return make_pair(first, last + 1); + } + void adjust_start_backwards(epoch_t last_epoch_clean) { + first = last_epoch_clean; + } + + set<pg_shard_t> get_all_participants( + bool ec_pool) const override { + return all_participants; + } + void add_interval( + bool ec_pool, const PastIntervals::pg_interval_t &interval) override { + if (first == 0) + first = interval.first; + ceph_assert(interval.last > last); + last = interval.last; + set<pg_shard_t> acting; + for (unsigned i = 0; i < interval.acting.size(); ++i) { + if (interval.acting[i] == CRUSH_ITEM_NONE) + continue; + acting.insert( + pg_shard_t( + interval.acting[i], + ec_pool ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + all_participants.insert(acting.begin(), acting.end()); + if (!interval.maybe_went_rw) + return; + intervals.push_back( + compact_interval_t{interval.first, interval.last, acting}); + auto plast = intervals.end(); + --plast; + for (auto cur = intervals.begin(); cur != plast; ) { + if (plast->supersedes(*cur)) { + intervals.erase(cur++); + } else { + ++cur; + } + } + } + unique_ptr<PastIntervals::interval_rep> clone() const override { + return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this)); + } + ostream &print(ostream &out) const override { + return out << "([" << first << "," << last + << "] intervals=" << intervals << ")"; + } + void encode(bufferlist &bl) const override { + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(last, bl); + encode(all_participants, bl); + encode(intervals, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &bl) override { + DECODE_START(1, bl); + decode(first, bl); + decode(last, bl); + decode(all_participants, bl); + decode(intervals, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const override { + f->open_object_section("PastIntervals::compact_rep"); + f->dump_stream("first") << first; + f->dump_stream("last") << last; + f->open_array_section("all_participants"); + for (auto& i : all_participants) { + f->dump_object("pg_shard", i); + } + f->close_section(); + f->open_array_section("intervals"); + for (auto &&i: intervals) { + i.dump(f); + } + f->close_section(); + f->close_section(); + } + static void generate_test_instances(list<pi_compact_rep*> &o) { + using ival = PastIntervals::pg_interval_t; + using ivallst = std::list<ival>; + o.push_back( + new pi_compact_rep( + true, ivallst + { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0} + , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1} + , ival{{ 2}, { 2}, 31, 35, false, 2, 2} + , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0} + })); + o.push_back( + new pi_compact_rep( + false, ivallst + { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0} + , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1} + , ival{{ 2}, { 2}, 31, 35, false, 2, 2} + , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0} + })); + o.push_back( + new pi_compact_rep( + true, ivallst + { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1} + , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0} + , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2} + , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0} + })); + } + void iterate_mayberw_back_to( + epoch_t les, + std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override { + for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) { + if (i->last < les) + break; + f(i->first, i->acting); + } + } + virtual ~pi_compact_rep() override {} +}; +WRITE_CLASS_ENCODER(pi_compact_rep) + +PastIntervals::PastIntervals() +{ + past_intervals.reset(new pi_compact_rep); +} + +PastIntervals::PastIntervals(const PastIntervals &rhs) + : past_intervals(rhs.past_intervals ? 
+ rhs.past_intervals->clone() : + nullptr) {} + +PastIntervals &PastIntervals::operator=(const PastIntervals &rhs) +{ + PastIntervals other(rhs); + swap(other); + return *this; +} + +ostream& operator<<(ostream& out, const PastIntervals &i) +{ + if (i.past_intervals) { + return i.past_intervals->print(out); + } else { + return out << "(empty)"; + } +} + +ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i) +{ + return out << "PriorSet(" + << "ec_pool: " << i.ec_pool + << ", probe: " << i.probe + << ", down: " << i.down + << ", blocked_by: " << i.blocked_by + << ", pg_down: " << i.pg_down + << ")"; +} + +void PastIntervals::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + __u8 type = 0; + decode(type, bl); + switch (type) { + case 0: + break; + case 1: + ceph_abort_msg("pi_simple_rep support removed post-luminous"); + break; + case 2: + past_intervals.reset(new pi_compact_rep); + past_intervals->decode(bl); + break; + } + DECODE_FINISH(bl); +} + +void PastIntervals::generate_test_instances(list<PastIntervals*> &o) +{ + { + list<pi_compact_rep *> compact; + pi_compact_rep::generate_test_instances(compact); + for (auto &&i: compact) { + // takes ownership of contents + o.push_back(new PastIntervals(i)); + } + } + return; +} + +bool PastIntervals::is_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector<int> &old_acting, + const vector<int> &new_acting, + int old_up_primary, + int new_up_primary, + const vector<int> &old_up, + const vector<int> &new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + unsigned old_pg_num, + unsigned new_pg_num, + unsigned old_pg_num_pending, + unsigned new_pg_num_pending, + bool old_sort_bitwise, + bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, + pg_t pgid) { + return old_acting_primary != new_acting_primary || + new_acting != old_acting || + old_up_primary != new_up_primary || + new_up != old_up || + old_min_size != new_min_size || + old_size != new_size || + pgid.is_split(old_pg_num, new_pg_num, 0) || + // (is or was) pre-merge source + pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) || + pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) || + // merge source + pgid.is_merge_source(old_pg_num, new_pg_num, 0) || + // (is or was) pre-merge target + pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) || + pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) || + // merge target + pgid.is_merge_target(old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise || + old_recovery_deletes != new_recovery_deletes; +} + +bool PastIntervals::is_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector<int> &old_acting, + const vector<int> &new_acting, + int old_up_primary, + int new_up_primary, + const vector<int> &old_up, + const vector<int> &new_up, + OSDMapRef osdmap, + OSDMapRef lastmap, + pg_t pgid) +{ + const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool()); + if (!plast) { + return false; // after pool is deleted there are no more interval changes + } + const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool()); + if (!pi) { + return true; // pool was deleted this epoch -> (final!) 
interval change + } + return + is_new_interval(old_acting_primary, + new_acting_primary, + old_acting, + new_acting, + old_up_primary, + new_up_primary, + old_up, + new_up, + plast->size, + pi->size, + plast->min_size, + pi->min_size, + plast->get_pg_num(), + pi->get_pg_num(), + plast->get_pg_num_pending(), + pi->get_pg_num_pending(), + lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE), + osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE), + lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), + osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), + pgid); +} + +bool PastIntervals::check_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector<int> &old_acting, + const vector<int> &new_acting, + int old_up_primary, + int new_up_primary, + const vector<int> &old_up, + const vector<int> &new_up, + epoch_t same_interval_since, + epoch_t last_epoch_clean, + OSDMapRef osdmap, + OSDMapRef lastmap, + pg_t pgid, + IsPGRecoverablePredicate *could_have_gone_active, + PastIntervals *past_intervals, + std::ostream *out) +{ + /* + * We have to be careful to gracefully deal with situations like + * so. Say we have a power outage or something that takes out both + * OSDs, but the monitor doesn't mark them down in the same epoch. + * The history may look like + * + * 1: A B + * 2: B + * 3: let's say B dies for good, too (say, from the power spike) + * 4: A + * + * which makes it look like B may have applied updates to the PG + * that we need in order to proceed. This sucks... + * + * To minimize the risk of this happening, we CANNOT go active if + * _any_ OSDs in the prior set are down until we send an MOSDAlive + * to the monitor such that the OSDMap sets osd_up_thru to an epoch. + * Then, we have something like + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: + * 4: A + * + * -> we can ignore B, bc it couldn't have gone active (up_thru still 0). + * + * or, + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: B up_thru[B]=2 + * 4: + * 5: A + * + * -> we must wait for B, bc it was alive through 2, and could have + * written to the pg. + * + * If B is really dead, then an administrator will need to manually + * intervene by marking the OSD as "lost." + */ + + // remember past interval + // NOTE: a change in the up set primary triggers an interval + // change, even though the interval members in the pg_interval_t + // do not change. 
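+ // The interval being recorded spans [same_interval_since, osdmap epoch - 1]
+ // and is appended to past_intervals only when is_new_interval() reports a
+ // change between lastmap and osdmap for this PG.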
+ ceph_assert(past_intervals); + ceph_assert(past_intervals->past_intervals); + if (is_new_interval( + old_acting_primary, + new_acting_primary, + old_acting, + new_acting, + old_up_primary, + new_up_primary, + old_up, + new_up, + osdmap, + lastmap, + pgid)) { + pg_interval_t i; + i.first = same_interval_since; + i.last = osdmap->get_epoch() - 1; + ceph_assert(i.first <= i.last); + i.acting = old_acting; + i.up = old_up; + i.primary = old_acting_primary; + i.up_primary = old_up_primary; + + unsigned num_acting = 0; + for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end(); + ++p) + if (*p != CRUSH_ITEM_NONE) + ++num_acting; + + ceph_assert(lastmap->get_pools().count(pgid.pool())); + const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second; + set<pg_shard_t> old_acting_shards; + old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards); + + if (num_acting && + i.primary != -1 && + num_acting >= old_pg_pool.min_size && + (*could_have_gone_active)(old_acting_shards)) { + if (out) + *out << __func__ << " " << i + << " up_thru " << lastmap->get_up_thru(i.primary) + << " up_from " << lastmap->get_up_from(i.primary) + << " last_epoch_clean " << last_epoch_clean; + if (lastmap->get_up_thru(i.primary) >= i.first && + lastmap->get_up_from(i.primary) <= i.first) { + i.maybe_went_rw = true; + if (out) + *out << " " << i + << " : primary up " << lastmap->get_up_from(i.primary) + << "-" << lastmap->get_up_thru(i.primary) + << " includes interval" + << std::endl; + } else if (last_epoch_clean >= i.first && + last_epoch_clean <= i.last) { + // If the last_epoch_clean is included in this interval, then + // the pg must have been rw (for recovery to have completed). + // This is important because we won't know the _real_ + // first_epoch because we stop at last_epoch_clean, and we + // don't want the oldest interval to randomly have + // maybe_went_rw false depending on the relative up_thru vs + // last_epoch_clean timing. + i.maybe_went_rw = true; + if (out) + *out << " " << i + << " : includes last_epoch_clean " << last_epoch_clean + << " and presumed to have been rw" + << std::endl; + } else { + i.maybe_went_rw = false; + if (out) + *out << " " << i + << " : primary up " << lastmap->get_up_from(i.primary) + << "-" << lastmap->get_up_thru(i.primary) + << " does not include interval" + << std::endl; + } + } else { + i.maybe_went_rw = false; + if (out) + *out << __func__ << " " << i << " : acting set is too small" << std::endl; + } + past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i); + return true; + } else { + return false; + } +} + + +// true if the given map affects the prior set +bool PastIntervals::PriorSet::affected_by_map( + const OSDMap &osdmap, + const DoutPrefixProvider *dpp) const +{ + for (set<pg_shard_t>::iterator p = probe.begin(); + p != probe.end(); + ++p) { + int o = p->osd; + + // did someone in the prior set go down? + if (osdmap.is_down(o) && down.count(o) == 0) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl; + return true; + } + + // did a down osd in cur get (re)marked as lost? + map<int, epoch_t>::const_iterator r = blocked_by.find(o); + if (r != blocked_by.end()) { + if (!osdmap.exists(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl; + return true; + } + if (osdmap.get_info(o).lost_at != r->second) { + ldpp_dout(dpp, 10) << "affected_by_map osd." 
<< o << " (re)marked as lost" << dendl; + return true; + } + } + } + + // did someone in the prior down set go up? + for (set<int>::const_iterator p = down.begin(); + p != down.end(); + ++p) { + int o = *p; + + if (osdmap.is_up(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl; + return true; + } + + // did someone in the prior set get lost or destroyed? + if (!osdmap.exists(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl; + return true; + } + // did a down osd in down get (re)marked as lost? + map<int, epoch_t>::const_iterator r = blocked_by.find(o); + if (r != blocked_by.end()) { + if (osdmap.get_info(o).lost_at != r->second) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl; + return true; + } + } + } + + return false; +} + +ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i) +{ + out << "interval(" << i.first << "-" << i.last + << " up " << i.up << "(" << i.up_primary << ")" + << " acting " << i.acting << "(" << i.primary << ")"; + if (i.maybe_went_rw) + out << " maybe_went_rw"; + out << ")"; + return out; +} + + + +// -- pg_query_t -- + +void pg_query_t::encode(bufferlist &bl, uint64_t features) const { + ENCODE_START(3, 3, bl); + encode(type, bl); + encode(since, bl); + history.encode(bl); + encode(epoch_sent, bl); + encode(to, bl); + encode(from, bl); + ENCODE_FINISH(bl); +} + +void pg_query_t::decode(bufferlist::const_iterator &bl) { + DECODE_START(3, bl); + decode(type, bl); + decode(since, bl); + history.decode(bl); + decode(epoch_sent, bl); + decode(to, bl); + decode(from, bl); + DECODE_FINISH(bl); +} + +void pg_query_t::dump(Formatter *f) const +{ + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_string("type", get_type_name()); + f->dump_stream("since") << since; + f->dump_stream("epoch_sent") << epoch_sent; + f->open_object_section("history"); + history.dump(f); + f->close_section(); +} +void pg_query_t::generate_test_instances(list<pg_query_t*>& o) +{ + o.push_back(new pg_query_t()); + list<pg_history_t*> h; + pg_history_t::generate_test_instances(h); + o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0), + eversion_t(4, 5), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::FULLLOG, + shard_id_t::NO_SHARD, shard_id_t::NO_SHARD, + *h.back(), 5)); +} + +// -- ObjectModDesc -- +void ObjectModDesc::visit(Visitor *visitor) const +{ + auto bp = bl.cbegin(); + try { + while (!bp.end()) { + DECODE_START(max_required_version, bp); + uint8_t code; + decode(code, bp); + switch (code) { + case APPEND: { + uint64_t size; + decode(size, bp); + visitor->append(size); + break; + } + case SETATTRS: { + map<string, boost::optional<bufferlist> > attrs; + decode(attrs, bp); + visitor->setattrs(attrs); + break; + } + case DELETE: { + version_t old_version; + decode(old_version, bp); + visitor->rmobject(old_version); + break; + } + case CREATE: { + visitor->create(); + break; + } + case UPDATE_SNAPS: { + set<snapid_t> snaps; + decode(snaps, bp); + visitor->update_snaps(snaps); + break; + } + case TRY_DELETE: { + version_t old_version; + decode(old_version, bp); + visitor->try_rmobject(old_version); + break; + } + case ROLLBACK_EXTENTS: { + vector<pair<uint64_t, uint64_t> > extents; + version_t gen; + decode(gen, bp); + decode(extents, bp); + 
visitor->rollback_extents(gen,extents); + break; + } + default: + ceph_abort_msg("Invalid rollback code"); + } + DECODE_FINISH(bp); + } + } catch (...) { + ceph_abort_msg("Invalid encoding"); + } +} + +struct DumpVisitor : public ObjectModDesc::Visitor { + Formatter *f; + explicit DumpVisitor(Formatter *f) : f(f) {} + void append(uint64_t old_size) override { + f->open_object_section("op"); + f->dump_string("code", "APPEND"); + f->dump_unsigned("old_size", old_size); + f->close_section(); + } + void setattrs(map<string, boost::optional<bufferlist> > &attrs) override { + f->open_object_section("op"); + f->dump_string("code", "SETATTRS"); + f->open_array_section("attrs"); + for (map<string, boost::optional<bufferlist> >::iterator i = attrs.begin(); + i != attrs.end(); + ++i) { + f->dump_string("attr_name", i->first); + } + f->close_section(); + f->close_section(); + } + void rmobject(version_t old_version) override { + f->open_object_section("op"); + f->dump_string("code", "RMOBJECT"); + f->dump_unsigned("old_version", old_version); + f->close_section(); + } + void try_rmobject(version_t old_version) override { + f->open_object_section("op"); + f->dump_string("code", "TRY_RMOBJECT"); + f->dump_unsigned("old_version", old_version); + f->close_section(); + } + void create() override { + f->open_object_section("op"); + f->dump_string("code", "CREATE"); + f->close_section(); + } + void update_snaps(const set<snapid_t> &snaps) override { + f->open_object_section("op"); + f->dump_string("code", "UPDATE_SNAPS"); + f->dump_stream("snaps") << snaps; + f->close_section(); + } + void rollback_extents( + version_t gen, + const vector<pair<uint64_t, uint64_t> > &extents) override { + f->open_object_section("op"); + f->dump_string("code", "ROLLBACK_EXTENTS"); + f->dump_unsigned("gen", gen); + f->dump_stream("snaps") << extents; + f->close_section(); + } +}; + +void ObjectModDesc::dump(Formatter *f) const +{ + f->open_object_section("object_mod_desc"); + f->dump_bool("can_local_rollback", can_local_rollback); + f->dump_bool("rollback_info_completed", rollback_info_completed); + { + f->open_array_section("ops"); + DumpVisitor vis(f); + visit(&vis); + f->close_section(); + } + f->close_section(); +} + +void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o) +{ + map<string, boost::optional<bufferlist> > attrs; + attrs[OI_ATTR]; + attrs[SS_ATTR]; + attrs["asdf"]; + o.push_back(new ObjectModDesc()); + o.back()->append(100); + o.back()->setattrs(attrs); + o.push_back(new ObjectModDesc()); + o.back()->rmobject(1001); + o.push_back(new ObjectModDesc()); + o.back()->create(); + o.back()->setattrs(attrs); + o.push_back(new ObjectModDesc()); + o.back()->create(); + o.back()->setattrs(attrs); + o.back()->mark_unrollbackable(); + o.back()->append(1000); +} + +void ObjectModDesc::encode(bufferlist &_bl) const +{ + ENCODE_START(max_required_version, max_required_version, _bl); + encode(can_local_rollback, _bl); + encode(rollback_info_completed, _bl); + encode(bl, _bl); + ENCODE_FINISH(_bl); +} +void ObjectModDesc::decode(bufferlist::const_iterator &_bl) +{ + DECODE_START(2, _bl); + max_required_version = struct_v; + decode(can_local_rollback, _bl); + decode(rollback_info_completed, _bl); + decode(bl, _bl); + // ensure bl does not pin a larger buffer in memory + bl.rebuild(); + bl.reassign_to_mempool(mempool::mempool_osd_pglog); + DECODE_FINISH(_bl); +} + +// -- pg_log_entry_t -- + +string pg_log_entry_t::get_key_name() const +{ + return version.get_key_name(); +} + +void 
pg_log_entry_t::encode_with_checksum(bufferlist& bl) const +{ + using ceph::encode; + bufferlist ebl(sizeof(*this)*2); + this->encode(ebl); + __u32 crc = ebl.crc32c(0); + encode(ebl, bl); + encode(crc, bl); +} + +void pg_log_entry_t::decode_with_checksum(bufferlist::const_iterator& p) +{ + using ceph::decode; + bufferlist bl; + decode(bl, p); + __u32 crc; + decode(crc, p); + if (crc != bl.crc32c(0)) + throw buffer::malformed_input("bad checksum on pg_log_entry_t"); + auto q = bl.cbegin(); + this->decode(q); +} + +void pg_log_entry_t::encode(bufferlist &bl) const +{ + ENCODE_START(12, 4, bl); + encode(op, bl); + encode(soid, bl); + encode(version, bl); + + /** + * Added with reverting_to: + * Previous code used prior_version to encode + * what we now call reverting_to. This will + * allow older code to decode reverting_to + * into prior_version as expected. + */ + if (op == LOST_REVERT) + encode(reverting_to, bl); + else + encode(prior_version, bl); + + encode(reqid, bl); + encode(mtime, bl); + if (op == LOST_REVERT) + encode(prior_version, bl); + encode(snaps, bl); + encode(user_version, bl); + encode(mod_desc, bl); + encode(extra_reqids, bl); + if (op == ERROR) + encode(return_code, bl); + if (!extra_reqids.empty()) + encode(extra_reqid_return_codes, bl); + ENCODE_FINISH(bl); +} + +void pg_log_entry_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(12, 4, 4, bl); + decode(op, bl); + if (struct_v < 2) { + sobject_t old_soid; + decode(old_soid, bl); + soid.oid = old_soid.oid; + soid.snap = old_soid.snap; + invalid_hash = true; + } else { + decode(soid, bl); + } + if (struct_v < 3) + invalid_hash = true; + decode(version, bl); + + if (struct_v >= 6 && op == LOST_REVERT) + decode(reverting_to, bl); + else + decode(prior_version, bl); + + decode(reqid, bl); + + decode(mtime, bl); + if (struct_v < 5) + invalid_pool = true; + + if (op == LOST_REVERT) { + if (struct_v >= 6) { + decode(prior_version, bl); + } else { + reverting_to = prior_version; + } + } + if (struct_v >= 7 || // for v >= 7, this is for all ops. + op == CLONE) { // for v < 7, it's only present for CLONE. 
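+ // snaps is carried as an opaque bufferlist; readers such as dump()
+ // decode the vector<snapid_t> from it only when they need it.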
+ decode(snaps, bl); + // ensure snaps does not pin a larger buffer in memory + snaps.rebuild(); + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + if (struct_v >= 8) + decode(user_version, bl); + else + user_version = version.version; + + if (struct_v >= 9) + decode(mod_desc, bl); + else + mod_desc.mark_unrollbackable(); + if (struct_v >= 10) + decode(extra_reqids, bl); + if (struct_v >= 11 && op == ERROR) + decode(return_code, bl); + if (struct_v >= 12 && !extra_reqids.empty()) + decode(extra_reqid_return_codes, bl); + DECODE_FINISH(bl); +} + +void pg_log_entry_t::dump(Formatter *f) const +{ + f->dump_string("op", get_op_name()); + f->dump_stream("object") << soid; + f->dump_stream("version") << version; + f->dump_stream("prior_version") << prior_version; + f->dump_stream("reqid") << reqid; + f->open_array_section("extra_reqids"); + uint32_t idx = 0; + for (auto p = extra_reqids.begin(); + p != extra_reqids.end(); + ++idx, ++p) { + f->open_object_section("extra_reqid"); + f->dump_stream("reqid") << p->first; + f->dump_stream("user_version") << p->second; + auto it = extra_reqid_return_codes.find(idx); + if (it != extra_reqid_return_codes.end()) { + f->dump_int("return_code", it->second); + } + f->close_section(); + } + f->close_section(); + f->dump_stream("mtime") << mtime; + f->dump_int("return_code", return_code); + if (snaps.length() > 0) { + vector<snapid_t> v; + bufferlist c = snaps; + auto p = c.cbegin(); + try { + using ceph::decode; + decode(v, p); + } catch (...) { + v.clear(); + } + f->open_object_section("snaps"); + for (vector<snapid_t>::iterator p = v.begin(); p != v.end(); ++p) + f->dump_unsigned("snap", *p); + f->close_section(); + } + { + f->open_object_section("mod_desc"); + mod_desc.dump(f); + f->close_section(); + } +} + +void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o) +{ + o.push_back(new pg_log_entry_t()); + hobject_t oid(object_t("objname"), "key", 123, 456, 0, ""); + o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4), + 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + utime_t(8,9), 0)); + o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4), + 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + utime_t(8,9), -ENOENT)); +} + +ostream& operator<<(ostream& out, const pg_log_entry_t& e) +{ + out << e.version << " (" << e.prior_version << ") " + << std::left << std::setw(8) << e.get_op_name() << ' ' + << e.soid << " by " << e.reqid << " " << e.mtime + << " " << e.return_code; + if (e.snaps.length()) { + vector<snapid_t> snaps; + bufferlist c = e.snaps; + auto p = c.cbegin(); + try { + decode(snaps, p); + } catch (...) 
{ + snaps.clear(); + } + out << " snaps " << snaps; + } + return out; +} + +// -- pg_log_dup_t -- + +std::string pg_log_dup_t::get_key_name() const +{ + static const char prefix[] = "dup_"; + std::string key(36, ' '); + memcpy(&key[0], prefix, 4); + version.get_key_name(&key[4]); + key.resize(35); // remove the null terminator + return key; +} + +void pg_log_dup_t::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(reqid, bl); + encode(version, bl); + encode(user_version, bl); + encode(return_code, bl); + ENCODE_FINISH(bl); +} + +void pg_log_dup_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(reqid, bl); + decode(version, bl); + decode(user_version, bl); + decode(return_code, bl); + DECODE_FINISH(bl); +} + +void pg_log_dup_t::dump(Formatter *f) const +{ + f->dump_stream("reqid") << reqid; + f->dump_stream("version") << version; + f->dump_stream("user_version") << user_version; + f->dump_stream("return_code") << return_code; +} + +void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o) +{ + o.push_back(new pg_log_dup_t()); + o.push_back(new pg_log_dup_t(eversion_t(1,2), + 1, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + 0)); + o.push_back(new pg_log_dup_t(eversion_t(1,2), + 2, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + -ENOENT)); +} + + +std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) { + return out << "log_dup(reqid=" << e.reqid << + " v=" << e.version << " uv=" << e.user_version << + " rc=" << e.return_code << ")"; +} + + +// -- pg_log_t -- + +// out: pg_log_t that only has entries that apply to import_pgid using curmap +// reject: Entries rejected from "in" are in the reject.log. Other fields not set. +void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap, + const string &hit_set_namespace, const pg_log_t &in, + pg_log_t &out, pg_log_t &reject) +{ + out = in; + out.log.clear(); + reject.log.clear(); + + for (list<pg_log_entry_t>::const_iterator i = in.log.begin(); + i != in.log.end(); ++i) { + + // Reject pg log entries for temporary objects + if (i->soid.is_temp()) { + reject.log.push_back(*i); + continue; + } + + if (i->soid.nspace != hit_set_namespace) { + object_t oid = i->soid.oid; + object_locator_t loc(i->soid); + pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc); + pg_t pgid = curmap.raw_pg_to_pg(raw_pgid); + + if (import_pgid.pgid == pgid) { + out.log.push_back(*i); + } else { + reject.log.push_back(*i); + } + } else { + out.log.push_back(*i); + } + } +} + +void pg_log_t::encode(bufferlist& bl) const +{ + ENCODE_START(7, 3, bl); + encode(head, bl); + encode(tail, bl); + encode(log, bl); + encode(can_rollback_to, bl); + encode(rollback_info_trimmed_to, bl); + encode(dups, bl); + ENCODE_FINISH(bl); +} + +void pg_log_t::decode(bufferlist::const_iterator &bl, int64_t pool) +{ + DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl); + decode(head, bl); + decode(tail, bl); + if (struct_v < 2) { + bool backlog; + decode(backlog, bl); + } + decode(log, bl); + if (struct_v >= 5) + decode(can_rollback_to, bl); + + if (struct_v >= 6) + decode(rollback_info_trimmed_to, bl); + else + rollback_info_trimmed_to = tail; + + if (struct_v >= 7) + decode(dups, bl); + + DECODE_FINISH(bl); + + // handle hobject_t format change + if (struct_v < 4) { + for (list<pg_log_entry_t>::iterator i = log.begin(); + i != log.end(); + ++i) { + if (!i->soid.is_max() && i->soid.pool == -1) + i->soid.pool = pool; + } + } +} + +void pg_log_t::dump(Formatter *f) const +{ + f->dump_stream("head") << head; + 
f->dump_stream("tail") << tail; + f->open_array_section("log"); + for (list<pg_log_entry_t>::const_iterator p = log.begin(); p != log.end(); ++p) { + f->open_object_section("entry"); + p->dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("dups"); + for (const auto& entry : dups) { + f->open_object_section("entry"); + entry.dump(f); + f->close_section(); + } + f->close_section(); +} + +void pg_log_t::generate_test_instances(list<pg_log_t*>& o) +{ + o.push_back(new pg_log_t); + + // this is nonsensical: + o.push_back(new pg_log_t); + o.back()->head = eversion_t(1,2); + o.back()->tail = eversion_t(3,4); + list<pg_log_entry_t*> e; + pg_log_entry_t::generate_test_instances(e); + for (list<pg_log_entry_t*>::iterator p = e.begin(); p != e.end(); ++p) + o.back()->log.push_back(**p); +} + +static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups) +{ + auto earliest_dup_version = + target.head.version < maxdups ? 0u : target.head.version - maxdups + 1; + lgeneric_subdout(cct, osd, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version << dendl; + + for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) { + if (d->version.version >= earliest_dup_version) { + lgeneric_subdout(cct, osd, 20) + << "copy_up_to/copy_after copy dup version " + << d->version << dendl; + target.dups.push_back(pg_log_dup_t(*d)); + } + } + + for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) { + ceph_assert(i->version > other.tail); + if (i->version > target.tail) + break; + if (i->version.version >= earliest_dup_version) { + lgeneric_subdout(cct, osd, 20) + << "copy_up_to/copy_after copy dup from log version " + << i->version << dendl; + target.dups.push_back(pg_log_dup_t(*i)); + } + } +} + + +void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v) +{ + can_rollback_to = other.can_rollback_to; + head = other.head; + tail = other.tail; + lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v << dendl; + for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin(); + i != other.log.rend(); + ++i) { + ceph_assert(i->version > other.tail); + if (i->version <= v) { + // make tail accurate. 
+ tail = i->version; + break; + } + lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl; + log.push_front(*i); + } + _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked); +} + +void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max) +{ + can_rollback_to = other.can_rollback_to; + int n = 0; + head = other.head; + tail = other.tail; + lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max << dendl; + for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin(); + i != other.log.rend(); + ++i) { + ceph_assert(i->version > other.tail); + if (n++ >= max) { + tail = i->version; + break; + } + lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl; + log.push_front(*i); + } + _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked); +} + +ostream& pg_log_t::print(ostream& out) const +{ + out << *this << std::endl; + for (list<pg_log_entry_t>::const_iterator p = log.begin(); + p != log.end(); + ++p) + out << *p << std::endl; + for (const auto& entry : dups) { + out << " dup entry: " << entry << std::endl; + } + return out; +} + +// -- pg_missing_t -- + +ostream& operator<<(ostream& out, const pg_missing_item& i) +{ + out << i.need; + if (i.have != eversion_t()) + out << "(" << i.have << ")"; + out << " flags = " << i.flag_str(); + return out; +} + +// -- object_copy_cursor_t -- + +void object_copy_cursor_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(attr_complete, bl); + encode(data_offset, bl); + encode(data_complete, bl); + encode(omap_offset, bl); + encode(omap_complete, bl); + ENCODE_FINISH(bl); +} + +void object_copy_cursor_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(attr_complete, bl); + decode(data_offset, bl); + decode(data_complete, bl); + decode(omap_offset, bl); + decode(omap_complete, bl); + DECODE_FINISH(bl); +} + +void object_copy_cursor_t::dump(Formatter *f) const +{ + f->dump_unsigned("attr_complete", (int)attr_complete); + f->dump_unsigned("data_offset", data_offset); + f->dump_unsigned("data_complete", (int)data_complete); + f->dump_string("omap_offset", omap_offset); + f->dump_unsigned("omap_complete", (int)omap_complete); +} + +void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o) +{ + o.push_back(new object_copy_cursor_t); + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_offset = 123; + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_complete = true; + o.back()->omap_offset = "foo"; + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_complete = true; + o.back()->omap_complete = true; +} + +// -- object_copy_data_t -- + +void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(8, 5, bl); + encode(size, bl); + encode(mtime, bl); + encode(attrs, bl); + encode(data, bl); + encode(omap_data, bl); + encode(cursor, bl); + encode(omap_header, bl); + encode(snaps, bl); + encode(snap_seq, bl); + encode(flags, bl); + encode(data_digest, bl); + encode(omap_digest, bl); + encode(reqids, bl); + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(reqid_return_codes, bl); + ENCODE_FINISH(bl); +} + +void object_copy_data_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(7, bl); + if (struct_v < 5) { + // old + decode(size, bl); + decode(mtime, bl); + { + string category; + decode(category, bl); 
// no longer used + } + decode(attrs, bl); + decode(data, bl); + { + map<string,bufferlist> omap; + decode(omap, bl); + omap_data.clear(); + if (!omap.empty()) { + using ceph::encode; + encode(omap, omap_data); + } + } + decode(cursor, bl); + if (struct_v >= 2) + decode(omap_header, bl); + if (struct_v >= 3) { + decode(snaps, bl); + decode(snap_seq, bl); + } else { + snaps.clear(); + snap_seq = 0; + } + if (struct_v >= 4) { + decode(flags, bl); + decode(data_digest, bl); + decode(omap_digest, bl); + } + } else { + // current + decode(size, bl); + decode(mtime, bl); + decode(attrs, bl); + decode(data, bl); + decode(omap_data, bl); + decode(cursor, bl); + decode(omap_header, bl); + decode(snaps, bl); + decode(snap_seq, bl); + if (struct_v >= 4) { + decode(flags, bl); + decode(data_digest, bl); + decode(omap_digest, bl); + } + if (struct_v >= 6) { + decode(reqids, bl); + } + if (struct_v >= 7) { + decode(truncate_seq, bl); + decode(truncate_size, bl); + } + if (struct_v >= 8) { + decode(reqid_return_codes, bl); + } + } + DECODE_FINISH(bl); +} + +void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o) +{ + o.push_back(new object_copy_data_t()); + + list<object_copy_cursor_t*> cursors; + object_copy_cursor_t::generate_test_instances(cursors); + list<object_copy_cursor_t*>::iterator ci = cursors.begin(); + o.back()->cursor = **(ci++); + + o.push_back(new object_copy_data_t()); + o.back()->cursor = **(ci++); + + o.push_back(new object_copy_data_t()); + o.back()->size = 1234; + o.back()->mtime.set_from_double(1234); + bufferptr bp("there", 5); + bufferlist bl; + bl.push_back(bp); + o.back()->attrs["hello"] = bl; + bufferptr bp2("not", 3); + bufferlist bl2; + bl2.push_back(bp2); + map<string,bufferlist> omap; + omap["why"] = bl2; + using ceph::encode; + encode(omap, o.back()->omap_data); + bufferptr databp("iamsomedatatocontain", 20); + o.back()->data.push_back(databp); + o.back()->omap_header.append("this is an omap header"); + o.back()->snaps.push_back(123); + o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t())); +} + +void object_copy_data_t::dump(Formatter *f) const +{ + f->open_object_section("cursor"); + cursor.dump(f); + f->close_section(); // cursor + f->dump_int("size", size); + f->dump_stream("mtime") << mtime; + /* we should really print out the attrs here, but bufferlist + const-correctness prevents that */ + f->dump_int("attrs_size", attrs.size()); + f->dump_int("flags", flags); + f->dump_unsigned("data_digest", data_digest); + f->dump_unsigned("omap_digest", omap_digest); + f->dump_int("omap_data_length", omap_data.length()); + f->dump_int("omap_header_length", omap_header.length()); + f->dump_int("data_length", data.length()); + f->open_array_section("snaps"); + for (vector<snapid_t>::const_iterator p = snaps.begin(); + p != snaps.end(); ++p) + f->dump_unsigned("snap", *p); + f->close_section(); + f->open_array_section("reqids"); + uint32_t idx = 0; + for (auto p = reqids.begin(); + p != reqids.end(); + ++idx, ++p) { + f->open_object_section("extra_reqid"); + f->dump_stream("reqid") << p->first; + f->dump_stream("user_version") << p->second; + auto it = reqid_return_codes.find(idx); + if (it != reqid_return_codes.end()) { + f->dump_int("return_code", it->second); + } + f->close_section(); + } + f->close_section(); +} + +// -- pg_create_t -- + +void pg_create_t::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(created, bl); + encode(parent, bl); + encode(split_bits, bl); + ENCODE_FINISH(bl); +} + +void 
pg_create_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(created, bl); + decode(parent, bl); + decode(split_bits, bl); + DECODE_FINISH(bl); +} + +void pg_create_t::dump(Formatter *f) const +{ + f->dump_unsigned("created", created); + f->dump_stream("parent") << parent; + f->dump_int("split_bits", split_bits); +} + +void pg_create_t::generate_test_instances(list<pg_create_t*>& o) +{ + o.push_back(new pg_create_t); + o.push_back(new pg_create_t(1, pg_t(3, 4), 2)); +} + + +// -- pg_hit_set_info_t -- + +void pg_hit_set_info_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 1, bl); + encode(begin, bl); + encode(end, bl); + encode(version, bl); + encode(using_gmt, bl); + ENCODE_FINISH(bl); +} + +void pg_hit_set_info_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(2, p); + decode(begin, p); + decode(end, p); + decode(version, p); + if (struct_v >= 2) { + decode(using_gmt, p); + } else { + using_gmt = false; + } + DECODE_FINISH(p); +} + +void pg_hit_set_info_t::dump(Formatter *f) const +{ + f->dump_stream("begin") << begin; + f->dump_stream("end") << end; + f->dump_stream("version") << version; + f->dump_stream("using_gmt") << using_gmt; +} + +void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls) +{ + ls.push_back(new pg_hit_set_info_t); + ls.push_back(new pg_hit_set_info_t); + ls.back()->begin = utime_t(1, 2); + ls.back()->end = utime_t(3, 4); +} + + +// -- pg_hit_set_history_t -- + +void pg_hit_set_history_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(current_last_update, bl); + { + utime_t dummy_stamp; + encode(dummy_stamp, bl); + } + { + pg_hit_set_info_t dummy_info; + encode(dummy_info, bl); + } + encode(history, bl); + ENCODE_FINISH(bl); +} + +void pg_hit_set_history_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(current_last_update, p); + { + utime_t dummy_stamp; + decode(dummy_stamp, p); + } + { + pg_hit_set_info_t dummy_info; + decode(dummy_info, p); + } + decode(history, p); + DECODE_FINISH(p); +} + +void pg_hit_set_history_t::dump(Formatter *f) const +{ + f->dump_stream("current_last_update") << current_last_update; + f->open_array_section("history"); + for (list<pg_hit_set_info_t>::const_iterator p = history.begin(); + p != history.end(); ++p) { + f->open_object_section("info"); + p->dump(f); + f->close_section(); + } + f->close_section(); +} + +void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls) +{ + ls.push_back(new pg_hit_set_history_t); + ls.push_back(new pg_hit_set_history_t); + ls.back()->current_last_update = eversion_t(1, 2); + ls.back()->history.push_back(pg_hit_set_info_t()); +} + +// -- OSDSuperblock -- + +void OSDSuperblock::encode(bufferlist &bl) const +{ + ENCODE_START(8, 5, bl); + encode(cluster_fsid, bl); + encode(whoami, bl); + encode(current_epoch, bl); + encode(oldest_map, bl); + encode(newest_map, bl); + encode(weight, bl); + compat_features.encode(bl); + encode(clean_thru, bl); + encode(mounted, bl); + encode(osd_fsid, bl); + encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full + encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full + ENCODE_FINISH(bl); +} + +void OSDSuperblock::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl); + if (struct_v < 3) { + string magic; + decode(magic, bl); + } + decode(cluster_fsid, bl); + decode(whoami, bl); + decode(current_epoch, bl); + decode(oldest_map, bl); + decode(newest_map, bl); + decode(weight, bl); + if 
(struct_v >= 2) { + compat_features.decode(bl); + } else { //upgrade it! + compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + } + decode(clean_thru, bl); + decode(mounted, bl); + if (struct_v >= 4) + decode(osd_fsid, bl); + if (struct_v >= 6) { + epoch_t last_map_marked_full; + decode(last_map_marked_full, bl); + } + if (struct_v >= 7) { + map<int64_t,epoch_t> pool_last_map_marked_full; + decode(pool_last_map_marked_full, bl); + } + DECODE_FINISH(bl); +} + +void OSDSuperblock::dump(Formatter *f) const +{ + f->dump_stream("cluster_fsid") << cluster_fsid; + f->dump_stream("osd_fsid") << osd_fsid; + f->dump_int("whoami", whoami); + f->dump_int("current_epoch", current_epoch); + f->dump_int("oldest_map", oldest_map); + f->dump_int("newest_map", newest_map); + f->dump_float("weight", weight); + f->open_object_section("compat"); + compat_features.dump(f); + f->close_section(); + f->dump_int("clean_thru", clean_thru); + f->dump_int("last_epoch_mounted", mounted); +} + +void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o) +{ + OSDSuperblock z; + o.push_back(new OSDSuperblock(z)); + z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101"); + z.osd_fsid.parse("02020202-0202-0202-0202-020202020202"); + z.whoami = 3; + z.current_epoch = 4; + z.oldest_map = 5; + z.newest_map = 9; + z.mounted = 8; + z.clean_thru = 7; + o.push_back(new OSDSuperblock(z)); + o.push_back(new OSDSuperblock(z)); +} + +// -- SnapSet -- + +void SnapSet::encode(bufferlist& bl) const +{ + ENCODE_START(3, 2, bl); + encode(seq, bl); + encode(true, bl); // head_exists + encode(snaps, bl); + encode(clones, bl); + encode(clone_overlap, bl); + encode(clone_size, bl); + encode(clone_snaps, bl); + ENCODE_FINISH(bl); +} + +void SnapSet::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(seq, bl); + bl.advance(1u); // skip legacy head_exists (always true) + decode(snaps, bl); + decode(clones, bl); + decode(clone_overlap, bl); + decode(clone_size, bl); + if (struct_v >= 3) { + decode(clone_snaps, bl); + } else { + clone_snaps.clear(); + } + DECODE_FINISH(bl); +} + +void SnapSet::dump(Formatter *f) const +{ + SnapContext sc(seq, snaps); + f->open_object_section("snap_context"); + sc.dump(f); + f->close_section(); + f->open_array_section("clones"); + for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) { + f->open_object_section("clone"); + f->dump_unsigned("snap", *p); + auto cs = clone_size.find(*p); + if (cs != clone_size.end()) + f->dump_unsigned("size", cs->second); + else + f->dump_string("size", "????"); + auto co = clone_overlap.find(*p); + if (co != clone_overlap.end()) + f->dump_stream("overlap") << co->second; + else + f->dump_stream("overlap") << "????"; + auto q = clone_snaps.find(*p); + if (q != clone_snaps.end()) { + f->open_array_section("snaps"); + for (auto s : q->second) { + f->dump_unsigned("snap", s); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +void SnapSet::generate_test_instances(list<SnapSet*>& o) +{ + o.push_back(new SnapSet); + o.push_back(new SnapSet); + o.back()->seq = 123; + o.back()->snaps.push_back(123); + o.back()->snaps.push_back(12); + o.push_back(new SnapSet); + o.back()->seq = 123; + o.back()->snaps.push_back(123); + o.back()->snaps.push_back(12); + o.back()->clones.push_back(12); + o.back()->clone_size[12] = 12345; + o.back()->clone_overlap[12]; + o.back()->clone_snaps[12] = {12, 10, 8}; +} + +ostream& operator<<(ostream& out, const SnapSet& cs) +{ + return 
out << cs.seq << "=" << cs.snaps << ":" + << cs.clone_snaps; +} + +void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy) +{ + // NOTE: our reconstruction of snaps (and the snapc) is not strictly + // correct: it will not include snaps that still logically exist + // but for which there was no clone that is defined. For all + // practical purposes this doesn't matter, since we only use that + // information to clone on the OSD, and we have already moved + // forward past that part of the object history. + + seq = ss.seq; + set<snapid_t> _snaps; + set<snapid_t> _clones; + for (vector<librados::clone_info_t>::const_iterator p = ss.clones.begin(); + p != ss.clones.end(); + ++p) { + if (p->cloneid != librados::SNAP_HEAD) { + _clones.insert(p->cloneid); + _snaps.insert(p->snaps.begin(), p->snaps.end()); + clone_size[p->cloneid] = p->size; + clone_overlap[p->cloneid]; // the entry must exist, even if it's empty. + for (vector<pair<uint64_t, uint64_t> >::const_iterator q = + p->overlap.begin(); q != p->overlap.end(); ++q) + clone_overlap[p->cloneid].insert(q->first, q->second); + if (!legacy) { + // p->snaps is ascending; clone_snaps is descending + vector<snapid_t>& v = clone_snaps[p->cloneid]; + for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) { + v.push_back(*q); + } + } + } + } + + // ascending + clones.clear(); + clones.reserve(_clones.size()); + for (set<snapid_t>::iterator p = _clones.begin(); p != _clones.end(); ++p) + clones.push_back(*p); + + // descending + snaps.clear(); + snaps.reserve(_snaps.size()); + for (set<snapid_t>::reverse_iterator p = _snaps.rbegin(); + p != _snaps.rend(); ++p) + snaps.push_back(*p); +} + +uint64_t SnapSet::get_clone_bytes(snapid_t clone) const +{ + ceph_assert(clone_size.count(clone)); + uint64_t size = clone_size.find(clone)->second; + ceph_assert(clone_overlap.count(clone)); + const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second; + ceph_assert(size >= (uint64_t)overlap.size()); + return size - overlap.size(); +} + +void SnapSet::filter(const pg_pool_t &pinfo) +{ + vector<snapid_t> oldsnaps; + oldsnaps.swap(snaps); + for (vector<snapid_t>::const_iterator i = oldsnaps.begin(); + i != oldsnaps.end(); + ++i) { + if (!pinfo.is_removed_snap(*i)) + snaps.push_back(*i); + } +} + +SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const +{ + SnapSet ss = *this; + ss.filter(pinfo); + return ss; +} + +// -- watch_info_t -- + +void watch_info_t::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(4, 3, bl); + encode(cookie, bl); + encode(timeout_seconds, bl); + encode(addr, bl, features); + ENCODE_FINISH(bl); +} + +void watch_info_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); + decode(cookie, bl); + if (struct_v < 2) { + uint64_t ver; + decode(ver, bl); + } + decode(timeout_seconds, bl); + if (struct_v >= 4) { + decode(addr, bl); + } + DECODE_FINISH(bl); +} + +void watch_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("cookie", cookie); + f->dump_unsigned("timeout_seconds", timeout_seconds); + f->open_object_section("addr"); + addr.dump(f); + f->close_section(); +} + +void watch_info_t::generate_test_instances(list<watch_info_t*>& o) +{ + o.push_back(new watch_info_t); + o.push_back(new watch_info_t); + o.back()->cookie = 123; + o.back()->timeout_seconds = 99; + entity_addr_t ea; + ea.set_type(entity_addr_t::TYPE_LEGACY); + ea.set_nonce(1); + ea.set_family(AF_INET); + ea.set_in4_quad(0, 127); + ea.set_in4_quad(1, 0); + ea.set_in4_quad(2, 1); + 
ea.set_in4_quad(3, 2); + ea.set_port(2); + o.back()->addr = ea; +} + +// -- chunk_info_t -- + +void chunk_info_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(offset, bl); + encode(length, bl); + encode(oid, bl); + __u32 _flags = flags; + encode(_flags, bl); + ENCODE_FINISH(bl); +} + +void chunk_info_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(offset, bl); + decode(length, bl); + decode(oid, bl); + __u32 _flags; + decode(_flags, bl); + flags = (cflag_t)_flags; + DECODE_FINISH(bl); +} + +void chunk_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("length", length); + f->open_object_section("oid"); + oid.dump(f); + f->close_section(); + f->dump_unsigned("flags", flags); +} + +ostream& operator<<(ostream& out, const chunk_info_t& ci) +{ + return out << "(len: " << ci.length << " oid: " << ci.oid + << " offset: " << ci.offset + << " flags: " << ci.get_flag_string(ci.flags) << ")"; +} + +// -- object_manifest_t -- + +void object_manifest_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(type, bl); + switch (type) { + case TYPE_NONE: break; + case TYPE_REDIRECT: + encode(redirect_target, bl); + break; + case TYPE_CHUNKED: + encode(chunk_map, bl); + break; + default: + ceph_abort(); + } + ENCODE_FINISH(bl); +} + +void object_manifest_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(type, bl); + switch (type) { + case TYPE_NONE: break; + case TYPE_REDIRECT: + decode(redirect_target, bl); + break; + case TYPE_CHUNKED: + decode(chunk_map, bl); + break; + default: + ceph_abort(); + } + DECODE_FINISH(bl); +} + +void object_manifest_t::dump(Formatter *f) const +{ + f->dump_unsigned("type", type); + if (type == TYPE_REDIRECT) { + f->open_object_section("redirect_target"); + redirect_target.dump(f); + f->close_section(); + } else if (type == TYPE_CHUNKED) { + f->open_array_section("chunk_map"); + for (auto& p : chunk_map) { + f->open_object_section("chunk"); + f->dump_unsigned("offset", p.first); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + } +} + +void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o) +{ + o.push_back(new object_manifest_t()); + o.back()->type = TYPE_REDIRECT; +} + +ostream& operator<<(ostream& out, const object_manifest_t& om) +{ + out << "manifest(" << om.get_type_name(); + if (om.is_redirect()) { + out << " " << om.redirect_target; + } else if (om.is_chunked()) { + out << " " << om.chunk_map; + } + out << ")"; + return out; +} + +// -- object_info_t -- + +void object_info_t::copy_user_bits(const object_info_t& other) +{ + // these bits are copied from head->clone. 
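+ // i.e. the user-visible object state (size, mtimes, digests, flags,
+ // user_version), not PG-internal fields such as soid or watchers.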
+ size = other.size; + mtime = other.mtime; + local_mtime = other.local_mtime; + last_reqid = other.last_reqid; + truncate_seq = other.truncate_seq; + truncate_size = other.truncate_size; + flags = other.flags; + user_version = other.user_version; + data_digest = other.data_digest; + omap_digest = other.omap_digest; +} + +void object_info_t::encode(bufferlist& bl, uint64_t features) const +{ + object_locator_t myoloc(soid); + map<entity_name_t, watch_info_t> old_watchers; + for (map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator i = + watchers.begin(); + i != watchers.end(); + ++i) { + old_watchers.insert(make_pair(i->first.second, i->second)); + } + ENCODE_START(17, 8, bl); + encode(soid, bl); + encode(myoloc, bl); //Retained for compatibility + encode((__u32)0, bl); // was category, no longer used + encode(version, bl); + encode(prior_version, bl); + encode(last_reqid, bl); + encode(size, bl); + encode(mtime, bl); + if (soid.snap == CEPH_NOSNAP) + encode(osd_reqid_t(), bl); // used to be wrlock_by + else + encode((uint32_t)0, bl); // was legacy_snaps + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(is_lost(), bl); + encode(old_watchers, bl, features); + /* shenanigans to avoid breaking backwards compatibility in the disk format. + * When we can, switch this out for simply putting the version_t on disk. */ + eversion_t user_eversion(0, user_version); + encode(user_eversion, bl); + encode(test_flag(FLAG_USES_TMAP), bl); + encode(watchers, bl, features); + __u32 _flags = flags; + encode(_flags, bl); + encode(local_mtime, bl); + encode(data_digest, bl); + encode(omap_digest, bl); + encode(expected_object_size, bl); + encode(expected_write_size, bl); + encode(alloc_hint_flags, bl); + if (has_manifest()) { + encode(manifest, bl); + } + ENCODE_FINISH(bl); +} + +void object_info_t::decode(bufferlist::const_iterator& bl) +{ + object_locator_t myoloc; + DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl); + map<entity_name_t, watch_info_t> old_watchers; + decode(soid, bl); + decode(myoloc, bl); + { + string category; + decode(category, bl); // no longer used + } + decode(version, bl); + decode(prior_version, bl); + decode(last_reqid, bl); + decode(size, bl); + decode(mtime, bl); + if (soid.snap == CEPH_NOSNAP) { + osd_reqid_t wrlock_by; + decode(wrlock_by, bl); + } else { + vector<snapid_t> legacy_snaps; + decode(legacy_snaps, bl); + } + decode(truncate_seq, bl); + decode(truncate_size, bl); + + // if this is struct_v >= 13, we will overwrite this + // below since this field is just here for backwards + // compatibility + __u8 lo; + decode(lo, bl); + flags = (flag_t)lo; + + decode(old_watchers, bl); + eversion_t user_eversion; + decode(user_eversion, bl); + user_version = user_eversion.version; + + if (struct_v >= 9) { + bool uses_tmap = false; + decode(uses_tmap, bl); + if (uses_tmap) + set_flag(FLAG_USES_TMAP); + } else { + set_flag(FLAG_USES_TMAP); + } + if (struct_v < 10) + soid.pool = myoloc.pool; + if (struct_v >= 11) { + decode(watchers, bl); + } else { + for (map<entity_name_t, watch_info_t>::iterator i = old_watchers.begin(); + i != old_watchers.end(); + ++i) { + watchers.insert( + make_pair( + make_pair(i->second.cookie, i->first), i->second)); + } + } + if (struct_v >= 13) { + __u32 _flags; + decode(_flags, bl); + flags = (flag_t)_flags; + } + if (struct_v >= 14) { + decode(local_mtime, bl); + } else { + local_mtime = utime_t(); + } + if (struct_v >= 15) { + decode(data_digest, bl); + decode(omap_digest, bl); + } else { + data_digest = omap_digest = -1; + 
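+ // and clear the digest-valid flags so the placeholder -1 values are
+ // never trusted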
clear_flag(FLAG_DATA_DIGEST); + clear_flag(FLAG_OMAP_DIGEST); + } + if (struct_v >= 16) { + decode(expected_object_size, bl); + decode(expected_write_size, bl); + decode(alloc_hint_flags, bl); + } else { + expected_object_size = 0; + expected_write_size = 0; + alloc_hint_flags = 0; + } + if (struct_v >= 17) { + if (has_manifest()) { + decode(manifest, bl); + } + } + DECODE_FINISH(bl); +} + +void object_info_t::dump(Formatter *f) const +{ + f->open_object_section("oid"); + soid.dump(f); + f->close_section(); + f->dump_stream("version") << version; + f->dump_stream("prior_version") << prior_version; + f->dump_stream("last_reqid") << last_reqid; + f->dump_unsigned("user_version", user_version); + f->dump_unsigned("size", size); + f->dump_stream("mtime") << mtime; + f->dump_stream("local_mtime") << local_mtime; + f->dump_unsigned("lost", (int)is_lost()); + vector<string> sv = get_flag_vector(flags); + f->open_array_section("flags"); + for (auto str: sv) + f->dump_string("flags", str); + f->close_section(); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_format("data_digest", "0x%08x", data_digest); + f->dump_format("omap_digest", "0x%08x", omap_digest); + f->dump_unsigned("expected_object_size", expected_object_size); + f->dump_unsigned("expected_write_size", expected_write_size); + f->dump_unsigned("alloc_hint_flags", alloc_hint_flags); + f->dump_object("manifest", manifest); + f->open_object_section("watchers"); + for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p = + watchers.begin(); p != watchers.end(); ++p) { + stringstream ss; + ss << p->first.second; + f->open_object_section(ss.str().c_str()); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void object_info_t::generate_test_instances(list<object_info_t*>& o) +{ + o.push_back(new object_info_t()); + + // fixme +} + + +ostream& operator<<(ostream& out, const object_info_t& oi) +{ + out << oi.soid << "(" << oi.version + << " " << oi.last_reqid; + if (oi.flags) + out << " " << oi.get_flag_string(); + out << " s " << oi.size; + out << " uv " << oi.user_version; + if (oi.is_data_digest()) + out << " dd " << std::hex << oi.data_digest << std::dec; + if (oi.is_omap_digest()) + out << " od " << std::hex << oi.omap_digest << std::dec; + out << " alloc_hint [" << oi.expected_object_size + << " " << oi.expected_write_size + << " " << oi.alloc_hint_flags << "]"; + if (oi.has_manifest()) + out << " " << oi.manifest; + out << ")"; + return out; +} + +// -- ObjectRecovery -- +void ObjectRecoveryProgress::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(data_complete, bl); + encode(data_recovered_to, bl); + encode(omap_recovered_to, bl); + encode(omap_complete, bl); + ENCODE_FINISH(bl); +} + +void ObjectRecoveryProgress::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(first, bl); + decode(data_complete, bl); + decode(data_recovered_to, bl); + decode(omap_recovered_to, bl); + decode(omap_complete, bl); + DECODE_FINISH(bl); +} + +ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog) +{ + return prog.print(out); +} + +void ObjectRecoveryProgress::generate_test_instances( + list<ObjectRecoveryProgress*>& o) +{ + o.push_back(new ObjectRecoveryProgress); + o.back()->first = false; + o.back()->data_complete = true; + o.back()->omap_complete = true; + o.back()->data_recovered_to = 100; + + o.push_back(new ObjectRecoveryProgress); + o.back()->first = true; + 
o.back()->data_complete = false; + o.back()->omap_complete = false; + o.back()->data_recovered_to = 0; +} + +ostream &ObjectRecoveryProgress::print(ostream &out) const +{ + return out << "ObjectRecoveryProgress(" + << ( first ? "" : "!" ) << "first, " + << "data_recovered_to:" << data_recovered_to + << ", data_complete:" << ( data_complete ? "true" : "false" ) + << ", omap_recovered_to:" << omap_recovered_to + << ", omap_complete:" << ( omap_complete ? "true" : "false" ) + << ", error:" << ( error ? "true" : "false" ) + << ")"; +} + +void ObjectRecoveryProgress::dump(Formatter *f) const +{ + f->dump_int("first?", first); + f->dump_int("data_complete?", data_complete); + f->dump_unsigned("data_recovered_to", data_recovered_to); + f->dump_int("omap_complete?", omap_complete); + f->dump_string("omap_recovered_to", omap_recovered_to); +} + +void ObjectRecoveryInfo::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(2, 1, bl); + encode(soid, bl); + encode(version, bl); + encode(size, bl); + encode(oi, bl, features); + encode(ss, bl); + encode(copy_subset, bl); + encode(clone_subset, bl); + ENCODE_FINISH(bl); +} + +void ObjectRecoveryInfo::decode(bufferlist::const_iterator &bl, + int64_t pool) +{ + DECODE_START(2, bl); + decode(soid, bl); + decode(version, bl); + decode(size, bl); + decode(oi, bl); + decode(ss, bl); + decode(copy_subset, bl); + decode(clone_subset, bl); + DECODE_FINISH(bl); + + if (struct_v < 2) { + if (!soid.is_max() && soid.pool == -1) + soid.pool = pool; + map<hobject_t, interval_set<uint64_t>> tmp; + tmp.swap(clone_subset); + for (map<hobject_t, interval_set<uint64_t>>::iterator i = tmp.begin(); + i != tmp.end(); + ++i) { + hobject_t first(i->first); + if (!first.is_max() && first.pool == -1) + first.pool = pool; + clone_subset[first].swap(i->second); + } + } +} + +void ObjectRecoveryInfo::generate_test_instances( + list<ObjectRecoveryInfo*>& o) +{ + o.push_back(new ObjectRecoveryInfo); + o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP)); + o.back()->version = eversion_t(0,0); + o.back()->size = 100; +} + + +void ObjectRecoveryInfo::dump(Formatter *f) const +{ + f->dump_stream("object") << soid; + f->dump_stream("at_version") << version; + f->dump_stream("size") << size; + { + f->open_object_section("object_info"); + oi.dump(f); + f->close_section(); + } + { + f->open_object_section("snapset"); + ss.dump(f); + f->close_section(); + } + f->dump_stream("copy_subset") << copy_subset; + f->dump_stream("clone_subset") << clone_subset; +} + +ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf) +{ + return inf.print(out); +} + +ostream &ObjectRecoveryInfo::print(ostream &out) const +{ + return out << "ObjectRecoveryInfo(" + << soid << "@" << version + << ", size: " << size + << ", copy_subset: " << copy_subset + << ", clone_subset: " << clone_subset + << ", snapset: " << ss + << ")"; +} + +// -- PushReplyOp -- +void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o) +{ + o.push_back(new PushReplyOp); + o.push_back(new PushReplyOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.push_back(new PushReplyOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); +} + +void PushReplyOp::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + ENCODE_FINISH(bl); +} + +void PushReplyOp::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + DECODE_FINISH(bl); +} + +void PushReplyOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; +} + +ostream 
&PushReplyOp::print(ostream &out) const +{ + return out + << "PushReplyOp(" << soid + << ")"; +} + +ostream& operator<<(ostream& out, const PushReplyOp &op) +{ + return op.print(out); +} + +uint64_t PushReplyOp::cost(CephContext *cct) const +{ + + return cct->_conf->osd_push_per_object_cost + + cct->_conf->osd_recovery_max_chunk; +} + +// -- PullOp -- +void PullOp::generate_test_instances(list<PullOp*> &o) +{ + o.push_back(new PullOp); + o.push_back(new PullOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.back()->recovery_info.version = eversion_t(3, 10); + o.push_back(new PullOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); + o.back()->recovery_info.version = eversion_t(0, 0); +} + +void PullOp::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + encode(recovery_info, bl, features); + encode(recovery_progress, bl); + ENCODE_FINISH(bl); +} + +void PullOp::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + decode(recovery_info, bl); + decode(recovery_progress, bl); + DECODE_FINISH(bl); +} + +void PullOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + { + f->open_object_section("recovery_progress"); + recovery_progress.dump(f); + f->close_section(); + } +} + +ostream &PullOp::print(ostream &out) const +{ + return out + << "PullOp(" << soid + << ", recovery_info: " << recovery_info + << ", recovery_progress: " << recovery_progress + << ")"; +} + +ostream& operator<<(ostream& out, const PullOp &op) +{ + return op.print(out); +} + +uint64_t PullOp::cost(CephContext *cct) const +{ + return cct->_conf->osd_push_per_object_cost + + cct->_conf->osd_recovery_max_chunk; +} + +// -- PushOp -- +void PushOp::generate_test_instances(list<PushOp*> &o) +{ + o.push_back(new PushOp); + o.push_back(new PushOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.back()->version = eversion_t(3, 10); + o.push_back(new PushOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); + o.back()->version = eversion_t(0, 0); +} + +void PushOp::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + encode(version, bl); + encode(data, bl); + encode(data_included, bl); + encode(omap_header, bl); + encode(omap_entries, bl); + encode(attrset, bl); + encode(recovery_info, bl, features); + encode(after_progress, bl); + encode(before_progress, bl); + ENCODE_FINISH(bl); +} + +void PushOp::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + decode(version, bl); + decode(data, bl); + decode(data_included, bl); + decode(omap_header, bl); + decode(omap_entries, bl); + decode(attrset, bl); + decode(recovery_info, bl); + decode(after_progress, bl); + decode(before_progress, bl); + DECODE_FINISH(bl); +} + +void PushOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; + f->dump_stream("version") << version; + f->dump_int("data_len", data.length()); + f->dump_stream("data_included") << data_included; + f->dump_int("omap_header_len", omap_header.length()); + f->dump_int("omap_entries_len", omap_entries.size()); + f->dump_int("attrset_len", attrset.size()); + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + { + f->open_object_section("after_progress"); + after_progress.dump(f); + f->close_section(); + } + { + f->open_object_section("before_progress"); + 
before_progress.dump(f); + f->close_section(); + } +} + +ostream &PushOp::print(ostream &out) const +{ + return out + << "PushOp(" << soid + << ", version: " << version + << ", data_included: " << data_included + << ", data_size: " << data.length() + << ", omap_header_size: " << omap_header.length() + << ", omap_entries_size: " << omap_entries.size() + << ", attrset_size: " << attrset.size() + << ", recovery_info: " << recovery_info + << ", after_progress: " << after_progress + << ", before_progress: " << before_progress + << ")"; +} + +ostream& operator<<(ostream& out, const PushOp &op) +{ + return op.print(out); +} + +uint64_t PushOp::cost(CephContext *cct) const +{ + uint64_t cost = data_included.size(); + for (map<string, bufferlist>::const_iterator i = + omap_entries.begin(); + i != omap_entries.end(); + ++i) { + cost += i->second.length(); + } + cost += cct->_conf->osd_push_per_object_cost; + return cost; +} + +// -- ScrubMap -- + +void ScrubMap::merge_incr(const ScrubMap &l) +{ + ceph_assert(valid_through == l.incr_since); + valid_through = l.valid_through; + + for (map<hobject_t,object>::const_iterator p = l.objects.begin(); + p != l.objects.end(); + ++p){ + if (p->second.negative) { + map<hobject_t,object>::iterator q = objects.find(p->first); + if (q != objects.end()) { + objects.erase(q); + } + } else { + objects[p->first] = p->second; + } + } +} + +void ScrubMap::encode(bufferlist& bl) const +{ + ENCODE_START(3, 2, bl); + encode(objects, bl); + encode((__u32)0, bl); // used to be attrs; now deprecated + bufferlist old_logbl; // not used + encode(old_logbl, bl); + encode(valid_through, bl); + encode(incr_since, bl); + ENCODE_FINISH(bl); +} + +void ScrubMap::decode(bufferlist::const_iterator& bl, int64_t pool) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(objects, bl); + { + map<string,string> attrs; // deprecated + decode(attrs, bl); + } + bufferlist old_logbl; // not used + decode(old_logbl, bl); + decode(valid_through, bl); + decode(incr_since, bl); + DECODE_FINISH(bl); + + // handle hobject_t upgrade + if (struct_v < 3) { + map<hobject_t, object> tmp; + tmp.swap(objects); + for (map<hobject_t, object>::iterator i = tmp.begin(); + i != tmp.end(); + ++i) { + hobject_t first(i->first); + if (!first.is_max() && first.pool == -1) + first.pool = pool; + objects[first] = i->second; + } + } +} + +void ScrubMap::dump(Formatter *f) const +{ + f->dump_stream("valid_through") << valid_through; + f->dump_stream("incremental_since") << incr_since; + f->open_array_section("objects"); + for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) { + f->open_object_section("object"); + f->dump_string("name", p->first.oid.name); + f->dump_unsigned("hash", p->first.get_hash()); + f->dump_string("key", p->first.get_key()); + f->dump_int("snapid", p->first.snap); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void ScrubMap::generate_test_instances(list<ScrubMap*>& o) +{ + o.push_back(new ScrubMap); + o.push_back(new ScrubMap); + o.back()->valid_through = eversion_t(1, 2); + o.back()->incr_since = eversion_t(3, 4); + list<object*> obj; + object::generate_test_instances(obj); + o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back(); + obj.pop_back(); + o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back(); +} + +// -- ScrubMap::object -- + +void ScrubMap::object::encode(bufferlist& bl) const +{ + bool compat_read_error = read_error || ec_hash_mismatch || 
ec_size_mismatch; + ENCODE_START(10, 7, bl); + encode(size, bl); + encode(negative, bl); + encode(attrs, bl); + encode(digest, bl); + encode(digest_present, bl); + encode((uint32_t)0, bl); // obsolete nlinks + encode((uint32_t)0, bl); // snapcolls + encode(omap_digest, bl); + encode(omap_digest_present, bl); + encode(compat_read_error, bl); + encode(stat_error, bl); + encode(read_error, bl); + encode(ec_hash_mismatch, bl); + encode(ec_size_mismatch, bl); + encode(large_omap_object_found, bl); + encode(large_omap_object_key_count, bl); + encode(large_omap_object_value_size, bl); + encode(object_omap_bytes, bl); + encode(object_omap_keys, bl); + ENCODE_FINISH(bl); +} + +void ScrubMap::object::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(10, bl); + decode(size, bl); + bool tmp, compat_read_error = false; + decode(tmp, bl); + negative = tmp; + decode(attrs, bl); + decode(digest, bl); + decode(tmp, bl); + digest_present = tmp; + { + uint32_t nlinks; + decode(nlinks, bl); + set<snapid_t> snapcolls; + decode(snapcolls, bl); + } + decode(omap_digest, bl); + decode(tmp, bl); + omap_digest_present = tmp; + decode(compat_read_error, bl); + decode(tmp, bl); + stat_error = tmp; + if (struct_v >= 8) { + decode(tmp, bl); + read_error = tmp; + decode(tmp, bl); + ec_hash_mismatch = tmp; + decode(tmp, bl); + ec_size_mismatch = tmp; + } + // If older encoder found a read_error, set read_error + if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch) + read_error = true; + if (struct_v >= 9) { + decode(tmp, bl); + large_omap_object_found = tmp; + decode(large_omap_object_key_count, bl); + decode(large_omap_object_value_size, bl); + } + if (struct_v >= 10) { + decode(object_omap_bytes, bl); + decode(object_omap_keys, bl); + } + DECODE_FINISH(bl); +} + +void ScrubMap::object::dump(Formatter *f) const +{ + f->dump_int("size", size); + f->dump_int("negative", negative); + f->open_array_section("attrs"); + for (map<string,bufferptr>::const_iterator p = attrs.begin(); p != attrs.end(); ++p) { + f->open_object_section("attr"); + f->dump_string("name", p->first); + f->dump_int("length", p->second.length()); + f->close_section(); + } + f->close_section(); +} + +void ScrubMap::object::generate_test_instances(list<object*>& o) +{ + o.push_back(new object); + o.push_back(new object); + o.back()->negative = true; + o.push_back(new object); + o.back()->size = 123; + o.back()->attrs["foo"] = buffer::copy("foo", 3); + o.back()->attrs["bar"] = buffer::copy("barval", 6); +} + +// -- OSDOp -- + +ostream& operator<<(ostream& out, const OSDOp& op) +{ + out << ceph_osd_op_name(op.op.op); + if (ceph_osd_op_type_data(op.op.op)) { + // data extent + switch (op.op.op) { + case CEPH_OSD_OP_ASSERT_VER: + out << " v" << op.op.assert_ver.ver; + break; + case CEPH_OSD_OP_TRUNCATE: + out << " " << op.op.extent.offset; + break; + case CEPH_OSD_OP_MASKTRUNC: + case CEPH_OSD_OP_TRIMTRUNC: + out << " " << op.op.extent.truncate_seq << "@" + << (int64_t)op.op.extent.truncate_size; + break; + case CEPH_OSD_OP_ROLLBACK: + out << " " << snapid_t(op.op.snap.snapid); + break; + case CEPH_OSD_OP_WATCH: + out << " " << ceph_osd_watch_op_name(op.op.watch.op) + << " cookie " << op.op.watch.cookie; + if (op.op.watch.gen) + out << " gen " << op.op.watch.gen; + break; + case CEPH_OSD_OP_NOTIFY: + out << " cookie " << op.op.notify.cookie; + break; + case CEPH_OSD_OP_COPY_GET: + out << " max " << op.op.copy_get.max; + break; + case CEPH_OSD_OP_COPY_FROM: + out << " ver " << op.op.copy_from.src_version; + break; + case 
CEPH_OSD_OP_SETALLOCHINT: + out << " object_size " << op.op.alloc_hint.expected_object_size + << " write_size " << op.op.alloc_hint.expected_write_size; + break; + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_MAPEXT: + case CEPH_OSD_OP_CMPEXT: + out << " " << op.op.extent.offset << "~" << op.op.extent.length; + if (op.op.extent.truncate_seq) + out << " [" << op.op.extent.truncate_seq << "@" + << (int64_t)op.op.extent.truncate_size << "]"; + if (op.op.flags) + out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]"; + default: + // don't show any arg info + break; + } + } else if (ceph_osd_op_type_attr(op.op.op)) { + // xattr name + if (op.op.xattr.name_len && op.indata.length()) { + out << " "; + op.indata.write(0, op.op.xattr.name_len, out); + } + if (op.op.xattr.value_len) + out << " (" << op.op.xattr.value_len << ")"; + if (op.op.op == CEPH_OSD_OP_CMPXATTR) + out << " op " << (int)op.op.xattr.cmp_op + << " mode " << (int)op.op.xattr.cmp_mode; + } else if (ceph_osd_op_type_exec(op.op.op)) { + // class.method + if (op.op.cls.class_len && op.indata.length()) { + out << " "; + op.indata.write(0, op.op.cls.class_len, out); + out << "."; + op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out); + } + } else if (ceph_osd_op_type_pg(op.op.op)) { + switch (op.op.op) { + case CEPH_OSD_OP_PGLS: + case CEPH_OSD_OP_PGLS_FILTER: + case CEPH_OSD_OP_PGNLS: + case CEPH_OSD_OP_PGNLS_FILTER: + out << " start_epoch " << op.op.pgls.start_epoch; + break; + case CEPH_OSD_OP_PG_HITSET_LS: + break; + case CEPH_OSD_OP_PG_HITSET_GET: + out << " " << utime_t(op.op.hit_set_get.stamp); + break; + case CEPH_OSD_OP_SCRUBLS: + break; + } + } + return out; +} + + +void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in) +{ + bufferlist::iterator datap = in.begin(); + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].op.payload_len) { + datap.copy(ops[i].op.payload_len, ops[i].indata); + } + } +} + +void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out) +{ + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].indata.length()) { + ops[i].op.payload_len = ops[i].indata.length(); + out.append(ops[i].indata); + } + } +} + +void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in) +{ + bufferlist::iterator datap = in.begin(); + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].op.payload_len) { + datap.copy(ops[i].op.payload_len, ops[i].outdata); + } + } +} + +void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out) +{ + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].outdata.length()) { + ops[i].op.payload_len = ops[i].outdata.length(); + out.append(ops[i].outdata); + } + } +} + +void OSDOp::clear_data(vector<OSDOp>& ops) +{ + for (unsigned i = 0; i < ops.size(); i++) { + OSDOp& op = ops[i]; + op.outdata.clear(); + if (ceph_osd_op_type_attr(op.op.op) && + op.op.xattr.name_len && + op.indata.length() >= op.op.xattr.name_len) { + bufferptr bp(op.op.xattr.name_len); + bufferlist bl; + bl.append(bp); + bl.copy_in(0, op.op.xattr.name_len, op.indata); + op.indata.claim(bl); + } else if (ceph_osd_op_type_exec(op.op.op) && + op.op.cls.class_len && + op.indata.length() > + (op.op.cls.class_len + op.op.cls.method_len)) { + __u8 len = op.op.cls.class_len + op.op.cls.method_len; + bufferptr bp(len); + bufferlist bl; + bl.append(bp); + bl.copy_in(0, len, 
op.indata); + op.indata.claim(bl); + } else { + op.indata.clear(); + } + } +} diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h new file mode 100644 index 00000000..6debfe0e --- /dev/null +++ b/src/osd/osd_types.h @@ -0,0 +1,5913 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_TYPES_H +#define CEPH_OSD_TYPES_H + +#include <sstream> +#include <stdio.h> +#include <memory> +#include <string_view> +#include <boost/scoped_ptr.hpp> +#include <boost/optional/optional_io.hpp> +#include <boost/variant.hpp> + +#include "include/rados/rados_types.hpp" +#include "include/mempool.h" + +#include "msg/msg_types.h" +#include "include/types.h" +#include "include/utime.h" +#include "include/CompatSet.h" +#include "common/histogram.h" +#include "include/interval_set.h" +#include "include/inline_memory.h" +#include "common/Formatter.h" +#include "common/bloom_filter.hpp" +#include "common/hobject.h" +#include "common/snap_types.h" +#include "HitSet.h" +#include "Watch.h" +#include "include/cmp.h" +#include "librados/ListObjectImpl.h" +#include "compressor/Compressor.h" +#include <atomic> + +#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026" + +#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)") +#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object") +#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator") +#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean") +#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories") +#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool") +#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog") +#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper") +#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects") +#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints") +#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object") +#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set") +#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr") +#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set") + + +/// pool priority range set by user +#define OSD_POOL_PRIORITY_MAX 10 +#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX + +/// min recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_MIN 0 + +/// base backfill priority for MBackfillReserve +#define OSD_BACKFILL_PRIORITY_BASE 100 + +/// base backfill priority for MBackfillReserve (degraded PG) +#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140 + +/// base recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_BASE 180 + +/// 
base backfill priority for MBackfillReserve (inactive PG) +#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220 + +/// base recovery priority for MRecoveryReserve (inactive PG) +#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220 + +/// max manually/automatically set recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_MAX 253 + +/// backfill priority for MBackfillReserve, when forced manually +#define OSD_BACKFILL_PRIORITY_FORCED 254 + +/// recovery priority for MRecoveryReserve, when forced manually +#define OSD_RECOVERY_PRIORITY_FORCED 255 + +/// priority for pg deletion when osd is not fullish +#define OSD_DELETE_PRIORITY_NORMAL 179 + +/// priority for pg deletion when osd is approaching full +#define OSD_DELETE_PRIORITY_FULLISH 219 + +/// priority when more full +#define OSD_DELETE_PRIORITY_FULL 255 + +static std::map<int, int> max_prio_map = { + {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1}, + {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1}, + {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1}, + {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}, + {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX} +}; + +typedef hobject_t collection_list_handle_t; + +/// convert a single CPEH_OSD_FLAG_* to a string +const char *ceph_osd_flag_name(unsigned flag); +/// convert a single CEPH_OSD_OF_FLAG_* to a string +const char *ceph_osd_op_flag_name(unsigned flag); + +/// convert CEPH_OSD_FLAG_* op flags to a string +string ceph_osd_flag_string(unsigned flags); +/// conver CEPH_OSD_OP_FLAG_* op flags to a string +string ceph_osd_op_flag_string(unsigned flags); +/// conver CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string +string ceph_osd_alloc_hint_flag_string(unsigned flags); + +typedef map<string,string> osd_alert_list_t; +/// map osd id -> alert_list_t +typedef map<int, osd_alert_list_t> osd_alerts_t; +void dump(Formatter* f, const osd_alerts_t& alerts); + +/** + * osd request identifier + * + * caller name + incarnation# + tid to unique identify this request. + */ +struct osd_reqid_t { + entity_name_t name; // who + ceph_tid_t tid; + int32_t inc; // incarnation + + osd_reqid_t() + : tid(0), inc(0) + {} + osd_reqid_t(const osd_reqid_t& other) + : name(other.name), tid(other.tid), inc(other.inc) + {} + osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t) + : name(a), tid(t), inc(i) + {} + + DENC(osd_reqid_t, v, p) { + DENC_START(2, 2, p); + denc(v.name, p); + denc(v.tid, p); + denc(v.inc, p); + DENC_FINISH(p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<osd_reqid_t*>& o); +}; +WRITE_CLASS_DENC(osd_reqid_t) + + + +struct pg_shard_t { + static const int32_t NO_OSD = 0x7fffffff; + int32_t osd; + shard_id_t shard; + pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {} + explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {} + pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {} + bool is_undefined() const { + return osd == -1; + } + string get_osd() const { return (osd == NO_OSD ? 
"NONE" : to_string(osd)); } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const { + f->dump_unsigned("osd", osd); + if (shard != shard_id_t::NO_SHARD) { + f->dump_unsigned("shard", shard); + } + } +}; +WRITE_CLASS_ENCODER(pg_shard_t) +WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard) +WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard) +ostream &operator<<(ostream &lhs, const pg_shard_t &rhs); + +class IsPGRecoverablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const set<pg_shard_t> &have) const = 0; + virtual ~IsPGRecoverablePredicate() {} +}; + +class IsPGReadablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const set<pg_shard_t> &have) const = 0; + virtual ~IsPGReadablePredicate() {} +}; + +inline ostream& operator<<(ostream& out, const osd_reqid_t& r) { + return out << r.name << "." << r.inc << ":" << r.tid; +} + +inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); +} +inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); +} +inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid < r.tid); +} +inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); +} +inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); } +inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); } + +namespace std { + template<> struct hash<osd_reqid_t> { + size_t operator()(const osd_reqid_t &r) const { + static hash<uint64_t> H; + return H(r.name.num() ^ r.tid ^ r.inc); + } + }; +} // namespace std + + +// ----- + +// a locator constrains the placement of an object. mainly, which pool +// does it go in. 
+struct object_locator_t { + // You specify either the hash or the key -- not both + int64_t pool; ///< pool id + string key; ///< key string (if non-empty) + string nspace; ///< namespace + int64_t hash; ///< hash position (if >= 0) + + explicit object_locator_t() + : pool(-1), hash(-1) {} + explicit object_locator_t(int64_t po) + : pool(po), hash(-1) {} + explicit object_locator_t(int64_t po, int64_t ps) + : pool(po), hash(ps) {} + explicit object_locator_t(int64_t po, string ns) + : pool(po), nspace(ns), hash(-1) {} + explicit object_locator_t(int64_t po, string ns, int64_t ps) + : pool(po), nspace(ns), hash(ps) {} + explicit object_locator_t(int64_t po, string ns, string s) + : pool(po), key(s), nspace(ns), hash(-1) {} + explicit object_locator_t(const hobject_t& soid) + : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {} + + int64_t get_pool() const { + return pool; + } + + void clear() { + pool = -1; + key = ""; + nspace = ""; + hash = -1; + } + + bool empty() const { + return pool == -1; + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<object_locator_t*>& o); +}; +WRITE_CLASS_ENCODER(object_locator_t) + +inline bool operator==(const object_locator_t& l, const object_locator_t& r) { + return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash; +} +inline bool operator!=(const object_locator_t& l, const object_locator_t& r) { + return !(l == r); +} + +inline ostream& operator<<(ostream& out, const object_locator_t& loc) +{ + out << "@" << loc.pool; + if (loc.nspace.length()) + out << ";" << loc.nspace; + if (loc.key.length()) + out << ":" << loc.key; + return out; +} + +struct request_redirect_t { +private: + object_locator_t redirect_locator; ///< this is authoritative + string redirect_object; ///< If non-empty, the request goes to this object name + + friend ostream& operator<<(ostream& out, const request_redirect_t& redir); +public: + + request_redirect_t() {} + explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) : + redirect_locator(orig) { redirect_locator.pool = rpool; } + explicit request_redirect_t(const object_locator_t& rloc) : + redirect_locator(rloc) {} + explicit request_redirect_t(const object_locator_t& orig, + const string& robj) : + redirect_locator(orig), redirect_object(robj) {} + + bool empty() const { return redirect_locator.empty() && + redirect_object.empty(); } + + void combine_with_locator(object_locator_t& orig, string& obj) const { + orig = redirect_locator; + if (!redirect_object.empty()) + obj = redirect_object; + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<request_redirect_t*>& o); +}; +WRITE_CLASS_ENCODER(request_redirect_t) + +inline ostream& operator<<(ostream& out, const request_redirect_t& redir) { + out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}"; + return out; +} + +// Internal OSD op flags - set by the OSD based on the op types +enum { + CEPH_OSD_RMW_FLAG_READ = (1 << 1), + CEPH_OSD_RMW_FLAG_WRITE = (1 << 2), + CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3), + CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4), + CEPH_OSD_RMW_FLAG_PGOP = (1 << 5), + CEPH_OSD_RMW_FLAG_CACHE = (1 << 6), + CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7), + CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8), + CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 
9), + CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10), +}; + + +// pg stuff + +#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))) + +// placement seed (a hash value) +typedef uint32_t ps_t; + +// old (v1) pg_t encoding (wrap old struct ceph_pg) +struct old_pg_t { + ceph_pg v; + void encode(bufferlist& bl) const { + ::encode_raw(v, bl); + } + void decode(bufferlist::const_iterator& bl) { + ::decode_raw(v, bl); + } +}; +WRITE_CLASS_ENCODER(old_pg_t) + +// placement group id +struct pg_t { + uint64_t m_pool; + uint32_t m_seed; + + pg_t() : m_pool(0), m_seed(0) {} + pg_t(ps_t seed, uint64_t pool) : + m_pool(pool), m_seed(seed) {} + // cppcheck-suppress noExplicitConstructor + pg_t(const ceph_pg& cpg) : + m_pool(cpg.pool), m_seed(cpg.ps) {} + + // cppcheck-suppress noExplicitConstructor + pg_t(const old_pg_t& opg) { + *this = opg.v; + } + + old_pg_t get_old_pg() const { + old_pg_t o; + ceph_assert(m_pool < 0xffffffffull); + o.v.pool = m_pool; + o.v.ps = m_seed; + o.v.preferred = (__s16)-1; + return o; + } + + ps_t ps() const { + return m_seed; + } + int64_t pool() const { + return m_pool; + } + + static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0' + char *calc_name(char *buf, const char *suffix_backwords) const; + + void set_ps(ps_t p) { + m_seed = p; + } + void set_pool(uint64_t p) { + m_pool = p; + } + + pg_t get_parent() const; + pg_t get_ancestor(unsigned old_pg_num) const; + + int print(char *o, int maxlen) const; + bool parse(const char *s); + + bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const; + + bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const; + bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const { + return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr); + } + + /** + * Returns b such that for all object o: + * ~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this + */ + unsigned get_split_bits(unsigned pg_num) const; + + bool contains(int bits, const ghobject_t& oid) { + return + (int64_t)m_pool == oid.hobj.get_logical_pool() && + oid.match(bits, ps()); + } + bool contains(int bits, const hobject_t& oid) { + return + (int64_t)m_pool == oid.get_logical_pool() && + oid.match(bits, ps()); + } + + hobject_t get_hobj_start() const; + hobject_t get_hobj_end(unsigned pg_num) const; + + void encode(bufferlist& bl) const { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(m_pool, bl); + encode(m_seed, bl); + encode((int32_t)-1, bl); // was preferred + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + __u8 v; + decode(v, bl); + decode(m_pool, bl); + decode(m_seed, bl); + bl.advance(sizeof(int32_t)); // was preferred + } + void decode_old(bufferlist::const_iterator& bl) { + using ceph::decode; + old_pg_t opg; + decode(opg, bl); + *this = opg; + } + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_t) + +inline bool operator<(const pg_t& l, const pg_t& r) { + return l.pool() < r.pool() || + (l.pool() == r.pool() && (l.ps() < r.ps())); +} +inline bool operator<=(const pg_t& l, const pg_t& r) { + return l.pool() < r.pool() || + (l.pool() == r.pool() && (l.ps() <= r.ps())); +} +inline bool operator==(const pg_t& l, const pg_t& r) { + return l.pool() == r.pool() && + l.ps() == r.ps(); +} +inline bool operator!=(const pg_t& l, const pg_t& r) { + return l.pool() != 
r.pool() || + l.ps() != r.ps(); +} +inline bool operator>(const pg_t& l, const pg_t& r) { + return l.pool() > r.pool() || + (l.pool() == r.pool() && (l.ps() > r.ps())); +} +inline bool operator>=(const pg_t& l, const pg_t& r) { + return l.pool() > r.pool() || + (l.pool() == r.pool() && (l.ps() >= r.ps())); +} + +ostream& operator<<(ostream& out, const pg_t &pg); + +namespace std { + template<> struct hash< pg_t > + { + size_t operator()( const pg_t& x ) const + { + static hash<uint32_t> H; + // xor (s32)-1 in there to preserve original m_preferred result (paranoia!) + return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1)); + } + }; +} // namespace std + +struct spg_t { + pg_t pgid; + shard_id_t shard; + spg_t() : shard(shard_id_t::NO_SHARD) {} + spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {} + explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {} + unsigned get_split_bits(unsigned pg_num) const { + return pgid.get_split_bits(pg_num); + } + spg_t get_parent() const { + return spg_t(pgid.get_parent(), shard); + } + ps_t ps() const { + return pgid.ps(); + } + uint64_t pool() const { + return pgid.pool(); + } + + static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255"); + char *calc_name(char *buf, const char *suffix_backwords) const; + + bool parse(const char *s); + bool parse(const std::string& s) { + return parse(s.c_str()); + } + + spg_t get_ancestor(unsigned old_pg_num) const { + return spg_t(pgid.get_ancestor(old_pg_num), shard); + } + + bool is_split(unsigned old_pg_num, unsigned new_pg_num, + set<spg_t> *pchildren) const { + set<pg_t> _children; + set<pg_t> *children = pchildren ? &_children : NULL; + bool is_split = pgid.is_split(old_pg_num, new_pg_num, children); + if (pchildren && is_split) { + for (set<pg_t>::iterator i = _children.begin(); + i != _children.end(); + ++i) { + pchildren->insert(spg_t(*i, shard)); + } + } + return is_split; + } + bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const { + return pgid.is_merge_target(old_pg_num, new_pg_num); + } + bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, + spg_t *parent) const { + spg_t out = *this; + bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid); + if (r && parent) { + *parent = out; + } + return r; + } + + bool is_no_shard() const { + return shard == shard_id_t::NO_SHARD; + } + + ghobject_t make_pgmeta_oid() const { + return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard); + } + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + encode(pgid, bl); + encode(shard, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(pgid, bl); + decode(shard, bl); + DECODE_FINISH(bl); + } + + ghobject_t make_temp_ghobject(const string& name) const { + return ghobject_t( + hobject_t(object_t(name), "", CEPH_NOSNAP, + pgid.ps(), + hobject_t::get_temp_pool(pgid.pool()), + ""), + ghobject_t::NO_GEN, + shard); + } + + unsigned hash_to_shard(unsigned num_shards) const { + return ps() % num_shards; + } +}; +WRITE_CLASS_ENCODER(spg_t) +WRITE_EQ_OPERATORS_2(spg_t, pgid, shard) +WRITE_CMP_OPERATORS_2(spg_t, pgid, shard) + +namespace std { + template<> struct hash< spg_t > + { + size_t operator()( const spg_t& x ) const + { + static hash<uint32_t> H; + return H(hash<pg_t>()(x.pgid) ^ x.shard); + } + }; +} // namespace std + +ostream& operator<<(ostream& out, const spg_t &pg); + +// ---------------------- + +class coll_t { + enum 
type_t { + TYPE_META = 0, + TYPE_LEGACY_TEMP = 1, /* no longer used */ + TYPE_PG = 2, + TYPE_PG_TEMP = 3, + }; + type_t type; + spg_t pgid; + uint64_t removal_seq; // note: deprecated, not encoded + + char _str_buff[spg_t::calc_name_buf_size]; + char *_str; + + void calc_str(); + + coll_t(type_t t, spg_t p, uint64_t r) + : type(t), pgid(p), removal_seq(r) { + calc_str(); + } + +public: + coll_t() : type(TYPE_META), removal_seq(0) + { + calc_str(); + } + + coll_t(const coll_t& other) + : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) { + calc_str(); + } + + explicit coll_t(spg_t pgid) + : type(TYPE_PG), pgid(pgid), removal_seq(0) + { + calc_str(); + } + + coll_t& operator=(const coll_t& rhs) + { + this->type = rhs.type; + this->pgid = rhs.pgid; + this->removal_seq = rhs.removal_seq; + this->calc_str(); + return *this; + } + + // named constructors + static coll_t meta() { + return coll_t(); + } + static coll_t pg(spg_t p) { + return coll_t(p); + } + + const std::string to_str() const { + return string(_str); + } + const char *c_str() const { + return _str; + } + + bool parse(const std::string& s); + + int operator<(const coll_t &rhs) const { + return type < rhs.type || + (type == rhs.type && pgid < rhs.pgid); + } + + bool is_meta() const { + return type == TYPE_META; + } + bool is_pg_prefix(spg_t *pgid_) const { + if (type == TYPE_PG || type == TYPE_PG_TEMP) { + *pgid_ = pgid; + return true; + } + return false; + } + bool is_pg() const { + return type == TYPE_PG; + } + bool is_pg(spg_t *pgid_) const { + if (type == TYPE_PG) { + *pgid_ = pgid; + return true; + } + return false; + } + bool is_temp() const { + return type == TYPE_PG_TEMP; + } + bool is_temp(spg_t *pgid_) const { + if (type == TYPE_PG_TEMP) { + *pgid_ = pgid; + return true; + } + return false; + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + size_t encoded_size() const; + + inline bool operator==(const coll_t& rhs) const { + // only compare type if meta + if (type != rhs.type) + return false; + if (type == TYPE_META) + return true; + return type == rhs.type && pgid == rhs.pgid; + } + inline bool operator!=(const coll_t& rhs) const { + return !(*this == rhs); + } + + // get a TEMP collection that corresponds to the current collection, + // which we presume is a pg collection. + coll_t get_temp() const { + ceph_assert(type == TYPE_PG); + return coll_t(TYPE_PG_TEMP, pgid, 0); + } + + ghobject_t get_min_hobj() const { + ghobject_t o; + switch (type) { + case TYPE_PG: + o.hobj.pool = pgid.pool(); + o.set_shard(pgid.shard); + break; + case TYPE_META: + o.hobj.pool = -1; + break; + default: + break; + } + return o; + } + + unsigned hash_to_shard(unsigned num_shards) const { + if (type == TYPE_PG) + return pgid.hash_to_shard(num_shards); + return 0; // whatever. 
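+    // i.e. non-PG collections (such as the meta collection) are all
+    // assigned to shard 0.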
+ } + + void dump(Formatter *f) const; + static void generate_test_instances(list<coll_t*>& o); +}; + +WRITE_CLASS_ENCODER(coll_t) + +inline ostream& operator<<(ostream& out, const coll_t& c) { + out << c.to_str(); + return out; +} + +namespace std { + template<> struct hash<coll_t> { + size_t operator()(const coll_t &c) const { + size_t h = 0; + string str(c.to_str()); + std::string::const_iterator end(str.end()); + for (std::string::const_iterator s = str.begin(); s != end; ++s) { + h += *s; + h += (h << 10); + h ^= (h >> 6); + } + h += (h << 3); + h ^= (h >> 11); + h += (h << 15); + return h; + } + }; +} // namespace std + +inline ostream& operator<<(ostream& out, const ceph_object_layout &ol) +{ + out << pg_t(ol.ol_pgid); + int su = ol.ol_stripe_unit; + if (su) + out << ".su=" << su; + return out; +} + + + +// compound rados version type +/* WARNING: If add member in eversion_t, please make sure the encode/decode function + * work well. For little-endian machine, we should make sure there is no padding + * in 32-bit machine and 64-bit machine. + */ +class eversion_t { +public: + version_t version; + epoch_t epoch; + __u32 __pad; + eversion_t() : version(0), epoch(0), __pad(0) {} + eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {} + + // cppcheck-suppress noExplicitConstructor + eversion_t(const ceph_eversion& ce) : + version(ce.version), + epoch(ce.epoch), + __pad(0) { } + + explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); } + + static const eversion_t& max() { + static const eversion_t max(-1,-1); + return max; + } + + operator ceph_eversion() { + ceph_eversion c; + c.epoch = epoch; + c.version = version; + return c; + } + + string get_key_name() const; + + // key must point to the beginning of a block of 32 chars + inline void get_key_name(char* key) const { + // Below is equivalent of sprintf("%010u.%020llu"); + key[31] = 0; + ritoa<uint64_t, 10, 20>(version, key + 31); + key[10] = '.'; + ritoa<uint32_t, 10, 10>(epoch, key + 10); + } + + void encode(bufferlist &bl) const { +#if defined(CEPH_LITTLE_ENDIAN) + bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t)); +#else + using ceph::encode; + encode(version, bl); + encode(epoch, bl); +#endif + } + void decode(bufferlist::const_iterator &bl) { +#if defined(CEPH_LITTLE_ENDIAN) + bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this); +#else + using ceph::decode; + decode(version, bl); + decode(epoch, bl); +#endif + } + void decode(bufferlist& bl) { + auto p = std::cbegin(bl); + decode(p); + } +}; +WRITE_CLASS_ENCODER(eversion_t) + +inline bool operator==(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) && (l.version == r.version); +} +inline bool operator!=(const eversion_t& l, const eversion_t& r) { + return (l.epoch != r.epoch) || (l.version != r.version); +} +inline bool operator<(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); +} +inline bool operator<=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); +} +inline bool operator>(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); +} +inline bool operator>=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? 
(l.version >= r.version):(l.epoch >= r.epoch); +} +inline ostream& operator<<(ostream& out, const eversion_t& e) { + return out << e.epoch << "'" << e.version; +} + +/** + * objectstore_perf_stat_t + * + * current perf information about the osd + */ +struct objectstore_perf_stat_t { + // cur_op_latency is in ns since double add/sub are not associative + uint64_t os_commit_latency_ns; + uint64_t os_apply_latency_ns; + + objectstore_perf_stat_t() : + os_commit_latency_ns(0), os_apply_latency_ns(0) {} + + bool operator==(const objectstore_perf_stat_t &r) const { + return os_commit_latency_ns == r.os_commit_latency_ns && + os_apply_latency_ns == r.os_apply_latency_ns; + } + + void add(const objectstore_perf_stat_t &o) { + os_commit_latency_ns += o.os_commit_latency_ns; + os_apply_latency_ns += o.os_apply_latency_ns; + } + void sub(const objectstore_perf_stat_t &o) { + os_commit_latency_ns -= o.os_commit_latency_ns; + os_apply_latency_ns -= o.os_apply_latency_ns; + } + void dump(Formatter *f) const; + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t) + +/* + * pg states + */ +#define PG_STATE_CREATING (1ULL << 0) // creating +#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too) +#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas. +#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline +#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound +#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound +#define PG_STATE_PREMERGE (1ULL << 7) // i am prepare to merging +#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing +//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub +#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy +#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be) +#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering +#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub +#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects +#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill +#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed. +#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown. 
+#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH +#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files +#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content +#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full +#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations +#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size +#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active +#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover +#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps +#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps +#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full +#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps +#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other +#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other +#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors + +std::string pg_state_string(uint64_t state); +std::string pg_vector_string(const vector<int32_t> &a); +boost::optional<uint64_t> pg_string_state(const std::string& state); + + +/* + * pool_snap_info_t + * + * attributes for a single pool snapshot. + */ +struct pool_snap_info_t { + snapid_t snapid; + utime_t stamp; + string name; + + void dump(Formatter *f) const; + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + static void generate_test_instances(list<pool_snap_info_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t) + +inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) { + return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')'; +} + + +/* + * pool_opts_t + * + * pool options. 
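+ *
+ * A minimal usage sketch (illustrative only). Values are held in a
+ * boost::variant<std::string,int64_t,double>, so integer options should be
+ * passed as int64_t:
+ *
+ *   pool_opts_t opts;
+ *   opts.set(pool_opts_t::RECOVERY_PRIORITY, static_cast<int64_t>(5));
+ *   int64_t prio = 0;
+ *   if (opts.get(pool_opts_t::RECOVERY_PRIORITY, &prio)) {
+ *     // prio is now 5
+ *   }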
+ */ + +class pool_opts_t { +public: + enum key_t { + SCRUB_MIN_INTERVAL, + SCRUB_MAX_INTERVAL, + DEEP_SCRUB_INTERVAL, + RECOVERY_PRIORITY, + RECOVERY_OP_PRIORITY, + SCRUB_PRIORITY, + COMPRESSION_MODE, + COMPRESSION_ALGORITHM, + COMPRESSION_REQUIRED_RATIO, + COMPRESSION_MAX_BLOB_SIZE, + COMPRESSION_MIN_BLOB_SIZE, + CSUM_TYPE, + CSUM_MAX_BLOCK, + CSUM_MIN_BLOCK, + FINGERPRINT_ALGORITHM, + PG_NUM_MIN, // min pg_num + TARGET_SIZE_BYTES, // total bytes in pool + TARGET_SIZE_RATIO, // fraction of total cluster + PG_AUTOSCALE_BIAS, + }; + + enum type_t { + STR, + INT, + DOUBLE, + }; + + struct opt_desc_t { + key_t key; + type_t type; + + opt_desc_t(key_t k, type_t t) : key(k), type(t) {} + + bool operator==(const opt_desc_t& rhs) const { + return key == rhs.key && type == rhs.type; + } + }; + + typedef boost::variant<std::string,int64_t,double> value_t; + + static bool is_opt_name(const std::string& name); + static opt_desc_t get_opt_desc(const std::string& name); + + pool_opts_t() : opts() {} + + bool is_set(key_t key) const; + + template<typename T> + void set(key_t key, const T &val) { + value_t value = val; + opts[key] = value; + } + + template<typename T> + bool get(key_t key, T *val) const { + opts_t::const_iterator i = opts.find(key); + if (i == opts.end()) { + return false; + } + *val = boost::get<T>(i->second); + return true; + } + + const value_t& get(key_t key) const; + + bool unset(key_t key); + + void dump(const std::string& name, Formatter *f) const; + + void dump(Formatter *f) const; + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + +private: + typedef std::map<key_t, value_t> opts_t; + opts_t opts; + + friend ostream& operator<<(ostream& out, const pool_opts_t& opts); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_opts_t) + +struct pg_merge_meta_t { + pg_t source_pgid; + epoch_t ready_epoch = 0; + epoch_t last_epoch_started = 0; + epoch_t last_epoch_clean = 0; + eversion_t source_version; + eversion_t target_version; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(source_pgid, bl); + encode(ready_epoch, bl); + encode(last_epoch_started, bl); + encode(last_epoch_clean, bl); + encode(source_version, bl); + encode(target_version, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(source_pgid, p); + decode(ready_epoch, p); + decode(last_epoch_started, p); + decode(last_epoch_clean, p); + decode(source_version, p); + decode(target_version, p); + DECODE_FINISH(p); + } + void dump(Formatter *f) const { + f->dump_stream("source_pgid") << source_pgid; + f->dump_unsigned("ready_epoch", ready_epoch); + f->dump_unsigned("last_epoch_started", last_epoch_started); + f->dump_unsigned("last_epoch_clean", last_epoch_clean); + f->dump_stream("source_version") << source_version; + f->dump_stream("target_version") << target_version; + } +}; +WRITE_CLASS_ENCODER(pg_merge_meta_t) + +/* + * pg_pool + */ +struct pg_pool_t { + static const char *APPLICATION_NAME_CEPHFS; + static const char *APPLICATION_NAME_RBD; + static const char *APPLICATION_NAME_RGW; + + enum { + TYPE_REPLICATED = 1, // replication + //TYPE_RAID4 = 2, // raid4 (never implemented) + TYPE_ERASURE = 3, // erasure-coded + }; + static std::string_view get_type_name(int t) { + switch (t) { + case TYPE_REPLICATED: return "replicated"; + //case TYPE_RAID4: return "raid4"; + case TYPE_ERASURE: return "erasure"; + default: return "???"; + } + } + std::string_view get_type_name() const { + return 
get_type_name(type); + } + + enum { + FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding) + FLAG_FULL = 1<<1, // pool is full + FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled + FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay) + FLAG_NODELETE = 1<<4, // pool can't be deleted + FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed + FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed + FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED + FLAG_NOSCRUB = 1<<8, // block periodic scrub + FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub + FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too + FLAG_NEARFULL = 1<<11, // pool is nearfull + FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull + FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps + FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps + FLAG_CREATING = 1<<15, // initial pool PGs are being created + }; + + static const char *get_flag_name(int f) { + switch (f) { + case FLAG_HASHPSPOOL: return "hashpspool"; + case FLAG_FULL: return "full"; + case FLAG_EC_OVERWRITES: return "ec_overwrites"; + case FLAG_INCOMPLETE_CLONES: return "incomplete_clones"; + case FLAG_NODELETE: return "nodelete"; + case FLAG_NOPGCHANGE: return "nopgchange"; + case FLAG_NOSIZECHANGE: return "nosizechange"; + case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed"; + case FLAG_NOSCRUB: return "noscrub"; + case FLAG_NODEEP_SCRUB: return "nodeep-scrub"; + case FLAG_FULL_QUOTA: return "full_quota"; + case FLAG_NEARFULL: return "nearfull"; + case FLAG_BACKFILLFULL: return "backfillfull"; + case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps"; + case FLAG_POOL_SNAPS: return "pool_snaps"; + case FLAG_CREATING: return "creating"; + default: return "???"; + } + } + static string get_flags_string(uint64_t f) { + string s; + for (unsigned n=0; f && n<64; ++n) { + if (f & (1ull << n)) { + if (s.length()) + s += ","; + s += get_flag_name(1ull << n); + } + } + return s; + } + string get_flags_string() const { + return get_flags_string(flags); + } + static uint64_t get_flag_by_name(const string& name) { + if (name == "hashpspool") + return FLAG_HASHPSPOOL; + if (name == "full") + return FLAG_FULL; + if (name == "ec_overwrites") + return FLAG_EC_OVERWRITES; + if (name == "incomplete_clones") + return FLAG_INCOMPLETE_CLONES; + if (name == "nodelete") + return FLAG_NODELETE; + if (name == "nopgchange") + return FLAG_NOPGCHANGE; + if (name == "nosizechange") + return FLAG_NOSIZECHANGE; + if (name == "write_fadvise_dontneed") + return FLAG_WRITE_FADVISE_DONTNEED; + if (name == "noscrub") + return FLAG_NOSCRUB; + if (name == "nodeep-scrub") + return FLAG_NODEEP_SCRUB; + if (name == "full_quota") + return FLAG_FULL_QUOTA; + if (name == "nearfull") + return FLAG_NEARFULL; + if (name == "backfillfull") + return FLAG_BACKFILLFULL; + if (name == "selfmanaged_snaps") + return FLAG_SELFMANAGED_SNAPS; + if (name == "pool_snaps") + return FLAG_POOL_SNAPS; + if (name == "creating") + return FLAG_CREATING; + return 0; + } + + /// converts the acting/up vector to a set of pg shards + void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const; + + typedef enum { + CACHEMODE_NONE = 0, ///< no caching + CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later + CACHEMODE_FORWARD = 2, ///< forward if not in cache + CACHEMODE_READONLY = 3, ///< handle 
reads, forward writes [not strongly consistent] + CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later + CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later + CACHEMODE_PROXY = 6, ///< proxy if not in cache + } cache_mode_t; + static const char *get_cache_mode_name(cache_mode_t m) { + switch (m) { + case CACHEMODE_NONE: return "none"; + case CACHEMODE_WRITEBACK: return "writeback"; + case CACHEMODE_FORWARD: return "forward"; + case CACHEMODE_READONLY: return "readonly"; + case CACHEMODE_READFORWARD: return "readforward"; + case CACHEMODE_READPROXY: return "readproxy"; + case CACHEMODE_PROXY: return "proxy"; + default: return "unknown"; + } + } + static cache_mode_t get_cache_mode_from_str(const string& s) { + if (s == "none") + return CACHEMODE_NONE; + if (s == "writeback") + return CACHEMODE_WRITEBACK; + if (s == "forward") + return CACHEMODE_FORWARD; + if (s == "readonly") + return CACHEMODE_READONLY; + if (s == "readforward") + return CACHEMODE_READFORWARD; + if (s == "readproxy") + return CACHEMODE_READPROXY; + if (s == "proxy") + return CACHEMODE_PROXY; + return (cache_mode_t)-1; + } + const char *get_cache_mode_name() const { + return get_cache_mode_name(cache_mode); + } + bool cache_mode_requires_hit_set() const { + switch (cache_mode) { + case CACHEMODE_NONE: + case CACHEMODE_FORWARD: + case CACHEMODE_READONLY: + case CACHEMODE_PROXY: + return false; + case CACHEMODE_WRITEBACK: + case CACHEMODE_READFORWARD: + case CACHEMODE_READPROXY: + return true; + default: + ceph_abort_msg("implement me"); + } + } + + enum { + PG_AUTOSCALE_MODE_OFF = 0, + PG_AUTOSCALE_MODE_WARN = 1, + PG_AUTOSCALE_MODE_ON = 2, + }; + static const char *get_pg_autoscale_mode_name(int m) { + switch (m) { + case PG_AUTOSCALE_MODE_OFF: return "off"; + case PG_AUTOSCALE_MODE_ON: return "on"; + case PG_AUTOSCALE_MODE_WARN: return "warn"; + default: return "???"; + } + } + static int get_pg_autoscale_mode_by_name(const string& m) { + if (m == "off") { + return PG_AUTOSCALE_MODE_OFF; + } + if (m == "warn") { + return PG_AUTOSCALE_MODE_WARN; + } + if (m == "on") { + return PG_AUTOSCALE_MODE_ON; + } + return -1; + } + + utime_t create_time; + uint64_t flags; ///< FLAG_* + __u8 type; ///< TYPE_* + __u8 size, min_size; ///< number of osds in each pg + __u8 crush_rule; ///< crush placement rule + __u8 object_hash; ///< hash mapping object name to ps + __u8 pg_autoscale_mode; ///< PG_AUTOSCALE_MODE_ +private: + __u32 pg_num = 0, pgp_num = 0; ///< number of pgs + __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to + __u32 pg_num_target = 0; ///< pg_num we should converge toward + __u32 pgp_num_target = 0; ///< pgp_num we should converge toward + +public: + map<string,string> properties; ///< OBSOLETE + string erasure_code_profile; ///< name of the erasure code profile in OSDMap + epoch_t last_change; ///< most recent epoch changed, exclusing snapshot changes + + /// last epoch that forced clients to resend + epoch_t last_force_op_resend = 0; + /// last epoch that forced clients to resend (pre-nautilus clients only) + epoch_t last_force_op_resend_prenautilus = 0; + /// last epoch that forced clients to resend (pre-luminous clients only) + epoch_t last_force_op_resend_preluminous = 0; + + /// metadata for the most recent PG merge + pg_merge_meta_t last_pg_merge_meta; + + snapid_t snap_seq; ///< seq for per-pool snapshot + epoch_t snap_epoch; ///< osdmap epoch of last snap + uint64_t auid; ///< who owns the pg + + uint64_t quota_max_bytes; ///< maximum number of bytes for 
this pool + uint64_t quota_max_objects; ///< maximum number of objects for this pool + + /* + * Pool snaps (global to this pool). These define a SnapContext for + * the pool, unless the client manually specifies an alternate + * context. + */ + map<snapid_t, pool_snap_info_t> snaps; + /* + * Alternatively, if we are defining non-pool snaps (e.g. via the + * Ceph MDS), we must track @removed_snaps (since @snaps is not + * used). Snaps and removed_snaps are to be used exclusive of each + * other! + */ + interval_set<snapid_t> removed_snaps; + + unsigned pg_num_mask, pgp_num_mask; + + set<uint64_t> tiers; ///< pools that are tiers of us + int64_t tier_of; ///< pool for which we are a tier + // Note that write wins for read+write ops + int64_t read_tier; ///< pool/tier for objecter to direct reads to + int64_t write_tier; ///< pool/tier for objecter to direct writes to + cache_mode_t cache_mode; ///< cache pool mode + + bool is_tier() const { return tier_of >= 0; } + bool has_tiers() const { return !tiers.empty(); } + void clear_tier() { + tier_of = -1; + clear_read_tier(); + clear_write_tier(); + clear_tier_tunables(); + } + bool has_read_tier() const { return read_tier >= 0; } + void clear_read_tier() { read_tier = -1; } + bool has_write_tier() const { return write_tier >= 0; } + void clear_write_tier() { write_tier = -1; } + void clear_tier_tunables() { + if (cache_mode != CACHEMODE_NONE) + flags |= FLAG_INCOMPLETE_CLONES; + cache_mode = CACHEMODE_NONE; + + target_max_bytes = 0; + target_max_objects = 0; + cache_target_dirty_ratio_micro = 0; + cache_target_dirty_high_ratio_micro = 0; + cache_target_full_ratio_micro = 0; + hit_set_params = HitSet::Params(); + hit_set_period = 0; + hit_set_count = 0; + hit_set_grade_decay_rate = 0; + hit_set_search_last_n = 0; + grade_table.resize(0); + } + + uint64_t target_max_bytes; ///< tiering: target max pool size + uint64_t target_max_objects; ///< tiering: target max object count + + uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty + uint32_t cache_target_dirty_high_ratio_micro; ///< cache: fraction of target to flush with high speed + uint32_t cache_target_full_ratio_micro; ///< cache: fraction of target to fill before we evict in earnest + + uint32_t cache_min_flush_age; ///< minimum age (seconds) before we can flush + uint32_t cache_min_evict_age; ///< minimum age (seconds) before we can evict + + HitSet::Params hit_set_params; ///< The HitSet params to use on this pool + uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds) + uint32_t hit_set_count; ///< number of periods to retain + bool use_gmt_hitset; ///< use gmt to name the hitset archive object + uint32_t min_read_recency_for_promote; ///< minimum number of HitSets to check before promote on read + uint32_t min_write_recency_for_promote; ///< minimum number of HitSets to check before promote on write + uint32_t hit_set_grade_decay_rate; ///< the newest hit_set carries the highest weight for object + ///< temperature; each older hit_set's weight decays by this + ///< percentage relative to the one after it + uint32_t hit_set_search_last_n; ///< accumulate at most N hit_sets for temperature + + uint32_t stripe_width; ///< erasure coded stripe size in bytes + + uint64_t expected_num_objects; ///< expected number of objects in this pool; a value of 0 indicates + ///< the user did not specify an expected value + bool fast_read; ///< whether fast read is enabled on the pool + + pool_opts_t opts; ///< options + + typedef enum { + TYPE_FINGERPRINT_NONE = 0, +
TYPE_FINGERPRINT_SHA1 = 1, + } fingerprint_t; + static fingerprint_t get_fingerprint_from_str(const string& s) { + if (s == "none") + return TYPE_FINGERPRINT_NONE; + if (s == "sha1") + return TYPE_FINGERPRINT_SHA1; + return (fingerprint_t)-1; + } + const fingerprint_t get_fingerprint_type() const { + string fp_str; + opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str); + return get_fingerprint_from_str(fp_str); + } + const char *get_fingerprint_name() const { + string fp_str; + fingerprint_t fp_t; + opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str); + fp_t = get_fingerprint_from_str(fp_str); + return get_fingerprint_name(fp_t); + } + static const char *get_fingerprint_name(fingerprint_t m) { + switch (m) { + case TYPE_FINGERPRINT_NONE: return "none"; + case TYPE_FINGERPRINT_SHA1: return "sha1"; + default: return "unknown"; + } + } + + /// application -> key/value metadata + map<string, std::map<string, string>> application_metadata; + +private: + vector<uint32_t> grade_table; + +public: + uint32_t get_grade(unsigned i) const { + if (grade_table.size() <= i) + return 0; + return grade_table[i]; + } + void calc_grade_table() { + unsigned v = 1000000; + grade_table.resize(hit_set_count); + for (unsigned i = 0; i < hit_set_count; i++) { + v = v * (1 - (hit_set_grade_decay_rate / 100.0)); + grade_table[i] = v; + } + } + + pg_pool_t() + : flags(0), type(0), size(0), min_size(0), + crush_rule(0), object_hash(0), + last_change(0), + snap_seq(0), snap_epoch(0), + auid(0), + quota_max_bytes(0), quota_max_objects(0), + pg_num_mask(0), pgp_num_mask(0), + tier_of(-1), read_tier(-1), write_tier(-1), + cache_mode(CACHEMODE_NONE), + target_max_bytes(0), target_max_objects(0), + cache_target_dirty_ratio_micro(0), + cache_target_dirty_high_ratio_micro(0), + cache_target_full_ratio_micro(0), + cache_min_flush_age(0), + cache_min_evict_age(0), + hit_set_params(), + hit_set_period(0), + hit_set_count(0), + use_gmt_hitset(true), + min_read_recency_for_promote(0), + min_write_recency_for_promote(0), + hit_set_grade_decay_rate(0), + hit_set_search_last_n(0), + stripe_width(0), + expected_num_objects(0), + fast_read(false), + opts() + { } + + void dump(Formatter *f) const; + + const utime_t &get_create_time() const { return create_time; } + uint64_t get_flags() const { return flags; } + bool has_flag(uint64_t f) const { return flags & f; } + void set_flag(uint64_t f) { flags |= f; } + void unset_flag(uint64_t f) { flags &= ~f; } + + bool require_rollback() const { + return is_erasure(); + } + + /// true if incomplete clones may be present + bool allow_incomplete_clones() const { + return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES); + } + + unsigned get_type() const { return type; } + unsigned get_size() const { return size; } + unsigned get_min_size() const { return min_size; } + int get_crush_rule() const { return crush_rule; } + int get_object_hash() const { return object_hash; } + const char *get_object_hash_name() const { + return ceph_str_hash_name(get_object_hash()); + } + epoch_t get_last_change() const { return last_change; } + epoch_t get_last_force_op_resend() const { return last_force_op_resend; } + epoch_t get_last_force_op_resend_prenautilus() const { + return last_force_op_resend_prenautilus; + } + epoch_t get_last_force_op_resend_preluminous() const { + return last_force_op_resend_preluminous; + } + epoch_t get_snap_epoch() const { return snap_epoch; } + snapid_t get_snap_seq() const { return snap_seq; } + uint64_t get_auid() const { return auid; } + + void 
set_snap_seq(snapid_t s) { snap_seq = s; } + void set_snap_epoch(epoch_t e) { snap_epoch = e; } + + void set_stripe_width(uint32_t s) { stripe_width = s; } + uint32_t get_stripe_width() const { return stripe_width; } + + bool is_replicated() const { return get_type() == TYPE_REPLICATED; } + bool is_erasure() const { return get_type() == TYPE_ERASURE; } + + bool supports_omap() const { + return !(get_type() == TYPE_ERASURE); + } + + bool requires_aligned_append() const { + return is_erasure() && !has_flag(FLAG_EC_OVERWRITES); + } + uint64_t required_alignment() const { return stripe_width; } + + bool allows_ecoverwrites() const { + return has_flag(FLAG_EC_OVERWRITES); + } + + bool can_shift_osds() const { + switch (get_type()) { + case TYPE_REPLICATED: + return true; + case TYPE_ERASURE: + return false; + default: + ceph_abort_msg("unhandled pool type"); + } + } + + unsigned get_pg_num() const { return pg_num; } + unsigned get_pgp_num() const { return pgp_num; } + unsigned get_pg_num_target() const { return pg_num_target; } + unsigned get_pgp_num_target() const { return pgp_num_target; } + unsigned get_pg_num_pending() const { return pg_num_pending; } + + unsigned get_pg_num_mask() const { return pg_num_mask; } + unsigned get_pgp_num_mask() const { return pgp_num_mask; } + + // if pg_num is not a multiple of two, pgs are not equally sized. + // return, for a given pg, the fraction (denominator) of the total + // pool size that it represents. + unsigned get_pg_num_divisor(pg_t pgid) const; + + bool is_pending_merge(pg_t pgid, bool *target) const; + + void set_pg_num(int p) { + pg_num = p; + pg_num_pending = p; + calc_pg_masks(); + } + void set_pgp_num(int p) { + pgp_num = p; + calc_pg_masks(); + } + void set_pg_num_pending(int p) { + pg_num_pending = p; + calc_pg_masks(); + } + void set_pg_num_target(int p) { + pg_num_target = p; + } + void set_pgp_num_target(int p) { + pgp_num_target = p; + } + void dec_pg_num(pg_t source_pgid, + epoch_t ready_epoch, + eversion_t source_version, + eversion_t target_version, + epoch_t last_epoch_started, + epoch_t last_epoch_clean) { + --pg_num; + last_pg_merge_meta.source_pgid = source_pgid; + last_pg_merge_meta.ready_epoch = ready_epoch; + last_pg_merge_meta.source_version = source_version; + last_pg_merge_meta.target_version = target_version; + last_pg_merge_meta.last_epoch_started = last_epoch_started; + last_pg_merge_meta.last_epoch_clean = last_epoch_clean; + calc_pg_masks(); + } + + void set_quota_max_bytes(uint64_t m) { + quota_max_bytes = m; + } + uint64_t get_quota_max_bytes() { + return quota_max_bytes; + } + + void set_quota_max_objects(uint64_t m) { + quota_max_objects = m; + } + uint64_t get_quota_max_objects() { + return quota_max_objects; + } + + void set_last_force_op_resend(uint64_t t) { + last_force_op_resend = t; + last_force_op_resend_prenautilus = t; + last_force_op_resend_preluminous = t; + } + + void calc_pg_masks(); + + /* + * we have two snap modes: + * - pool global snaps + * - snap existence/non-existence defined by snaps[] and snap_seq + * - user managed snaps + * - removal governed by removed_snaps + * + * we know which mode we're using based on whether removed_snaps is empty. + * If nothing has been created, both functions report false. + */ + bool is_pool_snaps_mode() const; + bool is_unmanaged_snaps_mode() const; + bool is_removed_snap(snapid_t s) const; + + /* + * build set of known-removed sets from either pool snaps or + * explicit removed_snaps set. 
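+ * (Equivalently, given the two modes described above: in pool-snaps
+ * mode the removed set is every snap id in [1, snap_seq] that has no
+ * entry in snaps[]; in unmanaged-snaps mode it is simply the
+ * removed_snaps interval set.)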
+ */ + void build_removed_snaps(interval_set<snapid_t>& rs) const; + bool maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const; + snapid_t snap_exists(const char *s) const; + void add_snap(const char *n, utime_t stamp); + void add_unmanaged_snap(uint64_t& snapid); + void remove_snap(snapid_t s); + void remove_unmanaged_snap(snapid_t s); + + SnapContext get_snap_context() const; + + /// hash an object name+namespace key to a hash position + uint32_t hash_key(const string& key, const string& ns) const; + + /// round a hash position down to a pg num + uint32_t raw_hash_to_pg(uint32_t v) const; + + /* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ + pg_t raw_pg_to_pg(pg_t pg) const; + + /* + * map raw pg (full precision ps) into a placement seed. include + * pool id in that value so that different pools don't use the same + * seeds. + */ + ps_t raw_pg_to_pps(pg_t pg) const; + + /// choose a random hash position within a pg + uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const; + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + + static void generate_test_instances(list<pg_pool_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pg_pool_t) + +ostream& operator<<(ostream& out, const pg_pool_t& p); + + +/** + * a summation of object stats + * + * This is just a container for object stats; we don't know what for. + * + * If you add members in object_stat_sum_t, you should make sure there is + * no padding among these members. + * You should also modify the padding_check function. + + */ +struct object_stat_sum_t { + /************************************************************************** + * WARNING: be sure to update operator==, floor, and split when + * adding/removing fields!
+ **************************************************************************/ + int64_t num_bytes; // in bytes + int64_t num_objects; + int64_t num_object_clones; + int64_t num_object_copies; // num_objects * num_replicas + int64_t num_objects_missing_on_primary; + int64_t num_objects_degraded; + int64_t num_objects_unfound; + int64_t num_rd; + int64_t num_rd_kb; + int64_t num_wr; + int64_t num_wr_kb; + int64_t num_scrub_errors; // total deep and shallow scrub errors + int64_t num_objects_recovered; + int64_t num_bytes_recovered; + int64_t num_keys_recovered; + int64_t num_shallow_scrub_errors; + int64_t num_deep_scrub_errors; + int64_t num_objects_dirty; + int64_t num_whiteouts; + int64_t num_objects_omap; + int64_t num_objects_hit_set_archive; + int64_t num_objects_misplaced; + int64_t num_bytes_hit_set_archive; + int64_t num_flush; + int64_t num_flush_kb; + int64_t num_evict; + int64_t num_evict_kb; + int64_t num_promote; + int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0 + int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0 + int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0 + int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0 + int64_t num_objects_pinned; + int64_t num_objects_missing; + int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets + int64_t num_large_omap_objects = 0; + int64_t num_objects_manifest = 0; + int64_t num_omap_bytes = 0; + int64_t num_omap_keys = 0; + int64_t num_objects_repaired = 0; + + object_stat_sum_t() + : num_bytes(0), + num_objects(0), num_object_clones(0), num_object_copies(0), + num_objects_missing_on_primary(0), num_objects_degraded(0), + num_objects_unfound(0), + num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0), + num_scrub_errors(0), + num_objects_recovered(0), + num_bytes_recovered(0), + num_keys_recovered(0), + num_shallow_scrub_errors(0), + num_deep_scrub_errors(0), + num_objects_dirty(0), + num_whiteouts(0), + num_objects_omap(0), + num_objects_hit_set_archive(0), + num_objects_misplaced(0), + num_bytes_hit_set_archive(0), + num_flush(0), + num_flush_kb(0), + num_evict(0), + num_evict_kb(0), + num_promote(0), + num_flush_mode_high(0), num_flush_mode_low(0), + num_evict_mode_some(0), num_evict_mode_full(0), + num_objects_pinned(0), + num_objects_missing(0), + num_legacy_snapsets(0) + {} + + void floor(int64_t f) { +#define FLOOR(x) if (x < f) x = f + FLOOR(num_bytes); + FLOOR(num_objects); + FLOOR(num_object_clones); + FLOOR(num_object_copies); + FLOOR(num_objects_missing_on_primary); + FLOOR(num_objects_missing); + FLOOR(num_objects_degraded); + FLOOR(num_objects_misplaced); + FLOOR(num_objects_unfound); + FLOOR(num_rd); + FLOOR(num_rd_kb); + FLOOR(num_wr); + FLOOR(num_wr_kb); + FLOOR(num_large_omap_objects); + FLOOR(num_objects_manifest); + FLOOR(num_omap_bytes); + FLOOR(num_omap_keys); + FLOOR(num_shallow_scrub_errors); + FLOOR(num_deep_scrub_errors); + num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors; + FLOOR(num_objects_recovered); + FLOOR(num_bytes_recovered); + FLOOR(num_keys_recovered); + FLOOR(num_objects_dirty); + FLOOR(num_whiteouts); + FLOOR(num_objects_omap); + FLOOR(num_objects_hit_set_archive); + FLOOR(num_bytes_hit_set_archive); + FLOOR(num_flush); + FLOOR(num_flush_kb); + FLOOR(num_evict); + FLOOR(num_evict_kb); + FLOOR(num_promote); + FLOOR(num_flush_mode_high); + FLOOR(num_flush_mode_low); + FLOOR(num_evict_mode_some); + FLOOR(num_evict_mode_full); + FLOOR(num_objects_pinned); + FLOOR(num_legacy_snapsets); + 
FLOOR(num_objects_repaired); +#undef FLOOR + } + + void split(vector<object_stat_sum_t> &out) const { +#define SPLIT(PARAM) \ + for (unsigned i = 0; i < out.size(); ++i) { \ + out[i].PARAM = PARAM / out.size(); \ + if (i < (PARAM % out.size())) { \ + out[i].PARAM++; \ + } \ + } +#define SPLIT_PRESERVE_NONZERO(PARAM) \ + for (unsigned i = 0; i < out.size(); ++i) { \ + if (PARAM) \ + out[i].PARAM = 1 + PARAM / out.size(); \ + else \ + out[i].PARAM = 0; \ + } + + SPLIT(num_bytes); + SPLIT(num_objects); + SPLIT(num_object_clones); + SPLIT(num_object_copies); + SPLIT(num_objects_missing_on_primary); + SPLIT(num_objects_missing); + SPLIT(num_objects_degraded); + SPLIT(num_objects_misplaced); + SPLIT(num_objects_unfound); + SPLIT(num_rd); + SPLIT(num_rd_kb); + SPLIT(num_wr); + SPLIT(num_wr_kb); + SPLIT(num_large_omap_objects); + SPLIT(num_objects_manifest); + SPLIT(num_omap_bytes); + SPLIT(num_omap_keys); + SPLIT(num_objects_repaired); + SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors); + SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors); + for (unsigned i = 0; i < out.size(); ++i) { + out[i].num_scrub_errors = out[i].num_shallow_scrub_errors + + out[i].num_deep_scrub_errors; + } + SPLIT(num_objects_recovered); + SPLIT(num_bytes_recovered); + SPLIT(num_keys_recovered); + SPLIT(num_objects_dirty); + SPLIT(num_whiteouts); + SPLIT(num_objects_omap); + SPLIT(num_objects_hit_set_archive); + SPLIT(num_bytes_hit_set_archive); + SPLIT(num_flush); + SPLIT(num_flush_kb); + SPLIT(num_evict); + SPLIT(num_evict_kb); + SPLIT(num_promote); + SPLIT(num_flush_mode_high); + SPLIT(num_flush_mode_low); + SPLIT(num_evict_mode_some); + SPLIT(num_evict_mode_full); + SPLIT(num_objects_pinned); + SPLIT_PRESERVE_NONZERO(num_legacy_snapsets); +#undef SPLIT +#undef SPLIT_PRESERVE_NONZERO + } + + void clear() { + // FIPS zeroization audit 20191117: this memset is not security related. 
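+ // A plain memset is valid here: the struct holds only integer counters
+ // with no indirect state, and padding_check() below guards the layout.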
+ memset(this, 0, sizeof(*this)); + } + + void calc_copies(int nrep) { + num_object_copies = nrep * num_objects; + } + + bool is_zero() const { + return mem_is_zero((char*)this, sizeof(*this)); + } + + void add(const object_stat_sum_t& o); + void sub(const object_stat_sum_t& o); + + void dump(Formatter *f) const; + void padding_check() { + static_assert( + sizeof(object_stat_sum_t) == + sizeof(num_bytes) + + sizeof(num_objects) + + sizeof(num_object_clones) + + sizeof(num_object_copies) + + sizeof(num_objects_missing_on_primary) + + sizeof(num_objects_degraded) + + sizeof(num_objects_unfound) + + sizeof(num_rd) + + sizeof(num_rd_kb) + + sizeof(num_wr) + + sizeof(num_wr_kb) + + sizeof(num_scrub_errors) + + sizeof(num_large_omap_objects) + + sizeof(num_objects_manifest) + + sizeof(num_omap_bytes) + + sizeof(num_omap_keys) + + sizeof(num_objects_repaired) + + sizeof(num_objects_recovered) + + sizeof(num_bytes_recovered) + + sizeof(num_keys_recovered) + + sizeof(num_shallow_scrub_errors) + + sizeof(num_deep_scrub_errors) + + sizeof(num_objects_dirty) + + sizeof(num_whiteouts) + + sizeof(num_objects_omap) + + sizeof(num_objects_hit_set_archive) + + sizeof(num_objects_misplaced) + + sizeof(num_bytes_hit_set_archive) + + sizeof(num_flush) + + sizeof(num_flush_kb) + + sizeof(num_evict) + + sizeof(num_evict_kb) + + sizeof(num_promote) + + sizeof(num_flush_mode_high) + + sizeof(num_flush_mode_low) + + sizeof(num_evict_mode_some) + + sizeof(num_evict_mode_full) + + sizeof(num_objects_pinned) + + sizeof(num_objects_missing) + + sizeof(num_legacy_snapsets) + , + "object_stat_sum_t have padding"); + } + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + static void generate_test_instances(list<object_stat_sum_t*>& o); +}; +WRITE_CLASS_ENCODER(object_stat_sum_t) + +bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r); + +/** + * a collection of object stat sums + * + * This is a collection of stat sums over different categories. + */ +struct object_stat_collection_t { + /************************************************************************** + * WARNING: be sure to update the operator== when adding/removing fields! * + **************************************************************************/ + object_stat_sum_t sum; + + void calc_copies(int nrep) { + sum.calc_copies(nrep); + } + + void dump(Formatter *f) const; + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + static void generate_test_instances(list<object_stat_collection_t*>& o); + + bool is_zero() const { + return sum.is_zero(); + } + + void clear() { + sum.clear(); + } + + void floor(int64_t f) { + sum.floor(f); + } + + void add(const object_stat_sum_t& o) { + sum.add(o); + } + + void add(const object_stat_collection_t& o) { + sum.add(o.sum); + } + void sub(const object_stat_collection_t& o) { + sum.sub(o.sum); + } +}; +WRITE_CLASS_ENCODER(object_stat_collection_t) + +inline bool operator==(const object_stat_collection_t& l, + const object_stat_collection_t& r) { + return l.sum == r.sum; +} + + +/** pg_stat + * aggregate stats for a single PG. + */ +struct pg_stat_t { + /************************************************************************** + * WARNING: be sure to update the operator== when adding/removing fields! 
* + **************************************************************************/ + eversion_t version; + version_t reported_seq; // sequence number + epoch_t reported_epoch; // epoch of this report + uint64_t state; + utime_t last_fresh; // last reported + utime_t last_change; // new state != previous state + utime_t last_active; // state & PG_STATE_ACTIVE + utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED + utime_t last_clean; // state & PG_STATE_CLEAN + utime_t last_unstale; // (state & PG_STATE_STALE) == 0 + utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0 + utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0 + + eversion_t log_start; // (log_start,version] + eversion_t ondisk_log_start; // there may be more on disk + + epoch_t created; + epoch_t last_epoch_clean; + pg_t parent; + __u32 parent_split_bits; + + eversion_t last_scrub; + eversion_t last_deep_scrub; + utime_t last_scrub_stamp; + utime_t last_deep_scrub_stamp; + utime_t last_clean_scrub_stamp; + + object_stat_collection_t stats; + + int64_t log_size; + int64_t ondisk_log_size; // >= active_log_size + + vector<int32_t> up, acting; + vector<pg_shard_t> avail_no_missing; + map< std::set<pg_shard_t>, int32_t > object_location_counts; + epoch_t mapping_epoch; + + vector<int32_t> blocked_by; ///< osds on which the pg is blocked + + interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged + + utime_t last_became_active; + utime_t last_became_peered; + + /// up, acting primaries + int32_t up_primary; + int32_t acting_primary; + + // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is + // absurd already, so cap it to 2^32 and save 4 bytes at the same time + uint32_t snaptrimq_len; + + bool stats_invalid:1; + /// true if num_objects_dirty is not accurate (because it was not + /// maintained starting from pool creation) + bool dirty_stats_invalid:1; + bool omap_stats_invalid:1; + bool hitset_stats_invalid:1; + bool hitset_bytes_stats_invalid:1; + bool pin_stats_invalid:1; + bool manifest_stats_invalid:1; + + pg_stat_t() + : reported_seq(0), + reported_epoch(0), + state(0), + created(0), last_epoch_clean(0), + parent_split_bits(0), + log_size(0), ondisk_log_size(0), + mapping_epoch(0), + up_primary(-1), + acting_primary(-1), + snaptrimq_len(0), + stats_invalid(false), + dirty_stats_invalid(false), + omap_stats_invalid(false), + hitset_stats_invalid(false), + hitset_bytes_stats_invalid(false), + pin_stats_invalid(false), + manifest_stats_invalid(false) + { } + + epoch_t get_effective_last_epoch_clean() const { + if (state & PG_STATE_CLEAN) { + // we are clean as of this report, and should thus take the + // reported epoch + return reported_epoch; + } else { + return last_epoch_clean; + } + } + + pair<epoch_t, version_t> get_version_pair() const { + return make_pair(reported_epoch, reported_seq); + } + + void floor(int64_t f) { + stats.floor(f); + if (log_size < f) + log_size = f; + if (ondisk_log_size < f) + ondisk_log_size = f; + if (snaptrimq_len < f) + snaptrimq_len = f; + } + + void add_sub_invalid_flags(const pg_stat_t& o) { + // adding (or subtracting!) 
invalid stats render our stats invalid too + stats_invalid |= o.stats_invalid; + dirty_stats_invalid |= o.dirty_stats_invalid; + omap_stats_invalid |= o.omap_stats_invalid; + hitset_stats_invalid |= o.hitset_stats_invalid; + hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid; + pin_stats_invalid |= o.pin_stats_invalid; + manifest_stats_invalid |= o.manifest_stats_invalid; + } + void add(const pg_stat_t& o) { + stats.add(o.stats); + log_size += o.log_size; + ondisk_log_size += o.ondisk_log_size; + snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len, + (uint64_t)(1ull << 31)); + add_sub_invalid_flags(o); + } + void sub(const pg_stat_t& o) { + stats.sub(o.stats); + log_size -= o.log_size; + ondisk_log_size -= o.ondisk_log_size; + if (o.snaptrimq_len < snaptrimq_len) { + snaptrimq_len -= o.snaptrimq_len; + } else { + snaptrimq_len = 0; + } + add_sub_invalid_flags(o); + } + + bool is_acting_osd(int32_t osd, bool primary) const; + void dump(Formatter *f) const; + void dump_brief(Formatter *f) const; + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + static void generate_test_instances(list<pg_stat_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_stat_t) + +bool operator==(const pg_stat_t& l, const pg_stat_t& r); + +/** store_statfs_t + * ObjectStore full statfs information + */ +struct store_statfs_t +{ + uint64_t total = 0; ///< Total bytes + uint64_t available = 0; ///< Free bytes available + uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes + + int64_t allocated = 0; ///< Bytes allocated by the store + + int64_t data_stored = 0; ///< Bytes actually stored by the user + int64_t data_compressed = 0; ///< Bytes stored after compression + int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data + int64_t data_compressed_original = 0; ///< Bytes that were compressed + + int64_t omap_allocated = 0; ///< approx usage of omap data + int64_t internal_metadata = 0; ///< approx usage of internal metadata + + void reset() { + *this = store_statfs_t(); + } + void floor(int64_t f) { +#define FLOOR(x) if (int64_t(x) < f) x = f + FLOOR(total); + FLOOR(available); + FLOOR(internally_reserved); + FLOOR(allocated); + FLOOR(data_stored); + FLOOR(data_compressed); + FLOOR(data_compressed_allocated); + FLOOR(data_compressed_original); + + FLOOR(omap_allocated); + FLOOR(internal_metadata); +#undef FLOOR + } + + bool operator ==(const store_statfs_t& other) const; + bool is_zero() const { + return *this == store_statfs_t(); + } + + uint64_t get_used() const { + return total - available - internally_reserved; + } + + // this accumulates both actually used and statfs's internally_reserved + uint64_t get_used_raw() const { + return total - available; + } + + float get_used_raw_ratio() const { + if (total) { + return (float)get_used_raw() / (float)total; + } else { + return 0.0; + } + } + + // helpers to ease legacy code porting + uint64_t kb_avail() const { + return available >> 10; + } + uint64_t kb() const { + return total >> 10; + } + uint64_t kb_used() const { + return (total - available - internally_reserved) >> 10; + } + uint64_t kb_used_raw() const { + return get_used_raw() >> 10; + } + + uint64_t kb_used_data() const { + return allocated >> 10; + } + uint64_t kb_used_omap() const { + return omap_allocated >> 10; + } + + uint64_t kb_used_internal_metadata() const { + return internal_metadata >> 10; + } + + void add(const store_statfs_t& o) { + total += o.total; + available += o.available; + internally_reserved += 
o.internally_reserved; + allocated += o.allocated; + data_stored += o.data_stored; + data_compressed += o.data_compressed; + data_compressed_allocated += o.data_compressed_allocated; + data_compressed_original += o.data_compressed_original; + omap_allocated += o.omap_allocated; + internal_metadata += o.internal_metadata; + } + void sub(const store_statfs_t& o) { + total -= o.total; + available -= o.available; + internally_reserved -= o.internally_reserved; + allocated -= o.allocated; + data_stored -= o.data_stored; + data_compressed -= o.data_compressed; + data_compressed_allocated -= o.data_compressed_allocated; + data_compressed_original -= o.data_compressed_original; + omap_allocated -= o.omap_allocated; + internal_metadata -= o.internal_metadata; + } + void dump(Formatter *f) const; + DENC(store_statfs_t, v, p) { + DENC_START(1, 1, p); + denc(v.total, p); + denc(v.available, p); + denc(v.internally_reserved, p); + denc(v.allocated, p); + denc(v.data_stored, p); + denc(v.data_compressed, p); + denc(v.data_compressed_allocated, p); + denc(v.data_compressed_original, p); + denc(v.omap_allocated, p); + denc(v.internal_metadata, p); + DENC_FINISH(p); + } + static void generate_test_instances(list<store_statfs_t*>& o); +}; +WRITE_CLASS_DENC(store_statfs_t) + +ostream &operator<<(ostream &lhs, const store_statfs_t &rhs); + +/** osd_stat + * aggregate stats for an osd + */ +struct osd_stat_t { + store_statfs_t statfs; + vector<int> hb_peers; + int32_t snap_trim_queue_len, num_snap_trimming; + uint64_t num_shards_repaired; + + pow2_hist_t op_queue_age_hist; + + objectstore_perf_stat_t os_perf_stat; + osd_alerts_t os_alerts; + + epoch_t up_from = 0; + uint64_t seq = 0; + + uint32_t num_pgs = 0; + + uint32_t num_osds = 0; + uint32_t num_per_pool_osds = 0; + + struct Interfaces { + uint32_t last_update; // in seconds + uint32_t back_pingtime[3]; + uint32_t back_min[3]; + uint32_t back_max[3]; + uint32_t back_last; + uint32_t front_pingtime[3]; + uint32_t front_min[3]; + uint32_t front_max[3]; + uint32_t front_last; + }; + map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces + + osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0), + num_shards_repaired(0) {} + + void add(const osd_stat_t& o) { + statfs.add(o.statfs); + snap_trim_queue_len += o.snap_trim_queue_len; + num_snap_trimming += o.num_snap_trimming; + num_shards_repaired += o.num_shards_repaired; + op_queue_age_hist.add(o.op_queue_age_hist); + os_perf_stat.add(o.os_perf_stat); + num_pgs += o.num_pgs; + num_osds += o.num_osds; + num_per_pool_osds += o.num_per_pool_osds; + for (const auto& a : o.os_alerts) { + auto& target = os_alerts[a.first]; + for (auto& i : a.second) { + target.emplace(i.first, i.second); + } + } + } + void sub(const osd_stat_t& o) { + statfs.sub(o.statfs); + snap_trim_queue_len -= o.snap_trim_queue_len; + num_snap_trimming -= o.num_snap_trimming; + num_shards_repaired -= o.num_shards_repaired; + op_queue_age_hist.sub(o.op_queue_age_hist); + os_perf_stat.sub(o.os_perf_stat); + num_pgs -= o.num_pgs; + num_osds -= o.num_osds; + num_per_pool_osds -= o.num_per_pool_osds; + for (const auto& a : o.os_alerts) { + auto& target = os_alerts[a.first]; + for (auto& i : a.second) { + target.erase(i.first); + } + if (target.empty()) { + os_alerts.erase(a.first); + } + } + } + void dump(Formatter *f, bool with_net = true) const; + void dump_ping_time(Formatter *f) const; + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + static void 
generate_test_instances(std::list<osd_stat_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(osd_stat_t) + +inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) { + return l.statfs == r.statfs && + l.snap_trim_queue_len == r.snap_trim_queue_len && + l.num_snap_trimming == r.num_snap_trimming && + l.num_shards_repaired == r.num_shards_repaired && + l.hb_peers == r.hb_peers && + l.op_queue_age_hist == r.op_queue_age_hist && + l.os_perf_stat == r.os_perf_stat && + l.num_pgs == r.num_pgs && + l.num_osds == r.num_osds && + l.num_per_pool_osds == r.num_per_pool_osds; +} +inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) { + return !(l == r); +} + +inline ostream& operator<<(ostream& out, const osd_stat_t& s) { + return out << "osd_stat(" << s.statfs << ", " + << "peers " << s.hb_peers + << " op hist " << s.op_queue_age_hist.h + << ")"; +} + +/* + * summation over an entire pool + */ +struct pool_stat_t { + object_stat_collection_t stats; + store_statfs_t store_stats; + int64_t log_size; + int64_t ondisk_log_size; // >= active_log_size + int32_t up; ///< number of up replicas or shards + int32_t acting; ///< number of acting replicas or shards + int32_t num_store_stats; ///< amount of store_stats accumulated + + pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0), + num_store_stats(0) + { } + + void floor(int64_t f) { + stats.floor(f); + store_stats.floor(f); + if (log_size < f) + log_size = f; + if (ondisk_log_size < f) + ondisk_log_size = f; + if (up < f) + up = f; + if (acting < f) + acting = f; + if (num_store_stats < f) + num_store_stats = f; + } + + void add(const store_statfs_t& o) { + store_stats.add(o); + ++num_store_stats; + } + void sub(const store_statfs_t& o) { + store_stats.sub(o); + --num_store_stats; + } + + void add(const pg_stat_t& o) { + stats.add(o.stats); + log_size += o.log_size; + ondisk_log_size += o.ondisk_log_size; + up += o.up.size(); + acting += o.acting.size(); + } + void sub(const pg_stat_t& o) { + stats.sub(o.stats); + log_size -= o.log_size; + ondisk_log_size -= o.ondisk_log_size; + up -= o.up.size(); + acting -= o.acting.size(); + } + + bool is_zero() const { + return (stats.is_zero() && + store_stats.is_zero() && + log_size == 0 && + ondisk_log_size == 0 && + up == 0 && + acting == 0 && + num_store_stats == 0); + } + + // helper accessors to retrieve used/netto bytes depending on the + // collection method: new per-pool objectstore report or legacy PG + // summation at OSD. + // In legacy mode used and netto values are the same. But for new per-pool + // collection 'used' provides amount of space ALLOCATED at all related OSDs + // and 'netto' is amount of stored user data. + uint64_t get_allocated_bytes(bool per_pool) const { + uint64_t allocated_bytes; + if (per_pool) { + allocated_bytes = store_stats.allocated; + } else { + // legacy mode, use numbers from 'stats' + allocated_bytes = stats.sum.num_bytes + + stats.sum.num_bytes_hit_set_archive; + } + // omap is not broken out by pool by nautilus bluestore + allocated_bytes += stats.sum.num_omap_bytes; + return allocated_bytes; + } + uint64_t get_user_bytes(float raw_used_rate, bool per_pool) const { + uint64_t user_bytes; + if (per_pool) { + user_bytes = raw_used_rate ? 
store_stats.data_stored / raw_used_rate : 0; + } else { + // legacy mode, use numbers from 'stats' + user_bytes = stats.sum.num_bytes + + stats.sum.num_bytes_hit_set_archive; + } + // omap is not broken out by pool by nautilus bluestore + user_bytes += stats.sum.num_omap_bytes; + return user_bytes; + } + + void dump(Formatter *f) const; + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + static void generate_test_instances(list<pool_stat_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_stat_t) + + +// ----------------------------------------- + +/** + * pg_hit_set_info_t - information about a single recorded HitSet + * + * Track basic metadata about a HitSet, like the number of insertions + * and the time range it covers. + */ +struct pg_hit_set_info_t { + utime_t begin, end; ///< time interval + eversion_t version; ///< version this HitSet object was written + bool using_gmt; ///< use gmt for creating the hit_set archive object name + + friend bool operator==(const pg_hit_set_info_t& l, + const pg_hit_set_info_t& r) { + return + l.begin == r.begin && + l.end == r.end && + l.version == r.version && + l.using_gmt == r.using_gmt; + } + + explicit pg_hit_set_info_t(bool using_gmt = true) + : using_gmt(using_gmt) {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_hit_set_info_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_hit_set_info_t) + +/** + * pg_hit_set_history_t - information about a history of hitsets + * + * Include information about the currently accumulating hit set as well + * as archived/historical ones. + */ +struct pg_hit_set_history_t { + eversion_t current_last_update; ///< last version inserted into current set + list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest + + friend bool operator==(const pg_hit_set_history_t& l, + const pg_hit_set_history_t& r) { + return + l.current_last_update == r.current_last_update && + l.history == r.history; + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_hit_set_history_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_hit_set_history_t) + + +// ----------------------------------------- + +/** + * pg_history_t - information about recent pg peering/mapping history + * + * This is aggressively shared between OSDs to bound the amount of past + * history they need to worry about. + */ +struct pg_history_t { + epoch_t epoch_created; // epoch in which *pg* was created (pool or pg) + epoch_t epoch_pool_created; // epoch in which *pool* was created + // (note: may be pg creation epoch for + // pre-luminous clusters) + epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally) + epoch_t last_interval_started; // first epoch of last_epoch_started interval + epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean. + epoch_t last_interval_clean; // first epoch of last_epoch_clean interval + epoch_t last_epoch_split; // as parent or child + epoch_t last_epoch_marked_full; // pool or cluster + + /** + * In the event of a map discontinuity, same_*_since may reflect the first + * map the osd has seen in the new map sequence rather than the actual start + * of the interval. 
This is ok since a discontinuity at epoch e means there + * must have been a clean interval between e and now and that we cannot be + * in the active set during the interval containing e. + */ + epoch_t same_up_since; // same acting set since + epoch_t same_interval_since; // same acting AND up set since + epoch_t same_primary_since; // same primary at least back through this epoch. + + eversion_t last_scrub; + eversion_t last_deep_scrub; + utime_t last_scrub_stamp; + utime_t last_deep_scrub_stamp; + utime_t last_clean_scrub_stamp; + + friend bool operator==(const pg_history_t& l, const pg_history_t& r) { + return + l.epoch_created == r.epoch_created && + l.epoch_pool_created == r.epoch_pool_created && + l.last_epoch_started == r.last_epoch_started && + l.last_interval_started == r.last_interval_started && + l.last_epoch_clean == r.last_epoch_clean && + l.last_interval_clean == r.last_interval_clean && + l.last_epoch_split == r.last_epoch_split && + l.last_epoch_marked_full == r.last_epoch_marked_full && + l.same_up_since == r.same_up_since && + l.same_interval_since == r.same_interval_since && + l.same_primary_since == r.same_primary_since && + l.last_scrub == r.last_scrub && + l.last_deep_scrub == r.last_deep_scrub && + l.last_scrub_stamp == r.last_scrub_stamp && + l.last_deep_scrub_stamp == r.last_deep_scrub_stamp && + l.last_clean_scrub_stamp == r.last_clean_scrub_stamp; + } + + pg_history_t() + : epoch_created(0), + epoch_pool_created(0), + last_epoch_started(0), + last_interval_started(0), + last_epoch_clean(0), + last_interval_clean(0), + last_epoch_split(0), + last_epoch_marked_full(0), + same_up_since(0), same_interval_since(0), same_primary_since(0) {} + + bool merge(const pg_history_t &other) { + // Here, we only update the fields which cannot be calculated from the OSDmap. + bool modified = false; + if (epoch_created < other.epoch_created) { + epoch_created = other.epoch_created; + modified = true; + } + if (epoch_pool_created < other.epoch_pool_created) { + // FIXME: for jewel compat only; this should either be 0 or always the + // same value across all pg instances. 
+ epoch_pool_created = other.epoch_pool_created; + modified = true; + } + if (last_epoch_started < other.last_epoch_started) { + last_epoch_started = other.last_epoch_started; + modified = true; + } + if (last_interval_started < other.last_interval_started) { + last_interval_started = other.last_interval_started; + modified = true; + } + if (last_epoch_clean < other.last_epoch_clean) { + last_epoch_clean = other.last_epoch_clean; + modified = true; + } + if (last_interval_clean < other.last_interval_clean) { + last_interval_clean = other.last_interval_clean; + modified = true; + } + if (last_epoch_split < other.last_epoch_split) { + last_epoch_split = other.last_epoch_split; + modified = true; + } + if (last_epoch_marked_full < other.last_epoch_marked_full) { + last_epoch_marked_full = other.last_epoch_marked_full; + modified = true; + } + if (other.last_scrub > last_scrub) { + last_scrub = other.last_scrub; + modified = true; + } + if (other.last_scrub_stamp > last_scrub_stamp) { + last_scrub_stamp = other.last_scrub_stamp; + modified = true; + } + if (other.last_deep_scrub > last_deep_scrub) { + last_deep_scrub = other.last_deep_scrub; + modified = true; + } + if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) { + last_deep_scrub_stamp = other.last_deep_scrub_stamp; + modified = true; + } + if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) { + last_clean_scrub_stamp = other.last_clean_scrub_stamp; + modified = true; + } + return modified; + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_history_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_history_t) + +inline ostream& operator<<(ostream& out, const pg_history_t& h) { + return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created + << " lis/c " << h.last_interval_started + << "/" << h.last_interval_clean + << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean + << "/" << h.last_epoch_marked_full + << " " << h.same_up_since + << "/" << h.same_interval_since + << "/" << h.same_primary_since; +} + + +/** + * pg_info_t - summary of PG statistics. + * + * some notes: + * - last_complete implies we have all objects that existed as of that + * stamp, OR a newer object, OR have already applied a later delete. + * - if last_complete >= log.bottom, then we know pg contents thru log.head. + * otherwise, we have no idea what the pg is supposed to contain. + */ +struct pg_info_t { + spg_t pgid; + eversion_t last_update; ///< last object version applied to store. + eversion_t last_complete; ///< last version pg was complete through. + epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd + epoch_t last_interval_started; ///< first epoch of last_epoch_started interval + + version_t last_user_version; ///< last user object version applied to store + + eversion_t log_tail; ///< oldest log entry. 
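+ ///< (the log thus spans (log_tail, last_update]; this is the "(tail,head]" range printed by operator<< for pg_info_t below)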
+ + hobject_t last_backfill; ///< objects >= this and < last_complete may be missing + bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort + + interval_set<snapid_t> purged_snaps; + + pg_stat_t stats; + + pg_history_t history; + pg_hit_set_history_t hit_set; + + friend bool operator==(const pg_info_t& l, const pg_info_t& r) { + return + l.pgid == r.pgid && + l.last_update == r.last_update && + l.last_complete == r.last_complete && + l.last_epoch_started == r.last_epoch_started && + l.last_interval_started == r.last_interval_started && + l.last_user_version == r.last_user_version && + l.log_tail == r.log_tail && + l.last_backfill == r.last_backfill && + l.last_backfill_bitwise == r.last_backfill_bitwise && + l.purged_snaps == r.purged_snaps && + l.stats == r.stats && + l.history == r.history && + l.hit_set == r.hit_set; + } + + pg_info_t() + : last_epoch_started(0), + last_interval_started(0), + last_user_version(0), + last_backfill(hobject_t::get_max()), + last_backfill_bitwise(false) + { } + // cppcheck-suppress noExplicitConstructor + pg_info_t(spg_t p) + : pgid(p), + last_epoch_started(0), + last_interval_started(0), + last_user_version(0), + last_backfill(hobject_t::get_max()), + last_backfill_bitwise(false) + { } + + void set_last_backfill(hobject_t pos) { + last_backfill = pos; + last_backfill_bitwise = true; + } + + bool is_empty() const { return last_update.version == 0; } + bool dne() const { return history.epoch_created == 0; } + + bool has_missing() const { return last_complete != last_update; } + bool is_incomplete() const { return !last_backfill.is_max(); } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_info_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_info_t) + +inline ostream& operator<<(ostream& out, const pg_info_t& pgi) +{ + out << pgi.pgid << "("; + if (pgi.dne()) + out << " DNE"; + if (pgi.is_empty()) + out << " empty"; + else { + out << " v " << pgi.last_update; + if (pgi.last_complete != pgi.last_update) + out << " lc " << pgi.last_complete; + out << " (" << pgi.log_tail << "," << pgi.last_update << "]"; + } + if (pgi.is_incomplete()) + out << " lb " << pgi.last_backfill + << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)"); + //out << " c " << pgi.epoch_created; + out << " local-lis/les=" << pgi.last_interval_started + << "/" << pgi.last_epoch_started; + out << " n=" << pgi.stats.stats.sum.num_objects; + out << " " << pgi.history + << ")"; + return out; +} + +/** + * pg_fast_info_t - common pg_info_t fields + * + * These are the fields of pg_info_t (and children) that are updated for + * most IO operations. + * + * ** WARNING ** + * Because we rely on these fields to be applied to the normal + * info struct, adding a new field here that is not also new in info + * means that we must set an incompat OSD feature bit! 
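+ *
+ * (Note: try_apply_to() below applies these fields on top of a full
+ * pg_info_t only when this last_update is newer, so stale fast-info
+ * records are ignored.)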
+ */ +struct pg_fast_info_t { + eversion_t last_update; + eversion_t last_complete; + version_t last_user_version; + struct { // pg_stat_t stats + eversion_t version; + version_t reported_seq; + utime_t last_fresh; + utime_t last_active; + utime_t last_peered; + utime_t last_clean; + utime_t last_unstale; + utime_t last_undegraded; + utime_t last_fullsized; + int64_t log_size; // (also ondisk_log_size, which has the same value) + struct { // object_stat_collection_t stats; + struct { // objct_stat_sum_t sum + int64_t num_bytes; // in bytes + int64_t num_objects; + int64_t num_object_copies; + int64_t num_rd; + int64_t num_rd_kb; + int64_t num_wr; + int64_t num_wr_kb; + int64_t num_objects_dirty; + } sum; + } stats; + } stats; + + void populate_from(const pg_info_t& info) { + last_update = info.last_update; + last_complete = info.last_complete; + last_user_version = info.last_user_version; + stats.version = info.stats.version; + stats.reported_seq = info.stats.reported_seq; + stats.last_fresh = info.stats.last_fresh; + stats.last_active = info.stats.last_active; + stats.last_peered = info.stats.last_peered; + stats.last_clean = info.stats.last_clean; + stats.last_unstale = info.stats.last_unstale; + stats.last_undegraded = info.stats.last_undegraded; + stats.last_fullsized = info.stats.last_fullsized; + stats.log_size = info.stats.log_size; + stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes; + stats.stats.sum.num_objects = info.stats.stats.sum.num_objects; + stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies; + stats.stats.sum.num_rd = info.stats.stats.sum.num_rd; + stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb; + stats.stats.sum.num_wr = info.stats.stats.sum.num_wr; + stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb; + stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty; + } + + bool try_apply_to(pg_info_t* info) { + if (last_update <= info->last_update) + return false; + info->last_update = last_update; + info->last_complete = last_complete; + info->last_user_version = last_user_version; + info->stats.version = stats.version; + info->stats.reported_seq = stats.reported_seq; + info->stats.last_fresh = stats.last_fresh; + info->stats.last_active = stats.last_active; + info->stats.last_peered = stats.last_peered; + info->stats.last_clean = stats.last_clean; + info->stats.last_unstale = stats.last_unstale; + info->stats.last_undegraded = stats.last_undegraded; + info->stats.last_fullsized = stats.last_fullsized; + info->stats.log_size = stats.log_size; + info->stats.ondisk_log_size = stats.log_size; + info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes; + info->stats.stats.sum.num_objects = stats.stats.sum.num_objects; + info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies; + info->stats.stats.sum.num_rd = stats.stats.sum.num_rd; + info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb; + info->stats.stats.sum.num_wr = stats.stats.sum.num_wr; + info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb; + info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty; + return true; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(last_update, bl); + encode(last_complete, bl); + encode(last_user_version, bl); + encode(stats.version, bl); + encode(stats.reported_seq, bl); + encode(stats.last_fresh, bl); + encode(stats.last_active, bl); + encode(stats.last_peered, bl); + encode(stats.last_clean, bl); + encode(stats.last_unstale, bl); + 
encode(stats.last_undegraded, bl); + encode(stats.last_fullsized, bl); + encode(stats.log_size, bl); + encode(stats.stats.sum.num_bytes, bl); + encode(stats.stats.sum.num_objects, bl); + encode(stats.stats.sum.num_object_copies, bl); + encode(stats.stats.sum.num_rd, bl); + encode(stats.stats.sum.num_rd_kb, bl); + encode(stats.stats.sum.num_wr, bl); + encode(stats.stats.sum.num_wr_kb, bl); + encode(stats.stats.sum.num_objects_dirty, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(last_update, p); + decode(last_complete, p); + decode(last_user_version, p); + decode(stats.version, p); + decode(stats.reported_seq, p); + decode(stats.last_fresh, p); + decode(stats.last_active, p); + decode(stats.last_peered, p); + decode(stats.last_clean, p); + decode(stats.last_unstale, p); + decode(stats.last_undegraded, p); + decode(stats.last_fullsized, p); + decode(stats.log_size, p); + decode(stats.stats.sum.num_bytes, p); + decode(stats.stats.sum.num_objects, p); + decode(stats.stats.sum.num_object_copies, p); + decode(stats.stats.sum.num_rd, p); + decode(stats.stats.sum.num_rd_kb, p); + decode(stats.stats.sum.num_wr, p); + decode(stats.stats.sum.num_wr_kb, p); + decode(stats.stats.sum.num_objects_dirty, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(pg_fast_info_t) + + +struct pg_notify_t { + epoch_t query_epoch; + epoch_t epoch_sent; + pg_info_t info; + shard_id_t to; + shard_id_t from; + pg_notify_t() : + query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD), + from(shard_id_t::NO_SHARD) {} + pg_notify_t( + shard_id_t to, + shard_id_t from, + epoch_t query_epoch, + epoch_t epoch_sent, + const pg_info_t &info) + : query_epoch(query_epoch), + epoch_sent(epoch_sent), + info(info), to(to), from(from) { + ceph_assert(from == info.pgid.shard); + } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &p); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_notify_t*> &o); +}; +WRITE_CLASS_ENCODER(pg_notify_t) +ostream &operator<<(ostream &lhs, const pg_notify_t &notify); + + +class OSDMap; +/** + * PastIntervals -- information needed to determine the PriorSet and + * the might_have_unfound set + */ +class PastIntervals { +public: + struct pg_interval_t { + vector<int32_t> up, acting; + epoch_t first, last; + bool maybe_went_rw; + int32_t primary; + int32_t up_primary; + + pg_interval_t() + : first(0), last(0), + maybe_went_rw(false), + primary(-1), + up_primary(-1) + {} + + pg_interval_t( + vector<int32_t> &&up, + vector<int32_t> &&acting, + epoch_t first, + epoch_t last, + bool maybe_went_rw, + int32_t primary, + int32_t up_primary) + : up(up), acting(acting), first(first), last(last), + maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary) + {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_interval_t*>& o); + }; + + PastIntervals(); + PastIntervals(PastIntervals &&rhs) = default; + PastIntervals &operator=(PastIntervals &&rhs) = default; + + PastIntervals(const PastIntervals &rhs); + PastIntervals &operator=(const PastIntervals &rhs); + + class interval_rep { + public: + virtual size_t size() const = 0; + virtual bool empty() const = 0; + virtual void clear() = 0; + virtual pair<epoch_t, epoch_t> get_bounds() const = 0; + virtual set<pg_shard_t> get_all_participants( + bool ec_pool) const = 0; + virtual void add_interval(bool ec_pool, const pg_interval_t
&interval) = 0; + virtual unique_ptr<interval_rep> clone() const = 0; + virtual ostream &print(ostream &out) const = 0; + virtual void encode(bufferlist &bl) const = 0; + virtual void decode(bufferlist::const_iterator &bl) = 0; + virtual void dump(Formatter *f) const = 0; + virtual void iterate_mayberw_back_to( + epoch_t les, + std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0; + + virtual bool has_full_intervals() const { return false; } + virtual void iterate_all_intervals( + std::function<void(const pg_interval_t &)> &&f) const { + ceph_assert(!has_full_intervals()); + ceph_abort_msg("not valid for this implementation"); + } + virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0; + + virtual ~interval_rep() {} + }; + friend class pi_compact_rep; +private: + + unique_ptr<interval_rep> past_intervals; + + explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {} + +public: + void add_interval(bool ec_pool, const pg_interval_t &interval) { + ceph_assert(past_intervals); + return past_intervals->add_interval(ec_pool, interval); + } + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + if (past_intervals) { + __u8 type = 2; + encode(type, bl); + past_intervals->encode(bl); + } else { + encode((__u8)0, bl); + } + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl); + + void dump(Formatter *f) const { + ceph_assert(past_intervals); + past_intervals->dump(f); + } + static void generate_test_instances(list<PastIntervals *> & o); + + /** + * Determines whether there is an interval change + */ + static bool is_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector<int> &old_acting, + const vector<int> &new_acting, + int old_up_primary, + int new_up_primary, + const vector<int> &old_up, + const vector<int> &new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + unsigned old_pg_num, + unsigned new_pg_num, + unsigned old_pg_num_pending, + unsigned new_pg_num_pending, + bool old_sort_bitwise, + bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, + pg_t pgid + ); + + /** + * Determines whether there is an interval change + */ + static bool is_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of lastmap + const vector<int> &old_acting, ///< [in] acting as of lastmap + const vector<int> &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const vector<int> &old_up, ///< [in] up as of lastmap + const vector<int> &new_up, ///< [in] up as of osdmap + std::shared_ptr<const OSDMap> osdmap, ///< [in] current map + std::shared_ptr<const OSDMap> lastmap, ///< [in] last map + pg_t pgid ///< [in] pgid for pg + ); + + /** + * Integrates a new map into *past_intervals, returns true + * if an interval was closed out. 
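+ *
+ * Illustrative call sketch only (variable names here are placeholders,
+ * not taken from the real call sites in PG.cc):
+ *
+ *   PastIntervals pi;
+ *   bool closed = PastIntervals::check_new_interval(
+ *     old_acting_primary, new_acting_primary, old_acting, new_acting,
+ *     old_up_primary, new_up_primary, old_up, new_up,
+ *     same_interval_since, last_epoch_clean,
+ *     new_osdmap, last_osdmap, pgid,
+ *     recoverable_predicate.get(), &pi);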
+ */ + static bool check_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of osdmap + const vector<int> &old_acting, ///< [in] acting as of lastmap + const vector<int> &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const vector<int> &old_up, ///< [in] up as of lastmap + const vector<int> &new_up, ///< [in] up as of osdmap + epoch_t same_interval_since, ///< [in] as of osdmap + epoch_t last_epoch_clean, ///< [in] current + std::shared_ptr<const OSDMap> osdmap, ///< [in] current map + std::shared_ptr<const OSDMap> lastmap, ///< [in] last map + pg_t pgid, ///< [in] pgid for pg + IsPGRecoverablePredicate *could_have_gone_active, ///< [in] predicate whether the pg can be active + PastIntervals *past_intervals, ///< [out] intervals + ostream *out = 0 ///< [out] debug ostream + ); + + friend ostream& operator<<(ostream& out, const PastIntervals &i); + + template <typename F> + void iterate_mayberw_back_to( + epoch_t les, + F &&f) const { + ceph_assert(past_intervals); + past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f)); + } + void clear() { + ceph_assert(past_intervals); + past_intervals->clear(); + } + + /** + * Should return a value which gives an indication of the amount + * of state contained + */ + size_t size() const { + ceph_assert(past_intervals); + return past_intervals->size(); + } + + bool empty() const { + ceph_assert(past_intervals); + return past_intervals->empty(); + } + + void swap(PastIntervals &other) { + using std::swap; + swap(other.past_intervals, past_intervals); + } + + /** + * Return all shards which have been in the acting set back to the + * latest epoch to which we have trimmed except for pg_whoami + */ + set<pg_shard_t> get_might_have_unfound( + pg_shard_t pg_whoami, + bool ec_pool) const { + ceph_assert(past_intervals); + auto ret = past_intervals->get_all_participants(ec_pool); + ret.erase(pg_whoami); + return ret; + } + + /** + * Return all shards which we might want to talk to for peering + */ + set<pg_shard_t> get_all_probe( + bool ec_pool) const { + ceph_assert(past_intervals); + return past_intervals->get_all_participants(ec_pool); + } + + /* Return the set of epochs [start, end) represented by the + * past_interval set. + */ + pair<epoch_t, epoch_t> get_bounds() const { + ceph_assert(past_intervals); + return past_intervals->get_bounds(); + } + + void adjust_start_backwards(epoch_t last_epoch_clean) { + ceph_assert(past_intervals); + past_intervals->adjust_start_backwards(last_epoch_clean); + } + + enum osd_state_t { + UP, + DOWN, + DNE, + LOST + }; + struct PriorSet { + bool ec_pool = false; + set<pg_shard_t> probe; ///< current+prior OSDs we need to probe. + set<int> down; ///< down osds that would normally be in @a probe and might be interesting. + map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set + + bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set. 
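+ /// recoverability predicate; used below to decide whether the shards
+ /// still up from a past interval would have been sufficient for the PG
+ /// to have gone active (see its use in the PriorSet constructor)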
+ unique_ptr<IsPGRecoverablePredicate> pcontdec; + + PriorSet() = default; + PriorSet(PriorSet &&) = default; + PriorSet &operator=(PriorSet &&) = default; + + PriorSet &operator=(const PriorSet &) = delete; + PriorSet(const PriorSet &) = delete; + + bool operator==(const PriorSet &rhs) const { + return (ec_pool == rhs.ec_pool) && + (probe == rhs.probe) && + (down == rhs.down) && + (blocked_by == rhs.blocked_by) && + (pg_down == rhs.pg_down); + } + + bool affected_by_map( + const OSDMap &osdmap, + const DoutPrefixProvider *dpp) const; + + // For verifying tests + PriorSet( + bool ec_pool, + set<pg_shard_t> probe, + set<int> down, + map<int, epoch_t> blocked_by, + bool pg_down, + IsPGRecoverablePredicate *pcontdec) + : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by), + pg_down(pg_down), pcontdec(pcontdec) {} + + private: + template <typename F> + PriorSet( + const PastIntervals &past_intervals, + bool ec_pool, + epoch_t last_epoch_started, + IsPGRecoverablePredicate *c, + F f, + const vector<int> &up, + const vector<int> &acting, + const DoutPrefixProvider *dpp); + + friend class PastIntervals; + }; + + template <typename... Args> + PriorSet get_prior_set(Args&&... args) const { + return PriorSet(*this, std::forward<Args>(args)...); + } +}; +WRITE_CLASS_ENCODER(PastIntervals) + +ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i); +ostream& operator<<(ostream& out, const PastIntervals &i); +ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i); + +template <typename F> +PastIntervals::PriorSet::PriorSet( + const PastIntervals &past_intervals, + bool ec_pool, + epoch_t last_epoch_started, + IsPGRecoverablePredicate *c, + F f, + const vector<int> &up, + const vector<int> &acting, + const DoutPrefixProvider *dpp) + : ec_pool(ec_pool), pg_down(false), pcontdec(c) +{ + /* + * We have to be careful to gracefully deal with situations like + * so. Say we have a power outage or something that takes out both + * OSDs, but the monitor doesn't mark them down in the same epoch. + * The history may look like + * + * 1: A B + * 2: B + * 3: let's say B dies for good, too (say, from the power spike) + * 4: A + * + * which makes it look like B may have applied updates to the PG + * that we need in order to proceed. This sucks... + * + * To minimize the risk of this happening, we CANNOT go active if + * _any_ OSDs in the prior set are down until we send an MOSDAlive + * to the monitor such that the OSDMap sets osd_up_thru to an epoch. + * Then, we have something like + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: + * 4: A + * + * -> we can ignore B, bc it couldn't have gone active (alive_thru + * still 0). + * + * or, + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: B up_thru[B]=2 + * 4: + * 5: A + * + * -> we must wait for B, bc it was alive through 2, and could have + * written to the pg. + * + * If B is really dead, then an administrator will need to manually + * intervene by marking the OSD as "lost." + */ + + // Include current acting and up nodes... not because they may + // contain old data (this interval hasn't gone active, obviously), + // but because we want their pg_info to inform choose_acting(), and + // so that we know what they do/do not have explicitly before + // sending them any new info/logs/whatever. + for (unsigned i = 0; i < acting.size(); i++) { + if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */) + probe.insert(pg_shard_t(acting[i], ec_pool ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + // It may be possible to exclude the up nodes, but let's keep them in + // there for now. + for (unsigned i = 0; i < up.size(); i++) { + if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */) + probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + + set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool); + ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl; + for (auto &&i: all_probe) { + switch (f(0, i.osd, nullptr)) { + case UP: { + probe.insert(i); + break; + } + case DNE: + case LOST: + case DOWN: { + down.insert(i.osd); + break; + } + } + } + + past_intervals.iterate_mayberw_back_to( + last_epoch_started, + [&](epoch_t start, const set<pg_shard_t> &acting) { + ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start + << ", acting: " << acting << dendl; + + // look at candidate osds during this interval. each falls into + // one of three categories: up, down (but potentially + // interesting), or lost (down, but we won't wait for it). + set<pg_shard_t> up_now; + map<int, epoch_t> candidate_blocked_by; + // any candidates down now (that might have useful data) + bool any_down_now = false; + + // consider ACTING osds + for (auto &&so: acting) { + epoch_t lost_at = 0; + switch (f(start, so.osd, &lost_at)) { + case UP: { + // include past acting osds if they are up. + up_now.insert(so); + break; + } + case DNE: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " no longer exists" << dendl; + break; + } + case LOST: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " is down, but lost_at " << lost_at << dendl; + up_now.insert(so); + break; + } + case DOWN: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " is down" << dendl; + candidate_blocked_by[so.osd] = lost_at; + any_down_now = true; + break; + } + } + } + + // if not enough osds survived this interval, and we may have gone rw, + // then we need to wait for one of those osds to recover to + // ensure that we haven't lost any information. + if (!(*pcontdec)(up_now) && any_down_now) { + // fixme: how do we identify a "clean" shutdown anyway? + ldpp_dout(dpp, 10) << "build_prior possibly went active+rw," + << " insufficient up; including down osds" << dendl; + ceph_assert(!candidate_blocked_by.empty()); + pg_down = true; + blocked_by.insert( + candidate_blocked_by.begin(), + candidate_blocked_by.end()); + } + }); + + ldpp_dout(dpp, 10) << "build_prior final: probe " << probe + << " down " << down + << " blocked_by " << blocked_by + << (pg_down ? " pg_down":"") + << dendl; +} + +/** + * pg_query_t - used to ask a peer for information about a pg. + * + * note: if version=0, type=LOG, then we just provide our full log. 
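+ *
+ * (Editorial note, not in the upstream comment: INFO requests only the peer's
+ * pg_info_t, LOG requests entries newer than `since`, MISSING requests the
+ * peer's missing set, and FULLLOG requests the complete log.)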
+ */ +struct pg_query_t { + enum { + INFO = 0, + LOG = 1, + MISSING = 4, + FULLLOG = 5, + }; + std::string_view get_type_name() const { + switch (type) { + case INFO: return "info"; + case LOG: return "log"; + case MISSING: return "missing"; + case FULLLOG: return "fulllog"; + default: return "???"; + } + } + + __s32 type; + eversion_t since; + pg_history_t history; + epoch_t epoch_sent; + shard_id_t to; + shard_id_t from; + + pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD), + from(shard_id_t::NO_SHARD) {} + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + const pg_history_t& h, + epoch_t epoch_sent) + : type(t), + history(h), + epoch_sent(epoch_sent), + to(to), from(from) { + ceph_assert(t != LOG); + } + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + eversion_t s, + const pg_history_t& h, + epoch_t epoch_sent) + : type(t), since(s), history(h), + epoch_sent(epoch_sent), to(to), from(from) { + ceph_assert(t == LOG); + } + + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_query_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pg_query_t) + +inline ostream& operator<<(ostream& out, const pg_query_t& q) { + out << "query(" << q.get_type_name() << " " << q.since; + if (q.type == pg_query_t::LOG) + out << " " << q.history; + out << " epoch_sent " << q.epoch_sent; + out << ")"; + return out; +} + +class PGBackend; +class ObjectModDesc { + bool can_local_rollback; + bool rollback_info_completed; + + // version required to decode, reflected in encode/decode version + __u8 max_required_version = 1; +public: + class Visitor { + public: + virtual void append(uint64_t old_offset) {} + virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {} + virtual void rmobject(version_t old_version) {} + /** + * Used to support the unfound_lost_delete log event: if the stashed + * version exists, we unstash it, otherwise, we do nothing. 
This way + * each replica rolls back to whatever state it had prior to the attempt + * at mark unfound lost delete + */ + virtual void try_rmobject(version_t old_version) { + rmobject(old_version); + } + virtual void create() {} + virtual void update_snaps(const set<snapid_t> &old_snaps) {} + virtual void rollback_extents( + version_t gen, + const vector<pair<uint64_t, uint64_t> > &extents) {} + virtual ~Visitor() {} + }; + void visit(Visitor *visitor) const; + mutable bufferlist bl; + enum ModID { + APPEND = 1, + SETATTRS = 2, + DELETE = 3, + CREATE = 4, + UPDATE_SNAPS = 5, + TRY_DELETE = 6, + ROLLBACK_EXTENTS = 7 + }; + ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) { + bl.reassign_to_mempool(mempool::mempool_osd_pglog); + } + void claim(ObjectModDesc &other) { + bl.clear(); + bl.claim(other.bl); + can_local_rollback = other.can_local_rollback; + rollback_info_completed = other.rollback_info_completed; + } + void claim_append(ObjectModDesc &other) { + if (!can_local_rollback || rollback_info_completed) + return; + if (!other.can_local_rollback) { + mark_unrollbackable(); + return; + } + bl.claim_append(other.bl); + rollback_info_completed = other.rollback_info_completed; + } + void swap(ObjectModDesc &other) { + bl.swap(other.bl); + + using std::swap; + swap(other.can_local_rollback, can_local_rollback); + swap(other.rollback_info_completed, rollback_info_completed); + swap(other.max_required_version, max_required_version); + } + void append_id(ModID id) { + using ceph::encode; + uint8_t _id(id); + encode(_id, bl); + } + void append(uint64_t old_size) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(APPEND); + encode(old_size, bl); + ENCODE_FINISH(bl); + } + void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(SETATTRS); + encode(old_attrs, bl); + ENCODE_FINISH(bl); + } + bool rmobject(version_t deletion_version) { + if (!can_local_rollback || rollback_info_completed) + return false; + ENCODE_START(1, 1, bl); + append_id(DELETE); + encode(deletion_version, bl); + ENCODE_FINISH(bl); + rollback_info_completed = true; + return true; + } + bool try_rmobject(version_t deletion_version) { + if (!can_local_rollback || rollback_info_completed) + return false; + ENCODE_START(1, 1, bl); + append_id(TRY_DELETE); + encode(deletion_version, bl); + ENCODE_FINISH(bl); + rollback_info_completed = true; + return true; + } + void create() { + if (!can_local_rollback || rollback_info_completed) + return; + rollback_info_completed = true; + ENCODE_START(1, 1, bl); + append_id(CREATE); + ENCODE_FINISH(bl); + } + void update_snaps(const set<snapid_t> &old_snaps) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(UPDATE_SNAPS); + encode(old_snaps, bl); + ENCODE_FINISH(bl); + } + void rollback_extents( + version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) { + ceph_assert(can_local_rollback); + ceph_assert(!rollback_info_completed); + if (max_required_version < 2) + max_required_version = 2; + ENCODE_START(2, 2, bl); + append_id(ROLLBACK_EXTENTS); + encode(gen, bl); + encode(extents, bl); + ENCODE_FINISH(bl); + } + + // cannot be rolled back + void mark_unrollbackable() { + can_local_rollback = false; + bl.clear(); + } + bool can_rollback() const { + return can_local_rollback; + } + bool empty() const { + return can_local_rollback && (bl.length() == 
0); + } + + bool requires_kraken() const { + return max_required_version >= 2; + } + + /** + * Create fresh copy of bl bytes to avoid keeping large buffers around + * in the case that bl contains ptrs which point into a much larger + * message buffer + */ + void trim_bl() const { + if (bl.length() > 0) + bl.rebuild(); + } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ObjectModDesc*>& o); +}; +WRITE_CLASS_ENCODER(ObjectModDesc) + + +/** + * pg_log_entry_t - single entry/event in pg log + * + */ +struct pg_log_entry_t { + enum { + MODIFY = 1, // some unspecified modification (but not *all* modifications) + CLONE = 2, // cloned object from head + DELETE = 3, // deleted object + //BACKLOG = 4, // event invented by generate_backlog [obsolete] + LOST_REVERT = 5, // lost new version, revert to an older version. + LOST_DELETE = 6, // lost new version, revert to no object (deleted). + LOST_MARK = 7, // lost new version, now EIO + PROMOTE = 8, // promoted object from another tier + CLEAN = 9, // mark an object clean + ERROR = 10, // write that returned an error + }; + static const char *get_op_name(int op) { + switch (op) { + case MODIFY: + return "modify"; + case PROMOTE: + return "promote"; + case CLONE: + return "clone"; + case DELETE: + return "delete"; + case LOST_REVERT: + return "l_revert"; + case LOST_DELETE: + return "l_delete"; + case LOST_MARK: + return "l_mark"; + case CLEAN: + return "clean"; + case ERROR: + return "error"; + default: + return "unknown"; + } + } + const char *get_op_name() const { + return get_op_name(op); + } + + // describes state for a locally-rollbackable entry + ObjectModDesc mod_desc; + bufferlist snaps; // only for clone entries + hobject_t soid; + osd_reqid_t reqid; // caller+tid to uniquely identify request + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids; + + /// map extra_reqids by index to error return code (if any) + mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes; + + eversion_t version, prior_version, reverting_to; + version_t user_version; // the user version for this entry + utime_t mtime; // this is the _user_ mtime, mind you + int32_t return_code; // only stored for ERRORs for dup detection + + __s32 op; + bool invalid_hash; // only when decoding sobject_t based entries + bool invalid_pool; // only when decoding pool-less hobject based entries + + pg_log_entry_t() + : user_version(0), return_code(0), op(0), + invalid_hash(false), invalid_pool(false) { + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + pg_log_entry_t(int _op, const hobject_t& _soid, + const eversion_t& v, const eversion_t& pv, + version_t uv, + const osd_reqid_t& rid, const utime_t& mt, + int return_code) + : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv), + mtime(mt), return_code(return_code), op(_op), + invalid_hash(false), invalid_pool(false) { + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + bool is_clone() const { return op == CLONE; } + bool is_modify() const { return op == MODIFY; } + bool is_promote() const { return op == PROMOTE; } + bool is_clean() const { return op == CLEAN; } + bool is_lost_revert() const { return op == LOST_REVERT; } + bool is_lost_delete() const { return op == LOST_DELETE; } + bool is_lost_mark() const { return op == LOST_MARK; } + bool is_error() const { return op == ERROR; } + + bool is_update() const { + return + is_clone() || is_modify() || 
is_promote() || is_clean() || + is_lost_revert() || is_lost_mark(); + } + bool is_delete() const { + return op == DELETE || op == LOST_DELETE; + } + + bool can_rollback() const { + return mod_desc.can_rollback(); + } + + void mark_unrollbackable() { + mod_desc.mark_unrollbackable(); + } + + bool requires_kraken() const { + return mod_desc.requires_kraken(); + } + + // Errors are only used for dup detection, whereas + // the index by objects is used by recovery, copy_get, + // and other facilities that don't expect or need to + // be aware of error entries. + bool object_is_indexed() const { + return !is_error(); + } + + bool reqid_is_indexed() const { + return reqid != osd_reqid_t() && + (op == MODIFY || op == DELETE || op == ERROR); + } + + string get_key_name() const; + void encode_with_checksum(bufferlist& bl) const; + void decode_with_checksum(bufferlist::const_iterator& p); + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_log_entry_t*>& o); + +}; +WRITE_CLASS_ENCODER(pg_log_entry_t) + +ostream& operator<<(ostream& out, const pg_log_entry_t& e); + +struct pg_log_dup_t { + osd_reqid_t reqid; // caller+tid to uniquely identify request + eversion_t version; + version_t user_version; // the user version for this entry + int32_t return_code; // only stored for ERRORs for dup detection + + pg_log_dup_t() + : user_version(0), return_code(0) + {} + explicit pg_log_dup_t(const pg_log_entry_t& entry) + : reqid(entry.reqid), version(entry.version), + user_version(entry.user_version), return_code(entry.return_code) + {} + pg_log_dup_t(const eversion_t& v, version_t uv, + const osd_reqid_t& rid, int return_code) + : reqid(rid), version(v), user_version(uv), + return_code(return_code) + {} + + string get_key_name() const; + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_log_dup_t*>& o); + + bool operator==(const pg_log_dup_t &rhs) const { + return reqid == rhs.reqid && + version == rhs.version && + user_version == rhs.user_version && + return_code == rhs.return_code; + } + bool operator!=(const pg_log_dup_t &rhs) const { + return !(*this == rhs); + } + + friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e); +}; +WRITE_CLASS_ENCODER(pg_log_dup_t) + +std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e); + +/** + * pg_log_t - incremental log of recent pg changes. + * + * serves as a recovery queue for recent changes. + */ +struct pg_log_t { + /* + * head - newest entry (update|delete) + * tail - entry previous to oldest (update|delete) for which we have + * complete negative information. + * i.e. we can infer pg contents for any store whose last_update >= tail. 
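+ *
+ * (Editorial example, not in the upstream comment: with tail = 10'42 and
+ * head = 11'50 the log covers every update in (10'42, 11'50]; a replica whose
+ * last_update >= 10'42 can be caught up by replaying log entries, while an
+ * older replica needs backfill instead.)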
+ */ + eversion_t head; // newest entry + eversion_t tail; // version prior to oldest + +protected: + // We can rollback rollback-able entries > can_rollback_to + eversion_t can_rollback_to; + + // always <= can_rollback_to, indicates how far stashed rollback + // data can be found + eversion_t rollback_info_trimmed_to; + +public: + // the actual log + mempool::osd_pglog::list<pg_log_entry_t> log; + + // entries just for dup op detection ordered oldest to newest + mempool::osd_pglog::list<pg_log_dup_t> dups; + + pg_log_t() = default; + pg_log_t(const eversion_t &last_update, + const eversion_t &log_tail, + const eversion_t &can_rollback_to, + const eversion_t &rollback_info_trimmed_to, + mempool::osd_pglog::list<pg_log_entry_t> &&entries, + mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries) + : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), + rollback_info_trimmed_to(rollback_info_trimmed_to), + log(std::move(entries)), dups(std::move(dup_entries)) {} + pg_log_t(const eversion_t &last_update, + const eversion_t &log_tail, + const eversion_t &can_rollback_to, + const eversion_t &rollback_info_trimmed_to, + const std::list<pg_log_entry_t> &entries, + const std::list<pg_log_dup_t> &dup_entries) + : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), + rollback_info_trimmed_to(rollback_info_trimmed_to) { + for (auto &&entry: entries) { + log.push_back(entry); + } + for (auto &&entry: dup_entries) { + dups.push_back(entry); + } + } + + void clear() { + eversion_t z; + rollback_info_trimmed_to = can_rollback_to = head = tail = z; + log.clear(); + dups.clear(); + } + + eversion_t get_rollback_info_trimmed_to() const { + return rollback_info_trimmed_to; + } + eversion_t get_can_rollback_to() const { + return can_rollback_to; + } + + + pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) { + mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog; + oldlog.swap(log); + + eversion_t old_tail; + unsigned mask = ~((~0)<<split_bits); + for (auto i = oldlog.begin(); + i != oldlog.end(); + ) { + if ((i->soid.get_hash() & mask) == child_pgid.m_seed) { + childlog.push_back(*i); + } else { + log.push_back(*i); + } + oldlog.erase(i++); + } + + // osd_reqid is unique, so it doesn't matter if there are extra + // dup entries in each pg. To avoid storing oid with the dup + // entries, just copy the whole list. + auto childdups(dups); + + return pg_log_t( + head, + tail, + can_rollback_to, + rollback_info_trimmed_to, + std::move(childlog), + std::move(childdups)); + } + + mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) { + ceph_assert(newhead >= tail); + + mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end(); + mempool::osd_pglog::list<pg_log_entry_t> divergent; + while (true) { + if (p == log.begin()) { + // yikes, the whole thing is divergent! + using std::swap; + swap(divergent, log); + break; + } + --p; + if (p->version.version <= newhead.version) { + /* + * look at eversion.version here. we want to avoid a situation like: + * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529 + * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529 + * lower_bound = 100'9 + * i.e, same request, different version. If the eversion.version is > the + * lower_bound, we it is divergent. 
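+        * (Editorial note, not in the upstream comment: every entry strictly
+        * newer than newhead is spliced into `divergent`; head, and the
+        * rollback bounds if they exceed it, are then clamped to newhead.)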
+ */ + ++p; + divergent.splice(divergent.begin(), log, p, log.end()); + break; + } + ceph_assert(p->version > newhead); + } + head = newhead; + + if (can_rollback_to > newhead) + can_rollback_to = newhead; + + if (rollback_info_trimmed_to > newhead) + rollback_info_trimmed_to = newhead; + + return divergent; + } + + void merge_from(const vector<pg_log_t*>& slogs, eversion_t last_update) { + log.clear(); + + // sort and merge dups + multimap<eversion_t,pg_log_dup_t> sorted; + for (auto& d : dups) { + sorted.emplace(d.version, d); + } + for (auto l : slogs) { + for (auto& d : l->dups) { + sorted.emplace(d.version, d); + } + } + dups.clear(); + for (auto& i : sorted) { + dups.push_back(i.second); + } + + head = last_update; + tail = last_update; + can_rollback_to = last_update; + rollback_info_trimmed_to = last_update; + } + + bool empty() const { + return log.empty(); + } + + bool null() const { + return head.version == 0 && head.epoch == 0; + } + + size_t approx_size() const { + return head.version - tail.version; + } + + static void filter_log(spg_t import_pgid, const OSDMap &curmap, + const string &hit_set_namespace, const pg_log_t &in, + pg_log_t &out, pg_log_t &reject); + + /** + * copy entries from the tail of another pg_log_t + * + * @param other pg_log_t to copy from + * @param from copy entries after this version + */ + void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from); + + /** + * copy up to N entries + * + * @param other source log + * @param max max number of entries to copy + */ + void copy_up_to(CephContext* cct, const pg_log_t &other, int max); + + ostream& print(ostream& out) const; + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl, int64_t pool = -1); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_log_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_log_t) + +inline ostream& operator<<(ostream& out, const pg_log_t& log) +{ + out << "log((" << log.tail << "," << log.head << "], crt=" + << log.get_can_rollback_to() << ")"; + return out; +} + + +/** + * pg_missing_t - summary of missing objects. + * + * kept in memory, as a supplement to pg_log_t + * also used to pass missing info in messages. + */ +struct pg_missing_item { + eversion_t need, have; + enum missing_flags_t { + FLAG_NONE = 0, + FLAG_DELETE = 1, + } flags; + pg_missing_item() : flags(FLAG_NONE) {} + explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version + pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) { + set_delete(is_delete); + } + + void encode(bufferlist& bl, uint64_t features) const { + using ceph::encode; + if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) { + // encoding a zeroed eversion_t to differentiate between this and + // legacy unversioned encoding - a need value of 0'0 is not + // possible. This can be replaced with the legacy encoding + // macros post-luminous. 
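+      // (Editorial note, not in the upstream comment: decode() below keys off
+      // this sentinel -- a non-zero first eversion_t must be a legacy `need`
+      // value, otherwise the new need/have/flags layout follows.)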
+ eversion_t e; + encode(e, bl); + encode(need, bl); + encode(have, bl); + encode(static_cast<uint8_t>(flags), bl); + } else { + // legacy unversioned encoding + encode(need, bl); + encode(have, bl); + } + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + eversion_t e; + decode(e, bl); + if (e != eversion_t()) { + // legacy encoding, this is the need value + need = e; + decode(have, bl); + } else { + decode(need, bl); + decode(have, bl); + uint8_t f; + decode(f, bl); + flags = static_cast<missing_flags_t>(f); + } + } + + void set_delete(bool is_delete) { + flags = is_delete ? FLAG_DELETE : FLAG_NONE; + } + + bool is_delete() const { + return (flags & FLAG_DELETE) == FLAG_DELETE; + } + + string flag_str() const { + if (flags == FLAG_NONE) { + return "none"; + } else { + return "delete"; + } + } + + void dump(Formatter *f) const { + f->dump_stream("need") << need; + f->dump_stream("have") << have; + f->dump_stream("flags") << flag_str(); + } + static void generate_test_instances(list<pg_missing_item*>& o) { + o.push_back(new pg_missing_item); + o.push_back(new pg_missing_item); + o.back()->need = eversion_t(1, 2); + o.back()->have = eversion_t(1, 1); + o.push_back(new pg_missing_item); + o.back()->need = eversion_t(3, 5); + o.back()->have = eversion_t(3, 4); + o.back()->flags = FLAG_DELETE; + } + bool operator==(const pg_missing_item &rhs) const { + return need == rhs.need && have == rhs.have && flags == rhs.flags; + } + bool operator!=(const pg_missing_item &rhs) const { + return !(*this == rhs); + } +}; +WRITE_CLASS_ENCODER_FEATURES(pg_missing_item) +ostream& operator<<(ostream& out, const pg_missing_item &item); + +class pg_missing_const_i { +public: + virtual const map<hobject_t, pg_missing_item> & + get_items() const = 0; + virtual const map<version_t, hobject_t> &get_rmissing() const = 0; + virtual bool get_may_include_deletes() const = 0; + virtual unsigned int num_missing() const = 0; + virtual bool have_missing() const = 0; + virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0; + virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0; + virtual ~pg_missing_const_i() {} +}; + + +template <bool Track> +class ChangeTracker { +public: + void changed(const hobject_t &obj) {} + template <typename F> + void get_changed(F &&f) const {} + void flush() {} + bool is_clean() const { + return true; + } +}; +template <> +class ChangeTracker<true> { + set<hobject_t> _changed; +public: + void changed(const hobject_t &obj) { + _changed.insert(obj); + } + template <typename F> + void get_changed(F &&f) const { + for (auto const &i: _changed) { + f(i); + } + } + void flush() { + _changed.clear(); + } + bool is_clean() const { + return _changed.empty(); + } +}; + +template <bool TrackChanges> +class pg_missing_set : public pg_missing_const_i { + using item = pg_missing_item; + map<hobject_t, item> missing; // oid -> (need v, have v) + map<version_t, hobject_t> rmissing; // v -> oid + ChangeTracker<TrackChanges> tracker; + +public: + pg_missing_set() = default; + + template <typename missing_type> + pg_missing_set(const missing_type &m) { + missing = m.get_items(); + rmissing = m.get_rmissing(); + may_include_deletes = m.get_may_include_deletes(); + for (auto &&i: missing) + tracker.changed(i.first); + } + + bool may_include_deletes = false; + + const map<hobject_t, item> &get_items() const override { + return missing; + } + const map<version_t, hobject_t> &get_rmissing() const override { + return rmissing; + } + bool 
get_may_include_deletes() const override { + return may_include_deletes; + } + unsigned int num_missing() const override { + return missing.size(); + } + bool have_missing() const override { + return !missing.empty(); + } + bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override { + auto iter = missing.find(oid); + if (iter == missing.end()) + return false; + if (out) + *out = iter->second; + return true; + } + bool is_missing(const hobject_t& oid, eversion_t v) const override { + map<hobject_t, item>::const_iterator m = + missing.find(oid); + if (m == missing.end()) + return false; + const item &item(m->second); + if (item.need > v) + return false; + return true; + } + eversion_t get_oldest_need() const { + if (missing.empty()) { + return eversion_t(); + } + auto it = missing.find(rmissing.begin()->second); + ceph_assert(it != missing.end()); + return it->second.need; + } + + void claim(pg_missing_set& o) { + static_assert(!TrackChanges, "Can't use claim with TrackChanges"); + missing.swap(o.missing); + rmissing.swap(o.rmissing); + } + + /* + * this needs to be called in log order as we extend the log. it + * assumes missing is accurate up through the previous log entry. + */ + void add_next_event(const pg_log_entry_t& e) { + map<hobject_t, item>::iterator missing_it; + missing_it = missing.find(e.soid); + bool is_missing_divergent_item = missing_it != missing.end(); + if (e.prior_version == eversion_t() || e.is_clone()) { + // new object. + if (is_missing_divergent_item) { // use iterator + rmissing.erase((missing_it->second).need.version); + missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil + } else // create new element in missing map + missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil + } else if (is_missing_divergent_item) { + // already missing (prior). + rmissing.erase((missing_it->second).need.version); + (missing_it->second).need = e.version; // leave .have unchanged. 
+ missing_it->second.set_delete(e.is_delete()); + } else { + // not missing, we must have prior_version (if any) + ceph_assert(!is_missing_divergent_item); + missing[e.soid] = item(e.version, e.prior_version, e.is_delete()); + } + rmissing[e.version.version] = e.soid; + tracker.changed(e.soid); + } + + void revise_need(hobject_t oid, eversion_t need, bool is_delete) { + if (missing.count(oid)) { + rmissing.erase(missing[oid].need.version); + missing[oid].need = need; // no not adjust .have + missing[oid].set_delete(is_delete); + } else { + missing[oid] = item(need, eversion_t(), is_delete); + } + rmissing[need.version] = oid; + + tracker.changed(oid); + } + + void revise_have(hobject_t oid, eversion_t have) { + if (missing.count(oid)) { + tracker.changed(oid); + missing[oid].have = have; + } + } + + void add(const hobject_t& oid, eversion_t need, eversion_t have, + bool is_delete) { + missing[oid] = item(need, have, is_delete); + rmissing[need.version] = oid; + tracker.changed(oid); + } + + void rm(const hobject_t& oid, eversion_t v) { + std::map<hobject_t, item>::iterator p = missing.find(oid); + if (p != missing.end() && p->second.need <= v) + rm(p); + } + + void rm(std::map<hobject_t, item>::const_iterator m) { + tracker.changed(m->first); + rmissing.erase(m->second.need.version); + missing.erase(m); + } + + void got(const hobject_t& oid, eversion_t v) { + std::map<hobject_t, item>::iterator p = missing.find(oid); + ceph_assert(p != missing.end()); + ceph_assert(p->second.need <= v || p->second.is_delete()); + got(p); + } + + void got(std::map<hobject_t, item>::const_iterator m) { + tracker.changed(m->first); + rmissing.erase(m->second.need.version); + missing.erase(m); + } + + void split_into( + pg_t child_pgid, + unsigned split_bits, + pg_missing_set *omissing) { + omissing->may_include_deletes = may_include_deletes; + unsigned mask = ~((~0)<<split_bits); + for (map<hobject_t, item>::iterator i = missing.begin(); + i != missing.end(); + ) { + if ((i->first.get_hash() & mask) == child_pgid.m_seed) { + omissing->add(i->first, i->second.need, i->second.have, + i->second.is_delete()); + rm(i++); + } else { + ++i; + } + } + } + + void clear() { + for (auto const &i: missing) + tracker.changed(i.first); + missing.clear(); + rmissing.clear(); + } + + void encode(bufferlist &bl) const { + ENCODE_START(4, 2, bl); + encode(missing, bl, may_include_deletes ? 
CEPH_FEATURE_OSD_RECOVERY_DELETES : 0); + encode(may_include_deletes, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &bl, int64_t pool = -1) { + for (auto const &i: missing) + tracker.changed(i.first); + DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl); + decode(missing, bl); + if (struct_v >= 4) { + decode(may_include_deletes, bl); + } + DECODE_FINISH(bl); + + if (struct_v < 3) { + // Handle hobject_t upgrade + map<hobject_t, item> tmp; + for (map<hobject_t, item>::iterator i = + missing.begin(); + i != missing.end(); + ) { + if (!i->first.is_max() && i->first.pool == -1) { + hobject_t to_insert(i->first); + to_insert.pool = pool; + tmp[to_insert] = i->second; + missing.erase(i++); + } else { + ++i; + } + } + missing.insert(tmp.begin(), tmp.end()); + } + + for (map<hobject_t,item>::iterator it = + missing.begin(); + it != missing.end(); + ++it) + rmissing[it->second.need.version] = it->first; + for (auto const &i: missing) + tracker.changed(i.first); + } + void dump(Formatter *f) const { + f->open_array_section("missing"); + for (map<hobject_t,item>::const_iterator p = + missing.begin(); p != missing.end(); ++p) { + f->open_object_section("item"); + f->dump_stream("object") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_bool("may_include_deletes", may_include_deletes); + } + template <typename F> + void filter_objects(F &&f) { + for (auto i = missing.begin(); i != missing.end();) { + if (f(i->first)) { + rm(i++); + } else { + ++i; + } + } + } + static void generate_test_instances(list<pg_missing_set*>& o) { + o.push_back(new pg_missing_set); + o.push_back(new pg_missing_set); + o.back()->add( + hobject_t(object_t("foo"), "foo", 123, 456, 0, ""), + eversion_t(5, 6), eversion_t(5, 1), false); + o.push_back(new pg_missing_set); + o.back()->add( + hobject_t(object_t("foo"), "foo", 123, 456, 0, ""), + eversion_t(5, 6), eversion_t(5, 1), true); + o.back()->may_include_deletes = true; + } + template <typename F> + void get_changed(F &&f) const { + tracker.get_changed(f); + } + void flush() { + tracker.flush(); + } + bool is_clean() const { + return tracker.is_clean(); + } + template <typename missing_t> + bool debug_verify_from_init( + const missing_t &init_missing, + ostream *oss) const { + if (!TrackChanges) + return true; + auto check_missing(init_missing.get_items()); + tracker.get_changed([&](const hobject_t &hoid) { + check_missing.erase(hoid); + if (missing.count(hoid)) { + check_missing.insert(*(missing.find(hoid))); + } + }); + bool ok = true; + if (check_missing.size() != missing.size()) { + if (oss) { + *oss << "Size mismatch, check: " << check_missing.size() + << ", actual: " << missing.size() << "\n"; + } + ok = false; + } + for (auto &i: missing) { + if (!check_missing.count(i.first)) { + if (oss) + *oss << "check_missing missing " << i.first << "\n"; + ok = false; + } else if (check_missing[i.first] != i.second) { + if (oss) + *oss << "check_missing missing item mismatch on " << i.first + << ", check: " << check_missing[i.first] + << ", actual: " << i.second << "\n"; + ok = false; + } + } + if (oss && !ok) { + *oss << "check_missing: " << check_missing << "\n"; + set<hobject_t> changed; + tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); }); + *oss << "changed: " << changed << "\n"; + } + return ok; + } +}; +template <bool TrackChanges> +void encode( + const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) { + ENCODE_DUMP_PRE(); + c.encode(bl); + ENCODE_DUMP_POST(cl); +} 
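+// Editorial sketch (not part of the upstream header): these free helpers let
+// pg_missing_set participate in the generic encode()/decode() machinery, e.g.
+// (hypothetical usage):
+//
+//   pg_missing_t m;                                   // pg_missing_set<false>
+//   m.add(hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+//         eversion_t(5, 6), eversion_t(5, 1), false);
+//   bufferlist bl;
+//   encode(m, bl);
+//   auto p = std::cbegin(bl);
+//   pg_missing_t m2;
+//   decode(m2, p);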
+template <bool TrackChanges> +void decode(pg_missing_set<TrackChanges> &c, bufferlist::const_iterator &p) { + c.decode(p); +} +template <bool TrackChanges> +ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing) +{ + out << "missing(" << missing.num_missing() + << " may_include_deletes = " << missing.may_include_deletes; + //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; + out << ")"; + return out; +} + +using pg_missing_t = pg_missing_set<false>; +using pg_missing_tracker_t = pg_missing_set<true>; + + +/** + * pg list objects response format + * + */ +struct pg_nls_response_t { + collection_list_handle_t handle; + list<librados::ListObjectImpl> entries; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(handle, bl); + __u32 n = (__u32)entries.size(); + encode(n, bl); + for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) { + encode(i->nspace, bl); + encode(i->oid, bl); + encode(i->locator, bl); + } + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(handle, bl); + __u32 n; + decode(n, bl); + entries.clear(); + while (n--) { + librados::ListObjectImpl i; + decode(i.nspace, bl); + decode(i.oid, bl); + decode(i.locator, bl); + entries.push_back(i); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const { + f->dump_stream("handle") << handle; + f->open_array_section("entries"); + for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("object"); + f->dump_string("namespace", p->nspace); + f->dump_string("object", p->oid); + f->dump_string("key", p->locator); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(list<pg_nls_response_t*>& o) { + o.push_back(new pg_nls_response_t); + o.push_back(new pg_nls_response_t); + o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("", "one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("", "three", "")); + o.push_back(new pg_nls_response_t); + o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", "")); + o.push_back(new pg_nls_response_t); + o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("", "one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("", "three", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", "")); + } +}; + +WRITE_CLASS_ENCODER(pg_nls_response_t) + +// For backwards compatibility with older OSD requests +struct pg_ls_response_t { + collection_list_handle_t handle; + list<pair<object_t, string> > entries; + + void encode(bufferlist& bl) const { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(handle, bl); + encode(entries, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + 
__u8 v; + decode(v, bl); + ceph_assert(v == 1); + decode(handle, bl); + decode(entries, bl); + } + void dump(Formatter *f) const { + f->dump_stream("handle") << handle; + f->open_array_section("entries"); + for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("object"); + f->dump_stream("object") << p->first; + f->dump_string("key", p->second); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(list<pg_ls_response_t*>& o) { + o.push_back(new pg_ls_response_t); + o.push_back(new pg_ls_response_t); + o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, ""); + o.back()->entries.push_back(make_pair(object_t("one"), string())); + o.back()->entries.push_back(make_pair(object_t("two"), string("twokey"))); + } +}; + +WRITE_CLASS_ENCODER(pg_ls_response_t) + +/** + * object_copy_cursor_t + */ +struct object_copy_cursor_t { + uint64_t data_offset; + string omap_offset; + bool attr_complete; + bool data_complete; + bool omap_complete; + + object_copy_cursor_t() + : data_offset(0), + attr_complete(false), + data_complete(false), + omap_complete(false) + {} + + bool is_initial() const { + return !attr_complete && data_offset == 0 && omap_offset.empty(); + } + bool is_complete() const { + return attr_complete && data_complete && omap_complete; + } + + static void generate_test_instances(list<object_copy_cursor_t*>& o); + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(object_copy_cursor_t) + +/** + * object_copy_data_t + * + * Return data from a copy request. The semantics are a little strange + * as a result of the encoding's heritage. + * + * In particular, the sender unconditionally fills in the cursor (from what + * it receives and sends), the size, and the mtime, but is responsible for + * figuring out whether it should put any data in the attrs, data, or + * omap members (corresponding to xattrs, object data, and the omap entries) + * based on external data (the client includes a max amount to return with + * the copy request). The client then looks into the attrs, data, and/or omap + * based on the contents of the cursor. 
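+ *
+ * (Editorial note, not in the upstream comment: the intended loop is that the
+ * client issues copy-get with a default-constructed cursor, consumes whatever
+ * attrs/data/omap the sender chose to include, and reissues the request with
+ * the returned cursor until cursor.is_complete() is true.)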
+ */ +struct object_copy_data_t { + enum { + FLAG_DATA_DIGEST = 1<<0, + FLAG_OMAP_DIGEST = 1<<1, + }; + object_copy_cursor_t cursor; + uint64_t size; + utime_t mtime; + uint32_t data_digest, omap_digest; + uint32_t flags; + map<string, bufferlist> attrs; + bufferlist data; + bufferlist omap_header; + bufferlist omap_data; + + /// which snaps we are defined for (if a snap and not the head) + vector<snapid_t> snaps; + /// latest snap seq for the object (if head) + snapid_t snap_seq; + + /// recent reqids on this object + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids; + + /// map reqids by index to error return code (if any) + mempool::osd_pglog::map<uint32_t, int> reqid_return_codes; + + uint64_t truncate_seq; + uint64_t truncate_size; + +public: + object_copy_data_t() : + size((uint64_t)-1), data_digest(-1), + omap_digest(-1), flags(0), + truncate_seq(0), + truncate_size(0) {} + + static void generate_test_instances(list<object_copy_data_t*>& o); + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t) + +/** + * pg creation info + */ +struct pg_create_t { + epoch_t created; // epoch pg created + pg_t parent; // split from parent (if != pg_t()) + __s32 split_bits; + + pg_create_t() + : created(0), split_bits(0) {} + pg_create_t(unsigned c, pg_t p, int s) + : created(c), parent(p), split_bits(s) {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<pg_create_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_create_t) + +// ----------------------------------------- + +class ObjectExtent { + /** + * ObjectExtents are used for specifying IO behavior against RADOS + * objects when one is using the ObjectCacher. + * + * To use this in a real system, *every member* must be filled + * out correctly. In particular, make sure to initialize the + * oloc correctly, as its default values are deliberate poison + * and will cause internal ObjectCacher asserts. + * + * Similarly, your buffer_extents vector *must* specify a total + * size equal to your length. If the buffer_extents inadvertently + * contain less space than the length member specifies, you + * will get unintelligible asserts deep in the ObjectCacher. + * + * If you are trying to do testing and don't care about actual + * RADOS function, the simplest thing to do is to initialize + * the ObjectExtent (truncate_size can be 0), create a single entry + * in buffer_extents matching the length, and set oloc.pool to 0. + */ + public: + object_t oid; // object id + uint64_t objectno; + uint64_t offset; // in object + uint64_t length; // in object + uint64_t truncate_size; // in object + + object_locator_t oloc; // object locator (pool etc) + + vector<pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) 
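+  // Editorial example (not in the upstream header), following the guidance
+  // above; names and sizes are arbitrary:
+  //
+  //   ObjectExtent ex(object_t("obj.0"), 0, 0, 4096, 0);
+  //   ex.oloc.pool = 0;                         // default oloc is poison
+  //   ex.buffer_extents.push_back({0, 4096});   // extents must sum to length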
+ + ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {} + ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) : + oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { } +}; + +inline ostream& operator<<(ostream& out, const ObjectExtent &ex) +{ + return out << "extent(" + << ex.oid << " (" << ex.objectno << ") in " << ex.oloc + << " " << ex.offset << "~" << ex.length + << " -> " << ex.buffer_extents + << ")"; +} + + +// --------------------------------------- + +class OSDSuperblock { +public: + uuid_d cluster_fsid, osd_fsid; + int32_t whoami; // my role in this fs. + epoch_t current_epoch; // most recent epoch + epoch_t oldest_map, newest_map; // oldest/newest maps we have. + double weight; + + CompatSet compat_features; + + // last interval over which i mounted and was then active + epoch_t mounted; // last epoch i mounted + epoch_t clean_thru; // epoch i was active and clean thru + + OSDSuperblock() : + whoami(-1), + current_epoch(0), oldest_map(0), newest_map(0), weight(0), + mounted(0), clean_thru(0) { + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<OSDSuperblock*>& o); +}; +WRITE_CLASS_ENCODER(OSDSuperblock) + +inline ostream& operator<<(ostream& out, const OSDSuperblock& sb) +{ + return out << "sb(" << sb.cluster_fsid + << " osd." << sb.whoami + << " " << sb.osd_fsid + << " e" << sb.current_epoch + << " [" << sb.oldest_map << "," << sb.newest_map << "]" + << " lci=[" << sb.mounted << "," << sb.clean_thru << "]" + << ")"; +} + + +// ------- + + + + + + +/* + * attached to object head. describes most recent snap context, and + * set of existing clones. + */ +struct SnapSet { + snapid_t seq; + vector<snapid_t> snaps; // descending + vector<snapid_t> clones; // ascending + map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest + map<snapid_t, uint64_t> clone_size; + map<snapid_t, vector<snapid_t>> clone_snaps; // descending + + SnapSet() : seq(0) {} + explicit SnapSet(bufferlist& bl) { + auto p = std::cbegin(bl); + decode(p); + } + + /// populate SnapSet from a librados::snap_set_t + void from_snap_set(const librados::snap_set_t& ss, bool legacy); + + /// get space accounted to clone + uint64_t get_clone_bytes(snapid_t clone) const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<SnapSet*>& o); + + SnapContext get_ssc_as_of(snapid_t as_of) const { + SnapContext out; + out.seq = as_of; + for (vector<snapid_t>::const_iterator i = snaps.begin(); + i != snaps.end(); + ++i) { + if (*i <= as_of) + out.snaps.push_back(*i); + } + return out; + } + + + SnapSet get_filtered(const pg_pool_t &pinfo) const; + void filter(const pg_pool_t &pinfo); +}; +WRITE_CLASS_ENCODER(SnapSet) + +ostream& operator<<(ostream& out, const SnapSet& cs); + + + +#define OI_ATTR "_" +#define SS_ATTR "snapset" + +struct watch_info_t { + uint64_t cookie; + uint32_t timeout_seconds; + entity_addr_t addr; + + watch_info_t() : cookie(0), timeout_seconds(0) { } + watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {} + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<watch_info_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(watch_info_t) + 
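+// Editorial note (not in the upstream header): a watch_info_t describes one
+// client watch on an object -- `cookie` is the client-chosen identifier,
+// `timeout_seconds` how long the OSD keeps the watch alive without a ping or
+// reconnect, and `addr` where notifies for the object are delivered.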
+static inline bool operator==(const watch_info_t& l, const watch_info_t& r) { + return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds + && l.addr == r.addr; +} + +static inline ostream& operator<<(ostream& out, const watch_info_t& w) { + return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s" + << " " << w.addr << ")"; +} + +struct notify_info_t { + uint64_t cookie; + uint64_t notify_id; + uint32_t timeout; + bufferlist bl; +}; + +static inline ostream& operator<<(ostream& out, const notify_info_t& n) { + return out << "notify(cookie " << n.cookie + << " notify" << n.notify_id + << " " << n.timeout << "s)"; +} + +struct chunk_info_t { + typedef enum { + FLAG_DIRTY = 1, + FLAG_MISSING = 2, + FLAG_HAS_REFERENCE = 4, + FLAG_HAS_FINGERPRINT = 8, + } cflag_t; + uint32_t offset; + uint32_t length; + hobject_t oid; + cflag_t flags; // FLAG_* + + chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { } + + static string get_flag_string(uint64_t flags) { + string r; + if (flags & FLAG_DIRTY) { + r += "|dirty"; + } + if (flags & FLAG_MISSING) { + r += "|missing"; + } + if (flags & FLAG_HAS_REFERENCE) { + r += "|has_reference"; + } + if (flags & FLAG_HAS_FINGERPRINT) { + r += "|has_fingerprint"; + } + if (r.length()) + return r.substr(1); + return r; + } + bool test_flag(cflag_t f) const { + return (flags & f) == f; + } + void set_flag(cflag_t f) { + flags = (cflag_t)(flags | f); + } + void set_flags(cflag_t f) { + flags = f; + } + void clear_flag(cflag_t f) { + flags = (cflag_t)(flags & ~f); + } + void clear_flags() { + flags = (cflag_t)0; + } + bool is_dirty() const { + return test_flag(FLAG_DIRTY); + } + bool is_missing() const { + return test_flag(FLAG_MISSING); + } + bool has_reference() const { + return test_flag(FLAG_HAS_REFERENCE); + } + bool has_fingerprint() const { + return test_flag(FLAG_HAS_FINGERPRINT); + } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + friend ostream& operator<<(ostream& out, const chunk_info_t& ci); +}; +WRITE_CLASS_ENCODER(chunk_info_t) +ostream& operator<<(ostream& out, const chunk_info_t& ci); + +struct object_info_t; +struct object_manifest_t { + enum { + TYPE_NONE = 0, + TYPE_REDIRECT = 1, + TYPE_CHUNKED = 2, + }; + uint8_t type; // redirect, chunked, ... 
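+  // Editorial note (not in the upstream header): for TYPE_REDIRECT the whole
+  // object is stored at redirect_target and chunk_map is unused; for
+  // TYPE_CHUNKED the object is split into chunk_map entries keyed by offset,
+  // each chunk_info_t naming the backing object that holds that byte range.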
+  hobject_t redirect_target;
+  map <uint64_t, chunk_info_t> chunk_map;
+
+  object_manifest_t() : type(0) { }
+  object_manifest_t(uint8_t type, const hobject_t& redirect_target)
+    : type(type), redirect_target(redirect_target) { }
+
+  bool is_empty() const {
+    return type == TYPE_NONE;
+  }
+  bool is_redirect() const {
+    return type == TYPE_REDIRECT;
+  }
+  bool is_chunked() const {
+    return type == TYPE_CHUNKED;
+  }
+  static std::string_view get_type_name(uint8_t m) {
+    switch (m) {
+    case TYPE_NONE: return "none";
+    case TYPE_REDIRECT: return "redirect";
+    case TYPE_CHUNKED: return "chunked";
+    default: return "unknown";
+    }
+  }
+  std::string_view get_type_name() const {
+    return get_type_name(type);
+  }
+  void clear() {
+    type = 0;
+    redirect_target = hobject_t();
+    chunk_map.clear();
+  }
+  static void generate_test_instances(list<object_manifest_t*>& o);
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  void dump(Formatter *f) const;
+  friend ostream& operator<<(ostream& out, const object_info_t& oi);
+};
+WRITE_CLASS_ENCODER(object_manifest_t)
+ostream& operator<<(ostream& out, const object_manifest_t& oi);
+
+struct object_info_t {
+  hobject_t soid;
+  eversion_t version, prior_version;
+  version_t user_version;
+  osd_reqid_t last_reqid;
+
+  uint64_t size;
+  utime_t mtime;
+  utime_t local_mtime; // local mtime
+
+  // note: these are currently encoded into a total 16 bits; see
+  // encode()/decode() for the weirdness.
+  typedef enum {
+    FLAG_LOST = 1<<0,
+    FLAG_WHITEOUT = 1<<1, // object logically does not exist
+    FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
+    FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
+    FLAG_DATA_DIGEST = 1<<4, // has data crc
+    FLAG_OMAP_DIGEST = 1<<5, // has omap crc
+    FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
+    FLAG_MANIFEST = 1<<7, // has manifest
+    FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
+    FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
+  } flag_t;
+
+  flag_t flags;
+
+  static string get_flag_string(flag_t flags) {
+    string s;
+    vector<string> sv = get_flag_vector(flags);
+    for (auto ss : sv) {
+      s += string("|") + ss;
+    }
+    if (s.length())
+      return s.substr(1);
+    return s;
+  }
+  static vector<string> get_flag_vector(flag_t flags) {
+    vector<string> sv;
+    if (flags & FLAG_LOST)
+      sv.insert(sv.end(), "lost");
+    if (flags & FLAG_WHITEOUT)
+      sv.insert(sv.end(), "whiteout");
+    if (flags & FLAG_DIRTY)
+      sv.insert(sv.end(), "dirty");
+    if (flags & FLAG_USES_TMAP)
+      sv.insert(sv.end(), "uses_tmap");
+    if (flags & FLAG_OMAP)
+      sv.insert(sv.end(), "omap");
+    if (flags & FLAG_DATA_DIGEST)
+      sv.insert(sv.end(), "data_digest");
+    if (flags & FLAG_OMAP_DIGEST)
+      sv.insert(sv.end(), "omap_digest");
+    if (flags & FLAG_CACHE_PIN)
+      sv.insert(sv.end(), "cache_pin");
+    if (flags & FLAG_MANIFEST)
+      sv.insert(sv.end(), "manifest");
+    if (flags & FLAG_REDIRECT_HAS_REFERENCE)
+      sv.insert(sv.end(), "redirect_has_reference");
+    return sv;
+  }
+  string get_flag_string() const {
+    return get_flag_string(flags);
+  }
+
+  uint64_t truncate_seq, truncate_size;
+
+  map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
+
+  // opportunistic checksums; may or may not be present
+  __u32 data_digest; ///< data crc32c
+  __u32 omap_digest; ///< omap crc32c
+
+  // alloc hint attribute
+  uint64_t expected_object_size, expected_write_size;
+  uint32_t alloc_hint_flags;
+
+  struct object_manifest_t manifest;
+
+  void copy_user_bits(const object_info_t& other);
+
+  bool test_flag(flag_t f) const {
+    return (flags & f) == f;
+  }
+  void set_flag(flag_t f) {
+    flags = (flag_t)(flags | f);
+  }
+  void clear_flag(flag_t f) {
+    flags = (flag_t)(flags & ~f);
+  }
+  bool is_lost() const {
+    return test_flag(FLAG_LOST);
+  }
+  bool is_whiteout() const {
+    return test_flag(FLAG_WHITEOUT);
+  }
+  bool is_dirty() const {
+    return test_flag(FLAG_DIRTY);
+  }
+  bool is_omap() const {
+    return test_flag(FLAG_OMAP);
+  }
+  bool is_data_digest() const {
+    return test_flag(FLAG_DATA_DIGEST);
+  }
+  bool is_omap_digest() const {
+    return test_flag(FLAG_OMAP_DIGEST);
+  }
+  bool is_cache_pinned() const {
+    return test_flag(FLAG_CACHE_PIN);
+  }
+  bool has_manifest() const {
+    return test_flag(FLAG_MANIFEST);
+  }
+  void set_data_digest(__u32 d) {
+    set_flag(FLAG_DATA_DIGEST);
+    data_digest = d;
+  }
+  void set_omap_digest(__u32 d) {
+    set_flag(FLAG_OMAP_DIGEST);
+    omap_digest = d;
+  }
+  void clear_data_digest() {
+    clear_flag(FLAG_DATA_DIGEST);
+    data_digest = -1;
+  }
+  void clear_omap_digest() {
+    clear_flag(FLAG_OMAP_DIGEST);
+    omap_digest = -1;
+  }
+  void new_object() {
+    clear_data_digest();
+    clear_omap_digest();
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& bl);
+  void decode(bufferlist& bl) {
+    auto p = std::cbegin(bl);
+    decode(p);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<object_info_t*>& o);
+
+  explicit object_info_t()
+    : user_version(0), size(0), flags((flag_t)0),
+      truncate_seq(0), truncate_size(0),
+      data_digest(-1), omap_digest(-1),
+      expected_object_size(0), expected_write_size(0),
+      alloc_hint_flags(0)
+  {}
+
+  explicit object_info_t(const hobject_t& s)
+    : soid(s),
+      user_version(0), size(0), flags((flag_t)0),
+      truncate_seq(0), truncate_size(0),
+      data_digest(-1), omap_digest(-1),
+      expected_object_size(0), expected_write_size(0),
+      alloc_hint_flags(0)
+  {}
+
+  explicit object_info_t(bufferlist& bl) {
+    decode(bl);
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(object_info_t)
+
+ostream& operator<<(ostream& out, const object_info_t& oi);
+
+
+
+// Object recovery
+struct ObjectRecoveryInfo {
+  hobject_t soid;
+  eversion_t version;
+  uint64_t size;
+  object_info_t oi;
+  SnapSet ss; // only populated if soid is_snap()
+  interval_set<uint64_t> copy_subset;
+  map<hobject_t, interval_set<uint64_t>> clone_subset;
+
+  ObjectRecoveryInfo() : size(0) { }
+
+  static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
+  void encode(bufferlist &bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator &bl, int64_t pool = -1);
+  ostream &print(ostream &out) const;
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
+ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
+
+struct ObjectRecoveryProgress {
+  uint64_t data_recovered_to;
+  string omap_recovered_to;
+  bool first;
+  bool data_complete;
+  bool omap_complete;
+  bool error = false;
+
+  ObjectRecoveryProgress()
+    : data_recovered_to(0),
+      first(true),
+      data_complete(false), omap_complete(false) { }
+
+  bool is_complete(const ObjectRecoveryInfo& info) const {
+    return (data_recovered_to >= (
+      info.copy_subset.empty() ?
+      0 : info.copy_subset.range_end())) &&
+      omap_complete;
+  }
+
+  static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  ostream &print(ostream &out) const;
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
+ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
+
+struct PushReplyOp {
+  hobject_t soid;
+
+  static void generate_test_instances(list<PushReplyOp*>& o);
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  ostream &print(ostream &out) const;
+  void dump(Formatter *f) const;
+
+  uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER(PushReplyOp)
+ostream& operator<<(ostream& out, const PushReplyOp &op);
+
+struct PullOp {
+  hobject_t soid;
+
+  ObjectRecoveryInfo recovery_info;
+  ObjectRecoveryProgress recovery_progress;
+
+  static void generate_test_instances(list<PullOp*>& o);
+  void encode(bufferlist &bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator &bl);
+  ostream &print(ostream &out) const;
+  void dump(Formatter *f) const;
+
+  uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(PullOp)
+ostream& operator<<(ostream& out, const PullOp &op);
+
+struct PushOp {
+  hobject_t soid;
+  eversion_t version;
+  bufferlist data;
+  interval_set<uint64_t> data_included;
+  bufferlist omap_header;
+  map<string, bufferlist> omap_entries;
+  map<string, bufferlist> attrset;
+
+  ObjectRecoveryInfo recovery_info;
+  ObjectRecoveryProgress before_progress;
+  ObjectRecoveryProgress after_progress;
+
+  static void generate_test_instances(list<PushOp*>& o);
+  void encode(bufferlist &bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator &bl);
+  ostream &print(ostream &out) const;
+  void dump(Formatter *f) const;
+
+  uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(PushOp)
+ostream& operator<<(ostream& out, const PushOp &op);
+
+
+/*
+ * summarize pg contents for purposes of a scrub
+ */
+struct ScrubMap {
+  struct object {
+    map<string,bufferptr> attrs;
+    uint64_t size;
+    __u32 omap_digest; ///< omap crc32c
+    __u32 digest; ///< data crc32c
+    bool negative:1;
+    bool digest_present:1;
+    bool omap_digest_present:1;
+    bool read_error:1;
+    bool stat_error:1;
+    bool ec_hash_mismatch:1;
+    bool ec_size_mismatch:1;
+    bool large_omap_object_found:1;
+    uint64_t large_omap_object_key_count = 0;
+    uint64_t large_omap_object_value_size = 0;
+    uint64_t object_omap_bytes = 0;
+    uint64_t object_omap_keys = 0;
+
+    object() :
+      // Init invalid size so it won't match if we get a stat EIO error
+      size(-1), omap_digest(0), digest(0),
+      negative(false), digest_present(false), omap_digest_present(false),
+      read_error(false), stat_error(false), ec_hash_mismatch(false),
+      ec_size_mismatch(false), large_omap_object_found(false) {}
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& bl);
+    void dump(Formatter *f) const;
+    static void generate_test_instances(list<object*>& o);
+  };
+  WRITE_CLASS_ENCODER(object)
+
+  map<hobject_t,object> objects;
+  eversion_t valid_through;
+  eversion_t incr_since;
+  bool has_large_omap_object_errors:1;
+  bool has_omap_keys:1;
+
+  void merge_incr(const ScrubMap &l);
+  void clear_from(const hobject_t& start) {
+    objects.erase(objects.lower_bound(start), objects.end());
+  }
+  void insert(const ScrubMap &r) {
+    objects.insert(r.objects.begin(), r.objects.end());
+  }
+  void swap(ScrubMap &r) {
+    using std::swap;
+    swap(objects, r.objects);
+    swap(valid_through, r.valid_through);
+    swap(incr_since, r.incr_since);
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl, int64_t pool=-1);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<ScrubMap*>& o);
+};
+WRITE_CLASS_ENCODER(ScrubMap::object)
+WRITE_CLASS_ENCODER(ScrubMap)
+
+struct ScrubMapBuilder {
+  bool deep = false;
+  vector<hobject_t> ls;
+  size_t pos = 0;
+  int64_t data_pos = 0;
+  string omap_pos;
+  int ret = 0;
+  bufferhash data_hash, omap_hash; ///< accumulating hash value
+  uint64_t omap_keys = 0;
+  uint64_t omap_bytes = 0;
+
+  bool empty() {
+    return ls.empty();
+  }
+  bool done() {
+    return pos >= ls.size();
+  }
+  void reset() {
+    *this = ScrubMapBuilder();
+  }
+
+  bool data_done() {
+    return data_pos < 0;
+  }
+
+  void next_object() {
+    ++pos;
+    data_pos = 0;
+    omap_pos.clear();
+    omap_keys = 0;
+    omap_bytes = 0;
+  }
+
+  friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
+    out << "(" << pos.pos << "/" << pos.ls.size();
+    if (pos.pos < pos.ls.size()) {
+      out << " " << pos.ls[pos.pos];
+    }
+    if (pos.data_pos < 0) {
+      out << " byte " << pos.data_pos;
+    }
+    if (!pos.omap_pos.empty()) {
+      out << " key " << pos.omap_pos;
+    }
+    if (pos.deep) {
+      out << " deep";
+    }
+    if (pos.ret) {
+      out << " ret " << pos.ret;
+    }
+    return out << ")";
+  }
+};
+
+struct OSDOp {
+  ceph_osd_op op;
+  sobject_t soid;
+
+  bufferlist indata, outdata;
+  errorcode32_t rval;
+
+  OSDOp() : rval(0) {
+    // FIPS zeroization audit 20191115: this memset clean for security
+    memset(&op, 0, sizeof(ceph_osd_op));
+  }
+
+  /**
+   * split a bufferlist into constituent indata members of a vector of OSDOps
+   *
+   * @param ops [out] vector of OSDOps
+   * @param in [in] combined data buffer
+   */
+  static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);
+
+  /**
+   * merge indata members of a vector of OSDOp into a single bufferlist
+   *
+   * Notably this also encodes certain other OSDOp data into the data
+   * buffer, including the sobject_t soid.
+   *
+   * @param ops [in] vector of OSDOps
+   * @param out [out] combined data buffer
+   */
+  static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);
+
+  /**
+   * split a bufferlist into constituent outdata members of a vector of OSDOps
+   *
+   * @param ops [out] vector of OSDOps
+   * @param in [in] combined data buffer
+   */
+  static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);
+
+  /**
+   * merge outdata members of a vector of OSDOps into a single bufferlist
+   *
+   * @param ops [in] vector of OSDOps
+   * @param out [out] combined data buffer
+   */
+  static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);
+
+  /**
+   * Clear data as much as possible, leave minimal data for historical op dump
+   *
+   * @param ops [in] vector of OSDOps
+   */
+  static void clear_data(vector<OSDOp>& ops);
+};
+
+ostream& operator<<(ostream& out, const OSDOp& op);
+
+struct watch_item_t {
+  entity_name_t name;
+  uint64_t cookie;
+  uint32_t timeout_seconds;
+  entity_addr_t addr;
+
+  watch_item_t() : cookie(0), timeout_seconds(0) { }
+  watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
+               const entity_addr_t& addr)
+    : name(name), cookie(cookie), timeout_seconds(timeout),
+      addr(addr) { }
+
+  void encode(bufferlist &bl, uint64_t features) const {
+    ENCODE_START(2, 1, bl);
+    encode(name, bl);
+    encode(cookie, bl);
+    encode(timeout_seconds, bl);
+    encode(addr, bl, features);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator &bl) {
+    DECODE_START(2, bl);
+    decode(name, bl);
+    decode(cookie, bl);
+    decode(timeout_seconds, bl);
+    if (struct_v >= 2) {
+      decode(addr, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
+
+struct obj_watch_item_t {
+  hobject_t obj;
+  watch_item_t wi;
+};
+
+/**
+ * obj list watch response format
+ *
+ */
+struct obj_list_watch_response_t {
+  list<watch_item_t> entries;
+
+  void encode(bufferlist& bl, uint64_t features) const {
+    ENCODE_START(1, 1, bl);
+    encode(entries, bl, features);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(entries, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const {
+    f->open_array_section("entries");
+    for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
+      f->open_object_section("watch");
+      f->dump_stream("watcher") << p->name;
+      f->dump_int("cookie", p->cookie);
+      f->dump_int("timeout", p->timeout_seconds);
+      f->open_object_section("addr");
+      p->addr.dump(f);
+      f->close_section();
+      f->close_section();
+    }
+    f->close_section();
+  }
+  static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
+    entity_addr_t ea;
+    o.push_back(new obj_list_watch_response_t);
+    o.push_back(new obj_list_watch_response_t);
+    ea.set_type(entity_addr_t::TYPE_LEGACY);
+    ea.set_nonce(1000);
+    ea.set_family(AF_INET);
+    ea.set_in4_quad(0, 127);
+    ea.set_in4_quad(1, 0);
+    ea.set_in4_quad(2, 0);
+    ea.set_in4_quad(3, 1);
+    ea.set_port(1024);
+    o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
+    ea.set_nonce(1001);
+    ea.set_in4_quad(3, 2);
+    ea.set_port(1025);
+    o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
+
+struct clone_info {
+  snapid_t cloneid;
+  vector<snapid_t> snaps; // ascending
+  vector< pair<uint64_t,uint64_t> > overlap;
+  uint64_t size;
+
+  clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(cloneid, bl);
+    encode(snaps, bl);
+    encode(overlap, bl);
+    encode(size, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(cloneid, bl);
+    decode(snaps, bl);
+    decode(overlap, bl);
+    decode(size, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const {
+    if (cloneid == CEPH_NOSNAP)
+      f->dump_string("cloneid", "HEAD");
+    else
+      f->dump_unsigned("cloneid", cloneid.val);
+    f->open_array_section("snapshots");
+    for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
+      f->open_object_section("snap");
+      f->dump_unsigned("id", p->val);
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("overlaps");
+    for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
+         q != overlap.end(); ++q) {
+      f->open_object_section("overlap");
+      f->dump_unsigned("offset", q->first);
+      f->dump_unsigned("length", q->second);
+      f->close_section();
+    }
+    f->close_section();
+    f->dump_unsigned("size", size);
+  }
+  static void generate_test_instances(list<clone_info*>& o) {
+    o.push_back(new clone_info);
+    o.push_back(new clone_info);
+    o.back()->cloneid = 1;
+    o.back()->snaps.push_back(1);
+    o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
+    o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
+    o.back()->size = 16384;
+    o.push_back(new clone_info);
+    o.back()->cloneid = CEPH_NOSNAP;
+    o.back()->size = 32768;
+  }
+};
+WRITE_CLASS_ENCODER(clone_info)
+
+/**
+ * obj list snaps response format
+ *
+ */
+struct obj_list_snap_response_t {
+  vector<clone_info> clones; // ascending
+  snapid_t seq;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(clones, bl);
+    encode(seq, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(clones, bl);
+    if (struct_v >= 2)
+      decode(seq, bl);
+    else
+      seq = CEPH_NOSNAP;
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const {
+    f->open_array_section("clones");
+    for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
+      f->open_object_section("clone");
+      p->dump(f);
+      f->close_section();
+    }
+    f->dump_unsigned("seq", seq);
+    f->close_section();
+  }
+  static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
+    o.push_back(new obj_list_snap_response_t);
+    o.push_back(new obj_list_snap_response_t);
+    clone_info cl;
+    cl.cloneid = 1;
+    cl.snaps.push_back(1);
+    cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
+    cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
+    cl.size = 16384;
+    o.back()->clones.push_back(cl);
+    cl.cloneid = CEPH_NOSNAP;
+    cl.snaps.clear();
+    cl.overlap.clear();
+    cl.size = 32768;
+    o.back()->clones.push_back(cl);
+    o.back()->seq = 123;
+  }
+};
+
+WRITE_CLASS_ENCODER(obj_list_snap_response_t)
+
+// PromoteCounter
+
+struct PromoteCounter {
+  std::atomic<unsigned long long> attempts{0};
+  std::atomic<unsigned long long> objects{0};
+  std::atomic<unsigned long long> bytes{0};
+
+  void attempt() {
+    attempts++;
+  }
+
+  void finish(uint64_t size) {
+    objects++;
+    bytes += size;
+  }
+
+  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
+    *a = attempts;
+    *o = objects;
+    *b = bytes;
+    attempts = *a / 2;
+    objects = *o / 2;
+    bytes = *b / 2;
+  }
+};
+
+struct pool_pg_num_history_t {
+  /// last epoch updated
+  epoch_t epoch = 0;
+  /// poolid -> epoch -> pg_num
+  map<int64_t,map<epoch_t,uint32_t>> pg_nums;
+  /// pair(epoch, poolid)
+  set<pair<epoch_t,int64_t>> deleted_pools;
+
+  void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
+    pg_nums[pool][epoch] = pg_num;
+  }
+  void log_pool_delete(epoch_t epoch, int64_t pool) {
+    deleted_pools.insert(make_pair(epoch, pool));
+  }
+
+  /// prune history based on oldest osdmap epoch in the cluster
+  void prune(epoch_t oldest_epoch) {
+    auto i = deleted_pools.begin();
+    while (i != deleted_pools.end()) {
+      if (i->first >= oldest_epoch) {
+        break;
+      }
+      pg_nums.erase(i->second);
+      i = deleted_pools.erase(i);
+    }
+    for (auto& j : pg_nums) {
+      auto k = j.second.lower_bound(oldest_epoch);
+      // keep this and the entry before it (just to be paranoid)
+      if (k != j.second.begin()) {
+        --k;
+        j.second.erase(j.second.begin(), k);
+      }
+    }
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(epoch, bl);
+    encode(pg_nums, bl);
+    encode(deleted_pools, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(epoch, p);
+    decode(pg_nums, p);
+    decode(deleted_pools, p);
+    DECODE_FINISH(p);
+  }
+  void dump(Formatter *f) const {
+    f->dump_unsigned("epoch", epoch);
+    f->open_object_section("pools");
+    for (auto& i : pg_nums) {
+      f->open_object_section("pool");
+      f->dump_unsigned("pool_id", i.first);
+      f->open_array_section("changes");
+      for (auto& j : i.second) {
+        f->open_object_section("change");
+        f->dump_unsigned("epoch", j.first);
+        f->dump_unsigned("pg_num", j.second);
+        f->close_section();
+      }
+      f->close_section();
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("deleted_pools");
+    for (auto& i : deleted_pools) {
+      f->open_object_section("deletion");
+      f->dump_unsigned("pool_id", i.second);
+      f->dump_unsigned("epoch", i.first);
+      f->close_section();
+    }
+    f->close_section();
+  }
+  static void generate_test_instances(list<pool_pg_num_history_t*>& ls) {
+    ls.push_back(new pool_pg_num_history_t);
+  }
+  friend ostream& operator<<(ostream& out, const pool_pg_num_history_t& h) {
+    return out << "pg_num_history(e" << h.epoch
+               << " pg_nums " << h.pg_nums
+               << " deleted_pools " << h.deleted_pools
+               << ")";
+  }
+};
+WRITE_CLASS_ENCODER(pool_pg_num_history_t)
+
+// omap specific stats
+struct omap_stat_t {
+  int large_omap_objects;
+  int64_t omap_bytes;
+  int64_t omap_keys;
+};
+
+#endif
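For readers skimming the header above, a minimal usage sketch of the pool_pg_num_history_t helper may be useful. This snippet is not part of the diff; the epoch and pool numbers are invented, the demo function name is hypothetical, and it assumes osd/osd_types.h is available on the include path. It only exercises the members shown above (log_pg_num_change, log_pool_delete, prune, and the friend operator<<).

// Sketch: record pg_num changes and a pool deletion, then prune once the
// oldest osdmap epoch still referenced anywhere in the cluster advances.
#include <iostream>
#include "osd/osd_types.h"

void pg_num_history_demo() {
  pool_pg_num_history_t h;
  h.log_pg_num_change(100, 1, 64);   // epoch 100: pool 1 has 64 PGs
  h.log_pg_num_change(120, 1, 128);  // epoch 120: pool 1 split to 128 PGs
  h.log_pool_delete(130, 2);         // epoch 130: pool 2 removed
  h.prune(125);                      // drop history older than epoch 125;
                                     // the pool 2 deletion (epoch 130) is kept
  std::cout << h << std::endl;       // prints via the operator<< defined above
}

Note that prune() deliberately keeps one entry older than the cutoff per pool ("just to be paranoid"), so after prune(125) the epoch 120 change for pool 1 is still recorded.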