diff options
Diffstat (limited to '')
-rw-r--r-- | src/osdc/CMakeLists.txt | 9 | ||||
-rw-r--r-- | src/osdc/Filer.cc | 487 | ||||
-rw-r--r-- | src/osdc/Filer.h | 302 | ||||
-rw-r--r-- | src/osdc/Journaler.cc | 1607 | ||||
-rw-r--r-- | src/osdc/Journaler.h | 540 | ||||
-rw-r--r-- | src/osdc/ObjectCacher.cc | 2800 | ||||
-rw-r--r-- | src/osdc/ObjectCacher.h | 774 | ||||
-rw-r--r-- | src/osdc/Objecter.cc | 5285 | ||||
-rw-r--r-- | src/osdc/Objecter.h | 3067 | ||||
-rw-r--r-- | src/osdc/Striper.cc | 411 | ||||
-rw-r--r-- | src/osdc/Striper.h | 113 | ||||
-rw-r--r-- | src/osdc/WritebackHandler.h | 57 |
12 files changed, 15452 insertions, 0 deletions
diff --git a/src/osdc/CMakeLists.txt b/src/osdc/CMakeLists.txt new file mode 100644 index 00000000..ef34e629 --- /dev/null +++ b/src/osdc/CMakeLists.txt @@ -0,0 +1,9 @@ +set(osdc_files + Filer.cc + ObjectCacher.cc + Objecter.cc + Striper.cc) +add_library(osdc STATIC ${osdc_files}) +if(WITH_LTTNG AND WITH_EVENTTRACE) + add_dependencies(osdc eventtrace_tp) +endif() diff --git a/src/osdc/Filer.cc b/src/osdc/Filer.cc new file mode 100644 index 00000000..086daf71 --- /dev/null +++ b/src/osdc/Filer.cc @@ -0,0 +1,487 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include <mutex> +#include <algorithm> +#include "Filer.h" +#include "osd/OSDMap.h" +#include "Striper.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDMap.h" + +#include "msg/Messenger.h" + +#include "include/Context.h" + +#include "common/Finisher.h" +#include "common/config.h" + +#define dout_subsys ceph_subsys_filer +#undef dout_prefix +#define dout_prefix *_dout << objecter->messenger->get_myname() << ".filer " + +class Filer::C_Probe : public Context { +public: + Filer *filer; + Probe *probe; + object_t oid; + uint64_t size; + ceph::real_time mtime; + C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), + size(0) {} + void finish(int r) override { + if (r == -ENOENT) { + r = 0; + ceph_assert(size == 0); + } + + bool probe_complete; + { + Probe::unique_lock pl(probe->lock); + if (r != 0) { + probe->err = r; + } + + probe_complete = filer->_probed(probe, oid, size, mtime, pl); + ceph_assert(!pl.owns_lock()); + } + if (probe_complete) { + 
probe->onfinish->complete(probe->err); + delete probe; + } + } +}; + +int Filer::probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, // LB, when !fwd + ceph::real_time *pmtime, + bool fwd, + int flags, + Context *onfinish) +{ + ldout(cct, 10) << "probe " << (fwd ? "fwd ":"bwd ") + << hex << ino << dec + << " starting from " << start_from + << dendl; + + ceph_assert(snapid); // (until there is a non-NOSNAP write) + + Probe *probe = new Probe(ino, *layout, snapid, start_from, end, pmtime, + flags, fwd, onfinish); + + return probe_impl(probe, layout, start_from, end); +} + +int Filer::probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, // LB, when !fwd + utime_t *pmtime, + bool fwd, + int flags, + Context *onfinish) +{ + ldout(cct, 10) << "probe " << (fwd ? "fwd ":"bwd ") + << hex << ino << dec + << " starting from " << start_from + << dendl; + + ceph_assert(snapid); // (until there is a non-NOSNAP write) + + Probe *probe = new Probe(ino, *layout, snapid, start_from, end, pmtime, + flags, fwd, onfinish); + return probe_impl(probe, layout, start_from, end); +} + +int Filer::probe_impl(Probe* probe, file_layout_t *layout, + uint64_t start_from, uint64_t *end) // LB, when !fwd +{ + // period (bytes before we jump unto a new set of object(s)) + uint64_t period = layout->get_period(); + + // start with 1+ periods. 
+ probe->probing_len = period; + if (probe->fwd) { + if (start_from % period) + probe->probing_len += period - (start_from % period); + } else { + ceph_assert(start_from > *end); + if (start_from % period) + probe->probing_len -= period - (start_from % period); + probe->probing_off -= probe->probing_len; + } + + Probe::unique_lock pl(probe->lock); + _probe(probe, pl); + ceph_assert(!pl.owns_lock()); + + return 0; +} + + + +/** + * probe->lock must be initially locked, this function will release it + */ +void Filer::_probe(Probe *probe, Probe::unique_lock& pl) +{ + ceph_assert(pl.owns_lock() && pl.mutex() == &probe->lock); + + ldout(cct, 10) << "_probe " << hex << probe->ino << dec + << " " << probe->probing_off << "~" << probe->probing_len + << dendl; + + // map range onto objects + probe->known_size.clear(); + probe->probing.clear(); + Striper::file_to_extents(cct, probe->ino, &probe->layout, probe->probing_off, + probe->probing_len, 0, probe->probing); + + std::vector<ObjectExtent> stat_extents; + for (vector<ObjectExtent>::iterator p = probe->probing.begin(); + p != probe->probing.end(); + ++p) { + ldout(cct, 10) << "_probe probing " << p->oid << dendl; + probe->ops.insert(p->oid); + stat_extents.push_back(*p); + } + + pl.unlock(); + for (std::vector<ObjectExtent>::iterator i = stat_extents.begin(); + i != stat_extents.end(); ++i) { + C_Probe *c = new C_Probe(this, probe, i->oid); + objecter->stat(i->oid, i->oloc, probe->snapid, &c->size, &c->mtime, + probe->flags | CEPH_OSD_FLAG_RWORDERED, + new C_OnFinisher(c, finisher)); + } +} + +/** + * probe->lock must be initially held, and will be released by this function. + * + * @return true if probe is complete and Probe object may be freed. 
/**
 * Record the stat result for one object of the current probing window and,
 * once every outstanding stat of the window has reported, analyze the
 * window to locate the end of the file.
 *
 * probe->lock must be initially held, and will be released by this function
 * (either directly, or by _probe() when another window is issued).
 *
 * @param probe  probe state being updated
 * @param oid    object the stat result belongs to; must be in probe->ops
 * @param size   object size reported by the stat
 * @param mtime  object mtime reported by the stat
 * @param pl     lock wrapper that owns probe->lock on entry
 * @return true if probe is complete and Probe object may be freed;
 *         false if stats are still outstanding or a new window was issued.
 */
bool Filer::_probed(Probe *probe, const object_t& oid, uint64_t size,
                    ceph::real_time mtime, Probe::unique_lock& pl)
{
  ceph_assert(pl.owns_lock() && pl.mutex() == &probe->lock);

  ldout(cct, 10) << "_probed " << probe->ino << " object " << oid
           << " has size " << size << " mtime " << mtime << dendl;

  // remember this object's size and track the newest mtime seen so far
  probe->known_size[oid] = size;
  if (mtime > probe->max_mtime)
    probe->max_mtime = mtime;

  // this op is no longer outstanding
  ceph_assert(probe->ops.count(oid));
  probe->ops.erase(oid);

  if (!probe->ops.empty()) {
    pl.unlock();
    return false;  // waiting for more!
  }

  if (probe->err) { // we hit an error, propagate back up
    pl.unlock();
    return true;
  }

  // analyze!
  uint64_t end = 0;

  // a backward probe walks the window's extents from last to first
  if (!probe->fwd) {
    std::reverse(probe->probing.begin(), probe->probing.end());
  }

  // Walk the extents in probe order. The loop skips (continue) extents that
  // give no information about the end, and stops (break) at the first
  // extent that is not skipped.
  for (vector<ObjectExtent>::iterator p = probe->probing.begin();
       p != probe->probing.end();
       ++p) {
    // size the object would have if the file covered this whole extent
    uint64_t shouldbe = p->length + p->offset;
    ldout(cct, 10) << "_probed " << probe->ino << " object " << hex
                   << p->oid << dec << " should be " << shouldbe
                   << ", actual is " << probe->known_size[p->oid]
                   << dendl;

    if (!probe->found_size) {
      ceph_assert(probe->known_size[p->oid] <= shouldbe);

      // forward: a completely full extent means the end lies further on.
      // backward: an empty object means the end lies earlier, unless the
      // window already starts at offset 0.
      if ((probe->fwd && probe->known_size[p->oid] == shouldbe) ||
          (!probe->fwd && probe->known_size[p->oid] == 0 &&
           probe->probing_off > 0))
        continue;  // keep going

      // aha, we found the end!
      // calc offset into buffer_extent to get distance from probe->from.
      uint64_t oleft = probe->known_size[p->oid] - p->offset;
      for (vector<pair<uint64_t, uint64_t> >::iterator i
             = p->buffer_extents.begin();
           i != p->buffer_extents.end();
           ++i) {
        if (oleft <= (uint64_t)i->second) {
          end = probe->probing_off + i->first + oleft;
          ldout(cct, 10) << "_probed end is in buffer_extent " << i->first
                   << "~" << i->second << " off " << oleft
                   << ", from was " << probe->probing_off << ", end is "
                   << end << dendl;

          probe->found_size = true;
          ldout(cct, 10) << "_probed found size at " << end << dendl;
          *probe->psize = end;

          if (!probe->pmtime &&
              !probe->pumtime) // stop if we don't need mtime too
            break;
        }
        // NOTE(review): when an mtime was requested this loop keeps running
        // after the end has been found. oleft is unsigned, so this
        // subtraction can wrap, and if oleft == i->second exactly the next
        // buffer extent matches again and overwrites end / *psize.
        // Confirm whether that is intended for multi-stripe layouts.
        oleft -= i->second;
      }
    }
    break;
  }

  // Continue with the adjacent period if the size is still unknown, or if
  // an mtime was requested and the window has not reached offset 0.
  if (!probe->found_size || (probe->probing_off && (probe->pmtime ||
                                                    probe->pumtime))) {
    // keep probing!
    ldout(cct, 10) << "_probed probing further" << dendl;

    uint64_t period = probe->layout.get_period();
    if (probe->fwd) {
      probe->probing_off += probe->probing_len;
      ceph_assert(probe->probing_off % period == 0);
      probe->probing_len = period;
    } else {
      // previous period.
      ceph_assert(probe->probing_off % period == 0);
      probe->probing_len = period;
      probe->probing_off -= period;
    }
    // _probe() releases the lock for us
    _probe(probe, pl);
    ceph_assert(!pl.owns_lock());
    return false;
  } else if (probe->pmtime) {
    ldout(cct, 10) << "_probed found mtime " << probe->max_mtime << dendl;
    *probe->pmtime = probe->max_mtime;
  } else if (probe->pumtime) {
    ldout(cct, 10) << "_probed found mtime " << probe->max_mtime << dendl;
    *probe->pumtime = ceph::real_clock::to_ceph_timespec(probe->max_mtime);
  }
  // done!
  pl.unlock();
  return true;
}
cct->_conf->filer_max_purge_ops - pr->uncommitted; + while (pr->num > 0 && max > 0) { + remove_oids.push_back(file_object_t(pr->ino, pr->first)); + pr->uncommitted++; + pr->first++; + pr->num--; + max--; + } + prl.unlock(); + + // Issue objecter ops outside pr->lock to avoid lock dependency loop + for (const auto& oid : remove_oids) { + object_locator_t oloc = OSDMap::file_to_object_locator(pr->layout); + objecter->remove(oid, oloc, pr->snapc, pr->mtime, pr->flags, + new C_OnFinisher(new C_PurgeRange(this, pr), finisher)); + } +} + +// ----------------------- +struct TruncRange { + std::mutex lock; + typedef std::lock_guard<std::mutex> lock_guard; + typedef std::unique_lock<std::mutex> unique_lock; + inodeno_t ino; + file_layout_t layout; + SnapContext snapc; + ceph::real_time mtime; + int flags; + Context *oncommit; + int uncommitted; + uint64_t offset; + uint64_t length; + uint32_t truncate_seq; + TruncRange(inodeno_t i, const file_layout_t& l, const SnapContext& sc, + ceph::real_time t, int fl, Context *fin, + uint64_t off, uint64_t len, uint32_t ts) + : ino(i), layout(l), snapc(sc), mtime(t), flags(fl), oncommit(fin), + uncommitted(0), offset(off), length(len), truncate_seq(ts) {} +}; + +void Filer::truncate(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + __u32 truncate_seq, + ceph::real_time mtime, + int flags, + Context *oncommit) +{ + uint64_t period = layout->get_period(); + uint64_t num_objs = Striper::get_num_objects(*layout, len + (offset % period)); + if (num_objs == 1) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + vector<OSDOp> ops(1); + ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC; + ops[0].op.extent.truncate_seq = truncate_seq; + ops[0].op.extent.truncate_size = extents[0].offset; + objecter->_modify(extents[0].oid, extents[0].oloc, ops, mtime, snapc, + flags, oncommit); + return; + } + + if (len > 0 && (offset + len) % period) + len += 
period - ((offset + len) % period); + + TruncRange *tr = new TruncRange(ino, *layout, snapc, mtime, flags, oncommit, + offset, len, truncate_seq); + _do_truncate_range(tr, 0); +} + +struct C_TruncRange : public Context { + Filer *filer; + TruncRange *tr; + C_TruncRange(Filer *f, TruncRange *t) : filer(f), tr(t) {} + void finish(int r) override { + filer->_do_truncate_range(tr, 1); + } +}; + +void Filer::_do_truncate_range(TruncRange *tr, int fin) +{ + TruncRange::unique_lock trl(tr->lock); + tr->uncommitted -= fin; + ldout(cct, 10) << "_do_truncate_range " << tr->ino << " objects " << tr->offset + << "~" << tr->length << " uncommitted " << tr->uncommitted + << dendl; + + if (tr->length == 0 && tr->uncommitted == 0) { + tr->oncommit->complete(0); + trl.unlock(); + delete tr; + return; + } + + vector<ObjectExtent> extents; + + int max = cct->_conf->filer_max_truncate_ops - tr->uncommitted; + if (max > 0 && tr->length > 0) { + uint64_t len = tr->layout.get_period() * max; + if (len > tr->length) + len = tr->length; + + uint64_t offset = tr->offset + tr->length - len; + Striper::file_to_extents(cct, tr->ino, &tr->layout, offset, len, 0, extents); + tr->uncommitted += extents.size(); + tr->length -= len; + } + + trl.unlock(); + + // Issue objecter ops outside tr->lock to avoid lock dependency loop + for (const auto& p : extents) { + vector<OSDOp> ops(1); + ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC; + ops[0].op.extent.truncate_size = p.offset; + ops[0].op.extent.truncate_seq = tr->truncate_seq; + objecter->_modify(p.oid, p.oloc, ops, tr->mtime, tr->snapc, tr->flags, + new C_OnFinisher(new C_TruncRange(this, tr), finisher)); + } +} diff --git a/src/osdc/Filer.h b/src/osdc/Filer.h new file mode 100644 index 00000000..ea9ac170 --- /dev/null +++ b/src/osdc/Filer.h @@ -0,0 +1,302 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil 
<sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILER_H +#define CEPH_FILER_H + +/*** Filer + * + * stripe file ranges onto objects. + * build list<ObjectExtent> for the objecter or objectcacher. + * + * also, provide convenience methods that call objecter for you. + * + * "files" are identified by ino. + */ + + +#include <mutex> + +#include "include/types.h" + +#include "common/ceph_time.h" + +#include "osd/OSDMap.h" +#include "Objecter.h" +#include "Striper.h" + +class Context; +class Messenger; +class OSDMap; +class Finisher; + + +/**** Filer interface ***/ + +class Filer { + CephContext *cct; + Objecter *objecter; + Finisher *finisher; + + // probes + struct Probe { + std::mutex lock; + typedef std::lock_guard<std::mutex> lock_guard; + typedef std::unique_lock<std::mutex> unique_lock; + inodeno_t ino; + file_layout_t layout; + snapid_t snapid; + + uint64_t *psize; + ceph::real_time *pmtime; + utime_t *pumtime; + + int flags; + + bool fwd; + + Context *onfinish; + + vector<ObjectExtent> probing; + uint64_t probing_off, probing_len; + + map<object_t, uint64_t> known_size; + ceph::real_time max_mtime; + + set<object_t> ops; + + int err; + bool found_size; + + Probe(inodeno_t i, file_layout_t &l, snapid_t sn, + uint64_t f, uint64_t *e, ceph::real_time *m, int fl, bool fw, + Context *c) : + ino(i), layout(l), snapid(sn), + psize(e), pmtime(m), pumtime(nullptr), flags(fl), fwd(fw), onfinish(c), + probing_off(f), probing_len(0), + err(0), found_size(false) {} + + Probe(inodeno_t i, file_layout_t &l, snapid_t sn, + uint64_t f, uint64_t *e, utime_t *m, int fl, bool fw, + Context *c) : + ino(i), layout(l), snapid(sn), + psize(e), pmtime(nullptr), pumtime(m), flags(fl), fwd(fw), + onfinish(c), probing_off(f), probing_len(0), + err(0), found_size(false) {} + 
}; + + class C_Probe; + + void _probe(Probe *p, Probe::unique_lock& pl); + bool _probed(Probe *p, const object_t& oid, uint64_t size, + ceph::real_time mtime, Probe::unique_lock& pl); + + public: + Filer(const Filer& other); + const Filer operator=(const Filer& other); + + Filer(Objecter *o, Finisher *f) : cct(o->cct), objecter(o), finisher(f) {} + ~Filer() {} + + bool is_active() { + return objecter->is_active(); // || (oc && oc->is_active()); + } + + + /*** async file interface. scatter/gather as needed. ***/ + + void read(inodeno_t ino, + file_layout_t *layout, + snapid_t snap, + uint64_t offset, + uint64_t len, + bufferlist *bl, // ptr to data + int flags, + Context *onfinish, + int op_flags = 0) { + ceph_assert(snap); // (until there is a non-NOSNAP write) + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + objecter->sg_read(extents, snap, bl, flags, onfinish, op_flags); + } + + void read_trunc(inodeno_t ino, + file_layout_t *layout, + snapid_t snap, + uint64_t offset, + uint64_t len, + bufferlist *bl, // ptr to data + int flags, + uint64_t truncate_size, + __u32 truncate_seq, + Context *onfinish, + int op_flags = 0) { + ceph_assert(snap); // (until there is a non-NOSNAP write) + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size, + extents); + objecter->sg_read_trunc(extents, snap, bl, flags, + truncate_size, truncate_seq, onfinish, op_flags); + } + + void write(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + bufferlist& bl, + ceph::real_time mtime, + int flags, + Context *oncommit, + int op_flags = 0) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + objecter->sg_write(extents, snapc, bl, mtime, flags, oncommit, op_flags); + } + + void write_trunc(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t 
len, + bufferlist& bl, + ceph::real_time mtime, + int flags, + uint64_t truncate_size, + __u32 truncate_seq, + Context *oncommit, + int op_flags = 0) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size, + extents); + objecter->sg_write_trunc(extents, snapc, bl, mtime, flags, + truncate_size, truncate_seq, oncommit, op_flags); + } + + void truncate(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + __u32 truncate_seq, + ceph::real_time mtime, + int flags, + Context *oncommit); + void _do_truncate_range(struct TruncRange *pr, int fin); + + void zero(inodeno_t ino, + const file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + ceph::real_time mtime, + int flags, + bool keep_first, + Context *oncommit) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + if (extents.size() == 1) { + if (extents[0].offset == 0 && extents[0].length == layout->object_size + && (!keep_first || extents[0].objectno != 0)) + objecter->remove(extents[0].oid, extents[0].oloc, + snapc, mtime, flags, oncommit); + else + objecter->zero(extents[0].oid, extents[0].oloc, extents[0].offset, + extents[0].length, snapc, mtime, flags, oncommit); + } else { + C_GatherBuilder gcom(cct, oncommit); + for (vector<ObjectExtent>::iterator p = extents.begin(); + p != extents.end(); + ++p) { + if (p->offset == 0 && p->length == layout->object_size && + (!keep_first || p->objectno != 0)) + objecter->remove(p->oid, p->oloc, + snapc, mtime, flags, + oncommit ? gcom.new_sub():0); + else + objecter->zero(p->oid, p->oloc, p->offset, p->length, + snapc, mtime, flags, + oncommit ? 
gcom.new_sub():0); + } + gcom.activate(); + } + } + + void zero(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + ceph::real_time mtime, + int flags, + Context *oncommit) { + zero(ino, layout, + snapc, offset, + len, mtime, + flags, false, + oncommit); + } + // purge range of ino.### objects + int purge_range(inodeno_t ino, + const file_layout_t *layout, + const SnapContext& snapc, + uint64_t first_obj, uint64_t num_obj, + ceph::real_time mtime, + int flags, Context *oncommit); + void _do_purge_range(struct PurgeRange *pr, int fin, int err); + + /* + * probe + * specify direction, + * and whether we stop when we find data, or hole. + */ + int probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, + ceph::real_time *mtime, + bool fwd, + int flags, + Context *onfinish); + + int probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, + bool fwd, + int flags, + Context *onfinish) { + return probe(ino, layout, snapid, start_from, end, + (ceph::real_time* )0, fwd, flags, onfinish); + } + + int probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, + utime_t *mtime, + bool fwd, + int flags, + Context *onfinish); + +private: + int probe_impl(Probe* probe, file_layout_t *layout, + uint64_t start_from, uint64_t *end); +}; + +#endif // !CEPH_FILER_H diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc new file mode 100644 index 00000000..5c00293e --- /dev/null +++ b/src/osdc/Journaler.cc @@ -0,0 +1,1607 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as 
published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/perf_counters.h" +#include "common/dout.h" +#include "include/Context.h" +#include "msg/Messenger.h" +#include "osdc/Journaler.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "common/Finisher.h" + +#define dout_subsys ceph_subsys_journaler +#undef dout_prefix +#define dout_prefix *_dout << objecter->messenger->get_myname() \ + << ".journaler." << name << (readonly ? "(ro) ":"(rw) ") + +using std::chrono::seconds; + + +class Journaler::C_DelayFlush : public Context { + Journaler *journaler; + public: + explicit C_DelayFlush(Journaler *j) : journaler(j) {} + void finish(int r) override { + journaler->_do_delayed_flush(); + } +}; + +void Journaler::set_readonly() +{ + lock_guard l(lock); + + ldout(cct, 1) << "set_readonly" << dendl; + readonly = true; +} + +void Journaler::set_writeable() +{ + lock_guard l(lock); + + ldout(cct, 1) << "set_writeable" << dendl; + readonly = false; +} + +void Journaler::create(file_layout_t *l, stream_format_t const sf) +{ + lock_guard lk(lock); + + ceph_assert(!readonly); + state = STATE_ACTIVE; + + stream_format = sf; + journal_stream.set_format(sf); + _set_layout(l); + + prezeroing_pos = prezero_pos = write_pos = flush_pos = + safe_pos = read_pos = requested_pos = received_pos = + expire_pos = trimming_pos = trimmed_pos = + next_safe_pos = layout.get_period(); + + ldout(cct, 1) << "created blank journal at inode 0x" << std::hex << ino + << std::dec << ", format=" << stream_format << dendl; +} + +void Journaler::set_layout(file_layout_t const *l) +{ + lock_guard lk(lock); + _set_layout(l); +} + +void Journaler::_set_layout(file_layout_t const *l) +{ + layout = *l; + + if (layout.pool_id != pg_pool) { + // user can reset pool id through cephfs-journal-tool + lderr(cct) << "may got older pool id from header layout" << dendl; + ceph_abort(); + } + last_written.layout = layout; + last_committed.layout = layout; + + // 
prefetch intelligently. + // (watch out, this is big if you use big objects or weird striping) + uint64_t periods = cct->_conf.get_val<uint64_t>("journaler_prefetch_periods"); + fetch_len = layout.get_period() * periods; +} + + +/***************** HEADER *******************/ + +ostream& operator<<(ostream &out, const Journaler::Header &h) +{ + return out << "loghead(trim " << h.trimmed_pos + << ", expire " << h.expire_pos + << ", write " << h.write_pos + << ", stream_format " << (int)(h.stream_format) + << ")"; +} + +class Journaler::C_ReadHead : public Context { + Journaler *ls; +public: + bufferlist bl; + explicit C_ReadHead(Journaler *l) : ls(l) {} + void finish(int r) override { + ls->_finish_read_head(r, bl); + } +}; + +class Journaler::C_RereadHead : public Context { + Journaler *ls; + Context *onfinish; +public: + bufferlist bl; + C_RereadHead(Journaler *l, Context *onfinish_) : ls (l), + onfinish(onfinish_) {} + void finish(int r) override { + ls->_finish_reread_head(r, bl, onfinish); + } +}; + +class Journaler::C_ProbeEnd : public Context { + Journaler *ls; +public: + uint64_t end; + explicit C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} + void finish(int r) override { + ls->_finish_probe_end(r, end); + } +}; + +class Journaler::C_ReProbe : public Context { + Journaler *ls; + C_OnFinisher *onfinish; +public: + uint64_t end; + C_ReProbe(Journaler *l, C_OnFinisher *onfinish_) : + ls(l), onfinish(onfinish_), end(0) {} + void finish(int r) override { + ls->_finish_reprobe(r, end, onfinish); + } +}; + +void Journaler::recover(Context *onread) +{ + lock_guard l(lock); + if (is_stopping()) { + onread->complete(-EAGAIN); + return; + } + + ldout(cct, 1) << "recover start" << dendl; + ceph_assert(state != STATE_ACTIVE); + ceph_assert(readonly); + + if (onread) + waitfor_recover.push_back(wrap_finisher(onread)); + + if (state != STATE_UNDEF) { + ldout(cct, 1) << "recover - already recovering" << dendl; + return; + } + + ldout(cct, 1) << "read_head" << dendl; + state = 
STATE_READHEAD; + C_ReadHead *fin = new C_ReadHead(this); + _read_head(fin, &fin->bl); +} + +void Journaler::_read_head(Context *on_finish, bufferlist *bl) +{ + // lock is locked + ceph_assert(state == STATE_READHEAD || state == STATE_REREADHEAD); + + object_t oid = file_object_t(ino, 0); + object_locator_t oloc(pg_pool); + objecter->read_full(oid, oloc, CEPH_NOSNAP, bl, 0, wrap_finisher(on_finish)); +} + +void Journaler::reread_head(Context *onfinish) +{ + lock_guard l(lock); + _reread_head(wrap_finisher(onfinish)); +} + +/** + * Re-read the head from disk, and set the write_pos, expire_pos, trimmed_pos + * from the on-disk header. This switches the state to STATE_REREADHEAD for + * the duration, and you shouldn't start a re-read while other operations are + * in-flight, nor start other operations while a re-read is in progress. + * Also, don't call this until the Journaler has finished its recovery and has + * gone STATE_ACTIVE! + */ +void Journaler::_reread_head(Context *onfinish) +{ + ldout(cct, 10) << "reread_head" << dendl; + ceph_assert(state == STATE_ACTIVE); + + state = STATE_REREADHEAD; + C_RereadHead *fin = new C_RereadHead(this, onfinish); + _read_head(fin, &fin->bl); +} + +void Journaler::_finish_reread_head(int r, bufferlist& bl, Context *finish) +{ + lock_guard l(lock); + if (is_stopping()) { + finish->complete(-EAGAIN); + return; + } + + //read on-disk header into + ceph_assert(bl.length() || r < 0 ); + + // unpack header + if (r == 0) { + Header h; + auto p = bl.cbegin(); + try { + decode(h, p); + } catch (const buffer::error &e) { + finish->complete(-EINVAL); + return; + } + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos + = h.write_pos; + expire_pos = h.expire_pos; + trimmed_pos = trimming_pos = h.trimmed_pos; + init_headers(h); + state = STATE_ACTIVE; + } + + finish->complete(r); +} + +void Journaler::_finish_read_head(int r, bufferlist& bl) +{ + lock_guard l(lock); + if (is_stopping()) + return; + + 
ceph_assert(state == STATE_READHEAD); + + if (r!=0) { + ldout(cct, 0) << "error getting journal off disk" << dendl; + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, r); + return; + } + + if (bl.length() == 0) { + ldout(cct, 1) << "_finish_read_head r=" << r + << " read 0 bytes, assuming empty log" << dendl; + state = STATE_ACTIVE; + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, 0); + return; + } + + // unpack header + bool corrupt = false; + Header h; + auto p = bl.cbegin(); + try { + decode(h, p); + + if (h.magic != magic) { + ldout(cct, 0) << "on disk magic '" << h.magic << "' != my magic '" + << magic << "'" << dendl; + corrupt = true; + } else if (h.write_pos < h.expire_pos || h.expire_pos < h.trimmed_pos) { + ldout(cct, 0) << "Corrupt header (bad offsets): " << h << dendl; + corrupt = true; + } + } catch (const buffer::error &e) { + corrupt = true; + } + + if (corrupt) { + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, -EINVAL); + return; + } + + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos + = h.write_pos; + read_pos = requested_pos = received_pos = expire_pos = h.expire_pos; + trimmed_pos = trimming_pos = h.trimmed_pos; + + init_headers(h); + _set_layout(&h.layout); + stream_format = h.stream_format; + journal_stream.set_format(h.stream_format); + + ldout(cct, 1) << "_finish_read_head " << h + << ". probing for end of log (from " << write_pos << ")..." 
+ << dendl; + C_ProbeEnd *fin = new C_ProbeEnd(this); + state = STATE_PROBING; + _probe(fin, &fin->end); +} + +void Journaler::_probe(Context *finish, uint64_t *end) +{ + // lock is locked + ldout(cct, 1) << "probing for end of the log" << dendl; + ceph_assert(state == STATE_PROBING || state == STATE_REPROBING); + // probe the log + filer.probe(ino, &layout, CEPH_NOSNAP, + write_pos, end, true, 0, wrap_finisher(finish)); +} + +void Journaler::_reprobe(C_OnFinisher *finish) +{ + ldout(cct, 10) << "reprobe" << dendl; + ceph_assert(state == STATE_ACTIVE); + + state = STATE_REPROBING; + C_ReProbe *fin = new C_ReProbe(this, finish); + _probe(fin, &fin->end); +} + + +void Journaler::_finish_reprobe(int r, uint64_t new_end, + C_OnFinisher *onfinish) +{ + lock_guard l(lock); + if (is_stopping()) { + onfinish->complete(-EAGAIN); + return; + } + + ceph_assert(new_end >= write_pos || r < 0); + ldout(cct, 1) << "_finish_reprobe new_end = " << new_end + << " (header had " << write_pos << ")." + << dendl; + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = new_end; + state = STATE_ACTIVE; + onfinish->complete(r); +} + +void Journaler::_finish_probe_end(int r, uint64_t end) +{ + lock_guard l(lock); + if (is_stopping()) + return; + + ceph_assert(state == STATE_PROBING); + if (r < 0) { // error in probing + goto out; + } + if (((int64_t)end) == -1) { + end = write_pos; + ldout(cct, 1) << "_finish_probe_end write_pos = " << end << " (header had " + << write_pos << "). log was empty. recovered." << dendl; + ceph_abort(); // hrm. + } else { + ceph_assert(end >= write_pos); + ldout(cct, 1) << "_finish_probe_end write_pos = " << end + << " (header had " << write_pos << "). recovered." + << dendl; + } + + state = STATE_ACTIVE; + + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = end; + +out: + // done. 
+ list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, r); +} + +class Journaler::C_RereadHeadProbe : public Context +{ + Journaler *ls; + C_OnFinisher *final_finish; +public: + C_RereadHeadProbe(Journaler *l, C_OnFinisher *finish) : + ls(l), final_finish(finish) {} + void finish(int r) override { + ls->_finish_reread_head_and_probe(r, final_finish); + } +}; + +void Journaler::reread_head_and_probe(Context *onfinish) +{ + lock_guard l(lock); + + ceph_assert(state == STATE_ACTIVE); + _reread_head(new C_RereadHeadProbe(this, wrap_finisher(onfinish))); +} + +void Journaler::_finish_reread_head_and_probe(int r, C_OnFinisher *onfinish) +{ + // Expect to be called back from finish_reread_head, which already takes lock + // lock is locked + if (is_stopping()) { + onfinish->complete(-EAGAIN); + return; + } + + // Let the caller know that the operation has failed or was intentionally + // failed since the caller has been blacklisted. + if (r == -EBLACKLISTED) { + onfinish->complete(r); + return; + } + + ceph_assert(!r); //if we get an error, we're boned + _reprobe(onfinish); +} + + +// WRITING + +class Journaler::C_WriteHead : public Context { +public: + Journaler *ls; + Header h; + C_OnFinisher *oncommit; + C_WriteHead(Journaler *l, Header& h_, C_OnFinisher *c) : ls(l), h(h_), + oncommit(c) {} + void finish(int r) override { + ls->_finish_write_head(r, h, oncommit); + } +}; + +void Journaler::write_head(Context *oncommit) +{ + lock_guard l(lock); + _write_head(oncommit); +} + + +void Journaler::_write_head(Context *oncommit) +{ + ceph_assert(!readonly); + ceph_assert(state == STATE_ACTIVE); + last_written.trimmed_pos = trimmed_pos; + last_written.expire_pos = expire_pos; + last_written.unused_field = expire_pos; + last_written.write_pos = safe_pos; + last_written.stream_format = stream_format; + ldout(cct, 10) << "write_head " << last_written << dendl; + + // Avoid persisting bad pointers in case of bugs + ceph_assert(last_written.write_pos >= 
last_written.expire_pos); + ceph_assert(last_written.expire_pos >= last_written.trimmed_pos); + + last_wrote_head = ceph::real_clock::now(); + + bufferlist bl; + encode(last_written, bl); + SnapContext snapc; + + object_t oid = file_object_t(ino, 0); + object_locator_t oloc(pg_pool); + objecter->write_full(oid, oloc, snapc, bl, ceph::real_clock::now(), 0, + wrap_finisher(new C_WriteHead( + this, last_written, + wrap_finisher(oncommit))), + 0, 0, write_iohint); +} + +void Journaler::_finish_write_head(int r, Header &wrote, + C_OnFinisher *oncommit) +{ + lock_guard l(lock); + + if (r < 0) { + lderr(cct) << "_finish_write_head got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + ceph_assert(!readonly); + ldout(cct, 10) << "_finish_write_head " << wrote << dendl; + last_committed = wrote; + if (oncommit) { + oncommit->complete(r); + } + + _trim(); // trim? +} + + +/***************** WRITING *******************/ + +class Journaler::C_Flush : public Context { + Journaler *ls; + uint64_t start; + ceph::real_time stamp; +public: + C_Flush(Journaler *l, int64_t s, ceph::real_time st) + : ls(l), start(s), stamp(st) {} + void finish(int r) override { + ls->_finish_flush(r, start, stamp); + } +}; + +void Journaler::_finish_flush(int r, uint64_t start, ceph::real_time stamp) +{ + lock_guard l(lock); + ceph_assert(!readonly); + + if (r < 0) { + lderr(cct) << "_finish_flush got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + + ceph_assert(start < flush_pos); + + // calc latency? 
+ if (logger) { + ceph::timespan lat = ceph::real_clock::now() - stamp; + logger->tinc(logger_key_lat, lat); + } + + // adjust safe_pos + auto it = pending_safe.find(start); + ceph_assert(it != pending_safe.end()); + uint64_t min_next_safe_pos = pending_safe.begin()->second; + pending_safe.erase(it); + if (pending_safe.empty()) + safe_pos = next_safe_pos; + else + safe_pos = min_next_safe_pos; + + ldout(cct, 10) << "_finish_flush safe from " << start + << ", pending_safe " << pending_safe + << ", (prezeroing/prezero)/write/flush/safe positions now " + << "(" << prezeroing_pos << "/" << prezero_pos << ")/" + << write_pos << "/" << flush_pos << "/" << safe_pos + << dendl; + + // kick waiters <= safe_pos + if (!waitfor_safe.empty()) { + list<Context*> ls; + while (!waitfor_safe.empty()) { + auto it = waitfor_safe.begin(); + if (it->first > safe_pos) + break; + ls.splice(ls.end(), it->second); + waitfor_safe.erase(it); + } + finish_contexts(cct, ls); + } +} + + + +uint64_t Journaler::append_entry(bufferlist& bl) +{ + unique_lock l(lock); + + ceph_assert(!readonly); + uint32_t s = bl.length(); + + // append + size_t delta = bl.length() + journal_stream.get_envelope_size(); + // write_buf space is nearly full + if (!write_buf_throttle.get_or_fail(delta)) { + l.unlock(); + ldout(cct, 10) << "write_buf_throttle wait, delta " << delta << dendl; + write_buf_throttle.get(delta); + l.lock(); + } + ldout(cct, 20) << "write_buf_throttle get, delta " << delta << dendl; + size_t wrote = journal_stream.write(bl, &write_buf, write_pos); + ldout(cct, 10) << "append_entry len " << s << " to " << write_pos << "~" + << wrote << dendl; + write_pos += wrote; + + // flush previous object? 
+ uint64_t su = get_layout_period(); + ceph_assert(su > 0); + uint64_t write_off = write_pos % su; + uint64_t write_obj = write_pos / su; + uint64_t flush_obj = flush_pos / su; + if (write_obj != flush_obj) { + ldout(cct, 10) << " flushing completed object(s) (su " << su << " wro " + << write_obj << " flo " << flush_obj << ")" << dendl; + _do_flush(write_buf.length() - write_off); + + // if _do_flush() skips flushing some data, it does do a best effort to + // update next_safe_pos. + if (write_buf.length() > 0 && + write_buf.length() <= wrote) { // the unflushed data are within this entry + // set next_safe_pos to end of previous entry + next_safe_pos = write_pos - wrote; + } + } + + return write_pos; +} + + +void Journaler::_do_flush(unsigned amount) +{ + if (is_stopping()) + return; + if (write_pos == flush_pos) + return; + ceph_assert(write_pos > flush_pos); + ceph_assert(!readonly); + + // flush + uint64_t len = write_pos - flush_pos; + ceph_assert(len == write_buf.length()); + if (amount && amount < len) + len = amount; + + // zero at least two full periods ahead. this ensures + // that the next object will not exist. 
+ uint64_t period = get_layout_period(); + if (flush_pos + len + 2*period > prezero_pos) { + _issue_prezero(); + + int64_t newlen = prezero_pos - flush_pos - period; + if (newlen <= 0) { + ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len + << " already too close to prezero_pos " << prezero_pos + << ", zeroing first" << dendl; + waiting_for_zero_pos = flush_pos + len; + return; + } + if (static_cast<uint64_t>(newlen) < len) { + ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len + << " but hit prezero_pos " << prezero_pos + << ", will do " << flush_pos << "~" << newlen << dendl; + waiting_for_zero_pos = flush_pos + len; + len = newlen; + } + } + ldout(cct, 10) << "_do_flush flushing " << flush_pos << "~" << len << dendl; + + // submit write for anything pending + // flush _start_ pos to _finish_flush + ceph::real_time now = ceph::real_clock::now(); + SnapContext snapc; + + Context *onsafe = new C_Flush(this, flush_pos, now); // on COMMIT + pending_safe[flush_pos] = next_safe_pos; + + bufferlist write_bl; + + // adjust pointers + if (len == write_buf.length()) { + write_bl.swap(write_buf); + next_safe_pos = write_pos; + } else { + write_buf.splice(0, len, &write_bl); + // Keys of waitfor_safe map are journal entry boundaries. + // Try finding a journal entry that we are actually flushing + // and set next_safe_pos to end of it. This is best effort. + // The one we found may not be the lastest flushing entry. 
+ auto p = waitfor_safe.lower_bound(flush_pos + len); + if (p != waitfor_safe.end()) { + if (p->first > flush_pos + len && p != waitfor_safe.begin()) + --p; + if (p->first <= flush_pos + len && p->first > next_safe_pos) + next_safe_pos = p->first; + } + } + + filer.write(ino, &layout, snapc, + flush_pos, len, write_bl, ceph::real_clock::now(), + 0, + wrap_finisher(onsafe), write_iohint); + + flush_pos += len; + ceph_assert(write_buf.length() == write_pos - flush_pos); + write_buf_throttle.put(len); + ldout(cct, 20) << "write_buf_throttle put, len " << len << dendl; + + ldout(cct, 10) + << "_do_flush (prezeroing/prezero)/write/flush/safe pointers now at " + << "(" << prezeroing_pos << "/" << prezero_pos << ")/" << write_pos + << "/" << flush_pos << "/" << safe_pos << dendl; + + _issue_prezero(); +} + + +void Journaler::wait_for_flush(Context *onsafe) +{ + lock_guard l(lock); + if (is_stopping()) { + if (onsafe) + onsafe->complete(-EAGAIN); + return; + } + _wait_for_flush(onsafe); +} + +void Journaler::_wait_for_flush(Context *onsafe) +{ + ceph_assert(!readonly); + + // all flushed and safe? 
+ if (write_pos == safe_pos) { + ceph_assert(write_buf.length() == 0); + ldout(cct, 10) + << "flush nothing to flush, (prezeroing/prezero)/write/flush/safe " + "pointers at " << "(" << prezeroing_pos << "/" << prezero_pos << ")/" + << write_pos << "/" << flush_pos << "/" << safe_pos << dendl; + if (onsafe) { + finisher->queue(onsafe, 0); + } + return; + } + + // queue waiter + if (onsafe) { + waitfor_safe[write_pos].push_back(wrap_finisher(onsafe)); + } +} + +void Journaler::flush(Context *onsafe) +{ + lock_guard l(lock); + if (is_stopping()) { + if (onsafe) + onsafe->complete(-EAGAIN); + return; + } + _flush(wrap_finisher(onsafe)); +} + +void Journaler::_flush(C_OnFinisher *onsafe) +{ + ceph_assert(!readonly); + + if (write_pos == flush_pos) { + ceph_assert(write_buf.length() == 0); + ldout(cct, 10) << "flush nothing to flush, (prezeroing/prezero)/write/" + "flush/safe pointers at " << "(" << prezeroing_pos << "/" << prezero_pos + << ")/" << write_pos << "/" << flush_pos << "/" << safe_pos + << dendl; + if (onsafe) { + onsafe->complete(0); + } + } else { + _do_flush(); + _wait_for_flush(onsafe); + } + + // write head? + if (_write_head_needed()) { + _write_head(); + } +} + +bool Journaler::_write_head_needed() +{ + return last_wrote_head + seconds(cct->_conf.get_val<int64_t>("journaler_write_head_interval")) + < ceph::real_clock::now(); +} + + +/*************** prezeroing ******************/ + +struct C_Journaler_Prezero : public Context { + Journaler *journaler; + uint64_t from, len; + C_Journaler_Prezero(Journaler *j, uint64_t f, uint64_t l) + : journaler(j), from(f), len(l) {} + void finish(int r) override { + journaler->_finish_prezero(r, from, len); + } +}; + +void Journaler::_issue_prezero() +{ + ceph_assert(prezeroing_pos >= flush_pos); + + uint64_t num_periods = cct->_conf.get_val<uint64_t>("journaler_prezero_periods"); + /* + * issue zero requests based on write_pos, even though the invariant + * is that we zero ahead of flush_pos. 
+ */ + uint64_t period = get_layout_period(); + uint64_t to = write_pos + period * num_periods + period - 1; + to -= to % period; + + if (prezeroing_pos >= to) { + ldout(cct, 20) << "_issue_prezero target " << to << " <= prezeroing_pos " + << prezeroing_pos << dendl; + return; + } + + while (prezeroing_pos < to) { + uint64_t len; + if (prezeroing_pos % period == 0) { + len = period; + ldout(cct, 10) << "_issue_prezero removing " << prezeroing_pos << "~" + << period << " (full period)" << dendl; + } else { + len = period - (prezeroing_pos % period); + ldout(cct, 10) << "_issue_prezero zeroing " << prezeroing_pos << "~" + << len << " (partial period)" << dendl; + } + SnapContext snapc; + Context *c = wrap_finisher(new C_Journaler_Prezero(this, prezeroing_pos, + len)); + filer.zero(ino, &layout, snapc, prezeroing_pos, len, + ceph::real_clock::now(), 0, c); + prezeroing_pos += len; + } +} + +// Lock cycle because we get called out of objecter callback (holding +// objecter read lock), but there are also cases where we take the journaler +// lock before calling into objecter to do I/O. 
+void Journaler::_finish_prezero(int r, uint64_t start, uint64_t len) +{ + lock_guard l(lock); + + ldout(cct, 10) << "_prezeroed to " << start << "~" << len + << ", prezeroing/prezero was " << prezeroing_pos << "/" + << prezero_pos << ", pending " << pending_zero + << dendl; + if (r < 0 && r != -ENOENT) { + lderr(cct) << "_prezeroed got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + + ceph_assert(r == 0 || r == -ENOENT); + + if (start == prezero_pos) { + prezero_pos += len; + while (!pending_zero.empty() && + pending_zero.begin().get_start() == prezero_pos) { + interval_set<uint64_t>::iterator b(pending_zero.begin()); + prezero_pos += b.get_len(); + pending_zero.erase(b); + } + + if (waiting_for_zero_pos > flush_pos) { + _do_flush(waiting_for_zero_pos - flush_pos); + } + + if (prezero_pos == prezeroing_pos && + !waitfor_prezero.empty()) { + list<Context*> ls; + ls.swap(waitfor_prezero); + finish_contexts(cct, ls, 0); + } + } else { + pending_zero.insert(start, len); + } + ldout(cct, 10) << "_prezeroed prezeroing/prezero now " << prezeroing_pos + << "/" << prezero_pos + << ", pending " << pending_zero + << dendl; +} + +void Journaler::wait_for_prezero(Context *onfinish) +{ + ceph_assert(onfinish); + lock_guard l(lock); + + if (prezero_pos == prezeroing_pos) { + finisher->queue(onfinish, 0); + return; + } + waitfor_prezero.push_back(wrap_finisher(onfinish)); +} + + +/***************** READING *******************/ + + +class Journaler::C_Read : public Context { + Journaler *ls; + uint64_t offset; + uint64_t length; +public: + bufferlist bl; + C_Read(Journaler *j, uint64_t o, uint64_t l) : ls(j), offset(o), length(l) {} + void finish(int r) override { + ls->_finish_read(r, offset, length, bl); + } +}; + +class Journaler::C_RetryRead : public Context { + Journaler *ls; +public: + explicit C_RetryRead(Journaler *l) : ls(l) {} + + void finish(int r) override { + // Should only be called from waitfor_safe i.e. 
already inside lock + // (ls->lock is locked + ls->_prefetch(); + } +}; + +void Journaler::_finish_read(int r, uint64_t offset, uint64_t length, + bufferlist& bl) +{ + lock_guard l(lock); + + if (r < 0) { + ldout(cct, 0) << "_finish_read got error " << r << dendl; + error = r; + } else { + ldout(cct, 10) << "_finish_read got " << offset << "~" << bl.length() + << dendl; + if (bl.length() < length) { + ldout(cct, 0) << "_finish_read got less than expected (" << length << ")" + << dendl; + error = -EINVAL; + } + } + + if (error) { + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(error); + } + return; + } + + prefetch_buf[offset].swap(bl); + + try { + _assimilate_prefetch(); + } catch (const buffer::error &err) { + lderr(cct) << "_decode error from assimilate_prefetch" << dendl; + error = -EINVAL; + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(error); + } + return; + } + _prefetch(); +} + +void Journaler::_assimilate_prefetch() +{ + bool was_readable = readable; + + bool got_any = false; + while (!prefetch_buf.empty()) { + map<uint64_t,bufferlist>::iterator p = prefetch_buf.begin(); + if (p->first != received_pos) { + uint64_t gap = p->first - received_pos; + ldout(cct, 10) << "_assimilate_prefetch gap of " << gap + << " from received_pos " << received_pos + << " to first prefetched buffer " << p->first << dendl; + break; + } + + ldout(cct, 10) << "_assimilate_prefetch " << p->first << "~" + << p->second.length() << dendl; + received_pos += p->second.length(); + read_buf.claim_append(p->second); + ceph_assert(received_pos <= requested_pos); + prefetch_buf.erase(p); + got_any = true; + } + + if (got_any) { + ldout(cct, 10) << "_assimilate_prefetch read_buf now " << read_pos << "~" + << read_buf.length() << ", read pointers read_pos=" << read_pos + << " received_pos=" << received_pos << " requested_pos=" << requested_pos + << dendl; + + // Update readability (this will also hit any decode 
errors resulting + // from bad data) + readable = _is_readable(); + } + + if ((got_any && !was_readable && readable) || read_pos == write_pos) { + // readable! + ldout(cct, 10) << "_finish_read now readable (or at journal end) readable=" + << readable << " read_pos=" << read_pos << " write_pos=" + << write_pos << dendl; + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(0); + } + } +} + +void Journaler::_issue_read(uint64_t len) +{ + // stuck at safe_pos? (this is needed if we are reading the tail of + // a journal we are also writing to) + ceph_assert(requested_pos <= safe_pos); + if (requested_pos == safe_pos) { + ldout(cct, 10) << "_issue_read requested_pos = safe_pos = " << safe_pos + << ", waiting" << dendl; + ceph_assert(write_pos > requested_pos); + if (pending_safe.empty()) { + _flush(NULL); + } + + // Make sure keys of waitfor_safe map are journal entry boundaries. + // The key we used here is either next_safe_pos or old value of + // next_safe_pos. next_safe_pos is always set to journal entry + // boundary. + auto p = pending_safe.rbegin(); + if (p != pending_safe.rend()) + waitfor_safe[p->second].push_back(new C_RetryRead(this)); + else + waitfor_safe[next_safe_pos].push_back(new C_RetryRead(this)); + return; + } + + // don't read too much + if (requested_pos + len > safe_pos) { + len = safe_pos - requested_pos; + ldout(cct, 10) << "_issue_read reading only up to safe_pos " << safe_pos + << dendl; + } + + // go. + ldout(cct, 10) << "_issue_read reading " << requested_pos << "~" << len + << ", read pointers read_pos=" << read_pos << " received_pos=" << received_pos + << " requested_pos+len=" << (requested_pos+len) << dendl; + + // step by period (object). _don't_ do a single big filer.read() + // here because it will wait for all object reads to complete before + // giving us back any data. this way we can process whatever bits + // come in that are contiguous. 
+ uint64_t period = get_layout_period(); + while (len > 0) { + uint64_t e = requested_pos + period; + e -= e % period; + uint64_t l = e - requested_pos; + if (l > len) + l = len; + C_Read *c = new C_Read(this, requested_pos, l); + filer.read(ino, &layout, CEPH_NOSNAP, requested_pos, l, &c->bl, 0, + wrap_finisher(c), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + requested_pos += l; + len -= l; + } +} + +void Journaler::_prefetch() +{ + if (is_stopping()) + return; + + ldout(cct, 10) << "_prefetch" << dendl; + // prefetch + uint64_t pf; + if (temp_fetch_len) { + ldout(cct, 10) << "_prefetch temp_fetch_len " << temp_fetch_len << dendl; + pf = temp_fetch_len; + temp_fetch_len = 0; + } else { + pf = fetch_len; + } + + uint64_t raw_target = read_pos + pf; + + // read full log segments, so increase if necessary + uint64_t period = get_layout_period(); + uint64_t remainder = raw_target % period; + uint64_t adjustment = remainder ? period - remainder : 0; + uint64_t target = raw_target + adjustment; + + // don't read past the log tail + if (target > write_pos) + target = write_pos; + + if (requested_pos < target) { + uint64_t len = target - requested_pos; + ldout(cct, 10) << "_prefetch " << pf << " requested_pos " << requested_pos + << " < target " << target << " (" << raw_target + << "), prefetching " << len << dendl; + + if (pending_safe.empty() && write_pos > safe_pos) { + // If we are reading and writing the journal, then we may need + // to issue a flush if one isn't already in progress. + // Avoid doing a flush every time so that if we do write/read/write/read + // we don't end up flushing after every write. + ldout(cct, 10) << "_prefetch: requested_pos=" << requested_pos + << ", read_pos=" << read_pos + << ", write_pos=" << write_pos + << ", safe_pos=" << safe_pos << dendl; + _do_flush(); + } + + _issue_read(len); + } +} + + +/* + * _is_readable() - return true if next entry is ready. + */ +bool Journaler::_is_readable() +{ + // anything to read? 
+ if (read_pos == write_pos) + return false; + + // Check if the retrieve bytestream has enough for an entry + uint64_t need; + if (journal_stream.readable(read_buf, &need)) { + return true; + } + + ldout (cct, 10) << "_is_readable read_buf.length() == " << read_buf.length() + << ", but need " << need << " for next entry; fetch_len is " + << fetch_len << dendl; + + // partial fragment at the end? + if (received_pos == write_pos) { + ldout(cct, 10) << "is_readable() detected partial entry at tail, " + "adjusting write_pos to " << read_pos << dendl; + + // adjust write_pos + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = read_pos; + ceph_assert(write_buf.length() == 0); + ceph_assert(waitfor_safe.empty()); + + // reset read state + requested_pos = received_pos = read_pos; + read_buf.clear(); + + // FIXME: truncate on disk? + + return false; + } + + if (need > fetch_len) { + temp_fetch_len = need; + ldout(cct, 10) << "_is_readable noting temp_fetch_len " << temp_fetch_len + << dendl; + } + + ldout(cct, 10) << "_is_readable: not readable, returning false" << dendl; + return false; +} + +/* + * is_readable() - kickstart prefetch, too + */ +bool Journaler::is_readable() +{ + lock_guard l(lock); + + if (error != 0) { + return false; + } + + bool r = readable; + _prefetch(); + return r; +} + +class Journaler::C_EraseFinish : public Context { + Journaler *journaler; + C_OnFinisher *completion; + public: + C_EraseFinish(Journaler *j, C_OnFinisher *c) : journaler(j), completion(c) {} + void finish(int r) override { + journaler->_finish_erase(r, completion); + } +}; + +/** + * Entirely erase the journal, including header. For use when you + * have already made a copy of the journal somewhere else. 
+ */ +void Journaler::erase(Context *completion) +{ + lock_guard l(lock); + + // Async delete the journal data + uint64_t first = trimmed_pos / get_layout_period(); + uint64_t num = (write_pos - trimmed_pos) / get_layout_period() + 2; + filer.purge_range(ino, &layout, SnapContext(), first, num, + ceph::real_clock::now(), 0, + wrap_finisher(new C_EraseFinish( + this, wrap_finisher(completion)))); + + // We will not start the operation to delete the header until + // _finish_erase has seen the data deletion succeed: otherwise if + // there was an error deleting data we might prematurely delete the + // header thereby lose our reference to the data. +} + +void Journaler::_finish_erase(int data_result, C_OnFinisher *completion) +{ + lock_guard l(lock); + if (is_stopping()) { + completion->complete(-EAGAIN); + return; + } + + if (data_result == 0) { + // Async delete the journal header + filer.purge_range(ino, &layout, SnapContext(), 0, 1, + ceph::real_clock::now(), + 0, wrap_finisher(completion)); + } else { + lderr(cct) << "Failed to delete journal " << ino << " data: " + << cpp_strerror(data_result) << dendl; + completion->complete(data_result); + } +} + +/* try_read_entry(bl) + * read entry into bl if it's ready. + * otherwise, do nothing. 
+ */ +bool Journaler::try_read_entry(bufferlist& bl) +{ + lock_guard l(lock); + + if (!readable) { + ldout(cct, 10) << "try_read_entry at " << read_pos << " not readable" + << dendl; + return false; + } + + uint64_t start_ptr; + size_t consumed; + try { + consumed = journal_stream.read(read_buf, &bl, &start_ptr); + if (stream_format >= JOURNAL_FORMAT_RESILIENT) { + ceph_assert(start_ptr == read_pos); + } + } catch (const buffer::error &e) { + lderr(cct) << __func__ << ": decode error from journal_stream" << dendl; + error = -EINVAL; + return false; + } + + ldout(cct, 10) << "try_read_entry at " << read_pos << " read " + << read_pos << "~" << consumed << " (have " + << read_buf.length() << ")" << dendl; + + read_pos += consumed; + try { + // We were readable, we might not be any more + readable = _is_readable(); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << ": decode error from _is_readable" << dendl; + error = -EINVAL; + return false; + } + + // prefetch? + _prefetch(); + + // If bufferlist consists of discontiguous memory, decoding types whose + // denc_traits needs contiguous memory is inefficient. 
The bufferlist may + // get copied to temporary memory multiple times (copy_shallow() in + // src/include/denc.h actually does deep copy) + if (bl.get_num_buffers() > 1) + bl.rebuild(); + return true; +} + +void Journaler::wait_for_readable(Context *onreadable) +{ + lock_guard l(lock); + if (is_stopping()) { + finisher->queue(onreadable, -EAGAIN); + return; + } + + ceph_assert(on_readable == 0); + if (!readable) { + ldout(cct, 10) << "wait_for_readable at " << read_pos << " onreadable " + << onreadable << dendl; + on_readable = wrap_finisher(onreadable); + } else { + // race with OSD reply + finisher->queue(onreadable, 0); + } +} + +bool Journaler::have_waiter() const +{ + return on_readable != nullptr; +} + + + + +/***************** TRIMMING *******************/ + + +class Journaler::C_Trim : public Context { + Journaler *ls; + uint64_t to; +public: + C_Trim(Journaler *l, int64_t t) : ls(l), to(t) {} + void finish(int r) override { + ls->_finish_trim(r, to); + } +}; + +void Journaler::trim() +{ + lock_guard l(lock); + _trim(); +} + +void Journaler::_trim() +{ + if (is_stopping()) + return; + + ceph_assert(!readonly); + uint64_t period = get_layout_period(); + uint64_t trim_to = last_committed.expire_pos; + trim_to -= trim_to % period; + ldout(cct, 10) << "trim last_commited head was " << last_committed + << ", can trim to " << trim_to + << dendl; + if (trim_to == 0 || trim_to == trimming_pos) { + ldout(cct, 10) << "trim already trimmed/trimming to " + << trimmed_pos << "/" << trimming_pos << dendl; + return; + } + + if (trimming_pos > trimmed_pos) { + ldout(cct, 10) << "trim already trimming atm, try again later. 
" + "trimmed/trimming is " << trimmed_pos << "/" << trimming_pos << dendl; + return; + } + + // trim + ceph_assert(trim_to <= write_pos); + ceph_assert(trim_to <= expire_pos); + ceph_assert(trim_to > trimming_pos); + ldout(cct, 10) << "trim trimming to " << trim_to + << ", trimmed/trimming/expire are " + << trimmed_pos << "/" << trimming_pos << "/" << expire_pos + << dendl; + + // delete range of objects + uint64_t first = trimming_pos / period; + uint64_t num = (trim_to - trimming_pos) / period; + SnapContext snapc; + filer.purge_range(ino, &layout, snapc, first, num, + ceph::real_clock::now(), 0, + wrap_finisher(new C_Trim(this, trim_to))); + trimming_pos = trim_to; +} + +void Journaler::_finish_trim(int r, uint64_t to) +{ + lock_guard l(lock); + + ceph_assert(!readonly); + ldout(cct, 10) << "_finish_trim trimmed_pos was " << trimmed_pos + << ", trimmed/trimming/expire now " + << to << "/" << trimming_pos << "/" << expire_pos + << dendl; + if (r < 0 && r != -ENOENT) { + lderr(cct) << "_finish_trim got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + + ceph_assert(r >= 0 || r == -ENOENT); + + ceph_assert(to <= trimming_pos); + ceph_assert(to > trimmed_pos); + trimmed_pos = to; +} + +void Journaler::handle_write_error(int r) +{ + // lock is locked + + lderr(cct) << "handle_write_error " << cpp_strerror(r) << dendl; + if (on_write_error) { + on_write_error->complete(r); + on_write_error = NULL; + called_write_error = true; + } else if (called_write_error) { + /* We don't call error handler more than once, subsequent errors + * are dropped -- this is okay as long as the error handler does + * something dramatic like respawn */ + lderr(cct) << __func__ << ": multiple write errors, handler already called" + << dendl; + } else { + ceph_abort_msg("unhandled write error"); + } +} + + +/** + * Test whether the 'read_buf' byte stream has enough data to read + * an entry + * + * sets 'next_envelope_size' to the number of bytes needed to advance (enough 
+ * to get the next header if header was unavailable, or enough to get the whole + * next entry if the header was available but the body wasn't). + */ +bool JournalStream::readable(bufferlist &read_buf, uint64_t *need) const +{ + ceph_assert(need != NULL); + + uint32_t entry_size = 0; + uint64_t entry_sentinel = 0; + auto p = read_buf.cbegin(); + + // Do we have enough data to decode an entry prefix? + if (format >= JOURNAL_FORMAT_RESILIENT) { + *need = sizeof(entry_size) + sizeof(entry_sentinel); + } else { + *need = sizeof(entry_size); + } + if (read_buf.length() >= *need) { + if (format >= JOURNAL_FORMAT_RESILIENT) { + decode(entry_sentinel, p); + if (entry_sentinel != sentinel) { + throw buffer::malformed_input("Invalid sentinel"); + } + } + + decode(entry_size, p); + } else { + return false; + } + + // Do we have enough data to decode an entry prefix, payload and suffix? + if (format >= JOURNAL_FORMAT_RESILIENT) { + *need = JOURNAL_ENVELOPE_RESILIENT + entry_size; + } else { + *need = JOURNAL_ENVELOPE_LEGACY + entry_size; + } + if (read_buf.length() >= *need) { + return true; // No more bytes needed + } + + return false; +} + + +/** + * Consume one entry from a journal byte stream 'from', splicing a + * serialized LogEvent blob into 'entry'. + * + * 'entry' must be non null and point to an empty bufferlist. + * + * 'from' must contain sufficient valid data (i.e. readable is true). + * + * 'start_ptr' will be set to the entry's start pointer, if the collection + * format provides it. It may not be null. + * + * @returns The number of bytes consumed from the `from` byte stream. Note + * that this is not equal to the length of `entry`, which contains + * the inner serialized LogEvent and not the envelope. 
+ */ +size_t JournalStream::read(bufferlist &from, bufferlist *entry, + uint64_t *start_ptr) +{ + ceph_assert(start_ptr != NULL); + ceph_assert(entry != NULL); + ceph_assert(entry->length() == 0); + + uint32_t entry_size = 0; + + // Consume envelope prefix: entry_size and entry_sentinel + auto from_ptr = from.cbegin(); + if (format >= JOURNAL_FORMAT_RESILIENT) { + uint64_t entry_sentinel = 0; + decode(entry_sentinel, from_ptr); + // Assertion instead of clean check because of precondition of this + // fn is that readable() already passed + ceph_assert(entry_sentinel == sentinel); + } + decode(entry_size, from_ptr); + + // Read out the payload + from_ptr.copy(entry_size, *entry); + + // Consume the envelope suffix (start_ptr) + if (format >= JOURNAL_FORMAT_RESILIENT) { + decode(*start_ptr, from_ptr); + } else { + *start_ptr = 0; + } + + // Trim the input buffer to discard the bytes we have consumed + from.splice(0, from_ptr.get_off()); + + return from_ptr.get_off(); +} + + +/** + * Append one entry + */ +size_t JournalStream::write(bufferlist &entry, bufferlist *to, + uint64_t const &start_ptr) +{ + ceph_assert(to != NULL); + + uint32_t const entry_size = entry.length(); + if (format >= JOURNAL_FORMAT_RESILIENT) { + encode(sentinel, *to); + } + encode(entry_size, *to); + to->claim_append(entry); + if (format >= JOURNAL_FORMAT_RESILIENT) { + encode(start_ptr, *to); + } + + if (format >= JOURNAL_FORMAT_RESILIENT) { + return JOURNAL_ENVELOPE_RESILIENT + entry_size; + } else { + return JOURNAL_ENVELOPE_LEGACY + entry_size; + } +} + +/** + * set write error callback + * + * Set a callback/context to trigger if we get a write error from + * the objecter. This may be from an explicit request (e.g., flush) + * or something async the journaler did on its own (e.g., journal + * header update). + * + * It is only used once; if the caller continues to use the + * Journaler and wants to hear about errors, it needs to reset the + * error_handler. 
+ * + * @param c callback/context to trigger on error + */ +void Journaler::set_write_error_handler(Context *c) { + lock_guard l(lock); + ceph_assert(!on_write_error); + on_write_error = wrap_finisher(c); + called_write_error = false; +} + + +/** + * Wrap a context in a C_OnFinisher, if it is non-NULL + * + * Utility function to avoid lots of error-prone and verbose + * NULL checking on contexts passed in. + */ +C_OnFinisher *Journaler::wrap_finisher(Context *c) +{ + if (c != NULL) { + return new C_OnFinisher(c, finisher); + } else { + return NULL; + } +} + +void Journaler::shutdown() +{ + lock_guard l(lock); + + ldout(cct, 1) << __func__ << dendl; + + state = STATE_STOPPING; + readable = false; + + // Kick out anyone reading from journal + error = -EAGAIN; + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(-EAGAIN); + } + + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, -ESHUTDOWN); + + std::map<uint64_t, std::list<Context*> >::iterator i; + for (i = waitfor_safe.begin(); i != waitfor_safe.end(); ++i) { + finish_contexts(cct, i->second, -EAGAIN); + } + waitfor_safe.clear(); +} + diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h new file mode 100644 index 00000000..e3cd9e6c --- /dev/null +++ b/src/osdc/Journaler.h @@ -0,0 +1,540 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* Journaler + * + * This class stripes a serial log over objects on the store. 
Four + * logical pointers: + * + * write_pos - where we're writing new entries + * unused_field - where we're reading old entries + * expire_pos - what is deemed "old" by user + * trimmed_pos - where we're expiring old items + * + * trimmed_pos <= expire_pos <= unused_field <= write_pos. + * + * Often, unused_field <= write_pos (as with MDS log). During + * recovery, write_pos is undefined until the end of the log is + * discovered. + * + * A "head" struct at the beginning of the log is used to store + * metadata at regular intervals. The basic invariants include: + * + * head.unused_field <= unused_field -- the head may "lag", since + * it's updated lazily. + * head.write_pos <= write_pos + * head.expire_pos <= expire_pos + * head.trimmed_pos <= trimmed_pos + * + * More significantly, + * + * head.expire_pos >= trimmed_pos -- this ensures we can find the + * "beginning" of the log as last + * recorded, before it is trimmed. + * trimming will block until a + * sufficiently current expire_pos + * is committed. + * + * To recover log state, we simply start at the last write_pos in the + * head, and probe the object sequence sizes until we reach the end. + * + * Head struct is stored in the first object. Actual journal starts + * after layout.period() bytes. 
+ * + */ + +#ifndef CEPH_JOURNALER_H +#define CEPH_JOURNALER_H + +#include <list> +#include <map> + +#include "Objecter.h" +#include "Filer.h" + +#include "common/Timer.h" +#include "common/Throttle.h" + +class CephContext; +class Context; +class PerfCounters; +class Finisher; +class C_OnFinisher; + +typedef __u8 stream_format_t; + +// Legacy envelope is leading uint32_t size +enum StreamFormat { + JOURNAL_FORMAT_LEGACY = 0, + JOURNAL_FORMAT_RESILIENT = 1, + // Insert new formats here, before COUNT + JOURNAL_FORMAT_COUNT +}; + +// Highest journal format version that we support +#define JOURNAL_FORMAT_MAX (JOURNAL_FORMAT_COUNT - 1) + +// Legacy envelope is leading uint32_t size +#define JOURNAL_ENVELOPE_LEGACY (sizeof(uint32_t)) + +// Resilient envelope is leading uint64_t sentinel, uint32_t size, +// trailing uint64_t start_ptr +#define JOURNAL_ENVELOPE_RESILIENT (sizeof(uint32_t) + sizeof(uint64_t) + \ + sizeof(uint64_t)) + +/** + * Represents a collection of entries serialized in a byte stream. + * + * Each entry consists of: + * - a blob (used by the next level up as a serialized LogEvent) + * - a uint64_t (used by the next level up as a pointer to the start + * of the entry in the collection bytestream) + */ +class JournalStream +{ + stream_format_t format; + + public: + JournalStream(stream_format_t format_) : format(format_) {} + + void set_format(stream_format_t format_) {format = format_;} + + bool readable(bufferlist &bl, uint64_t *need) const; + size_t read(bufferlist &from, bufferlist *to, uint64_t *start_ptr); + size_t write(bufferlist &entry, bufferlist *to, uint64_t const &start_ptr); + size_t get_envelope_size() const { + if (format >= JOURNAL_FORMAT_RESILIENT) { + return JOURNAL_ENVELOPE_RESILIENT; + } else { + return JOURNAL_ENVELOPE_LEGACY; + } + } + + // A magic number for the start of journal entries, so that we can + // identify them in damaged journals. 
+ static const uint64_t sentinel = 0x3141592653589793; +}; + + +class Journaler { +public: + // this goes at the head of the log "file". + class Header { + public: + uint64_t trimmed_pos; + uint64_t expire_pos; + uint64_t unused_field; + uint64_t write_pos; + string magic; + file_layout_t layout; //< The mapping from byte stream offsets + // to RADOS objects + stream_format_t stream_format; //< The encoding of LogEvents + // within the journal byte stream + + Header(const char *m="") : + trimmed_pos(0), expire_pos(0), unused_field(0), write_pos(0), magic(m), + stream_format(-1) { + } + + void encode(bufferlist &bl) const { + ENCODE_START(2, 2, bl); + encode(magic, bl); + encode(trimmed_pos, bl); + encode(expire_pos, bl); + encode(unused_field, bl); + encode(write_pos, bl); + encode(layout, bl, 0); // encode in legacy format + encode(stream_format, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(magic, bl); + decode(trimmed_pos, bl); + decode(expire_pos, bl); + decode(unused_field, bl); + decode(write_pos, bl); + decode(layout, bl); + if (struct_v > 1) { + decode(stream_format, bl); + } else { + stream_format = JOURNAL_FORMAT_LEGACY; + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->open_object_section("journal_header"); + { + f->dump_string("magic", magic); + f->dump_unsigned("write_pos", write_pos); + f->dump_unsigned("expire_pos", expire_pos); + f->dump_unsigned("trimmed_pos", trimmed_pos); + f->dump_unsigned("stream_format", stream_format); + f->dump_object("layout", layout); + } + f->close_section(); // journal_header + } + + static void generate_test_instances(list<Header*> &ls) + { + ls.push_back(new Header()); + + ls.push_back(new Header()); + ls.back()->trimmed_pos = 1; + ls.back()->expire_pos = 2; + ls.back()->unused_field = 3; + ls.back()->write_pos = 4; + ls.back()->magic = "magique"; + + ls.push_back(new Header()); + ls.back()->stream_format = 
JOURNAL_FORMAT_RESILIENT; + } + }; + WRITE_CLASS_ENCODER(Header) + + uint32_t get_stream_format() const { + return stream_format; + } + + Header last_committed; + +private: + // me + CephContext *cct; + std::mutex lock; + const std::string name; + typedef std::lock_guard<std::mutex> lock_guard; + typedef std::unique_lock<std::mutex> unique_lock; + Finisher *finisher; + Header last_written; + inodeno_t ino; + int64_t pg_pool; + bool readonly; + file_layout_t layout; + uint32_t stream_format; + JournalStream journal_stream; + + const char *magic; + Objecter *objecter; + Filer filer; + + PerfCounters *logger; + int logger_key_lat; + + class C_DelayFlush; + C_DelayFlush *delay_flush_event; + /* + * Do a flush as a result of a C_DelayFlush context. + */ + void _do_delayed_flush() + { + ceph_assert(delay_flush_event != NULL); + lock_guard l(lock); + delay_flush_event = NULL; + _do_flush(); + } + + // my state + static const int STATE_UNDEF = 0; + static const int STATE_READHEAD = 1; + static const int STATE_PROBING = 2; + static const int STATE_ACTIVE = 3; + static const int STATE_REREADHEAD = 4; + static const int STATE_REPROBING = 5; + static const int STATE_STOPPING = 6; + + int state; + int error; + + void _write_head(Context *oncommit=NULL); + void _wait_for_flush(Context *onsafe); + void _trim(); + + // header + ceph::real_time last_wrote_head; + void _finish_write_head(int r, Header &wrote, C_OnFinisher *oncommit); + class C_WriteHead; + friend class C_WriteHead; + + void _reread_head(Context *onfinish); + void _set_layout(file_layout_t const *l); + list<Context*> waitfor_recover; + void _read_head(Context *on_finish, bufferlist *bl); + void _finish_read_head(int r, bufferlist& bl); + void _finish_reread_head(int r, bufferlist& bl, Context *finish); + void _probe(Context *finish, uint64_t *end); + void _finish_probe_end(int r, uint64_t end); + void _reprobe(C_OnFinisher *onfinish); + void _finish_reprobe(int r, uint64_t end, C_OnFinisher *onfinish); + void 
_finish_reread_head_and_probe(int r, C_OnFinisher *onfinish); + class C_ReadHead; + friend class C_ReadHead; + class C_ProbeEnd; + friend class C_ProbeEnd; + class C_RereadHead; + friend class C_RereadHead; + class C_ReProbe; + friend class C_ReProbe; + class C_RereadHeadProbe; + friend class C_RereadHeadProbe; + + // writer + uint64_t prezeroing_pos; + uint64_t prezero_pos; ///< we zero journal space ahead of write_pos to + // avoid problems with tail probing + uint64_t write_pos; ///< logical write position, where next entry + // will go + uint64_t flush_pos; ///< where we will flush. if + /// write_pos>flush_pos, we're buffering writes. + uint64_t safe_pos; ///< what has been committed safely to disk. + + uint64_t next_safe_pos; /// start position of the first entry that isn't + /// being fully flushed. If we don't flush any + // partial entry, it's equal to flush_pos. + + bufferlist write_buf; ///< write buffer. flush_pos + + /// write_buf.length() == write_pos. + + // protect write_buf from bufferlist _len overflow + Throttle write_buf_throttle; + + uint64_t waiting_for_zero_pos; + interval_set<uint64_t> pending_zero; // non-contig bits we've zeroed + list<Context*> waitfor_prezero; + + std::map<uint64_t, uint64_t> pending_safe; // flush_pos -> safe_pos + // when safe through given offset + std::map<uint64_t, std::list<Context*> > waitfor_safe; + + void _flush(C_OnFinisher *onsafe); + void _do_flush(unsigned amount=0); + void _finish_flush(int r, uint64_t start, ceph::real_time stamp); + class C_Flush; + friend class C_Flush; + + // reader + uint64_t read_pos; // logical read position, where next entry starts. + uint64_t requested_pos; // what we've requested from OSD. + uint64_t received_pos; // what we've received from OSD. + // read buffer. unused_field + read_buf.length() == prefetch_pos. 
+ bufferlist read_buf; + + map<uint64_t,bufferlist> prefetch_buf; + + uint64_t fetch_len; // how much to read at a time + uint64_t temp_fetch_len; + + // for wait_for_readable() + C_OnFinisher *on_readable; + C_OnFinisher *on_write_error; + bool called_write_error; + + // read completion callback + void _finish_read(int r, uint64_t offset, uint64_t length, bufferlist &bl); + void _finish_retry_read(int r); + void _assimilate_prefetch(); + void _issue_read(uint64_t len); // read some more + void _prefetch(); // maybe read ahead + class C_Read; + friend class C_Read; + class C_RetryRead; + friend class C_RetryRead; + + // trimmer + uint64_t expire_pos; // what we're allowed to trim to + uint64_t trimming_pos; // what we've requested to trim through + uint64_t trimmed_pos; // what has been trimmed + + bool readable; + + void _finish_trim(int r, uint64_t to); + class C_Trim; + friend class C_Trim; + + void _issue_prezero(); + void _finish_prezero(int r, uint64_t from, uint64_t len); + friend struct C_Journaler_Prezero; + + // only init_headers when following or first reading off-disk + void init_headers(Header& h) { + ceph_assert(readonly || + state == STATE_READHEAD || + state == STATE_REREADHEAD); + last_written = last_committed = h; + } + + /** + * handle a write error + * + * called when we get an objecter error on a write. 
+ * + * @param r error code + */ + void handle_write_error(int r); + + bool _is_readable(); + + void _finish_erase(int data_result, C_OnFinisher *completion); + class C_EraseFinish; + friend class C_EraseFinish; + + C_OnFinisher *wrap_finisher(Context *c); + + uint32_t write_iohint; // the fadvise flags for write op, see + // CEPH_OSD_OP_FADIVSE_* + +public: + Journaler(const std::string &name_, inodeno_t ino_, int64_t pool, + const char *mag, Objecter *obj, PerfCounters *l, int lkey, Finisher *f) : + last_committed(mag), + cct(obj->cct), name(name_), finisher(f), last_written(mag), + ino(ino_), pg_pool(pool), readonly(true), + stream_format(-1), journal_stream(-1), + magic(mag), + objecter(obj), filer(objecter, f), logger(l), logger_key_lat(lkey), + delay_flush_event(0), + state(STATE_UNDEF), error(0), + prezeroing_pos(0), prezero_pos(0), write_pos(0), flush_pos(0), + safe_pos(0), next_safe_pos(0), + write_buf_throttle(cct, "write_buf_throttle", UINT_MAX - (UINT_MAX >> 3)), + waiting_for_zero_pos(0), + read_pos(0), requested_pos(0), received_pos(0), + fetch_len(0), temp_fetch_len(0), + on_readable(0), on_write_error(NULL), called_write_error(false), + expire_pos(0), trimming_pos(0), trimmed_pos(0), readable(false), + write_iohint(0) + { + } + + /* reset + * + * NOTE: we assume the caller knows/has ensured that any objects in + * our sequence do not exist.. e.g. after a MKFS. this is _not_ an + * "erase" method. 
+ */ + void reset() { + lock_guard l(lock); + ceph_assert(state == STATE_ACTIVE); + + readonly = true; + delay_flush_event = NULL; + state = STATE_UNDEF; + error = 0; + prezeroing_pos = 0; + prezero_pos = 0; + write_pos = 0; + flush_pos = 0; + safe_pos = 0; + next_safe_pos = 0; + read_pos = 0; + requested_pos = 0; + received_pos = 0; + fetch_len = 0; + ceph_assert(!on_readable); + expire_pos = 0; + trimming_pos = 0; + trimmed_pos = 0; + waiting_for_zero_pos = 0; + } + + // Asynchronous operations + // ======================= + void erase(Context *completion); + void create(file_layout_t *layout, stream_format_t const sf); + void recover(Context *onfinish); + void reread_head(Context *onfinish); + void reread_head_and_probe(Context *onfinish); + void write_head(Context *onsave=0); + void wait_for_flush(Context *onsafe = 0); + void flush(Context *onsafe = 0); + void wait_for_readable(Context *onfinish); + bool have_waiter() const; + void wait_for_prezero(Context *onfinish); + + // Synchronous setters + // =================== + void set_layout(file_layout_t const *l); + void set_readonly(); + void set_writeable(); + void set_write_pos(uint64_t p) { + lock_guard l(lock); + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = p; + } + void set_read_pos(uint64_t p) { + lock_guard l(lock); + // we can't cope w/ in-progress read right now. 
+ ceph_assert(requested_pos == received_pos); + read_pos = requested_pos = received_pos = p; + read_buf.clear(); + } + uint64_t append_entry(bufferlist& bl); + void set_expire_pos(uint64_t ep) { + lock_guard l(lock); + expire_pos = ep; + } + void set_trimmed_pos(uint64_t p) { + lock_guard l(lock); + trimming_pos = trimmed_pos = p; + } + + bool _write_head_needed(); + bool write_head_needed() { + lock_guard l(lock); + return _write_head_needed(); + } + + + void trim(); + void trim_tail() { + lock_guard l(lock); + + ceph_assert(!readonly); + _issue_prezero(); + } + + void set_write_error_handler(Context *c); + + void set_write_iohint(uint32_t iohint_flags) { + write_iohint = iohint_flags; + } + /** + * Cause any ongoing waits to error out with -EAGAIN, set error + * to -EAGAIN. + */ + void shutdown(); +public: + + // Synchronous getters + // =================== + // TODO: need some locks on reads for true safety + uint64_t get_layout_period() const { + return layout.get_period(); + } + file_layout_t& get_layout() { return layout; } + bool is_active() { return state == STATE_ACTIVE; } + bool is_stopping() { return state == STATE_STOPPING; } + int get_error() { return error; } + bool is_readonly() { return readonly; } + bool is_readable(); + bool try_read_entry(bufferlist& bl); + uint64_t get_write_pos() const { return write_pos; } + uint64_t get_write_safe_pos() const { return safe_pos; } + uint64_t get_read_pos() const { return read_pos; } + uint64_t get_expire_pos() const { return expire_pos; } + uint64_t get_trimmed_pos() const { return trimmed_pos; } +}; +WRITE_CLASS_ENCODER(Journaler::Header) + +#endif diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc new file mode 100644 index 00000000..c326a02a --- /dev/null +++ b/src/osdc/ObjectCacher.cc @@ -0,0 +1,2800 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <limits.h> + +#include "msg/Messenger.h" +#include "ObjectCacher.h" +#include 
"WritebackHandler.h" +#include "common/errno.h" +#include "common/perf_counters.h" + +#include "include/ceph_assert.h" + +#define MAX_FLUSH_UNDER_LOCK 20 ///< max bh's we start writeback on +#define BUFFER_MEMORY_WEIGHT CEPH_PAGE_SHIFT // memory usage of BufferHead, count in (1<<n) + +using std::chrono::seconds; + /// while holding the lock + +/*** ObjectCacher::BufferHead ***/ + + +/*** ObjectCacher::Object ***/ + +#define dout_subsys ceph_subsys_objectcacher +#undef dout_prefix +#define dout_prefix *_dout << "objectcacher.object(" << oid << ") " + + + +class ObjectCacher::C_ReadFinish : public Context { + ObjectCacher *oc; + int64_t poolid; + sobject_t oid; + loff_t start; + uint64_t length; + xlist<C_ReadFinish*>::item set_item; + bool trust_enoent; + ceph_tid_t tid; + ZTracer::Trace trace; + +public: + bufferlist bl; + C_ReadFinish(ObjectCacher *c, Object *ob, ceph_tid_t t, loff_t s, + uint64_t l, const ZTracer::Trace &trace) : + oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l), + set_item(this), trust_enoent(true), + tid(t), trace(trace) { + ob->reads.push_back(&set_item); + } + + void finish(int r) override { + oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent); + trace.event("finish"); + + // object destructor clears the list + if (set_item.is_on_list()) + set_item.remove_myself(); + } + + void distrust_enoent() { + trust_enoent = false; + } +}; + +class ObjectCacher::C_RetryRead : public Context { + ObjectCacher *oc; + OSDRead *rd; + ObjectSet *oset; + Context *onfinish; + ZTracer::Trace trace; +public: + C_RetryRead(ObjectCacher *_oc, OSDRead *r, ObjectSet *os, Context *c, + const ZTracer::Trace &trace) + : oc(_oc), rd(r), oset(os), onfinish(c), trace(trace) { + } + void finish(int r) override { + if (r >= 0) { + r = oc->_readx(rd, oset, onfinish, false, &trace); + } + + if (r == 0) { + // read is still in-progress + return; + } + + trace.event("finish"); + if (onfinish) { + onfinish->complete(r); + } + } +}; + 
+ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, + loff_t off) +{ + ceph_assert(oc->lock.is_locked()); + ldout(oc->cct, 20) << "split " << *left << " at " << off << dendl; + + // split off right + ObjectCacher::BufferHead *right = new BufferHead(this); + + //inherit and if later access, this auto clean. + right->set_dontneed(left->get_dontneed()); + right->set_nocache(left->get_nocache()); + + right->last_write_tid = left->last_write_tid; + right->last_read_tid = left->last_read_tid; + right->set_state(left->get_state()); + right->snapc = left->snapc; + right->set_journal_tid(left->journal_tid); + + loff_t newleftlen = off - left->start(); + right->set_start(off); + right->set_length(left->length() - newleftlen); + + // shorten left + oc->bh_stat_sub(left); + left->set_length(newleftlen); + oc->bh_stat_add(left); + + // add right + oc->bh_add(this, right); + + // split buffers too + bufferlist bl; + bl.claim(left->bl); + if (bl.length()) { + ceph_assert(bl.length() == (left->length() + right->length())); + right->bl.substr_of(bl, left->length(), right->length()); + left->bl.substr_of(bl, 0, left->length()); + } + + // move read waiters + if (!left->waitfor_read.empty()) { + map<loff_t, list<Context*> >::iterator start_remove + = left->waitfor_read.begin(); + while (start_remove != left->waitfor_read.end() && + start_remove->first < right->start()) + ++start_remove; + for (map<loff_t, list<Context*> >::iterator p = start_remove; + p != left->waitfor_read.end(); ++p) { + ldout(oc->cct, 20) << "split moving waiters at byte " << p->first + << " to right bh" << dendl; + right->waitfor_read[p->first].swap( p->second ); + ceph_assert(p->second.empty()); + } + left->waitfor_read.erase(start_remove, left->waitfor_read.end()); + } + + ldout(oc->cct, 20) << "split left is " << *left << dendl; + ldout(oc->cct, 20) << "split right is " << *right << dendl; + return right; +} + + +void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) 
+{ + ceph_assert(oc->lock.is_locked()); + + ldout(oc->cct, 10) << "merge_left " << *left << " + " << *right << dendl; + if (left->get_journal_tid() == 0) { + left->set_journal_tid(right->get_journal_tid()); + } + right->set_journal_tid(0); + + oc->bh_remove(this, right); + oc->bh_stat_sub(left); + left->set_length(left->length() + right->length()); + oc->bh_stat_add(left); + + // data + left->bl.claim_append(right->bl); + + // version + // note: this is sorta busted, but should only be used for dirty buffers + left->last_write_tid = std::max( left->last_write_tid, right->last_write_tid ); + left->last_write = std::max( left->last_write, right->last_write ); + + left->set_dontneed(right->get_dontneed() ? left->get_dontneed() : false); + left->set_nocache(right->get_nocache() ? left->get_nocache() : false); + + // waiters + for (map<loff_t, list<Context*> >::iterator p = right->waitfor_read.begin(); + p != right->waitfor_read.end(); + ++p) + left->waitfor_read[p->first].splice(left->waitfor_read[p->first].begin(), + p->second ); + + // hose right + delete right; + + ldout(oc->cct, 10) << "merge_left result " << *left << dendl; +} + +bool ObjectCacher::Object::can_merge_bh(BufferHead *left, BufferHead *right) +{ + if (left->end() != right->start() || + left->get_state() != right->get_state() || + !left->can_merge_journal(right)) + return false; + if (left->is_tx() && left->last_write_tid != right->last_write_tid) + return false; + return true; +} + +void ObjectCacher::Object::try_merge_bh(BufferHead *bh) +{ + ceph_assert(oc->lock.is_locked()); + ldout(oc->cct, 10) << "try_merge_bh " << *bh << dendl; + + // do not merge rx buffers; last_read_tid may not match + if (bh->is_rx()) + return; + + // to the left? + map<loff_t,BufferHead*>::iterator p = data.find(bh->start()); + ceph_assert(p->second == bh); + if (p != data.begin()) { + --p; + if (can_merge_bh(p->second, bh)) { + merge_left(p->second, bh); + bh = p->second; + } else { + ++p; + } + } + // to the right? 
+ ceph_assert(p->second == bh); + ++p; + if (p != data.end() && can_merge_bh(bh, p->second)) + merge_left(bh, p->second); + + maybe_rebuild_buffer(bh); +} + +void ObjectCacher::Object::maybe_rebuild_buffer(BufferHead *bh) +{ + auto& bl = bh->bl; + if (bl.get_num_buffers() <= 1) + return; + + auto wasted = bl.get_wasted_space(); + if (wasted * 2 > bl.length() && + wasted > (1U << BUFFER_MEMORY_WEIGHT)) + bl.rebuild(); +} + +/* + * count bytes we have cached in given range + */ +bool ObjectCacher::Object::is_cached(loff_t cur, loff_t left) const +{ + ceph_assert(oc->lock.is_locked()); + map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(cur); + while (left > 0) { + if (p == data.end()) + return false; + + if (p->first <= cur) { + // have part of it + loff_t lenfromcur = std::min(p->second->end() - cur, left); + cur += lenfromcur; + left -= lenfromcur; + ++p; + continue; + } else if (p->first > cur) { + // gap + return false; + } else + ceph_abort(); + } + + return true; +} + +/* + * all cached data in this range[off, off+len] + */ +bool ObjectCacher::Object::include_all_cached_data(loff_t off, loff_t len) +{ + ceph_assert(oc->lock.is_locked()); + if (data.empty()) + return true; + map<loff_t, BufferHead*>::iterator first = data.begin(); + map<loff_t, BufferHead*>::reverse_iterator last = data.rbegin(); + if (first->second->start() >= off && last->second->end() <= (off + len)) + return true; + else + return false; +} + +/* + * map a range of bytes into buffer_heads. + * - create missing buffer_heads as necessary. 
+ */ +int ObjectCacher::Object::map_read(ObjectExtent &ex, + map<loff_t, BufferHead*>& hits, + map<loff_t, BufferHead*>& missing, + map<loff_t, BufferHead*>& rx, + map<loff_t, BufferHead*>& errors) +{ + ceph_assert(oc->lock.is_locked()); + ldout(oc->cct, 10) << "map_read " << ex.oid << " " + << ex.offset << "~" << ex.length << dendl; + + loff_t cur = ex.offset; + loff_t left = ex.length; + + map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(ex.offset); + while (left > 0) { + // at end? + if (p == data.end()) { + // rest is a miss. + BufferHead *n = new BufferHead(this); + n->set_start(cur); + n->set_length(left); + oc->bh_add(this, n); + if (complete) { + oc->mark_zero(n); + hits[cur] = n; + ldout(oc->cct, 20) << "map_read miss+complete+zero " << left << " left, " << *n << dendl; + } else { + missing[cur] = n; + ldout(oc->cct, 20) << "map_read miss " << left << " left, " << *n << dendl; + } + cur += left; + ceph_assert(cur == (loff_t)ex.offset + (loff_t)ex.length); + break; // no more. + } + + if (p->first <= cur) { + // have it (or part of it) + BufferHead *e = p->second; + + if (e->is_clean() || + e->is_dirty() || + e->is_tx() || + e->is_zero()) { + hits[cur] = e; // readable! + ldout(oc->cct, 20) << "map_read hit " << *e << dendl; + } else if (e->is_rx()) { + rx[cur] = e; // missing, not readable. + ldout(oc->cct, 20) << "map_read rx " << *e << dendl; + } else if (e->is_error()) { + errors[cur] = e; + ldout(oc->cct, 20) << "map_read error " << *e << dendl; + } else { + ceph_abort(); + } + + loff_t lenfromcur = std::min(e->end() - cur, left); + cur += lenfromcur; + left -= lenfromcur; + ++p; + continue; // more? + + } else if (p->first > cur) { + // gap.. 
miss + loff_t next = p->first; + BufferHead *n = new BufferHead(this); + loff_t len = std::min(next - cur, left); + n->set_start(cur); + n->set_length(len); + oc->bh_add(this,n); + if (complete) { + oc->mark_zero(n); + hits[cur] = n; + ldout(oc->cct, 20) << "map_read gap+complete+zero " << *n << dendl; + } else { + missing[cur] = n; + ldout(oc->cct, 20) << "map_read gap " << *n << dendl; + } + cur += std::min(left, n->length()); + left -= std::min(left, n->length()); + continue; // more? + } else { + ceph_abort(); + } + } + return 0; +} + +void ObjectCacher::Object::audit_buffers() +{ + loff_t offset = 0; + for (map<loff_t, BufferHead*>::const_iterator it = data.begin(); + it != data.end(); ++it) { + if (it->first != it->second->start()) { + lderr(oc->cct) << "AUDIT FAILURE: map position " << it->first + << " does not match bh start position: " + << *it->second << dendl; + ceph_assert(it->first == it->second->start()); + } + if (it->first < offset) { + lderr(oc->cct) << "AUDIT FAILURE: " << it->first << " " << *it->second + << " overlaps with previous bh " << *((--it)->second) + << dendl; + ceph_assert(it->first >= offset); + } + BufferHead *bh = it->second; + map<loff_t, list<Context*> >::const_iterator w_it; + for (w_it = bh->waitfor_read.begin(); + w_it != bh->waitfor_read.end(); ++w_it) { + if (w_it->first < bh->start() || + w_it->first >= bh->start() + bh->length()) { + lderr(oc->cct) << "AUDIT FAILURE: waiter at " << w_it->first + << " is not within bh " << *bh << dendl; + ceph_assert(w_it->first >= bh->start()); + ceph_assert(w_it->first < bh->start() + bh->length()); + } + } + offset = it->first + it->second->length(); + } +} + +/* + * map a range of extents on an object's buffer cache. + * - combine any bh's we're writing into one + * - break up bufferheads that don't fall completely within the range + * //no! - return a bh that includes the write. may also include + * other dirty data to left and/or right. 
+ */ +ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex, + ceph_tid_t tid) +{ + ceph_assert(oc->lock.is_locked()); + BufferHead *final = 0; + + ldout(oc->cct, 10) << "map_write oex " << ex.oid + << " " << ex.offset << "~" << ex.length << dendl; + + loff_t cur = ex.offset; + loff_t left = ex.length; + + map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(ex.offset); + while (left > 0) { + loff_t max = left; + + // at end ? + if (p == data.end()) { + if (final == NULL) { + final = new BufferHead(this); + replace_journal_tid(final, tid); + final->set_start( cur ); + final->set_length( max ); + oc->bh_add(this, final); + ldout(oc->cct, 10) << "map_write adding trailing bh " << *final << dendl; + } else { + oc->bh_stat_sub(final); + final->set_length(final->length() + max); + oc->bh_stat_add(final); + } + left -= max; + cur += max; + continue; + } + + ldout(oc->cct, 10) << "cur is " << cur << ", p is " << *p->second << dendl; + //oc->verify_stats(); + + if (p->first <= cur) { + BufferHead *bh = p->second; + ldout(oc->cct, 10) << "map_write bh " << *bh << " intersected" << dendl; + + if (p->first < cur) { + ceph_assert(final == 0); + if (cur + max >= bh->end()) { + // we want right bit (one splice) + final = split(bh, cur); // just split it, take right half. + maybe_rebuild_buffer(bh); + replace_journal_tid(final, tid); + ++p; + ceph_assert(p->second == final); + } else { + // we want middle bit (two splices) + final = split(bh, cur); + maybe_rebuild_buffer(bh); + ++p; + ceph_assert(p->second == final); + auto right = split(final, cur+max); + maybe_rebuild_buffer(right); + replace_journal_tid(final, tid); + } + } else { + ceph_assert(p->first == cur); + if (bh->length() <= max) { + // whole bufferhead, piece of cake. 
+ } else { + // we want left bit (one splice) + auto right = split(bh, cur + max); // just split + maybe_rebuild_buffer(right); + } + if (final) { + oc->mark_dirty(bh); + oc->mark_dirty(final); + --p; // move iterator back to final + ceph_assert(p->second == final); + replace_journal_tid(bh, tid); + merge_left(final, bh); + } else { + final = bh; + replace_journal_tid(final, tid); + } + } + + // keep going. + loff_t lenfromcur = final->end() - cur; + cur += lenfromcur; + left -= lenfromcur; + ++p; + continue; + } else { + // gap! + loff_t next = p->first; + loff_t glen = std::min(next - cur, max); + ldout(oc->cct, 10) << "map_write gap " << cur << "~" << glen << dendl; + if (final) { + oc->bh_stat_sub(final); + final->set_length(final->length() + glen); + oc->bh_stat_add(final); + } else { + final = new BufferHead(this); + replace_journal_tid(final, tid); + final->set_start( cur ); + final->set_length( glen ); + oc->bh_add(this, final); + } + + cur += glen; + left -= glen; + continue; // more? + } + } + + // set version + ceph_assert(final); + ceph_assert(final->get_journal_tid() == tid); + ldout(oc->cct, 10) << "map_write final is " << *final << dendl; + + return final; +} + +void ObjectCacher::Object::replace_journal_tid(BufferHead *bh, + ceph_tid_t tid) { + ceph_tid_t bh_tid = bh->get_journal_tid(); + + ceph_assert(tid == 0 || bh_tid <= tid); + if (bh_tid != 0 && bh_tid != tid) { + // inform journal that it should not expect a writeback from this extent + oc->writeback_handler.overwrite_extent(get_oid(), bh->start(), + bh->length(), bh_tid, tid); + } + bh->set_journal_tid(tid); +} + +void ObjectCacher::Object::truncate(loff_t s) +{ + ceph_assert(oc->lock.is_locked()); + ldout(oc->cct, 10) << "truncate " << *this << " to " << s << dendl; + + std::list<Context*> waiting_for_read; + while (!data.empty()) { + BufferHead *bh = data.rbegin()->second; + if (bh->end() <= s) + break; + + // split bh at truncation point? 
/*
 * Drop cached buffer heads in the byte range [off, off+len).
 *
 * Marks the object as existing and no longer complete, then walks the
 * bhs from data_lower_bound(off). Bhs straddling either edge of the
 * range are split so only the inside part is affected. For each bh
 * inside the range:
 *  - tx with a commit_gather: a sub-context is queued on waitfor_commit
 *    for the bh's write tid, and the bh is then removed;
 *  - rx: the bh is kept but given a new read tid, cleared, flagged
 *    nocache and marked zero;
 *  - otherwise: its read waiters are collected and the bh is removed.
 * Collected waiters are completed with 0 at the end.
 * Caller must hold the cache lock.
 */
void ObjectCacher::Object::discard(loff_t off, loff_t len,
                                   C_GatherBuilder* commit_gather)
{
  ceph_assert(oc->lock.is_locked());
  ldout(oc->cct, 10) << "discard " << *this << " " << off << "~" << len
		     << dendl;

  if (!exists) {
    ldout(oc->cct, 10) << " setting exists on " << *this << dendl;
    exists = true;
  }
  if (complete) {
    ldout(oc->cct, 10) << " clearing complete on " << *this << dendl;
    complete = false;
  }

  std::list<Context*> waiting_for_read;
  map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(off);
  while (p != data.end()) {
    BufferHead *bh = p->second;
    if (bh->start() >= off + len)
      break; // past the end of the discarded range

    // split bh at truncation point?
    if (bh->start() < off) {
      split(bh, off);
      maybe_rebuild_buffer(bh);
      // advance only after the split, so the next iteration handles the
      // new right-hand piece that begins at off
      ++p;
      continue;
    }

    ceph_assert(bh->start() >= off);
    if (bh->end() > off + len) {
      // keep the tail beyond the range as its own bh
      auto right = split(bh, off + len);
      maybe_rebuild_buffer(right);
    }

    // step past bh before it is possibly removed and deleted below
    ++p;
    ldout(oc->cct, 10) << "discard " << *this << " bh " << *bh << dendl;
    replace_journal_tid(bh, 0);

    if (bh->is_tx() && commit_gather != nullptr) {
      // wait for the writeback to commit
      waitfor_commit[bh->last_write_tid].emplace_back(commit_gather->new_sub());
    } else if (bh->is_rx()) {
      // cannot remove bh with in-flight read, but we can ensure the
      // read won't overwrite the discard
      bh->last_read_tid = ++oc->last_read_tid;
      bh->bl.clear();
      bh->set_nocache(true);
      oc->mark_zero(bh);
      // we should mark all Rx bh to zero
      continue;
    } else {
      // NOTE: the binding `off` shadows the parameter inside this loop only
      for ([[maybe_unused]] auto& [off, ctxs] : bh->waitfor_read) {
        waiting_for_read.splice(waiting_for_read.end(), ctxs);
      }
      bh->waitfor_read.clear();
    }

    oc->bh_remove(this, bh);
    delete bh;
  }
  if (!waiting_for_read.empty()) {
    ldout(oc->cct, 10) << "restarting reads post-discard" << dendl;
  }
  finish_contexts(oc->cct, waiting_for_read, 0); /* restart reads */
}
last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct), + stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0), + stat_missing(0), stat_error(0), stat_dirty_waiting(0), + stat_nr_dirty_waiters(0), reads_outstanding(0) +{ + perf_start(); + finisher.start(); + scattered_write = writeback_handler.can_scattered_write(); +} + +ObjectCacher::~ObjectCacher() +{ + finisher.stop(); + perf_stop(); + // we should be empty. + for (vector<ceph::unordered_map<sobject_t, Object *> >::iterator i + = objects.begin(); + i != objects.end(); + ++i) + ceph_assert(i->empty()); + ceph_assert(bh_lru_rest.lru_get_size() == 0); + ceph_assert(bh_lru_dirty.lru_get_size() == 0); + ceph_assert(ob_lru.lru_get_size() == 0); + ceph_assert(dirty_or_tx_bh.empty()); +} + +void ObjectCacher::perf_start() +{ + string n = "objectcacher-" + name; + PerfCountersBuilder plb(cct, n, l_objectcacher_first, l_objectcacher_last); + + plb.add_u64_counter(l_objectcacher_cache_ops_hit, + "cache_ops_hit", "Hit operations"); + plb.add_u64_counter(l_objectcacher_cache_ops_miss, + "cache_ops_miss", "Miss operations"); + plb.add_u64_counter(l_objectcacher_cache_bytes_hit, + "cache_bytes_hit", "Hit data", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_objectcacher_cache_bytes_miss, + "cache_bytes_miss", "Miss data", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_objectcacher_data_read, + "data_read", "Read data"); + plb.add_u64_counter(l_objectcacher_data_written, + "data_written", "Data written to cache"); + plb.add_u64_counter(l_objectcacher_data_flushed, + "data_flushed", "Data flushed"); + plb.add_u64_counter(l_objectcacher_overwritten_in_flush, + "data_overwritten_while_flushing", + "Data overwritten while flushing"); + plb.add_u64_counter(l_objectcacher_write_ops_blocked, "write_ops_blocked", + "Write operations, delayed due to dirty limits"); + plb.add_u64_counter(l_objectcacher_write_bytes_blocked, + "write_bytes_blocked", + "Write data blocked on dirty limit", 
NULL, 0, unit_t(UNIT_BYTES)); + plb.add_time(l_objectcacher_write_time_blocked, "write_time_blocked", + "Time spent blocking a write due to dirty limits"); + + perfcounter = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perfcounter); +} + +void ObjectCacher::perf_stop() +{ + ceph_assert(perfcounter); + cct->get_perfcounters_collection()->remove(perfcounter); + delete perfcounter; +} + +/* private */ +ObjectCacher::Object *ObjectCacher::get_object(sobject_t oid, + uint64_t object_no, + ObjectSet *oset, + object_locator_t &l, + uint64_t truncate_size, + uint64_t truncate_seq) +{ + // XXX: Add handling of nspace in object_locator_t in cache + ceph_assert(lock.is_locked()); + // have it? + if ((uint32_t)l.pool < objects.size()) { + if (objects[l.pool].count(oid)) { + Object *o = objects[l.pool][oid]; + o->object_no = object_no; + o->truncate_size = truncate_size; + o->truncate_seq = truncate_seq; + return o; + } + } else { + objects.resize(l.pool+1); + } + + // create it. + Object *o = new Object(this, oid, object_no, oset, l, truncate_size, + truncate_seq); + objects[l.pool][oid] = o; + ob_lru.lru_insert_top(o); + return o; +} + +void ObjectCacher::close_object(Object *ob) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 10) << "close_object " << *ob << dendl; + ceph_assert(ob->can_close()); + + // ok! 
+ ob_lru.lru_remove(ob); + objects[ob->oloc.pool].erase(ob->get_soid()); + ob->set_item.remove_myself(); + delete ob; +} + +void ObjectCacher::bh_read(BufferHead *bh, int op_flags, + const ZTracer::Trace &parent_trace) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 7) << "bh_read on " << *bh << " outstanding reads " + << reads_outstanding << dendl; + + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &trace_endpoint, &parent_trace); + trace.copy_name("bh_read " + bh->ob->get_oid().name); + trace.event("start"); + } + + mark_rx(bh); + bh->last_read_tid = ++last_read_tid; + + // finisher + C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid, + bh->start(), bh->length(), trace); + // go + writeback_handler.read(bh->ob->get_oid(), bh->ob->get_object_number(), + bh->ob->get_oloc(), bh->start(), bh->length(), + bh->ob->get_snap(), &onfinish->bl, + bh->ob->truncate_size, bh->ob->truncate_seq, + op_flags, trace, onfinish); + + ++reads_outstanding; +} + +void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, + ceph_tid_t tid, loff_t start, + uint64_t length, bufferlist &bl, int r, + bool trust_enoent) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 7) << "bh_read_finish " + << oid + << " tid " << tid + << " " << start << "~" << length + << " (bl is " << bl.length() << ")" + << " returned " << r + << " outstanding reads " << reads_outstanding + << dendl; + + if (r >= 0 && bl.length() < length) { + ldout(cct, 7) << "bh_read_finish " << oid << " padding " << start << "~" + << length << " with " << length - bl.length() << " bytes of zeroes" + << dendl; + bl.append_zero(length - bl.length()); + } + + list<Context*> ls; + int err = 0; + + if (objects[poolid].count(oid) == 0) { + ldout(cct, 7) << "bh_read_finish no object cache" << dendl; + } else { + Object *ob = objects[poolid][oid]; + + if (r == -ENOENT && !ob->complete) { + // wake up *all* rx waiters, or else we risk reordering + // identical reads. e.g. 
+ // read 1~1 + // reply to unrelated 3~1 -> !exists + // read 1~1 -> immediate ENOENT + // reply to first 1~1 -> ooo ENOENT + bool allzero = true; + for (map<loff_t, BufferHead*>::iterator p = ob->data.begin(); + p != ob->data.end(); ++p) { + BufferHead *bh = p->second; + for (map<loff_t, list<Context*> >::iterator p + = bh->waitfor_read.begin(); + p != bh->waitfor_read.end(); + ++p) + ls.splice(ls.end(), p->second); + bh->waitfor_read.clear(); + if (!bh->is_zero() && !bh->is_rx()) + allzero = false; + } + + // just pass through and retry all waiters if we don't trust + // -ENOENT for this read + if (trust_enoent) { + ldout(cct, 7) + << "bh_read_finish ENOENT, marking complete and !exists on " << *ob + << dendl; + ob->complete = true; + ob->exists = false; + + /* If all the bhs are effectively zero, get rid of them. All + * the waiters will be retried and get -ENOENT immediately, so + * it's safe to clean up the unneeded bh's now. Since we know + * it's safe to remove them now, do so, so they aren't hanging + *around waiting for more -ENOENTs from rados while the cache + * is being shut down. + * + * Only do this when all the bhs are rx or clean, to match the + * condition in _readx(). If there are any non-rx or non-clean + * bhs, _readx() will wait for the final result instead of + * returning -ENOENT immediately. + */ + if (allzero) { + ldout(cct, 10) + << "bh_read_finish ENOENT and allzero, getting rid of " + << "bhs for " << *ob << dendl; + map<loff_t, BufferHead*>::iterator p = ob->data.begin(); + while (p != ob->data.end()) { + BufferHead *bh = p->second; + // current iterator will be invalidated by bh_remove() + ++p; + bh_remove(ob, bh); + delete bh; + } + } + } + } + + // apply to bh's! 
+ loff_t opos = start; + while (true) { + map<loff_t, BufferHead*>::const_iterator p = ob->data_lower_bound(opos); + if (p == ob->data.end()) + break; + if (opos >= start+(loff_t)length) { + ldout(cct, 20) << "break due to opos " << opos << " >= start+length " + << start << "+" << length << "=" << start+(loff_t)length + << dendl; + break; + } + + BufferHead *bh = p->second; + ldout(cct, 20) << "checking bh " << *bh << dendl; + + // finishers? + for (map<loff_t, list<Context*> >::iterator it + = bh->waitfor_read.begin(); + it != bh->waitfor_read.end(); + ++it) + ls.splice(ls.end(), it->second); + bh->waitfor_read.clear(); + + if (bh->start() > opos) { + ldout(cct, 1) << "bh_read_finish skipping gap " + << opos << "~" << bh->start() - opos + << dendl; + opos = bh->start(); + continue; + } + + if (!bh->is_rx()) { + ldout(cct, 10) << "bh_read_finish skipping non-rx " << *bh << dendl; + opos = bh->end(); + continue; + } + + if (bh->last_read_tid != tid) { + ldout(cct, 10) << "bh_read_finish bh->last_read_tid " + << bh->last_read_tid << " != tid " << tid + << ", skipping" << dendl; + opos = bh->end(); + continue; + } + + ceph_assert(opos >= bh->start()); + ceph_assert(bh->start() == opos); // we don't merge rx bh's... yet! + ceph_assert(bh->length() <= start+(loff_t)length-opos); + + if (bh->error < 0) + err = bh->error; + + opos = bh->end(); + + if (r == -ENOENT) { + if (trust_enoent) { + ldout(cct, 10) << "bh_read_finish removing " << *bh << dendl; + bh_remove(ob, bh); + delete bh; + } else { + ldout(cct, 10) << "skipping unstrusted -ENOENT and will retry for " + << *bh << dendl; + } + continue; + } + + if (r < 0) { + bh->error = r; + mark_error(bh); + } else { + bh->bl.substr_of(bl, + bh->start() - start, + bh->length()); + mark_clean(bh); + } + + ldout(cct, 10) << "bh_read_finish read " << *bh << dendl; + + ob->try_merge_bh(bh); + } + } + + // called with lock held. 
+ ldout(cct, 20) << "finishing waiters " << ls << dendl; + + finish_contexts(cct, ls, err); + retry_waiting_reads(); + + --reads_outstanding; + read_cond.Signal(); +} + +void ObjectCacher::bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff, + int64_t *max_amount, int *max_count) +{ + list<BufferHead*> blist; + + int count = 0; + int64_t total_len = 0; + set<BufferHead*, BufferHead::ptr_lt>::iterator it = dirty_or_tx_bh.find(bh); + ceph_assert(it != dirty_or_tx_bh.end()); + for (set<BufferHead*, BufferHead::ptr_lt>::iterator p = it; + p != dirty_or_tx_bh.end(); + ++p) { + BufferHead *obh = *p; + if (obh->ob != bh->ob) + break; + if (obh->is_dirty() && obh->last_write <= cutoff) { + blist.push_back(obh); + ++count; + total_len += obh->length(); + if ((max_count && count > *max_count) || + (max_amount && total_len > *max_amount)) + break; + } + } + + while (it != dirty_or_tx_bh.begin()) { + --it; + BufferHead *obh = *it; + if (obh->ob != bh->ob) + break; + if (obh->is_dirty() && obh->last_write <= cutoff) { + blist.push_front(obh); + ++count; + total_len += obh->length(); + if ((max_count && count > *max_count) || + (max_amount && total_len > *max_amount)) + break; + } + } + if (max_count) + *max_count -= count; + if (max_amount) + *max_amount -= total_len; + + bh_write_scattered(blist); +} + +class ObjectCacher::C_WriteCommit : public Context { + ObjectCacher *oc; + int64_t poolid; + sobject_t oid; + vector<pair<loff_t, uint64_t> > ranges; + ZTracer::Trace trace; +public: + ceph_tid_t tid = 0; + C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, loff_t s, + uint64_t l, const ZTracer::Trace &trace) : + oc(c), poolid(_poolid), oid(o), trace(trace) { + ranges.push_back(make_pair(s, l)); + } + C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, + vector<pair<loff_t, uint64_t> >& _ranges) : + oc(c), poolid(_poolid), oid(o), tid(0) { + ranges.swap(_ranges); + } + void finish(int r) override { + oc->bh_write_commit(poolid, oid, ranges, tid, r); + 
trace.event("finish"); + } +}; +void ObjectCacher::bh_write_scattered(list<BufferHead*>& blist) +{ + ceph_assert(lock.is_locked()); + + Object *ob = blist.front()->ob; + ob->get(); + + ceph::real_time last_write; + SnapContext snapc; + vector<pair<loff_t, uint64_t> > ranges; + vector<pair<uint64_t, bufferlist> > io_vec; + + ranges.reserve(blist.size()); + io_vec.reserve(blist.size()); + + uint64_t total_len = 0; + for (list<BufferHead*>::iterator p = blist.begin(); p != blist.end(); ++p) { + BufferHead *bh = *p; + ldout(cct, 7) << "bh_write_scattered " << *bh << dendl; + ceph_assert(bh->ob == ob); + ceph_assert(bh->bl.length() == bh->length()); + ranges.push_back(pair<loff_t, uint64_t>(bh->start(), bh->length())); + + int n = io_vec.size(); + io_vec.resize(n + 1); + io_vec[n].first = bh->start(); + io_vec[n].second = bh->bl; + + total_len += bh->length(); + if (bh->snapc.seq > snapc.seq) + snapc = bh->snapc; + if (bh->last_write > last_write) + last_write = bh->last_write; + } + + C_WriteCommit *oncommit = new C_WriteCommit(this, ob->oloc.pool, ob->get_soid(), ranges); + + ceph_tid_t tid = writeback_handler.write(ob->get_oid(), ob->get_oloc(), + io_vec, snapc, last_write, + ob->truncate_size, ob->truncate_seq, + oncommit); + oncommit->tid = tid; + ob->last_write_tid = tid; + for (list<BufferHead*>::iterator p = blist.begin(); p != blist.end(); ++p) { + BufferHead *bh = *p; + bh->last_write_tid = tid; + mark_tx(bh); + } + + if (perfcounter) + perfcounter->inc(l_objectcacher_data_flushed, total_len); +} + +void ObjectCacher::bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 7) << "bh_write " << *bh << dendl; + + bh->ob->get(); + + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &trace_endpoint, &parent_trace); + trace.copy_name("bh_write " + bh->ob->get_oid().name); + trace.event("start"); + } + + // finishers + C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->oloc.pool, + 
bh->ob->get_soid(), bh->start(), + bh->length(), trace); + // go + ceph_tid_t tid = writeback_handler.write(bh->ob->get_oid(), + bh->ob->get_oloc(), + bh->start(), bh->length(), + bh->snapc, bh->bl, bh->last_write, + bh->ob->truncate_size, + bh->ob->truncate_seq, + bh->journal_tid, trace, oncommit); + ldout(cct, 20) << " tid " << tid << " on " << bh->ob->get_oid() << dendl; + + // set bh last_write_tid + oncommit->tid = tid; + bh->ob->last_write_tid = tid; + bh->last_write_tid = tid; + + if (perfcounter) { + perfcounter->inc(l_objectcacher_data_flushed, bh->length()); + } + + mark_tx(bh); +} + +void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, + vector<pair<loff_t, uint64_t> >& ranges, + ceph_tid_t tid, int r) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 7) << "bh_write_commit " << oid << " tid " << tid + << " ranges " << ranges << " returned " << r << dendl; + + if (objects[poolid].count(oid) == 0) { + ldout(cct, 7) << "bh_write_commit no object cache" << dendl; + return; + } + + Object *ob = objects[poolid][oid]; + int was_dirty_or_tx = ob->oset->dirty_or_tx; + + for (vector<pair<loff_t, uint64_t> >::iterator p = ranges.begin(); + p != ranges.end(); + ++p) { + loff_t start = p->first; + uint64_t length = p->second; + if (!ob->exists) { + ldout(cct, 10) << "bh_write_commit marking exists on " << *ob << dendl; + ob->exists = true; + + if (writeback_handler.may_copy_on_write(ob->get_oid(), start, length, + ob->get_snap())) { + ldout(cct, 10) << "bh_write_commit may copy on write, clearing " + "complete on " << *ob << dendl; + ob->complete = false; + } + } + + vector<pair<loff_t, BufferHead*>> hit; + // apply to bh's! 
+ for (map<loff_t, BufferHead*>::const_iterator p = ob->data_lower_bound(start); + p != ob->data.end(); + ++p) { + BufferHead *bh = p->second; + + if (bh->start() >= start+(loff_t)length) + break; + + // make sure bh is tx + if (!bh->is_tx()) { + ldout(cct, 10) << "bh_write_commit skipping non-tx " << *bh << dendl; + continue; + } + + // make sure bh tid matches + if (bh->last_write_tid != tid) { + ceph_assert(bh->last_write_tid > tid); + ldout(cct, 10) << "bh_write_commit newer tid on " << *bh << dendl; + continue; + } + + // we don't merge tx buffers. tx buffer should be within the range + ceph_assert(bh->start() >= start); + ceph_assert(bh->end() <= start+(loff_t)length); + + if (r >= 0) { + // ok! mark bh clean and error-free + mark_clean(bh); + bh->set_journal_tid(0); + if (bh->get_nocache()) + bh_lru_rest.lru_bottouch(bh); + hit.push_back(make_pair(bh->start(), bh)); + ldout(cct, 10) << "bh_write_commit clean " << *bh << dendl; + } else { + mark_dirty(bh); + ldout(cct, 10) << "bh_write_commit marking dirty again due to error " + << *bh << " r = " << r << " " << cpp_strerror(-r) + << dendl; + } + } + + for (auto& p : hit) { + //p.second maybe merged and deleted in merge_left + if (ob->data.count(p.first)) + ob->try_merge_bh(p.second); + } + } + + // update last_commit. + ceph_assert(ob->last_commit_tid < tid); + ob->last_commit_tid = tid; + + // waiters? + list<Context*> ls; + if (ob->waitfor_commit.count(tid)) { + ls.splice(ls.begin(), ob->waitfor_commit[tid]); + ob->waitfor_commit.erase(tid); + } + + // is the entire object set now clean and fully committed? 
+ ObjectSet *oset = ob->oset; + ob->put(); + + if (flush_set_callback && + was_dirty_or_tx > 0 && + oset->dirty_or_tx == 0) { // nothing dirty/tx + flush_set_callback(flush_set_callback_arg, oset); + } + + if (!ls.empty()) + finish_contexts(cct, ls, r); +} + +void ObjectCacher::flush(ZTracer::Trace *trace, loff_t amount) +{ + ceph_assert(trace != nullptr); + ceph_assert(lock.is_locked()); + ceph::real_time cutoff = ceph::real_clock::now(); + + ldout(cct, 10) << "flush " << amount << dendl; + + /* + * NOTE: we aren't actually pulling things off the LRU here, just + * looking at the tail item. Then we call bh_write, which moves it + * to the other LRU, so that we can call + * lru_dirty.lru_get_next_expire() again. + */ + int64_t left = amount; + while (amount == 0 || left > 0) { + BufferHead *bh = static_cast<BufferHead*>( + bh_lru_dirty.lru_get_next_expire()); + if (!bh) break; + if (bh->last_write > cutoff) break; + + if (scattered_write) { + bh_write_adjacencies(bh, cutoff, amount > 0 ? 
&left : NULL, NULL); + } else { + left -= bh->length(); + bh_write(bh, *trace); + } + } +} + + +void ObjectCacher::trim() +{ + ceph_assert(lock.is_locked()); + ldout(cct, 10) << "trim start: bytes: max " << max_size << " clean " + << get_stat_clean() << ", objects: max " << max_objects + << " current " << ob_lru.lru_get_size() << dendl; + + uint64_t max_clean_bh = max_size >> BUFFER_MEMORY_WEIGHT; + uint64_t nr_clean_bh = bh_lru_rest.lru_get_size() - bh_lru_rest.lru_get_num_pinned(); + while (get_stat_clean() > 0 && + ((uint64_t)get_stat_clean() > max_size || + nr_clean_bh > max_clean_bh)) { + BufferHead *bh = static_cast<BufferHead*>(bh_lru_rest.lru_expire()); + if (!bh) + break; + + ldout(cct, 10) << "trim trimming " << *bh << dendl; + ceph_assert(bh->is_clean() || bh->is_zero() || bh->is_error()); + + Object *ob = bh->ob; + bh_remove(ob, bh); + delete bh; + + --nr_clean_bh; + + if (ob->complete) { + ldout(cct, 10) << "trim clearing complete on " << *ob << dendl; + ob->complete = false; + } + } + + while (ob_lru.lru_get_size() > max_objects) { + Object *ob = static_cast<Object*>(ob_lru.lru_expire()); + if (!ob) + break; + + ldout(cct, 10) << "trim trimming " << *ob << dendl; + close_object(ob); + } + + ldout(cct, 10) << "trim finish: max " << max_size << " clean " + << get_stat_clean() << ", objects: max " << max_objects + << " current " << ob_lru.lru_get_size() << dendl; +} + + + +/* public */ + +bool ObjectCacher::is_cached(ObjectSet *oset, vector<ObjectExtent>& extents, + snapid_t snapid) +{ + ceph_assert(lock.is_locked()); + for (vector<ObjectExtent>::iterator ex_it = extents.begin(); + ex_it != extents.end(); + ++ex_it) { + ldout(cct, 10) << "is_cached " << *ex_it << dendl; + + // get Object cache + sobject_t soid(ex_it->oid, snapid); + Object *o = get_object_maybe(soid, ex_it->oloc); + if (!o) + return false; + if (!o->is_cached(ex_it->offset, ex_it->length)) + return false; + } + return true; +} + + +/* + * returns # bytes read (if in cache). 
onfinish is untouched (caller + * must delete it) + * returns 0 if doing async read + */ +int ObjectCacher::readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, + ZTracer::Trace *parent_trace) +{ + ZTracer::Trace trace; + if (parent_trace != nullptr) { + trace.init("read", &trace_endpoint, parent_trace); + trace.event("start"); + } + + int r =_readx(rd, oset, onfinish, true, &trace); + if (r < 0) { + trace.event("finish"); + } + return r; +} + +int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, + bool external_call, ZTracer::Trace *trace) +{ + ceph_assert(trace != nullptr); + ceph_assert(lock.is_locked()); + bool success = true; + int error = 0; + uint64_t bytes_in_cache = 0; + uint64_t bytes_not_in_cache = 0; + uint64_t total_bytes_read = 0; + map<uint64_t, bufferlist> stripe_map; // final buffer offset -> substring + bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED; + bool nocache = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + + /* + * WARNING: we can only meaningfully return ENOENT if the read request + * passed in a single ObjectExtent. Any caller who wants ENOENT instead of + * zeroed buffers needs to feed single extents into readx(). + */ + ceph_assert(!oset->return_enoent || rd->extents.size() == 1); + + for (vector<ObjectExtent>::iterator ex_it = rd->extents.begin(); + ex_it != rd->extents.end(); + ++ex_it) { + ldout(cct, 10) << "readx " << *ex_it << dendl; + + total_bytes_read += ex_it->length; + + // get Object cache + sobject_t soid(ex_it->oid, rd->snap); + Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc, + ex_it->truncate_size, oset->truncate_seq); + if (external_call) + touch_ob(o); + + // does not exist and no hits? + if (oset->return_enoent && !o->exists) { + ldout(cct, 10) << "readx object !exists, 1 extent..." << dendl; + + // should we worry about COW underneath us? 
+ if (writeback_handler.may_copy_on_write(soid.oid, ex_it->offset, + ex_it->length, soid.snap)) { + ldout(cct, 20) << "readx may copy on write" << dendl; + bool wait = false; + list<BufferHead*> blist; + for (map<loff_t, BufferHead*>::iterator bh_it = o->data.begin(); + bh_it != o->data.end(); + ++bh_it) { + BufferHead *bh = bh_it->second; + if (bh->is_dirty() || bh->is_tx()) { + ldout(cct, 10) << "readx flushing " << *bh << dendl; + wait = true; + if (bh->is_dirty()) { + if (scattered_write) + blist.push_back(bh); + else + bh_write(bh, *trace); + } + } + } + if (scattered_write && !blist.empty()) + bh_write_scattered(blist); + if (wait) { + ldout(cct, 10) << "readx waiting on tid " << o->last_write_tid + << " on " << *o << dendl; + o->waitfor_commit[o->last_write_tid].push_back( + new C_RetryRead(this,rd, oset, onfinish, *trace)); + // FIXME: perfcounter! + return 0; + } + } + + // can we return ENOENT? + bool allzero = true; + for (map<loff_t, BufferHead*>::iterator bh_it = o->data.begin(); + bh_it != o->data.end(); + ++bh_it) { + ldout(cct, 20) << "readx ob has bh " << *bh_it->second << dendl; + if (!bh_it->second->is_zero() && !bh_it->second->is_rx()) { + allzero = false; + break; + } + } + if (allzero) { + ldout(cct, 10) << "readx ob has all zero|rx, returning ENOENT" + << dendl; + delete rd; + if (dontneed) + bottouch_ob(o); + return -ENOENT; + } + } + + // map extent into bufferheads + map<loff_t, BufferHead*> hits, missing, rx, errors; + o->map_read(*ex_it, hits, missing, rx, errors); + if (external_call) { + // retry reading error buffers + missing.insert(errors.begin(), errors.end()); + } else { + // some reads had errors, fail later so completions + // are cleaned up properly + // TODO: make read path not call _readx for every completion + hits.insert(errors.begin(), errors.end()); + } + + if (!missing.empty() || !rx.empty()) { + // read missing + map<loff_t, BufferHead*>::iterator last = missing.end(); + for (map<loff_t, BufferHead*>::iterator bh_it = 
missing.begin(); + bh_it != missing.end(); + ++bh_it) { + uint64_t rx_bytes = static_cast<uint64_t>( + stat_rx + bh_it->second->length()); + bytes_not_in_cache += bh_it->second->length(); + if (!waitfor_read.empty() || (stat_rx > 0 && rx_bytes > max_size)) { + // cache is full with concurrent reads -- wait for rx's to complete + // to constrain memory growth (especially during copy-ups) + if (success) { + ldout(cct, 10) << "readx missed, waiting on cache to complete " + << waitfor_read.size() << " blocked reads, " + << (std::max(rx_bytes, max_size) - max_size) + << " read bytes" << dendl; + waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish, + *trace)); + } + + bh_remove(o, bh_it->second); + delete bh_it->second; + } else { + bh_it->second->set_nocache(nocache); + bh_read(bh_it->second, rd->fadvise_flags, *trace); + if ((success && onfinish) || last != missing.end()) + last = bh_it; + } + success = false; + } + + //add wait in last bh avoid wakeup early. Because read is order + if (last != missing.end()) { + ldout(cct, 10) << "readx missed, waiting on " << *last->second + << " off " << last->first << dendl; + last->second->waitfor_read[last->first].push_back( + new C_RetryRead(this, rd, oset, onfinish, *trace) ); + + } + + // bump rx + for (map<loff_t, BufferHead*>::iterator bh_it = rx.begin(); + bh_it != rx.end(); + ++bh_it) { + touch_bh(bh_it->second); // bump in lru, so we don't lose it. 
+ if (success && onfinish) { + ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second + << " off " << bh_it->first << dendl; + bh_it->second->waitfor_read[bh_it->first].push_back( + new C_RetryRead(this, rd, oset, onfinish, *trace) ); + } + bytes_not_in_cache += bh_it->second->length(); + success = false; + } + + for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); + bh_it != hits.end(); ++bh_it) + //bump in lru, so we don't lose it when later read + touch_bh(bh_it->second); + + } else { + ceph_assert(!hits.empty()); + + // make a plain list + for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); + bh_it != hits.end(); + ++bh_it) { + BufferHead *bh = bh_it->second; + ldout(cct, 10) << "readx hit bh " << *bh << dendl; + if (bh->is_error() && bh->error) + error = bh->error; + bytes_in_cache += bh->length(); + + if (bh->get_nocache() && bh->is_clean()) + bh_lru_rest.lru_bottouch(bh); + else + touch_bh(bh); + //must be after touch_bh because touch_bh set dontneed false + if (dontneed && + ((loff_t)ex_it->offset <= bh->start() && + (bh->end() <=(loff_t)(ex_it->offset + ex_it->length)))) { + bh->set_dontneed(true); //if dirty + if (bh->is_clean()) + bh_lru_rest.lru_bottouch(bh); + } + } + + if (!error) { + // create reverse map of buffer offset -> object for the + // eventual result. 
this is over a single ObjectExtent, so we + // know that + // - the bh's are contiguous + // - the buffer frags need not be (and almost certainly aren't) + loff_t opos = ex_it->offset; + map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); + ceph_assert(bh_it->second->start() <= opos); + uint64_t bhoff = opos - bh_it->second->start(); + vector<pair<uint64_t,uint64_t> >::iterator f_it + = ex_it->buffer_extents.begin(); + uint64_t foff = 0; + while (1) { + BufferHead *bh = bh_it->second; + ceph_assert(opos == (loff_t)(bh->start() + bhoff)); + + uint64_t len = std::min(f_it->second - foff, bh->length() - bhoff); + ldout(cct, 10) << "readx rmap opos " << opos << ": " << *bh << " +" + << bhoff << " frag " << f_it->first << "~" + << f_it->second << " +" << foff << "~" << len + << dendl; + + bufferlist bit; + // put substr here first, since substr_of clobbers, and we + // may get multiple bh's at this stripe_map position + if (bh->is_zero()) { + stripe_map[f_it->first].append_zero(len); + } else { + bit.substr_of(bh->bl, + opos - bh->start(), + len); + stripe_map[f_it->first].claim_append(bit); + } + + opos += len; + bhoff += len; + foff += len; + if (opos == bh->end()) { + ++bh_it; + bhoff = 0; + } + if (foff == f_it->second) { + ++f_it; + foff = 0; + } + if (bh_it == hits.end()) break; + if (f_it == ex_it->buffer_extents.end()) + break; + } + ceph_assert(f_it == ex_it->buffer_extents.end()); + ceph_assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length); + } + + if (dontneed && o->include_all_cached_data(ex_it->offset, ex_it->length)) + bottouch_ob(o); + } + } + + if (!success) { + if (perfcounter && external_call) { + perfcounter->inc(l_objectcacher_data_read, total_bytes_read); + perfcounter->inc(l_objectcacher_cache_bytes_miss, bytes_not_in_cache); + perfcounter->inc(l_objectcacher_cache_ops_miss); + } + if (onfinish) { + ldout(cct, 20) << "readx defer " << rd << dendl; + } else { + ldout(cct, 20) << "readx drop " << rd << " (no complete, but no waiter)" + 
<< dendl; + delete rd; + } + return 0; // wait! + } + if (perfcounter && external_call) { + perfcounter->inc(l_objectcacher_data_read, total_bytes_read); + perfcounter->inc(l_objectcacher_cache_bytes_hit, bytes_in_cache); + perfcounter->inc(l_objectcacher_cache_ops_hit); + } + + // no misses... success! do the read. + ldout(cct, 10) << "readx has all buffers" << dendl; + + // ok, assemble into result buffer. + uint64_t pos = 0; + if (rd->bl && !error) { + rd->bl->clear(); + for (map<uint64_t,bufferlist>::iterator i = stripe_map.begin(); + i != stripe_map.end(); + ++i) { + ceph_assert(pos == i->first); + ldout(cct, 10) << "readx adding buffer len " << i->second.length() + << " at " << pos << dendl; + pos += i->second.length(); + rd->bl->claim_append(i->second); + ceph_assert(rd->bl->length() == pos); + } + ldout(cct, 10) << "readx result is " << rd->bl->length() << dendl; + } else if (!error) { + ldout(cct, 10) << "readx no bufferlist ptr (readahead?), done." << dendl; + map<uint64_t,bufferlist>::reverse_iterator i = stripe_map.rbegin(); + pos = i->first + i->second.length(); + } + + // done with read. + int ret = error ? 
error : pos; + ldout(cct, 20) << "readx done " << rd << " " << ret << dendl; + ceph_assert(pos <= (uint64_t) INT_MAX); + + delete rd; + + trim(); + + return ret; +} + +void ObjectCacher::retry_waiting_reads() +{ + list<Context *> ls; + ls.swap(waitfor_read); + + while (!ls.empty() && waitfor_read.empty()) { + Context *ctx = ls.front(); + ls.pop_front(); + ctx->complete(0); + } + waitfor_read.splice(waitfor_read.end(), ls); +} + +int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, + ZTracer::Trace *parent_trace) +{ + ceph_assert(lock.is_locked()); + ceph::real_time now = ceph::real_clock::now(); + uint64_t bytes_written = 0; + uint64_t bytes_written_in_flush = 0; + bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED; + bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + + ZTracer::Trace trace; + if (parent_trace != nullptr) { + trace.init("write", &trace_endpoint, parent_trace); + trace.event("start"); + } + + list<Context*> wait_for_reads; + for (vector<ObjectExtent>::iterator ex_it = wr->extents.begin(); + ex_it != wr->extents.end(); + ++ex_it) { + // get object cache + sobject_t soid(ex_it->oid, CEPH_NOSNAP); + Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc, + ex_it->truncate_size, oset->truncate_seq); + + // map it all into a single bufferhead. 
+ BufferHead *bh = o->map_write(*ex_it, wr->journal_tid); + bool missing = bh->is_missing(); + bh->snapc = wr->snapc; + + // readers that need to be woken up due to an overwrite + for (auto& [_, wait_for_read] : bh->waitfor_read) { + wait_for_reads.splice(wait_for_reads.end(), wait_for_read); + } + bh->waitfor_read.clear(); + + bytes_written += ex_it->length; + if (bh->is_tx()) { + bytes_written_in_flush += ex_it->length; + } + + // adjust buffer pointers (ie "copy" data into my cache) + // this is over a single ObjectExtent, so we know that + // - there is one contiguous bh + // - the buffer frags need not be (and almost certainly aren't) + // note: i assume striping is monotonic... no jumps backwards, ever! + loff_t opos = ex_it->offset; + for (vector<pair<uint64_t, uint64_t> >::iterator f_it + = ex_it->buffer_extents.begin(); + f_it != ex_it->buffer_extents.end(); + ++f_it) { + ldout(cct, 10) << "writex writing " << f_it->first << "~" + << f_it->second << " into " << *bh << " at " << opos + << dendl; + uint64_t bhoff = opos - bh->start(); + ceph_assert(f_it->second <= bh->length() - bhoff); + + // get the frag we're mapping in + bufferlist frag; + frag.substr_of(wr->bl, f_it->first, f_it->second); + + // keep anything left of bhoff + if (!bhoff) + bh->bl.swap(frag); + else + bh->bl.claim_append(frag); + + opos += f_it->second; + } + + // ok, now bh is dirty. 
+ mark_dirty(bh); + if (dontneed) + bh->set_dontneed(true); + else if (nocache && missing) + bh->set_nocache(true); + else + touch_bh(bh); + + bh->last_write = now; + + o->try_merge_bh(bh); + } + + if (perfcounter) { + perfcounter->inc(l_objectcacher_data_written, bytes_written); + if (bytes_written_in_flush) { + perfcounter->inc(l_objectcacher_overwritten_in_flush, + bytes_written_in_flush); + } + } + + int r = _wait_for_write(wr, bytes_written, oset, &trace, onfreespace); + delete wr; + + finish_contexts(cct, wait_for_reads, 0); + + //verify_stats(); + trim(); + return r; +} + +class ObjectCacher::C_WaitForWrite : public Context { +public: + C_WaitForWrite(ObjectCacher *oc, uint64_t len, + const ZTracer::Trace &trace, Context *onfinish) : + m_oc(oc), m_len(len), m_trace(trace), m_onfinish(onfinish) {} + void finish(int r) override; +private: + ObjectCacher *m_oc; + uint64_t m_len; + ZTracer::Trace m_trace; + Context *m_onfinish; +}; + +void ObjectCacher::C_WaitForWrite::finish(int r) +{ + std::lock_guard l(m_oc->lock); + m_oc->maybe_wait_for_writeback(m_len, &m_trace); + m_onfinish->complete(r); +} + +void ObjectCacher::maybe_wait_for_writeback(uint64_t len, + ZTracer::Trace *trace) +{ + ceph_assert(lock.is_locked()); + ceph::mono_time start = ceph::mono_clock::now(); + int blocked = 0; + // wait for writeback? + // - wait for dirty and tx bytes (relative to the max_dirty threshold) + // - do not wait for bytes other waiters are waiting on. this means that + // threads do not wait for each other. this effectively allows the cache + // size to balloon proportional to the data that is in flight. 
+ + uint64_t max_dirty_bh = max_dirty >> BUFFER_MEMORY_WEIGHT; + while (get_stat_dirty() + get_stat_tx() > 0 && + (((uint64_t)(get_stat_dirty() + get_stat_tx()) >= + max_dirty + get_stat_dirty_waiting()) || + (dirty_or_tx_bh.size() >= + max_dirty_bh + get_stat_nr_dirty_waiters()))) { + + if (blocked == 0) { + trace->event("start wait for writeback"); + } + ldout(cct, 10) << __func__ << " waiting for dirty|tx " + << (get_stat_dirty() + get_stat_tx()) << " >= max " + << max_dirty << " + dirty_waiting " + << get_stat_dirty_waiting() << dendl; + flusher_cond.Signal(); + stat_dirty_waiting += len; + ++stat_nr_dirty_waiters; + stat_cond.Wait(lock); + stat_dirty_waiting -= len; + --stat_nr_dirty_waiters; + ++blocked; + ldout(cct, 10) << __func__ << " woke up" << dendl; + } + if (blocked > 0) { + trace->event("finish wait for writeback"); + } + if (blocked && perfcounter) { + perfcounter->inc(l_objectcacher_write_ops_blocked); + perfcounter->inc(l_objectcacher_write_bytes_blocked, len); + ceph::timespan blocked = ceph::mono_clock::now() - start; + perfcounter->tinc(l_objectcacher_write_time_blocked, blocked); + } +} + +// blocking wait for write. +int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, + ZTracer::Trace *trace, Context *onfreespace) +{ + ceph_assert(lock.is_locked()); + ceph_assert(trace != nullptr); + int ret = 0; + + if (max_dirty > 0 && !(wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_FUA)) { + if (block_writes_upfront) { + maybe_wait_for_writeback(len, trace); + if (onfreespace) + onfreespace->complete(0); + } else { + ceph_assert(onfreespace); + finisher.queue(new C_WaitForWrite(this, len, *trace, onfreespace)); + } + } else { + // write-thru! flush what we just wrote. + Cond cond; + bool done = false; + Context *fin = block_writes_upfront ? 
+ new C_Cond(&cond, &done, &ret) : onfreespace; + ceph_assert(fin); + bool flushed = flush_set(oset, wr->extents, trace, fin); + ceph_assert(!flushed); // we just dirtied it, and didn't drop our lock! + ldout(cct, 10) << "wait_for_write waiting on write-thru of " << len + << " bytes" << dendl; + if (block_writes_upfront) { + while (!done) + cond.Wait(lock); + ldout(cct, 10) << "wait_for_write woke up, ret " << ret << dendl; + if (onfreespace) + onfreespace->complete(ret); + } + } + + // start writeback anyway? + if (get_stat_dirty() > 0 && (uint64_t) get_stat_dirty() > target_dirty) { + ldout(cct, 10) << "wait_for_write " << get_stat_dirty() << " > target " + << target_dirty << ", nudging flusher" << dendl; + flusher_cond.Signal(); + } + return ret; +} + +void ObjectCacher::flusher_entry() +{ + ldout(cct, 10) << "flusher start" << dendl; + lock.Lock(); + while (!flusher_stop) { + loff_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + + get_stat_dirty(); + ldout(cct, 11) << "flusher " + << all << " / " << max_size << ": " + << get_stat_tx() << " tx, " + << get_stat_rx() << " rx, " + << get_stat_clean() << " clean, " + << get_stat_dirty() << " dirty (" + << target_dirty << " target, " + << max_dirty << " max)" + << dendl; + loff_t actual = get_stat_dirty() + get_stat_dirty_waiting(); + + ZTracer::Trace trace; + if (cct->_conf->osdc_blkin_trace_all) { + trace.init("flusher", &trace_endpoint); + trace.event("start"); + } + + if (actual > 0 && (uint64_t) actual > target_dirty) { + // flush some dirty pages + ldout(cct, 10) << "flusher " << get_stat_dirty() << " dirty + " + << get_stat_dirty_waiting() << " dirty_waiting > target " + << target_dirty << ", flushing some dirty bhs" << dendl; + flush(&trace, actual - target_dirty); + } else { + // check tail of lru for old dirty items + ceph::real_time cutoff = ceph::real_clock::now(); + cutoff -= max_dirty_age; + BufferHead *bh = 0; + int max = MAX_FLUSH_UNDER_LOCK; + while ((bh = 
static_cast<BufferHead*>(bh_lru_dirty. + lru_get_next_expire())) != 0 && + bh->last_write <= cutoff && + max > 0) { + ldout(cct, 10) << "flusher flushing aged dirty bh " << *bh << dendl; + if (scattered_write) { + bh_write_adjacencies(bh, cutoff, NULL, &max); + } else { + bh_write(bh, trace); + --max; + } + } + if (!max) { + // back off the lock to avoid starving other threads + trace.event("backoff"); + lock.Unlock(); + lock.Lock(); + continue; + } + } + + trace.event("finish"); + if (flusher_stop) + break; + + flusher_cond.WaitInterval(lock, seconds(1)); + } + + /* Wait for reads to finish. This is only possible if handling + * -ENOENT made some read completions finish before their rados read + * came back. If we don't wait for them, and destroy the cache, when + * the rados reads do come back their callback will try to access the + * no-longer-valid ObjectCacher. + */ + while (reads_outstanding > 0) { + ldout(cct, 10) << "Waiting for all reads to complete. Number left: " + << reads_outstanding << dendl; + read_cond.Wait(lock); + } + + lock.Unlock(); + ldout(cct, 10) << "flusher finish" << dendl; +} + + +// ------------------------------------------------- + +bool ObjectCacher::set_is_empty(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + if (oset->objects.empty()) + return true; + + for (xlist<Object*>::iterator p = oset->objects.begin(); !p.end(); ++p) + if (!(*p)->is_empty()) + return false; + + return true; +} + +bool ObjectCacher::set_is_cached(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + if (oset->objects.empty()) + return false; + + for (xlist<Object*>::iterator p = oset->objects.begin(); + !p.end(); ++p) { + Object *ob = *p; + for (map<loff_t,BufferHead*>::iterator q = ob->data.begin(); + q != ob->data.end(); + ++q) { + BufferHead *bh = q->second; + if (!bh->is_dirty() && !bh->is_tx()) + return true; + } + } + + return false; +} + +bool ObjectCacher::set_is_dirty_or_committing(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + if 
(oset->objects.empty()) + return false; + + for (xlist<Object*>::iterator i = oset->objects.begin(); + !i.end(); ++i) { + Object *ob = *i; + + for (map<loff_t,BufferHead*>::iterator p = ob->data.begin(); + p != ob->data.end(); + ++p) { + BufferHead *bh = p->second; + if (bh->is_dirty() || bh->is_tx()) + return true; + } + } + + return false; +} + + +// purge. non-blocking. violently removes dirty buffers from cache. +void ObjectCacher::purge(Object *ob) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 10) << "purge " << *ob << dendl; + + ob->truncate(0); +} + + +// flush. non-blocking. no callback. +// true if clean, already flushed. +// false if we wrote something. +// be sloppy about the ranges and flush any buffer it touches +bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length, + ZTracer::Trace *trace) +{ + ceph_assert(trace != nullptr); + ceph_assert(lock.is_locked()); + list<BufferHead*> blist; + bool clean = true; + ldout(cct, 10) << "flush " << *ob << " " << offset << "~" << length << dendl; + for (map<loff_t,BufferHead*>::const_iterator p = ob->data_lower_bound(offset); + p != ob->data.end(); + ++p) { + BufferHead *bh = p->second; + ldout(cct, 20) << "flush " << *bh << dendl; + if (length && bh->start() > offset+length) { + break; + } + if (bh->is_tx()) { + clean = false; + continue; + } + if (!bh->is_dirty()) { + continue; + } + + if (scattered_write) + blist.push_back(bh); + else + bh_write(bh, *trace); + clean = false; + } + if (scattered_write && !blist.empty()) + bh_write_scattered(blist); + + return clean; +} + +bool ObjectCacher::_flush_set_finish(C_GatherBuilder *gather, + Context *onfinish) +{ + ceph_assert(lock.is_locked()); + if (gather->has_subs()) { + gather->set_finisher(onfinish); + gather->activate(); + return false; + } + + ldout(cct, 10) << "flush_set has no dirty|tx bhs" << dendl; + onfinish->complete(0); + return true; +} + +// flush. non-blocking, takes callback. 
+// returns true if already flushed +bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish) +{ + ceph_assert(lock.is_locked()); + ceph_assert(onfinish != NULL); + if (oset->objects.empty()) { + ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl; + onfinish->complete(0); + return true; + } + + ldout(cct, 10) << "flush_set " << oset << dendl; + + // we'll need to wait for all objects to flush! + C_GatherBuilder gather(cct); + set<Object*> waitfor_commit; + + list<BufferHead*> blist; + Object *last_ob = NULL; + set<BufferHead*, BufferHead::ptr_lt>::const_iterator it, p, q; + + // Buffer heads in dirty_or_tx_bh are sorted in ObjectSet/Object/offset + // order. But items in oset->objects are not sorted. So the iterator can + // point to any buffer head in the ObjectSet + BufferHead key(*oset->objects.begin()); + it = dirty_or_tx_bh.lower_bound(&key); + p = q = it; + + bool backwards = true; + if (it != dirty_or_tx_bh.begin()) + --it; + else + backwards = false; + + for (; p != dirty_or_tx_bh.end(); p = q) { + ++q; + BufferHead *bh = *p; + if (bh->ob->oset != oset) + break; + waitfor_commit.insert(bh->ob); + if (bh->is_dirty()) { + if (scattered_write) { + if (last_ob != bh->ob) { + if (!blist.empty()) { + bh_write_scattered(blist); + blist.clear(); + } + last_ob = bh->ob; + } + blist.push_back(bh); + } else { + bh_write(bh, {}); + } + } + } + + if (backwards) { + for(p = q = it; true; p = q) { + if (q != dirty_or_tx_bh.begin()) + --q; + else + backwards = false; + BufferHead *bh = *p; + if (bh->ob->oset != oset) + break; + waitfor_commit.insert(bh->ob); + if (bh->is_dirty()) { + if (scattered_write) { + if (last_ob != bh->ob) { + if (!blist.empty()) { + bh_write_scattered(blist); + blist.clear(); + } + last_ob = bh->ob; + } + blist.push_front(bh); + } else { + bh_write(bh, {}); + } + } + if (!backwards) + break; + } + } + + if (scattered_write && !blist.empty()) + bh_write_scattered(blist); + + for (set<Object*>::iterator i = waitfor_commit.begin(); 
+ i != waitfor_commit.end(); ++i) { + Object *ob = *i; + + // we'll need to gather... + ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid " + << ob->last_write_tid << " on " << *ob << dendl; + ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub()); + } + + return _flush_set_finish(&gather, onfinish); +} + +// flush. non-blocking, takes callback. +// returns true if already flushed +bool ObjectCacher::flush_set(ObjectSet *oset, vector<ObjectExtent>& exv, + ZTracer::Trace *trace, Context *onfinish) +{ + ceph_assert(lock.is_locked()); + ceph_assert(trace != nullptr); + ceph_assert(onfinish != NULL); + if (oset->objects.empty()) { + ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl; + onfinish->complete(0); + return true; + } + + ldout(cct, 10) << "flush_set " << oset << " on " << exv.size() + << " ObjectExtents" << dendl; + + // we'll need to wait for all objects to flush! + C_GatherBuilder gather(cct); + + for (vector<ObjectExtent>::iterator p = exv.begin(); + p != exv.end(); + ++p) { + ObjectExtent &ex = *p; + sobject_t soid(ex.oid, CEPH_NOSNAP); + if (objects[oset->poolid].count(soid) == 0) + continue; + Object *ob = objects[oset->poolid][soid]; + + ldout(cct, 20) << "flush_set " << oset << " ex " << ex << " ob " << soid + << " " << ob << dendl; + + if (!flush(ob, ex.offset, ex.length, trace)) { + // we'll need to gather... + ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid " + << ob->last_write_tid << " on " << *ob << dendl; + ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub()); + } + } + + return _flush_set_finish(&gather, onfinish); +} + +// flush all dirty data. non-blocking, takes callback. +// returns true if already flushed +bool ObjectCacher::flush_all(Context *onfinish) +{ + ceph_assert(lock.is_locked()); + ceph_assert(onfinish != NULL); + + ldout(cct, 10) << "flush_all " << dendl; + + // we'll need to wait for all objects to flush! 
+ C_GatherBuilder gather(cct); + set<Object*> waitfor_commit; + + list<BufferHead*> blist; + Object *last_ob = NULL; + set<BufferHead*, BufferHead::ptr_lt>::iterator next, it; + next = it = dirty_or_tx_bh.begin(); + while (it != dirty_or_tx_bh.end()) { + ++next; + BufferHead *bh = *it; + waitfor_commit.insert(bh->ob); + + if (bh->is_dirty()) { + if (scattered_write) { + if (last_ob != bh->ob) { + if (!blist.empty()) { + bh_write_scattered(blist); + blist.clear(); + } + last_ob = bh->ob; + } + blist.push_back(bh); + } else { + bh_write(bh, {}); + } + } + + it = next; + } + + if (scattered_write && !blist.empty()) + bh_write_scattered(blist); + + for (set<Object*>::iterator i = waitfor_commit.begin(); + i != waitfor_commit.end(); + ++i) { + Object *ob = *i; + + // we'll need to gather... + ldout(cct, 10) << "flush_all will wait for ack tid " + << ob->last_write_tid << " on " << *ob << dendl; + ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub()); + } + + return _flush_set_finish(&gather, onfinish); +} + +void ObjectCacher::purge_set(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + if (oset->objects.empty()) { + ldout(cct, 10) << "purge_set on " << oset << " dne" << dendl; + return; + } + + ldout(cct, 10) << "purge_set " << oset << dendl; + const bool were_dirty = oset->dirty_or_tx > 0; + + for (xlist<Object*>::iterator i = oset->objects.begin(); + !i.end(); ++i) { + Object *ob = *i; + purge(ob); + } + + // Although we have purged rather than flushed, caller should still + // drop any resources associate with dirty data. 
+ ceph_assert(oset->dirty_or_tx == 0); + if (flush_set_callback && were_dirty) { + flush_set_callback(flush_set_callback_arg, oset); + } +} + + +loff_t ObjectCacher::release(Object *ob) +{ + ceph_assert(lock.is_locked()); + list<BufferHead*> clean; + loff_t o_unclean = 0; + + for (map<loff_t,BufferHead*>::iterator p = ob->data.begin(); + p != ob->data.end(); + ++p) { + BufferHead *bh = p->second; + if (bh->is_clean() || bh->is_zero() || bh->is_error()) + clean.push_back(bh); + else + o_unclean += bh->length(); + } + + for (list<BufferHead*>::iterator p = clean.begin(); + p != clean.end(); + ++p) { + bh_remove(ob, *p); + delete *p; + } + + if (ob->can_close()) { + ldout(cct, 10) << "release trimming " << *ob << dendl; + close_object(ob); + ceph_assert(o_unclean == 0); + return 0; + } + + if (ob->complete) { + ldout(cct, 10) << "release clearing complete on " << *ob << dendl; + ob->complete = false; + } + if (!ob->exists) { + ldout(cct, 10) << "release setting exists on " << *ob << dendl; + ob->exists = true; + } + + return o_unclean; +} + +loff_t ObjectCacher::release_set(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + // return # bytes not clean (and thus not released). 
+ loff_t unclean = 0; + + if (oset->objects.empty()) { + ldout(cct, 10) << "release_set on " << oset << " dne" << dendl; + return 0; + } + + ldout(cct, 10) << "release_set " << oset << dendl; + + xlist<Object*>::iterator q; + for (xlist<Object*>::iterator p = oset->objects.begin(); + !p.end(); ) { + q = p; + ++q; + Object *ob = *p; + + loff_t o_unclean = release(ob); + unclean += o_unclean; + + if (o_unclean) + ldout(cct, 10) << "release_set " << oset << " " << *ob + << " has " << o_unclean << " bytes left" + << dendl; + p = q; + } + + if (unclean) { + ldout(cct, 10) << "release_set " << oset + << ", " << unclean << " bytes left" << dendl; + } + + return unclean; +} + + +uint64_t ObjectCacher::release_all() +{ + ceph_assert(lock.is_locked()); + ldout(cct, 10) << "release_all" << dendl; + uint64_t unclean = 0; + + vector<ceph::unordered_map<sobject_t, Object*> >::iterator i + = objects.begin(); + while (i != objects.end()) { + ceph::unordered_map<sobject_t, Object*>::iterator p = i->begin(); + while (p != i->end()) { + ceph::unordered_map<sobject_t, Object*>::iterator n = p; + ++n; + + Object *ob = p->second; + + loff_t o_unclean = release(ob); + unclean += o_unclean; + + if (o_unclean) + ldout(cct, 10) << "release_all " << *ob + << " has " << o_unclean << " bytes left" + << dendl; + p = n; + } + ++i; + } + + if (unclean) { + ldout(cct, 10) << "release_all unclean " << unclean << " bytes left" + << dendl; + } + + return unclean; +} + +void ObjectCacher::clear_nonexistence(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 10) << "clear_nonexistence() " << oset << dendl; + + for (xlist<Object*>::iterator p = oset->objects.begin(); + !p.end(); ++p) { + Object *ob = *p; + if (!ob->exists) { + ldout(cct, 10) << " setting exists and complete on " << *ob << dendl; + ob->exists = true; + ob->complete = false; + } + for (xlist<C_ReadFinish*>::iterator q = ob->reads.begin(); + !q.end(); ++q) { + C_ReadFinish *comp = *q; + comp->distrust_enoent(); + } + } +} + 
+/** + * discard object extents from an ObjectSet by removing the objects in + * exls from the in-memory oset. + */ +void ObjectCacher::discard_set(ObjectSet *oset, const vector<ObjectExtent>& exls) +{ + ceph_assert(lock.is_locked()); + bool was_dirty = oset->dirty_or_tx > 0; + + _discard(oset, exls, nullptr); + _discard_finish(oset, was_dirty, nullptr); +} + +/** + * discard object extents from an ObjectSet by removing the objects in + * exls from the in-memory oset. If the bh is in TX state, the discard + * will wait for the write to commit prior to invoking on_finish. + */ +void ObjectCacher::discard_writeback(ObjectSet *oset, + const vector<ObjectExtent>& exls, + Context* on_finish) +{ + ceph_assert(lock.is_locked()); + bool was_dirty = oset->dirty_or_tx > 0; + + C_GatherBuilder gather(cct); + _discard(oset, exls, &gather); + + if (gather.has_subs()) { + bool flushed = was_dirty && oset->dirty_or_tx == 0; + gather.set_finisher(new FunctionContext( + [this, oset, flushed, on_finish](int) { + ceph_assert(lock.is_locked()); + if (flushed && flush_set_callback) + flush_set_callback(flush_set_callback_arg, oset); + if (on_finish) + on_finish->complete(0); + })); + gather.activate(); + return; + } + + _discard_finish(oset, was_dirty, on_finish); +} + +void ObjectCacher::_discard(ObjectSet *oset, const vector<ObjectExtent>& exls, + C_GatherBuilder* gather) +{ + if (oset->objects.empty()) { + ldout(cct, 10) << __func__ << " on " << oset << " dne" << dendl; + return; + } + + ldout(cct, 10) << __func__ << " " << oset << dendl; + + for (auto& ex : exls) { + ldout(cct, 10) << __func__ << " " << oset << " ex " << ex << dendl; + sobject_t soid(ex.oid, CEPH_NOSNAP); + if (objects[oset->poolid].count(soid) == 0) + continue; + Object *ob = objects[oset->poolid][soid]; + + ob->discard(ex.offset, ex.length, gather); + } +} + +void ObjectCacher::_discard_finish(ObjectSet *oset, bool was_dirty, + Context* on_finish) +{ + ceph_assert(lock.is_locked()); + + // did we truncate off 
dirty data? + if (flush_set_callback && was_dirty && oset->dirty_or_tx == 0) { + flush_set_callback(flush_set_callback_arg, oset); + } + + // notify that in-flight writeback has completed + if (on_finish != nullptr) { + on_finish->complete(0); + } +} + +void ObjectCacher::verify_stats() const +{ + ceph_assert(lock.is_locked()); + ldout(cct, 10) << "verify_stats" << dendl; + + loff_t clean = 0, zero = 0, dirty = 0, rx = 0, tx = 0, missing = 0, + error = 0; + for (vector<ceph::unordered_map<sobject_t, Object*> >::const_iterator i + = objects.begin(); + i != objects.end(); + ++i) { + for (ceph::unordered_map<sobject_t, Object*>::const_iterator p + = i->begin(); + p != i->end(); + ++p) { + Object *ob = p->second; + for (map<loff_t, BufferHead*>::const_iterator q = ob->data.begin(); + q != ob->data.end(); + ++q) { + BufferHead *bh = q->second; + switch (bh->get_state()) { + case BufferHead::STATE_MISSING: + missing += bh->length(); + break; + case BufferHead::STATE_CLEAN: + clean += bh->length(); + break; + case BufferHead::STATE_ZERO: + zero += bh->length(); + break; + case BufferHead::STATE_DIRTY: + dirty += bh->length(); + break; + case BufferHead::STATE_TX: + tx += bh->length(); + break; + case BufferHead::STATE_RX: + rx += bh->length(); + break; + case BufferHead::STATE_ERROR: + error += bh->length(); + break; + default: + ceph_abort(); + } + } + } + } + + ldout(cct, 10) << " clean " << clean << " rx " << rx << " tx " << tx + << " dirty " << dirty << " missing " << missing + << " error " << error << dendl; + ceph_assert(clean == stat_clean); + ceph_assert(rx == stat_rx); + ceph_assert(tx == stat_tx); + ceph_assert(dirty == stat_dirty); + ceph_assert(missing == stat_missing); + ceph_assert(zero == stat_zero); + ceph_assert(error == stat_error); +} + +void ObjectCacher::bh_stat_add(BufferHead *bh) +{ + ceph_assert(lock.is_locked()); + switch (bh->get_state()) { + case BufferHead::STATE_MISSING: + stat_missing += bh->length(); + break; + case BufferHead::STATE_CLEAN: 
+ stat_clean += bh->length(); + break; + case BufferHead::STATE_ZERO: + stat_zero += bh->length(); + break; + case BufferHead::STATE_DIRTY: + stat_dirty += bh->length(); + bh->ob->dirty_or_tx += bh->length(); + bh->ob->oset->dirty_or_tx += bh->length(); + break; + case BufferHead::STATE_TX: + stat_tx += bh->length(); + bh->ob->dirty_or_tx += bh->length(); + bh->ob->oset->dirty_or_tx += bh->length(); + break; + case BufferHead::STATE_RX: + stat_rx += bh->length(); + break; + case BufferHead::STATE_ERROR: + stat_error += bh->length(); + break; + default: + ceph_abort_msg("bh_stat_add: invalid bufferhead state"); + } + if (get_stat_dirty_waiting() > 0) + stat_cond.Signal(); +} + +void ObjectCacher::bh_stat_sub(BufferHead *bh) +{ + ceph_assert(lock.is_locked()); + switch (bh->get_state()) { + case BufferHead::STATE_MISSING: + stat_missing -= bh->length(); + break; + case BufferHead::STATE_CLEAN: + stat_clean -= bh->length(); + break; + case BufferHead::STATE_ZERO: + stat_zero -= bh->length(); + break; + case BufferHead::STATE_DIRTY: + stat_dirty -= bh->length(); + bh->ob->dirty_or_tx -= bh->length(); + bh->ob->oset->dirty_or_tx -= bh->length(); + break; + case BufferHead::STATE_TX: + stat_tx -= bh->length(); + bh->ob->dirty_or_tx -= bh->length(); + bh->ob->oset->dirty_or_tx -= bh->length(); + break; + case BufferHead::STATE_RX: + stat_rx -= bh->length(); + break; + case BufferHead::STATE_ERROR: + stat_error -= bh->length(); + break; + default: + ceph_abort_msg("bh_stat_sub: invalid bufferhead state"); + } +} + +void ObjectCacher::bh_set_state(BufferHead *bh, int s) +{ + ceph_assert(lock.is_locked()); + int state = bh->get_state(); + // move between lru lists? 
+ if (s == BufferHead::STATE_DIRTY && state != BufferHead::STATE_DIRTY) { + bh_lru_rest.lru_remove(bh); + bh_lru_dirty.lru_insert_top(bh); + } else if (s != BufferHead::STATE_DIRTY &&state == BufferHead::STATE_DIRTY) { + bh_lru_dirty.lru_remove(bh); + if (bh->get_dontneed()) + bh_lru_rest.lru_insert_bot(bh); + else + bh_lru_rest.lru_insert_top(bh); + } + + if ((s == BufferHead::STATE_TX || + s == BufferHead::STATE_DIRTY) && + state != BufferHead::STATE_TX && + state != BufferHead::STATE_DIRTY) { + dirty_or_tx_bh.insert(bh); + } else if ((state == BufferHead::STATE_TX || + state == BufferHead::STATE_DIRTY) && + s != BufferHead::STATE_TX && + s != BufferHead::STATE_DIRTY) { + dirty_or_tx_bh.erase(bh); + } + + if (s != BufferHead::STATE_ERROR && + state == BufferHead::STATE_ERROR) { + bh->error = 0; + } + + // set state + bh_stat_sub(bh); + bh->set_state(s); + bh_stat_add(bh); +} + +void ObjectCacher::bh_add(Object *ob, BufferHead *bh) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 30) << "bh_add " << *ob << " " << *bh << dendl; + ob->add_bh(bh); + if (bh->is_dirty()) { + bh_lru_dirty.lru_insert_top(bh); + dirty_or_tx_bh.insert(bh); + } else { + if (bh->get_dontneed()) + bh_lru_rest.lru_insert_bot(bh); + else + bh_lru_rest.lru_insert_top(bh); + } + + if (bh->is_tx()) { + dirty_or_tx_bh.insert(bh); + } + bh_stat_add(bh); +} + +void ObjectCacher::bh_remove(Object *ob, BufferHead *bh) +{ + ceph_assert(lock.is_locked()); + ceph_assert(bh->get_journal_tid() == 0); + ldout(cct, 30) << "bh_remove " << *ob << " " << *bh << dendl; + ob->remove_bh(bh); + if (bh->is_dirty()) { + bh_lru_dirty.lru_remove(bh); + dirty_or_tx_bh.erase(bh); + } else { + bh_lru_rest.lru_remove(bh); + } + + if (bh->is_tx()) { + dirty_or_tx_bh.erase(bh); + } + bh_stat_sub(bh); + if (get_stat_dirty_waiting() > 0) + stat_cond.Signal(); +} + diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h new file mode 100644 index 00000000..a976f082 --- /dev/null +++ b/src/osdc/ObjectCacher.h @@ -0,0 
+1,774 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OBJECTCACHER_H +#define CEPH_OBJECTCACHER_H + +#include "include/types.h" +#include "include/lru.h" +#include "include/Context.h" +#include "include/xlist.h" + +#include "common/Cond.h" +#include "common/Finisher.h" +#include "common/Thread.h" +#include "common/zipkin_trace.h" + +#include "Objecter.h" +#include "Striper.h" + +class CephContext; +class WritebackHandler; +class PerfCounters; + +enum { + l_objectcacher_first = 25000, + + l_objectcacher_cache_ops_hit, // ops we satisfy completely from cache + l_objectcacher_cache_ops_miss, // ops we don't satisfy completely from cache + + l_objectcacher_cache_bytes_hit, // bytes read directly from cache + + l_objectcacher_cache_bytes_miss, // bytes we couldn't read directly + + // from cache + + l_objectcacher_data_read, // total bytes read out + l_objectcacher_data_written, // bytes written to cache + l_objectcacher_data_flushed, // bytes flushed to WritebackHandler + l_objectcacher_overwritten_in_flush, // bytes overwritten while + // flushing is in progress + + l_objectcacher_write_ops_blocked, // total write ops we delayed due + // to dirty limits + l_objectcacher_write_bytes_blocked, // total number of write bytes + // we delayed due to dirty + // limits + l_objectcacher_write_time_blocked, // total time in seconds spent + // blocking a write due to dirty + // limits + + l_objectcacher_last, +}; + +class ObjectCacher { + PerfCounters *perfcounter; + public: + CephContext *cct; + class Object; + struct ObjectSet; + class C_ReadFinish; + + typedef void (*flush_set_callback_t) (void *p, ObjectSet *oset); + + // read scatter/gather + struct OSDRead { + vector<ObjectExtent> extents; + snapid_t snap; + bufferlist *bl; + int fadvise_flags; + OSDRead(snapid_t s, bufferlist *b, int f) + : snap(s), bl(b), fadvise_flags(f) {} + }; + + OSDRead *prepare_read(snapid_t snap, bufferlist *b, int f) const { + 
return new OSDRead(snap, b, f); + } + + // write scatter/gather + struct OSDWrite { + vector<ObjectExtent> extents; + SnapContext snapc; + bufferlist bl; + ceph::real_time mtime; + int fadvise_flags; + ceph_tid_t journal_tid; + OSDWrite(const SnapContext& sc, const bufferlist& b, ceph::real_time mt, + int f, ceph_tid_t _journal_tid) + : snapc(sc), bl(b), mtime(mt), fadvise_flags(f), + journal_tid(_journal_tid) {} + }; + + OSDWrite *prepare_write(const SnapContext& sc, + const bufferlist &b, + ceph::real_time mt, + int f, + ceph_tid_t journal_tid) const { + return new OSDWrite(sc, b, mt, f, journal_tid); + } + + + + // ******* BufferHead ********* + class BufferHead : public LRUObject { + public: + // states + static const int STATE_MISSING = 0; + static const int STATE_CLEAN = 1; + static const int STATE_ZERO = 2; // NOTE: these are *clean* zeros + static const int STATE_DIRTY = 3; + static const int STATE_RX = 4; + static const int STATE_TX = 5; + static const int STATE_ERROR = 6; // a read error occurred + + private: + // my fields + int state; + int ref; + struct { + loff_t start, length; // bh extent in object + } ex; + bool dontneed; //indicate bh don't need by anyone + bool nocache; //indicate bh don't need by this caller + + public: + Object *ob; + bufferlist bl; + ceph_tid_t last_write_tid; // version of bh (if non-zero) + ceph_tid_t last_read_tid; // tid of last read op (if any) + ceph::real_time last_write; + SnapContext snapc; + ceph_tid_t journal_tid; + int error; // holds return value for failed reads + + map<loff_t, list<Context*> > waitfor_read; + + // cons + explicit BufferHead(Object *o) : + state(STATE_MISSING), + ref(0), + dontneed(false), + nocache(false), + ob(o), + last_write_tid(0), + last_read_tid(0), + journal_tid(0), + error(0) { + ex.start = ex.length = 0; + } + + // extent + loff_t start() const { return ex.start; } + void set_start(loff_t s) { ex.start = s; } + loff_t length() const { return ex.length; } + void set_length(loff_t l) { 
ex.length = l; } + loff_t end() const { return ex.start + ex.length; } + loff_t last() const { return end() - 1; } + + // states + void set_state(int s) { + if (s == STATE_RX || s == STATE_TX) get(); + if (state == STATE_RX || state == STATE_TX) put(); + state = s; + } + int get_state() const { return state; } + + inline ceph_tid_t get_journal_tid() const { + return journal_tid; + } + inline void set_journal_tid(ceph_tid_t _journal_tid) { + journal_tid = _journal_tid; + } + + bool is_missing() const { return state == STATE_MISSING; } + bool is_dirty() const { return state == STATE_DIRTY; } + bool is_clean() const { return state == STATE_CLEAN; } + bool is_zero() const { return state == STATE_ZERO; } + bool is_tx() const { return state == STATE_TX; } + bool is_rx() const { return state == STATE_RX; } + bool is_error() const { return state == STATE_ERROR; } + + // reference counting + int get() { + ceph_assert(ref >= 0); + if (ref == 0) lru_pin(); + return ++ref; + } + int put() { + ceph_assert(ref > 0); + if (ref == 1) lru_unpin(); + --ref; + return ref; + } + + void set_dontneed(bool v) { + dontneed = v; + } + bool get_dontneed() const { + return dontneed; + } + + void set_nocache(bool v) { + nocache = v; + } + bool get_nocache() const { + return nocache; + } + + inline bool can_merge_journal(BufferHead *bh) const { + return (get_journal_tid() == bh->get_journal_tid()); + } + + struct ptr_lt { + bool operator()(const BufferHead* l, const BufferHead* r) const { + const Object *lob = l->ob; + const Object *rob = r->ob; + const ObjectSet *loset = lob->oset; + const ObjectSet *roset = rob->oset; + if (loset != roset) + return loset < roset; + if (lob != rob) + return lob < rob; + if (l->start() != r->start()) + return l->start() < r->start(); + return l < r; + } + }; + }; + + // ******* Object ********* + class Object : public LRUObject { + private: + // ObjectCacher::Object fields + int ref; + ObjectCacher *oc; + sobject_t oid; + friend struct ObjectSet; + + public: + 
uint64_t object_no; + ObjectSet *oset; + xlist<Object*>::item set_item; + object_locator_t oloc; + uint64_t truncate_size, truncate_seq; + + bool complete; + bool exists; + + map<loff_t, BufferHead*> data; + + ceph_tid_t last_write_tid; // version of bh (if non-zero) + ceph_tid_t last_commit_tid; // last update committed. + + int dirty_or_tx; + + map< ceph_tid_t, list<Context*> > waitfor_commit; + xlist<C_ReadFinish*> reads; + + Object(const Object&) = delete; + Object& operator=(const Object&) = delete; + + Object(ObjectCacher *_oc, sobject_t o, uint64_t ono, ObjectSet *os, + object_locator_t& l, uint64_t ts, uint64_t tq) : + ref(0), + oc(_oc), + oid(o), object_no(ono), oset(os), set_item(this), oloc(l), + truncate_size(ts), truncate_seq(tq), + complete(false), exists(true), + last_write_tid(0), last_commit_tid(0), + dirty_or_tx(0) { + // add to set + os->objects.push_back(&set_item); + } + ~Object() { + reads.clear(); + ceph_assert(ref == 0); + ceph_assert(data.empty()); + ceph_assert(dirty_or_tx == 0); + set_item.remove_myself(); + } + + sobject_t get_soid() const { return oid; } + object_t get_oid() { return oid.oid; } + snapid_t get_snap() { return oid.snap; } + ObjectSet *get_object_set() const { return oset; } + string get_namespace() { return oloc.nspace; } + uint64_t get_object_number() const { return object_no; } + + const object_locator_t& get_oloc() const { return oloc; } + void set_object_locator(object_locator_t& l) { oloc = l; } + + bool can_close() const { + if (lru_is_expireable()) { + ceph_assert(data.empty()); + ceph_assert(waitfor_commit.empty()); + return true; + } + return false; + } + + /** + * Check buffers and waiters for consistency + * - no overlapping buffers + * - index in map matches BH + * - waiters fall within BH + */ + void audit_buffers(); + + /** + * find first buffer that includes or follows an offset + * + * @param offset object byte offset + * @return iterator pointing to buffer, or data.end() + */ + 
map<loff_t,BufferHead*>::const_iterator data_lower_bound(loff_t offset) const { + map<loff_t,BufferHead*>::const_iterator p = data.lower_bound(offset); + if (p != data.begin() && + (p == data.end() || p->first > offset)) { + --p; // might overlap! + if (p->first + p->second->length() <= offset) + ++p; // doesn't overlap. + } + return p; + } + + // bh + // add to my map + void add_bh(BufferHead *bh) { + if (data.empty()) + get(); + ceph_assert(data.count(bh->start()) == 0); + data[bh->start()] = bh; + } + void remove_bh(BufferHead *bh) { + ceph_assert(data.count(bh->start())); + data.erase(bh->start()); + if (data.empty()) + put(); + } + + bool is_empty() const { return data.empty(); } + + // mid-level + BufferHead *split(BufferHead *bh, loff_t off); + void merge_left(BufferHead *left, BufferHead *right); + bool can_merge_bh(BufferHead *left, BufferHead *right); + void try_merge_bh(BufferHead *bh); + void maybe_rebuild_buffer(BufferHead *bh); + + bool is_cached(loff_t off, loff_t len) const; + bool include_all_cached_data(loff_t off, loff_t len); + int map_read(ObjectExtent &ex, + map<loff_t, BufferHead*>& hits, + map<loff_t, BufferHead*>& missing, + map<loff_t, BufferHead*>& rx, + map<loff_t, BufferHead*>& errors); + BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid); + + void replace_journal_tid(BufferHead *bh, ceph_tid_t tid); + void truncate(loff_t s); + void discard(loff_t off, loff_t len, C_GatherBuilder* commit_gather); + + // reference counting + int get() { + ceph_assert(ref >= 0); + if (ref == 0) lru_pin(); + return ++ref; + } + int put() { + ceph_assert(ref > 0); + if (ref == 1) lru_unpin(); + --ref; + return ref; + } + }; + + + struct ObjectSet { + void *parent; + + inodeno_t ino; + uint64_t truncate_seq, truncate_size; + + int64_t poolid; + xlist<Object*> objects; + + int dirty_or_tx; + bool return_enoent; + + ObjectSet(void *p, int64_t _poolid, inodeno_t i) + : parent(p), ino(i), truncate_seq(0), + truncate_size(0), poolid(_poolid), 
dirty_or_tx(0), + return_enoent(false) {} + + }; + + + // ******* ObjectCacher ********* + // ObjectCacher fields + private: + WritebackHandler& writeback_handler; + bool scattered_write; + + string name; + Mutex& lock; + + uint64_t max_dirty, target_dirty, max_size, max_objects; + ceph::timespan max_dirty_age; + bool block_writes_upfront; + + ZTracer::Endpoint trace_endpoint; + + flush_set_callback_t flush_set_callback; + void *flush_set_callback_arg; + + // indexed by pool_id + vector<ceph::unordered_map<sobject_t, Object*> > objects; + + list<Context*> waitfor_read; + + ceph_tid_t last_read_tid; + + set<BufferHead*, BufferHead::ptr_lt> dirty_or_tx_bh; + LRU bh_lru_dirty, bh_lru_rest; + LRU ob_lru; + + Cond flusher_cond; + bool flusher_stop; + void flusher_entry(); + class FlusherThread : public Thread { + ObjectCacher *oc; + public: + explicit FlusherThread(ObjectCacher *o) : oc(o) {} + void *entry() override { + oc->flusher_entry(); + return 0; + } + } flusher_thread; + + Finisher finisher; + + // objects +
  /**
   * Look up an already-cached object without creating it.
   *
   * @param oid object id + snap to look for
   * @param l   locator; l.pool selects the per-pool table in `objects`
   * @return the cached Object, or NULL when the pool has no table yet or the
   *         table has no entry for oid
   */
  Object *get_object_maybe(sobject_t oid, object_locator_t &l) {
    // have it?
    // Range-check the pool id at full width. The previous check narrowed it
    // to uint32_t but then indexed with the un-narrowed value, so a pool id
    // wider than 32 bits could pass the check and index out of range.
    if (l.pool < 0 || static_cast<uint64_t>(l.pool) >= objects.size())
      return NULL;
    // Single hash lookup instead of count() followed by operator[].
    auto& pool_objects = objects[l.pool];
    auto it = pool_objects.find(oid);
    if (it == pool_objects.end())
      return NULL;
    return it->second;
  }
 + Object *get_object(sobject_t oid, uint64_t object_no, ObjectSet *oset, + object_locator_t &l, uint64_t truncate_size, + uint64_t truncate_seq); + void close_object(Object *ob); + + // bh stats + Cond stat_cond; + + loff_t stat_clean; + loff_t stat_zero; + loff_t stat_dirty; + loff_t stat_rx; + loff_t stat_tx; + loff_t stat_missing; + loff_t stat_error; + loff_t stat_dirty_waiting; // bytes that writers are waiting on to write + + size_t stat_nr_dirty_waiters; + + void verify_stats() const; + + void bh_stat_add(BufferHead *bh); + void bh_stat_sub(BufferHead *bh); + loff_t get_stat_tx() const { return stat_tx; } + loff_t get_stat_rx() const { return stat_rx; } + loff_t get_stat_dirty() const { return stat_dirty; } + loff_t get_stat_clean() const { return stat_clean; } + loff_t get_stat_zero() const { return stat_zero; } + loff_t get_stat_dirty_waiting() const { return stat_dirty_waiting; } + size_t get_stat_nr_dirty_waiters() const { return stat_nr_dirty_waiters; } + + void touch_bh(BufferHead *bh) { + if (bh->is_dirty()) + bh_lru_dirty.lru_touch(bh); + else + bh_lru_rest.lru_touch(bh); + + bh->set_dontneed(false); + bh->set_nocache(false); + touch_ob(bh->ob); + } + void touch_ob(Object *ob) { + ob_lru.lru_touch(ob); + } + void bottouch_ob(Object *ob) { + ob_lru.lru_bottouch(ob); + } + + // bh states + void bh_set_state(BufferHead *bh, int s); + void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { + bh_set_state(bh2, bh1->get_state()); + } + + void mark_missing(BufferHead *bh) { + bh_set_state(bh,BufferHead::STATE_MISSING); + } + void mark_clean(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_CLEAN); + } + void mark_zero(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_ZERO); + } + void mark_rx(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_RX); + } + void mark_tx(BufferHead *bh) { + bh_set_state(bh, 
BufferHead::STATE_TX); } + void mark_error(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_ERROR); + } + void mark_dirty(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_DIRTY); + bh_lru_dirty.lru_touch(bh); + //bh->set_dirty_stamp(ceph_clock_now()); + } + + void bh_add(Object *ob, BufferHead *bh); + void bh_remove(Object *ob, BufferHead *bh); + + // io + void bh_read(BufferHead *bh, int op_flags, + const ZTracer::Trace &parent_trace); + void bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace); + void bh_write_scattered(list<BufferHead*>& blist); + void bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff, + int64_t *amount, int *max_count); + + void trim(); + void flush(ZTracer::Trace *trace, loff_t amount=0); + + /** + * flush a range of buffers + * + * Flush any buffers that intersect the specified extent. If len==0, + * flush *all* buffers for the object. + * + * @param o object + * @param off start offset + * @param len extent length, or 0 for entire object + * @return true if object was already clean/flushed. 
+ */ + bool flush(Object *o, loff_t off, loff_t len, + ZTracer::Trace *trace); + loff_t release(Object *o); + void purge(Object *o); + + int64_t reads_outstanding; + Cond read_cond; + + int _readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, + bool external_call, ZTracer::Trace *trace); + void retry_waiting_reads(); + + public: + void bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid, + loff_t offset, uint64_t length, + bufferlist &bl, int r, + bool trust_enoent); + void bh_write_commit(int64_t poolid, sobject_t oid, + vector<pair<loff_t, uint64_t> >& ranges, + ceph_tid_t t, int r); + + class C_WriteCommit; + class C_WaitForWrite; + + void perf_start(); + void perf_stop(); + + + + ObjectCacher(CephContext *cct_, string name, WritebackHandler& wb, Mutex& l, + flush_set_callback_t flush_callback, + void *flush_callback_arg, + uint64_t max_bytes, uint64_t max_objects, + uint64_t max_dirty, uint64_t target_dirty, double max_age, + bool block_writes_upfront); + ~ObjectCacher(); + + void start() { + flusher_thread.create("flusher"); + } + void stop() { + ceph_assert(flusher_thread.is_started()); + lock.Lock(); // hmm.. watch out for deadlock! + flusher_stop = true; + flusher_cond.Signal(); + lock.Unlock(); + flusher_thread.join(); + } + + + class C_RetryRead; + + + // non-blocking. async. 
+ + /** + * @note total read size must be <= INT_MAX, since + * the return value is total bytes read + */ + int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, + ZTracer::Trace *parent_trace = nullptr); + int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, + ZTracer::Trace *parent_trace = nullptr); + bool is_cached(ObjectSet *oset, vector<ObjectExtent>& extents, + snapid_t snapid); + +private: + // write blocking + int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, + ZTracer::Trace *trace, Context *onfreespace); + void maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace); + bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish); + + void _discard(ObjectSet *oset, const vector<ObjectExtent>& exls, + C_GatherBuilder* gather); + void _discard_finish(ObjectSet *oset, bool was_dirty, Context* on_finish); + +public: + bool set_is_empty(ObjectSet *oset); + bool set_is_cached(ObjectSet *oset); + bool set_is_dirty_or_committing(ObjectSet *oset); + + bool flush_set(ObjectSet *oset, Context *onfinish=0); + bool flush_set(ObjectSet *oset, vector<ObjectExtent>& ex, + ZTracer::Trace *trace, Context *onfinish = 0); + bool flush_all(Context *onfinish = 0); + + void purge_set(ObjectSet *oset); + + // returns # of bytes not released (ie non-clean) + loff_t release_set(ObjectSet *oset); + uint64_t release_all(); + + void discard_set(ObjectSet *oset, const vector<ObjectExtent>& ex); + void discard_writeback(ObjectSet *oset, const vector<ObjectExtent>& ex, + Context* on_finish); + + /** + * Retry any in-flight reads that get -ENOENT instead of marking + * them zero, and get rid of any cached -ENOENTs. + * After this is called and the cache's lock is unlocked, + * any new requests will treat -ENOENT normally. 
+ */ + void clear_nonexistence(ObjectSet *oset); + + + // cache sizes + void set_max_dirty(uint64_t v) { + max_dirty = v; + } + void set_target_dirty(int64_t v) { + target_dirty = v; + } + void set_max_size(int64_t v) { + max_size = v; + } + void set_max_dirty_age(double a) { + max_dirty_age = make_timespan(a); + } + void set_max_objects(int64_t v) { + max_objects = v; + } + + + // file functions + + /*** async+caching (non-blocking) file interface ***/ + int file_is_cached(ObjectSet *oset, file_layout_t *layout, + snapid_t snapid, loff_t offset, uint64_t len) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, extents); + return is_cached(oset, extents, snapid); + } + + int file_read(ObjectSet *oset, file_layout_t *layout, snapid_t snapid, + loff_t offset, uint64_t len, bufferlist *bl, int flags, + Context *onfinish) { + OSDRead *rd = prepare_read(snapid, bl, flags); + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, rd->extents); + return readx(rd, oset, onfinish); + } + + int file_write(ObjectSet *oset, file_layout_t *layout, + const SnapContext& snapc, loff_t offset, uint64_t len, + bufferlist& bl, ceph::real_time mtime, int flags) { + OSDWrite *wr = prepare_write(snapc, bl, mtime, flags, 0); + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, wr->extents); + return writex(wr, oset, NULL); + } + + bool file_flush(ObjectSet *oset, file_layout_t *layout, + const SnapContext& snapc, loff_t offset, uint64_t len, + Context *onfinish) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, extents); + ZTracer::Trace trace; + return flush_set(oset, extents, &trace, onfinish); + } +}; + + +inline ostream& operator<<(ostream &out, const ObjectCacher::BufferHead &bh) +{ + out << "bh[ " << &bh << " " + << bh.start() << "~" << bh.length() + << " " << bh.ob + << " (" << 
bh.bl.length() << ")" + << " v " << bh.last_write_tid; + if (bh.get_journal_tid() != 0) { + out << " j " << bh.get_journal_tid(); + } + if (bh.is_tx()) out << " tx"; + if (bh.is_rx()) out << " rx"; + if (bh.is_dirty()) out << " dirty"; + if (bh.is_clean()) out << " clean"; + if (bh.is_zero()) out << " zero"; + if (bh.is_missing()) out << " missing"; + if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0]; + if (bh.error) out << " error=" << bh.error; + out << "]"; + out << " waiters = {"; + for (map<loff_t, list<Context*> >::const_iterator it + = bh.waitfor_read.begin(); + it != bh.waitfor_read.end(); ++it) { + out << " " << it->first << "->["; + for (list<Context*>::const_iterator lit = it->second.begin(); + lit != it->second.end(); ++lit) { + out << *lit << ", "; + } + out << "]"; + } + out << "}"; + return out; +} + +inline ostream& operator<<(ostream &out, const ObjectCacher::ObjectSet &os) +{ + return out << "objectset[" << os.ino + << " ts " << os.truncate_seq << "/" << os.truncate_size + << " objects " << os.objects.size() + << " dirty_or_tx " << os.dirty_or_tx + << "]"; +} + +inline ostream& operator<<(ostream &out, const ObjectCacher::Object &ob) +{ + out << "object[" + << ob.get_soid() << " oset " << ob.oset << dec + << " wr " << ob.last_write_tid << "/" << ob.last_commit_tid; + + if (ob.complete) + out << " COMPLETE"; + if (!ob.exists) + out << " !EXISTS"; + + out << "]"; + return out; +} + +#endif diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc new file mode 100644 index 00000000..bc39114a --- /dev/null +++ b/src/osdc/Objecter.cc @@ -0,0 +1,5285 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + 
* Foundation. See file COPYING. + * + */ + +#include "Objecter.h" +#include "osd/OSDMap.h" +#include "Filer.h" + +#include "mon/MonClient.h" + +#include "msg/Messenger.h" +#include "msg/Message.h" + +#include "messages/MPing.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDMap.h" + +#include "messages/MPoolOp.h" +#include "messages/MPoolOpReply.h" + +#include "messages/MGetPoolStats.h" +#include "messages/MGetPoolStatsReply.h" +#include "messages/MStatfs.h" +#include "messages/MStatfsReply.h" + +#include "messages/MMonCommand.h" + +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" + +#include "messages/MWatchNotify.h" + +#include <errno.h> + +#include "common/config.h" +#include "common/perf_counters.h" +#include "common/scrub_types.h" +#include "include/str_list.h" +#include "common/errno.h" +#include "common/EventTrace.h" + +using ceph::real_time; +using ceph::real_clock; + +using ceph::mono_clock; +using ceph::mono_time; + +using ceph::timespan; + + +#define dout_subsys ceph_subsys_objecter +#undef dout_prefix +#define dout_prefix *_dout << messenger->get_myname() << ".objecter " + + +enum { + l_osdc_first = 123200, + l_osdc_op_active, + l_osdc_op_laggy, + l_osdc_op_send, + l_osdc_op_send_bytes, + l_osdc_op_resend, + l_osdc_op_reply, + + l_osdc_op, + l_osdc_op_r, + l_osdc_op_w, + l_osdc_op_rmw, + l_osdc_op_pg, + + l_osdc_osdop_stat, + l_osdc_osdop_create, + l_osdc_osdop_read, + l_osdc_osdop_write, + l_osdc_osdop_writefull, + l_osdc_osdop_writesame, + l_osdc_osdop_append, + l_osdc_osdop_zero, + l_osdc_osdop_truncate, + l_osdc_osdop_delete, + l_osdc_osdop_mapext, + l_osdc_osdop_sparse_read, + l_osdc_osdop_clonerange, + l_osdc_osdop_getxattr, + l_osdc_osdop_setxattr, + l_osdc_osdop_cmpxattr, + l_osdc_osdop_rmxattr, + l_osdc_osdop_resetxattrs, + l_osdc_osdop_call, + l_osdc_osdop_watch, + l_osdc_osdop_notify, + l_osdc_osdop_src_cmpxattr, + l_osdc_osdop_pgls, + 
l_osdc_osdop_pgls_filter, + l_osdc_osdop_other, + + l_osdc_linger_active, + l_osdc_linger_send, + l_osdc_linger_resend, + l_osdc_linger_ping, + + l_osdc_poolop_active, + l_osdc_poolop_send, + l_osdc_poolop_resend, + + l_osdc_poolstat_active, + l_osdc_poolstat_send, + l_osdc_poolstat_resend, + + l_osdc_statfs_active, + l_osdc_statfs_send, + l_osdc_statfs_resend, + + l_osdc_command_active, + l_osdc_command_send, + l_osdc_command_resend, + + l_osdc_map_epoch, + l_osdc_map_full, + l_osdc_map_inc, + + l_osdc_osd_sessions, + l_osdc_osd_session_open, + l_osdc_osd_session_close, + l_osdc_osd_laggy, + + l_osdc_osdop_omap_wr, + l_osdc_osdop_omap_rd, + l_osdc_osdop_omap_del, + + l_osdc_last, +}; + + +// config obs ---------------------------- + +static const char *config_keys[] = { + "crush_location", + NULL +}; + +class Objecter::RequestStateHook : public AdminSocketHook { + Objecter *m_objecter; +public: + explicit RequestStateHook(Objecter *objecter); + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override; +}; + +/** + * This is a more limited form of C_Contexts, but that requires + * a ceph_context which we don't have here. 
+ */ +class ObjectOperation::C_TwoContexts : public Context { + Context *first; + Context *second; +public: + C_TwoContexts(Context *first, Context *second) : + first(first), second(second) {} + void finish(int r) override { + first->complete(r); + second->complete(r); + first = NULL; + second = NULL; + } + + ~C_TwoContexts() override { + delete first; + delete second; + } +}; + +void ObjectOperation::add_handler(Context *extra) { + size_t last = out_handler.size() - 1; + Context *orig = out_handler[last]; + if (orig) { + Context *wrapper = new C_TwoContexts(orig, extra); + out_handler[last] = wrapper; + } else { + out_handler[last] = extra; + } +} + +Objecter::OSDSession::unique_completion_lock Objecter::OSDSession::get_lock( + object_t& oid) +{ + if (oid.name.empty()) + return unique_completion_lock(); + + static constexpr uint32_t HASH_PRIME = 1021; + uint32_t h = ceph_str_hash_linux(oid.name.c_str(), oid.name.size()) + % HASH_PRIME; + + return unique_completion_lock(completion_locks[h % num_locks], + std::defer_lock); +} + +const char** Objecter::get_tracked_conf_keys() const +{ + return config_keys; +} + + +void Objecter::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("crush_location")) { + update_crush_location(); + } +} + +void Objecter::update_crush_location() +{ + unique_lock wl(rwlock); + crush_location = cct->crush_location.get_location(); +} + +// messages ------------------------------ + +/* + * initialize only internal data structures, don't initiate cluster interaction + */ +void Objecter::init() +{ + ceph_assert(!initialized); + + if (!logger) { + PerfCountersBuilder pcb(cct, "objecter", l_osdc_first, l_osdc_last); + + pcb.add_u64(l_osdc_op_active, "op_active", "Operations active", "actv", + PerfCountersBuilder::PRIO_CRITICAL); + pcb.add_u64(l_osdc_op_laggy, "op_laggy", "Laggy operations"); + pcb.add_u64_counter(l_osdc_op_send, "op_send", "Sent operations"); + 
pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes", "Sent data", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_u64_counter(l_osdc_op_resend, "op_resend", "Resent operations"); + pcb.add_u64_counter(l_osdc_op_reply, "op_reply", "Operation reply"); + + pcb.add_u64_counter(l_osdc_op, "op", "Operations"); + pcb.add_u64_counter(l_osdc_op_r, "op_r", "Read operations", "rd", + PerfCountersBuilder::PRIO_CRITICAL); + pcb.add_u64_counter(l_osdc_op_w, "op_w", "Write operations", "wr", + PerfCountersBuilder::PRIO_CRITICAL); + pcb.add_u64_counter(l_osdc_op_rmw, "op_rmw", "Read-modify-write operations", + "rdwr", PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_osdc_op_pg, "op_pg", "PG operation"); + + pcb.add_u64_counter(l_osdc_osdop_stat, "osdop_stat", "Stat operations"); + pcb.add_u64_counter(l_osdc_osdop_create, "osdop_create", + "Create object operations"); + pcb.add_u64_counter(l_osdc_osdop_read, "osdop_read", "Read operations"); + pcb.add_u64_counter(l_osdc_osdop_write, "osdop_write", "Write operations"); + pcb.add_u64_counter(l_osdc_osdop_writefull, "osdop_writefull", + "Write full object operations"); + pcb.add_u64_counter(l_osdc_osdop_writesame, "osdop_writesame", + "Write same operations"); + pcb.add_u64_counter(l_osdc_osdop_append, "osdop_append", + "Append operation"); + pcb.add_u64_counter(l_osdc_osdop_zero, "osdop_zero", + "Set object to zero operations"); + pcb.add_u64_counter(l_osdc_osdop_truncate, "osdop_truncate", + "Truncate object operations"); + pcb.add_u64_counter(l_osdc_osdop_delete, "osdop_delete", + "Delete object operations"); + pcb.add_u64_counter(l_osdc_osdop_mapext, "osdop_mapext", + "Map extent operations"); + pcb.add_u64_counter(l_osdc_osdop_sparse_read, "osdop_sparse_read", + "Sparse read operations"); + pcb.add_u64_counter(l_osdc_osdop_clonerange, "osdop_clonerange", + "Clone range operations"); + pcb.add_u64_counter(l_osdc_osdop_getxattr, "osdop_getxattr", + "Get xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_setxattr, 
"osdop_setxattr", + "Set xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_cmpxattr, "osdop_cmpxattr", + "Xattr comparison operations"); + pcb.add_u64_counter(l_osdc_osdop_rmxattr, "osdop_rmxattr", + "Remove xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_resetxattrs, "osdop_resetxattrs", + "Reset xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_call, "osdop_call", + "Call (execute) operations"); + pcb.add_u64_counter(l_osdc_osdop_watch, "osdop_watch", + "Watch by object operations"); + pcb.add_u64_counter(l_osdc_osdop_notify, "osdop_notify", + "Notify about object operations"); + pcb.add_u64_counter(l_osdc_osdop_src_cmpxattr, "osdop_src_cmpxattr", + "Extended attribute comparison in multi operations"); + pcb.add_u64_counter(l_osdc_osdop_pgls, "osdop_pgls"); + pcb.add_u64_counter(l_osdc_osdop_pgls_filter, "osdop_pgls_filter"); + pcb.add_u64_counter(l_osdc_osdop_other, "osdop_other", "Other operations"); + + pcb.add_u64(l_osdc_linger_active, "linger_active", + "Active lingering operations"); + pcb.add_u64_counter(l_osdc_linger_send, "linger_send", + "Sent lingering operations"); + pcb.add_u64_counter(l_osdc_linger_resend, "linger_resend", + "Resent lingering operations"); + pcb.add_u64_counter(l_osdc_linger_ping, "linger_ping", + "Sent pings to lingering operations"); + + pcb.add_u64(l_osdc_poolop_active, "poolop_active", + "Active pool operations"); + pcb.add_u64_counter(l_osdc_poolop_send, "poolop_send", + "Sent pool operations"); + pcb.add_u64_counter(l_osdc_poolop_resend, "poolop_resend", + "Resent pool operations"); + + pcb.add_u64(l_osdc_poolstat_active, "poolstat_active", + "Active get pool stat operations"); + pcb.add_u64_counter(l_osdc_poolstat_send, "poolstat_send", + "Pool stat operations sent"); + pcb.add_u64_counter(l_osdc_poolstat_resend, "poolstat_resend", + "Resent pool stats"); + + pcb.add_u64(l_osdc_statfs_active, "statfs_active", "Statfs operations"); + pcb.add_u64_counter(l_osdc_statfs_send, "statfs_send", "Sent FS stats"); + 
pcb.add_u64_counter(l_osdc_statfs_resend, "statfs_resend", + "Resent FS stats"); + + pcb.add_u64(l_osdc_command_active, "command_active", "Active commands"); + pcb.add_u64_counter(l_osdc_command_send, "command_send", + "Sent commands"); + pcb.add_u64_counter(l_osdc_command_resend, "command_resend", + "Resent commands"); + + pcb.add_u64(l_osdc_map_epoch, "map_epoch", "OSD map epoch"); + pcb.add_u64_counter(l_osdc_map_full, "map_full", + "Full OSD maps received"); + pcb.add_u64_counter(l_osdc_map_inc, "map_inc", + "Incremental OSD maps received"); + + pcb.add_u64(l_osdc_osd_sessions, "osd_sessions", + "Open sessions"); // open sessions + pcb.add_u64_counter(l_osdc_osd_session_open, "osd_session_open", + "Sessions opened"); + pcb.add_u64_counter(l_osdc_osd_session_close, "osd_session_close", + "Sessions closed"); + pcb.add_u64(l_osdc_osd_laggy, "osd_laggy", "Laggy OSD sessions"); + + pcb.add_u64_counter(l_osdc_osdop_omap_wr, "omap_wr", + "OSD OMAP write operations"); + pcb.add_u64_counter(l_osdc_osdop_omap_rd, "omap_rd", + "OSD OMAP read operations"); + pcb.add_u64_counter(l_osdc_osdop_omap_del, "omap_del", + "OSD OMAP delete operations"); + + logger = pcb.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + } + + m_request_state_hook = new RequestStateHook(this); + AdminSocket* admin_socket = cct->get_admin_socket(); + int ret = admin_socket->register_command("objecter_requests", + "objecter_requests", + m_request_state_hook, + "show in-progress osd requests"); + + /* Don't warn on EEXIST, happens if multiple ceph clients + * are instantiated from one process */ + if (ret < 0 && ret != -EEXIST) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(ret) << dendl; + } + + update_crush_location(); + + cct->_conf.add_observer(this); + + initialized = true; +} + +/* + * ok, cluster interaction can happen + */ +void Objecter::start(const OSDMap* o) +{ + shared_lock rl(rwlock); + + start_tick(); + if (o) { + 
osdmap->deepish_copy_from(*o); + } else if (osdmap->get_epoch() == 0) { + _maybe_request_map(); + } +} + +void Objecter::shutdown() +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + initialized = false; + + wl.unlock(); + cct->_conf.remove_observer(this); + wl.lock(); + + map<int,OSDSession*>::iterator p; + while (!osd_sessions.empty()) { + p = osd_sessions.begin(); + close_session(p->second); + } + + while(!check_latest_map_lingers.empty()) { + map<uint64_t, LingerOp*>::iterator i = check_latest_map_lingers.begin(); + i->second->put(); + check_latest_map_lingers.erase(i->first); + } + + while(!check_latest_map_ops.empty()) { + map<ceph_tid_t, Op*>::iterator i = check_latest_map_ops.begin(); + i->second->put(); + check_latest_map_ops.erase(i->first); + } + + while(!check_latest_map_commands.empty()) { + map<ceph_tid_t, CommandOp*>::iterator i + = check_latest_map_commands.begin(); + i->second->put(); + check_latest_map_commands.erase(i->first); + } + + while(!poolstat_ops.empty()) { + map<ceph_tid_t,PoolStatOp*>::iterator i = poolstat_ops.begin(); + delete i->second; + poolstat_ops.erase(i->first); + } + + while(!statfs_ops.empty()) { + map<ceph_tid_t, StatfsOp*>::iterator i = statfs_ops.begin(); + delete i->second; + statfs_ops.erase(i->first); + } + + while(!pool_ops.empty()) { + map<ceph_tid_t, PoolOp*>::iterator i = pool_ops.begin(); + delete i->second; + pool_ops.erase(i->first); + } + + ldout(cct, 20) << __func__ << " clearing up homeless session..." 
<< dendl; + while(!homeless_session->linger_ops.empty()) { + std::map<uint64_t, LingerOp*>::iterator i + = homeless_session->linger_ops.begin(); + ldout(cct, 10) << " linger_op " << i->first << dendl; + LingerOp *lop = i->second; + { + OSDSession::unique_lock swl(homeless_session->lock); + _session_linger_op_remove(homeless_session, lop); + } + linger_ops.erase(lop->linger_id); + linger_ops_set.erase(lop); + lop->put(); + } + + while(!homeless_session->ops.empty()) { + std::map<ceph_tid_t, Op*>::iterator i = homeless_session->ops.begin(); + ldout(cct, 10) << " op " << i->first << dendl; + Op *op = i->second; + { + OSDSession::unique_lock swl(homeless_session->lock); + _session_op_remove(homeless_session, op); + } + op->put(); + } + + while(!homeless_session->command_ops.empty()) { + std::map<ceph_tid_t, CommandOp*>::iterator i + = homeless_session->command_ops.begin(); + ldout(cct, 10) << " command_op " << i->first << dendl; + CommandOp *cop = i->second; + { + OSDSession::unique_lock swl(homeless_session->lock); + _session_command_op_remove(homeless_session, cop); + } + cop->put(); + } + + if (tick_event) { + if (timer.cancel_event(tick_event)) { + ldout(cct, 10) << " successfully canceled tick" << dendl; + } + tick_event = 0; + } + + if (logger) { + cct->get_perfcounters_collection()->remove(logger); + delete logger; + logger = NULL; + } + + // Let go of Objecter write lock so timer thread can shutdown + wl.unlock(); + + // Outside of lock to avoid cycle WRT calls to RequestStateHook + // This is safe because we guarantee no concurrent calls to + // shutdown() with the ::initialized check at start. 
+ if (m_request_state_hook) { + AdminSocket* admin_socket = cct->get_admin_socket(); + admin_socket->unregister_command("objecter_requests"); + delete m_request_state_hook; + m_request_state_hook = NULL; + } +} + +void Objecter::_send_linger(LingerOp *info, + shunique_lock& sul) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + + vector<OSDOp> opv; + Context *oncommit = NULL; + LingerOp::shared_lock watchl(info->watch_lock); + bufferlist *poutbl = NULL; + if (info->registered && info->is_watch) { + ldout(cct, 15) << "send_linger " << info->linger_id << " reconnect" + << dendl; + opv.push_back(OSDOp()); + opv.back().op.op = CEPH_OSD_OP_WATCH; + opv.back().op.watch.cookie = info->get_cookie(); + opv.back().op.watch.op = CEPH_OSD_WATCH_OP_RECONNECT; + opv.back().op.watch.gen = ++info->register_gen; + oncommit = new C_Linger_Reconnect(this, info); + } else { + ldout(cct, 15) << "send_linger " << info->linger_id << " register" + << dendl; + opv = info->ops; + C_Linger_Commit *c = new C_Linger_Commit(this, info); + if (!info->is_watch) { + info->notify_id = 0; + poutbl = &c->outbl; + } + oncommit = c; + } + watchl.unlock(); + Op *o = new Op(info->target.base_oid, info->target.base_oloc, + opv, info->target.flags | CEPH_OSD_FLAG_READ, + oncommit, info->pobjver); + o->outbl = poutbl; + o->snapid = info->snap; + o->snapc = info->snapc; + o->mtime = info->mtime; + + o->target = info->target; + o->tid = ++last_tid; + + // do not resend this; we will send a new op to reregister + o->should_resend = false; + o->ctx_budgeted = true; + + if (info->register_tid) { + // repeat send. cancel old registration op, if any. 
+ OSDSession::unique_lock sl(info->session->lock); + if (info->session->ops.count(info->register_tid)) { + Op *o = info->session->ops[info->register_tid]; + _op_cancel_map_check(o); + _cancel_linger_op(o); + } + sl.unlock(); + } + + _op_submit_with_budget(o, sul, &info->register_tid, &info->ctx_budget); + + logger->inc(l_osdc_linger_send); +} + +void Objecter::_linger_commit(LingerOp *info, int r, bufferlist& outbl) +{ + LingerOp::unique_lock wl(info->watch_lock); + ldout(cct, 10) << "_linger_commit " << info->linger_id << dendl; + if (info->on_reg_commit) { + info->on_reg_commit->complete(r); + info->on_reg_commit = NULL; + } + if (r < 0 && info->on_notify_finish) { + info->on_notify_finish->complete(r); + info->on_notify_finish = nullptr; + } + + // only tell the user the first time we do this + info->registered = true; + info->pobjver = NULL; + + if (!info->is_watch) { + // make note of the notify_id + auto p = outbl.cbegin(); + try { + decode(info->notify_id, p); + ldout(cct, 10) << "_linger_commit notify_id=" << info->notify_id + << dendl; + } + catch (buffer::error& e) { + } + } +} + +struct C_DoWatchError : public Context { + Objecter *objecter; + Objecter::LingerOp *info; + int err; + C_DoWatchError(Objecter *o, Objecter::LingerOp *i, int r) + : objecter(o), info(i), err(r) { + info->get(); + info->_queued_async(); + } + void finish(int r) override { + Objecter::unique_lock wl(objecter->rwlock); + bool canceled = info->canceled; + wl.unlock(); + + if (!canceled) { + info->watch_context->handle_error(info->get_cookie(), err); + } + + info->finished_async(); + info->put(); + } +}; + +int Objecter::_normalize_watch_error(int r) +{ + // translate ENOENT -> ENOTCONN so that a delete->disconnection + // notification and a failure to reconnect because we raced with + // the delete appear the same to the user. 
+ if (r == -ENOENT) + r = -ENOTCONN; + return r; +} + +void Objecter::_linger_reconnect(LingerOp *info, int r) +{ + ldout(cct, 10) << __func__ << " " << info->linger_id << " = " << r + << " (last_error " << info->last_error << ")" << dendl; + if (r < 0) { + LingerOp::unique_lock wl(info->watch_lock); + if (!info->last_error) { + r = _normalize_watch_error(r); + info->last_error = r; + if (info->watch_context) { + finisher->queue(new C_DoWatchError(this, info, r)); + } + } + wl.unlock(); + } +} + +void Objecter::_send_linger_ping(LingerOp *info) +{ + // rwlock is locked unique + // info->session->lock is locked + + if (cct->_conf->objecter_inject_no_watch_ping) { + ldout(cct, 10) << __func__ << " " << info->linger_id << " SKIPPING" + << dendl; + return; + } + if (osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) { + ldout(cct, 10) << __func__ << " PAUSERD" << dendl; + return; + } + + ceph::coarse_mono_time now = ceph::coarse_mono_clock::now(); + ldout(cct, 10) << __func__ << " " << info->linger_id << " now " << now + << dendl; + + vector<OSDOp> opv(1); + opv[0].op.op = CEPH_OSD_OP_WATCH; + opv[0].op.watch.cookie = info->get_cookie(); + opv[0].op.watch.op = CEPH_OSD_WATCH_OP_PING; + opv[0].op.watch.gen = info->register_gen; + C_Linger_Ping *onack = new C_Linger_Ping(this, info); + Op *o = new Op(info->target.base_oid, info->target.base_oloc, + opv, info->target.flags | CEPH_OSD_FLAG_READ, + onack, NULL, NULL); + o->target = info->target; + o->should_resend = false; + _send_op_account(o); + o->tid = ++last_tid; + _session_op_assign(info->session, o); + _send_op(o); + info->ping_tid = o->tid; + + onack->sent = now; + logger->inc(l_osdc_linger_ping); +} + +void Objecter::_linger_ping(LingerOp *info, int r, ceph::coarse_mono_time sent, + uint32_t register_gen) +{ + LingerOp::unique_lock l(info->watch_lock); + ldout(cct, 10) << __func__ << " " << info->linger_id + << " sent " << sent << " gen " << register_gen << " = " << r + << " (last_error " << info->last_error + << " 
register_gen " << info->register_gen << ")" << dendl; + if (info->register_gen == register_gen) { + if (r == 0) { + info->watch_valid_thru = sent; + } else if (r < 0 && !info->last_error) { + r = _normalize_watch_error(r); + info->last_error = r; + if (info->watch_context) { + finisher->queue(new C_DoWatchError(this, info, r)); + } + } + } else { + ldout(cct, 20) << " ignoring old gen" << dendl; + } +} + +int Objecter::linger_check(LingerOp *info) +{ + LingerOp::shared_lock l(info->watch_lock); + + ceph::coarse_mono_time stamp = info->watch_valid_thru; + if (!info->watch_pending_async.empty()) + stamp = std::min(info->watch_valid_thru, info->watch_pending_async.front()); + auto age = ceph::coarse_mono_clock::now() - stamp; + + ldout(cct, 10) << __func__ << " " << info->linger_id + << " err " << info->last_error + << " age " << age << dendl; + if (info->last_error) + return info->last_error; + // return a safe upper bound (we are truncating to ms) + return + 1 + std::chrono::duration_cast<std::chrono::milliseconds>(age).count(); +} + +void Objecter::linger_cancel(LingerOp *info) +{ + unique_lock wl(rwlock); + _linger_cancel(info); + info->put(); +} + +void Objecter::_linger_cancel(LingerOp *info) +{ + // rwlock is locked unique + ldout(cct, 20) << __func__ << " linger_id=" << info->linger_id << dendl; + if (!info->canceled) { + OSDSession *s = info->session; + OSDSession::unique_lock sl(s->lock); + _session_linger_op_remove(s, info); + sl.unlock(); + + linger_ops.erase(info->linger_id); + linger_ops_set.erase(info); + ceph_assert(linger_ops.size() == linger_ops_set.size()); + + info->canceled = true; + info->put(); + + logger->dec(l_osdc_linger_active); + } +} + + + +Objecter::LingerOp *Objecter::linger_register(const object_t& oid, + const object_locator_t& oloc, + int flags) +{ + LingerOp *info = new LingerOp(this); + info->target.base_oid = oid; + info->target.base_oloc = oloc; + if (info->target.base_oloc.key == oid) + info->target.base_oloc.key.clear(); + 
info->target.flags = flags; + info->watch_valid_thru = ceph::coarse_mono_clock::now(); + + unique_lock l(rwlock); + + // Acquire linger ID + info->linger_id = ++max_linger_id; + ldout(cct, 10) << __func__ << " info " << info + << " linger_id " << info->linger_id + << " cookie " << info->get_cookie() + << dendl; + linger_ops[info->linger_id] = info; + linger_ops_set.insert(info); + ceph_assert(linger_ops.size() == linger_ops_set.size()); + + info->get(); // for the caller + return info; +} + +ceph_tid_t Objecter::linger_watch(LingerOp *info, + ObjectOperation& op, + const SnapContext& snapc, + real_time mtime, + bufferlist& inbl, + Context *oncommit, + version_t *objver) +{ + info->is_watch = true; + info->snapc = snapc; + info->mtime = mtime; + info->target.flags |= CEPH_OSD_FLAG_WRITE; + info->ops = op.ops; + info->inbl = inbl; + info->poutbl = NULL; + info->pobjver = objver; + info->on_reg_commit = oncommit; + + info->ctx_budget = take_linger_budget(info); + + shunique_lock sul(rwlock, ceph::acquire_unique); + _linger_submit(info, sul); + logger->inc(l_osdc_linger_active); + + return info->linger_id; +} + +ceph_tid_t Objecter::linger_notify(LingerOp *info, + ObjectOperation& op, + snapid_t snap, bufferlist& inbl, + bufferlist *poutbl, + Context *onfinish, + version_t *objver) +{ + info->snap = snap; + info->target.flags |= CEPH_OSD_FLAG_READ; + info->ops = op.ops; + info->inbl = inbl; + info->poutbl = poutbl; + info->pobjver = objver; + info->on_reg_commit = onfinish; + + info->ctx_budget = take_linger_budget(info); + + shunique_lock sul(rwlock, ceph::acquire_unique); + _linger_submit(info, sul); + logger->inc(l_osdc_linger_active); + + return info->linger_id; +} + +void Objecter::_linger_submit(LingerOp *info, shunique_lock& sul) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + ceph_assert(info->linger_id); + ceph_assert(info->ctx_budget != -1); // caller needs to have taken budget already! 
+ + // Populate Op::target + OSDSession *s = NULL; + _calc_target(&info->target, nullptr); + + // Create LingerOp<->OSDSession relation + int r = _get_session(info->target.osd, &s, sul); + ceph_assert(r == 0); + OSDSession::unique_lock sl(s->lock); + _session_linger_op_assign(s, info); + sl.unlock(); + put_session(s); + + _send_linger(info, sul); +} + +struct C_DoWatchNotify : public Context { + Objecter *objecter; + Objecter::LingerOp *info; + MWatchNotify *msg; + C_DoWatchNotify(Objecter *o, Objecter::LingerOp *i, MWatchNotify *m) + : objecter(o), info(i), msg(m) { + info->get(); + info->_queued_async(); + msg->get(); + } + void finish(int r) override { + objecter->_do_watch_notify(info, msg); + } +}; + +void Objecter::handle_watch_notify(MWatchNotify *m) +{ + shared_lock l(rwlock); + if (!initialized) { + return; + } + + LingerOp *info = reinterpret_cast<LingerOp*>(m->cookie); + if (linger_ops_set.count(info) == 0) { + ldout(cct, 7) << __func__ << " cookie " << m->cookie << " dne" << dendl; + return; + } + LingerOp::unique_lock wl(info->watch_lock); + if (m->opcode == CEPH_WATCH_EVENT_DISCONNECT) { + if (!info->last_error) { + info->last_error = -ENOTCONN; + if (info->watch_context) { + finisher->queue(new C_DoWatchError(this, info, -ENOTCONN)); + } + } + } else if (!info->is_watch) { + // we have CEPH_WATCH_EVENT_NOTIFY_COMPLETE; we can do this inline + // since we know the only user (librados) is safe to call in + // fast-dispatch context + if (info->notify_id && + info->notify_id != m->notify_id) { + ldout(cct, 10) << __func__ << " reply notify " << m->notify_id + << " != " << info->notify_id << ", ignoring" << dendl; + } else if (info->on_notify_finish) { + info->notify_result_bl->claim(m->get_data()); + info->on_notify_finish->complete(m->return_code); + + // if we race with reconnect we might get a second notify; only + // notify the caller once! 
+ info->on_notify_finish = NULL; + } + } else { + finisher->queue(new C_DoWatchNotify(this, info, m)); + } +} + +void Objecter::_do_watch_notify(LingerOp *info, MWatchNotify *m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + + shared_lock l(rwlock); + ceph_assert(initialized); + + if (info->canceled) { + l.unlock(); + goto out; + } + + // notify completion? + ceph_assert(info->is_watch); + ceph_assert(info->watch_context); + ceph_assert(m->opcode != CEPH_WATCH_EVENT_DISCONNECT); + + l.unlock(); + + switch (m->opcode) { + case CEPH_WATCH_EVENT_NOTIFY: + info->watch_context->handle_notify(m->notify_id, m->cookie, + m->notifier_gid, m->bl); + break; + } + + out: + info->finished_async(); + info->put(); + m->put(); +} + +// Route an incoming message to its handler by message type. Returns true +// for the types handled here exclusively (MSG_COMMAND_REPLY is claimed +// only when its source is an OSD). CEPH_MSG_OSD_MAP is handled but still +// returns false, as does any unrecognized type, so that other dispatchers +// get a chance to inspect the message. +bool Objecter::ms_dispatch(Message *m) +{ + ldout(cct, 10) << __func__ << " " << cct << " " << *m << dendl; + switch (m->get_type()) { + // these we exclusively handle + case CEPH_MSG_OSD_OPREPLY: + handle_osd_op_reply(static_cast<MOSDOpReply*>(m)); + return true; + + case CEPH_MSG_OSD_BACKOFF: + handle_osd_backoff(static_cast<MOSDBackoff*>(m)); + return true; + + case CEPH_MSG_WATCH_NOTIFY: + handle_watch_notify(static_cast<MWatchNotify*>(m)); + m->put(); + return true; + + case MSG_COMMAND_REPLY: + if (m->get_source().type() == CEPH_ENTITY_TYPE_OSD) { + handle_command_reply(static_cast<MCommandReply*>(m)); + return true; + } else { + return false; + } + + case MSG_GETPOOLSTATSREPLY: + handle_get_pool_stats_reply(static_cast<MGetPoolStatsReply*>(m)); + return true; + + case CEPH_MSG_POOLOP_REPLY: + handle_pool_op_reply(static_cast<MPoolOpReply*>(m)); + return true; + + case CEPH_MSG_STATFS_REPLY: + handle_fs_stats_reply(static_cast<MStatfsReply*>(m)); + return true; + + // these we give others a chance to inspect + + // MDS, OSD + case CEPH_MSG_OSD_MAP: + handle_osd_map(static_cast<MOSDMap*>(m)); + return false; + } + return false; +} + +void Objecter::_scan_requests( + OSDSession *s, + bool skipped_map, + bool cluster_full, + map<int64_t, 
bool> *pool_full_map, + map<ceph_tid_t, Op*>& need_resend, + list<LingerOp*>& need_resend_linger, + map<ceph_tid_t, CommandOp*>& need_resend_command, + shunique_lock& sul, + const mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + + list<LingerOp*> unregister_lingers; + + OSDSession::unique_lock sl(s->lock); + + // check for changed linger mappings (_before_ regular ops) + map<ceph_tid_t,LingerOp*>::iterator lp = s->linger_ops.begin(); + while (lp != s->linger_ops.end()) { + LingerOp *op = lp->second; + ceph_assert(op->session == s); + // check_linger_pool_dne() may touch linger_ops; prevent iterator + // invalidation + ++lp; + ldout(cct, 10) << " checking linger op " << op->linger_id << dendl; + bool unregister, force_resend_writes = cluster_full; + int r = _recalc_linger_op_target(op, sul); + if (pool_full_map) + force_resend_writes = force_resend_writes || + (*pool_full_map)[op->target.base_oloc.pool]; + switch (r) { + case RECALC_OP_TARGET_NO_ACTION: + if (!skipped_map && !force_resend_writes) + break; + // -- fall-thru -- + case RECALC_OP_TARGET_NEED_RESEND: + need_resend_linger.push_back(op); + _linger_cancel_map_check(op); + break; + case RECALC_OP_TARGET_POOL_DNE: + _check_linger_pool_dne(op, &unregister); + if (unregister) { + ldout(cct, 10) << " need to unregister linger op " + << op->linger_id << dendl; + op->get(); + unregister_lingers.push_back(op); + } + break; + } + } + + // check for changed request mappings + map<ceph_tid_t,Op*>::iterator p = s->ops.begin(); + while (p != s->ops.end()) { + Op *op = p->second; + ++p; // check_op_pool_dne() may touch ops; prevent iterator invalidation + ldout(cct, 10) << " checking op " << op->tid << dendl; + _prune_snapc(osdmap->get_new_removed_snaps(), op); + if (skipped_map) { + _prune_snapc(*gap_removed_snaps, op); + } + bool force_resend_writes = cluster_full; + if (pool_full_map) + force_resend_writes = force_resend_writes 
|| + (*pool_full_map)[op->target.base_oloc.pool]; + int r = _calc_target(&op->target, + op->session ? op->session->con.get() : nullptr); + switch (r) { + case RECALC_OP_TARGET_NO_ACTION: + if (!skipped_map && !(force_resend_writes && op->respects_full())) + break; + // -- fall-thru -- + case RECALC_OP_TARGET_NEED_RESEND: + _session_op_remove(op->session, op); + need_resend[op->tid] = op; + _op_cancel_map_check(op); + break; + case RECALC_OP_TARGET_POOL_DNE: + _check_op_pool_dne(op, &sl); + break; + } + } + + // commands + map<ceph_tid_t,CommandOp*>::iterator cp = s->command_ops.begin(); + while (cp != s->command_ops.end()) { + CommandOp *c = cp->second; + ++cp; + ldout(cct, 10) << " checking command " << c->tid << dendl; + bool force_resend_writes = cluster_full; + if (pool_full_map) + force_resend_writes = force_resend_writes || + (*pool_full_map)[c->target_pg.pool()]; + int r = _calc_command_target(c, sul); + switch (r) { + case RECALC_OP_TARGET_NO_ACTION: + // resend if skipped map; otherwise do nothing. 
+ if (!skipped_map && !force_resend_writes) + break; + // -- fall-thru -- + case RECALC_OP_TARGET_NEED_RESEND: + need_resend_command[c->tid] = c; + _session_command_op_remove(c->session, c); + _command_cancel_map_check(c); + break; + case RECALC_OP_TARGET_POOL_DNE: + case RECALC_OP_TARGET_OSD_DNE: + case RECALC_OP_TARGET_OSD_DOWN: + _check_command_map_dne(c); + break; + } + } + + sl.unlock(); + + for (list<LingerOp*>::iterator iter = unregister_lingers.begin(); + iter != unregister_lingers.end(); + ++iter) { + _linger_cancel(*iter); + (*iter)->put(); + } +} + +void Objecter::handle_osd_map(MOSDMap *m) +{ + shunique_lock sul(rwlock, acquire_unique); + if (!initialized) + return; + + ceph_assert(osdmap); + + if (m->fsid != monc->get_fsid()) { + ldout(cct, 0) << "handle_osd_map fsid " << m->fsid + << " != " << monc->get_fsid() << dendl; + return; + } + + bool was_pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); + bool cluster_full = _osdmap_full_flag(); + bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || cluster_full || + _osdmap_has_pool_full(); + map<int64_t, bool> pool_full_map; + for (map<int64_t, pg_pool_t>::const_iterator it + = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); ++it) + pool_full_map[it->first] = _osdmap_pool_full(it->second); + + + list<LingerOp*> need_resend_linger; + map<ceph_tid_t, Op*> need_resend; + map<ceph_tid_t, CommandOp*> need_resend_command; + + if (m->get_last() <= osdmap->get_epoch()) { + ldout(cct, 3) << "handle_osd_map ignoring epochs [" + << m->get_first() << "," << m->get_last() + << "] <= " << osdmap->get_epoch() << dendl; + } else { + ldout(cct, 3) << "handle_osd_map got epochs [" + << m->get_first() << "," << m->get_last() + << "] > " << osdmap->get_epoch() << dendl; + + if (osdmap->get_epoch()) { + bool skipped_map = false; + // we want incrementals + for (epoch_t e = osdmap->get_epoch() + 1; + e <= m->get_last(); + e++) { + + if (osdmap->get_epoch() == e-1 && + m->incremental_maps.count(e)) { + 
ldout(cct, 3) << "handle_osd_map decoding incremental epoch " << e + << dendl; + OSDMap::Incremental inc(m->incremental_maps[e]); + osdmap->apply_incremental(inc); + + emit_blacklist_events(inc); + + logger->inc(l_osdc_map_inc); + } + else if (m->maps.count(e)) { + ldout(cct, 3) << "handle_osd_map decoding full epoch " << e << dendl; + OSDMap *new_osdmap = new OSDMap(); + new_osdmap->decode(m->maps[e]); + + emit_blacklist_events(*osdmap, *new_osdmap); + + // release the map being replaced; without this every full-map + // epoch applied here leaks the previous OSDMap. The old map is + // last used by emit_blacklist_events() above. + delete osdmap; + osdmap = new_osdmap; + + logger->inc(l_osdc_map_full); + } + else { + if (e >= m->get_oldest()) { + ldout(cct, 3) << "handle_osd_map requesting missing epoch " + << osdmap->get_epoch()+1 << dendl; + _maybe_request_map(); + break; + } + ldout(cct, 3) << "handle_osd_map missing epoch " + << osdmap->get_epoch()+1 + << ", jumping to " << m->get_oldest() << dendl; + e = m->get_oldest() - 1; + skipped_map = true; + continue; + } + logger->set(l_osdc_map_epoch, osdmap->get_epoch()); + + cluster_full = cluster_full || _osdmap_full_flag(); + update_pool_full_map(pool_full_map); + + // check all outstanding requests on every epoch + for (auto& i : need_resend) { + _prune_snapc(osdmap->get_new_removed_snaps(), i.second); + if (skipped_map) { + _prune_snapc(m->gap_removed_snaps, i.second); + } + } + _scan_requests(homeless_session, skipped_map, cluster_full, + &pool_full_map, need_resend, + need_resend_linger, need_resend_command, sul, + &m->gap_removed_snaps); + for (map<int,OSDSession*>::iterator p = osd_sessions.begin(); + p != osd_sessions.end(); ) { + OSDSession *s = p->second; + _scan_requests(s, skipped_map, cluster_full, + &pool_full_map, need_resend, + need_resend_linger, need_resend_command, sul, + &m->gap_removed_snaps); + ++p; + // osd down or addr change? + if (!osdmap->is_up(s->osd) || + (s->con && + s->con->get_peer_addrs() != osdmap->get_addrs(s->osd))) { + close_session(s); + } + } + + ceph_assert(e == osdmap->get_epoch()); + } + + } else { + // first map. we want the full thing. 
+ if (m->maps.count(m->get_last())) { + for (map<int,OSDSession*>::iterator p = osd_sessions.begin(); + p != osd_sessions.end(); ++p) { + OSDSession *s = p->second; + _scan_requests(s, false, false, NULL, need_resend, + need_resend_linger, need_resend_command, sul, + nullptr); + } + ldout(cct, 3) << "handle_osd_map decoding full epoch " + << m->get_last() << dendl; + osdmap->decode(m->maps[m->get_last()]); + + _scan_requests(homeless_session, false, false, NULL, + need_resend, need_resend_linger, + need_resend_command, sul, nullptr); + } else { + ldout(cct, 3) << "handle_osd_map hmm, i want a full map, requesting" + << dendl; + monc->sub_want("osdmap", 0, CEPH_SUBSCRIBE_ONETIME); + monc->renew_subs(); + } + } + } + + // make sure need_resend targets reflect latest map + for (auto p = need_resend.begin(); p != need_resend.end(); ) { + Op *op = p->second; + if (op->target.epoch < osdmap->get_epoch()) { + ldout(cct, 10) << __func__ << " checking op " << p->first << dendl; + int r = _calc_target(&op->target, nullptr); + if (r == RECALC_OP_TARGET_POOL_DNE) { + p = need_resend.erase(p); + _check_op_pool_dne(op, nullptr); + } else { + ++p; + } + } else { + ++p; + } + } + + bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); + bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag() + || _osdmap_has_pool_full(); + + // was/is paused? 
+ if (was_pauserd || was_pausewr || pauserd || pausewr || + osdmap->get_epoch() < epoch_barrier) { + _maybe_request_map(); + } + + // resend requests + for (map<ceph_tid_t, Op*>::iterator p = need_resend.begin(); + p != need_resend.end(); ++p) { + Op *op = p->second; + OSDSession *s = op->session; + bool mapped_session = false; + if (!s) { + int r = _map_session(&op->target, &s, sul); + ceph_assert(r == 0); + mapped_session = true; + } else { + get_session(s); + } + OSDSession::unique_lock sl(s->lock); + if (mapped_session) { + _session_op_assign(s, op); + } + if (op->should_resend) { + if (!op->session->is_homeless() && !op->target.paused) { + logger->inc(l_osdc_op_resend); + _send_op(op); + } + } else { + _op_cancel_map_check(op); + _cancel_linger_op(op); + } + sl.unlock(); + put_session(s); + } + for (list<LingerOp*>::iterator p = need_resend_linger.begin(); + p != need_resend_linger.end(); ++p) { + LingerOp *op = *p; + ceph_assert(op->session); + if (!op->session->is_homeless()) { + logger->inc(l_osdc_linger_resend); + _send_linger(op, sul); + } + } + for (map<ceph_tid_t,CommandOp*>::iterator p = need_resend_command.begin(); + p != need_resend_command.end(); ++p) { + CommandOp *c = p->second; + if (c->target.osd >= 0) { + _assign_command_session(c, sul); + if (c->session && !c->session->is_homeless()) { + _send_command(c); + } + } + } + + _dump_active(); + + // finish any Contexts that were waiting on a map update + map<epoch_t,list< pair< Context*, int > > >::iterator p = + waiting_for_map.begin(); + while (p != waiting_for_map.end() && + p->first <= osdmap->get_epoch()) { + //go through the list and call the onfinish methods + for (list<pair<Context*, int> >::iterator i = p->second.begin(); + i != p->second.end(); ++i) { + i->first->complete(i->second); + } + waiting_for_map.erase(p++); + } + + monc->sub_got("osdmap", osdmap->get_epoch()); + + if (!waiting_for_map.empty()) { + _maybe_request_map(); + } +} + +void Objecter::enable_blacklist_events() +{ + 
unique_lock wl(rwlock); + + blacklist_events_enabled = true; +} + +void Objecter::consume_blacklist_events(std::set<entity_addr_t> *events) +{ + unique_lock wl(rwlock); + + if (events->empty()) { + events->swap(blacklist_events); + } else { + for (const auto &i : blacklist_events) { + events->insert(i); + } + blacklist_events.clear(); + } +} + +void Objecter::emit_blacklist_events(const OSDMap::Incremental &inc) +{ + if (!blacklist_events_enabled) { + return; + } + + for (const auto &i : inc.new_blacklist) { + blacklist_events.insert(i.first); + } +} + +void Objecter::emit_blacklist_events(const OSDMap &old_osd_map, + const OSDMap &new_osd_map) +{ + if (!blacklist_events_enabled) { + return; + } + + std::set<entity_addr_t> old_set; + std::set<entity_addr_t> new_set; + + old_osd_map.get_blacklist(&old_set); + new_osd_map.get_blacklist(&new_set); + + std::set<entity_addr_t> delta_set; + std::set_difference( + new_set.begin(), new_set.end(), old_set.begin(), old_set.end(), + std::inserter(delta_set, delta_set.begin())); + blacklist_events.insert(delta_set.begin(), delta_set.end()); +} + +// op pool check + +void Objecter::C_Op_Map_Latest::finish(int r) +{ + if (r == -EAGAIN || r == -ECANCELED) + return; + + lgeneric_subdout(objecter->cct, objecter, 10) + << "op_map_latest r=" << r << " tid=" << tid + << " latest " << latest << dendl; + + Objecter::unique_lock wl(objecter->rwlock); + + map<ceph_tid_t, Op*>::iterator iter = + objecter->check_latest_map_ops.find(tid); + if (iter == objecter->check_latest_map_ops.end()) { + lgeneric_subdout(objecter->cct, objecter, 10) + << "op_map_latest op "<< tid << " not found" << dendl; + return; + } + + Op *op = iter->second; + objecter->check_latest_map_ops.erase(iter); + + lgeneric_subdout(objecter->cct, objecter, 20) + << "op_map_latest op "<< op << dendl; + + if (op->map_dne_bound == 0) + op->map_dne_bound = latest; + + OSDSession::unique_lock sl(op->session->lock, defer_lock); + objecter->_check_op_pool_dne(op, &sl); + + 
op->put(); +} + +int Objecter::pool_snap_by_name(int64_t poolid, const char *snap_name, + snapid_t *snap) const +{ + shared_lock rl(rwlock); + + auto& pools = osdmap->get_pools(); + auto iter = pools.find(poolid); + if (iter == pools.end()) { + return -ENOENT; + } + const pg_pool_t& pg_pool = iter->second; + for (auto p = pg_pool.snaps.begin(); + p != pg_pool.snaps.end(); + ++p) { + if (p->second.name == snap_name) { + *snap = p->first; + return 0; + } + } + return -ENOENT; +} + +int Objecter::pool_snap_get_info(int64_t poolid, snapid_t snap, + pool_snap_info_t *info) const +{ + shared_lock rl(rwlock); + + auto& pools = osdmap->get_pools(); + auto iter = pools.find(poolid); + if (iter == pools.end()) { + return -ENOENT; + } + const pg_pool_t& pg_pool = iter->second; + auto p = pg_pool.snaps.find(snap); + if (p == pg_pool.snaps.end()) + return -ENOENT; + *info = p->second; + + return 0; +} + +int Objecter::pool_snap_list(int64_t poolid, vector<uint64_t> *snaps) +{ + shared_lock rl(rwlock); + + const pg_pool_t *pi = osdmap->get_pg_pool(poolid); + if (!pi) + return -ENOENT; + for (map<snapid_t,pool_snap_info_t>::const_iterator p = pi->snaps.begin(); + p != pi->snaps.end(); + ++p) { + snaps->push_back(p->first); + } + return 0; +} + +// sl may be unlocked. +void Objecter::_check_op_pool_dne(Op *op, unique_lock *sl) +{ + // rwlock is locked unique + + if (op->target.pool_ever_existed) { + // the pool previously existed and now it does not, which means it + // was deleted. 
+ op->map_dne_bound = osdmap->get_epoch(); + ldout(cct, 10) << "check_op_pool_dne tid " << op->tid + << " pool previously exists but now does not" + << dendl; + } else { + ldout(cct, 10) << "check_op_pool_dne tid " << op->tid + << " current " << osdmap->get_epoch() + << " map_dne_bound " << op->map_dne_bound + << dendl; + } + if (op->map_dne_bound > 0) { + if (osdmap->get_epoch() >= op->map_dne_bound) { + // we had a new enough map + ldout(cct, 10) << "check_op_pool_dne tid " << op->tid + << " concluding pool " << op->target.base_pgid.pool() + << " dne" << dendl; + if (op->onfinish) { + num_in_flight--; + op->onfinish->complete(-ENOENT); + } + + OSDSession *s = op->session; + if (s) { + ceph_assert(s != NULL); + ceph_assert(sl->mutex() == &s->lock); + bool session_locked = sl->owns_lock(); + if (!session_locked) { + sl->lock(); + } + _finish_op(op, 0); + if (!session_locked) { + sl->unlock(); + } + } else { + _finish_op(op, 0); // no session + } + } + } else { + _send_op_map_check(op); + } +} + +void Objecter::_send_op_map_check(Op *op) +{ + // rwlock is locked unique + // ask the monitor + if (check_latest_map_ops.count(op->tid) == 0) { + op->get(); + check_latest_map_ops[op->tid] = op; + C_Op_Map_Latest *c = new C_Op_Map_Latest(this, op->tid); + monc->get_version("osdmap", &c->latest, NULL, c); + } +} + +void Objecter::_op_cancel_map_check(Op *op) +{ + // rwlock is locked unique + map<ceph_tid_t, Op*>::iterator iter = + check_latest_map_ops.find(op->tid); + if (iter != check_latest_map_ops.end()) { + Op *op = iter->second; + op->put(); + check_latest_map_ops.erase(iter); + } +} + +// linger pool check + +void Objecter::C_Linger_Map_Latest::finish(int r) +{ + if (r == -EAGAIN || r == -ECANCELED) { + // ignore callback; we will retry in resend_mon_ops() + return; + } + + unique_lock wl(objecter->rwlock); + + map<uint64_t, LingerOp*>::iterator iter = + objecter->check_latest_map_lingers.find(linger_id); + if (iter == objecter->check_latest_map_lingers.end()) { + 
return; + } + + LingerOp *op = iter->second; + objecter->check_latest_map_lingers.erase(iter); + + if (op->map_dne_bound == 0) + op->map_dne_bound = latest; + + bool unregister; + objecter->_check_linger_pool_dne(op, &unregister); + + if (unregister) { + objecter->_linger_cancel(op); + } + + op->put(); +} + +void Objecter::_check_linger_pool_dne(LingerOp *op, bool *need_unregister) +{ + // rwlock is locked unique + + *need_unregister = false; + + if (op->register_gen > 0) { + ldout(cct, 10) << "_check_linger_pool_dne linger_id " << op->linger_id + << " pool previously existed but now does not" + << dendl; + op->map_dne_bound = osdmap->get_epoch(); + } else { + ldout(cct, 10) << "_check_linger_pool_dne linger_id " << op->linger_id + << " current " << osdmap->get_epoch() + << " map_dne_bound " << op->map_dne_bound + << dendl; + } + if (op->map_dne_bound > 0) { + if (osdmap->get_epoch() >= op->map_dne_bound) { + LingerOp::unique_lock wl{op->watch_lock}; + if (op->on_reg_commit) { + op->on_reg_commit->complete(-ENOENT); + op->on_reg_commit = nullptr; + } + if (op->on_notify_finish) { + op->on_notify_finish->complete(-ENOENT); + op->on_notify_finish = nullptr; + } + *need_unregister = true; + } + } else { + _send_linger_map_check(op); + } +} + +void Objecter::_send_linger_map_check(LingerOp *op) +{ + // ask the monitor + if (check_latest_map_lingers.count(op->linger_id) == 0) { + op->get(); + check_latest_map_lingers[op->linger_id] = op; + C_Linger_Map_Latest *c = new C_Linger_Map_Latest(this, op->linger_id); + monc->get_version("osdmap", &c->latest, NULL, c); + } +} + +void Objecter::_linger_cancel_map_check(LingerOp *op) +{ + // rwlock is locked unique + + map<uint64_t, LingerOp*>::iterator iter = + check_latest_map_lingers.find(op->linger_id); + if (iter != check_latest_map_lingers.end()) { + LingerOp *op = iter->second; + op->put(); + check_latest_map_lingers.erase(iter); + } +} + +// command pool check + +void Objecter::C_Command_Map_Latest::finish(int r) +{ + if 
(r == -EAGAIN || r == -ECANCELED) { + // ignore callback; we will retry in resend_mon_ops() + return; + } + + unique_lock wl(objecter->rwlock); + + map<uint64_t, CommandOp*>::iterator iter = + objecter->check_latest_map_commands.find(tid); + if (iter == objecter->check_latest_map_commands.end()) { + return; + } + + CommandOp *c = iter->second; + objecter->check_latest_map_commands.erase(iter); + + if (c->map_dne_bound == 0) + c->map_dne_bound = latest; + + OSDSession::unique_lock sul(c->session->lock); + objecter->_check_command_map_dne(c); + sul.unlock(); + + c->put(); +} + +// Decide what to do with a command whose target could not be mapped: +// once the current osdmap epoch has reached c->map_dne_bound the command +// is finished with its recorded map_check_error; while no bound is known +// yet (map_dne_bound == 0) a map check is sent instead. +void Objecter::_check_command_map_dne(CommandOp *c) +{ + // rwlock is locked unique + // session is locked unique + + ldout(cct, 10) << "_check_command_map_dne tid " << c->tid + << " current " << osdmap->get_epoch() + << " map_dne_bound " << c->map_dne_bound + << dendl; + if (c->map_dne_bound > 0) { + if (osdmap->get_epoch() >= c->map_dne_bound) { + _finish_command(c, c->map_check_error, c->map_check_error_str); + } + } else { + _send_command_map_check(c); + } +} + +// Ask the monitor for the latest osdmap version on behalf of this +// command, unless a check for its tid is already pending. A ref on c is +// taken and held via check_latest_map_commands until the check completes +// or is canceled. +void Objecter::_send_command_map_check(CommandOp *c) +{ + // rwlock is locked unique + // session is locked unique + + // ask the monitor + if (check_latest_map_commands.count(c->tid) == 0) { + c->get(); + check_latest_map_commands[c->tid] = c; + C_Command_Map_Latest *f = new C_Command_Map_Latest(this, c->tid); + monc->get_version("osdmap", &f->latest, NULL, f); + } +} + +// Forget a pending map check for this command, if there is one, and put +// the ref that was taken when the check was registered. +void Objecter::_command_cancel_map_check(CommandOp *c) +{ + // rwlock is locked unique + + map<uint64_t, CommandOp*>::iterator iter = + check_latest_map_commands.find(c->tid); + if (iter != check_latest_map_commands.end()) { + CommandOp *c = iter->second; + c->put(); + check_latest_map_commands.erase(iter); + } +} + + +/** + * Look up OSDSession by OSD id. + * + * @returns 0 on success, or -EAGAIN if the lock context requires + * promotion to write. 
+ */ +int Objecter::_get_session(int osd, OSDSession **session, shunique_lock& sul) +{ + ceph_assert(sul && sul.mutex() == &rwlock); + + if (osd < 0) { + *session = homeless_session; + ldout(cct, 20) << __func__ << " osd=" << osd << " returning homeless" + << dendl; + return 0; + } + + map<int,OSDSession*>::iterator p = osd_sessions.find(osd); + if (p != osd_sessions.end()) { + OSDSession *s = p->second; + s->get(); + *session = s; + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " " + << s->get_nref() << dendl; + return 0; + } + if (!sul.owns_lock()) { + return -EAGAIN; + } + OSDSession *s = new OSDSession(cct, osd); + osd_sessions[osd] = s; + s->con = messenger->connect_to_osd(osdmap->get_addrs(osd)); + s->con->set_priv(RefCountedPtr{s}); + logger->inc(l_osdc_osd_session_open); + logger->set(l_osdc_osd_sessions, osd_sessions.size()); + s->get(); + *session = s; + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " " + << s->get_nref() << dendl; + return 0; +} + +void Objecter::put_session(Objecter::OSDSession *s) +{ + if (s && !s->is_homeless()) { + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << s->osd << " " + << s->get_nref() << dendl; + s->put(); + } +} + +void Objecter::get_session(Objecter::OSDSession *s) +{ + ceph_assert(s != NULL); + + if (!s->is_homeless()) { + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << s->osd << " " + << s->get_nref() << dendl; + s->get(); + } +} + +void Objecter::_reopen_session(OSDSession *s) +{ + // rwlock is locked unique + // s->lock is locked + + auto addrs = osdmap->get_addrs(s->osd); + ldout(cct, 10) << "reopen_session osd." 
<< s->osd << " session, addr now " + << addrs << dendl; + if (s->con) { + s->con->set_priv(NULL); + s->con->mark_down(); + logger->inc(l_osdc_osd_session_close); + } + s->con = messenger->connect_to_osd(addrs); + s->con->set_priv(RefCountedPtr{s}); + s->incarnation++; + logger->inc(l_osdc_osd_session_open); +} + +void Objecter::close_session(OSDSession *s) +{ + // rwlock is locked unique + + ldout(cct, 10) << "close_session for osd." << s->osd << dendl; + if (s->con) { + s->con->set_priv(NULL); + s->con->mark_down(); + logger->inc(l_osdc_osd_session_close); + } + OSDSession::unique_lock sl(s->lock); + + std::list<LingerOp*> homeless_lingers; + std::list<CommandOp*> homeless_commands; + std::list<Op*> homeless_ops; + + while (!s->linger_ops.empty()) { + std::map<uint64_t, LingerOp*>::iterator i = s->linger_ops.begin(); + ldout(cct, 10) << " linger_op " << i->first << dendl; + homeless_lingers.push_back(i->second); + _session_linger_op_remove(s, i->second); + } + + while (!s->ops.empty()) { + std::map<ceph_tid_t, Op*>::iterator i = s->ops.begin(); + ldout(cct, 10) << " op " << i->first << dendl; + homeless_ops.push_back(i->second); + _session_op_remove(s, i->second); + } + + while (!s->command_ops.empty()) { + std::map<ceph_tid_t, CommandOp*>::iterator i = s->command_ops.begin(); + ldout(cct, 10) << " command_op " << i->first << dendl; + homeless_commands.push_back(i->second); + _session_command_op_remove(s, i->second); + } + + osd_sessions.erase(s->osd); + sl.unlock(); + put_session(s); + + // Assign any leftover ops to the homeless session + { + OSDSession::unique_lock hsl(homeless_session->lock); + for (std::list<LingerOp*>::iterator i = homeless_lingers.begin(); + i != homeless_lingers.end(); ++i) { + _session_linger_op_assign(homeless_session, *i); + } + for (std::list<Op*>::iterator i = homeless_ops.begin(); + i != homeless_ops.end(); ++i) { + _session_op_assign(homeless_session, *i); + } + for (std::list<CommandOp*>::iterator i = homeless_commands.begin(); + i 
!= homeless_commands.end(); ++i) { + _session_command_op_assign(homeless_session, *i); + } + } + + logger->set(l_osdc_osd_sessions, osd_sessions.size()); +} + +void Objecter::wait_for_osd_map() +{ + unique_lock l(rwlock); + if (osdmap->get_epoch()) { + l.unlock(); + return; + } + + // Leave this since it goes with C_SafeCond + Mutex lock(""); + Cond cond; + bool done; + lock.Lock(); + C_SafeCond *context = new C_SafeCond(&lock, &cond, &done, NULL); + waiting_for_map[0].push_back(pair<Context*, int>(context, 0)); + l.unlock(); + while (!done) + cond.Wait(lock); + lock.Unlock(); +} + +struct C_Objecter_GetVersion : public Context { + Objecter *objecter; + uint64_t oldest, newest; + Context *fin; + C_Objecter_GetVersion(Objecter *o, Context *c) + : objecter(o), oldest(0), newest(0), fin(c) {} + void finish(int r) override { + if (r >= 0) { + objecter->get_latest_version(oldest, newest, fin); + } else if (r == -EAGAIN) { // try again as instructed + objecter->wait_for_latest_osdmap(fin); + } else { + // it doesn't return any other error codes! 
+ ceph_abort(); + } + } +}; + +void Objecter::wait_for_latest_osdmap(Context *fin) +{ + ldout(cct, 10) << __func__ << dendl; + C_Objecter_GetVersion *c = new C_Objecter_GetVersion(this, fin); + monc->get_version("osdmap", &c->newest, &c->oldest, c); +} + +void Objecter::get_latest_version(epoch_t oldest, epoch_t newest, Context *fin) +{ + unique_lock wl(rwlock); + if (osdmap->get_epoch() >= newest) { + ldout(cct, 10) << __func__ << " latest " << newest << ", have it" << dendl; + wl.unlock(); + if (fin) + fin->complete(0); + return; + } + + ldout(cct, 10) << __func__ << " latest " << newest << ", waiting" << dendl; + _wait_for_new_map(fin, newest, 0); +} + +void Objecter::maybe_request_map() +{ + shared_lock rl(rwlock); + _maybe_request_map(); +} + +void Objecter::_maybe_request_map() +{ + // rwlock is locked + int flag = 0; + if (_osdmap_full_flag() + || osdmap->test_flag(CEPH_OSDMAP_PAUSERD) + || osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) { + ldout(cct, 10) << "_maybe_request_map subscribing (continuous) to next " + "osd map (FULL flag is set)" << dendl; + } else { + ldout(cct, 10) + << "_maybe_request_map subscribing (onetime) to next osd map" << dendl; + flag = CEPH_SUBSCRIBE_ONETIME; + } + epoch_t epoch = osdmap->get_epoch() ? osdmap->get_epoch()+1 : 0; + if (monc->sub_want("osdmap", epoch, flag)) { + monc->renew_subs(); + } +} + +void Objecter::_wait_for_new_map(Context *c, epoch_t epoch, int err) +{ + // rwlock is locked unique + waiting_for_map[epoch].push_back(pair<Context *, int>(c, err)); + _maybe_request_map(); +} + + +/** + * Use this together with wait_for_map: this is a pre-check to avoid + * allocating a Context for wait_for_map if we can see that we + * definitely already have the epoch. + * + * This does *not* replace the need to handle the return value of + * wait_for_map: just because we don't have it in this pre-check + * doesn't mean we won't have it when calling back into wait_for_map, + * since the objecter lock is dropped in between. 
+ */ +bool Objecter::have_map(const epoch_t epoch) +{ + shared_lock rl(rwlock); + if (osdmap->get_epoch() >= epoch) { + return true; + } else { + return false; + } +} + +bool Objecter::wait_for_map(epoch_t epoch, Context *c, int err) +{ + unique_lock wl(rwlock); + if (osdmap->get_epoch() >= epoch) { + return true; + } + _wait_for_new_map(c, epoch, err); + return false; +} + +void Objecter::_kick_requests(OSDSession *session, + map<uint64_t, LingerOp *>& lresend) +{ + // rwlock is locked unique + + // clear backoffs + session->backoffs.clear(); + session->backoffs_by_id.clear(); + + // resend ops + map<ceph_tid_t,Op*> resend; // resend in tid order + for (map<ceph_tid_t, Op*>::iterator p = session->ops.begin(); + p != session->ops.end();) { + Op *op = p->second; + ++p; + if (op->should_resend) { + if (!op->target.paused) + resend[op->tid] = op; + } else { + _op_cancel_map_check(op); + _cancel_linger_op(op); + } + } + + logger->inc(l_osdc_op_resend, resend.size()); + while (!resend.empty()) { + _send_op(resend.begin()->second); + resend.erase(resend.begin()); + } + + // resend lingers + logger->inc(l_osdc_linger_resend, session->linger_ops.size()); + for (map<ceph_tid_t, LingerOp*>::iterator j = session->linger_ops.begin(); + j != session->linger_ops.end(); ++j) { + LingerOp *op = j->second; + op->get(); + ceph_assert(lresend.count(j->first) == 0); + lresend[j->first] = op; + } + + // resend commands + logger->inc(l_osdc_command_resend, session->command_ops.size()); + map<uint64_t,CommandOp*> cresend; // resend in order + for (map<ceph_tid_t, CommandOp*>::iterator k = session->command_ops.begin(); + k != session->command_ops.end(); ++k) { + cresend[k->first] = k->second; + } + while (!cresend.empty()) { + _send_command(cresend.begin()->second); + cresend.erase(cresend.begin()); + } +} + +void Objecter::_linger_ops_resend(map<uint64_t, LingerOp *>& lresend, + unique_lock& ul) +{ + ceph_assert(ul.owns_lock()); + shunique_lock sul(std::move(ul)); + while 
(!lresend.empty()) { + LingerOp *op = lresend.begin()->second; + if (!op->canceled) { + _send_linger(op, sul); + } + op->put(); + lresend.erase(lresend.begin()); + } + ul = sul.release_to_unique(); +} + +void Objecter::start_tick() +{ + ceph_assert(tick_event == 0); + tick_event = + timer.add_event(ceph::make_timespan(cct->_conf->objecter_tick_interval), + &Objecter::tick, this); +} + +void Objecter::tick() +{ + shared_lock rl(rwlock); + + ldout(cct, 10) << "tick" << dendl; + + // we are only called by C_Tick + tick_event = 0; + + if (!initialized) { + // we raced with shutdown + ldout(cct, 10) << __func__ << " raced with shutdown" << dendl; + return; + } + + set<OSDSession*> toping; + + + // look for laggy requests + auto cutoff = ceph::coarse_mono_clock::now(); + cutoff -= ceph::make_timespan(cct->_conf->objecter_timeout); // timeout + + unsigned laggy_ops = 0; + + for (map<int,OSDSession*>::iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::lock_guard l(s->lock); + bool found = false; + for (map<ceph_tid_t,Op*>::iterator p = s->ops.begin(); + p != s->ops.end(); + ++p) { + Op *op = p->second; + ceph_assert(op->session); + if (op->stamp < cutoff) { + ldout(cct, 2) << " tid " << p->first << " on osd." << op->session->osd + << " is laggy" << dendl; + found = true; + ++laggy_ops; + } + } + for (map<uint64_t,LingerOp*>::iterator p = s->linger_ops.begin(); + p != s->linger_ops.end(); + ++p) { + LingerOp *op = p->second; + LingerOp::unique_lock wl(op->watch_lock); + ceph_assert(op->session); + ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first + << " (osd." 
<< op->session->osd << ")" << dendl; + found = true; + if (op->is_watch && op->registered && !op->last_error) + _send_linger_ping(op); + } + for (map<uint64_t,CommandOp*>::iterator p = s->command_ops.begin(); + p != s->command_ops.end(); + ++p) { + CommandOp *op = p->second; + ceph_assert(op->session); + ldout(cct, 10) << " pinging osd that serves command tid " << p->first + << " (osd." << op->session->osd << ")" << dendl; + found = true; + } + if (found) + toping.insert(s); + } + if (num_homeless_ops || !toping.empty()) { + _maybe_request_map(); + } + + logger->set(l_osdc_op_laggy, laggy_ops); + logger->set(l_osdc_osd_laggy, toping.size()); + + if (!toping.empty()) { + // send a ping to these osds, to ensure we detect any session resets + // (osd reply message policy is lossy) + for (set<OSDSession*>::const_iterator i = toping.begin(); + i != toping.end(); + ++i) { + (*i)->con->send_message(new MPing); + } + } + + // Make sure we don't reschedule if we wake up after shutdown + if (initialized) { + tick_event = timer.reschedule_me(ceph::make_timespan( + cct->_conf->objecter_tick_interval)); + } +} + +void Objecter::resend_mon_ops() +{ + unique_lock wl(rwlock); + + ldout(cct, 10) << "resend_mon_ops" << dendl; + + for (map<ceph_tid_t,PoolStatOp*>::iterator p = poolstat_ops.begin(); + p != poolstat_ops.end(); + ++p) { + _poolstat_submit(p->second); + logger->inc(l_osdc_poolstat_resend); + } + + for (map<ceph_tid_t,StatfsOp*>::iterator p = statfs_ops.begin(); + p != statfs_ops.end(); + ++p) { + _fs_stats_submit(p->second); + logger->inc(l_osdc_statfs_resend); + } + + for (map<ceph_tid_t,PoolOp*>::iterator p = pool_ops.begin(); + p != pool_ops.end(); + ++p) { + _pool_op_submit(p->second); + logger->inc(l_osdc_poolop_resend); + } + + for (map<ceph_tid_t, Op*>::iterator p = check_latest_map_ops.begin(); + p != check_latest_map_ops.end(); + ++p) { + C_Op_Map_Latest *c = new C_Op_Map_Latest(this, p->second->tid); + monc->get_version("osdmap", &c->latest, NULL, c); + } + + 
for (map<uint64_t, LingerOp*>::iterator p = check_latest_map_lingers.begin(); + p != check_latest_map_lingers.end(); + ++p) { + C_Linger_Map_Latest *c + = new C_Linger_Map_Latest(this, p->second->linger_id); + monc->get_version("osdmap", &c->latest, NULL, c); + } + + for (map<uint64_t, CommandOp*>::iterator p + = check_latest_map_commands.begin(); + p != check_latest_map_commands.end(); + ++p) { + C_Command_Map_Latest *c = new C_Command_Map_Latest(this, p->second->tid); + monc->get_version("osdmap", &c->latest, NULL, c); + } +} + +// read | write --------------------------- + +void Objecter::op_submit(Op *op, ceph_tid_t *ptid, int *ctx_budget) +{ + shunique_lock rl(rwlock, ceph::acquire_shared); + ceph_tid_t tid = 0; + if (!ptid) + ptid = &tid; + op->trace.event("op submit"); + _op_submit_with_budget(op, rl, ptid, ctx_budget); +} + +void Objecter::_op_submit_with_budget(Op *op, shunique_lock& sul, + ceph_tid_t *ptid, + int *ctx_budget) +{ + ceph_assert(initialized); + + ceph_assert(op->ops.size() == op->out_bl.size()); + ceph_assert(op->ops.size() == op->out_rval.size()); + ceph_assert(op->ops.size() == op->out_handler.size()); + + // throttle. before we look at any state, because + // _take_op_budget() may drop our lock while it blocks. 
+ if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) { + int op_budget = _take_op_budget(op, sul); + // take and pass out the budget for the first OP + // in the context session + if (ctx_budget && (*ctx_budget == -1)) { + *ctx_budget = op_budget; + } + } + + if (osd_timeout > timespan(0)) { + if (op->tid == 0) + op->tid = ++last_tid; + auto tid = op->tid; + op->ontimeout = timer.add_event(osd_timeout, + [this, tid]() { + op_cancel(tid, -ETIMEDOUT); }); + } + + _op_submit(op, sul, ptid); +} + +void Objecter::_send_op_account(Op *op) +{ + inflight_ops++; + + // add to gather set(s) + if (op->onfinish) { + num_in_flight++; + } else { + ldout(cct, 20) << " note: not requesting reply" << dendl; + } + + logger->inc(l_osdc_op_active); + logger->inc(l_osdc_op); + + if ((op->target.flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)) == + (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) + logger->inc(l_osdc_op_rmw); + else if (op->target.flags & CEPH_OSD_FLAG_WRITE) + logger->inc(l_osdc_op_w); + else if (op->target.flags & CEPH_OSD_FLAG_READ) + logger->inc(l_osdc_op_r); + + if (op->target.flags & CEPH_OSD_FLAG_PGOP) + logger->inc(l_osdc_op_pg); + + for (vector<OSDOp>::iterator p = op->ops.begin(); p != op->ops.end(); ++p) { + int code = l_osdc_osdop_other; + switch (p->op.op) { + case CEPH_OSD_OP_STAT: code = l_osdc_osdop_stat; break; + case CEPH_OSD_OP_CREATE: code = l_osdc_osdop_create; break; + case CEPH_OSD_OP_READ: code = l_osdc_osdop_read; break; + case CEPH_OSD_OP_WRITE: code = l_osdc_osdop_write; break; + case CEPH_OSD_OP_WRITEFULL: code = l_osdc_osdop_writefull; break; + case CEPH_OSD_OP_WRITESAME: code = l_osdc_osdop_writesame; break; + case CEPH_OSD_OP_APPEND: code = l_osdc_osdop_append; break; + case CEPH_OSD_OP_ZERO: code = l_osdc_osdop_zero; break; + case CEPH_OSD_OP_TRUNCATE: code = l_osdc_osdop_truncate; break; + case CEPH_OSD_OP_DELETE: code = l_osdc_osdop_delete; break; + case CEPH_OSD_OP_MAPEXT: code = l_osdc_osdop_mapext; break; + case 
CEPH_OSD_OP_SPARSE_READ: code = l_osdc_osdop_sparse_read; break; + case CEPH_OSD_OP_GETXATTR: code = l_osdc_osdop_getxattr; break; + case CEPH_OSD_OP_SETXATTR: code = l_osdc_osdop_setxattr; break; + case CEPH_OSD_OP_CMPXATTR: code = l_osdc_osdop_cmpxattr; break; + case CEPH_OSD_OP_RMXATTR: code = l_osdc_osdop_rmxattr; break; + case CEPH_OSD_OP_RESETXATTRS: code = l_osdc_osdop_resetxattrs; break; + + // OMAP read operations + case CEPH_OSD_OP_OMAPGETVALS: + case CEPH_OSD_OP_OMAPGETKEYS: + case CEPH_OSD_OP_OMAPGETHEADER: + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + case CEPH_OSD_OP_OMAP_CMP: code = l_osdc_osdop_omap_rd; break; + + // OMAP write operations + case CEPH_OSD_OP_OMAPSETVALS: + case CEPH_OSD_OP_OMAPSETHEADER: code = l_osdc_osdop_omap_wr; break; + + // OMAP del operations + case CEPH_OSD_OP_OMAPCLEAR: + case CEPH_OSD_OP_OMAPRMKEYS: code = l_osdc_osdop_omap_del; break; + + case CEPH_OSD_OP_CALL: code = l_osdc_osdop_call; break; + case CEPH_OSD_OP_WATCH: code = l_osdc_osdop_watch; break; + case CEPH_OSD_OP_NOTIFY: code = l_osdc_osdop_notify; break; + } + if (code) + logger->inc(code); + } +} + +void Objecter::_op_submit(Op *op, shunique_lock& sul, ceph_tid_t *ptid) +{ + // rwlock is locked + + ldout(cct, 10) << __func__ << " op " << op << dendl; + + // pick target + ceph_assert(op->session == NULL); + OSDSession *s = NULL; + + bool check_for_latest_map = _calc_target(&op->target, nullptr) + == RECALC_OP_TARGET_POOL_DNE; + + // Try to get a session, including a retry if we need to take write lock + int r = _get_session(op->target.osd, &s, sul); + if (r == -EAGAIN || + (check_for_latest_map && sul.owns_lock_shared()) || + cct->_conf->objecter_debug_inject_relock_delay) { + epoch_t orig_epoch = osdmap->get_epoch(); + sul.unlock(); + if (cct->_conf->objecter_debug_inject_relock_delay) { + sleep(1); + } + sul.lock(); + if (orig_epoch != osdmap->get_epoch()) { + // map changed; recalculate mapping + ldout(cct, 10) << __func__ << " relock raced with osdmap, recalc 
target" + << dendl; + check_for_latest_map = _calc_target(&op->target, nullptr) + == RECALC_OP_TARGET_POOL_DNE; + if (s) { + put_session(s); + s = NULL; + r = -EAGAIN; + } + } + } + if (r == -EAGAIN) { + ceph_assert(s == NULL); + r = _get_session(op->target.osd, &s, sul); + } + ceph_assert(r == 0); + ceph_assert(s); // may be homeless + + _send_op_account(op); + + // send? + + ceph_assert(op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)); + + if (osdmap_full_try) { + op->target.flags |= CEPH_OSD_FLAG_FULL_TRY; + } + + bool need_send = false; + + if (osdmap->get_epoch() < epoch_barrier) { + ldout(cct, 10) << " barrier, paused " << op << " tid " << op->tid + << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) && + osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) { + ldout(cct, 10) << " paused modify " << op << " tid " << op->tid + << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if ((op->target.flags & CEPH_OSD_FLAG_READ) && + osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) { + ldout(cct, 10) << " paused read " << op << " tid " << op->tid + << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if (op->respects_full() && + (_osdmap_full_flag() || + _osdmap_pool_full(op->target.base_oloc.pool))) { + ldout(cct, 0) << " FULL, paused modify " << op << " tid " + << op->tid << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if (!s->is_homeless()) { + need_send = true; + } else { + _maybe_request_map(); + } + + OSDSession::unique_lock sl(s->lock); + if (op->tid == 0) + op->tid = ++last_tid; + + ldout(cct, 10) << "_op_submit oid " << op->target.base_oid + << " '" << op->target.base_oloc << "' '" + << op->target.target_oloc << "' " << op->ops << " tid " + << op->tid << " osd." << (!s->is_homeless() ? 
s->osd : -1) + << dendl; + + _session_op_assign(s, op); + + if (need_send) { + _send_op(op); + } + + // Last chance to touch Op here, after giving up session lock it can + // be freed at any time by response handler. + ceph_tid_t tid = op->tid; + if (check_for_latest_map) { + _send_op_map_check(op); + } + if (ptid) + *ptid = tid; + op = NULL; + + sl.unlock(); + put_session(s); + + ldout(cct, 5) << num_in_flight << " in flight" << dendl; +} + +int Objecter::op_cancel(OSDSession *s, ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + OSDSession::unique_lock sl(s->lock); + + map<ceph_tid_t, Op*>::iterator p = s->ops.find(tid); + if (p == s->ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne in session " + << s->osd << dendl; + return -ENOENT; + } + +#if 0 + if (s->con) { + ldout(cct, 20) << " revoking rx buffer for " << tid + << " on " << s->con << dendl; + s->con->revoke_rx_buffer(tid); + } +#endif + + ldout(cct, 10) << __func__ << " tid " << tid << " in session " << s->osd + << dendl; + Op *op = p->second; + if (op->onfinish) { + num_in_flight--; + op->onfinish->complete(r); + op->onfinish = NULL; + } + _op_cancel_map_check(op); + _finish_op(op, r); + sl.unlock(); + + return 0; +} + +int Objecter::op_cancel(ceph_tid_t tid, int r) +{ + int ret = 0; + + unique_lock wl(rwlock); + ret = _op_cancel(tid, r); + + return ret; +} + +int Objecter::op_cancel(const vector<ceph_tid_t>& tids, int r) +{ + unique_lock wl(rwlock); + ldout(cct,10) << __func__ << " " << tids << dendl; + for (auto tid : tids) { + _op_cancel(tid, r); + } + return 0; +} + +int Objecter::_op_cancel(ceph_tid_t tid, int r) +{ + int ret = 0; + + ldout(cct, 5) << __func__ << ": cancelling tid " << tid << " r=" << r + << dendl; + +start: + + for (map<int, OSDSession *>::iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + if (s->ops.find(tid) != s->ops.end()) { + sl.unlock(); + ret = 
op_cancel(s, tid, r); + if (ret == -ENOENT) { + /* oh no! raced, maybe tid moved to another session, restarting */ + goto start; + } + return ret; + } + } + + ldout(cct, 5) << __func__ << ": tid " << tid + << " not found in live sessions" << dendl; + + // Handle case where the op is in homeless session + OSDSession::shared_lock sl(homeless_session->lock); + if (homeless_session->ops.find(tid) != homeless_session->ops.end()) { + sl.unlock(); + ret = op_cancel(homeless_session, tid, r); + if (ret == -ENOENT) { + /* oh no! raced, maybe tid moved to another session, restarting */ + goto start; + } else { + return ret; + } + } else { + sl.unlock(); + } + + ldout(cct, 5) << __func__ << ": tid " << tid + << " not found in homeless session" << dendl; + + return ret; +} + + +epoch_t Objecter::op_cancel_writes(int r, int64_t pool) +{ + unique_lock wl(rwlock); + + std::vector<ceph_tid_t> to_cancel; + bool found = false; + + for (map<int, OSDSession *>::iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + for (map<ceph_tid_t, Op*>::iterator op_i = s->ops.begin(); + op_i != s->ops.end(); ++op_i) { + if (op_i->second->target.flags & CEPH_OSD_FLAG_WRITE + && (pool == -1 || op_i->second->target.target_oloc.pool == pool)) { + to_cancel.push_back(op_i->first); + } + } + sl.unlock(); + + for (std::vector<ceph_tid_t>::iterator titer = to_cancel.begin(); + titer != to_cancel.end(); + ++titer) { + int cancel_result = op_cancel(s, *titer, r); + // We hold rwlock across search and cancellation, so cancels + // should always succeed + ceph_assert(cancel_result == 0); + } + if (!found && to_cancel.size()) + found = true; + to_cancel.clear(); + } + + const epoch_t epoch = osdmap->get_epoch(); + + wl.unlock(); + + if (found) { + return epoch; + } else { + return -1; + } +} + +bool Objecter::is_pg_changed( + int oldprimary, + const vector<int>& oldacting, + int newprimary, + const vector<int>& 
newacting, + bool any_change) +{ + if (OSDMap::primary_changed( + oldprimary, + oldacting, + newprimary, + newacting)) + return true; + if (any_change && oldacting != newacting) + return true; + return false; // same primary (tho replicas may have changed) +} + +bool Objecter::target_should_be_paused(op_target_t *t) +{ + const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool); + bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); + bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || + _osdmap_full_flag() || _osdmap_pool_full(*pi); + + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr) || + (osdmap->get_epoch() < epoch_barrier); +} + +/** + * Locking public accessor for _osdmap_full_flag + */ +bool Objecter::osdmap_full_flag() const +{ + shared_lock rl(rwlock); + + return _osdmap_full_flag(); +} + +bool Objecter::osdmap_pool_full(const int64_t pool_id) const +{ + shared_lock rl(rwlock); + + if (_osdmap_full_flag()) { + return true; + } + + return _osdmap_pool_full(pool_id); +} + +bool Objecter::_osdmap_pool_full(const int64_t pool_id) const +{ + const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); + if (pool == NULL) { + ldout(cct, 4) << __func__ << ": DNE pool " << pool_id << dendl; + return false; + } + + return _osdmap_pool_full(*pool); +} + +bool Objecter::_osdmap_has_pool_full() const +{ + for (map<int64_t, pg_pool_t>::const_iterator it + = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); ++it) { + if (_osdmap_pool_full(it->second)) + return true; + } + return false; +} + +bool Objecter::_osdmap_pool_full(const pg_pool_t &p) const +{ + return p.has_flag(pg_pool_t::FLAG_FULL) && honor_osdmap_full; +} + +/** + * Wrapper around osdmap->test_flag for special handling of the FULL flag. 
+ */ +bool Objecter::_osdmap_full_flag() const +{ + // Ignore the FULL flag if the caller does not have honor_osdmap_full + return osdmap->test_flag(CEPH_OSDMAP_FULL) && honor_osdmap_full; +} + +void Objecter::update_pool_full_map(map<int64_t, bool>& pool_full_map) +{ + for (map<int64_t, pg_pool_t>::const_iterator it + = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); ++it) { + if (pool_full_map.find(it->first) == pool_full_map.end()) { + pool_full_map[it->first] = _osdmap_pool_full(it->second); + } else { + pool_full_map[it->first] = _osdmap_pool_full(it->second) || + pool_full_map[it->first]; + } + } +} + +int64_t Objecter::get_object_hash_position(int64_t pool, const string& key, + const string& ns) +{ + shared_lock rl(rwlock); + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -ENOENT; + return p->hash_key(key, ns); +} + +int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key, + const string& ns) +{ + shared_lock rl(rwlock); + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -ENOENT; + return p->raw_hash_to_pg(p->hash_key(key, ns)); +} + +void Objecter::_prune_snapc( + const mempool::osdmap::map<int64_t, + OSDMap::snap_interval_set_t>& new_removed_snaps, + Op *op) +{ + bool match = false; + auto i = new_removed_snaps.find(op->target.base_pgid.pool()); + if (i != new_removed_snaps.end()) { + for (auto s : op->snapc.snaps) { + if (i->second.contains(s)) { + match = true; + break; + } + } + if (match) { + vector<snapid_t> new_snaps; + for (auto s : op->snapc.snaps) { + if (!i->second.contains(s)) { + new_snaps.push_back(s); + } + } + op->snapc.snaps.swap(new_snaps); + ldout(cct,10) << __func__ << " op " << op->tid << " snapc " << op->snapc + << " (was " << new_snaps << ")" << dendl; + } + } +} + +int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) +{ + // rwlock is locked + bool is_read = t->flags & CEPH_OSD_FLAG_READ; + bool is_write = t->flags & 
CEPH_OSD_FLAG_WRITE; + t->epoch = osdmap->get_epoch(); + ldout(cct,20) << __func__ << " epoch " << t->epoch + << " base " << t->base_oid << " " << t->base_oloc + << " precalc_pgid " << (int)t->precalc_pgid + << " pgid " << t->base_pgid + << (is_read ? " is_read" : "") + << (is_write ? " is_write" : "") + << dendl; + + const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool); + if (!pi) { + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } + ldout(cct,30) << __func__ << " base pi " << pi + << " pg_num " << pi->get_pg_num() << dendl; + + bool force_resend = false; + if (osdmap->get_epoch() == pi->last_force_op_resend) { + if (t->last_force_resend < pi->last_force_op_resend) { + t->last_force_resend = pi->last_force_op_resend; + force_resend = true; + } else if (t->last_force_resend == 0) { + force_resend = true; + } + } + + // apply tiering + t->target_oid = t->base_oid; + t->target_oloc = t->base_oloc; + if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (is_read && pi->has_read_tier()) + t->target_oloc.pool = pi->read_tier; + if (is_write && pi->has_write_tier()) + t->target_oloc.pool = pi->write_tier; + pi = osdmap->get_pg_pool(t->target_oloc.pool); + if (!pi) { + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } + } + + pg_t pgid; + if (t->precalc_pgid) { + ceph_assert(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY); + ceph_assert(t->base_oid.name.empty()); // make sure this is a pg op + ceph_assert(t->base_oloc.pool == (int64_t)t->base_pgid.pool()); + pgid = t->base_pgid; + } else { + int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc, + pgid); + if (ret == -ENOENT) { + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } + } + ldout(cct,20) << __func__ << " target " << t->target_oid << " " + << t->target_oloc << " -> pgid " << pgid << dendl; + ldout(cct,30) << __func__ << " target pi " << pi + << " pg_num " << pi->get_pg_num() << dendl; + t->pool_ever_existed = true; + + int size = pi->size; + int min_size = pi->min_size; + unsigned 
pg_num = pi->get_pg_num(); + unsigned pg_num_pending = pi->get_pg_num_pending(); + int up_primary, acting_primary; + vector<int> up, acting; + osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary, + &acting, &acting_primary); + bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE); + bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES); + unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask); + pg_t prev_pgid(prev_seed, pgid.pool()); + if (any_change && PastIntervals::is_new_interval( + t->acting_primary, + acting_primary, + t->acting, + acting, + t->up_primary, + up_primary, + t->up, + up, + t->size, + size, + t->min_size, + min_size, + t->pg_num, + pg_num, + t->pg_num_pending, + pg_num_pending, + t->sort_bitwise, + sort_bitwise, + t->recovery_deletes, + recovery_deletes, + prev_pgid)) { + force_resend = true; + } + + bool unpaused = false; + bool should_be_paused = target_should_be_paused(t); + if (t->paused && !should_be_paused) { + unpaused = true; + } + t->paused = should_be_paused; + + bool legacy_change = + t->pgid != pgid || + is_pg_changed( + t->acting_primary, t->acting, acting_primary, acting, + t->used_replica || any_change); + bool split_or_merge = false; + if (t->pg_num) { + split_or_merge = + prev_pgid.is_split(t->pg_num, pg_num, nullptr) || + prev_pgid.is_merge_source(t->pg_num, pg_num, nullptr) || + prev_pgid.is_merge_target(t->pg_num, pg_num); + } + + if (legacy_change || split_or_merge || force_resend) { + t->pgid = pgid; + t->acting = acting; + t->acting_primary = acting_primary; + t->up_primary = up_primary; + t->up = up; + t->size = size; + t->min_size = min_size; + t->pg_num = pg_num; + t->pg_num_mask = pi->get_pg_num_mask(); + t->pg_num_pending = pg_num_pending; + osdmap->get_primary_shard( + pg_t(ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask), pgid.pool()), + &t->actual_pgid); + t->sort_bitwise = sort_bitwise; + t->recovery_deletes = recovery_deletes; + ldout(cct, 10) << __func__ << " " 
+ << " raw pgid " << pgid << " -> actual " << t->actual_pgid + << " acting " << acting + << " primary " << acting_primary << dendl; + t->used_replica = false; + if (acting_primary == -1) { + t->osd = -1; + } else { + int osd; + bool read = is_read && !is_write; + if (read && (t->flags & CEPH_OSD_FLAG_BALANCE_READS)) { + int p = rand() % acting.size(); + if (p) + t->used_replica = true; + osd = acting[p]; + ldout(cct, 10) << " chose random osd." << osd << " of " << acting + << dendl; + } else if (read && (t->flags & CEPH_OSD_FLAG_LOCALIZE_READS) && + acting.size() > 1) { + // look for a local replica. prefer the primary if the + // distance is the same. + int best = -1; + int best_locality = 0; + for (unsigned i = 0; i < acting.size(); ++i) { + int locality = osdmap->crush->get_common_ancestor_distance( + cct, acting[i], crush_location); + ldout(cct, 20) << __func__ << " localize: rank " << i + << " osd." << acting[i] + << " locality " << locality << dendl; + if (i == 0 || + (locality >= 0 && best_locality >= 0 && + locality < best_locality) || + (best_locality < 0 && locality >= 0)) { + best = i; + best_locality = locality; + if (i) + t->used_replica = true; + } + } + ceph_assert(best >= 0); + osd = acting[best]; + } else { + osd = acting_primary; + } + t->osd = osd; + } + } + if (legacy_change || unpaused || force_resend) { + return RECALC_OP_TARGET_NEED_RESEND; + } + if (split_or_merge && + (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS || + HAVE_FEATURE(osdmap->get_xinfo(acting_primary).features, + RESEND_ON_SPLIT))) { + return RECALC_OP_TARGET_NEED_RESEND; + } + return RECALC_OP_TARGET_NO_ACTION; +} + +int Objecter::_map_session(op_target_t *target, OSDSession **s, + shunique_lock& sul) +{ + _calc_target(target, nullptr); + return _get_session(target->osd, s, sul); +} + +void Objecter::_session_op_assign(OSDSession *to, Op *op) +{ + // to->lock is locked + ceph_assert(op->session == NULL); + ceph_assert(op->tid); + + get_session(to); + op->session = to; 
+ to->ops[op->tid] = op; + + if (to->is_homeless()) { + num_homeless_ops++; + } + + ldout(cct, 15) << __func__ << " " << to->osd << " " << op->tid << dendl; +} + +void Objecter::_session_op_remove(OSDSession *from, Op *op) +{ + ceph_assert(op->session == from); + // from->lock is locked + + if (from->is_homeless()) { + num_homeless_ops--; + } + + from->ops.erase(op->tid); + put_session(from); + op->session = NULL; + + ldout(cct, 15) << __func__ << " " << from->osd << " " << op->tid << dendl; +} + +void Objecter::_session_linger_op_assign(OSDSession *to, LingerOp *op) +{ + // to lock is locked unique + ceph_assert(op->session == NULL); + + if (to->is_homeless()) { + num_homeless_ops++; + } + + get_session(to); + op->session = to; + to->linger_ops[op->linger_id] = op; + + ldout(cct, 15) << __func__ << " " << to->osd << " " << op->linger_id + << dendl; +} + +void Objecter::_session_linger_op_remove(OSDSession *from, LingerOp *op) +{ + ceph_assert(from == op->session); + // from->lock is locked unique + + if (from->is_homeless()) { + num_homeless_ops--; + } + + from->linger_ops.erase(op->linger_id); + put_session(from); + op->session = NULL; + + ldout(cct, 15) << __func__ << " " << from->osd << " " << op->linger_id + << dendl; +} + +void Objecter::_session_command_op_remove(OSDSession *from, CommandOp *op) +{ + ceph_assert(from == op->session); + // from->lock is locked + + if (from->is_homeless()) { + num_homeless_ops--; + } + + from->command_ops.erase(op->tid); + put_session(from); + op->session = NULL; + + ldout(cct, 15) << __func__ << " " << from->osd << " " << op->tid << dendl; +} + +void Objecter::_session_command_op_assign(OSDSession *to, CommandOp *op) +{ + // to->lock is locked + ceph_assert(op->session == NULL); + ceph_assert(op->tid); + + if (to->is_homeless()) { + num_homeless_ops++; + } + + get_session(to); + op->session = to; + to->command_ops[op->tid] = op; + + ldout(cct, 15) << __func__ << " " << to->osd << " " << op->tid << dendl; +} + +int 
Objecter::_recalc_linger_op_target(LingerOp *linger_op, + shunique_lock& sul) +{ + // rwlock is locked unique + + int r = _calc_target(&linger_op->target, nullptr, true); + if (r == RECALC_OP_TARGET_NEED_RESEND) { + ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id + << " pgid " << linger_op->target.pgid + << " acting " << linger_op->target.acting << dendl; + + OSDSession *s = NULL; + r = _get_session(linger_op->target.osd, &s, sul); + ceph_assert(r == 0); + + if (linger_op->session != s) { + // NB locking two sessions (s and linger_op->session) at the + // same time here is only safe because we are the only one that + // takes two, and we are holding rwlock for write. Disable + // lockdep because it doesn't know that. + OSDSession::unique_lock sl(s->lock); + _session_linger_op_remove(linger_op->session, linger_op); + _session_linger_op_assign(s, linger_op); + } + + put_session(s); + return RECALC_OP_TARGET_NEED_RESEND; + } + return r; +} + +void Objecter::_cancel_linger_op(Op *op) +{ + ldout(cct, 15) << "cancel_op " << op->tid << dendl; + + ceph_assert(!op->should_resend); + if (op->onfinish) { + delete op->onfinish; + num_in_flight--; + } + + _finish_op(op, 0); +} + +void Objecter::_finish_op(Op *op, int r) +{ + ldout(cct, 15) << __func__ << " " << op->tid << dendl; + + // op->session->lock is locked unique or op->session is null + + if (!op->ctx_budgeted && op->budget >= 0) { + put_op_budget_bytes(op->budget); + op->budget = -1; + } + + if (op->ontimeout && r != -ETIMEDOUT) + timer.cancel_event(op->ontimeout); + + if (op->session) { + _session_op_remove(op->session, op); + } + + logger->dec(l_osdc_op_active); + + ceph_assert(check_latest_map_ops.find(op->tid) == check_latest_map_ops.end()); + + inflight_ops--; + + op->put(); +} + +MOSDOp *Objecter::_prepare_osd_op(Op *op) +{ + // rwlock is locked + + int flags = op->target.flags; + flags |= CEPH_OSD_FLAG_KNOWN_REDIR; + + // Nothing checks this any longer, but needed for compatibility with 
+ // pre-luminous osds + flags |= CEPH_OSD_FLAG_ONDISK; + + if (!honor_osdmap_full) + flags |= CEPH_OSD_FLAG_FULL_FORCE; + + op->target.paused = false; + op->stamp = ceph::coarse_mono_clock::now(); + + hobject_t hobj = op->target.get_hobj(); + MOSDOp *m = new MOSDOp(client_inc, op->tid, + hobj, op->target.actual_pgid, + osdmap->get_epoch(), + flags, op->features); + + m->set_snapid(op->snapid); + m->set_snap_seq(op->snapc.seq); + m->set_snaps(op->snapc.snaps); + + m->ops = op->ops; + m->set_mtime(op->mtime); + m->set_retry_attempt(op->attempts++); + + if (!op->trace.valid() && cct->_conf->osdc_blkin_trace_all) { + op->trace.init("op", &trace_endpoint); + } + + if (op->priority) + m->set_priority(op->priority); + else + m->set_priority(cct->_conf->osd_client_op_priority); + + if (op->reqid != osd_reqid_t()) { + m->set_reqid(op->reqid); + } + + logger->inc(l_osdc_op_send); + ssize_t sum = 0; + for (unsigned i = 0; i < m->ops.size(); i++) { + sum += m->ops[i].indata.length(); + } + logger->inc(l_osdc_op_send_bytes, sum); + + return m; +} + +void Objecter::_send_op(Op *op) +{ + // rwlock is locked + // op->session->lock is locked + + // backoff? + auto p = op->session->backoffs.find(op->target.actual_pgid); + if (p != op->session->backoffs.end()) { + hobject_t hoid = op->target.get_hobj(); + auto q = p->second.lower_bound(hoid); + if (q != p->second.begin()) { + --q; + if (hoid >= q->second.end) { + ++q; + } + } + if (q != p->second.end()) { + ldout(cct, 20) << __func__ << " ? 
" << q->first << " [" << q->second.begin + << "," << q->second.end << ")" << dendl; + int r = cmp(hoid, q->second.begin); + if (r == 0 || (r > 0 && hoid < q->second.end)) { + ldout(cct, 10) << __func__ << " backoff " << op->target.actual_pgid + << " id " << q->second.id << " on " << hoid + << ", queuing " << op << " tid " << op->tid << dendl; + return; + } + } + } + + ceph_assert(op->tid > 0); + MOSDOp *m = _prepare_osd_op(op); + + if (op->target.actual_pgid != m->get_spg()) { + ldout(cct, 10) << __func__ << " " << op->tid << " pgid change from " + << m->get_spg() << " to " << op->target.actual_pgid + << ", updating and reencoding" << dendl; + m->set_spg(op->target.actual_pgid); + m->clear_payload(); // reencode + } + + ldout(cct, 15) << "_send_op " << op->tid << " to " + << op->target.actual_pgid << " on osd." << op->session->osd + << dendl; + + ConnectionRef con = op->session->con; + ceph_assert(con); + +#if 0 + // preallocated rx buffer? + if (op->con) { + ldout(cct, 20) << " revoking rx buffer for " << op->tid << " on " + << op->con << dendl; + op->con->revoke_rx_buffer(op->tid); + } + if (op->outbl && + op->ontimeout == 0 && // only post rx_buffer if no timeout; see #9582 + op->outbl->length()) { + op->outbl->invalidate_crc(); // messenger writes through c_str() + ldout(cct, 20) << " posting rx buffer for " << op->tid << " on " << con + << dendl; + op->con = con; + op->con->post_rx_buffer(op->tid, *op->outbl); + } +#endif + + op->incarnation = op->session->incarnation; + + if (op->trace.valid()) { + m->trace.init("op msg", nullptr, &op->trace); + } + op->session->con->send_message(m); +} + +int Objecter::calc_op_budget(const vector<OSDOp>& ops) +{ + int op_budget = 0; + for (vector<OSDOp>::const_iterator i = ops.begin(); + i != ops.end(); + ++i) { + if (i->op.op & CEPH_OSD_OP_MODE_WR) { + op_budget += i->indata.length(); + } else if (ceph_osd_op_mode_read(i->op.op)) { + if (ceph_osd_op_uses_extent(i->op.op)) { + if ((int64_t)i->op.extent.length > 0) + 
op_budget += (int64_t)i->op.extent.length; + } else if (ceph_osd_op_type_attr(i->op.op)) { + op_budget += i->op.xattr.name_len + i->op.xattr.value_len; + } + } + } + return op_budget; +} + +void Objecter::_throttle_op(Op *op, + shunique_lock& sul, + int op_budget) +{ + ceph_assert(sul && sul.mutex() == &rwlock); + bool locked_for_write = sul.owns_lock(); + + if (!op_budget) + op_budget = calc_op_budget(op->ops); + if (!op_throttle_bytes.get_or_fail(op_budget)) { //couldn't take right now + sul.unlock(); + op_throttle_bytes.get(op_budget); + if (locked_for_write) + sul.lock(); + else + sul.lock_shared(); + } + if (!op_throttle_ops.get_or_fail(1)) { //couldn't take right now + sul.unlock(); + op_throttle_ops.get(1); + if (locked_for_write) + sul.lock(); + else + sul.lock_shared(); + } +} + +int Objecter::take_linger_budget(LingerOp *info) +{ + return 1; +} + +/* This function DOES put the passed message before returning */ +void Objecter::handle_osd_op_reply(MOSDOpReply *m) +{ + ldout(cct, 10) << "in handle_osd_op_reply" << dendl; + + // get pio + ceph_tid_t tid = m->get_tid(); + + shunique_lock sul(rwlock, ceph::acquire_shared); + if (!initialized) { + m->put(); + return; + } + + ConnectionRef con = m->get_connection(); + auto priv = con->get_priv(); + auto s = static_cast<OSDSession*>(priv.get()); + if (!s || s->con != con) { + ldout(cct, 7) << __func__ << " no session on con " << con << dendl; + m->put(); + return; + } + + OSDSession::unique_lock sl(s->lock); + + map<ceph_tid_t, Op *>::iterator iter = s->ops.find(tid); + if (iter == s->ops.end()) { + ldout(cct, 7) << "handle_osd_op_reply " << tid + << (m->is_ondisk() ? " ondisk" : (m->is_onnvram() ? + " onnvram" : " ack")) + << " ... stray" << dendl; + sl.unlock(); + m->put(); + return; + } + + ldout(cct, 7) << "handle_osd_op_reply " << tid + << (m->is_ondisk() ? " ondisk" : + (m->is_onnvram() ? 
" onnvram" : " ack")) + << " uv " << m->get_user_version() + << " in " << m->get_pg() + << " attempt " << m->get_retry_attempt() + << dendl; + Op *op = iter->second; + op->trace.event("osd op reply"); + + if (retry_writes_after_first_reply && op->attempts == 1 && + (op->target.flags & CEPH_OSD_FLAG_WRITE)) { + ldout(cct, 7) << "retrying write after first reply: " << tid << dendl; + if (op->onfinish) { + num_in_flight--; + } + _session_op_remove(s, op); + sl.unlock(); + + _op_submit(op, sul, NULL); + m->put(); + return; + } + + if (m->get_retry_attempt() >= 0) { + if (m->get_retry_attempt() != (op->attempts - 1)) { + ldout(cct, 7) << " ignoring reply from attempt " + << m->get_retry_attempt() + << " from " << m->get_source_inst() + << "; last attempt " << (op->attempts - 1) << " sent to " + << op->session->con->get_peer_addr() << dendl; + m->put(); + sl.unlock(); + return; + } + } else { + // we don't know the request attempt because the server is old, so + // just accept this one. we may do ACK callbacks we shouldn't + // have, but that is better than doing callbacks out of order. 
+ } + + Context *onfinish = 0; + + int rc = m->get_result(); + + if (m->is_redirect_reply()) { + ldout(cct, 5) << " got redirect reply; redirecting" << dendl; + if (op->onfinish) + num_in_flight--; + _session_op_remove(s, op); + sl.unlock(); + + // FIXME: two redirects could race and reorder + + op->tid = 0; + m->get_redirect().combine_with_locator(op->target.target_oloc, + op->target.target_oid.name); + op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED | + CEPH_OSD_FLAG_IGNORE_CACHE | + CEPH_OSD_FLAG_IGNORE_OVERLAY); + _op_submit(op, sul, NULL); + m->put(); + return; + } + + if (rc == -EAGAIN) { + ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl; + if (op->onfinish) + num_in_flight--; + _session_op_remove(s, op); + sl.unlock(); + + op->tid = 0; + op->target.flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + op->target.pgid = pg_t(); + _op_submit(op, sul, NULL); + m->put(); + return; + } + + sul.unlock(); + + if (op->objver) + *op->objver = m->get_user_version(); + if (op->reply_epoch) + *op->reply_epoch = m->get_map_epoch(); + if (op->data_offset) + *op->data_offset = m->get_header().data_off; + + // got data? + if (op->outbl) { +#if 0 + if (op->con) + op->con->revoke_rx_buffer(op->tid); +#endif + auto& bl = m->get_data(); + if (op->outbl->length() == bl.length() && + bl.get_num_buffers() <= 1) { + // this is here to keep previous users to *relied* on getting data + // read into existing buffers happy. Notably, + // libradosstriper::RadosStriperImpl::aio_read(). 
+ ldout(cct,10) << __func__ << " copying resulting " << bl.length() + << " into existing buffer of length " << op->outbl->length() + << dendl; + bufferlist t; + t.claim(*op->outbl); + t.invalidate_crc(); // we're overwriting the raw buffers via c_str() + bl.copy(0, bl.length(), t.c_str()); + op->outbl->substr_of(t, 0, bl.length()); + } else { + m->claim_data(*op->outbl); + } + op->outbl = 0; + } + + // per-op result demuxing + vector<OSDOp> out_ops; + m->claim_ops(out_ops); + + if (out_ops.size() != op->ops.size()) + ldout(cct, 0) << "WARNING: tid " << op->tid << " reply ops " << out_ops + << " != request ops " << op->ops + << " from " << m->get_source_inst() << dendl; + + vector<bufferlist*>::iterator pb = op->out_bl.begin(); + vector<int*>::iterator pr = op->out_rval.begin(); + vector<Context*>::iterator ph = op->out_handler.begin(); + ceph_assert(op->out_bl.size() == op->out_rval.size()); + ceph_assert(op->out_bl.size() == op->out_handler.size()); + vector<OSDOp>::iterator p = out_ops.begin(); + for (unsigned i = 0; + p != out_ops.end() && pb != op->out_bl.end(); + ++i, ++p, ++pb, ++pr, ++ph) { + ldout(cct, 10) << " op " << i << " rval " << p->rval + << " len " << p->outdata.length() << dendl; + if (*pb) + **pb = p->outdata; + // set rval before running handlers so that handlers + // can change it if e.g. decoding fails + if (*pr) + **pr = ceph_to_hostos_errno(p->rval); + if (*ph) { + ldout(cct, 10) << " op " << i << " handler " << *ph << dendl; + (*ph)->complete(ceph_to_hostos_errno(p->rval)); + *ph = NULL; + } + } + + // NOTE: we assume that since we only request ONDISK ever we will + // only ever get back one (type of) ack ever. 
+ + if (op->onfinish) { + num_in_flight--; + onfinish = op->onfinish; + op->onfinish = NULL; + } + logger->inc(l_osdc_op_reply); + + /* get it before we call _finish_op() */ + auto completion_lock = s->get_lock(op->target.base_oid); + + ldout(cct, 15) << "handle_osd_op_reply completed tid " << tid << dendl; + _finish_op(op, 0); + + ldout(cct, 5) << num_in_flight << " in flight" << dendl; + + // serialize completions + if (completion_lock.mutex()) { + completion_lock.lock(); + } + sl.unlock(); + + // do callbacks + if (onfinish) { + onfinish->complete(rc); + } + if (completion_lock.mutex()) { + completion_lock.unlock(); + } + + m->put(); +} + +void Objecter::handle_osd_backoff(MOSDBackoff *m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + shunique_lock sul(rwlock, ceph::acquire_shared); + if (!initialized) { + m->put(); + return; + } + + ConnectionRef con = m->get_connection(); + auto priv = con->get_priv(); + auto s = static_cast<OSDSession*>(priv.get()); + if (!s || s->con != con) { + ldout(cct, 7) << __func__ << " no session on con " << con << dendl; + m->put(); + return; + } + + get_session(s); + + OSDSession::unique_lock sl(s->lock); + + switch (m->op) { + case CEPH_OSD_BACKOFF_OP_BLOCK: + { + // register + OSDBackoff& b = s->backoffs[m->pgid][m->begin]; + s->backoffs_by_id.insert(make_pair(m->id, &b)); + b.pgid = m->pgid; + b.id = m->id; + b.begin = m->begin; + b.end = m->end; + + // ack with original backoff's epoch so that the osd can discard this if + // there was a pg split. 
+ Message *r = new MOSDBackoff(m->pgid, + m->map_epoch, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK, + m->id, m->begin, m->end); + // this priority must match the MOSDOps from _prepare_osd_op + r->set_priority(cct->_conf->osd_client_op_priority); + con->send_message(r); + } + break; + + case CEPH_OSD_BACKOFF_OP_UNBLOCK: + { + auto p = s->backoffs_by_id.find(m->id); + if (p != s->backoffs_by_id.end()) { + OSDBackoff *b = p->second; + if (b->begin != m->begin && + b->end != m->end) { + lderr(cct) << __func__ << " got " << m->pgid << " id " << m->id + << " unblock on [" + << m->begin << "," << m->end << ") but backoff is [" + << b->begin << "," << b->end << ")" << dendl; + // hrmpf, unblock it anyway. + } + ldout(cct, 10) << __func__ << " unblock backoff " << b->pgid + << " id " << b->id + << " [" << b->begin << "," << b->end + << ")" << dendl; + auto spgp = s->backoffs.find(b->pgid); + ceph_assert(spgp != s->backoffs.end()); + spgp->second.erase(b->begin); + if (spgp->second.empty()) { + s->backoffs.erase(spgp); + } + s->backoffs_by_id.erase(p); + + // check for any ops to resend + for (auto& q : s->ops) { + if (q.second->target.actual_pgid == m->pgid) { + int r = q.second->target.contained_by(m->begin, m->end); + ldout(cct, 20) << __func__ << " contained_by " << r << " on " + << q.second->target.get_hobj() << dendl; + if (r) { + _send_op(q.second); + } + } + } + } else { + lderr(cct) << __func__ << " " << m->pgid << " id " << m->id + << " unblock on [" + << m->begin << "," << m->end << ") but backoff dne" << dendl; + } + } + break; + + default: + ldout(cct, 10) << __func__ << " unrecognized op " << (int)m->op << dendl; + } + + sul.unlock(); + sl.unlock(); + + m->put(); + put_session(s); +} + +uint32_t Objecter::list_nobjects_seek(NListContext *list_context, + uint32_t pos) +{ + shared_lock rl(rwlock); + list_context->pos = hobject_t(object_t(), string(), CEPH_NOSNAP, + pos, list_context->pool_id, string()); + ldout(cct, 10) << __func__ << " " << list_context + << " pos " << pos 
<< " -> " << list_context->pos << dendl; + pg_t actual = osdmap->raw_pg_to_pg(pg_t(pos, list_context->pool_id)); + list_context->current_pg = actual.ps(); + list_context->at_end_of_pool = false; + return pos; +} + +uint32_t Objecter::list_nobjects_seek(NListContext *list_context, + const hobject_t& cursor) +{ + shared_lock rl(rwlock); + ldout(cct, 10) << "list_nobjects_seek " << list_context << dendl; + list_context->pos = cursor; + list_context->at_end_of_pool = false; + pg_t actual = osdmap->raw_pg_to_pg(pg_t(cursor.get_hash(), list_context->pool_id)); + list_context->current_pg = actual.ps(); + list_context->sort_bitwise = true; + return list_context->current_pg; +} + +void Objecter::list_nobjects_get_cursor(NListContext *list_context, + hobject_t *cursor) +{ + shared_lock rl(rwlock); + if (list_context->list.empty()) { + *cursor = list_context->pos; + } else { + const librados::ListObjectImpl& entry = list_context->list.front(); + const string *key = (entry.locator.empty() ? &entry.oid : &entry.locator); + uint32_t h = osdmap->get_pg_pool(list_context->pool_id)->hash_key(*key, entry.nspace); + *cursor = hobject_t(entry.oid, entry.locator, list_context->pool_snap_seq, h, list_context->pool_id, entry.nspace); + } +} + +void Objecter::list_nobjects(NListContext *list_context, Context *onfinish) +{ + ldout(cct, 10) << __func__ << " pool_id " << list_context->pool_id + << " pool_snap_seq " << list_context->pool_snap_seq + << " max_entries " << list_context->max_entries + << " list_context " << list_context + << " onfinish " << onfinish + << " current_pg " << list_context->current_pg + << " pos " << list_context->pos << dendl; + + shared_lock rl(rwlock); + const pg_pool_t *pool = osdmap->get_pg_pool(list_context->pool_id); + if (!pool) { // pool is gone + rl.unlock(); + put_nlist_context_budget(list_context); + onfinish->complete(-ENOENT); + return; + } + int pg_num = pool->get_pg_num(); + bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE); + + if 
(list_context->pos.is_min()) { + list_context->starting_pg_num = 0; + list_context->sort_bitwise = sort_bitwise; + list_context->starting_pg_num = pg_num; + } + if (list_context->sort_bitwise != sort_bitwise) { + list_context->pos = hobject_t( + object_t(), string(), CEPH_NOSNAP, + list_context->current_pg, list_context->pool_id, string()); + list_context->sort_bitwise = sort_bitwise; + ldout(cct, 10) << " hobject sort order changed, restarting this pg at " + << list_context->pos << dendl; + } + if (list_context->starting_pg_num != pg_num) { + if (!sort_bitwise) { + // start reading from the beginning; the pgs have changed + ldout(cct, 10) << " pg_num changed; restarting with " << pg_num << dendl; + list_context->pos = collection_list_handle_t(); + } + list_context->starting_pg_num = pg_num; + } + + if (list_context->pos.is_max()) { + ldout(cct, 20) << __func__ << " end of pool, list " + << list_context->list << dendl; + if (list_context->list.empty()) { + list_context->at_end_of_pool = true; + } + // release the listing context's budget once all + // OPs (in the session) are finished + put_nlist_context_budget(list_context); + onfinish->complete(0); + return; + } + + ObjectOperation op; + op.pg_nls(list_context->max_entries, list_context->filter, + list_context->pos, osdmap->get_epoch()); + list_context->bl.clear(); + C_NList *onack = new C_NList(list_context, onfinish, this); + object_locator_t oloc(list_context->pool_id, list_context->nspace); + + // note current_pg in case we don't have (or lose) SORTBITWISE + list_context->current_pg = pool->raw_hash_to_pg(list_context->pos.get_hash()); + rl.unlock(); + + pg_read(list_context->current_pg, oloc, op, + &list_context->bl, 0, onack, &onack->epoch, + &list_context->ctx_budget); +} + +void Objecter::_nlist_reply(NListContext *list_context, int r, + Context *final_finish, epoch_t reply_epoch) +{ + ldout(cct, 10) << __func__ << " " << list_context << dendl; + + auto iter = list_context->bl.cbegin(); + 
pg_nls_response_t response; + bufferlist extra_info; + decode(response, iter); + if (!iter.end()) { + decode(extra_info, iter); + } + + // if the osd returns 1 (newer code), or handle MAX, it means we + // hit the end of the pg. + if ((response.handle.is_max() || r == 1) && + !list_context->sort_bitwise) { + // legacy OSD and !sortbitwise, figure out the next PG on our own + ++list_context->current_pg; + if (list_context->current_pg == list_context->starting_pg_num) { + // end of pool + list_context->pos = hobject_t::get_max(); + } else { + // next pg + list_context->pos = hobject_t(object_t(), string(), CEPH_NOSNAP, + list_context->current_pg, + list_context->pool_id, string()); + } + } else { + list_context->pos = response.handle; + } + + int response_size = response.entries.size(); + ldout(cct, 20) << " response.entries.size " << response_size + << ", response.entries " << response.entries + << ", handle " << response.handle + << ", tentative new pos " << list_context->pos << dendl; + list_context->extra_info.append(extra_info); + if (response_size) { + list_context->list.splice(list_context->list.end(), response.entries); + } + + if (list_context->list.size() >= list_context->max_entries) { + ldout(cct, 20) << " hit max, returning results so far, " + << list_context->list << dendl; + // release the listing context's budget once all + // OPs (in the session) are finished + put_nlist_context_budget(list_context); + final_finish->complete(0); + return; + } + + // continue! 
+ list_nobjects(list_context, final_finish); +} + +void Objecter::put_nlist_context_budget(NListContext *list_context) +{ + if (list_context->ctx_budget >= 0) { + ldout(cct, 10) << " release listing context's budget " << + list_context->ctx_budget << dendl; + put_op_budget_bytes(list_context->ctx_budget); + list_context->ctx_budget = -1; + } +} + +// snapshots + +int Objecter::create_pool_snap(int64_t pool, string& snap_name, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "create_pool_snap; pool: " << pool << "; snap: " + << snap_name << dendl; + + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -EINVAL; + if (p->snap_exists(snap_name.c_str())) + return -EEXIST; + + PoolOp *op = new PoolOp; + if (!op) + return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + op->name = snap_name; + op->onfinish = onfinish; + op->pool_op = POOL_OP_CREATE_SNAP; + pool_ops[op->tid] = op; + + pool_op_submit(op); + + return 0; +} + +struct C_SelfmanagedSnap : public Context { + bufferlist bl; + snapid_t *psnapid; + Context *fin; + C_SelfmanagedSnap(snapid_t *ps, Context *f) : psnapid(ps), fin(f) {} + void finish(int r) override { + if (r == 0) { + try { + auto p = bl.cbegin(); + decode(*psnapid, p); + } catch (buffer::error&) { + r = -EIO; + } + } + fin->complete(r); + } +}; + +int Objecter::allocate_selfmanaged_snap(int64_t pool, snapid_t *psnapid, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "allocate_selfmanaged_snap; pool: " << pool << dendl; + PoolOp *op = new PoolOp; + if (!op) return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + C_SelfmanagedSnap *fin = new C_SelfmanagedSnap(psnapid, onfinish); + op->onfinish = fin; + op->blp = &fin->bl; + op->pool_op = POOL_OP_CREATE_UNMANAGED_SNAP; + pool_ops[op->tid] = op; + + pool_op_submit(op); + return 0; +} + +int Objecter::delete_pool_snap(int64_t pool, string& snap_name, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_pool_snap; 
pool: " << pool << "; snap: " + << snap_name << dendl; + + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -EINVAL; + if (!p->snap_exists(snap_name.c_str())) + return -ENOENT; + + PoolOp *op = new PoolOp; + if (!op) + return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + op->name = snap_name; + op->onfinish = onfinish; + op->pool_op = POOL_OP_DELETE_SNAP; + pool_ops[op->tid] = op; + + pool_op_submit(op); + + return 0; +} + +int Objecter::delete_selfmanaged_snap(int64_t pool, snapid_t snap, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_selfmanaged_snap; pool: " << pool << "; snap: " + << snap << dendl; + PoolOp *op = new PoolOp; + if (!op) return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + op->onfinish = onfinish; + op->pool_op = POOL_OP_DELETE_UNMANAGED_SNAP; + op->snapid = snap; + pool_ops[op->tid] = op; + + pool_op_submit(op); + + return 0; +} + +int Objecter::create_pool(string& name, Context *onfinish, + int crush_rule) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "create_pool name=" << name << dendl; + + if (osdmap->lookup_pg_pool_name(name) >= 0) + return -EEXIST; + + PoolOp *op = new PoolOp; + if (!op) + return -ENOMEM; + op->tid = ++last_tid; + op->pool = 0; + op->name = name; + op->onfinish = onfinish; + op->pool_op = POOL_OP_CREATE; + pool_ops[op->tid] = op; + op->crush_rule = crush_rule; + + pool_op_submit(op); + + return 0; +} + +int Objecter::delete_pool(int64_t pool, Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_pool " << pool << dendl; + + if (!osdmap->have_pg_pool(pool)) + return -ENOENT; + + _do_delete_pool(pool, onfinish); + return 0; +} + +int Objecter::delete_pool(const string &pool_name, Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_pool " << pool_name << dendl; + + int64_t pool = osdmap->lookup_pg_pool_name(pool_name); + if (pool < 0) + return pool; + + _do_delete_pool(pool, onfinish); + return 0; +} + +void 
Objecter::_do_delete_pool(int64_t pool, Context *onfinish) +{ + PoolOp *op = new PoolOp; + op->tid = ++last_tid; + op->pool = pool; + op->name = "delete"; + op->onfinish = onfinish; + op->pool_op = POOL_OP_DELETE; + pool_ops[op->tid] = op; + pool_op_submit(op); +} + +void Objecter::pool_op_submit(PoolOp *op) +{ + // rwlock is locked + if (mon_timeout > timespan(0)) { + op->ontimeout = timer.add_event(mon_timeout, + [this, op]() { + pool_op_cancel(op->tid, -ETIMEDOUT); }); + } + _pool_op_submit(op); +} + +void Objecter::_pool_op_submit(PoolOp *op) +{ + // rwlock is locked unique + + ldout(cct, 10) << "pool_op_submit " << op->tid << dendl; + MPoolOp *m = new MPoolOp(monc->get_fsid(), op->tid, op->pool, + op->name, op->pool_op, + last_seen_osdmap_version); + if (op->snapid) m->snapid = op->snapid; + if (op->crush_rule) m->crush_rule = op->crush_rule; + monc->send_mon_message(m); + op->last_submit = ceph::coarse_mono_clock::now(); + + logger->inc(l_osdc_poolop_send); +} + +/** + * Handle a reply to a PoolOp message. Check that we sent the message + * and give the caller responsibility for the returned bufferlist. + * Then either call the finisher or stash the PoolOp, depending on if we + * have a new enough map. + * Lastly, clean up the message and PoolOp. 
+ */ +void Objecter::handle_pool_op_reply(MPoolOpReply *m) +{ + FUNCTRACE(cct); + shunique_lock sul(rwlock, acquire_shared); + if (!initialized) { + sul.unlock(); + m->put(); + return; + } + + ldout(cct, 10) << "handle_pool_op_reply " << *m << dendl; + ceph_tid_t tid = m->get_tid(); + map<ceph_tid_t, PoolOp *>::iterator iter = pool_ops.find(tid); + if (iter != pool_ops.end()) { + PoolOp *op = iter->second; + ldout(cct, 10) << "have request " << tid << " at " << op << " Op: " + << ceph_pool_op_name(op->pool_op) << dendl; + if (op->blp) + op->blp->claim(m->response_data); + if (m->version > last_seen_osdmap_version) + last_seen_osdmap_version = m->version; + if (osdmap->get_epoch() < m->epoch) { + sul.unlock(); + sul.lock(); + // recheck op existence since we have let go of rwlock + // (for promotion) above. + iter = pool_ops.find(tid); + if (iter == pool_ops.end()) + goto done; // op is gone. + if (osdmap->get_epoch() < m->epoch) { + ldout(cct, 20) << "waiting for client to reach epoch " << m->epoch + << " before calling back" << dendl; + _wait_for_new_map(op->onfinish, m->epoch, m->replyCode); + } else { + // map epoch changed, probably because a MOSDMap message + // sneaked in. Do caller-specified callback now or else + // we lose it forever. + ceph_assert(op->onfinish); + op->onfinish->complete(m->replyCode); + } + } else { + ceph_assert(op->onfinish); + op->onfinish->complete(m->replyCode); + } + op->onfinish = NULL; + if (!sul.owns_lock()) { + sul.unlock(); + sul.lock(); + } + iter = pool_ops.find(tid); + if (iter != pool_ops.end()) { + _finish_pool_op(op, 0); + } + } else { + ldout(cct, 10) << "unknown request " << tid << dendl; + } + +done: + // Not strictly necessary, since we'll release it on return. 
+ sul.unlock(); + + ldout(cct, 10) << "done" << dendl; + m->put(); +} + +int Objecter::pool_op_cancel(ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + map<ceph_tid_t, PoolOp*>::iterator it = pool_ops.find(tid); + if (it == pool_ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + PoolOp *op = it->second; + if (op->onfinish) + op->onfinish->complete(r); + + _finish_pool_op(op, r); + return 0; +} + +void Objecter::_finish_pool_op(PoolOp *op, int r) +{ + // rwlock is locked unique + pool_ops.erase(op->tid); + logger->set(l_osdc_poolop_active, pool_ops.size()); + + if (op->ontimeout && r != -ETIMEDOUT) { + timer.cancel_event(op->ontimeout); + } + + delete op; +} + +// pool stats + +void Objecter::get_pool_stats(list<string>& pools, + map<string,pool_stat_t> *result, + bool *per_pool, + Context *onfinish) +{ + ldout(cct, 10) << "get_pool_stats " << pools << dendl; + + PoolStatOp *op = new PoolStatOp; + op->tid = ++last_tid; + op->pools = pools; + op->pool_stats = result; + op->per_pool = per_pool; + op->onfinish = onfinish; + if (mon_timeout > timespan(0)) { + op->ontimeout = timer.add_event(mon_timeout, + [this, op]() { + pool_stat_op_cancel(op->tid, + -ETIMEDOUT); }); + } else { + op->ontimeout = 0; + } + + unique_lock wl(rwlock); + + poolstat_ops[op->tid] = op; + + logger->set(l_osdc_poolstat_active, poolstat_ops.size()); + + _poolstat_submit(op); +} + +void Objecter::_poolstat_submit(PoolStatOp *op) +{ + ldout(cct, 10) << "_poolstat_submit " << op->tid << dendl; + monc->send_mon_message(new MGetPoolStats(monc->get_fsid(), op->tid, + op->pools, + last_seen_pgmap_version)); + op->last_submit = ceph::coarse_mono_clock::now(); + + logger->inc(l_osdc_poolstat_send); +} + +void Objecter::handle_get_pool_stats_reply(MGetPoolStatsReply *m) +{ + ldout(cct, 10) << "handle_get_pool_stats_reply " << *m << dendl; + ceph_tid_t tid = 
m->get_tid(); + + unique_lock wl(rwlock); + if (!initialized) { + m->put(); + return; + } + + map<ceph_tid_t, PoolStatOp *>::iterator iter = poolstat_ops.find(tid); + if (iter != poolstat_ops.end()) { + PoolStatOp *op = poolstat_ops[tid]; + ldout(cct, 10) << "have request " << tid << " at " << op << dendl; + *op->pool_stats = m->pool_stats; + *op->per_pool = m->per_pool; + if (m->version > last_seen_pgmap_version) { + last_seen_pgmap_version = m->version; + } + op->onfinish->complete(0); + _finish_pool_stat_op(op, 0); + } else { + ldout(cct, 10) << "unknown request " << tid << dendl; + } + ldout(cct, 10) << "done" << dendl; + m->put(); +} + +int Objecter::pool_stat_op_cancel(ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + map<ceph_tid_t, PoolStatOp*>::iterator it = poolstat_ops.find(tid); + if (it == poolstat_ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + PoolStatOp *op = it->second; + if (op->onfinish) + op->onfinish->complete(r); + _finish_pool_stat_op(op, r); + return 0; +} + +void Objecter::_finish_pool_stat_op(PoolStatOp *op, int r) +{ + // rwlock is locked unique + + poolstat_ops.erase(op->tid); + logger->set(l_osdc_poolstat_active, poolstat_ops.size()); + + if (op->ontimeout && r != -ETIMEDOUT) + timer.cancel_event(op->ontimeout); + + delete op; +} + +void Objecter::get_fs_stats(ceph_statfs& result, + boost::optional<int64_t> data_pool, + Context *onfinish) +{ + ldout(cct, 10) << "get_fs_stats" << dendl; + unique_lock l(rwlock); + + StatfsOp *op = new StatfsOp; + op->tid = ++last_tid; + op->stats = &result; + op->data_pool = data_pool; + op->onfinish = onfinish; + if (mon_timeout > timespan(0)) { + op->ontimeout = timer.add_event(mon_timeout, + [this, op]() { + statfs_op_cancel(op->tid, + -ETIMEDOUT); }); + } else { + op->ontimeout = 0; + } + statfs_ops[op->tid] = op; + + logger->set(l_osdc_statfs_active, 
statfs_ops.size()); + + _fs_stats_submit(op); +} + +void Objecter::_fs_stats_submit(StatfsOp *op) +{ + // rwlock is locked unique + + ldout(cct, 10) << "fs_stats_submit" << op->tid << dendl; + monc->send_mon_message(new MStatfs(monc->get_fsid(), op->tid, + op->data_pool, + last_seen_pgmap_version)); + op->last_submit = ceph::coarse_mono_clock::now(); + + logger->inc(l_osdc_statfs_send); +} + +void Objecter::handle_fs_stats_reply(MStatfsReply *m) +{ + unique_lock wl(rwlock); + if (!initialized) { + m->put(); + return; + } + + ldout(cct, 10) << "handle_fs_stats_reply " << *m << dendl; + ceph_tid_t tid = m->get_tid(); + + if (statfs_ops.count(tid)) { + StatfsOp *op = statfs_ops[tid]; + ldout(cct, 10) << "have request " << tid << " at " << op << dendl; + *(op->stats) = m->h.st; + if (m->h.version > last_seen_pgmap_version) + last_seen_pgmap_version = m->h.version; + op->onfinish->complete(0); + _finish_statfs_op(op, 0); + } else { + ldout(cct, 10) << "unknown request " << tid << dendl; + } + m->put(); + ldout(cct, 10) << "done" << dendl; +} + +int Objecter::statfs_op_cancel(ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + map<ceph_tid_t, StatfsOp*>::iterator it = statfs_ops.find(tid); + if (it == statfs_ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + StatfsOp *op = it->second; + if (op->onfinish) + op->onfinish->complete(r); + _finish_statfs_op(op, r); + return 0; +} + +void Objecter::_finish_statfs_op(StatfsOp *op, int r) +{ + // rwlock is locked unique + + statfs_ops.erase(op->tid); + logger->set(l_osdc_statfs_active, statfs_ops.size()); + + if (op->ontimeout && r != -ETIMEDOUT) + timer.cancel_event(op->ontimeout); + + delete op; +} + +// scatter/gather + +void Objecter::_sg_read_finish(vector<ObjectExtent>& extents, + vector<bufferlist>& resultbl, + bufferlist *bl, Context *onfinish) +{ + // all done + ldout(cct, 
15) << "_sg_read_finish" << dendl; + + if (extents.size() > 1) { + Striper::StripedReadResult r; + vector<bufferlist>::iterator bit = resultbl.begin(); + for (vector<ObjectExtent>::iterator eit = extents.begin(); + eit != extents.end(); + ++eit, ++bit) { + r.add_partial_result(cct, *bit, eit->buffer_extents); + } + bl->clear(); + r.assemble_result(cct, *bl, false); + } else { + ldout(cct, 15) << " only one frag" << dendl; + bl->claim(resultbl[0]); + } + + // done + uint64_t bytes_read = bl->length(); + ldout(cct, 7) << "_sg_read_finish " << bytes_read << " bytes" << dendl; + + if (onfinish) { + onfinish->complete(bytes_read);// > 0 ? bytes_read:m->get_result()); + } +} + + +void Objecter::ms_handle_connect(Connection *con) +{ + ldout(cct, 10) << "ms_handle_connect " << con << dendl; + if (!initialized) + return; + + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) + resend_mon_ops(); +} + +bool Objecter::ms_handle_reset(Connection *con) +{ + if (!initialized) + return false; + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + unique_lock wl(rwlock); + + auto priv = con->get_priv(); + auto session = static_cast<OSDSession*>(priv.get()); + if (session) { + ldout(cct, 1) << "ms_handle_reset " << con << " session " << session + << " osd." << session->osd << dendl; + // the session maybe had been closed if new osdmap just handled + // says the osd down + if (!(initialized && osdmap->is_up(session->osd))) { + ldout(cct, 1) << "ms_handle_reset aborted,initialized=" << initialized << dendl; + wl.unlock(); + return false; + } + map<uint64_t, LingerOp *> lresend; + OSDSession::unique_lock sl(session->lock); + _reopen_session(session); + _kick_requests(session, lresend); + sl.unlock(); + _linger_ops_resend(lresend, wl); + wl.unlock(); + maybe_request_map(); + } + return true; + } + return false; +} + +void Objecter::ms_handle_remote_reset(Connection *con) +{ + /* + * treat these the same. 
+ */ + ms_handle_reset(con); +} + +bool Objecter::ms_handle_refused(Connection *con) +{ + // just log for now + if (osdmap && (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD)) { + int osd = osdmap->identify_osd(con->get_peer_addr()); + if (osd >= 0) { + ldout(cct, 1) << "ms_handle_refused on osd." << osd << dendl; + } + } + return false; +} + +bool Objecter::ms_get_authorizer(int dest_type, + AuthAuthorizer **authorizer) +{ + if (!initialized) + return false; + if (dest_type == CEPH_ENTITY_TYPE_MON) + return true; + *authorizer = monc->build_authorizer(dest_type); + return *authorizer != NULL; +} + +void Objecter::op_target_t::dump(Formatter *f) const +{ + f->dump_stream("pg") << pgid; + f->dump_int("osd", osd); + f->dump_stream("object_id") << base_oid; + f->dump_stream("object_locator") << base_oloc; + f->dump_stream("target_object_id") << target_oid; + f->dump_stream("target_object_locator") << target_oloc; + f->dump_int("paused", (int)paused); + f->dump_int("used_replica", (int)used_replica); + f->dump_int("precalc_pgid", (int)precalc_pgid); +} + +void Objecter::_dump_active(OSDSession *s) +{ + for (map<ceph_tid_t,Op*>::iterator p = s->ops.begin(); + p != s->ops.end(); + ++p) { + Op *op = p->second; + ldout(cct, 20) << op->tid << "\t" << op->target.pgid + << "\tosd." << (op->session ? op->session->osd : -1) + << "\t" << op->target.base_oid + << "\t" << op->ops << dendl; + } +} + +void Objecter::_dump_active() +{ + ldout(cct, 20) << "dump_active .. 
" << num_homeless_ops << " homeless" + << dendl; + for (map<int, OSDSession *>::iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + _dump_active(s); + sl.unlock(); + } + _dump_active(homeless_session); +} + +void Objecter::dump_active() +{ + shared_lock rl(rwlock); + _dump_active(); + rl.unlock(); +} + +void Objecter::dump_requests(Formatter *fmt) +{ + // Read-lock on Objecter held here + fmt->open_object_section("requests"); + dump_ops(fmt); + dump_linger_ops(fmt); + dump_pool_ops(fmt); + dump_pool_stat_ops(fmt); + dump_statfs_ops(fmt); + dump_command_ops(fmt); + fmt->close_section(); // requests object +} + +void Objecter::_dump_ops(const OSDSession *s, Formatter *fmt) +{ + for (map<ceph_tid_t,Op*>::const_iterator p = s->ops.begin(); + p != s->ops.end(); + ++p) { + Op *op = p->second; + auto age = std::chrono::duration<double>(coarse_mono_clock::now() - op->stamp); + fmt->open_object_section("op"); + fmt->dump_unsigned("tid", op->tid); + op->target.dump(fmt); + fmt->dump_stream("last_sent") << op->stamp; + fmt->dump_float("age", age.count()); + fmt->dump_int("attempts", op->attempts); + fmt->dump_stream("snapid") << op->snapid; + fmt->dump_stream("snap_context") << op->snapc; + fmt->dump_stream("mtime") << op->mtime; + + fmt->open_array_section("osd_ops"); + for (vector<OSDOp>::const_iterator it = op->ops.begin(); + it != op->ops.end(); + ++it) { + fmt->dump_stream("osd_op") << *it; + } + fmt->close_section(); // osd_ops array + + fmt->close_section(); // op object + } +} + +void Objecter::dump_ops(Formatter *fmt) +{ + // Read-lock on Objecter held + fmt->open_array_section("ops"); + for (map<int, OSDSession *>::const_iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + _dump_ops(s, fmt); + sl.unlock(); + } + _dump_ops(homeless_session, fmt); + 
fmt->close_section(); // ops array +} + +void Objecter::_dump_linger_ops(const OSDSession *s, Formatter *fmt) +{ + for (map<uint64_t, LingerOp*>::const_iterator p = s->linger_ops.begin(); + p != s->linger_ops.end(); + ++p) { + LingerOp *op = p->second; + fmt->open_object_section("linger_op"); + fmt->dump_unsigned("linger_id", op->linger_id); + op->target.dump(fmt); + fmt->dump_stream("snapid") << op->snap; + fmt->dump_stream("registered") << op->registered; + fmt->close_section(); // linger_op object + } +} + +void Objecter::dump_linger_ops(Formatter *fmt) +{ + // We have a read-lock on the objecter + fmt->open_array_section("linger_ops"); + for (map<int, OSDSession *>::const_iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + _dump_linger_ops(s, fmt); + sl.unlock(); + } + _dump_linger_ops(homeless_session, fmt); + fmt->close_section(); // linger_ops array +} + +void Objecter::_dump_command_ops(const OSDSession *s, Formatter *fmt) +{ + for (map<uint64_t, CommandOp*>::const_iterator p = s->command_ops.begin(); + p != s->command_ops.end(); + ++p) { + CommandOp *op = p->second; + fmt->open_object_section("command_op"); + fmt->dump_unsigned("command_id", op->tid); + fmt->dump_int("osd", op->session ? 
op->session->osd : -1); + fmt->open_array_section("command"); + for (vector<string>::const_iterator q = op->cmd.begin(); + q != op->cmd.end(); ++q) + fmt->dump_string("word", *q); + fmt->close_section(); + if (op->target_osd >= 0) + fmt->dump_int("target_osd", op->target_osd); + else + fmt->dump_stream("target_pg") << op->target_pg; + fmt->close_section(); // command_op object + } +} + +void Objecter::dump_command_ops(Formatter *fmt) +{ + // We have a read-lock on the Objecter here + fmt->open_array_section("command_ops"); + for (map<int, OSDSession *>::const_iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + _dump_command_ops(s, fmt); + sl.unlock(); + } + _dump_command_ops(homeless_session, fmt); + fmt->close_section(); // command_ops array +} + +void Objecter::dump_pool_ops(Formatter *fmt) const +{ + fmt->open_array_section("pool_ops"); + for (map<ceph_tid_t, PoolOp*>::const_iterator p = pool_ops.begin(); + p != pool_ops.end(); + ++p) { + PoolOp *op = p->second; + fmt->open_object_section("pool_op"); + fmt->dump_unsigned("tid", op->tid); + fmt->dump_int("pool", op->pool); + fmt->dump_string("name", op->name); + fmt->dump_int("operation_type", op->pool_op); + fmt->dump_unsigned("crush_rule", op->crush_rule); + fmt->dump_stream("snapid") << op->snapid; + fmt->dump_stream("last_sent") << op->last_submit; + fmt->close_section(); // pool_op object + } + fmt->close_section(); // pool_ops array +} + +void Objecter::dump_pool_stat_ops(Formatter *fmt) const +{ + fmt->open_array_section("pool_stat_ops"); + for (map<ceph_tid_t, PoolStatOp*>::const_iterator p = poolstat_ops.begin(); + p != poolstat_ops.end(); + ++p) { + PoolStatOp *op = p->second; + fmt->open_object_section("pool_stat_op"); + fmt->dump_unsigned("tid", op->tid); + fmt->dump_stream("last_sent") << op->last_submit; + + fmt->open_array_section("pools"); + for (list<string>::const_iterator it = 
op->pools.begin(); + it != op->pools.end(); + ++it) { + fmt->dump_string("pool", *it); + } + fmt->close_section(); // pools array + + fmt->close_section(); // pool_stat_op object + } + fmt->close_section(); // pool_stat_ops array +} + +void Objecter::dump_statfs_ops(Formatter *fmt) const +{ + fmt->open_array_section("statfs_ops"); + for (map<ceph_tid_t, StatfsOp*>::const_iterator p = statfs_ops.begin(); + p != statfs_ops.end(); + ++p) { + StatfsOp *op = p->second; + fmt->open_object_section("statfs_op"); + fmt->dump_unsigned("tid", op->tid); + fmt->dump_stream("last_sent") << op->last_submit; + fmt->close_section(); // statfs_op object + } + fmt->close_section(); // statfs_ops array +} + +Objecter::RequestStateHook::RequestStateHook(Objecter *objecter) : + m_objecter(objecter) +{ +} + +bool Objecter::RequestStateHook::call(std::string_view command, + const cmdmap_t& cmdmap, + std::string_view format, + bufferlist& out) +{ + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); + shared_lock rl(m_objecter->rwlock); + m_objecter->dump_requests(f); + f->flush(out); + delete f; + return true; +} + +void Objecter::blacklist_self(bool set) +{ + ldout(cct, 10) << "blacklist_self " << (set ? 
"add" : "rm") << dendl; + + vector<string> cmd; + cmd.push_back("{\"prefix\":\"osd blacklist\", "); + if (set) + cmd.push_back("\"blacklistop\":\"add\","); + else + cmd.push_back("\"blacklistop\":\"rm\","); + stringstream ss; + // this is somewhat imprecise in that we are blacklisting our first addr only + ss << messenger->get_myaddrs().front().get_legacy_str(); + cmd.push_back("\"addr\":\"" + ss.str() + "\""); + + MMonCommand *m = new MMonCommand(monc->get_fsid()); + m->cmd = cmd; + + monc->send_mon_message(m); +} + +// commands + +void Objecter::handle_command_reply(MCommandReply *m) +{ + unique_lock wl(rwlock); + if (!initialized) { + m->put(); + return; + } + + ConnectionRef con = m->get_connection(); + auto priv = con->get_priv(); + auto s = static_cast<OSDSession*>(priv.get()); + if (!s || s->con != con) { + ldout(cct, 7) << __func__ << " no session on con " << con << dendl; + m->put(); + return; + } + + OSDSession::shared_lock sl(s->lock); + map<ceph_tid_t,CommandOp*>::iterator p = s->command_ops.find(m->get_tid()); + if (p == s->command_ops.end()) { + ldout(cct, 10) << "handle_command_reply tid " << m->get_tid() + << " not found" << dendl; + m->put(); + sl.unlock(); + return; + } + + CommandOp *c = p->second; + if (!c->session || + m->get_connection() != c->session->con) { + ldout(cct, 10) << "handle_command_reply tid " << m->get_tid() + << " got reply from wrong connection " + << m->get_connection() << " " << m->get_source_inst() + << dendl; + m->put(); + sl.unlock(); + return; + } + if (c->poutbl) { + c->poutbl->claim(m->get_data()); + } + + sl.unlock(); + + OSDSession::unique_lock sul(s->lock); + _finish_command(c, m->r, m->rs); + sul.unlock(); + + m->put(); +} + +void Objecter::submit_command(CommandOp *c, ceph_tid_t *ptid) +{ + shunique_lock sul(rwlock, ceph::acquire_unique); + + ceph_tid_t tid = ++last_tid; + ldout(cct, 10) << "_submit_command " << tid << " " << c->cmd << dendl; + c->tid = tid; + + { + OSDSession::unique_lock 
hs_wl(homeless_session->lock); + _session_command_op_assign(homeless_session, c); + } + + _calc_command_target(c, sul); + _assign_command_session(c, sul); + if (osd_timeout > timespan(0)) { + c->ontimeout = timer.add_event(osd_timeout, + [this, c, tid]() { + command_op_cancel(c->session, tid, + -ETIMEDOUT); }); + } + + if (!c->session->is_homeless()) { + _send_command(c); + } else { + _maybe_request_map(); + } + if (c->map_check_error) + _send_command_map_check(c); + *ptid = tid; + + logger->inc(l_osdc_command_active); +} + +int Objecter::_calc_command_target(CommandOp *c, shunique_lock& sul) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + + c->map_check_error = 0; + + // ignore overlays, just like we do with pg ops + c->target.flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + + if (c->target_osd >= 0) { + if (!osdmap->exists(c->target_osd)) { + c->map_check_error = -ENOENT; + c->map_check_error_str = "osd dne"; + c->target.osd = -1; + return RECALC_OP_TARGET_OSD_DNE; + } + if (osdmap->is_down(c->target_osd)) { + c->map_check_error = -ENXIO; + c->map_check_error_str = "osd down"; + c->target.osd = -1; + return RECALC_OP_TARGET_OSD_DOWN; + } + c->target.osd = c->target_osd; + } else { + int ret = _calc_target(&(c->target), nullptr, true); + if (ret == RECALC_OP_TARGET_POOL_DNE) { + c->map_check_error = -ENOENT; + c->map_check_error_str = "pool dne"; + c->target.osd = -1; + return ret; + } else if (ret == RECALC_OP_TARGET_OSD_DOWN) { + c->map_check_error = -ENXIO; + c->map_check_error_str = "osd down"; + c->target.osd = -1; + return ret; + } + } + + OSDSession *s; + int r = _get_session(c->target.osd, &s, sul); + ceph_assert(r != -EAGAIN); /* shouldn't happen as we're holding the write lock */ + + if (c->session != s) { + put_session(s); + return RECALC_OP_TARGET_NEED_RESEND; + } + + put_session(s); + + ldout(cct, 20) << "_recalc_command_target " << c->tid << " no change, " + << c->session << dendl; + + return RECALC_OP_TARGET_NO_ACTION; +} + +void 
Objecter::_assign_command_session(CommandOp *c, + shunique_lock& sul) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + + OSDSession *s; + int r = _get_session(c->target.osd, &s, sul); + ceph_assert(r != -EAGAIN); /* shouldn't happen as we're holding the write lock */ + + if (c->session != s) { + if (c->session) { + OSDSession *cs = c->session; + OSDSession::unique_lock csl(cs->lock); + _session_command_op_remove(c->session, c); + csl.unlock(); + } + OSDSession::unique_lock sl(s->lock); + _session_command_op_assign(s, c); + } + + put_session(s); +} + +void Objecter::_send_command(CommandOp *c) +{ + ldout(cct, 10) << "_send_command " << c->tid << dendl; + ceph_assert(c->session); + ceph_assert(c->session->con); + MCommand *m = new MCommand(monc->monmap.fsid); + m->cmd = c->cmd; + m->set_data(c->inbl); + m->set_tid(c->tid); + c->session->con->send_message(m); + logger->inc(l_osdc_command_send); +} + +int Objecter::command_op_cancel(OSDSession *s, ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + map<ceph_tid_t, CommandOp*>::iterator it = s->command_ops.find(tid); + if (it == s->command_ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + CommandOp *op = it->second; + _command_cancel_map_check(op); + OSDSession::unique_lock sl(op->session->lock); + _finish_command(op, r, ""); + sl.unlock(); + return 0; +} + +void Objecter::_finish_command(CommandOp *c, int r, string rs) +{ + // rwlock is locked unique + // session lock is locked + + ldout(cct, 10) << "_finish_command " << c->tid << " = " << r << " " + << rs << dendl; + if (c->prs) + *c->prs = rs; + if (c->onfinish) + c->onfinish->complete(r); + + if (c->ontimeout && r != -ETIMEDOUT) + timer.cancel_event(c->ontimeout); + + _session_command_op_remove(c->session, c); + + c->put(); + + logger->dec(l_osdc_command_active); +} + +Objecter::OSDSession::~OSDSession() 
+{ + // Caller is responsible for re-assigning or + // destroying any ops that were assigned to us + ceph_assert(ops.empty()); + ceph_assert(linger_ops.empty()); + ceph_assert(command_ops.empty()); +} + +Objecter::~Objecter() +{ + delete osdmap; + + ceph_assert(homeless_session->get_nref() == 1); + ceph_assert(num_homeless_ops == 0); + homeless_session->put(); + + ceph_assert(osd_sessions.empty()); + ceph_assert(poolstat_ops.empty()); + ceph_assert(statfs_ops.empty()); + ceph_assert(pool_ops.empty()); + ceph_assert(waiting_for_map.empty()); + ceph_assert(linger_ops.empty()); + ceph_assert(check_latest_map_lingers.empty()); + ceph_assert(check_latest_map_ops.empty()); + ceph_assert(check_latest_map_commands.empty()); + + ceph_assert(!m_request_state_hook); + ceph_assert(!logger); +} + +/** + * Wait until this OSD map epoch is received before + * sending any more operations to OSDs. Use this + * when it is known that the client can't trust + * anything from before this epoch (e.g. due to + * client blacklist at this epoch). 
+ */ +void Objecter::set_epoch_barrier(epoch_t epoch) +{ + unique_lock wl(rwlock); + + ldout(cct, 7) << __func__ << ": barrier " << epoch << " (was " + << epoch_barrier << ") current epoch " << osdmap->get_epoch() + << dendl; + if (epoch > epoch_barrier) { + epoch_barrier = epoch; + _maybe_request_map(); + } +} + + + +hobject_t Objecter::enumerate_objects_begin() +{ + return hobject_t(); +} + +hobject_t Objecter::enumerate_objects_end() +{ + return hobject_t::get_max(); +} + +struct C_EnumerateReply : public Context { + bufferlist bl; + + Objecter *objecter; + hobject_t *next; + std::list<librados::ListObjectImpl> *result; + const hobject_t end; + const int64_t pool_id; + Context *on_finish; + + epoch_t epoch; + int budget; + + C_EnumerateReply(Objecter *objecter_, hobject_t *next_, + std::list<librados::ListObjectImpl> *result_, + const hobject_t end_, const int64_t pool_id_, Context *on_finish_) : + objecter(objecter_), next(next_), result(result_), + end(end_), pool_id(pool_id_), on_finish(on_finish_), + epoch(0), budget(-1) + {} + + void finish(int r) override { + objecter->_enumerate_reply( + bl, r, end, pool_id, budget, epoch, result, next, on_finish); + } +}; + +void Objecter::enumerate_objects( + int64_t pool_id, + const std::string &ns, + const hobject_t &start, + const hobject_t &end, + const uint32_t max, + const bufferlist &filter_bl, + std::list<librados::ListObjectImpl> *result, + hobject_t *next, + Context *on_finish) +{ + ceph_assert(result); + + if (!end.is_max() && start > end) { + lderr(cct) << __func__ << ": start " << start << " > end " << end << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (max < 1) { + lderr(cct) << __func__ << ": result size may not be zero" << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (start.is_max()) { + on_finish->complete(0); + return; + } + + shared_lock rl(rwlock); + ceph_assert(osdmap->get_epoch()); + if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { + rl.unlock(); + lderr(cct) << 
__func__ << ": SORTBITWISE cluster flag not set" << dendl; + on_finish->complete(-EOPNOTSUPP); + return; + } + const pg_pool_t *p = osdmap->get_pg_pool(pool_id); + if (!p) { + lderr(cct) << __func__ << ": pool " << pool_id << " DNE in osd epoch " + << osdmap->get_epoch() << dendl; + rl.unlock(); + on_finish->complete(-ENOENT); + return; + } else { + rl.unlock(); + } + + ldout(cct, 20) << __func__ << ": start=" << start << " end=" << end << dendl; + + // Stash completion state + C_EnumerateReply *on_ack = new C_EnumerateReply( + this, next, result, end, pool_id, on_finish); + + ObjectOperation op; + op.pg_nls(max, filter_bl, start, 0); + + // Issue. See you later in _enumerate_reply + object_locator_t oloc(pool_id, ns); + pg_read(start.get_hash(), oloc, op, + &on_ack->bl, 0, on_ack, &on_ack->epoch, &on_ack->budget); +} + +void Objecter::_enumerate_reply( + bufferlist &bl, + int r, + const hobject_t &end, + const int64_t pool_id, + int budget, + epoch_t reply_epoch, + std::list<librados::ListObjectImpl> *result, + hobject_t *next, + Context *on_finish) +{ + if (budget >= 0) { + put_op_budget_bytes(budget); + } + + if (r < 0) { + ldout(cct, 4) << __func__ << ": remote error " << r << dendl; + on_finish->complete(r); + return; + } + + ceph_assert(next != NULL); + + // Decode the results + auto iter = bl.cbegin(); + pg_nls_response_t response; + + // XXX extra_info doesn't seem used anywhere? 
+ bufferlist extra_info; + decode(response, iter); + if (!iter.end()) { + decode(extra_info, iter); + } + + ldout(cct, 10) << __func__ << ": got " << response.entries.size() + << " handle " << response.handle + << " reply_epoch " << reply_epoch << dendl; + ldout(cct, 20) << __func__ << ": response.entries.size " + << response.entries.size() << ", response.entries " + << response.entries << dendl; + if (response.handle <= end) { + *next = response.handle; + } else { + ldout(cct, 10) << __func__ << ": adjusted next down to end " << end + << dendl; + *next = end; + + // drop anything after 'end' + shared_lock rl(rwlock); + const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); + if (!pool) { + // pool is gone, drop any results which are now meaningless. + rl.unlock(); + on_finish->complete(-ENOENT); + return; + } + while (!response.entries.empty()) { + uint32_t hash = response.entries.back().locator.empty() ? + pool->hash_key(response.entries.back().oid, + response.entries.back().nspace) : + pool->hash_key(response.entries.back().locator, + response.entries.back().nspace); + hobject_t last(response.entries.back().oid, + response.entries.back().locator, + CEPH_NOSNAP, + hash, + pool_id, + response.entries.back().nspace); + if (last < end) + break; + ldout(cct, 20) << __func__ << " dropping item " << last + << " >= end " << end << dendl; + response.entries.pop_back(); + } + rl.unlock(); + } + if (!response.entries.empty()) { + result->merge(response.entries); + } + + // release the listing context's budget once all + // OPs (in the session) are finished +#if 0 + put_nlist_context_budget(list_context); +#endif + on_finish->complete(r); + return; +} + +namespace { + using namespace librados; + + template <typename T> + void do_decode(std::vector<T>& items, std::vector<bufferlist>& bls) + { + for (auto bl : bls) { + auto p = bl.cbegin(); + T t; + decode(t, p); + items.push_back(t); + } + } + + struct C_ObjectOperation_scrub_ls : public Context { + bufferlist bl; + uint32_t 
*interval; + std::vector<inconsistent_obj_t> *objects = nullptr; + std::vector<inconsistent_snapset_t> *snapsets = nullptr; + int *rval; + + C_ObjectOperation_scrub_ls(uint32_t *interval, + std::vector<inconsistent_obj_t> *objects, + int *rval) + : interval(interval), objects(objects), rval(rval) {} + C_ObjectOperation_scrub_ls(uint32_t *interval, + std::vector<inconsistent_snapset_t> *snapsets, + int *rval) + : interval(interval), snapsets(snapsets), rval(rval) {} + void finish(int r) override { + if (r < 0 && r != -EAGAIN) { + if (rval) + *rval = r; + return; + } + + if (rval) + *rval = 0; + + try { + decode(); + } catch (buffer::error&) { + if (rval) + *rval = -EIO; + } + } + private: + void decode() { + scrub_ls_result_t result; + auto p = bl.cbegin(); + result.decode(p); + *interval = result.interval; + if (objects) { + do_decode(*objects, result.vals); + } else { + do_decode(*snapsets, result.vals); + } + } + }; + + template <typename T> + void do_scrub_ls(::ObjectOperation *op, + const scrub_ls_arg_t& arg, + std::vector<T> *items, + uint32_t *interval, + int *rval) + { + OSDOp& osd_op = op->add_op(CEPH_OSD_OP_SCRUBLS); + op->flags |= CEPH_OSD_FLAG_PGOP; + ceph_assert(interval); + arg.encode(osd_op.indata); + unsigned p = op->ops.size() - 1; + auto *h = new C_ObjectOperation_scrub_ls{interval, items, rval}; + op->out_handler[p] = h; + op->out_bl[p] = &h->bl; + op->out_rval[p] = rval; + } +} + +void ::ObjectOperation::scrub_ls(const librados::object_id_t& start_after, + uint64_t max_to_get, + std::vector<librados::inconsistent_obj_t> *objects, + uint32_t *interval, + int *rval) +{ + scrub_ls_arg_t arg = {*interval, 0, start_after, max_to_get}; + do_scrub_ls(this, arg, objects, interval, rval); +} + +void ::ObjectOperation::scrub_ls(const librados::object_id_t& start_after, + uint64_t max_to_get, + std::vector<librados::inconsistent_snapset_t> *snapsets, + uint32_t *interval, + int *rval) +{ + scrub_ls_arg_t arg = {*interval, 1, start_after, max_to_get}; + 
do_scrub_ls(this, arg, snapsets, interval, rval); +} diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h new file mode 100644 index 00000000..ca8d85f7 --- /dev/null +++ b/src/osdc/Objecter.h @@ -0,0 +1,3067 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OBJECTER_H +#define CEPH_OBJECTER_H + +#include <condition_variable> +#include <list> +#include <map> +#include <mutex> +#include <memory> +#include <sstream> +#include <type_traits> + +#include <boost/thread/shared_mutex.hpp> + +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/types.h" +#include "include/rados/rados_types.hpp" + +#include "common/admin_socket.h" +#include "common/ceph_time.h" +#include "common/ceph_timer.h" +#include "common/config_obs.h" +#include "common/shunique_lock.h" +#include "common/zipkin_trace.h" +#include "common/Finisher.h" +#include "common/Throttle.h" + +#include "messages/MOSDOp.h" +#include "msg/Dispatcher.h" +#include "osd/OSDMap.h" + + +class Context; +class Messenger; +class OSDMap; +class MonClient; +class Message; +class Finisher; + +class MPoolOpReply; + +class MGetPoolStatsReply; +class MStatfsReply; +class MCommandReply; +class MWatchNotify; + +class PerfCounters; + +// ----------------------------------------- + +struct ObjectOperation { + vector<OSDOp> ops; + int flags; + int priority; + + vector<bufferlist*> out_bl; + vector<Context*> out_handler; + vector<int*> out_rval; + + ObjectOperation() : flags(0), priority(0) {} + ~ObjectOperation() { + while (!out_handler.empty()) { + delete out_handler.back(); + out_handler.pop_back(); + } + } 
+ + size_t size() { + return ops.size(); + } + + void set_last_op_flags(int flags) { + ceph_assert(!ops.empty()); + ops.rbegin()->op.flags = flags; + } + + class C_TwoContexts; + /** + * Add a callback to run when this operation completes, + * after any other callbacks for it. + */ + void add_handler(Context *extra); + + OSDOp& add_op(int op) { + int s = ops.size(); + ops.resize(s+1); + ops[s].op.op = op; + out_bl.resize(s+1); + out_bl[s] = NULL; + out_handler.resize(s+1); + out_handler[s] = NULL; + out_rval.resize(s+1); + out_rval[s] = NULL; + return ops[s]; + } + void add_data(int op, uint64_t off, uint64_t len, bufferlist& bl) { + OSDOp& osd_op = add_op(op); + osd_op.op.extent.offset = off; + osd_op.op.extent.length = len; + osd_op.indata.claim_append(bl); + } + void add_writesame(int op, uint64_t off, uint64_t write_len, + bufferlist& bl) { + OSDOp& osd_op = add_op(op); + osd_op.op.writesame.offset = off; + osd_op.op.writesame.length = write_len; + osd_op.op.writesame.data_length = bl.length(); + osd_op.indata.claim_append(bl); + } + void add_xattr(int op, const char *name, const bufferlist& data) { + OSDOp& osd_op = add_op(op); + osd_op.op.xattr.name_len = (name ? strlen(name) : 0); + osd_op.op.xattr.value_len = data.length(); + if (name) + osd_op.indata.append(name, osd_op.op.xattr.name_len); + osd_op.indata.append(data); + } + void add_xattr_cmp(int op, const char *name, uint8_t cmp_op, + uint8_t cmp_mode, const bufferlist& data) { + OSDOp& osd_op = add_op(op); + osd_op.op.xattr.name_len = (name ? 
strlen(name) : 0); + osd_op.op.xattr.value_len = data.length(); + osd_op.op.xattr.cmp_op = cmp_op; + osd_op.op.xattr.cmp_mode = cmp_mode; + if (name) + osd_op.indata.append(name, osd_op.op.xattr.name_len); + osd_op.indata.append(data); + } + void add_call(int op, const char *cname, const char *method, + bufferlist &indata, + bufferlist *outbl, Context *ctx, int *prval) { + OSDOp& osd_op = add_op(op); + + unsigned p = ops.size() - 1; + out_handler[p] = ctx; + out_bl[p] = outbl; + out_rval[p] = prval; + + osd_op.op.cls.class_len = strlen(cname); + osd_op.op.cls.method_len = strlen(method); + osd_op.op.cls.indata_len = indata.length(); + osd_op.indata.append(cname, osd_op.op.cls.class_len); + osd_op.indata.append(method, osd_op.op.cls.method_len); + osd_op.indata.append(indata); + } + void add_pgls(int op, uint64_t count, collection_list_handle_t cookie, + epoch_t start_epoch) { + OSDOp& osd_op = add_op(op); + osd_op.op.pgls.count = count; + osd_op.op.pgls.start_epoch = start_epoch; + encode(cookie, osd_op.indata); + } + void add_pgls_filter(int op, uint64_t count, const bufferlist& filter, + collection_list_handle_t cookie, epoch_t start_epoch) { + OSDOp& osd_op = add_op(op); + osd_op.op.pgls.count = count; + osd_op.op.pgls.start_epoch = start_epoch; + string cname = "pg"; + string mname = "filter"; + encode(cname, osd_op.indata); + encode(mname, osd_op.indata); + osd_op.indata.append(filter); + encode(cookie, osd_op.indata); + } + void add_alloc_hint(int op, uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags) { + OSDOp& osd_op = add_op(op); + osd_op.op.alloc_hint.expected_object_size = expected_object_size; + osd_op.op.alloc_hint.expected_write_size = expected_write_size; + osd_op.op.alloc_hint.flags = flags; + } + + // ------ + + // pg + void pg_ls(uint64_t count, bufferlist& filter, + collection_list_handle_t cookie, epoch_t start_epoch) { + if (filter.length() == 0) + add_pgls(CEPH_OSD_OP_PGLS, count, cookie, start_epoch); + else + 
add_pgls_filter(CEPH_OSD_OP_PGLS_FILTER, count, filter, cookie, + start_epoch); + flags |= CEPH_OSD_FLAG_PGOP; + } + + void pg_nls(uint64_t count, const bufferlist& filter, + collection_list_handle_t cookie, epoch_t start_epoch) { + if (filter.length() == 0) + add_pgls(CEPH_OSD_OP_PGNLS, count, cookie, start_epoch); + else + add_pgls_filter(CEPH_OSD_OP_PGNLS_FILTER, count, filter, cookie, + start_epoch); + flags |= CEPH_OSD_FLAG_PGOP; + } + + void scrub_ls(const librados::object_id_t& start_after, + uint64_t max_to_get, + std::vector<librados::inconsistent_obj_t> *objects, + uint32_t *interval, + int *rval); + void scrub_ls(const librados::object_id_t& start_after, + uint64_t max_to_get, + std::vector<librados::inconsistent_snapset_t> *objects, + uint32_t *interval, + int *rval); + + void create(bool excl) { + OSDOp& o = add_op(CEPH_OSD_OP_CREATE); + o.op.flags = (excl ? CEPH_OSD_OP_FLAG_EXCL : 0); + } + + struct C_ObjectOperation_stat : public Context { + bufferlist bl; + uint64_t *psize; + ceph::real_time *pmtime; + time_t *ptime; + struct timespec *pts; + int *prval; + C_ObjectOperation_stat(uint64_t *ps, ceph::real_time *pm, time_t *pt, struct timespec *_pts, + int *prval) + : psize(ps), pmtime(pm), ptime(pt), pts(_pts), prval(prval) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + uint64_t size; + ceph::real_time mtime; + decode(size, p); + decode(mtime, p); + if (psize) + *psize = size; + if (pmtime) + *pmtime = mtime; + if (ptime) + *ptime = ceph::real_clock::to_time_t(mtime); + if (pts) + *pts = ceph::real_clock::to_timespec(mtime); + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + void stat(uint64_t *psize, ceph::real_time *pmtime, int *prval) { + add_op(CEPH_OSD_OP_STAT); + unsigned p = ops.size() - 1; + C_ObjectOperation_stat *h = new C_ObjectOperation_stat(psize, pmtime, NULL, NULL, + prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + void stat(uint64_t 
*psize, time_t *ptime, int *prval) { + add_op(CEPH_OSD_OP_STAT); + unsigned p = ops.size() - 1; + C_ObjectOperation_stat *h = new C_ObjectOperation_stat(psize, NULL, ptime, NULL, + prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + void stat(uint64_t *psize, struct timespec *pts, int *prval) { + add_op(CEPH_OSD_OP_STAT); + unsigned p = ops.size() - 1; + C_ObjectOperation_stat *h = new C_ObjectOperation_stat(psize, NULL, NULL, pts, + prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + // object cmpext + struct C_ObjectOperation_cmpext : public Context { + int *prval; + explicit C_ObjectOperation_cmpext(int *prval) + : prval(prval) {} + + void finish(int r) { + if (prval) + *prval = r; + } + }; + + void cmpext(uint64_t off, bufferlist& cmp_bl, int *prval) { + add_data(CEPH_OSD_OP_CMPEXT, off, cmp_bl.length(), cmp_bl); + unsigned p = ops.size() - 1; + C_ObjectOperation_cmpext *h = new C_ObjectOperation_cmpext(prval); + out_handler[p] = h; + out_rval[p] = prval; + } + + // Used by C API + void cmpext(uint64_t off, uint64_t cmp_len, const char *cmp_buf, int *prval) { + bufferlist cmp_bl; + cmp_bl.append(cmp_buf, cmp_len); + add_data(CEPH_OSD_OP_CMPEXT, off, cmp_len, cmp_bl); + unsigned p = ops.size() - 1; + C_ObjectOperation_cmpext *h = new C_ObjectOperation_cmpext(prval); + out_handler[p] = h; + out_rval[p] = prval; + } + + void read(uint64_t off, uint64_t len, bufferlist *pbl, int *prval, + Context* ctx) { + bufferlist bl; + add_data(CEPH_OSD_OP_READ, off, len, bl); + unsigned p = ops.size() - 1; + out_bl[p] = pbl; + out_rval[p] = prval; + out_handler[p] = ctx; + } + + struct C_ObjectOperation_sparse_read : public Context { + bufferlist bl; + bufferlist *data_bl; + std::map<uint64_t, uint64_t> *extents; + int *prval; + C_ObjectOperation_sparse_read(bufferlist *data_bl, + std::map<uint64_t, uint64_t> *extents, + int *prval) + : data_bl(data_bl), extents(extents), prval(prval) {} + void finish(int r) override { 
+ auto iter = bl.cbegin(); + if (r >= 0) { + // NOTE: it's possible the sub-op has not been executed but the result + // code remains zeroed. Avoid the costly exception handling on a + // potential IO path. + if (bl.length() > 0) { + try { + decode(*extents, iter); + decode(*data_bl, iter); + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } else if (prval) { + *prval = -EIO; + } + } + } + }; + void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m, + bufferlist *data_bl, int *prval) { + bufferlist bl; + add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl); + unsigned p = ops.size() - 1; + C_ObjectOperation_sparse_read *h = + new C_ObjectOperation_sparse_read(data_bl, m, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + void write(uint64_t off, bufferlist& bl, + uint64_t truncate_size, + uint32_t truncate_seq) { + add_data(CEPH_OSD_OP_WRITE, off, bl.length(), bl); + OSDOp& o = *ops.rbegin(); + o.op.extent.truncate_size = truncate_size; + o.op.extent.truncate_seq = truncate_seq; + } + void write(uint64_t off, bufferlist& bl) { + write(off, bl, 0, 0); + } + void write_full(bufferlist& bl) { + add_data(CEPH_OSD_OP_WRITEFULL, 0, bl.length(), bl); + } + void writesame(uint64_t off, uint64_t write_len, bufferlist& bl) { + add_writesame(CEPH_OSD_OP_WRITESAME, off, write_len, bl); + } + void append(bufferlist& bl) { + add_data(CEPH_OSD_OP_APPEND, 0, bl.length(), bl); + } + void zero(uint64_t off, uint64_t len) { + bufferlist bl; + add_data(CEPH_OSD_OP_ZERO, off, len, bl); + } + void truncate(uint64_t off) { + bufferlist bl; + add_data(CEPH_OSD_OP_TRUNCATE, off, 0, bl); + } + void remove() { + bufferlist bl; + add_data(CEPH_OSD_OP_DELETE, 0, 0, bl); + } + void mapext(uint64_t off, uint64_t len) { + bufferlist bl; + add_data(CEPH_OSD_OP_MAPEXT, off, len, bl); + } + void sparse_read(uint64_t off, uint64_t len) { + bufferlist bl; + add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl); + } + + void checksum(uint8_t 
type, const bufferlist &init_value_bl, + uint64_t off, uint64_t len, size_t chunk_size, + bufferlist *pbl, int *prval, Context *ctx) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_CHECKSUM); + osd_op.op.checksum.offset = off; + osd_op.op.checksum.length = len; + osd_op.op.checksum.type = type; + osd_op.op.checksum.chunk_size = chunk_size; + osd_op.indata.append(init_value_bl); + + unsigned p = ops.size() - 1; + out_bl[p] = pbl; + out_rval[p] = prval; + out_handler[p] = ctx; + } + + // object attrs + void getxattr(const char *name, bufferlist *pbl, int *prval) { + bufferlist bl; + add_xattr(CEPH_OSD_OP_GETXATTR, name, bl); + unsigned p = ops.size() - 1; + out_bl[p] = pbl; + out_rval[p] = prval; + } + struct C_ObjectOperation_decodevals : public Context { + uint64_t max_entries; + bufferlist bl; + std::map<std::string,bufferlist> *pattrs; + bool *ptruncated; + int *prval; + C_ObjectOperation_decodevals(uint64_t m, std::map<std::string,bufferlist> *pa, + bool *pt, int *pr) + : max_entries(m), pattrs(pa), ptruncated(pt), prval(pr) { + if (ptruncated) { + *ptruncated = false; + } + } + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + if (pattrs) + decode(*pattrs, p); + if (ptruncated) { + std::map<std::string,bufferlist> ignore; + if (!pattrs) { + decode(ignore, p); + pattrs = &ignore; + } + if (!p.end()) { + decode(*ptruncated, p); + } else { + // the OSD did not provide this. 
since old OSDs do not + // enfoce omap result limits either, we can infer it from + // the size of the result + *ptruncated = (pattrs->size() == max_entries); + } + } + } + catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + struct C_ObjectOperation_decodekeys : public Context { + uint64_t max_entries; + bufferlist bl; + std::set<std::string> *pattrs; + bool *ptruncated; + int *prval; + C_ObjectOperation_decodekeys(uint64_t m, std::set<std::string> *pa, bool *pt, + int *pr) + : max_entries(m), pattrs(pa), ptruncated(pt), prval(pr) { + if (ptruncated) { + *ptruncated = false; + } + } + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + if (pattrs) + decode(*pattrs, p); + if (ptruncated) { + std::set<std::string> ignore; + if (!pattrs) { + decode(ignore, p); + pattrs = &ignore; + } + if (!p.end()) { + decode(*ptruncated, p); + } else { + // the OSD did not provide this. since old OSDs do not + // enforce omap result limits either, we can infer it from + // the size of the result + *ptruncated = (pattrs->size() == max_entries); + } + } + } + catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + struct C_ObjectOperation_decodewatchers : public Context { + bufferlist bl; + list<obj_watch_t> *pwatchers; + int *prval; + C_ObjectOperation_decodewatchers(list<obj_watch_t> *pw, int *pr) + : pwatchers(pw), prval(pr) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + obj_list_watch_response_t resp; + decode(resp, p); + if (pwatchers) { + for (list<watch_item_t>::iterator i = resp.entries.begin() ; + i != resp.entries.end() ; ++i) { + obj_watch_t ow; + string sa = i->addr.get_legacy_str(); + strncpy(ow.addr, sa.c_str(), 256); + ow.watcher_id = i->name.num(); + ow.cookie = i->cookie; + ow.timeout_seconds = i->timeout_seconds; + pwatchers->push_back(ow); + } + } + } + catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + struct 
C_ObjectOperation_decodesnaps : public Context { + bufferlist bl; + librados::snap_set_t *psnaps; + int *prval; + C_ObjectOperation_decodesnaps(librados::snap_set_t *ps, int *pr) + : psnaps(ps), prval(pr) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + obj_list_snap_response_t resp; + decode(resp, p); + if (psnaps) { + psnaps->clones.clear(); + for (vector<clone_info>::iterator ci = resp.clones.begin(); + ci != resp.clones.end(); + ++ci) { + librados::clone_info_t clone; + + clone.cloneid = ci->cloneid; + clone.snaps.reserve(ci->snaps.size()); + clone.snaps.insert(clone.snaps.end(), ci->snaps.begin(), + ci->snaps.end()); + clone.overlap = ci->overlap; + clone.size = ci->size; + + psnaps->clones.push_back(clone); + } + psnaps->seq = resp.seq; + } + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + void getxattrs(std::map<std::string,bufferlist> *pattrs, int *prval) { + add_op(CEPH_OSD_OP_GETXATTRS); + if (pattrs || prval) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodevals *h + = new C_ObjectOperation_decodevals(0, pattrs, nullptr, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + void setxattr(const char *name, const bufferlist& bl) { + add_xattr(CEPH_OSD_OP_SETXATTR, name, bl); + } + void setxattr(const char *name, const string& s) { + bufferlist bl; + bl.append(s); + add_xattr(CEPH_OSD_OP_SETXATTR, name, bl); + } + void cmpxattr(const char *name, uint8_t cmp_op, uint8_t cmp_mode, + const bufferlist& bl) { + add_xattr_cmp(CEPH_OSD_OP_CMPXATTR, name, cmp_op, cmp_mode, bl); + } + void rmxattr(const char *name) { + bufferlist bl; + add_xattr(CEPH_OSD_OP_RMXATTR, name, bl); + } + void setxattrs(map<string, bufferlist>& attrs) { + bufferlist bl; + encode(attrs, bl); + add_xattr(CEPH_OSD_OP_RESETXATTRS, 0, bl.length()); + } + void resetxattrs(const char *prefix, map<string, bufferlist>& attrs) { + bufferlist bl; + encode(attrs, bl); + 
add_xattr(CEPH_OSD_OP_RESETXATTRS, prefix, bl); + } + + // trivialmap + void tmap_update(bufferlist& bl) { + add_data(CEPH_OSD_OP_TMAPUP, 0, 0, bl); + } + + // objectmap + void omap_get_keys(const string &start_after, + uint64_t max_to_get, + std::set<std::string> *out_set, + bool *ptruncated, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETKEYS); + bufferlist bl; + encode(start_after, bl); + encode(max_to_get, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval || ptruncated || out_set) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodekeys *h = + new C_ObjectOperation_decodekeys(max_to_get, out_set, ptruncated, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void omap_get_vals(const string &start_after, + const string &filter_prefix, + uint64_t max_to_get, + std::map<std::string, bufferlist> *out_set, + bool *ptruncated, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALS); + bufferlist bl; + encode(start_after, bl); + encode(max_to_get, bl); + encode(filter_prefix, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval || out_set || ptruncated) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodevals *h = + new C_ObjectOperation_decodevals(max_to_get, out_set, ptruncated, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void omap_get_vals_by_keys(const std::set<std::string> &to_get, + std::map<std::string, bufferlist> *out_set, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALSBYKEYS); + bufferlist bl; + encode(to_get, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval || out_set) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodevals *h = + new C_ObjectOperation_decodevals(0, out_set, nullptr, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = 
prval; + } + } + + void omap_cmp(const std::map<std::string, pair<bufferlist,int> > &assertions, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAP_CMP); + bufferlist bl; + encode(assertions, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval) { + unsigned p = ops.size() - 1; + out_rval[p] = prval; + } + } + + struct C_ObjectOperation_copyget : public Context { + bufferlist bl; + object_copy_cursor_t *cursor; + uint64_t *out_size; + ceph::real_time *out_mtime; + std::map<std::string,bufferlist> *out_attrs; + bufferlist *out_data, *out_omap_header, *out_omap_data; + vector<snapid_t> *out_snaps; + snapid_t *out_snap_seq; + uint32_t *out_flags; + uint32_t *out_data_digest; + uint32_t *out_omap_digest; + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *out_reqids; + mempool::osd_pglog::map<uint32_t, int> *out_reqid_return_codes; + uint64_t *out_truncate_seq; + uint64_t *out_truncate_size; + int *prval; + C_ObjectOperation_copyget(object_copy_cursor_t *c, + uint64_t *s, + ceph::real_time *m, + std::map<std::string,bufferlist> *a, + bufferlist *d, bufferlist *oh, + bufferlist *o, + std::vector<snapid_t> *osnaps, + snapid_t *osnap_seq, + uint32_t *flags, + uint32_t *dd, + uint32_t *od, + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *oreqids, + mempool::osd_pglog::map<uint32_t, int> *oreqid_return_codes, + uint64_t *otseq, + uint64_t *otsize, + int *r) + : cursor(c), + out_size(s), out_mtime(m), + out_attrs(a), out_data(d), out_omap_header(oh), + out_omap_data(o), out_snaps(osnaps), out_snap_seq(osnap_seq), + out_flags(flags), out_data_digest(dd), out_omap_digest(od), + out_reqids(oreqids), + out_reqid_return_codes(oreqid_return_codes), + out_truncate_seq(otseq), + out_truncate_size(otsize), + prval(r) {} + void finish(int r) override { + // reqids are copied on ENOENT + if (r < 0 && r != -ENOENT) + return; + try { + auto p = bl.cbegin(); + object_copy_data_t copy_reply; + 
decode(copy_reply, p); + if (r == -ENOENT) { + if (out_reqids) + *out_reqids = copy_reply.reqids; + return; + } + if (out_size) + *out_size = copy_reply.size; + if (out_mtime) + *out_mtime = ceph::real_clock::from_ceph_timespec(copy_reply.mtime); + if (out_attrs) + *out_attrs = copy_reply.attrs; + if (out_data) + out_data->claim_append(copy_reply.data); + if (out_omap_header) + out_omap_header->claim_append(copy_reply.omap_header); + if (out_omap_data) + *out_omap_data = copy_reply.omap_data; + if (out_snaps) + *out_snaps = copy_reply.snaps; + if (out_snap_seq) + *out_snap_seq = copy_reply.snap_seq; + if (out_flags) + *out_flags = copy_reply.flags; + if (out_data_digest) + *out_data_digest = copy_reply.data_digest; + if (out_omap_digest) + *out_omap_digest = copy_reply.omap_digest; + if (out_reqids) + *out_reqids = copy_reply.reqids; + if (out_reqid_return_codes) + *out_reqid_return_codes = copy_reply.reqid_return_codes; + if (out_truncate_seq) + *out_truncate_seq = copy_reply.truncate_seq; + if (out_truncate_size) + *out_truncate_size = copy_reply.truncate_size; + *cursor = copy_reply.cursor; + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + }; + + void copy_get(object_copy_cursor_t *cursor, + uint64_t max, + uint64_t *out_size, + ceph::real_time *out_mtime, + std::map<std::string,bufferlist> *out_attrs, + bufferlist *out_data, + bufferlist *out_omap_header, + bufferlist *out_omap_data, + vector<snapid_t> *out_snaps, + snapid_t *out_snap_seq, + uint32_t *out_flags, + uint32_t *out_data_digest, + uint32_t *out_omap_digest, + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *out_reqids, + mempool::osd_pglog::map<uint32_t, int> *out_reqid_return_codes, + uint64_t *truncate_seq, + uint64_t *truncate_size, + int *prval) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET); + osd_op.op.copy_get.max = max; + encode(*cursor, osd_op.indata); + encode(max, osd_op.indata); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + 
C_ObjectOperation_copyget *h = + new C_ObjectOperation_copyget(cursor, out_size, out_mtime, + out_attrs, out_data, out_omap_header, + out_omap_data, out_snaps, out_snap_seq, + out_flags, out_data_digest, + out_omap_digest, out_reqids, + out_reqid_return_codes, truncate_seq, + truncate_size, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + + void undirty() { + add_op(CEPH_OSD_OP_UNDIRTY); + } + + struct C_ObjectOperation_isdirty : public Context { + bufferlist bl; + bool *pisdirty; + int *prval; + C_ObjectOperation_isdirty(bool *p, int *r) + : pisdirty(p), prval(r) {} + void finish(int r) override { + if (r < 0) + return; + try { + auto p = bl.cbegin(); + bool isdirty; + decode(isdirty, p); + if (pisdirty) + *pisdirty = isdirty; + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + }; + + void is_dirty(bool *pisdirty, int *prval) { + add_op(CEPH_OSD_OP_ISDIRTY); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + C_ObjectOperation_isdirty *h = + new C_ObjectOperation_isdirty(pisdirty, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + + struct C_ObjectOperation_hit_set_ls : public Context { + bufferlist bl; + std::list< std::pair<time_t, time_t> > *ptls; + std::list< std::pair<ceph::real_time, ceph::real_time> > *putls; + int *prval; + C_ObjectOperation_hit_set_ls(std::list< std::pair<time_t, time_t> > *t, + std::list< std::pair<ceph::real_time, + ceph::real_time> > *ut, + int *r) + : ptls(t), putls(ut), prval(r) {} + void finish(int r) override { + if (r < 0) + return; + try { + auto p = bl.cbegin(); + std::list< std::pair<ceph::real_time, ceph::real_time> > ls; + decode(ls, p); + if (ptls) { + ptls->clear(); + for (auto p = ls.begin(); p != ls.end(); ++p) + // round initial timestamp up to the next full second to + // keep this a valid interval. + ptls->push_back( + make_pair(ceph::real_clock::to_time_t( + ceph::ceil(p->first, + // Sadly, no time literals until C++14. 
+ std::chrono::seconds(1))), + ceph::real_clock::to_time_t(p->second))); + } + if (putls) + putls->swap(ls); + } catch (buffer::error& e) { + r = -EIO; + } + if (prval) + *prval = r; + } + }; + + /** + * list available HitSets. + * + * We will get back a list of time intervals. Note that the most + * recent range may have an empty end timestamp if it is still + * accumulating. + * + * @param pls [out] list of time intervals + * @param prval [out] return value + */ + void hit_set_ls(std::list< std::pair<time_t, time_t> > *pls, int *prval) { + add_op(CEPH_OSD_OP_PG_HITSET_LS); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + C_ObjectOperation_hit_set_ls *h = + new C_ObjectOperation_hit_set_ls(pls, NULL, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + void hit_set_ls(std::list<std::pair<ceph::real_time, ceph::real_time> > *pls, + int *prval) { + add_op(CEPH_OSD_OP_PG_HITSET_LS); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + C_ObjectOperation_hit_set_ls *h = + new C_ObjectOperation_hit_set_ls(NULL, pls, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + + /** + * get HitSet + * + * Return an encoded HitSet that includes the provided time + * interval. 
+ * + * @param stamp [in] timestamp + * @param pbl [out] target buffer for encoded HitSet + * @param prval [out] return value + */ + void hit_set_get(ceph::real_time stamp, bufferlist *pbl, int *prval) { + OSDOp& op = add_op(CEPH_OSD_OP_PG_HITSET_GET); + op.op.hit_set_get.stamp = ceph::real_clock::to_ceph_timespec(stamp); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + out_bl[p] = pbl; + } + + void omap_get_header(bufferlist *bl, int *prval) { + add_op(CEPH_OSD_OP_OMAPGETHEADER); + unsigned p = ops.size() - 1; + out_bl[p] = bl; + out_rval[p] = prval; + } + + void omap_set(const map<string, bufferlist> &map) { + bufferlist bl; + encode(map, bl); + add_data(CEPH_OSD_OP_OMAPSETVALS, 0, bl.length(), bl); + } + + void omap_set_header(bufferlist &bl) { + add_data(CEPH_OSD_OP_OMAPSETHEADER, 0, bl.length(), bl); + } + + void omap_clear() { + add_op(CEPH_OSD_OP_OMAPCLEAR); + } + + void omap_rm_keys(const std::set<std::string> &to_remove) { + bufferlist bl; + encode(to_remove, bl); + add_data(CEPH_OSD_OP_OMAPRMKEYS, 0, bl.length(), bl); + } + + // object classes + void call(const char *cname, const char *method, bufferlist &indata) { + add_call(CEPH_OSD_OP_CALL, cname, method, indata, NULL, NULL, NULL); + } + + void call(const char *cname, const char *method, bufferlist &indata, + bufferlist *outdata, Context *ctx, int *prval) { + add_call(CEPH_OSD_OP_CALL, cname, method, indata, outdata, ctx, prval); + } + + // watch/notify + void watch(uint64_t cookie, __u8 op, uint32_t timeout = 0) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_WATCH); + osd_op.op.watch.cookie = cookie; + osd_op.op.watch.op = op; + osd_op.op.watch.timeout = timeout; + } + + void notify(uint64_t cookie, uint32_t prot_ver, uint32_t timeout, + bufferlist &bl, bufferlist *inbl) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_NOTIFY); + osd_op.op.notify.cookie = cookie; + encode(prot_ver, *inbl); + encode(timeout, *inbl); + encode(bl, *inbl); + osd_op.indata.append(*inbl); + } + + void notify_ack(uint64_t notify_id, 
uint64_t cookie, + bufferlist& reply_bl) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_NOTIFY_ACK); + bufferlist bl; + encode(notify_id, bl); + encode(cookie, bl); + encode(reply_bl, bl); + osd_op.indata.append(bl); + } + + void list_watchers(list<obj_watch_t> *out, + int *prval) { + (void)add_op(CEPH_OSD_OP_LIST_WATCHERS); + if (prval || out) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodewatchers *h = + new C_ObjectOperation_decodewatchers(out, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void list_snaps(librados::snap_set_t *out, int *prval) { + (void)add_op(CEPH_OSD_OP_LIST_SNAPS); + if (prval || out) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodesnaps *h = + new C_ObjectOperation_decodesnaps(out, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void assert_version(uint64_t ver) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_ASSERT_VER); + osd_op.op.assert_ver.ver = ver; + } + + void cmpxattr(const char *name, const bufferlist& val, + int op, int mode) { + add_xattr(CEPH_OSD_OP_CMPXATTR, name, val); + OSDOp& o = *ops.rbegin(); + o.op.xattr.cmp_op = op; + o.op.xattr.cmp_mode = mode; + } + + void rollback(uint64_t snapid) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_ROLLBACK); + osd_op.op.snap.snapid = snapid; + } + + void copy_from(object_t src, snapid_t snapid, object_locator_t src_oloc, + version_t src_version, unsigned flags, + unsigned src_fadvise_flags) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_FROM); + osd_op.op.copy_from.snapid = snapid; + osd_op.op.copy_from.src_version = src_version; + osd_op.op.copy_from.flags = flags; + osd_op.op.copy_from.src_fadvise_flags = src_fadvise_flags; + encode(src, osd_op.indata); + encode(src_oloc, osd_op.indata); + } + + /** + * writeback content to backing tier + * + * If object is marked dirty in the cache tier, write back content + * to backing tier. If the object is clean this is a no-op. 
+ * + * If writeback races with an update, the update will block. + * + * use with IGNORE_CACHE to avoid triggering promote. + */ + void cache_flush() { + add_op(CEPH_OSD_OP_CACHE_FLUSH); + } + + /** + * writeback content to backing tier + * + * If object is marked dirty in the cache tier, write back content + * to backing tier. If the object is clean this is a no-op. + * + * If writeback races with an update, return EAGAIN. Requires that + * the SKIPRWLOCKS flag be set. + * + * use with IGNORE_CACHE to avoid triggering promote. + */ + void cache_try_flush() { + add_op(CEPH_OSD_OP_CACHE_TRY_FLUSH); + } + + /** + * evict object from cache tier + * + * If object is marked clean, remove the object from the cache tier. + * Otherwise, return EBUSY. + * + * use with IGNORE_CACHE to avoid triggering promote. + */ + void cache_evict() { + add_op(CEPH_OSD_OP_CACHE_EVICT); + } + + /* + * Extensible tier + */ + void set_redirect(object_t tgt, snapid_t snapid, object_locator_t tgt_oloc, + version_t tgt_version, int flag) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_SET_REDIRECT); + osd_op.op.copy_from.snapid = snapid; + osd_op.op.copy_from.src_version = tgt_version; + encode(tgt, osd_op.indata); + encode(tgt_oloc, osd_op.indata); + set_last_op_flags(flag); + } + + void set_chunk(uint64_t src_offset, uint64_t src_length, object_locator_t tgt_oloc, + object_t tgt_oid, uint64_t tgt_offset, int flag) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_SET_CHUNK); + encode(src_offset, osd_op.indata); + encode(src_length, osd_op.indata); + encode(tgt_oloc, osd_op.indata); + encode(tgt_oid, osd_op.indata); + encode(tgt_offset, osd_op.indata); + set_last_op_flags(flag); + } + + void tier_promote() { + add_op(CEPH_OSD_OP_TIER_PROMOTE); + } + + void unset_manifest() { + add_op(CEPH_OSD_OP_UNSET_MANIFEST); + } + + void set_alloc_hint(uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags) { + add_alloc_hint(CEPH_OSD_OP_SETALLOCHINT, expected_object_size, + expected_write_size, 
flags); + + // CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + // not worth a feature bit. Set FAILOK per-op flag to make + // sure older osds don't trip over an unsupported opcode. + set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + } + + void dup(vector<OSDOp>& sops) { + ops = sops; + out_bl.resize(sops.size()); + out_handler.resize(sops.size()); + out_rval.resize(sops.size()); + for (uint32_t i = 0; i < sops.size(); i++) { + out_bl[i] = &sops[i].outdata; + out_handler[i] = NULL; + out_rval[i] = &sops[i].rval; + } + } + + /** + * Pin/unpin an object in cache tier + */ + void cache_pin() { + add_op(CEPH_OSD_OP_CACHE_PIN); + } + + void cache_unpin() { + add_op(CEPH_OSD_OP_CACHE_UNPIN); + } +}; + + +// ---------------- + + +class Objecter : public md_config_obs_t, public Dispatcher { +public: + // config observer bits + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + +public: + Messenger *messenger; + MonClient *monc; + Finisher *finisher; + ZTracer::Endpoint trace_endpoint; +private: + OSDMap *osdmap; +public: + using Dispatcher::cct; + std::multimap<string,string> crush_location; + + std::atomic<bool> initialized{false}; + +private: + std::atomic<uint64_t> last_tid{0}; + std::atomic<unsigned> inflight_ops{0}; + std::atomic<int> client_inc{-1}; + uint64_t max_linger_id; + std::atomic<unsigned> num_in_flight{0}; + std::atomic<int> global_op_flags{0}; // flags which are applied to each IO op + bool keep_balanced_budget; + bool honor_osdmap_full; + bool osdmap_full_try; + + // If this is true, accumulate a set of blacklisted entities + // to be drained by consume_blacklist_events. 
+ bool blacklist_events_enabled; + std::set<entity_addr_t> blacklist_events; + +public: + void maybe_request_map(); + + void enable_blacklist_events(); +private: + + void _maybe_request_map(); + + version_t last_seen_osdmap_version; + version_t last_seen_pgmap_version; + + mutable std::shared_mutex rwlock; + using lock_guard = std::lock_guard<decltype(rwlock)>; + using unique_lock = std::unique_lock<decltype(rwlock)>; + using shared_lock = boost::shared_lock<decltype(rwlock)>; + using shunique_lock = ceph::shunique_lock<decltype(rwlock)>; + ceph::timer<ceph::coarse_mono_clock> timer; + + PerfCounters *logger; + + uint64_t tick_event; + + void start_tick(); + void tick(); + void update_crush_location(); + + class RequestStateHook; + + RequestStateHook *m_request_state_hook; + +public: + /*** track pending operations ***/ + // read + + struct OSDSession; + + struct op_target_t { + int flags = 0; + + epoch_t epoch = 0; ///< latest epoch we calculated the mapping + + object_t base_oid; + object_locator_t base_oloc; + object_t target_oid; + object_locator_t target_oloc; + + ///< true if we are directed at base_pgid, not base_oid + bool precalc_pgid = false; + + ///< true if we have ever mapped to a valid pool + bool pool_ever_existed = false; + + ///< explcit pg target, if any + pg_t base_pgid; + + pg_t pgid; ///< last (raw) pg we mapped to + spg_t actual_pgid; ///< last (actual) spg_t we mapped to + unsigned pg_num = 0; ///< last pg_num we mapped to + unsigned pg_num_mask = 0; ///< last pg_num_mask we mapped to + unsigned pg_num_pending = 0; ///< last pg_num we mapped to + vector<int> up; ///< set of up osds for last pg we mapped to + vector<int> acting; ///< set of acting osds for last pg we mapped to + int up_primary = -1; ///< last up_primary we mapped to + int acting_primary = -1; ///< last acting_primary we mapped to + int size = -1; ///< the size of the pool when were were last mapped + int min_size = -1; ///< the min size of the pool when were were last mapped + 
bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise + bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering + + bool used_replica = false; + bool paused = false; + + int osd = -1; ///< the final target osd, or -1 + + epoch_t last_force_resend = 0; + + op_target_t(object_t oid, object_locator_t oloc, int flags) + : flags(flags), + base_oid(oid), + base_oloc(oloc) + {} + + explicit op_target_t(pg_t pgid) + : base_oloc(pgid.pool(), pgid.ps()), + precalc_pgid(true), + base_pgid(pgid) + {} + + op_target_t() = default; + + hobject_t get_hobj() { + return hobject_t(target_oid, + target_oloc.key, + CEPH_NOSNAP, + target_oloc.hash >= 0 ? target_oloc.hash : pgid.ps(), + target_oloc.pool, + target_oloc.nspace); + } + + bool contained_by(const hobject_t& begin, const hobject_t& end) { + hobject_t h = get_hobj(); + int r = cmp(h, begin); + return r == 0 || (r > 0 && h < end); + } + + void dump(Formatter *f) const; + }; + + struct Op : public RefCountedObject { + OSDSession *session; + int incarnation; + + op_target_t target; + + ConnectionRef con; // for rx buffer only + uint64_t features; // explicitly specified op features + + vector<OSDOp> ops; + + snapid_t snapid; + SnapContext snapc; + ceph::real_time mtime; + + bufferlist *outbl; + vector<bufferlist*> out_bl; + vector<Context*> out_handler; + vector<int*> out_rval; + + int priority; + Context *onfinish; + uint64_t ontimeout; + + ceph_tid_t tid; + int attempts; + + version_t *objver; + epoch_t *reply_epoch; + + ceph::coarse_mono_time stamp; + + epoch_t map_dne_bound; + + int budget; + + /// true if we should resend this message on failure + bool should_resend; + + /// true if the throttle budget is get/put on a series of OPs, + /// instead of per OP basis, when this flag is set, the budget is + /// acquired before sending the very first OP of the series and + /// released upon receiving the last OP reply. 
+ bool ctx_budgeted; + + int *data_offset; + + osd_reqid_t reqid; // explicitly setting reqid + ZTracer::Trace trace; + + Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op, + int f, Context *fin, version_t *ov, int *offset = NULL, + ZTracer::Trace *parent_trace = nullptr) : + session(NULL), incarnation(0), + target(o, ol, f), + con(NULL), + features(CEPH_FEATURES_SUPPORTED_DEFAULT), + snapid(CEPH_NOSNAP), + outbl(NULL), + priority(0), + onfinish(fin), + ontimeout(0), + tid(0), + attempts(0), + objver(ov), + reply_epoch(NULL), + map_dne_bound(0), + budget(-1), + should_resend(true), + ctx_budgeted(false), + data_offset(offset) { + ops.swap(op); + + /* initialize out_* to match op vector */ + out_bl.resize(ops.size()); + out_rval.resize(ops.size()); + out_handler.resize(ops.size()); + for (unsigned i = 0; i < ops.size(); i++) { + out_bl[i] = NULL; + out_handler[i] = NULL; + out_rval[i] = NULL; + } + + if (target.base_oloc.key == o) + target.base_oloc.key.clear(); + + if (parent_trace && parent_trace->valid()) { + trace.init("op", nullptr, parent_trace); + trace.event("start"); + } + } + + bool operator<(const Op& other) const { + return tid < other.tid; + } + + bool respects_full() const { + return + (target.flags & (CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_RWORDERED)) && + !(target.flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE)); + } + + private: + ~Op() override { + while (!out_handler.empty()) { + delete out_handler.back(); + out_handler.pop_back(); + } + trace.event("finish"); + } + }; + + struct C_Op_Map_Latest : public Context { + Objecter *objecter; + ceph_tid_t tid; + version_t latest; + C_Op_Map_Latest(Objecter *o, ceph_tid_t t) : objecter(o), tid(t), + latest(0) {} + void finish(int r) override; + }; + + struct C_Command_Map_Latest : public Context { + Objecter *objecter; + uint64_t tid; + version_t latest; + C_Command_Map_Latest(Objecter *o, ceph_tid_t t) : objecter(o), tid(t), + latest(0) {} + void finish(int r) override; + }; + 
+ struct C_Stat : public Context { + bufferlist bl; + uint64_t *psize; + ceph::real_time *pmtime; + Context *fin; + C_Stat(uint64_t *ps, ceph::real_time *pm, Context *c) : + psize(ps), pmtime(pm), fin(c) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + uint64_t s; + ceph::real_time m; + decode(s, p); + decode(m, p); + if (psize) + *psize = s; + if (pmtime) + *pmtime = m; + } + fin->complete(r); + } + }; + + struct C_GetAttrs : public Context { + bufferlist bl; + map<string,bufferlist>& attrset; + Context *fin; + C_GetAttrs(map<string, bufferlist>& set, Context *c) : attrset(set), + fin(c) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + decode(attrset, p); + } + fin->complete(r); + } + }; + + + // Pools and statistics + struct NListContext { + collection_list_handle_t pos; + + // these are for !sortbitwise compat only + int current_pg = 0; + int starting_pg_num = 0; + bool sort_bitwise = false; + + bool at_end_of_pool = false; ///< publicly visible end flag + + int64_t pool_id = -1; + int pool_snap_seq = 0; + uint64_t max_entries = 0; + string nspace; + + bufferlist bl; // raw data read to here + std::list<librados::ListObjectImpl> list; + + bufferlist filter; + + bufferlist extra_info; + + // The budget associated with this context, once it is set (>= 0), + // the budget is not get/released on OP basis, instead the budget + // is acquired before sending the first OP and released upon receiving + // the last op reply. 
+ int ctx_budget = -1; + + bool at_end() const { + return at_end_of_pool; + } + + uint32_t get_pg_hash_position() const { + return pos.get_hash(); + } + }; + + struct C_NList : public Context { + NListContext *list_context; + Context *final_finish; + Objecter *objecter; + epoch_t epoch; + C_NList(NListContext *lc, Context * finish, Objecter *ob) : + list_context(lc), final_finish(finish), objecter(ob), epoch(0) {} + void finish(int r) override { + if (r >= 0) { + objecter->_nlist_reply(list_context, r, final_finish, epoch); + } else { + final_finish->complete(r); + } + } + }; + + struct PoolStatOp { + ceph_tid_t tid; + list<string> pools; + + map<string,pool_stat_t> *pool_stats; + bool *per_pool; + Context *onfinish; + uint64_t ontimeout; + + ceph::coarse_mono_time last_submit; + }; + + struct StatfsOp { + ceph_tid_t tid; + struct ceph_statfs *stats; + boost::optional<int64_t> data_pool; + Context *onfinish; + uint64_t ontimeout; + + ceph::coarse_mono_time last_submit; + }; + + struct PoolOp { + ceph_tid_t tid; + int64_t pool; + string name; + Context *onfinish; + uint64_t ontimeout; + int pool_op; + int16_t crush_rule; + snapid_t snapid; + bufferlist *blp; + + ceph::coarse_mono_time last_submit; + PoolOp() : tid(0), pool(0), onfinish(NULL), ontimeout(0), pool_op(0), + crush_rule(0), snapid(0), blp(NULL) {} + }; + + // -- osd commands -- + struct CommandOp : public RefCountedObject { + OSDSession *session = nullptr; + ceph_tid_t tid = 0; + vector<string> cmd; + bufferlist inbl; + bufferlist *poutbl = nullptr; + string *prs = nullptr; + + // target_osd == -1 means target_pg is valid + const int target_osd = -1; + const pg_t target_pg; + + op_target_t target; + + epoch_t map_dne_bound = 0; + int map_check_error = 0; // error to return if map check fails + const char *map_check_error_str = nullptr; + + Context *onfinish = nullptr; + uint64_t ontimeout = 0; + ceph::coarse_mono_time last_submit; + + CommandOp( + int target_osd, + const vector<string> &cmd, + bufferlist 
inbl, + bufferlist *poutbl, + string *prs, + Context *onfinish) + : cmd(cmd), + inbl(inbl), + poutbl(poutbl), + prs(prs), + target_osd(target_osd), + onfinish(onfinish) {} + + CommandOp( + pg_t pgid, + const vector<string> &cmd, + bufferlist inbl, + bufferlist *poutbl, + string *prs, + Context *onfinish) + : cmd(cmd), + inbl(inbl), + poutbl(poutbl), + prs(prs), + target_pg(pgid), + target(pgid), + onfinish(onfinish) {} + + }; + + void submit_command(CommandOp *c, ceph_tid_t *ptid); + int _calc_command_target(CommandOp *c, shunique_lock &sul); + void _assign_command_session(CommandOp *c, shunique_lock &sul); + void _send_command(CommandOp *c); + int command_op_cancel(OSDSession *s, ceph_tid_t tid, int r); + void _finish_command(CommandOp *c, int r, string rs); + void handle_command_reply(MCommandReply *m); + + + // -- lingering ops -- + + struct WatchContext { + // this simply mirrors librados WatchCtx2 + virtual void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + virtual void handle_error(uint64_t cookie, int err) = 0; + virtual ~WatchContext() {} + }; + + struct LingerOp : public RefCountedObject { + uint64_t linger_id; + + op_target_t target; + + snapid_t snap; + SnapContext snapc; + ceph::real_time mtime; + + vector<OSDOp> ops; + bufferlist inbl; + bufferlist *poutbl; + version_t *pobjver; + + bool is_watch; + ceph::coarse_mono_time watch_valid_thru; ///< send time for last acked ping + int last_error; ///< error from last failed ping|reconnect, if any + std::shared_mutex watch_lock; + using lock_guard = std::unique_lock<decltype(watch_lock)>; + using unique_lock = std::unique_lock<decltype(watch_lock)>; + using shared_lock = boost::shared_lock<decltype(watch_lock)>; + using shunique_lock = ceph::shunique_lock<decltype(watch_lock)>; + + // queue of pending async operations, with the timestamp of + // when they were queued. 
+ list<ceph::coarse_mono_time> watch_pending_async; + + uint32_t register_gen; + bool registered; + bool canceled; + Context *on_reg_commit; + + // we trigger these from an async finisher + Context *on_notify_finish; + bufferlist *notify_result_bl; + uint64_t notify_id; + + WatchContext *watch_context; + + OSDSession *session; + + Objecter *objecter; + int ctx_budget; + ceph_tid_t register_tid; + ceph_tid_t ping_tid; + epoch_t map_dne_bound; + + void _queued_async() { + // watch_lock must be locked unique + watch_pending_async.push_back(ceph::coarse_mono_clock::now()); + } + void finished_async() { + unique_lock l(watch_lock); + ceph_assert(!watch_pending_async.empty()); + watch_pending_async.pop_front(); + } + + explicit LingerOp(Objecter *o) : linger_id(0), + target(object_t(), object_locator_t(), 0), + snap(CEPH_NOSNAP), poutbl(NULL), pobjver(NULL), + is_watch(false), last_error(0), + register_gen(0), + registered(false), + canceled(false), + on_reg_commit(NULL), + on_notify_finish(NULL), + notify_result_bl(NULL), + notify_id(0), + watch_context(NULL), + session(NULL), + objecter(o), + ctx_budget(-1), + register_tid(0), + ping_tid(0), + map_dne_bound(0) {} + + const LingerOp &operator=(const LingerOp& r) = delete; + LingerOp(const LingerOp& o) = delete; + + uint64_t get_cookie() { + return reinterpret_cast<uint64_t>(this); + } + + private: + ~LingerOp() override { + delete watch_context; + } + }; + + struct C_Linger_Commit : public Context { + Objecter *objecter; + LingerOp *info; + bufferlist outbl; // used for notify only + C_Linger_Commit(Objecter *o, LingerOp *l) : objecter(o), info(l) { + info->get(); + } + ~C_Linger_Commit() override { + info->put(); + } + void finish(int r) override { + objecter->_linger_commit(info, r, outbl); + } + }; + + struct C_Linger_Reconnect : public Context { + Objecter *objecter; + LingerOp *info; + C_Linger_Reconnect(Objecter *o, LingerOp *l) : objecter(o), info(l) { + info->get(); + } + ~C_Linger_Reconnect() override { + 
info->put(); + } + void finish(int r) override { + objecter->_linger_reconnect(info, r); + } + }; + + struct C_Linger_Ping : public Context { + Objecter *objecter; + LingerOp *info; + ceph::coarse_mono_time sent; + uint32_t register_gen; + C_Linger_Ping(Objecter *o, LingerOp *l) + : objecter(o), info(l), register_gen(info->register_gen) { + info->get(); + } + ~C_Linger_Ping() override { + info->put(); + } + void finish(int r) override { + objecter->_linger_ping(info, r, sent, register_gen); + } + }; + + struct C_Linger_Map_Latest : public Context { + Objecter *objecter; + uint64_t linger_id; + version_t latest; + C_Linger_Map_Latest(Objecter *o, uint64_t id) : + objecter(o), linger_id(id), latest(0) {} + void finish(int r) override; + }; + + // -- osd sessions -- + struct OSDBackoff { + spg_t pgid; + uint64_t id; + hobject_t begin, end; + }; + + struct OSDSession : public RefCountedObject { + std::shared_mutex lock; + using lock_guard = std::lock_guard<decltype(lock)>; + using unique_lock = std::unique_lock<decltype(lock)>; + using shared_lock = boost::shared_lock<decltype(lock)>; + using shunique_lock = ceph::shunique_lock<decltype(lock)>; + + // pending ops + map<ceph_tid_t,Op*> ops; + map<uint64_t, LingerOp*> linger_ops; + map<ceph_tid_t,CommandOp*> command_ops; + + // backoffs + map<spg_t,map<hobject_t,OSDBackoff>> backoffs; + map<uint64_t,OSDBackoff*> backoffs_by_id; + + int osd; + int incarnation; + ConnectionRef con; + int num_locks; + std::unique_ptr<std::mutex[]> completion_locks; + using unique_completion_lock = std::unique_lock< + decltype(completion_locks)::element_type>; + + + OSDSession(CephContext *cct, int o) : + osd(o), incarnation(0), con(NULL), + num_locks(cct->_conf->objecter_completion_locks_per_session), + completion_locks(new std::mutex[num_locks]) {} + + ~OSDSession() override; + + bool is_homeless() { return (osd == -1); } + + unique_completion_lock get_lock(object_t& oid); + }; + map<int,OSDSession*> osd_sessions; + + bool 
osdmap_full_flag() const; + bool osdmap_pool_full(const int64_t pool_id) const; + + private: + + /** + * Test pg_pool_t::FLAG_FULL on a pool + * + * @return true if the pool exists and has the flag set, or + * the global full flag is set, else false + */ + bool _osdmap_pool_full(const int64_t pool_id) const; + bool _osdmap_pool_full(const pg_pool_t &p) const; + void update_pool_full_map(map<int64_t, bool>& pool_full_map); + + map<uint64_t, LingerOp*> linger_ops; + // we use this just to confirm a cookie is valid before dereferencing the ptr + set<LingerOp*> linger_ops_set; + + map<ceph_tid_t,PoolStatOp*> poolstat_ops; + map<ceph_tid_t,StatfsOp*> statfs_ops; + map<ceph_tid_t,PoolOp*> pool_ops; + std::atomic<unsigned> num_homeless_ops{0}; + + OSDSession *homeless_session; + + // ops waiting for an osdmap with a new pool or confirmation that + // the pool does not exist (may be expanded to other uses later) + map<uint64_t, LingerOp*> check_latest_map_lingers; + map<ceph_tid_t, Op*> check_latest_map_ops; + map<ceph_tid_t, CommandOp*> check_latest_map_commands; + + map<epoch_t,list< pair<Context*, int> > > waiting_for_map; + + ceph::timespan mon_timeout; + ceph::timespan osd_timeout; + + MOSDOp *_prepare_osd_op(Op *op); + void _send_op(Op *op); + void _send_op_account(Op *op); + void _cancel_linger_op(Op *op); + void _finish_op(Op *op, int r); + static bool is_pg_changed( + int oldprimary, + const vector<int>& oldacting, + int newprimary, + const vector<int>& newacting, + bool any_change=false); + enum recalc_op_target_result { + RECALC_OP_TARGET_NO_ACTION = 0, + RECALC_OP_TARGET_NEED_RESEND, + RECALC_OP_TARGET_POOL_DNE, + RECALC_OP_TARGET_OSD_DNE, + RECALC_OP_TARGET_OSD_DOWN, + }; + bool _osdmap_full_flag() const; + bool _osdmap_has_pool_full() const; + void _prune_snapc( + const mempool::osdmap::map<int64_t, OSDMap::snap_interval_set_t>& new_removed_snaps, + Op *op); + + bool target_should_be_paused(op_target_t *op); + int _calc_target(op_target_t *t, Connection *con, 
+ bool any_change = false); + int _map_session(op_target_t *op, OSDSession **s, + shunique_lock& lc); + + void _session_op_assign(OSDSession *s, Op *op); + void _session_op_remove(OSDSession *s, Op *op); + void _session_linger_op_assign(OSDSession *to, LingerOp *op); + void _session_linger_op_remove(OSDSession *from, LingerOp *op); + void _session_command_op_assign(OSDSession *to, CommandOp *op); + void _session_command_op_remove(OSDSession *from, CommandOp *op); + + int _assign_op_target_session(Op *op, shunique_lock& lc, + bool src_session_locked, + bool dst_session_locked); + int _recalc_linger_op_target(LingerOp *op, shunique_lock& lc); + + void _linger_submit(LingerOp *info, shunique_lock& sul); + void _send_linger(LingerOp *info, shunique_lock& sul); + void _linger_commit(LingerOp *info, int r, bufferlist& outbl); + void _linger_reconnect(LingerOp *info, int r); + void _send_linger_ping(LingerOp *info); + void _linger_ping(LingerOp *info, int r, ceph::coarse_mono_time sent, + uint32_t register_gen); + int _normalize_watch_error(int r); + + friend class C_DoWatchError; +public: + void linger_callback_flush(Context *ctx) { + finisher->queue(ctx); + } + +private: + void _check_op_pool_dne(Op *op, unique_lock *sl); + void _send_op_map_check(Op *op); + void _op_cancel_map_check(Op *op); + void _check_linger_pool_dne(LingerOp *op, bool *need_unregister); + void _send_linger_map_check(LingerOp *op); + void _linger_cancel_map_check(LingerOp *op); + void _check_command_map_dne(CommandOp *op); + void _send_command_map_check(CommandOp *op); + void _command_cancel_map_check(CommandOp *op); + + void _kick_requests(OSDSession *session, map<uint64_t, LingerOp *>& lresend); + void _linger_ops_resend(map<uint64_t, LingerOp *>& lresend, unique_lock& ul); + + int _get_session(int osd, OSDSession **session, shunique_lock& sul); + void put_session(OSDSession *s); + void get_session(OSDSession *s); + void _reopen_session(OSDSession *session); + void close_session(OSDSession 
*session); + + void _nlist_reply(NListContext *list_context, int r, Context *final_finish, + epoch_t reply_epoch); + + void resend_mon_ops(); + + /** + * handle a budget for in-flight ops + * budget is taken whenever an op goes into the ops map + * and returned whenever an op is removed from the map + * If throttle_op needs to throttle it will unlock client_lock. + */ + int calc_op_budget(const vector<OSDOp>& ops); + void _throttle_op(Op *op, shunique_lock& sul, int op_size = 0); + int _take_op_budget(Op *op, shunique_lock& sul) { + ceph_assert(sul && sul.mutex() == &rwlock); + int op_budget = calc_op_budget(op->ops); + if (keep_balanced_budget) { + _throttle_op(op, sul, op_budget); + } else { // update take_linger_budget to match this! + op_throttle_bytes.take(op_budget); + op_throttle_ops.take(1); + } + op->budget = op_budget; + return op_budget; + } + int take_linger_budget(LingerOp *info); + friend class WatchContext; // to invoke put_op_budget_bytes + void put_op_budget_bytes(int op_budget) { + ceph_assert(op_budget >= 0); + op_throttle_bytes.put(op_budget); + op_throttle_ops.put(1); + } + void put_nlist_context_budget(NListContext *list_context); + Throttle op_throttle_bytes, op_throttle_ops; + + public: + Objecter(CephContext *cct_, Messenger *m, MonClient *mc, + Finisher *fin, + double mon_timeout, + double osd_timeout) : + Dispatcher(cct_), messenger(m), monc(mc), finisher(fin), + trace_endpoint("0.0.0.0", 0, "Objecter"), + osdmap(new OSDMap), + max_linger_id(0), + keep_balanced_budget(false), honor_osdmap_full(true), osdmap_full_try(false), + blacklist_events_enabled(false), + last_seen_osdmap_version(0), last_seen_pgmap_version(0), + logger(NULL), tick_event(0), m_request_state_hook(NULL), + homeless_session(new OSDSession(cct, -1)), + mon_timeout(ceph::make_timespan(mon_timeout)), + osd_timeout(ceph::make_timespan(osd_timeout)), + op_throttle_bytes(cct, "objecter_bytes", + cct->_conf->objecter_inflight_op_bytes), + op_throttle_ops(cct, "objecter_ops", 
cct->_conf->objecter_inflight_ops), + epoch_barrier(0), + retry_writes_after_first_reply(cct->_conf->objecter_retry_writes_after_first_reply) + { } + ~Objecter() override; + + void init(); + void start(const OSDMap *o = nullptr); + void shutdown(); + + // These two templates replace osdmap_(get)|(put)_read. Simply wrap + // whatever functionality you want to use the OSDMap in a lambda like: + // + // with_osdmap([](const OSDMap& o) { o.do_stuff(); }); + // + // or + // + // auto t = with_osdmap([&](const OSDMap& o) { return o.lookup_stuff(x); }); + // + // Do not call into something that will try to lock the OSDMap from + // here or you will have great woe and misery. + + template<typename Callback, typename...Args> + auto with_osdmap(Callback&& cb, Args&&... args) const -> + decltype(cb(*osdmap, std::forward<Args>(args)...)) { + shared_lock l(rwlock); + return std::forward<Callback>(cb)(*osdmap, std::forward<Args>(args)...); + } + + + /** + * Tell the objecter to throttle outgoing ops according to its + * budget (in _conf). If you do this, ops can block, in + * which case it will unlock client_lock and sleep until + * incoming messages reduce the used budget low enough for + * the ops to continue going; then it will lock client_lock again. 
+ */ + void set_balanced_budget() { keep_balanced_budget = true; } + void unset_balanced_budget() { keep_balanced_budget = false; } + + void set_honor_osdmap_full() { honor_osdmap_full = true; } + void unset_honor_osdmap_full() { honor_osdmap_full = false; } + + void set_osdmap_full_try() { osdmap_full_try = true; } + void unset_osdmap_full_try() { osdmap_full_try = false; } + + void _scan_requests( + OSDSession *s, + bool skipped_map, + bool cluster_full, + map<int64_t, bool> *pool_full_map, + map<ceph_tid_t, Op*>& need_resend, + list<LingerOp*>& need_resend_linger, + map<ceph_tid_t, CommandOp*>& need_resend_command, + shunique_lock& sul, + const mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps); + + int64_t get_object_hash_position(int64_t pool, const string& key, + const string& ns); + int64_t get_object_pg_hash_position(int64_t pool, const string& key, + const string& ns); + + // messages + public: + bool ms_dispatch(Message *m) override; + bool ms_can_fast_dispatch_any() const override { + return true; + } + bool ms_can_fast_dispatch(const Message *m) const override { + switch (m->get_type()) { + case CEPH_MSG_OSD_OPREPLY: + case CEPH_MSG_WATCH_NOTIFY: + return true; + default: + return false; + } + } + void ms_fast_dispatch(Message *m) override { + if (!ms_dispatch(m)) { + m->put(); + } + } + + void handle_osd_op_reply(class MOSDOpReply *m); + void handle_osd_backoff(class MOSDBackoff *m); + void handle_watch_notify(class MWatchNotify *m); + void handle_osd_map(class MOSDMap *m); + void wait_for_osd_map(); + + /** + * Get list of entities blacklisted since this was last called, + * and reset the list. + * + * Uses a std::set because typical use case is to compare some + * other list of clients to see which overlap with the blacklisted + * addrs. 
+ * + */ + void consume_blacklist_events(std::set<entity_addr_t> *events); + + int pool_snap_by_name(int64_t poolid, + const char *snap_name, + snapid_t *snap) const; + int pool_snap_get_info(int64_t poolid, snapid_t snap, + pool_snap_info_t *info) const; + int pool_snap_list(int64_t poolid, vector<uint64_t> *snaps); +private: + + void emit_blacklist_events(const OSDMap::Incremental &inc); + void emit_blacklist_events(const OSDMap &old_osd_map, + const OSDMap &new_osd_map); + + // low-level + void _op_submit(Op *op, shunique_lock& lc, ceph_tid_t *ptid); + void _op_submit_with_budget(Op *op, shunique_lock& lc, + ceph_tid_t *ptid, + int *ctx_budget = NULL); + // public interface +public: + void op_submit(Op *op, ceph_tid_t *ptid = NULL, int *ctx_budget = NULL); + bool is_active() { + shared_lock l(rwlock); + return !((!inflight_ops) && linger_ops.empty() && + poolstat_ops.empty() && statfs_ops.empty()); + } + + /** + * Output in-flight requests + */ + void _dump_active(OSDSession *s); + void _dump_active(); + void dump_active(); + void dump_requests(Formatter *fmt); + void _dump_ops(const OSDSession *s, Formatter *fmt); + void dump_ops(Formatter *fmt); + void _dump_linger_ops(const OSDSession *s, Formatter *fmt); + void dump_linger_ops(Formatter *fmt); + void _dump_command_ops(const OSDSession *s, Formatter *fmt); + void dump_command_ops(Formatter *fmt); + void dump_pool_ops(Formatter *fmt) const; + void dump_pool_stat_ops(Formatter *fmt) const; + void dump_statfs_ops(Formatter *fmt) const; + + int get_client_incarnation() const { return client_inc; } + void set_client_incarnation(int inc) { client_inc = inc; } + + bool have_map(epoch_t epoch); + /// wait for epoch; true if we already have it + bool wait_for_map(epoch_t epoch, Context *c, int err=0); + void _wait_for_new_map(Context *c, epoch_t epoch, int err=0); + void wait_for_latest_osdmap(Context *fin); + void get_latest_version(epoch_t oldest, epoch_t neweset, Context *fin); + + /** Get the current set of global 
op flags */ + int get_global_op_flags() const { return global_op_flags; } + /** Add a flag to the global op flags, not really atomic operation */ + void add_global_op_flags(int flag) { + global_op_flags.fetch_or(flag); + } + /** Clear the passed flags from the global op flag set */ + void clear_global_op_flag(int flags) { + global_op_flags.fetch_and(~flags); + } + + /// cancel an in-progress request with the given return code +private: + int op_cancel(OSDSession *s, ceph_tid_t tid, int r); + int _op_cancel(ceph_tid_t tid, int r); +public: + int op_cancel(ceph_tid_t tid, int r); + int op_cancel(const vector<ceph_tid_t>& tidls, int r); + + /** + * Any write op which is in progress at the start of this call shall no + * longer be in progress when this call ends. Operations started after the + * start of this call may still be in progress when this call ends. + * + * @return the latest possible epoch in which a cancelled op could have + * existed, or -1 if nothing was cancelled. + */ + epoch_t op_cancel_writes(int r, int64_t pool=-1); + + // commands + void osd_command(int osd, const std::vector<string>& cmd, + const bufferlist& inbl, ceph_tid_t *ptid, + bufferlist *poutbl, string *prs, Context *onfinish) { + ceph_assert(osd >= 0); + CommandOp *c = new CommandOp( + osd, + cmd, + inbl, + poutbl, + prs, + onfinish); + submit_command(c, ptid); + } + void pg_command(pg_t pgid, const vector<string>& cmd, + const bufferlist& inbl, ceph_tid_t *ptid, + bufferlist *poutbl, string *prs, Context *onfinish) { + CommandOp *c = new CommandOp( + pgid, + cmd, + inbl, + poutbl, + prs, + onfinish); + submit_command(c, ptid); + } + + // mid-level helpers + Op *prepare_mutate_op( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, const SnapContext& snapc, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + osd_reqid_t reqid = osd_reqid_t(), + ZTracer::Trace *parent_trace = nullptr) { + Op *o = new Op(oid, oloc, op.ops, flags | 
global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver, nullptr, parent_trace); + o->priority = op.priority; + o->mtime = mtime; + o->snapc = snapc; + o->out_rval.swap(op.out_rval); + o->reqid = reqid; + return o; + } + ceph_tid_t mutate( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, const SnapContext& snapc, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + osd_reqid_t reqid = osd_reqid_t()) { + Op *o = prepare_mutate_op(oid, oloc, op, snapc, mtime, flags, + oncommit, objver, reqid); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_read_op( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, + snapid_t snapid, bufferlist *pbl, int flags, + Context *onack, version_t *objver = NULL, + int *data_offset = NULL, + uint64_t features = 0, + ZTracer::Trace *parent_trace = nullptr) { + Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onack, objver, data_offset, parent_trace); + o->priority = op.priority; + o->snapid = snapid; + o->outbl = pbl; + if (!o->outbl && op.size() == 1 && op.out_bl[0]->length()) + o->outbl = op.out_bl[0]; + o->out_bl.swap(op.out_bl); + o->out_handler.swap(op.out_handler); + o->out_rval.swap(op.out_rval); + return o; + } + ceph_tid_t read( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, + snapid_t snapid, bufferlist *pbl, int flags, + Context *onack, version_t *objver = NULL, + int *data_offset = NULL, + uint64_t features = 0) { + Op *o = prepare_read_op(oid, oloc, op, snapid, pbl, flags, onack, objver, + data_offset); + if (features) + o->features = features; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_pg_read_op( + uint32_t hash, object_locator_t oloc, + ObjectOperation& op, bufferlist *pbl, int flags, + Context *onack, epoch_t *reply_epoch, + int *ctx_budget) { + Op *o = new Op(object_t(), oloc, + op.ops, + flags | global_op_flags | 
CEPH_OSD_FLAG_READ | + CEPH_OSD_FLAG_IGNORE_OVERLAY, + onack, NULL); + o->target.precalc_pgid = true; + o->target.base_pgid = pg_t(hash, oloc.pool); + o->priority = op.priority; + o->snapid = CEPH_NOSNAP; + o->outbl = pbl; + o->out_bl.swap(op.out_bl); + o->out_handler.swap(op.out_handler); + o->out_rval.swap(op.out_rval); + o->reply_epoch = reply_epoch; + if (ctx_budget) { + // budget is tracked by listing context + o->ctx_budgeted = true; + } + return o; + } + ceph_tid_t pg_read( + uint32_t hash, object_locator_t oloc, + ObjectOperation& op, bufferlist *pbl, int flags, + Context *onack, epoch_t *reply_epoch, + int *ctx_budget) { + Op *o = prepare_pg_read_op(hash, oloc, op, pbl, flags, + onack, reply_epoch, ctx_budget); + ceph_tid_t tid; + op_submit(o, &tid, ctx_budget); + return tid; + } + + // caller owns a ref + LingerOp *linger_register(const object_t& oid, const object_locator_t& oloc, + int flags); + ceph_tid_t linger_watch(LingerOp *info, + ObjectOperation& op, + const SnapContext& snapc, ceph::real_time mtime, + bufferlist& inbl, + Context *onfinish, + version_t *objver); + ceph_tid_t linger_notify(LingerOp *info, + ObjectOperation& op, + snapid_t snap, bufferlist& inbl, + bufferlist *poutbl, + Context *onack, + version_t *objver); + int linger_check(LingerOp *info); + void linger_cancel(LingerOp *info); // releases a reference + void _linger_cancel(LingerOp *info); + + void _do_watch_notify(LingerOp *info, MWatchNotify *m); + + /** + * set up initial ops in the op vector, and allocate a final op slot. + * + * The caller is responsible for filling in the final ops_count ops. 
+ * + * @param ops op vector + * @param ops_count number of final ops the caller will fill in + * @param extra_ops pointer to [array of] initial op[s] + * @return index of final op (for caller to fill in) + */ + int init_ops(vector<OSDOp>& ops, int ops_count, ObjectOperation *extra_ops) { + int i; + int extra = 0; + + if (extra_ops) + extra = extra_ops->ops.size(); + + ops.resize(ops_count + extra); + + for (i=0; i<extra; i++) { + ops[i] = extra_ops->ops[i]; + } + + return i; + } + + + // high-level helpers + Op *prepare_stat_op( + const object_t& oid, const object_locator_t& oloc, + snapid_t snap, uint64_t *psize, ceph::real_time *pmtime, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_STAT; + C_Stat *fin = new C_Stat(psize, pmtime, onfinish); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, fin, objver); + o->snapid = snap; + o->outbl = &fin->bl; + return o; + } + ceph_tid_t stat( + const object_t& oid, const object_locator_t& oloc, + snapid_t snap, uint64_t *psize, ceph::real_time *pmtime, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + Op *o = prepare_stat_op(oid, oloc, snap, psize, pmtime, flags, + onfinish, objver, extra_ops); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + Op *prepare_read_op( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0, + ZTracer::Trace *parent_trace = nullptr) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_READ; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + ops[i].op.flags = op_flags; + Op 
*o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver, nullptr, parent_trace); + o->snapid = snap; + o->outbl = pbl; + return o; + } + ceph_tid_t read( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + Op *o = prepare_read_op(oid, oloc, off, len, snap, pbl, flags, + onfinish, objver, extra_ops, op_flags); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + Op *prepare_cmpext_op( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, bufferlist &cmp_bl, + snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_CMPEXT; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = cmp_bl.length(); + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + ops[i].indata = cmp_bl; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + return o; + } + + ceph_tid_t cmpext( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, bufferlist &cmp_bl, + snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + Op *o = prepare_cmpext_op(oid, oloc, off, cmp_bl, snap, + flags, onfinish, objver, extra_ops, op_flags); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + ceph_tid_t read_trunc(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, + bufferlist *pbl, int flags, uint64_t trunc_size, + __u32 trunc_seq, Context *onfinish, + version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + vector<OSDOp> ops; + 
int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_READ; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = trunc_size; + ops[i].op.extent.truncate_seq = trunc_seq; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + o->outbl = pbl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t mapext(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_MAPEXT; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + o->outbl = pbl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t getxattr(const object_t& oid, const object_locator_t& oloc, + const char *name, snapid_t snap, bufferlist *pbl, int flags, + Context *onfinish, + version_t *objver = NULL, ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_GETXATTR; + ops[i].op.xattr.name_len = (name ? 
strlen(name) : 0); + ops[i].op.xattr.value_len = 0; + if (name) + ops[i].indata.append(name, ops[i].op.xattr.name_len); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + o->outbl = pbl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + ceph_tid_t getxattrs(const object_t& oid, const object_locator_t& oloc, + snapid_t snap, map<string,bufferlist>& attrset, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_GETXATTRS; + C_GetAttrs *fin = new C_GetAttrs(attrset, onfinish); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, fin, objver); + o->snapid = snap; + o->outbl = &fin->bl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + ceph_tid_t read_full(const object_t& oid, const object_locator_t& oloc, + snapid_t snap, bufferlist *pbl, int flags, + Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + return read(oid, oloc, 0, 0, snap, pbl, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver, extra_ops); + } + + + // writes + ceph_tid_t _modify(const object_t& oid, const object_locator_t& oloc, + vector<OSDOp>& ops, ceph::real_time mtime, + const SnapContext& snapc, int flags, + Context *oncommit, + version_t *objver = NULL) { + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_write_op( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, const SnapContext& snapc, + const bufferlist &bl, ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0, + ZTracer::Trace *parent_trace = nullptr) { + vector<OSDOp> 
ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_WRITE;
    ops[i].op.extent.offset = off;
    ops[i].op.extent.length = len;
    ops[i].op.extent.truncate_size = 0;
    ops[i].op.extent.truncate_seq = 0;
    ops[i].indata = bl;
    ops[i].op.flags = op_flags;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver,
                   nullptr, parent_trace);
    o->mtime = mtime;
    o->snapc = snapc;
    return o;
  }
  // Build a write op with prepare_write_op() and hand it to op_submit().
  // Returns the tid that op_submit() stored through its out-parameter.
  ceph_tid_t write(
    const object_t& oid, const object_locator_t& oloc,
    uint64_t off, uint64_t len, const SnapContext& snapc,
    const bufferlist &bl, ceph::real_time mtime, int flags,
    Context *oncommit, version_t *objver = NULL,
    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
    Op *o = prepare_write_op(oid, oloc, off, len, snapc, bl, mtime, flags,
                             oncommit, objver, extra_ops, op_flags);
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Allocate (but do not submit) an Op carrying a single CEPH_OSD_OP_APPEND
  // of `len` bytes from `bl`.  The extent offset and both truncate fields are
  // zeroed; the Op's mtime and snap context are taken from the arguments.
  // NOTE(review): init_ops() presumably sizes `ops` and returns the slot
  // index reserved for this op (after any extra_ops) -- confirm.
  Op *prepare_append_op(
    const object_t& oid, const object_locator_t& oloc,
    uint64_t len, const SnapContext& snapc,
    const bufferlist &bl, ceph::real_time mtime, int flags,
    Context *oncommit,
    version_t *objver = NULL,
    ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_APPEND;
    ops[i].op.extent.offset = 0;
    ops[i].op.extent.length = len;
    ops[i].op.extent.truncate_size = 0;
    ops[i].op.extent.truncate_seq = 0;
    ops[i].indata = bl;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    return o;
  }
  // prepare_append_op() + op_submit(); returns the submitted op's tid.
  ceph_tid_t append(
    const object_t& oid, const object_locator_t& oloc,
    uint64_t len, const SnapContext& snapc,
    const bufferlist &bl, ceph::real_time mtime, int flags,
    Context *oncommit,
    version_t *objver = NULL,
    ObjectOperation *extra_ops = NULL) {
    Op *o = prepare_append_op(oid, oloc, len, snapc, bl, mtime, flags,
                              oncommit, objver, extra_ops);
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Build and submit a CEPH_OSD_OP_WRITE whose extent also carries the
  // caller-supplied truncate_size / truncate_seq.  Returns the tid.
  ceph_tid_t write_trunc(const object_t& oid, const object_locator_t& oloc,
                         uint64_t off, uint64_t len, const SnapContext& snapc,
                         const bufferlist &bl, ceph::real_time mtime, int flags,
                         uint64_t trunc_size, __u32 trunc_seq,
                         Context *oncommit,
                         version_t *objver = NULL,
                         ObjectOperation *extra_ops = NULL, int op_flags = 0) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_WRITE;
    ops[i].op.extent.offset = off;
    ops[i].op.extent.length = len;
    ops[i].op.extent.truncate_size = trunc_size;
    ops[i].op.extent.truncate_seq = trunc_seq;
    ops[i].indata = bl;
    ops[i].op.flags = op_flags;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Allocate (but do not submit) an Op carrying CEPH_OSD_OP_WRITEFULL.
  // The extent is [0, bl.length()); the payload is `bl`.
  Op *prepare_write_full_op(
    const object_t& oid, const object_locator_t& oloc,
    const SnapContext& snapc, const bufferlist &bl,
    ceph::real_time mtime, int flags,
    Context *oncommit, version_t *objver = NULL,
    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_WRITEFULL;
    ops[i].op.extent.offset = 0;
    ops[i].op.extent.length = bl.length();
    ops[i].indata = bl;
    ops[i].op.flags = op_flags;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    return o;
  }
  // prepare_write_full_op() + op_submit(); returns the submitted op's tid.
  ceph_tid_t write_full(
    const object_t& oid, const object_locator_t& oloc,
    const SnapContext& snapc, const bufferlist &bl,
    ceph::real_time mtime, int flags,
    Context *oncommit, version_t *objver = NULL,
    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
    Op *o = prepare_write_full_op(oid, oloc, snapc, bl, mtime, flags,
                                  oncommit, objver, extra_ops, op_flags);
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Allocate (but do not submit) an Op carrying CEPH_OSD_OP_WRITESAME.
  // writesame.length is `write_len`, writesame.data_length is bl.length()
  // and the payload is `bl`.  Note the parameter order: write_len precedes off.
  Op *prepare_writesame_op(
    const object_t& oid, const object_locator_t& oloc,
    uint64_t write_len, uint64_t off,
    const SnapContext& snapc, const bufferlist &bl,
    ceph::real_time mtime, int flags,
    Context *oncommit, version_t *objver = NULL,
    ObjectOperation *extra_ops = NULL, int op_flags = 0) {

    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_WRITESAME;
    ops[i].op.writesame.offset = off;
    ops[i].op.writesame.length = write_len;
    ops[i].op.writesame.data_length = bl.length();
    ops[i].indata = bl;
    ops[i].op.flags = op_flags;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    return o;
  }
  // prepare_writesame_op() + op_submit(); returns the submitted op's tid.
  ceph_tid_t writesame(
    const object_t& oid, const object_locator_t& oloc,
    uint64_t write_len, uint64_t off,
    const SnapContext& snapc, const bufferlist &bl,
    ceph::real_time mtime, int flags,
    Context *oncommit, version_t *objver = NULL,
    ObjectOperation *extra_ops = NULL, int op_flags = 0) {

    Op *o = prepare_writesame_op(oid, oloc, write_len, off, snapc, bl,
                                 mtime, flags, oncommit, objver,
                                 extra_ops, op_flags);

    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Build and submit a CEPH_OSD_OP_TRUNCATE.  trunc_size is stored both as
  // the extent offset and as extent.truncate_size.  Returns the tid.
  ceph_tid_t trunc(const object_t& oid, const object_locator_t& oloc,
                   const SnapContext& snapc, ceph::real_time mtime, int flags,
                   uint64_t trunc_size, __u32 trunc_seq,
                   Context *oncommit, version_t *objver = NULL,
                   ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_TRUNCATE;
    ops[i].op.extent.offset = trunc_size;
    ops[i].op.extent.truncate_size = trunc_size;
    ops[i].op.extent.truncate_seq = trunc_seq;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Build and submit a CEPH_OSD_OP_ZERO over [off, off+len).  Returns the tid.
  ceph_tid_t zero(const object_t& oid, const object_locator_t& oloc,
                  uint64_t off, uint64_t len, const SnapContext& snapc,
                  ceph::real_time mtime, int flags, Context *oncommit,
                  version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_ZERO;
    ops[i].op.extent.offset = off;
    ops[i].op.extent.length = len;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Build and submit a CEPH_OSD_OP_ROLLBACK to `snapid`.  Returns the tid.
  // NOTE(review): this helper takes no `flags` argument and passes only
  // CEPH_OSD_FLAG_WRITE -- unlike its siblings, global_op_flags is not OR-ed
  // in.  Confirm that is intentional.
  ceph_tid_t rollback_object(const object_t& oid, const object_locator_t& oloc,
                             const SnapContext& snapc, snapid_t snapid,
                             ceph::real_time mtime, Context *oncommit,
                             version_t *objver = NULL,
                             ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_ROLLBACK;
    ops[i].op.snap.snapid = snapid;
    Op *o = new Op(oid, oloc, ops, CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Build and submit a CEPH_OSD_OP_CREATE.  `create_flags` goes into the
  // per-op flags field; `global_flags` is OR-ed into the Op-level flags.
  ceph_tid_t create(const object_t& oid, const object_locator_t& oloc,
                    const SnapContext& snapc, ceph::real_time mtime, int global_flags,
                    int create_flags, Context *oncommit,
                    version_t *objver = NULL,
                    ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_CREATE;
    ops[i].op.flags = create_flags;
    Op *o = new Op(oid, oloc, ops, global_flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Allocate (but do not submit) an Op carrying CEPH_OSD_OP_DELETE.
  Op *prepare_remove_op(
    const object_t& oid, const object_locator_t& oloc,
    const SnapContext& snapc, ceph::real_time mtime, int flags,
    Context *oncommit,
    version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_DELETE;
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    return o;
  }
  // prepare_remove_op() + op_submit(); returns the submitted op's tid.
  ceph_tid_t remove(
    const object_t& oid, const object_locator_t& oloc,
    const SnapContext& snapc, ceph::real_time mtime, int flags,
    Context *oncommit,
    version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
    Op *o = prepare_remove_op(oid, oloc, snapc, mtime, flags,
                              oncommit, objver, extra_ops);
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }

  // Build and submit a CEPH_OSD_OP_SETXATTR.  The payload is the attribute
  // name (if non-null) immediately followed by the value `bl`; name_len and
  // value_len record the two lengths.  A null `name` yields name_len 0.
  ceph_tid_t setxattr(const object_t& oid, const object_locator_t& oloc,
                      const char *name, const SnapContext& snapc, const bufferlist &bl,
                      ceph::real_time mtime, int flags,
                      Context *oncommit,
                      version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_SETXATTR;
    ops[i].op.xattr.name_len = (name ? strlen(name) : 0);
    ops[i].op.xattr.value_len = bl.length();
    if (name)
      ops[i].indata.append(name, ops[i].op.xattr.name_len);
    ops[i].indata.append(bl);
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
  // Build and submit a CEPH_OSD_OP_RMXATTR.  The payload is just the
  // attribute name (if non-null); value_len is always 0.
  ceph_tid_t removexattr(const object_t& oid, const object_locator_t& oloc,
                         const char *name, const SnapContext& snapc,
                         ceph::real_time mtime, int flags,
                         Context *oncommit,
                         version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
    vector<OSDOp> ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_RMXATTR;
    ops[i].op.xattr.name_len = (name ? strlen(name) : 0);
    ops[i].op.xattr.value_len = 0;
    if (name)
      ops[i].indata.append(name, ops[i].op.xattr.name_len);
    Op *o = new Op(oid, oloc, ops, flags | global_op_flags |
                   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }

  // Object listing (declarations only; defined out of line).
  void list_nobjects(NListContext *p, Context *onfinish);
  uint32_t list_nobjects_seek(NListContext *p, uint32_t pos);
  uint32_t list_nobjects_seek(NListContext *list_context, const hobject_t& c);
  void list_nobjects_get_cursor(NListContext *list_context, hobject_t *c);

  // Object enumeration between two hobject_t cursors (declarations only).
  hobject_t enumerate_objects_begin();
  hobject_t enumerate_objects_end();
  //hobject_t enumerate_objects_begin(int n, int m);
  void enumerate_objects(
    int64_t pool_id,
    const std::string &ns,
    const hobject_t &start,
    const hobject_t &end,
    const uint32_t max,
    const bufferlist &filter_bl,
    std::list<librados::ListObjectImpl> *result,
    hobject_t *next,
    Context *on_finish);

  void _enumerate_reply(
      bufferlist &bl,
      int r,
      const hobject_t &end,
      const int64_t pool_id,
      int budget,
      epoch_t reply_epoch,
      std::list<librados::ListObjectImpl> *result,
      hobject_t *next,
      Context *on_finish);
  friend class C_EnumerateReply;

  // -------------------------
  // pool ops
private:
  void pool_op_submit(PoolOp *op);
  void _pool_op_submit(PoolOp *op);
  void _finish_pool_op(PoolOp *op, int r);
  void _do_delete_pool(int64_t pool, Context *onfinish);
public:
  int create_pool_snap(int64_t pool, string& snapName, Context *onfinish);
  int allocate_selfmanaged_snap(int64_t pool, snapid_t *psnapid,
                                Context *onfinish);
  int delete_pool_snap(int64_t pool, string& snapName, Context *onfinish);
  int delete_selfmanaged_snap(int64_t pool, snapid_t snap, Context *onfinish);

  int create_pool(string& name, Context *onfinish,
                  int crush_rule=-1);
  int delete_pool(int64_t pool, Context *onfinish);
  int delete_pool(const string& name, Context *onfinish);

  void
handle_pool_op_reply(MPoolOpReply *m);
  int pool_op_cancel(ceph_tid_t tid, int r);

  // --------------------------
  // pool stats
private:
  void _poolstat_submit(PoolStatOp *op);
public:
  void handle_get_pool_stats_reply(MGetPoolStatsReply *m);
  void get_pool_stats(list<string>& pools, map<string,pool_stat_t> *result,
                      bool *per_pool,
                      Context *onfinish);
  int pool_stat_op_cancel(ceph_tid_t tid, int r);
  void _finish_pool_stat_op(PoolStatOp *op, int r);

  // ---------------------------
  // df stats
private:
  void _fs_stats_submit(StatfsOp *op);
public:
  void handle_fs_stats_reply(MStatfsReply *m);
  void get_fs_stats(struct ceph_statfs& result, boost::optional<int64_t> poolid,
                    Context *onfinish);
  int statfs_op_cancel(ceph_tid_t tid, int r);
  void _finish_statfs_op(StatfsOp *op, int r);

  // ---------------------------
  // some scatter/gather hackery

  void _sg_read_finish(vector<ObjectExtent>& extents,
                       vector<bufferlist>& resultbl,
                       bufferlist *bl, Context *onfinish);

  // Completion context for a multi-extent sg_read_trunc().  It takes over the
  // caller's extent list and per-extent result buffers (both are swapped out
  // of the vectors passed to the constructor, leaving those with whatever the
  // members held before) and forwards them to _sg_read_finish() when fired.
  // The completion code `r` is not forwarded.
  struct C_SGRead : public Context {
    Objecter *objecter;
    vector<ObjectExtent> extents;
    vector<bufferlist> resultbl;
    bufferlist *bl;
    Context *onfinish;
    C_SGRead(Objecter *ob,
             vector<ObjectExtent>& e, vector<bufferlist>& r, bufferlist *b,
             Context *c) :
      objecter(ob), bl(b), onfinish(c) {
      extents.swap(e);
      resultbl.swap(r);
    }
    void finish(int r) override {
      objecter->_sg_read_finish(extents, resultbl, bl, onfinish);
    }
  };

  // Scatter/gather read.
  //  - exactly one extent: issue a single read_trunc() straight into `bl`
  //    with `onfinish` as its completion.
  //  - otherwise: issue one read_trunc() per extent into its own slot of a
  //    temporary result vector, each with a gather sub-context, and install
  //    a C_SGRead as the gather finisher.
  // The per-extent truncate_size stored in each ObjectExtent is what gets
  // sent; the `trunc_size` parameter is not read by this function.
  // NOTE(review): `extents` is left swapped-out after the multi-extent path
  // (see C_SGRead); an empty `extents` also takes that path with zero
  // sub-contexts -- confirm C_GatherBuilder handles that as intended.
  void sg_read_trunc(vector<ObjectExtent>& extents, snapid_t snap,
                     bufferlist *bl, int flags, uint64_t trunc_size,
                     __u32 trunc_seq, Context *onfinish, int op_flags = 0) {
    if (extents.size() == 1) {
      read_trunc(extents[0].oid, extents[0].oloc, extents[0].offset,
                 extents[0].length, snap, bl, flags, extents[0].truncate_size,
                 trunc_seq, onfinish, 0, 0, op_flags);
    } else {
      C_GatherBuilder gather(cct);
      vector<bufferlist> resultbl(extents.size());
      int i=0;
      for (vector<ObjectExtent>::iterator p = extents.begin();
           p != extents.end();
           ++p) {
        read_trunc(p->oid, p->oloc, p->offset, p->length, snap, &resultbl[i++],
                   flags, p->truncate_size, trunc_seq, gather.new_sub(),
                   0, 0, op_flags);
      }
      gather.set_finisher(new C_SGRead(this, extents, resultbl, bl, onfinish));
      gather.activate();
    }
  }

  // sg_read_trunc() with trunc_size = 0 and trunc_seq = 0.
  void sg_read(vector<ObjectExtent>& extents, snapid_t snap, bufferlist *bl,
               int flags, Context *onfinish, int op_flags = 0) {
    sg_read_trunc(extents, snap, bl, flags, 0, 0, onfinish, op_flags);
  }

  // Scatter/gather write.
  //  - exactly one extent: write_trunc() is called with the whole of `bl`
  //    (buffer_extents is not consulted on this path).
  //  - otherwise: for every extent the pieces named by its buffer_extents are
  //    copied out of `bl` into a fresh bufferlist, whose length is asserted
  //    to equal the extent length, and written with write_trunc().  A gather
  //    sub-context is created per extent only when `oncommit` is non-null.
  // As in sg_read_trunc(), the `trunc_size` parameter is not read here; the
  // per-extent truncate_size is used.
  void sg_write_trunc(vector<ObjectExtent>& extents, const SnapContext& snapc,
                      const bufferlist& bl, ceph::real_time mtime, int flags,
                      uint64_t trunc_size, __u32 trunc_seq,
                      Context *oncommit, int op_flags = 0) {
    if (extents.size() == 1) {
      write_trunc(extents[0].oid, extents[0].oloc, extents[0].offset,
                  extents[0].length, snapc, bl, mtime, flags,
                  extents[0].truncate_size, trunc_seq, oncommit,
                  0, 0, op_flags);
    } else {
      C_GatherBuilder gcom(cct, oncommit);
      for (vector<ObjectExtent>::iterator p = extents.begin();
           p != extents.end();
           ++p) {
        bufferlist cur;
        for (vector<pair<uint64_t,uint64_t> >::iterator bit
               = p->buffer_extents.begin();
             bit != p->buffer_extents.end();
             ++bit)
          bl.copy(bit->first, bit->second, cur);
        ceph_assert(cur.length() == p->length);
        write_trunc(p->oid, p->oloc, p->offset, p->length,
                    snapc, cur, mtime, flags, p->truncate_size, trunc_seq,
                    oncommit ? gcom.new_sub():0,
                    0, 0, op_flags);
      }
      gcom.activate();
    }
  }

  // sg_write_trunc() with trunc_size = 0 and trunc_seq = 0.
  void sg_write(vector<ObjectExtent>& extents, const SnapContext& snapc,
                const bufferlist& bl, ceph::real_time mtime, int flags,
                Context *oncommit, int op_flags = 0) {
    sg_write_trunc(extents, snapc, bl, mtime, flags, 0, 0, oncommit,
                   op_flags);
  }

  // Messenger callbacks (overrides; defined out of line).
  void ms_handle_connect(Connection *con) override;
  bool ms_handle_reset(Connection *con) override;
  void ms_handle_remote_reset(Connection *con) override;
  bool ms_handle_refused(Connection *con) override;
  bool ms_get_authorizer(int dest_type,
                         AuthAuthorizer **authorizer) override;

  void blacklist_self(bool set);

private:
  epoch_t epoch_barrier;
  bool retry_writes_after_first_reply;
public:
  void set_epoch_barrier(epoch_t epoch);

  // Accessor for the `logger` member (non-owning pointer is returned as-is).
  PerfCounters *get_logger() {
    return logger;
  }
};

#endif
diff --git a/src/osdc/Striper.cc b/src/osdc/Striper.cc new file mode 100644 index 00000000..3286b012 --- /dev/null +++ b/src/osdc/Striper.cc @@ -0,0 +1,411 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2012 Inktank
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
+ * + */ + +#include "Striper.h" + +#include "include/types.h" +#include "include/buffer.h" +#include "osd/OSDMap.h" + +#include "common/config.h" +#include "common/debug.h" + +#define dout_subsys ceph_subsys_striper +#undef dout_prefix +#define dout_prefix *_dout << "striper " + + +void Striper::file_to_extents(CephContext *cct, const char *object_format, + const file_layout_t *layout, + uint64_t offset, uint64_t len, + uint64_t trunc_size, + vector<ObjectExtent>& extents, + uint64_t buffer_offset) +{ + map<object_t,vector<ObjectExtent> > object_extents; + file_to_extents(cct, object_format, layout, offset, len, trunc_size, + object_extents, buffer_offset); + assimilate_extents(object_extents, extents); +} + +void Striper::file_to_extents( + CephContext *cct, const char *object_format, + const file_layout_t *layout, + uint64_t offset, uint64_t len, + uint64_t trunc_size, + map<object_t,vector<ObjectExtent> >& object_extents, + uint64_t buffer_offset) +{ + ldout(cct, 10) << "file_to_extents " << offset << "~" << len + << " format " << object_format + << dendl; + ceph_assert(len > 0); + + /* + * we want only one extent per object! this means that each extent + * we read may map into different bits of the final read + * buffer.. 
hence ObjectExtent.buffer_extents + */ + + __u32 object_size = layout->object_size; + __u32 su = layout->stripe_unit; + __u32 stripe_count = layout->stripe_count; + ceph_assert(object_size >= su); + if (stripe_count == 1) { + ldout(cct, 20) << " sc is one, reset su to os" << dendl; + su = object_size; + } + uint64_t stripes_per_object = object_size / su; + ldout(cct, 20) << " su " << su << " sc " << stripe_count << " os " + << object_size << " stripes_per_object " << stripes_per_object + << dendl; + + uint64_t cur = offset; + uint64_t left = len; + while (left > 0) { + // layout into objects + uint64_t blockno = cur / su; // which block + // which horizontal stripe (Y) + uint64_t stripeno = blockno / stripe_count; + // which object in the object set (X) + uint64_t stripepos = blockno % stripe_count; + // which object set + uint64_t objectsetno = stripeno / stripes_per_object; + // object id + uint64_t objectno = objectsetno * stripe_count + stripepos; + + // find oid, extent + char buf[strlen(object_format) + 32]; + snprintf(buf, sizeof(buf), object_format, (long long unsigned)objectno); + object_t oid = buf; + + // map range into object + uint64_t block_start = (stripeno % stripes_per_object) * su; + uint64_t block_off = cur % su; + uint64_t max = su - block_off; + + uint64_t x_offset = block_start + block_off; + uint64_t x_len; + if (left > max) + x_len = max; + else + x_len = left; + + ldout(cct, 20) << " off " << cur << " blockno " << blockno << " stripeno " + << stripeno << " stripepos " << stripepos << " objectsetno " + << objectsetno << " objectno " << objectno + << " block_start " << block_start << " block_off " + << block_off << " " << x_offset << "~" << x_len + << dendl; + + ObjectExtent *ex = 0; + vector<ObjectExtent>& exv = object_extents[oid]; + if (exv.empty() || exv.back().offset + exv.back().length != x_offset) { + exv.resize(exv.size() + 1); + ex = &exv.back(); + ex->oid = oid; + ex->objectno = objectno; + ex->oloc = 
OSDMap::file_to_object_locator(*layout); + + ex->offset = x_offset; + ex->length = x_len; + ex->truncate_size = object_truncate_size(cct, layout, objectno, + trunc_size); + + ldout(cct, 20) << " added new " << *ex << dendl; + } else { + // add to extent + ex = &exv.back(); + ldout(cct, 20) << " adding in to " << *ex << dendl; + ex->length += x_len; + } + ex->buffer_extents.push_back(make_pair(cur - offset + buffer_offset, + x_len)); + + ldout(cct, 15) << "file_to_extents " << *ex << " in " << ex->oloc + << dendl; + // ldout(cct, 0) << "map: ino " << ino << " oid " << ex.oid << " osd " + // << ex.osd << " offset " << ex.offset << " len " << ex.len + // << " ... left " << left << dendl; + + left -= x_len; + cur += x_len; + } +} + +void Striper::assimilate_extents( + map<object_t,vector<ObjectExtent> >& object_extents, + vector<ObjectExtent>& extents) +{ + // make final list + for (map<object_t, vector<ObjectExtent> >::iterator it + = object_extents.begin(); + it != object_extents.end(); + ++it) { + for (vector<ObjectExtent>::iterator p = it->second.begin(); + p != it->second.end(); + ++p) { + extents.push_back(*p); + } + } +} + +void Striper::extent_to_file(CephContext *cct, file_layout_t *layout, + uint64_t objectno, uint64_t off, uint64_t len, + vector<pair<uint64_t, uint64_t> >& extents) +{ + ldout(cct, 10) << "extent_to_file " << objectno << " " << off << "~" + << len << dendl; + + __u32 object_size = layout->object_size; + __u32 su = layout->stripe_unit; + __u32 stripe_count = layout->stripe_count; + ceph_assert(object_size >= su); + uint64_t stripes_per_object = object_size / su; + ldout(cct, 20) << " stripes_per_object " << stripes_per_object << dendl; + + uint64_t off_in_block = off % su; + + extents.reserve(len / su + 1); + + while (len > 0) { + uint64_t stripepos = objectno % stripe_count; + uint64_t objectsetno = objectno / stripe_count; + uint64_t stripeno = off / su + objectsetno * stripes_per_object; + uint64_t blockno = stripeno * stripe_count + 
stripepos; + uint64_t extent_off = blockno * su + off_in_block; + uint64_t extent_len = std::min(len, su - off_in_block); + extents.push_back(make_pair(extent_off, extent_len)); + + ldout(cct, 20) << " object " << off << "~" << extent_len + << " -> file " << extent_off << "~" << extent_len + << dendl; + + off_in_block = 0; + off += extent_len; + len -= extent_len; + } +} + +uint64_t Striper::object_truncate_size(CephContext *cct, + const file_layout_t *layout, + uint64_t objectno, uint64_t trunc_size) +{ + uint64_t obj_trunc_size; + if (trunc_size == 0 || trunc_size == (uint64_t)-1) { + obj_trunc_size = trunc_size; + } else { + __u32 object_size = layout->object_size; + __u32 su = layout->stripe_unit; + __u32 stripe_count = layout->stripe_count; + ceph_assert(object_size >= su); + uint64_t stripes_per_object = object_size / su; + + uint64_t objectsetno = objectno / stripe_count; + uint64_t trunc_objectsetno = trunc_size / object_size / stripe_count; + if (objectsetno > trunc_objectsetno) + obj_trunc_size = 0; + else if (objectsetno < trunc_objectsetno) + obj_trunc_size = object_size; + else { + uint64_t trunc_blockno = trunc_size / su; + uint64_t trunc_stripeno = trunc_blockno / stripe_count; + uint64_t trunc_stripepos = trunc_blockno % stripe_count; + uint64_t trunc_objectno = trunc_objectsetno * stripe_count + + trunc_stripepos; + if (objectno < trunc_objectno) + obj_trunc_size = ((trunc_stripeno % stripes_per_object) + 1) * su; + else if (objectno > trunc_objectno) + obj_trunc_size = (trunc_stripeno % stripes_per_object) * su; + else + obj_trunc_size = (trunc_stripeno % stripes_per_object) * su + + (trunc_size % su); + } + } + ldout(cct, 20) << "object_truncate_size " << objectno << " " + << trunc_size << "->" << obj_trunc_size << dendl; + return obj_trunc_size; +} + +uint64_t Striper::get_num_objects(const file_layout_t& layout, + uint64_t size) +{ + __u32 stripe_unit = layout.stripe_unit; + __u32 stripe_count = layout.stripe_count; + uint64_t period = 
layout.get_period(); + uint64_t num_periods = (size + period - 1) / period; + uint64_t remainder_bytes = size % period; + uint64_t remainder_objs = 0; + if ((remainder_bytes > 0) && (remainder_bytes < (uint64_t)stripe_count + * stripe_unit)) + remainder_objs = stripe_count - ((remainder_bytes + stripe_unit - 1) + / stripe_unit); + return num_periods * stripe_count - remainder_objs; +} + +// StripedReadResult + +void Striper::StripedReadResult::add_partial_result( + CephContext *cct, bufferlist& bl, + const vector<pair<uint64_t,uint64_t> >& buffer_extents) +{ + ldout(cct, 10) << "add_partial_result(" << this << ") " << bl.length() + << " to " << buffer_extents << dendl; + for (vector<pair<uint64_t,uint64_t> >::const_iterator p + = buffer_extents.begin(); + p != buffer_extents.end(); + ++p) { + pair<bufferlist, uint64_t>& r = partial[p->first]; + size_t actual = std::min<uint64_t>(bl.length(), p->second); + bl.splice(0, actual, &r.first); + r.second = p->second; + total_intended_len += r.second; + } +} + +void Striper::StripedReadResult::add_partial_sparse_result( + CephContext *cct, bufferlist& bl, const map<uint64_t, uint64_t>& bl_map, + uint64_t bl_off, const vector<pair<uint64_t,uint64_t> >& buffer_extents) +{ + ldout(cct, 10) << "add_partial_sparse_result(" << this << ") " << bl.length() + << " covering " << bl_map << " (offset " << bl_off << ")" + << " to " << buffer_extents << dendl; + map<uint64_t, uint64_t>::const_iterator s = bl_map.begin(); + for (vector<pair<uint64_t,uint64_t> >::const_iterator p + = buffer_extents.begin(); + p != buffer_extents.end(); + ++p) { + uint64_t tofs = p->first; + size_t tlen = p->second; + ldout(cct, 30) << " be " << tofs << "~" << tlen << dendl; + while (tlen > 0) { + ldout(cct, 20) << " t " << tofs << "~" << tlen + << " bl has " << bl.length() + << " off " << bl_off + << dendl; + if (s == bl_map.end()) { + ldout(cct, 20) << " s at end" << dendl; + pair<bufferlist, uint64_t>& r = partial[tofs]; + r.second = tlen; + 
total_intended_len += r.second; + break; + } + + ldout(cct, 30) << " s " << s->first << "~" << s->second << dendl; + + // skip zero-length extent + if (s->second == 0) { + ldout(cct, 30) << " s len 0, skipping" << dendl; + ++s; + continue; + } + + if (s->first > bl_off) { + // gap in sparse read result + pair<bufferlist, uint64_t>& r = partial[tofs]; + size_t gap = std::min<size_t>(s->first - bl_off, tlen); + ldout(cct, 20) << " s gap " << gap << ", skipping" << dendl; + r.second = gap; + total_intended_len += r.second; + bl_off += gap; + tofs += gap; + tlen -= gap; + if (tlen == 0) { + continue; + } + } + + ceph_assert(s->first <= bl_off); + size_t left = (s->first + s->second) - bl_off; + size_t actual = std::min(left, tlen); + + if (actual > 0) { + ldout(cct, 20) << " s has " << actual << ", copying" << dendl; + pair<bufferlist, uint64_t>& r = partial[tofs]; + bl.splice(0, actual, &r.first); + r.second = actual; + total_intended_len += r.second; + bl_off += actual; + tofs += actual; + tlen -= actual; + } + if (actual == left) { + ldout(cct, 30) << " s advancing" << dendl; + ++s; + } + } + } +} + +void Striper::StripedReadResult::assemble_result(CephContext *cct, + bufferlist& bl, + bool zero_tail) +{ + ldout(cct, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail + << dendl; + size_t zeros = 0; // zeros preceding current position + for (auto& p : partial) { + size_t got = p.second.first.length(); + size_t expect = p.second.second; + if (got) { + if (zeros) { + bl.append_zero(zeros); + zeros = 0; + } + bl.claim_append(p.second.first); + } + zeros += expect - got; + } + if (zero_tail && zeros) { + bl.append_zero(zeros); + } + partial.clear(); +} + +void Striper::StripedReadResult::assemble_result(CephContext *cct, char *buffer, size_t length) +{ + + ceph_assert(buffer && length == total_intended_len); + + map<uint64_t,pair<bufferlist,uint64_t> >::reverse_iterator p = partial.rbegin(); + if (p == partial.rend()) + return; + + uint64_t curr = length; + 
uint64_t end = p->first + p->second.second; + while (p != partial.rend()) { + // sanity check + ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second + << " " << p->second.first.length() << " bytes" + << dendl; + ceph_assert(p->first == end - p->second.second); + end = p->first; + + size_t len = p->second.first.length(); + ceph_assert(curr >= p->second.second); + curr -= p->second.second; + if (len < p->second.second) { + if (len) + p->second.first.copy(0, len, buffer + curr); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(buffer + curr + len, 0, p->second.second - len); + } else { + p->second.first.copy(0, len, buffer + curr); + } + ++p; + } + partial.clear(); + ceph_assert(curr == 0); +} + diff --git a/src/osdc/Striper.h b/src/osdc/Striper.h new file mode 100644 index 00000000..6d110e95 --- /dev/null +++ b/src/osdc/Striper.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
 *
 */

#ifndef CEPH_STRIPER_H
#define CEPH_STRIPER_H

#include "include/types.h"
#include "osd/osd_types.h"

class CephContext;

//namespace ceph {

  // Static helpers that translate between file byte ranges and per-object
  // byte ranges for a given file_layout_t, plus a helper class for
  // reassembling striped read results.
  class Striper {
  public:
    /*
     * map (ino, layout, offset, len) to a (list of) ObjectExtents (byte
     * ranges in objects on (primary) osds)
     */
    // Variant that returns the extents grouped by object id.
    static void file_to_extents(CephContext *cct, const char *object_format,
                                const file_layout_t *layout,
                                uint64_t offset, uint64_t len,
                                uint64_t trunc_size,
                                map<object_t, vector<ObjectExtent> >& extents,
                                uint64_t buffer_offset=0);

    // Variant that returns the extents as one flat list.
    static void file_to_extents(CephContext *cct, const char *object_format,
                                const file_layout_t *layout,
                                uint64_t offset, uint64_t len,
                                uint64_t trunc_size,
                                vector<ObjectExtent>& extents,
                                uint64_t buffer_offset=0);

    // Variant that derives the object-name format from an inode number:
    // "<ino in hex>.%08llx".  buffer_offset is left at its default of 0.
    static void file_to_extents(CephContext *cct, inodeno_t ino,
                                const file_layout_t *layout,
                                uint64_t offset, uint64_t len,
                                uint64_t trunc_size,
                                vector<ObjectExtent>& extents) {
      // generate prefix/format
      char buf[32];
      snprintf(buf, sizeof(buf), "%llx.%%08llx", (long long unsigned)ino);

      file_to_extents(cct, buf, layout, offset, len, trunc_size, extents);
    }

    // Flatten a per-object extent map into a single extent list.
    static void assimilate_extents(
      map<object_t, vector<ObjectExtent> >& object_extents,
      vector<ObjectExtent>& extents);

    /**
     * reverse map an object extent to file extents
     */
    static void extent_to_file(CephContext *cct, file_layout_t *layout,
                               uint64_t objectno, uint64_t off, uint64_t len,
                               vector<pair<uint64_t, uint64_t> >& extents);

    // Per-object truncate size corresponding to a file truncate size.
    static uint64_t object_truncate_size(
      CephContext *cct, const file_layout_t *layout,
      uint64_t objectno, uint64_t trunc_size);

    // Number of objects needed to store `size` bytes under `layout`.
    static uint64_t get_num_objects(const file_layout_t& layout,
                                    uint64_t size);
    /*
     * helper to assemble a striped result
     */
    class StripedReadResult {
      // offset -> (data, intended length)
      map<uint64_t, pair<bufferlist, uint64_t> > partial;
      uint64_t total_intended_len = 0; //sum of partial.second.second

    public:
      /**
       * add a (non-sparse) read into results
       *
       * @param bl buffer holding the data that was read
       * @param buffer_extents output buffer extents the data maps to
       */
      void add_partial_result(
        CephContext *cct, bufferlist& bl,
        const vector<pair<uint64_t,uint64_t> >& buffer_extents);
      /**
       * add sparse read into results
       *
       * @param bl buffer
       * @param bl_map map of which logical source extents this covers
       * @param bl_off logical buffer offset (e.g., first bl_map key
       *               if the buffer is not sparse)
       * @param buffer_extents output buffer extents the data maps to
       */
      void add_partial_sparse_result(
        CephContext *cct, bufferlist& bl,
        const map<uint64_t, uint64_t>& bl_map, uint64_t bl_off,
        const vector<pair<uint64_t,uint64_t> >& buffer_extents);

      /**
       * assemble the collected pieces into a bufferlist
       *
       * @param bl output bufferlist
       * @param zero_tail whether a trailing shortfall should be zero-filled
       */
      void assemble_result(CephContext *cct, bufferlist& bl, bool zero_tail);

      /**
       * assemble the collected pieces into a flat buffer
       *
       * @param buffer copy read data into buffer
       * @param len the length of buffer
       */
      void assemble_result(CephContext *cct, char *buffer, size_t len);
    };

  };

//};

#endif
diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h new file mode 100644 index 00000000..ef3b7f6e --- /dev/null +++ b/src/osdc/WritebackHandler.h @@ -0,0 +1,57 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef CEPH_OSDC_WRITEBACKHANDLER_H
#define CEPH_OSDC_WRITEBACKHANDLER_H

#include "include/Context.h"
#include "include/types.h"
#include "common/zipkin_trace.h"
#include "osd/osd_types.h"

// Abstract interface for issuing object reads and writes.  read(),
// may_copy_on_write() and the contiguous write() are pure virtual; the
// remaining hooks have default implementations.
// NOTE(review): presumably implemented by the backends that ObjectCacher
// writes back through -- confirm against the implementers.
class WritebackHandler {
 public:
  WritebackHandler() {}
  virtual ~WritebackHandler() {}

  // Read `len` bytes at `off` of object `oid` (as of `snapid`) into `pbl`;
  // `onfinish` is the completion context.
  virtual void read(const object_t& oid, uint64_t object_no,
                    const object_locator_t& oloc, uint64_t off, uint64_t len,
                    snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
                    __u32 trunc_seq, int op_flags,
                    const ZTracer::Trace &parent_trace, Context *onfinish) = 0;
  /**
   * check if a given extent read result may change due to a write
   *
   * Check if the content we see at the given read offset may change
   * due to a write to this object.
   *
   * @param oid object
   * @param read_off read offset
   * @param read_len read length
   * @param snapid read snapid
   */
  virtual bool may_copy_on_write(const object_t& oid, uint64_t read_off,
                                 uint64_t read_len, snapid_t snapid) = 0;
  // Write `bl` to [off, off+len) of object `oid`; `oncommit` is the
  // completion context.  Returns a ceph_tid_t for the write.
  virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
                           uint64_t off, uint64_t len,
                           const SnapContext& snapc,
                           const bufferlist &bl, ceph::real_time mtime,
                           uint64_t trunc_size, __u32 trunc_seq,
                           ceph_tid_t journal_tid,
                           const ZTracer::Trace &parent_trace,
                           Context *oncommit) = 0;

  // Optional hook; the default implementation does nothing.
  virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len,
                                ceph_tid_t original_journal_tid,
                                ceph_tid_t new_journal_tid) {}

  // Whether the vectored write() overload below is supported.  Defaults to
  // false; the default vectored write() does nothing and returns 0.
  virtual bool can_scattered_write() { return false; }
  virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
                           vector<pair<uint64_t, bufferlist> >& io_vec,
                           const SnapContext& snapc, ceph::real_time mtime,
                           uint64_t trunc_size, __u32 trunc_seq,
                           Context *oncommit) {
    return 0;
  }
};

#endif |