diff options
Diffstat (limited to 'src/osd/PrimaryLogPG.cc')
-rw-r--r-- | src/osd/PrimaryLogPG.cc | 15834 |
1 files changed, 15834 insertions, 0 deletions
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc new file mode 100644 index 000000000..243e127eb --- /dev/null +++ b/src/osd/PrimaryLogPG.cc @@ -0,0 +1,15834 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include <errno.h> + +#include <charconv> +#include <sstream> +#include <utility> + +#include <boost/intrusive_ptr.hpp> +#include <boost/tuple/tuple.hpp> + +#include "PrimaryLogPG.h" + +#include "cls/cas/cls_cas_ops.h" +#include "common/CDC.h" +#include "common/EventTrace.h" +#include "common/ceph_crypto.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/scrub_types.h" +#include "include/compat.h" +#include "json_spirit/json_spirit_reader.h" +#include "json_spirit/json_spirit_value.h" +#include "messages/MCommandReply.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGScan.h" +#include "messages/MOSDPGTrim.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDScrubReserve.h" +#include "mon/MonClient.h" +#include "objclass/objclass.h" +#include "osd/ClassHandler.h" +#include "osdc/Objecter.h" +#include "osd/scrubber/PrimaryLogScrub.h" +#include "osd/scrubber/ScrubStore.h" +#include "osd/scrubber/pg_scrubber.h" + +#include "OSD.h" +#include "OpRequest.h" +#include "PG.h" +#include "Session.h" + +// required includes order: +#include "json_spirit/json_spirit_value.h" +#include "json_spirit/json_spirit_reader.h" +#include "include/ceph_assert.h" // json_spirit clobbers it +#include "include/rados/rados_types.hpp" + +#ifdef WITH_LTTNG +#include "tracing/osd.h" +#else +#define tracepoint(...) +#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap() +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +#include "osd_tracer.h" + +MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd); + +using std::less; +using std::list; +using std::ostream; +using std::pair; +using std::make_pair; +using std::make_unique; +using std::map; +using std::ostringstream; +using std::set; +using std::string; +using std::string_view; +using std::stringstream; +using std::unique_ptr; +using std::vector; + +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::Formatter; +using ceph::decode; +using ceph::decode_noclear; +using ceph::encode; +using ceph::encode_destructively; + +using namespace ceph::osd::scheduler; +using TOPNSPC::common::cmd_getval; +using TOPNSPC::common::cmd_getval_or; + +template <typename T> +static ostream& _prefix(std::ostream *_dout, T *pg) { + return pg->gen_prefix(*_dout); +} + +/** + * The CopyCallback class defines an interface for completions to the + * copy_start code. Users of the copy infrastructure must implement + * one and give an instance of the class to start_copy. + * + * The implementer is responsible for making sure that the CopyCallback + * can associate itself with the correct copy operation. + */ +class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> { +protected: + CopyCallback() {} + /** + * results.get<0>() is the return code: 0 for success; -ECANCELED if + * the operation was cancelled by the local OSD; -errno for other issues. + * results.get<1>() is a pointer to a CopyResults object, which you are + * responsible for deleting. + */ + void finish(CopyCallbackResults results_) override = 0; + +public: + /// Provide the final size of the copied object to the CopyCallback + ~CopyCallback() override {} +}; + +template <typename T> +class PrimaryLogPG::BlessedGenContext : public GenContext<T> { + PrimaryLogPGRef pg; + unique_ptr<GenContext<T>> c; + epoch_t e; +public: + BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(T t) override { + std::scoped_lock locker{*pg}; + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(t); + } + bool sync_finish(T t) { + // we assume here all blessed/wrapped Contexts can complete synchronously. + c.release()->complete(t); + return true; + } +}; + +GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext( + GenContext<ThreadPool::TPHandle&> *c) { + return new BlessedGenContext<ThreadPool::TPHandle&>( + this, c, get_osdmap_epoch()); +} + +template <typename T> +class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> { + PrimaryLogPGRef pg; + unique_ptr<GenContext<T>> c; + epoch_t e; +public: + UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(T t) override { + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(t); + } + bool sync_finish(T t) { + // we assume here all blessed/wrapped Contexts can complete synchronously. + c.release()->complete(t); + return true; + } +}; + +GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext( + GenContext<ThreadPool::TPHandle&> *c) { + return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>( + this, c, get_osdmap_epoch()); +} + +class PrimaryLogPG::BlessedContext : public Context { + PrimaryLogPGRef pg; + unique_ptr<Context> c; + epoch_t e; +public: + BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e) + : pg(pg), c(c), e(e) {} + void finish(int r) override { + std::scoped_lock locker{*pg}; + if (pg->pg_has_reset_since(e)) + c.reset(); + else + c.release()->complete(r); + } + bool sync_finish(int r) override { + // we assume here all blessed/wrapped Contexts can complete synchronously. + c.release()->complete(r); + return true; + } +}; + +Context *PrimaryLogPG::bless_context(Context *c) { + return new BlessedContext(this, c, get_osdmap_epoch()); +} + +class PrimaryLogPG::C_PG_ObjectContext : public Context { + PrimaryLogPGRef pg; + ObjectContext *obc; + public: + C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) : + pg(p), obc(o) {} + void finish(int r) override { + pg->object_context_destructor_callback(obc); + } +}; + +struct OnReadComplete : public Context { + PrimaryLogPG *pg; + PrimaryLogPG::OpContext *opcontext; + OnReadComplete( + PrimaryLogPG *pg, + PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {} + void finish(int r) override { + opcontext->finish_read(pg); + } + ~OnReadComplete() override {} +}; + +class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context { + PrimaryLogPGRef pg; + ObjectContextRef obc; + public: + C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) : + pg(p), obc(o) {} + bool sync_finish(int r) override { + pg->_applied_recovered_object(obc); + return true; + } + void finish(int r) override { + std::scoped_lock locker{*pg}; + pg->_applied_recovered_object(obc); + } +}; + +class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context { + PrimaryLogPGRef pg; + epoch_t epoch; + eversion_t last_complete; + public: + C_OSD_CommittedPushedObject( + PrimaryLogPG *p, epoch_t epoch, eversion_t lc) : + pg(p), epoch(epoch), last_complete(lc) { + } + void finish(int r) override { + pg->_committed_pushed_object(epoch, last_complete); + } +}; + +class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context { + PrimaryLogPGRef pg; + public: + explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) : + pg(p) {} + bool sync_finish(int r) override { + pg->_applied_recovered_object_replica(); + return true; + } + void finish(int r) override { + std::scoped_lock locker{*pg}; + pg->_applied_recovered_object_replica(); + } +}; + +// OpContext +void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg) +{ + inflightreads = 1; + list<pair<boost::tuple<uint64_t, uint64_t, unsigned>, + pair<bufferlist*, Context*> > > in; + in.swap(pending_async_reads); + pg->pgbackend->objects_read_async( + obc->obs.oi.soid, + in, + new OnReadComplete(pg, this), pg->get_pool().fast_read); +} +void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg) +{ + ceph_assert(inflightreads > 0); + --inflightreads; + if (async_reads_complete()) { + ceph_assert(pg->in_progress_async_reads.size()); + ceph_assert(pg->in_progress_async_reads.front().second == this); + pg->in_progress_async_reads.pop_front(); + + // Restart the op context now that all reads have been + // completed. Read failures will be handled by the op finisher + pg->execute_ctx(this); + } +} + +class CopyFromCallback : public PrimaryLogPG::CopyCallback { +public: + PrimaryLogPG::CopyResults *results = nullptr; + PrimaryLogPG::OpContext *ctx; + OSDOp &osd_op; + uint32_t truncate_seq; + uint64_t truncate_size; + bool have_truncate = false; + + CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op) + : ctx(ctx), osd_op(osd_op) { + } + ~CopyFromCallback() override {} + + void finish(PrimaryLogPG::CopyCallbackResults results_) override { + results = results_.get<1>(); + int r = results_.get<0>(); + + // Only use truncate_{seq,size} from the original object if the client + // did not sent us these parameters + if (!have_truncate) { + truncate_seq = results->truncate_seq; + truncate_size = results->truncate_size; + } + + // for finish_copyfrom + ctx->user_at_version = results->user_version; + + if (r >= 0) { + ctx->pg->execute_ctx(ctx); + } else { + if (r != -ECANCELED) { // on cancel just toss it out; client resends + if (ctx->op) + ctx->pg->osd->reply_op_error(ctx->op, r); + } else if (results->should_requeue) { + if (ctx->op) + ctx->pg->requeue_op(ctx->op); + } + ctx->pg->close_op_ctx(ctx); + } + } + + bool is_temp_obj_used() { + return results->started_temp_obj; + } + uint64_t get_data_size() { + return results->object_size; + } + void set_truncate(uint32_t seq, uint64_t size) { + truncate_seq = seq; + truncate_size = size; + have_truncate = true; + } +}; + +struct CopyFromFinisher : public PrimaryLogPG::OpFinisher { + CopyFromCallback *copy_from_callback; + + explicit CopyFromFinisher(CopyFromCallback *copy_from_callback) + : copy_from_callback(copy_from_callback) { + } + + int execute() override { + // instance will be destructed after this method completes + copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback); + return 0; + } +}; + +// ====================== +// PGBackend::Listener + +void PrimaryLogPG::on_local_recover( + const hobject_t &hoid, + const ObjectRecoveryInfo &_recovery_info, + ObjectContextRef obc, + bool is_delete, + ObjectStore::Transaction *t + ) +{ + dout(10) << __func__ << ": " << hoid << dendl; + + ObjectRecoveryInfo recovery_info(_recovery_info); + clear_object_snap_mapping(t, hoid); + if (!is_delete && recovery_info.soid.is_snap()) { + OSDriver::OSTransaction _t(osdriver.get_transaction(t)); + set<snapid_t> snaps; + dout(20) << " snapset " << recovery_info.ss << dendl; + auto p = recovery_info.ss.clone_snaps.find(hoid.snap); + if (p != recovery_info.ss.clone_snaps.end()) { + snaps.insert(p->second.begin(), p->second.end()); + dout(20) << " snaps " << snaps << dendl; + snap_mapper.add_oid( + recovery_info.soid, + snaps, + &_t); + } else { + derr << __func__ << " " << hoid << " had no clone_snaps" << dendl; + } + } + if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) && + recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) { + ceph_assert(is_primary()); + const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second; + if (latest->op == pg_log_entry_t::LOST_REVERT && + latest->reverting_to == recovery_info.version) { + dout(10) << " got old revert version " << recovery_info.version + << " for " << *latest << dendl; + recovery_info.version = latest->version; + // update the attr to the revert event version + recovery_info.oi.prior_version = recovery_info.oi.version; + recovery_info.oi.version = latest->version; + bufferlist bl; + encode(recovery_info.oi, bl, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + ceph_assert(!pool.info.is_erasure()); + t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl); + if (obc) + obc->attr_cache[OI_ATTR] = bl; + } + } + + // keep track of active pushes for scrub + ++active_pushes; + + recovery_state.recover_got( + recovery_info.soid, + recovery_info.version, + is_delete, + *t); + + if (is_primary()) { + if (!is_delete) { + obc->obs.exists = true; + + bool got = obc->get_recovery_read(); + ceph_assert(got); + + ceph_assert(recovering.count(obc->obs.oi.soid)); + recovering[obc->obs.oi.soid] = obc; + obc->obs.oi = recovery_info.oi; // may have been updated above + } + + t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc)); + + publish_stats_to_osd(); + release_backoffs(hoid); + if (!is_unreadable_object(hoid)) { + auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid); + if (unreadable_object_entry != waiting_for_unreadable_object.end()) { + dout(20) << " kicking unreadable waiters on " << hoid << dendl; + requeue_ops(unreadable_object_entry->second); + waiting_for_unreadable_object.erase(unreadable_object_entry); + } + } + } else { + t->register_on_applied( + new C_OSD_AppliedRecoveredObjectReplica(this)); + + } + + t->register_on_commit( + new C_OSD_CommittedPushedObject( + this, + get_osdmap_epoch(), + info.last_complete)); +} + +void PrimaryLogPG::on_global_recover( + const hobject_t &soid, + const object_stat_sum_t &stat_diff, + bool is_delete) +{ + recovery_state.object_recovered(soid, stat_diff); + publish_stats_to_osd(); + dout(10) << "pushed " << soid << " to all replicas" << dendl; + auto i = recovering.find(soid); + ceph_assert(i != recovering.end()); + + if (i->second && i->second->rwstate.recovery_read_marker) { + // recover missing won't have had an obc, but it gets filled in + // during on_local_recover + ceph_assert(i->second); + list<OpRequestRef> requeue_list; + i->second->drop_recovery_read(&requeue_list); + requeue_ops(requeue_list); + } + + backfills_in_flight.erase(soid); + + recovering.erase(i); + finish_recovery_op(soid); + release_backoffs(soid); + auto degraded_object_entry = waiting_for_degraded_object.find(soid); + if (degraded_object_entry != waiting_for_degraded_object.end()) { + dout(20) << " kicking degraded waiters on " << soid << dendl; + requeue_ops(degraded_object_entry->second); + waiting_for_degraded_object.erase(degraded_object_entry); + } + auto unreadable_object_entry = waiting_for_unreadable_object.find(soid); + if (unreadable_object_entry != waiting_for_unreadable_object.end()) { + dout(20) << " kicking unreadable waiters on " << soid << dendl; + requeue_ops(unreadable_object_entry->second); + waiting_for_unreadable_object.erase(unreadable_object_entry); + } + finish_degraded_object(soid); +} + +void PrimaryLogPG::schedule_recovery_work( + GenContext<ThreadPool::TPHandle&> *c, + uint64_t cost) +{ + osd->queue_recovery_context( + this, c, cost, + recovery_state.get_recovery_op_priority()); +} + +void PrimaryLogPG::replica_clear_repop_obc( + const vector<pg_log_entry_t> &logv, + ObjectStore::Transaction &t) +{ + for (auto &&e: logv) { + /* Have to blast all clones, they share a snapset */ + object_contexts.clear_range( + e.soid.get_object_boundary(), e.soid.get_head()); + ceph_assert( + snapset_contexts.find(e.soid.get_head()) == + snapset_contexts.end()); + } +} + +bool PrimaryLogPG::should_send_op( + pg_shard_t peer, + const hobject_t &hoid) { + if (peer == get_primary()) + return true; + ceph_assert(recovery_state.has_peer_info(peer)); + bool should_send = + hoid.pool != (int64_t)info.pgid.pool() || + hoid <= last_backfill_started || + hoid <= recovery_state.get_peer_info(peer).last_backfill; + if (!should_send) { + ceph_assert(is_backfill_target(peer)); + dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer + << ", object " << hoid + << " beyond std::max(last_backfill_started " + << ", peer_info[peer].last_backfill " + << recovery_state.get_peer_info(peer).last_backfill + << ")" << dendl; + return should_send; + } + if (is_async_recovery_target(peer) && + recovery_state.get_peer_missing(peer).is_missing(hoid)) { + should_send = false; + dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer + << ", object " << hoid + << " which is pending recovery in async_recovery_targets" << dendl; + } + return should_send; +} + + +ConnectionRef PrimaryLogPG::get_con_osd_cluster( + int peer, epoch_t from_epoch) +{ + return osd->get_con_osd_cluster(peer, from_epoch); +} + +PerfCounters *PrimaryLogPG::get_logger() +{ + return osd->logger; +} + + +// ==================== +// missing objects + +bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const +{ + return recovery_state.get_pg_log().get_missing().get_items().count(soid); +} + +void PrimaryLogPG::maybe_kick_recovery( + const hobject_t &soid) +{ + eversion_t v; + bool work_started = false; + if (!recovery_state.get_missing_loc().needs_recovery(soid, &v)) + return; + + map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid); + if (p != recovering.end()) { + dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl; + } else if (recovery_state.get_missing_loc().is_unfound(soid)) { + dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl; + } else { + dout(7) << "object " << soid << " v " << v << ", recovering." << dendl; + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + if (is_missing_object(soid)) { + recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h); + } else if (recovery_state.get_missing_loc().is_deleted(soid)) { + prep_object_replica_deletes(soid, v, h, &work_started); + } else { + prep_object_replica_pushes(soid, v, h, &work_started); + } + pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH); + } +} + +void PrimaryLogPG::wait_for_unreadable_object( + const hobject_t& soid, OpRequestRef op) +{ + ceph_assert(is_unreadable_object(soid)); + maybe_kick_recovery(soid); + waiting_for_unreadable_object[soid].push_back(op); + op->mark_delayed("waiting for missing object"); + osd->logger->inc(l_osd_op_delayed_unreadable); +} + +bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid) +{ + /* The conditions below may clear (on_local_recover, before we queue + * the transaction) before we actually requeue the degraded waiters + * in on_global_recover after the transaction completes. + */ + if (waiting_for_degraded_object.count(soid)) + return true; + if (recovery_state.get_pg_log().get_missing().get_items().count(soid)) + return true; + ceph_assert(!get_acting_recovery_backfill().empty()); + for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin(); + i != get_acting_recovery_backfill().end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + auto peer_missing_entry = recovery_state.get_peer_missing().find(peer); + // If an object is missing on an async_recovery_target, return false. + // This will not block the op and the object is async recovered later. + if (peer_missing_entry != recovery_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + if (is_async_recovery_target(peer)) + continue; + else + return true; + } + // Object is degraded if after last_backfill AND + // we are backfilling it + if (is_backfill_target(peer) && + recovery_state.get_peer_info(peer).last_backfill <= soid && + last_backfill_started >= soid && + backfills_in_flight.count(soid)) + return true; + } + return false; +} + +bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid) +{ + for (auto &i: get_async_recovery_targets()) { + auto peer_missing_entry = recovery_state.get_peer_missing().find(i); + if (peer_missing_entry != recovery_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + dout(30) << __func__ << " " << soid << dendl; + return true; + } + } + return false; +} + +void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op) +{ + ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid)); + + maybe_kick_recovery(soid); + waiting_for_degraded_object[soid].push_back(op); + op->mark_delayed("waiting for degraded object"); + osd->logger->inc(l_osd_op_delayed_degraded); +} + +void PrimaryLogPG::block_write_on_full_cache( + const hobject_t& _oid, OpRequestRef op) +{ + const hobject_t oid = _oid.get_head(); + dout(20) << __func__ << ": blocking object " << oid + << " on full cache" << dendl; + objects_blocked_on_cache_full.insert(oid); + waiting_for_cache_not_full.push_back(op); + op->mark_delayed("waiting for cache not full"); +} + +void PrimaryLogPG::block_for_clean( + const hobject_t& oid, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << oid + << " on primary repair" << dendl; + waiting_for_clean_to_primary_repair.push_back(op); + op->mark_delayed("waiting for clean to repair"); +} + +void PrimaryLogPG::block_write_on_snap_rollback( + const hobject_t& oid, ObjectContextRef obc, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << oid.get_head() + << " on snap promotion " << obc->obs.oi.soid << dendl; + // otherwise, we'd have blocked in do_op + ceph_assert(oid.is_head()); + ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0); + /* + * We block the head object here. + * + * Let's assume that there is racing read When the head object is being rollbacked. + * Since the two different ops can trigger promote_object() with the same source, + * infinite loop happens by canceling ops each other. + * To avoid this, we block the head object during rollback. + * So, the racing read will be blocked until the rollback is completed. + * see also: https://tracker.ceph.com/issues/49726 + */ + ObjectContextRef head_obc = get_object_context(oid, false); + head_obc->start_block(); + objects_blocked_on_snap_promotion[oid] = obc; + wait_for_blocked_object(obc->obs.oi.soid, op); +} + +void PrimaryLogPG::block_write_on_degraded_snap( + const hobject_t& snap, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << snap.get_head() + << " on degraded snap " << snap << dendl; + // otherwise, we'd have blocked in do_op + ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0); + objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap; + wait_for_degraded_object(snap, op); +} + +bool PrimaryLogPG::maybe_await_blocked_head( + const hobject_t &hoid, + OpRequestRef op) +{ + ObjectContextRef obc; + obc = object_contexts.lookup(hoid.get_head()); + if (obc) { + if (obc->is_blocked()) { + wait_for_blocked_object(obc->obs.oi.soid, op); + return true; + } else { + return false; + } + } + return false; +} + +void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op) +{ + dout(10) << __func__ << " " << soid << " " << *op->get_req() << dendl; + waiting_for_blocked_object[soid].push_back(op); + op->mark_delayed("waiting for blocked object"); +} + +void PrimaryLogPG::maybe_force_recovery() +{ + // no force if not in degraded/recovery/backfill states + if (!is_degraded() && + !state_test(PG_STATE_RECOVERING | + PG_STATE_RECOVERY_WAIT | + PG_STATE_BACKFILLING | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILL_TOOFULL)) + return; + + if (recovery_state.get_pg_log().get_log().approx_size() < + cct->_conf->osd_max_pg_log_entries * + cct->_conf->osd_force_recovery_pg_log_entries_factor) + return; + + // find the oldest missing object + version_t min_version = recovery_state.get_pg_log().get_log().head.version; + hobject_t soid; + if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) { + min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first; + soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second; + } + ceph_assert(!get_acting_recovery_backfill().empty()); + for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin(); + it != get_acting_recovery_backfill().end(); + ++it) { + if (*it == get_primary()) continue; + pg_shard_t peer = *it; + auto it_missing = recovery_state.get_peer_missing().find(peer); + if (it_missing != recovery_state.get_peer_missing().end() && + !it_missing->second.get_rmissing().empty()) { + const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin(); + dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first + << " oid " << min_obj->second << dendl; + if (min_version > min_obj->first) { + min_version = min_obj->first; + soid = min_obj->second; + } + } + } + + // recover it + if (soid != hobject_t()) + maybe_kick_recovery(soid); +} + +bool PrimaryLogPG::check_laggy(OpRequestRef& op) +{ + assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(), + SERVER_OCTOPUS)); + if (state_test(PG_STATE_WAIT)) { + dout(10) << __func__ << " PG is WAIT state" << dendl; + } else if (!state_test(PG_STATE_LAGGY)) { + auto mnow = osd->get_mnow(); + auto ru = recovery_state.get_readable_until(); + if (mnow <= ru) { + // not laggy + return true; + } + dout(10) << __func__ + << " mnow " << mnow + << " > readable_until " << ru << dendl; + + if (!is_primary()) { + osd->reply_op_error(op, -EAGAIN); + return false; + } + + // go to laggy state + state_set(PG_STATE_LAGGY); + publish_stats_to_osd(); + } + dout(10) << __func__ << " not readable" << dendl; + waiting_for_readable.push_back(op); + op->mark_delayed("waiting for readable"); + return false; +} + +bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op) +{ + assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(), + SERVER_OCTOPUS)); + if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) { + return true; // not laggy + } + dout(10) << __func__ << " not readable" << dendl; + waiting_for_readable.push_front(op); + op->mark_delayed("waiting for readable"); + return false; +} + +void PrimaryLogPG::recheck_readable() +{ + if (!is_wait() && !is_laggy()) { + dout(20) << __func__ << " wasn't wait or laggy" << dendl; + return; + } + auto mnow = osd->get_mnow(); + bool pub = false; + if (is_wait()) { + auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub(); + if (mnow < prior_readable_until_ub) { + dout(10) << __func__ << " still wait (mnow " << mnow + << " < prior_readable_until_ub " << prior_readable_until_ub + << ")" << dendl; + } else { + dout(10) << __func__ << " no longer wait (mnow " << mnow + << " >= prior_readable_until_ub " << prior_readable_until_ub + << ")" << dendl; + state_clear(PG_STATE_WAIT); + recovery_state.clear_prior_readable_until_ub(); + pub = true; + } + } + if (is_laggy()) { + auto ru = recovery_state.get_readable_until(); + if (ru == ceph::signedspan::zero()) { + dout(10) << __func__ << " still laggy (mnow " << mnow + << ", readable_until zero)" << dendl; + } else if (mnow >= ru) { + dout(10) << __func__ << " still laggy (mnow " << mnow + << " >= readable_until " << ru << ")" << dendl; + } else { + dout(10) << __func__ << " no longer laggy (mnow " << mnow + << " < readable_until " << ru << ")" << dendl; + state_clear(PG_STATE_LAGGY); + pub = true; + } + } + if (pub) { + publish_stats_to_osd(); + } + if (!is_laggy() && !is_wait()) { + requeue_ops(waiting_for_readable); + } +} + +bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj) +{ + bufferlist bl; + + // If filter has expressed an interest in an xattr, load it. + if (!filter.get_xattr().empty()) { + int ret = pgbackend->objects_get_attr( + sobj, + filter.get_xattr(), + &bl); + dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl; + if (ret < 0) { + if (ret != -ENODATA || filter.reject_empty_xattr()) { + return false; + } + } + } + + return filter.filter(sobj, bl); +} + +std::pair<int, std::unique_ptr<const PGLSFilter>> +PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter) +{ + string type; + // storing non-const PGLSFilter for the sake of ::init() + std::unique_ptr<PGLSFilter> filter; + + try { + decode(type, iter); + } + catch (ceph::buffer::error& e) { + return { -EINVAL, nullptr }; + } + + if (type.compare("plain") == 0) { + filter = std::make_unique<PGLSPlainFilter>(); + } else { + std::size_t dot = type.find('.'); + if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) { + return { -EINVAL, nullptr }; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = NULL; + int r = ClassHandler::get_instance().open_class(class_name, &cls); + if (r != 0) { + derr << "Error opening class '" << class_name << "': " + << cpp_strerror(r) << dendl; + if (r != -EPERM) // propagate permission error + r = -EINVAL; + return { r, nullptr }; + } else { + ceph_assert(cls); + } + + ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name); + if (class_filter == NULL) { + derr << "Error finding filter '" << filter_name << "' in class " + << class_name << dendl; + return { -EINVAL, nullptr }; + } + filter.reset(class_filter->fn()); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + derr << "Buggy class " << class_name << " failed to construct " + "filter " << filter_name << dendl; + return { -EINVAL, nullptr }; + } + } + + ceph_assert(filter); + int r = filter->init(iter); + if (r < 0) { + derr << "Error initializing filter " << type << ": " + << cpp_strerror(r) << dendl; + return { -EINVAL, nullptr }; + } else { + // Successfully constructed and initialized, return it. + return std::make_pair(0, std::move(filter)); + } +} + + +// ========================================================== + +void PrimaryLogPG::do_command( + const string_view& orig_prefix, + const cmdmap_t& cmdmap, + const bufferlist& idata, + std::function<void(int,const std::string&,bufferlist&)> on_finish) +{ + string format; + cmd_getval(cmdmap, "format", format); + auto f(Formatter::create_unique(format, "json-pretty", "json-pretty")); + int ret = 0; + stringstream ss; // stderr error message stream + bufferlist outbl; // if empty at end, we'll dump formatter as output + + // get final prefix: + // - ceph pg <pgid> foo -> prefix=pg, cmd=foo + // - ceph tell <pgid> foo -> prefix=foo + string prefix(orig_prefix); + string command; + cmd_getval(cmdmap, "cmd", command); + if (command.size()) { + prefix = command; + } + + if (prefix == "query") { + f->open_object_section("pg"); + f->dump_stream("snap_trimq") << snap_trimq; + f->dump_unsigned("snap_trimq_len", snap_trimq.size()); + recovery_state.dump_peering_state(f.get()); + + f->open_array_section("recovery_state"); + handle_query_state(f.get()); + f->close_section(); + + if (is_primary() && is_active() && m_scrubber) { + m_scrubber->dump_scrubber(f.get(), m_planned_scrub); + } + + f->open_object_section("agent_state"); + if (agent_state) + agent_state->dump(f.get()); + f->close_section(); + + f->close_section(); + } + else if (prefix == "log") { + + f->open_object_section("op_log"); + f->open_object_section("pg_log_t"); + recovery_state.get_pg_log().get_log().dump(f.get()); + f->close_section(); + f->close_section(); + } + else if (prefix == "mark_unfound_lost") { + string mulcmd; + cmd_getval(cmdmap, "mulcmd", mulcmd); + int mode = -1; + if (mulcmd == "revert") { + if (pool.info.is_erasure()) { + ss << "mode must be 'delete' for ec pool"; + ret = -EINVAL; + goto out; + } + mode = pg_log_entry_t::LOST_REVERT; + } else if (mulcmd == "delete") { + mode = pg_log_entry_t::LOST_DELETE; + } else { + ss << "mode must be 'revert' or 'delete'; mark not yet implemented"; + ret = -EINVAL; + goto out; + } + ceph_assert(mode == pg_log_entry_t::LOST_REVERT || + mode == pg_log_entry_t::LOST_DELETE); + + if (!is_primary()) { + ss << "not primary"; + ret = -EROFS; + goto out; + } + + uint64_t unfound = recovery_state.get_missing_loc().num_unfound(); + if (!unfound) { + ss << "pg has no unfound objects"; + goto out; // make command idempotent + } + + if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) { + ss << "pg has " << unfound + << " unfound objects but we haven't probed all sources, not marking lost"; + ret = -EINVAL; + goto out; + } + + mark_all_unfound_lost(mode, on_finish); + return; + } + + else if (prefix == "list_unfound") { + hobject_t offset; + string offset_json; + bool show_offset = false; + if (cmd_getval(cmdmap, "offset", offset_json)) { + json_spirit::Value v; + try { + if (!json_spirit::read(offset_json, v)) + throw std::runtime_error("bad json"); + offset.decode(v); + } catch (std::runtime_error& e) { + ss << "error parsing offset: " << e.what(); + ret = -EINVAL; + goto out; + } + show_offset = true; + } + f->open_object_section("missing"); + if (show_offset) { + f->open_object_section("offset"); + offset.dump(f.get()); + f->close_section(); + } + auto &needs_recovery_map = recovery_state.get_missing_loc() + .get_needs_recovery(); + f->dump_int("num_missing", needs_recovery_map.size()); + f->dump_int("num_unfound", get_num_unfound()); + map<hobject_t, pg_missing_item>::const_iterator p = + needs_recovery_map.upper_bound(offset); + { + f->open_array_section("objects"); + int32_t num = 0; + for (; p != needs_recovery_map.end() && + num < cct->_conf->osd_command_max_records; + ++p) { + if (recovery_state.get_missing_loc().is_unfound(p->first)) { + f->open_object_section("object"); + { + f->open_object_section("oid"); + p->first.dump(f.get()); + f->close_section(); + } + p->second.dump(f.get()); // have, need keys + { + f->open_array_section("locations"); + for (auto &&r : recovery_state.get_missing_loc().get_locations( + p->first)) { + f->dump_stream("shard") << r; + } + f->close_section(); + } + f->close_section(); + num++; + } + } + f->close_section(); + } + // Get possible locations of missing objects from pg information + PeeringState::QueryUnfound q(f.get()); + recovery_state.handle_event(q, 0); + f->dump_bool("more", p != needs_recovery_map.end()); + f->close_section(); + } + + else if (prefix == "scrub" || + prefix == "deep_scrub") { + bool deep = (prefix == "deep_scrub"); + int64_t time = cmd_getval_or<int64_t>(cmdmap, "time", 0); + + if (is_primary()) { + const pg_pool_t *p = &pool.info; + double pool_scrub_max_interval = 0; + double scrub_max_interval; + if (deep) { + p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval; + } else { + p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf()->osd_scrub_max_interval; + } + // Instead of marking must_scrub force a schedule scrub + utime_t stamp = ceph_clock_now(); + if (time == 0) + stamp -= scrub_max_interval; + else + stamp -= (float)time; + stamp -= 100.0; // push back last scrub more for good measure + if (deep) { + set_last_deep_scrub_stamp(stamp); + } + set_last_scrub_stamp(stamp); // for 'deep' as well, as we use this value to order scrubs + f->open_object_section("result"); + f->dump_bool("deep", deep); + f->dump_stream("stamp") << stamp; + f->close_section(); + } else { + ss << "Not primary"; + ret = -EPERM; + } + outbl.append(ss.str()); + } + + else if (prefix == "block" || prefix == "unblock" || prefix == "set" || + prefix == "unset") { + string value; + cmd_getval(cmdmap, "value", value); + + if (is_primary()) { + ret = m_scrubber->asok_debug(prefix, value, f.get(), ss); + f->open_object_section("result"); + f->dump_bool("success", true); + f->close_section(); + } else { + ss << "Not primary"; + ret = -EPERM; + } + outbl.append(ss.str()); + } + else { + ret = -ENOSYS; + ss << "prefix '" << prefix << "' not implemented"; + } + + out: + if (ret >= 0 && outbl.length() == 0) { + f->flush(outbl); + } + on_finish(ret, ss.str(), outbl); +} + + +// ========================================================== + +void PrimaryLogPG::do_pg_op(OpRequestRef op) +{ + const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + dout(10) << "do_pg_op " << *m << dendl; + + op->mark_started(); + + int result = 0; + string cname, mname; + + snapid_t snapid = m->get_snapid(); + + vector<OSDOp> ops = m->ops; + + for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) { + std::unique_ptr<const PGLSFilter> filter; + OSDOp& osd_op = *p; + auto bp = p->indata.cbegin(); + switch (p->op.op) { + case CEPH_OSD_OP_PGNLS_FILTER: + try { + decode(cname, bp); + decode(mname, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl; + result = -EINVAL; + break; + } + std::tie(result, filter) = get_pgls_filter(bp); + if (result < 0) + break; + + ceph_assert(filter); + + // fall through + + case CEPH_OSD_OP_PGNLS: + if (snapid != CEPH_NOSNAP) { + result = -EINVAL; + break; + } + if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) { + dout(10) << " pgnls pg=" << m->get_pg() + << " " << get_osdmap()->raw_pg_to_pg(m->get_pg()) + << " != " << info.pgid << dendl; + result = 0; // hmm? + } else { + unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls, + p->op.pgls.count); + + dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size + << dendl; + // read into a buffer + vector<hobject_t> sentries; + pg_nls_response_t response; + try { + decode(response.handle, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGNLS handle in " << *m << dendl; + result = -EINVAL; + break; + } + + hobject_t next; + hobject_t lower_bound = response.handle; + hobject_t pg_start = info.pgid.pgid.get_hobj_start(); + hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + dout(10) << " pgnls lower_bound " << lower_bound + << " pg_end " << pg_end << dendl; + if (((!lower_bound.is_max() && lower_bound >= pg_end) || + (lower_bound != hobject_t() && lower_bound < pg_start))) { + // this should only happen with a buggy client. + dout(10) << "outside of PG bounds " << pg_start << " .. " + << pg_end << dendl; + result = -EINVAL; + break; + } + + hobject_t current = lower_bound; + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + &sentries, + &next); + if (r != 0) { + result = -EINVAL; + break; + } + + map<hobject_t, pg_missing_item>::const_iterator missing_iter = + recovery_state.get_pg_log().get_missing().get_items().lower_bound(current); + vector<hobject_t>::iterator ls_iter = sentries.begin(); + hobject_t _max = hobject_t::get_max(); + while (1) { + const hobject_t &mcand = + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ? + _max : + missing_iter->first; + const hobject_t &lcand = + ls_iter == sentries.end() ? + _max : + *ls_iter; + + hobject_t candidate; + if (mcand == lcand) { + candidate = mcand; + if (!mcand.is_max()) { + ++ls_iter; + ++missing_iter; + } + } else if (mcand < lcand) { + candidate = mcand; + ceph_assert(!mcand.is_max()); + ++missing_iter; + } else { + candidate = lcand; + ceph_assert(!lcand.is_max()); + ++ls_iter; + } + + dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash() + << " vs lower bound 0x" << lower_bound.get_hash() + << std::dec << dendl; + + if (candidate >= next) { + break; + } + + if (response.entries.size() == list_size) { + next = candidate; + break; + } + + if (candidate.snap != CEPH_NOSNAP) + continue; + + // skip internal namespace + if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace) + continue; + + if (recovery_state.get_missing_loc().is_deleted(candidate)) + continue; + + // skip wrong namespace + if (m->get_hobj().nspace != librados::all_nspaces && + candidate.get_namespace() != m->get_hobj().nspace) + continue; + + if (filter && !pgls_filter(*filter, candidate)) + continue; + + dout(20) << "pgnls item 0x" << std::hex + << candidate.get_hash() + << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash()) + << std::dec << " " + << candidate.oid.name << dendl; + + librados::ListObjectImpl item; + item.nspace = candidate.get_namespace(); + item.oid = candidate.oid.name; + item.locator = candidate.get_key(); + response.entries.push_back(item); + } + + if (next.is_max() && + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() && + ls_iter == sentries.end()) { + result = 1; + + // Set response.handle to the start of the next PG according + // to the object sort order. + response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + } else { + response.handle = next; + } + dout(10) << "pgnls handle=" << response.handle << dendl; + encode(response, osd_op.outdata); + dout(10) << " pgnls result=" << result << " outdata.length()=" + << osd_op.outdata.length() << dendl; + } + break; + + case CEPH_OSD_OP_PGLS_FILTER: + try { + decode(cname, bp); + decode(mname, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl; + result = -EINVAL; + break; + } + std::tie(result, filter) = get_pgls_filter(bp); + if (result < 0) + break; + + ceph_assert(filter); + + // fall through + + case CEPH_OSD_OP_PGLS: + if (snapid != CEPH_NOSNAP) { + result = -EINVAL; + break; + } + if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) { + dout(10) << " pgls pg=" << m->get_pg() + << " " << get_osdmap()->raw_pg_to_pg(m->get_pg()) + << " != " << info.pgid << dendl; + result = 0; // hmm? + } else { + unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls, + p->op.pgls.count); + + dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl; + // read into a buffer + vector<hobject_t> sentries; + pg_ls_response_t response; + try { + decode(response.handle, bp); + } + catch (const ceph::buffer::error& e) { + dout(0) << "unable to decode PGLS handle in " << *m << dendl; + result = -EINVAL; + break; + } + + hobject_t next; + hobject_t current = response.handle; + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + &sentries, + &next); + if (r != 0) { + result = -EINVAL; + break; + } + + ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty()); + + map<hobject_t, pg_missing_item>::const_iterator missing_iter = + recovery_state.get_pg_log().get_missing().get_items().lower_bound(current); + vector<hobject_t>::iterator ls_iter = sentries.begin(); + hobject_t _max = hobject_t::get_max(); + while (1) { + const hobject_t &mcand = + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ? + _max : + missing_iter->first; + const hobject_t &lcand = + ls_iter == sentries.end() ? + _max : + *ls_iter; + + hobject_t candidate; + if (mcand == lcand) { + candidate = mcand; + if (!mcand.is_max()) { + ++ls_iter; + ++missing_iter; + } + } else if (mcand < lcand) { + candidate = mcand; + ceph_assert(!mcand.is_max()); + ++missing_iter; + } else { + candidate = lcand; + ceph_assert(!lcand.is_max()); + ++ls_iter; + } + + if (candidate >= next) { + break; + } + + if (response.entries.size() == list_size) { + next = candidate; + break; + } + + if (candidate.snap != CEPH_NOSNAP) + continue; + + // skip wrong namespace + if (candidate.get_namespace() != m->get_hobj().nspace) + continue; + + if (recovery_state.get_missing_loc().is_deleted(candidate)) + continue; + + if (filter && !pgls_filter(*filter, candidate)) + continue; + + response.entries.push_back(make_pair(candidate.oid, + candidate.get_key())); + } + if (next.is_max() && + missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() && + ls_iter == sentries.end()) { + result = 1; + } + response.handle = next; + encode(response, osd_op.outdata); + dout(10) << " pgls result=" << result << " outdata.length()=" + << osd_op.outdata.length() << dendl; + } + break; + + case CEPH_OSD_OP_PG_HITSET_LS: + { + list< pair<utime_t,utime_t> > ls; + for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) + ls.push_back(make_pair(p->begin, p->end)); + if (hit_set) + ls.push_back(make_pair(hit_set_start_stamp, utime_t())); + encode(ls, osd_op.outdata); + } + break; + + case CEPH_OSD_OP_PG_HITSET_GET: + { + utime_t stamp(osd_op.op.hit_set_get.stamp); + if (hit_set_start_stamp && stamp >= hit_set_start_stamp) { + // read the current in-memory HitSet, not the version we've + // checkpointed. + if (!hit_set) { + result= -ENOENT; + break; + } + encode(*hit_set, osd_op.outdata); + result = osd_op.outdata.length(); + } else { + // read an archived HitSet. + hobject_t oid; + for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + if (stamp >= p->begin && stamp <= p->end) { + oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + break; + } + } + if (oid == hobject_t()) { + result = -ENOENT; + break; + } + if (!pool.info.is_replicated()) { + // FIXME: EC not supported yet + result = -EOPNOTSUPP; + break; + } + if (is_unreadable_object(oid)) { + wait_for_unreadable_object(oid, op); + return; + } + result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata); + } + } + break; + + case CEPH_OSD_OP_SCRUBLS: + result = do_scrub_ls(m, &osd_op); + break; + + default: + result = -EINVAL; + break; + } + + if (result < 0) + break; + } + + // reply + MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + false); + reply->claim_op_out_data(ops); + reply->set_result(result); + reply->set_reply_versions(info.last_update, info.last_user_version); + osd->send_message_osd_client(reply, m->get_connection()); +} + +int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op) +{ + if (m->get_pg() != info.pgid.pgid) { + dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl; + return -EINVAL; // hmm? + } + auto bp = osd_op->indata.cbegin(); + scrub_ls_arg_t arg; + try { + arg.decode(bp); + } catch (ceph::buffer::error&) { + dout(10) << " corrupted scrub_ls_arg_t" << dendl; + return -EINVAL; + } + + int r = 0; + scrub_ls_result_t result = {.interval = info.history.same_interval_since}; + + if (arg.interval != 0 && arg.interval != info.history.same_interval_since) { + r = -EAGAIN; + } else { + bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result); + if (store_queried) { + encode(result, osd_op->outdata); + } else { + // the scrubber's store is not initialized + r = -ENOENT; + } + } + + return r; +} + +/** + * Grabs locks for OpContext, should be cleaned up in close_op_ctx + * + * @param ctx [in,out] ctx to get locks for + * @return true on success, false if we are queued + */ +bool PrimaryLogPG::get_rw_locks(bool write_ordered, OpContext *ctx) +{ + /* If head_obc, !obc->obs->exists and we will always take the + * snapdir lock *before* the head lock. Since all callers will do + * this (read or write) if we get the first we will be guaranteed + * to get the second. + */ + if (write_ordered && ctx->op->may_read()) { + ctx->lock_type = RWState::RWEXCL; + } else if (write_ordered) { + ctx->lock_type = RWState::RWWRITE; + } else { + ceph_assert(ctx->op->may_read()); + ctx->lock_type = RWState::RWREAD; + } + + if (ctx->head_obc) { + ceph_assert(!ctx->obc->obs.exists); + if (!ctx->lock_manager.get_lock_type( + ctx->lock_type, + ctx->head_obc->obs.oi.soid, + ctx->head_obc, + ctx->op)) { + ctx->lock_type = RWState::RWNONE; + return false; + } + } + if (ctx->lock_manager.get_lock_type( + ctx->lock_type, + ctx->obc->obs.oi.soid, + ctx->obc, + ctx->op)) { + return true; + } else { + ceph_assert(!ctx->head_obc); + ctx->lock_type = RWState::RWNONE; + return false; + } +} + +/** + * Releases locks + * + * @param manager [in] manager with locks to release + */ +void PrimaryLogPG::release_object_locks( + ObcLockManager &lock_manager) { + std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req; + bool requeue_recovery = false; + bool requeue_snaptrim = false; + lock_manager.put_locks( + &to_req, + &requeue_recovery, + &requeue_snaptrim); + if (requeue_recovery) + queue_recovery(); + if (requeue_snaptrim) + snap_trimmer_machine.process_event(TrimWriteUnblocked()); + + if (!to_req.empty()) { + // requeue at front of scrub blocking queue if we are blocked by scrub + for (auto &&p: to_req) { + if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) { + for (auto& op : p.second) { + op->mark_delayed("waiting for scrub"); + } + + waiting_for_scrub.splice( + waiting_for_scrub.begin(), + p.second, + p.second.begin(), + p.second.end()); + } else if (is_laggy()) { + for (auto& op : p.second) { + op->mark_delayed("waiting for readable"); + } + waiting_for_readable.splice( + waiting_for_readable.begin(), + p.second, + p.second.begin(), + p.second.end()); + } else { + requeue_ops(p.second); + } + } + } +} + +PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap, + const PGPool &_pool, + const map<string,string>& ec_profile, spg_t p) : + PG(o, curmap, _pool, p), + pgbackend( + PGBackend::build_pg_backend( + _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)), + object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count), + new_backfill(false), + temp_seq(0), + snap_trimmer_machine(this) +{ + recovery_state.set_backend_predicates( + pgbackend->get_is_readable_predicate(), + pgbackend->get_is_recoverable_predicate()); + snap_trimmer_machine.initiate(); + + m_scrubber = make_unique<PrimaryLogScrub>(this); +} + +PrimaryLogPG::~PrimaryLogPG() +{ + m_scrubber.reset(); +} + +void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc) +{ + src_oloc = oloc; + if (oloc.key.empty()) + src_oloc.key = oid.name; +} + +void PrimaryLogPG::handle_backoff(OpRequestRef& op) +{ + auto m = op->get_req<MOSDBackoff>(); + auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv()); + if (!session) + return; // drop it. + hobject_t begin = info.pgid.pgid.get_hobj_start(); + hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); + if (begin < m->begin) { + begin = m->begin; + } + if (end > m->end) { + end = m->end; + } + dout(10) << __func__ << " backoff ack id " << m->id + << " [" << begin << "," << end << ")" << dendl; + session->ack_backoff(cct, m->pgid, m->id, begin, end); +} + +void PrimaryLogPG::do_request( + OpRequestRef& op, + ThreadPool::TPHandle &handle) +{ + if (op->osd_trace) { + op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace); + op->pg_trace.event("do request"); + } + + +// make sure we have a new enough map + auto p = waiting_for_map.find(op->get_source()); + if (p != waiting_for_map.end()) { + // preserve ordering + dout(20) << __func__ << " waiting_for_map " + << p->first << " not empty, queueing" << dendl; + p->second.push_back(op); + op->mark_delayed("waiting_for_map not empty"); + return; + } + if (!have_same_or_newer_map(op->min_epoch)) { + dout(20) << __func__ << " min " << op->min_epoch + << ", queue on waiting_for_map " << op->get_source() << dendl; + waiting_for_map[op->get_source()].push_back(op); + op->mark_delayed("op must wait for map"); + osd->request_osdmap_update(op->min_epoch); + return; + } + + if (can_discard_request(op)) { + return; + } + + // pg-wide backoffs + const Message *m = op->get_req(); + int msg_type = m->get_type(); + if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) { + auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv()); + if (!session) + return; // drop it. + if (msg_type == CEPH_MSG_OSD_OP) { + if (session->check_backoff(cct, info.pgid, + info.pgid.pgid.get_hobj_start(), m)) { + return; + } + + bool backoff = + is_down() || + is_incomplete() || + (!is_active() && is_peered()); + if (g_conf()->osd_backoff_on_peering && !backoff) { + if (is_peering()) { + backoff = true; + } + } + if (backoff) { + add_pg_backoff(session); + return; + } + } + // pg backoff acks at pg-level + if (msg_type == CEPH_MSG_OSD_BACKOFF) { + const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m); + if (ba->begin != ba->end) { + handle_backoff(op); + return; + } + } + } + + if (!is_peered()) { + // Delay unless PGBackend says it's ok + if (pgbackend->can_handle_while_inactive(op)) { + bool handled = pgbackend->handle_message(op); + ceph_assert(handled); + return; + } else { + waiting_for_peered.push_back(op); + op->mark_delayed("waiting for peered"); + return; + } + } + + if (recovery_state.needs_flush()) { + dout(20) << "waiting for flush on " << *op->get_req() << dendl; + waiting_for_flush.push_back(op); + op->mark_delayed("waiting for flush"); + return; + } + + ceph_assert(is_peered() && !recovery_state.needs_flush()); + if (pgbackend->handle_message(op)) + return; + + switch (msg_type) { + case CEPH_MSG_OSD_OP: + case CEPH_MSG_OSD_BACKOFF: + if (!is_active()) { + dout(20) << " peered, not active, waiting for active on " + << *op->get_req() << dendl; + waiting_for_active.push_back(op); + op->mark_delayed("waiting for active"); + return; + } + switch (msg_type) { + case CEPH_MSG_OSD_OP: + // verify client features + if ((pool.info.has_tiers() || pool.info.is_tier()) && + !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) { + osd->reply_op_error(op, -EOPNOTSUPP); + return; + } + do_op(op); + break; + case CEPH_MSG_OSD_BACKOFF: + // object-level backoff acks handled in osdop context + handle_backoff(op); + break; + } + break; + + case MSG_OSD_PG_SCAN: + do_scan(op, handle); + break; + + case MSG_OSD_PG_BACKFILL: + do_backfill(op); + break; + + case MSG_OSD_PG_BACKFILL_REMOVE: + do_backfill_remove(op); + break; + + case MSG_OSD_SCRUB_RESERVE: + { + if (!m_scrubber) { + osd->reply_op_error(op, -EAGAIN); + return; + } + auto m = op->get_req<MOSDScrubReserve>(); + switch (m->type) { + case MOSDScrubReserve::REQUEST: + m_scrubber->handle_scrub_reserve_request(op); + break; + case MOSDScrubReserve::GRANT: + m_scrubber->handle_scrub_reserve_grant(op, m->from); + break; + case MOSDScrubReserve::REJECT: + m_scrubber->handle_scrub_reserve_reject(op, m->from); + break; + case MOSDScrubReserve::RELEASE: + m_scrubber->handle_scrub_reserve_release(op); + break; + } + } + break; + + case MSG_OSD_REP_SCRUB: + replica_scrub(op, handle); + break; + + case MSG_OSD_REP_SCRUBMAP: + do_replica_scrub_map(op); + break; + + case MSG_OSD_PG_UPDATE_LOG_MISSING: + do_update_log_missing(op); + break; + + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + do_update_log_missing_reply(op); + break; + + default: + ceph_abort_msg("bad message type in do_request"); + } +} + +/** do_op - do an op + * pg lock will be held (if multithreaded) + * osd_lock NOT held. + */ +void PrimaryLogPG::do_op(OpRequestRef& op) +{ + FUNCTRACE(cct); + // NOTE: take a non-const pointer here; we must be careful not to + // change anything that will break other reads on m (operator<<). + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + if (m->finish_decode()) { + op->reset_desc(); // for TrackedOp + m->clear_payload(); + } + + dout(20) << __func__ << ": op " << *m << dendl; + + const hobject_t head = m->get_hobj().get_head(); + + if (!info.pgid.pgid.contains( + info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) { + derr << __func__ << " " << info.pgid.pgid << " does not contain " + << head << " pg_num " << pool.info.get_pg_num() << " hash " + << std::hex << head.get_hash() << std::dec << dendl; + osd->clog->warn() << info.pgid.pgid << " does not contain " << head + << " op " << *m; + ceph_assert(!cct->_conf->osd_debug_misdirected_ops); + return; + } + + bool can_backoff = + m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF); + ceph::ref_t<Session> session; + if (can_backoff) { + session = static_cast<Session*>(m->get_connection()->get_priv().get()); + if (!session.get()) { + dout(10) << __func__ << " no session" << dendl; + return; + } + + if (session->check_backoff(cct, info.pgid, head, m)) { + return; + } + } + + if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) { + // not implemented. + dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + { + int r = op->maybe_init_op_info(*get_osdmap()); + if (r) { + osd->reply_op_error(op, r); + return; + } + } + + if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + op->may_read() && + !(op->may_write() || op->may_cache())) { + // balanced reads; any replica will do + if (!(is_primary() || is_nonprimary())) { + osd->handle_misdirected_op(this, op); + return; + } + } else { + // normal case; must be primary + if (!is_primary()) { + osd->handle_misdirected_op(this, op); + return; + } + } + + if (!check_laggy(op)) { + return; + } + + if (!op_has_sufficient_caps(op)) { + osd->reply_op_error(op, -EPERM); + return; + } + + if (op->includes_pg_op()) { + return do_pg_op(op); + } + + // object name too long? + if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) { + dout(4) << "do_op name is longer than " + << cct->_conf->osd_max_object_name_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) { + dout(4) << "do_op locator is longer than " + << cct->_conf->osd_max_object_name_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) { + dout(4) << "do_op namespace is longer than " + << cct->_conf->osd_max_object_namespace_len + << " bytes" << dendl; + osd->reply_op_error(op, -ENAMETOOLONG); + return; + } + if (m->get_hobj().oid.name.empty()) { + dout(4) << "do_op empty oid name is not allowed" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + if (int r = osd->store->validate_hobject_key(head)) { + dout(4) << "do_op object " << head << " invalid for backing store: " + << r << dendl; + osd->reply_op_error(op, r); + return; + } + + // blocklisted? + if (get_osdmap()->is_blocklisted(m->get_source_addr())) { + dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl; + osd->reply_op_error(op, -EBLOCKLISTED); + return; + } + + // order this op as a write? + bool write_ordered = op->rwordered(); + + // discard due to cluster full transition? (we discard any op that + // originates before the cluster or pool is marked full; the client + // will resend after the full flag is removed or if they expect the + // op to succeed despite being full). The except is FULL_FORCE and + // FULL_TRY ops, which there is no reason to discard because they + // bypass all full checks anyway. If this op isn't write or + // read-ordered, we skip. + // FIXME: we exclude mds writes for now. + if (write_ordered && !(m->get_source().is_mds() || + m->has_flag(CEPH_OSD_FLAG_FULL_TRY) || + m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) && + info.history.last_epoch_marked_full > m->get_map_epoch()) { + dout(10) << __func__ << " discarding op sent before full " << m << " " + << *m << dendl; + return; + } + // mds should have stopped writing before this point. + // We can't allow OSD to become non-startable even if mds + // could be writing as part of file removals. + if (write_ordered && osd->check_failsafe_full(get_dpp()) && + !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl; + return; + } + int64_t poolid = get_pgid().pool(); + const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid); + if (!pi) { + return; + } + if (pi->has_flag(pg_pool_t::FLAG_EIO)) { + // drop op on the floor; the client will handle returning EIO + if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) { + dout(10) << __func__ << " discarding op due to pool EIO flag" << dendl; + } else { + dout(10) << __func__ << " replying EIO due to pool EIO flag" << dendl; + osd->reply_op_error(op, -EIO); + } + return; + } + if (op->may_write()) { + + // invalid? + if (m->get_snapid() != CEPH_NOSNAP) { + dout(20) << __func__ << ": write to clone not valid " << *m << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + + // too big? + if (cct->_conf->osd_max_write_size && + m->get_data_len() > cct->_conf->osd_max_write_size << 20) { + // journal can't hold commit! + derr << "do_op msg data len " << m->get_data_len() + << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20) + << " on " << *m << dendl; + osd->reply_op_error(op, -OSD_WRITETOOBIG); + return; + } + } + + dout(10) << "do_op " << *m + << (op->may_write() ? " may_write" : "") + << (op->may_read() ? " may_read" : "") + << (op->may_cache() ? " may_cache" : "") + << " -> " << (write_ordered ? "write-ordered" : "read-ordered") + << " flags " << ceph_osd_flag_string(m->get_flags()) + << dendl; + + + // missing object? + if (is_unreadable_object(head)) { + if (!is_primary()) { + osd->reply_op_error(op, -EAGAIN); + return; + } + if (can_backoff && + (g_conf()->osd_backoff_on_degraded || + (g_conf()->osd_backoff_on_unfound && + recovery_state.get_missing_loc().is_unfound(head)))) { + add_backoff(session, head, head); + maybe_kick_recovery(head); + } else { + wait_for_unreadable_object(head, op); + } + return; + } + + if (write_ordered) { + // degraded object? + if (is_degraded_or_backfilling_object(head)) { + if (can_backoff && g_conf()->osd_backoff_on_degraded) { + add_backoff(session, head, head); + maybe_kick_recovery(head); + } else { + wait_for_degraded_object(head, op); + } + return; + } + + if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) { + dout(20) << __func__ << ": waiting for scrub" << dendl; + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + return; + } + if (!check_laggy_requeue(op)) { + return; + } + + // blocked on snap? + if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head); + blocked_iter != std::end(objects_blocked_on_degraded_snap)) { + hobject_t to_wait_on(head); + to_wait_on.snap = blocked_iter->second; + wait_for_degraded_object(to_wait_on, op); + return; + } + if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head); + blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) { + wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op); + return; + } + if (objects_blocked_on_cache_full.count(head)) { + block_write_on_full_cache(head, op); + return; + } + } + + // dup/resent? + if (op->may_write() || op->may_cache()) { + // warning: we will get back *a* request for this reqid, but not + // necessarily the most recent. this happens with flush and + // promote ops, but we can't possible have both in our log where + // the original request is still not stable on disk, so for our + // purposes here it doesn't matter which one we get. + eversion_t version; + version_t user_version; + int return_code = 0; + vector<pg_log_op_return_item_t> op_returns; + bool got = check_in_progress_op( + m->get_reqid(), &version, &user_version, &return_code, &op_returns); + if (got) { + dout(3) << __func__ << " dup " << m->get_reqid() + << " version " << version << dendl; + if (already_complete(version)) { + osd->reply_op_error(op, return_code, version, user_version, op_returns); + } else { + dout(10) << " waiting for " << version << " to commit" << dendl; + // always queue ondisk waiters, so that we can requeue if needed + waiting_for_ondisk[version].emplace_back(op, user_version, return_code, + op_returns); + op->mark_delayed("waiting for ondisk"); + } + return; + } + } + + ObjectContextRef obc; + bool can_create = op->may_write(); + hobject_t missing_oid; + + // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS + const hobject_t& oid = + m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj(); + + // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else + for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) { + OSDOp& osd_op = *p; + + if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) { + if (m->get_snapid() != CEPH_SNAPDIR) { + dout(10) << "LIST_SNAPS with incorrect context" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + } else { + if (m->get_snapid() == CEPH_SNAPDIR) { + dout(10) << "non-LIST_SNAPS on snapdir" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + } + } + + // io blocked on obc? + if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) && + maybe_await_blocked_head(oid, op)) { + return; + } + + if (!is_primary()) { + if (!recovery_state.can_serve_replica_read(oid)) { + dout(20) << __func__ + << ": unstable write on replica, bouncing to primary " + << *m << dendl; + osd->reply_op_error(op, -EAGAIN); + return; + } + dout(20) << __func__ << ": serving replica read on oid " << oid + << dendl; + } + + int r = find_object_context( + oid, &obc, can_create, + m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE), + &missing_oid); + + // LIST_SNAPS needs the ssc too + if (obc && + m->get_snapid() == CEPH_SNAPDIR && + !obc->ssc) { + obc->ssc = get_snapset_context(oid, true); + } + + if (r == -EAGAIN) { + // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise, + // we have to wait for the object. + if (is_primary()) { + // missing the specific snap we need; requeue and wait. + ceph_assert(!op->may_write()); // only happens on a read/cache + wait_for_unreadable_object(missing_oid, op); + return; + } + } else if (r == 0) { + if (is_unreadable_object(obc->obs.oi.soid)) { + dout(10) << __func__ << ": clone " << obc->obs.oi.soid + << " is unreadable, waiting" << dendl; + wait_for_unreadable_object(obc->obs.oi.soid, op); + return; + } + + // degraded object? (the check above was for head; this could be a clone) + if (write_ordered && + obc->obs.oi.soid.snap != CEPH_NOSNAP && + is_degraded_or_backfilling_object(obc->obs.oi.soid)) { + dout(10) << __func__ << ": clone " << obc->obs.oi.soid + << " is degraded, waiting" << dendl; + wait_for_degraded_object(obc->obs.oi.soid, op); + return; + } + } + + bool in_hit_set = false; + if (hit_set) { + if (obc.get()) { + if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid)) + in_hit_set = true; + } else { + if (missing_oid != hobject_t() && hit_set->contains(missing_oid)) + in_hit_set = true; + } + if (!op->hitset_inserted) { + hit_set->insert(oid); + op->hitset_inserted = true; + if (hit_set->is_full() || + hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) { + hit_set_persist(); + } + } + } + + if (agent_state) { + if (agent_choose_mode(false, op)) + return; + } + + if (obc.get() && obc->obs.exists) { + if (recover_adjacent_clones(obc, op)) { + return; + } + if (maybe_handle_manifest(op, + write_ordered, + obc)) + return; + } + + if (maybe_handle_cache(op, + write_ordered, + obc, + r, + missing_oid, + false, + in_hit_set)) + return; + + if (r && (r != -ENOENT || !obc)) { + // copy the reqids for copy get on ENOENT + if (r == -ENOENT && + (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) { + fill_in_copy_get_noent(op, oid, m->ops[0]); + return; + } + dout(20) << __func__ << ": find_object_context got error " << r << dendl; + if (op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + record_write_error(op, oid, nullptr, r); + } else { + osd->reply_op_error(op, r); + } + return; + } + + // make sure locator is consistent + object_locator_t oloc(obc->obs.oi.soid); + if (m->get_object_locator() != oloc) { + dout(10) << " provided locator " << m->get_object_locator() + << " != object's " << obc->obs.oi.soid << dendl; + osd->clog->warn() << "bad locator " << m->get_object_locator() + << " on object " << oloc + << " op " << *m; + } + + // io blocked on obc? + if (obc->is_blocked() && + !m->has_flag(CEPH_OSD_FLAG_FLUSH)) { + wait_for_blocked_object(obc->obs.oi.soid, op); + return; + } + + dout(25) << __func__ << " oi " << obc->obs.oi << dendl; + + OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this); + + if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) { + dout(20) << __func__ << ": skipping rw locks" << dendl; + } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) { + dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl; + + // verify there is in fact a flush in progress + // FIXME: we could make this a stronger test. + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush in progress, aborting" << dendl; + reply_ctx(ctx, -EINVAL); + return; + } + } else if (!get_rw_locks(write_ordered, ctx)) { + dout(20) << __func__ << " waiting for rw locks " << dendl; + op->mark_delayed("waiting for rw locks"); + close_op_ctx(ctx); + return; + } + dout(20) << __func__ << " obc " << *obc << dendl; + + if (r) { + dout(20) << __func__ << " returned an error: " << r << dendl; + if (op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + record_write_error(op, oid, nullptr, r, + ctx->op->allows_returnvec() ? ctx : nullptr); + } else { + osd->reply_op_error(op, r); + } + close_op_ctx(ctx); + return; + } + + if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) { + ctx->ignore_cache = true; + } + + if ((op->may_read()) && (obc->obs.oi.is_lost())) { + // This object is lost. Reading from it returns an error. + dout(20) << __func__ << ": object " << obc->obs.oi.soid + << " is lost" << dendl; + reply_ctx(ctx, -ENFILE); + return; + } + if (!op->may_write() && + !op->may_cache() && + (!obc->obs.exists || + ((m->get_snapid() != CEPH_SNAPDIR) && + obc->obs.oi.is_whiteout()))) { + // copy the reqids for copy get on ENOENT + if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) { + fill_in_copy_get_noent(op, oid, m->ops[0]); + close_op_ctx(ctx); + return; + } + reply_ctx(ctx, -ENOENT); + return; + } + + op->mark_started(); + + execute_ctx(ctx); + utime_t prepare_latency = ceph_clock_now(); + prepare_latency -= op->get_dequeued_time(); + osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency); + if (op->may_read() && op->may_write()) { + osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency); + } else if (op->may_read()) { + osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency); + } else if (op->may_write() || op->may_cache()) { + osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency); + } + + // force recovery of the oldest missing object if too many logs + maybe_force_recovery(); +} + +PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail( + OpRequestRef op, + bool write_ordered, + ObjectContextRef obc) +{ + if (!obc) { + dout(20) << __func__ << ": no obc " << dendl; + return cache_result_t::NOOP; + } + + if (!obc->obs.oi.has_manifest()) { + dout(20) << __func__ << ": " << obc->obs.oi.soid + << " is not manifest object " << dendl; + return cache_result_t::NOOP; + } + if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) { + dout(20) << __func__ << ": ignoring redirect due to flag" << dendl; + return cache_result_t::NOOP; + } + + // if it is write-ordered and blocked, stop now + if (obc->is_blocked() && write_ordered) { + // we're already doing something with this object + dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl; + return cache_result_t::NOOP; + } + + vector<OSDOp> ops = op->get_req<MOSDOp>()->ops; + for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) { + OSDOp& osd_op = *p; + ceph_osd_op& op = osd_op.op; + if (op.op == CEPH_OSD_OP_SET_REDIRECT || + op.op == CEPH_OSD_OP_SET_CHUNK || + op.op == CEPH_OSD_OP_UNSET_MANIFEST || + op.op == CEPH_OSD_OP_TIER_PROMOTE || + op.op == CEPH_OSD_OP_TIER_FLUSH || + op.op == CEPH_OSD_OP_TIER_EVICT || + op.op == CEPH_OSD_OP_ISDIRTY) { + return cache_result_t::NOOP; + } + } + + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + if (op->may_write() || write_ordered) { + do_proxy_write(op, obc); + } else { + // promoted object + if (obc->obs.oi.size != 0) { + return cache_result_t::NOOP; + } + do_proxy_read(op, obc); + } + return cache_result_t::HANDLED_PROXY; + case object_manifest_t::TYPE_CHUNKED: + { + if (can_proxy_chunked_read(op, obc)) { + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid); + if (p != flush_ops.end()) { + do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true); + return cache_result_t::HANDLED_PROXY; + } + do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered); + return cache_result_t::HANDLED_PROXY; + } + + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + hobject_t head = m->get_hobj(); + + if (is_degraded_or_backfilling_object(head)) { + dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl; + wait_for_degraded_object(head, op); + return cache_result_t::BLOCKED_RECOVERY; + } + + if (m_scrubber->write_blocked_by_scrub(head)) { + dout(20) << __func__ << ": waiting for scrub" << dendl; + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + return cache_result_t::BLOCKED_RECOVERY; + } + if (!check_laggy_requeue(op)) { + return cache_result_t::BLOCKED_RECOVERY; + } + + for (auto& p : obc->obs.oi.manifest.chunk_map) { + if (p.second.is_missing()) { + auto m = op->get_req<MOSDOp>(); + const object_locator_t oloc = m->get_object_locator(); + promote_object(obc, obc->obs.oi.soid, oloc, op, NULL); + return cache_result_t::BLOCKED_PROMOTE; + } + } + return cache_result_t::NOOP; + } + default: + ceph_abort_msg("unrecognized manifest type"); + } + + return cache_result_t::NOOP; +} + +void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid, + MOSDOpReply *orig_reply, int r, + OpContext *ctx_for_op_returns) +{ + dout(20) << __func__ << " r=" << r << dendl; + ceph_assert(op->may_write()); + const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid(); + mempool::osd_pglog::list<pg_log_entry_t> entries; + entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid, + get_next_version(), eversion_t(), 0, + reqid, utime_t(), r)); + if (ctx_for_op_returns) { + entries.back().set_op_returns(*ctx_for_op_returns->ops); + dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl; + } + + struct OnComplete { + PrimaryLogPG *pg; + OpRequestRef op; + boost::intrusive_ptr<MOSDOpReply> orig_reply; + int r; + OnComplete( + PrimaryLogPG *pg, + OpRequestRef op, + MOSDOpReply *orig_reply, + int r) + : pg(pg), op(op), + orig_reply(orig_reply, false /* take over ref */), r(r) + {} + void operator()() { + ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl; + auto m = op->get_req<MOSDOp>(); + MOSDOpReply *reply = orig_reply.detach(); + ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl; + pg->osd->send_message_osd_client(reply, m->get_connection()); + } + }; + + ObcLockManager lock_manager; + submit_log_entries( + entries, + std::move(lock_manager), + std::optional<std::function<void(void)> >( + OnComplete(this, op, orig_reply, r)), + op, + r); +} + +PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail( + OpRequestRef op, + bool write_ordered, + ObjectContextRef obc, + int r, hobject_t missing_oid, + bool must_promote, + bool in_hit_set, + ObjectContextRef *promote_obc) +{ + // return quickly if caching is not enabled + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) + return cache_result_t::NOOP; + + if (op && + op->get_req() && + op->get_req()->get_type() == CEPH_MSG_OSD_OP && + (op->get_req<MOSDOp>()->get_flags() & + CEPH_OSD_FLAG_IGNORE_CACHE)) { + dout(20) << __func__ << ": ignoring cache due to flag" << dendl; + return cache_result_t::NOOP; + } + + must_promote = must_promote || op->need_promote(); + + if (obc) + dout(25) << __func__ << " " << obc->obs.oi << " " + << (obc->obs.exists ? "exists" : "DNE") + << " missing_oid " << missing_oid + << " must_promote " << (int)must_promote + << " in_hit_set " << (int)in_hit_set + << dendl; + else + dout(25) << __func__ << " (no obc)" + << " missing_oid " << missing_oid + << " must_promote " << (int)must_promote + << " in_hit_set " << (int)in_hit_set + << dendl; + + // if it is write-ordered and blocked, stop now + if (obc.get() && obc->is_blocked() && write_ordered) { + // we're already doing something with this object + dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl; + return cache_result_t::NOOP; + } + + if (r == -ENOENT && missing_oid == hobject_t()) { + // we know this object is logically absent (e.g., an undefined clone) + return cache_result_t::NOOP; + } + + if (obc.get() && obc->obs.exists) { + osd->logger->inc(l_osd_op_cache_hit); + return cache_result_t::NOOP; + } + if (!is_primary()) { + dout(20) << __func__ << " cache miss; ask the primary" << dendl; + osd->reply_op_error(op, -EAGAIN); + return cache_result_t::REPLIED_WITH_EAGAIN; + } + + if (missing_oid == hobject_t() && obc.get()) { + missing_oid = obc->obs.oi.soid; + } + + auto m = op->get_req<MOSDOp>(); + const object_locator_t oloc = m->get_object_locator(); + + if (op->need_skip_handle_cache()) { + return cache_result_t::NOOP; + } + + OpRequestRef promote_op; + + switch (pool.info.cache_mode) { + case pg_pool_t::CACHEMODE_WRITEBACK: + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + if (!op->may_write() && !op->may_cache() && + !write_ordered && !must_promote) { + dout(20) << __func__ << " cache pool full, proxying read" << dendl; + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + } + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + + if (must_promote || (!hit_set && !op->need_skip_promote())) { + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + + if (op->may_write() || op->may_cache()) { + do_proxy_write(op); + + // Promote too? + if (!op->need_skip_promote() && + maybe_promote(obc, missing_oid, oloc, in_hit_set, + pool.info.min_write_recency_for_promote, + OpRequestRef(), + promote_obc)) { + return cache_result_t::BLOCKED_PROMOTE; + } + return cache_result_t::HANDLED_PROXY; + } else { + do_proxy_read(op); + + // Avoid duplicate promotion + if (obc.get() && obc->is_blocked()) { + if (promote_obc) + *promote_obc = obc; + return cache_result_t::BLOCKED_PROMOTE; + } + + // Promote too? + if (!op->need_skip_promote()) { + (void)maybe_promote(obc, missing_oid, oloc, in_hit_set, + pool.info.min_read_recency_for_promote, + promote_op, promote_obc); + } + + return cache_result_t::HANDLED_PROXY; + } + ceph_abort_msg("unreachable"); + return cache_result_t::NOOP; + + case pg_pool_t::CACHEMODE_READONLY: + // TODO: clean this case up + if (!obc.get() && r == -ENOENT) { + // we don't have the object and op's a read + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + if (!r) { // it must be a write + do_cache_redirect(op); + return cache_result_t::HANDLED_REDIRECT; + } + // crap, there was a failure of some kind + return cache_result_t::NOOP; + + case pg_pool_t::CACHEMODE_FORWARD: + // this mode is deprecated; proxy instead + case pg_pool_t::CACHEMODE_PROXY: + if (!must_promote) { + if (op->may_write() || op->may_cache() || write_ordered) { + do_proxy_write(op); + return cache_result_t::HANDLED_PROXY; + } else { + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + } + } + // ugh, we're forced to promote. + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + + case pg_pool_t::CACHEMODE_READFORWARD: + // this mode is deprecated; proxy instead + case pg_pool_t::CACHEMODE_READPROXY: + // Do writeback to the cache tier for writes + if (op->may_write() || write_ordered || must_promote) { + if (agent_state && + agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + dout(20) << __func__ << " cache pool full, waiting" << dendl; + block_write_on_full_cache(missing_oid, op); + return cache_result_t::BLOCKED_FULL; + } + promote_object(obc, missing_oid, oloc, op, promote_obc); + return cache_result_t::BLOCKED_PROMOTE; + } + + // If it is a read, we can read, we need to proxy it + do_proxy_read(op); + return cache_result_t::HANDLED_PROXY; + + default: + ceph_abort_msg("unrecognized cache_mode"); + } + return cache_result_t::NOOP; +} + +bool PrimaryLogPG::maybe_promote(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + bool in_hit_set, + uint32_t recency, + OpRequestRef promote_op, + ObjectContextRef *promote_obc) +{ + dout(20) << __func__ << " missing_oid " << missing_oid + << " in_hit_set " << in_hit_set << dendl; + + switch (recency) { + case 0: + break; + case 1: + // Check if in the current hit set + if (in_hit_set) { + break; + } else { + // not promoting + return false; + } + break; + default: + { + unsigned count = (int)in_hit_set; + if (count) { + // Check if in other hit sets + const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid; + for (map<time_t,HitSetRef>::reverse_iterator itor = + agent_state->hit_set_map.rbegin(); + itor != agent_state->hit_set_map.rend(); + ++itor) { + if (!itor->second->contains(oid)) { + break; + } + ++count; + if (count >= recency) { + break; + } + } + } + if (count >= recency) { + break; + } + return false; // not promoting + } + break; + } + + if (osd->promote_throttle()) { + dout(10) << __func__ << " promote throttled" << dendl; + return false; + } + promote_object(obc, missing_oid, oloc, promote_op, promote_obc); + return true; +} + +void PrimaryLogPG::do_cache_redirect(OpRequestRef op) +{ + auto m = op->get_req<MOSDOp>(); + int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); + MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(), + flags, false); + request_redirect_t redir(m->get_object_locator(), pool.info.tier_of); + reply->set_redirect(redir); + dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op " + << *op->get_req() << dendl; + m->get_connection()->send_message(reply); + return; +} + +struct C_ProxyRead : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyReadOpRef prdop; + utime_t start; + C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyReadOpRef& prd) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), prdop(prd), start(ceph_clock_now()) + {} + void finish(int r) override { + if (prdop->canceled) + return; + std::scoped_lock locker{*pg}; + if (prdop->canceled) { + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_proxy_read(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start); + } + } +}; + +struct C_ProxyChunkRead : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyReadOpRef prdop; + utime_t start; + ObjectOperation *obj_op; + int op_index = 0; + uint64_t req_offset = 0; + ObjectContextRef obc; + uint64_t req_total_len = 0; + C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyReadOpRef& prd) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL) + {} + void finish(int r) override { + if (prdop->canceled) + return; + std::scoped_lock locker{*pg}; + if (prdop->canceled) { + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + if (r >= 0) { + if (!prdop->ops[op_index].outdata.length()) { + ceph_assert(req_total_len); + bufferlist list; + bufferptr bptr(req_total_len); + list.push_back(std::move(bptr)); + prdop->ops[op_index].outdata.append(list); + } + ceph_assert(obj_op); + uint64_t copy_offset; + if (req_offset >= prdop->ops[op_index].op.extent.offset) { + copy_offset = req_offset - prdop->ops[op_index].op.extent.offset; + } else { + copy_offset = 0; + } + prdop->ops[op_index].outdata.begin(copy_offset).copy_in( + obj_op->ops[0].outdata.length(), + obj_op->ops[0].outdata.c_str()); + } + + pg->finish_proxy_read(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start); + if (obj_op) { + delete obj_op; + } + } + } +}; + +void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc) +{ + // NOTE: non-const here because the ProxyReadOp needs mutable refs to + // stash the result in the request's OSDOp vector + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + object_locator_t oloc; + hobject_t soid; + /* extensible tier */ + if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) { + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + oloc = object_locator_t(obc->obs.oi.manifest.redirect_target); + soid = obc->obs.oi.manifest.redirect_target; + break; + default: + ceph_abort_msg("unrecognized manifest type"); + } + } else { + /* proxy */ + soid = m->get_hobj(); + oloc = object_locator_t(m->get_object_locator()); + oloc.pool = pool.info.tier_of; + } + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + + // pass through some original flags that make sense. + // - leave out redirection and balancing flags since we are + // already proxying through the primary + // - leave off read/write/exec flags that are derived from the op + flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC | + CEPH_OSD_FLAG_MAP_SNAP_CLONE); + + dout(10) << __func__ << " Start proxy read for " << *m << dendl; + + ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops)); + + ObjectOperation obj_op; + obj_op.dup(prdop->ops); + + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK && + (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) { + for (unsigned i = 0; i < obj_op.ops.size(); i++) { + ceph_osd_op op = obj_op.ops[i].op; + switch (op.op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_CHECKSUM: + case CEPH_OSD_OP_CMPEXT: + op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) & + ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + } + } + } + + C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(), + prdop); + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, obj_op, + m->get_snapid(), NULL, + flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + &prdop->user_version, + &prdop->data_offset, + m->get_features()); + fin->tid = tid; + prdop->objecter_tid = tid; + proxyread_ops[tid] = prdop; + in_progress_proxy_ops[soid].push_back(op); +} + +void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + + map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid); + if (p == proxyread_ops.end()) { + dout(10) << __func__ << " no proxyread_op found" << dendl; + return; + } + ProxyReadOpRef prdop = p->second; + if (tid != prdop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != prdop " << prdop + << " tid " << prdop->objecter_tid << dendl; + return; + } + if (oid != prdop->soid) { + dout(10) << __func__ << " oid " << oid << " != prdop " << prdop + << " soid " << prdop->soid << dendl; + return; + } + proxyread_ops.erase(tid); + + map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid); + if (q == in_progress_proxy_ops.end()) { + dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl; + return; + } + ceph_assert(q->second.size()); + list<OpRequestRef>::iterator it = std::find(q->second.begin(), + q->second.end(), + prdop->op); + ceph_assert(it != q->second.end()); + OpRequestRef op = *it; + q->second.erase(it); + if (q->second.size() == 0) { + in_progress_proxy_ops.erase(oid); + } else if (std::find(q->second.begin(), + q->second.end(), + prdop->op) != q->second.end()) { + /* multiple read case */ + dout(20) << __func__ << " " << oid << " is not completed " << dendl; + return; + } + + osd->logger->inc(l_osd_tier_proxy_read); + + auto m = op->get_req<MOSDOp>(); + OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this); + ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + ctx->user_at_version = prdop->user_version; + ctx->data_off = prdop->data_offset; + ctx->ignore_log_op_stats = true; + complete_read_ctx(r, ctx); +} + +void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid) +{ + map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid); + if (p == in_progress_proxy_ops.end()) + return; + + list<OpRequestRef>& ls = p->second; + dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl; + requeue_ops(ls); + in_progress_proxy_ops.erase(p); +} + +void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop, + vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << " " << prdop->soid << dendl; + prdop->canceled = true; + + // cancel objecter op, if we can + if (prdop->objecter_tid) { + tids->push_back(prdop->objecter_tid); + for (uint32_t i = 0; i < prdop->ops.size(); i++) { + prdop->ops[i].outdata.clear(); + } + proxyread_ops.erase(prdop->objecter_tid); + prdop->objecter_tid = 0; + } +} + +void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + + // cancel proxy reads + map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin(); + while (p != proxyread_ops.end()) { + cancel_proxy_read((p++)->second, tids); + } + + // cancel proxy writes + map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin(); + while (q != proxywrite_ops.end()) { + cancel_proxy_write((q++)->second, tids); + } + + if (requeue) { + map<hobject_t, list<OpRequestRef>>::iterator p = + in_progress_proxy_ops.begin(); + while (p != in_progress_proxy_ops.end()) { + list<OpRequestRef>& ls = p->second; + dout(10) << __func__ << " " << p->first << " requeuing " << ls.size() + << " requests" << dendl; + requeue_ops(ls); + in_progress_proxy_ops.erase(p++); + } + } else { + in_progress_proxy_ops.clear(); + } +} + +struct C_ProxyWrite_Commit : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::ProxyWriteOpRef pwop; + C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::ProxyWriteOpRef& pw) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), pwop(pw) + {} + void finish(int r) override { + if (pwop->canceled) + return; + std::scoped_lock locker{*pg}; + if (pwop->canceled) { + return; + } + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_proxy_write(oid, tid, r); + } + } +}; + +void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc) +{ + // NOTE: non-const because ProxyWriteOp takes a mutable ref + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + object_locator_t oloc; + SnapContext snapc(m->get_snap_seq(), m->get_snaps()); + hobject_t soid; + /* extensible tier */ + if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) { + switch (obc->obs.oi.manifest.type) { + case object_manifest_t::TYPE_REDIRECT: + oloc = object_locator_t(obc->obs.oi.manifest.redirect_target); + soid = obc->obs.oi.manifest.redirect_target; + break; + default: + ceph_abort_msg("unrecognized manifest type"); + } + } else { + /* proxy */ + soid = m->get_hobj(); + oloc = object_locator_t(m->get_object_locator()); + oloc.pool = pool.info.tier_of; + } + + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (!(op->may_write() || op->may_cache())) { + flags |= CEPH_OSD_FLAG_RWORDERED; + } + if (op->allows_returnvec()) { + flags |= CEPH_OSD_FLAG_RETURNVEC; + } + + dout(10) << __func__ << " Start proxy write for " << *m << dendl; + + ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid())); + pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this); + pwop->mtime = m->get_mtime(); + + ObjectOperation obj_op; + obj_op.dup(pwop->ops); + + C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit( + this, soid, get_last_peering_reset(), pwop); + ceph_tid_t tid = osd->objecter->mutate( + soid.oid, oloc, obj_op, snapc, + ceph::real_clock::from_ceph_timespec(pwop->mtime), + flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + &pwop->user_version, pwop->reqid); + fin->tid = tid; + pwop->objecter_tid = tid; + proxywrite_ops[tid] = pwop; + in_progress_proxy_ops[soid].push_back(op); +} + +void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid, + ObjectContextRef obc, bool write_ordered) +{ + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + OSDOp *osd_op = NULL; + for (unsigned int i = 0; i < m->ops.size(); i++) { + osd_op = &m->ops[i]; + uint64_t cursor = osd_op->op.extent.offset; + uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length; + uint64_t chunk_length = 0, chunk_index = 0, req_len = 0; + object_manifest_t *manifest = &obc->obs.oi.manifest; + map <uint64_t, map<uint64_t, uint64_t>> chunk_read; + + while (cursor < op_length) { + chunk_index = 0; + chunk_length = 0; + /* find the right chunk position for cursor */ + for (auto &p : manifest->chunk_map) { + if (p.first <= cursor && p.first + p.second.length > cursor) { + chunk_length = p.second.length; + chunk_index = p.first; + break; + } + } + /* no index */ + if (!chunk_index && !chunk_length) { + if (cursor == osd_op->op.extent.offset) { + OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this); + ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + ctx->data_off = osd_op->op.extent.offset; + ctx->ignore_log_op_stats = true; + complete_read_ctx(0, ctx); + } + break; + } + uint64_t next_length = chunk_length; + /* the size to read -> | op length | */ + /* | a chunk | */ + if (cursor + next_length > op_length) { + next_length = op_length - cursor; + } + /* the size to read -> | op length | */ + /* | a chunk | */ + if (cursor + next_length > chunk_index + chunk_length) { + next_length = chunk_index + chunk_length - cursor; + } + + chunk_read[cursor] = {{chunk_index, next_length}}; + cursor += next_length; + } + + req_len = cursor - osd_op->op.extent.offset; + for (auto &p : chunk_read) { + auto chunks = p.second.begin(); + dout(20) << __func__ << " chunk_index: " << chunks->first + << " next_length: " << chunks->second << " cursor: " + << p.first << dendl; + do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered); + } + } +} + +struct RefCountCallback : public Context { +public: + PrimaryLogPG::OpContext *ctx; + OSDOp& osd_op; + bool requeue = false; + + RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op) + : ctx(ctx), osd_op(osd_op) {} + void finish(int r) override { + // NB: caller must already have pg->lock held + ctx->obc->stop_block(); + ctx->pg->kick_object_context_blocked(ctx->obc); + if (r >= 0) { + osd_op.rval = 0; + ctx->pg->execute_ctx(ctx); + } else { + // on cancel simply toss op out, + // or requeue as requested + if (r != -ECANCELED) { + if (ctx->op) + ctx->pg->osd->reply_op_error(ctx->op, r); + } else if (requeue) { + if (ctx->op) + ctx->pg->requeue_op(ctx->op); + } + ctx->pg->close_op_ctx(ctx); + } + } + void set_requeue(bool rq) { + requeue = rq; + } +}; + +struct SetManifestFinisher : public PrimaryLogPG::OpFinisher { + OSDOp& osd_op; + + explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) { + } + + int execute() override { + return osd_op.rval; + } +}; + +struct C_SetManifestRefCountDone : public Context { + PrimaryLogPGRef pg; + hobject_t soid; + uint64_t offset; + ceph_tid_t tid = 0; + C_SetManifestRefCountDone(PrimaryLogPG *p, + hobject_t soid, uint64_t offset) : + pg(p), soid(soid), offset(offset) {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock locker{*pg}; + pg->finish_set_manifest_refcount(soid, r, tid, offset); + } +}; + +struct C_SetDedupChunks : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + uint64_t offset; + + C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), offset(offset) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock locker{*pg}; + if (last_peering_reset != pg->get_last_peering_reset()) { + return; + } + pg->finish_set_dedup(oid, r, tid, offset); + } +}; + +void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + auto p = manifest_ops.begin(); + while (p != manifest_ops.end()) { + auto mop = p->second; + // cancel objecter op, if we can + if (mop->objecter_tid) { + tids->push_back(mop->objecter_tid); + mop->objecter_tid = 0; + } else if (!mop->tids.empty()) { + for (auto &p : mop->tids) { + tids->push_back(p.second); + } + } + if (mop->cb) { + mop->cb->set_requeue(requeue); + mop->cb->complete(-ECANCELED); + } + manifest_ops.erase(p++); + } +} + +int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op) +{ + int cnt = 0; + // head + for (auto &p : obc->obs.oi.manifest.chunk_map) { + if (p.second.oid.oid.name == fp_oid) { + cnt++; + } + } + // snap + SnapSet& ss = obc->ssc->snapset; + const OSDMapRef& osdmap = get_osdmap(); + for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin(); + p != ss.clones.rend(); + ++p) { + object_ref_delta_t refs; + ObjectContextRef obc_l = nullptr; + ObjectContextRef obc_g = nullptr; + hobject_t clone_oid = obc->obs.oi.soid; + clone_oid.snap = *p; + if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) { + return -EBUSY; + } + if (is_unreadable_object(clone_oid)) { + dout(10) << __func__ << ": " << clone_oid + << " is unreadable. Need to wait for recovery" << dendl; + wait_for_unreadable_object(clone_oid, op); + return -EAGAIN; + } + ObjectContextRef clone_obc = get_object_context(clone_oid, false); + if (!clone_obc) { + break; + } + if (recover_adjacent_clones(clone_obc, op)) { + return -EAGAIN; + } + get_adjacent_clones(clone_obc, obc_l, obc_g); + clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set( + obc_g ? &(obc_g->obs.oi.manifest) : nullptr , + nullptr, + refs); + for (auto p = refs.begin(); p != refs.end(); ++p) { + if (p->first.oid.name == fp_oid && p->second > 0) { + cnt += p->second; + } + } + } + + return cnt; +} + +bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op) +{ + if (!obc->ssc || !obc->ssc->snapset.clones.size()) { + return false; + } + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + bool has_manifest_op = std::any_of( + begin(m->ops), + end(m->ops), + [](const auto& osd_op) { + return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK; + }); + if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) { + return false; + } + ceph_assert(op); + + const SnapSet& snapset = obc->ssc->snapset; + auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap); + auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool { + hobject_t cid = obc->obs.oi.soid; + cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter; + if (is_unreadable_object(cid)) { + dout(10) << __func__ << ": clone " << cid + << " is unreadable, waiting" << dendl; + wait_for_unreadable_object(cid, op); + return true; + } + return false; + }; + if (s != snapset.clones.begin()) { + if (is_unreadable_snap(s - 1)) { + return true; + } + } + if (s != snapset.clones.end()) { + if (is_unreadable_snap(s + 1)) { + return true; + } + } + return false; +} + +ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc) +{ + auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(), + obc->obs.oi.soid.snap); + if (s != obc->ssc->snapset.clones.begin()) { + auto s_iter = s - 1; + hobject_t cid = obc->obs.oi.soid; + object_ref_delta_t refs; + cid.snap = *s_iter; + ObjectContextRef cobc = get_object_context(cid, false, NULL); + ceph_assert(cobc); + return cobc; + } + return nullptr; +} + +void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs) +{ + for (auto p = refs.begin(); p != refs.end(); ++p) { + int dec_ref_count = p->second; + ceph_assert(dec_ref_count < 0); + while (dec_ref_count < 0) { + dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl; + refcount_manifest(soid, p->first, + refcount_t::DECREMENT_REF, NULL, std::nullopt); + dec_ref_count++; + } + } +} + + +void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc, + ObjectContextRef& _l, ObjectContextRef& _g) +{ + const SnapSet& snapset = src_obc->ssc->snapset; + const object_info_t& oi = src_obc->obs.oi; + + auto get_context = [this, &oi, &snapset](auto iter) + -> ObjectContextRef { + hobject_t cid = oi.soid; + cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter; + ObjectContextRef obc = get_object_context(cid, false, NULL); + ceph_assert(obc); + return obc; + }; + + // check adjacent clones + auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap); + + // We *must* find the clone iff it's not head, + // let s == snapset.clones.end() mean head + ceph_assert((s == snapset.clones.end()) == oi.soid.is_head()); + + if (s != snapset.clones.begin()) { + _l = get_context(s - 1); + } + + if (s != snapset.clones.end()) { + _g = get_context(s + 1); + } +} + +bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk, + OSDOp& osd_op) +{ + object_ref_delta_t refs; + ObjectContextRef obc_l, obc_g; + get_adjacent_clones(ctx->obc, obc_l, obc_g); + set_chunk.calc_refs_to_inc_on_set( + obc_l ? &(obc_l->obs.oi.manifest) : nullptr, + obc_g ? &(obc_g->obs.oi.manifest) : nullptr, + refs); + bool need_inc_ref = false; + if (!refs.is_empty()) { + ManifestOpRef mop(std::make_shared<ManifestOp>(ctx->obc, nullptr)); + for (auto c : set_chunk.chunk_map) { + auto p = refs.find(c.second.oid); + if (p == refs.end()) { + continue; + } + + int inc_ref_count = p->second; + if (inc_ref_count > 0) { + /* + * In set-chunk case, the first thing we should do is to increment + * the reference the targe object has prior to update object_manifest in object_info_t. + * So, call directly refcount_manifest. + */ + auto target_oid = p->first; + auto offset = c.first; + auto length = c.second.length; + auto* fin = new C_SetManifestRefCountDone(this, ctx->obs->oi.soid, offset); + ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, target_oid, + refcount_t::INCREMENT_REF, fin, std::nullopt); + fin->tid = tid; + mop->chunks[target_oid] = make_pair(offset, length); + mop->num_chunks++; + mop->tids[offset] = tid; + + if (!ctx->obc->is_blocked()) { + dout(15) << fmt::format("{}: blocking object on rc: tid:{}", __func__, tid) << dendl; + ctx->obc->start_block(); + } + need_inc_ref = true; + } else if (inc_ref_count < 0) { + hobject_t src = ctx->obs->oi.soid; + hobject_t tgt = p->first; + ctx->register_on_commit( + [src, tgt, this](){ + refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt); + }); + } + } + if (mop->tids.size()) { + mop->cb = new RefCountCallback(ctx, osd_op); + manifest_ops[ctx->obs->oi.soid] = mop; + manifest_ops[ctx->obs->oi.soid]->op = ctx->op; + } + } + + return need_inc_ref; +} + +void PrimaryLogPG::update_chunk_map_by_dirty(OpContext* ctx) { + /* + * We should consider two cases here: + * 1) just modification: This created dirty regions, but didn't update chunk_map. + * 2) rollback: In rollback, head will be converted to the clone the rollback targets. + * Also, rollback already updated chunk_map. + * So, we should do here is to check whether chunk_map is updated and the clean_region has dirty regions. + * In case of the rollback, chunk_map doesn't need to be clear + */ + for (auto &p : ctx->obs->oi.manifest.chunk_map) { + if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) { + ctx->new_obs.oi.manifest.chunk_map.erase(p.first); + if (ctx->new_obs.oi.manifest.chunk_map.empty()) { + ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST); + ctx->delta_stats.num_objects_manifest--; + } + } + } +} + +void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx) +{ + object_ref_delta_t refs; + ObjectContextRef cobc = nullptr; + ObjectContextRef obc = ctx->obc; + // Look over previous snapshot, then figure out whether updated chunk needs to be deleted + cobc = get_prev_clone_obc(obc); + obc->obs.oi.manifest.calc_refs_to_drop_on_modify( + cobc ? &cobc->obs.oi.manifest : nullptr, + ctx->clean_regions, + refs); + if (!refs.is_empty()) { + hobject_t soid = obc->obs.oi.soid; + ctx->register_on_commit( + [soid, this, refs](){ + dec_refcount(soid, refs); + }); + } +} + +void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx) +{ + ceph_assert(oi.has_manifest()); + ceph_assert(ctx->obc->ssc); + + if (oi.manifest.is_chunked()) { + object_ref_delta_t refs; + ObjectContextRef obc_l, obc_g, obc; + /* in trim_object, oi and ctx can have different oid */ + obc = get_object_context(oi.soid, false, NULL); + ceph_assert(obc); + get_adjacent_clones(obc, obc_l, obc_g); + oi.manifest.calc_refs_to_drop_on_removal( + obc_l ? &(obc_l->obs.oi.manifest) : nullptr, + obc_g ? &(obc_g->obs.oi.manifest) : nullptr, + refs); + + if (!refs.is_empty()) { + /* dec_refcount will use head object anyway */ + hobject_t soid = ctx->obc->obs.oi.soid; + ctx->register_on_commit( + [soid, this, refs](){ + dec_refcount(soid, refs); + }); + } + } else if (oi.manifest.is_redirect() && + oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) { + ctx->register_on_commit( + [oi, this](){ + refcount_manifest(oi.soid, oi.manifest.redirect_target, + refcount_t::DECREMENT_REF, NULL, std::nullopt); + }); + } +} + +ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type, + Context *cb, std::optional<bufferlist> chunk) +{ + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_RWORDERED; + + dout(10) << __func__ << " Start refcount from " << src_soid + << " to " << tgt_soid << dendl; + + ObjectOperation obj_op; + bufferlist in; + if (type == refcount_t::INCREMENT_REF) { + cls_cas_chunk_get_ref_op call; + call.source = src_soid.get_head(); + ::encode(call, in); + obj_op.call("cas", "chunk_get_ref", in); + } else if (type == refcount_t::DECREMENT_REF) { + cls_cas_chunk_put_ref_op call; + call.source = src_soid.get_head(); + ::encode(call, in); + obj_op.call("cas", "chunk_put_ref", in); + } else if (type == refcount_t::CREATE_OR_GET_REF) { + cls_cas_chunk_create_or_get_ref_op get_call; + get_call.source = src_soid.get_head(); + ceph_assert(chunk); + get_call.data = std::move(*chunk); + ::encode(get_call, in); + obj_op.call("cas", "chunk_create_or_get_ref", in); + } else { + ceph_assert(0 == "unrecognized type"); + } + + Context *c = nullptr; + if (cb) { + c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard())); + } + + object_locator_t oloc(tgt_soid); + ObjectContextRef src_obc = get_object_context(src_soid, false, NULL); + ceph_assert(src_obc); + auto tid = osd->objecter->mutate( + tgt_soid.oid, oloc, obj_op, SnapContext(), + ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime), + flags, c); + return tid; +} + +void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index, + uint64_t chunk_index, uint64_t req_offset, uint64_t req_length, + uint64_t req_total_len, bool write_ordered) +{ + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + object_manifest_t *manifest = &obc->obs.oi.manifest; + if (!manifest->chunk_map.count(chunk_index)) { + return; + } + uint64_t chunk_length = manifest->chunk_map[chunk_index].length; + hobject_t soid = manifest->chunk_map[chunk_index].oid; + hobject_t ori_soid = m->get_hobj(); + object_locator_t oloc(soid); + unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (write_ordered) { + flags |= CEPH_OSD_FLAG_RWORDERED; + } + + if (!chunk_length || soid == hobject_t()) { + return; + } + + /* same as do_proxy_read() */ + flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC | + CEPH_OSD_FLAG_MAP_SNAP_CLONE); + + dout(10) << __func__ << " Start do chunk proxy read for " << *m + << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset + << " req_length: " << req_length << dendl; + + ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops)); + + ObjectOperation *pobj_op = new ObjectOperation; + OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op); + + if (chunk_index <= req_offset) { + osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index; + } else { + ceph_abort_msg("chunk_index > req_offset"); + } + osd_op.op.extent.length = req_length; + + ObjectOperation obj_op; + obj_op.dup(pobj_op->ops); + + C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(), + prdop); + fin->obj_op = pobj_op; + fin->op_index = op_index; + fin->req_offset = req_offset; + fin->obc = obc; + fin->req_total_len = req_total_len; + + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, obj_op, + m->get_snapid(), NULL, + flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + &prdop->user_version, + &prdop->data_offset, + m->get_features()); + fin->tid = tid; + prdop->objecter_tid = tid; + proxyread_ops[tid] = prdop; + in_progress_proxy_ops[ori_soid].push_back(op); +} + +bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc) +{ + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + OSDOp *osd_op = NULL; + bool ret = true; + for (unsigned int i = 0; i < m->ops.size(); i++) { + osd_op = &m->ops[i]; + ceph_osd_op op = osd_op->op; + switch (op.op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SYNC_READ: { + uint64_t cursor = osd_op->op.extent.offset; + uint64_t remain = osd_op->op.extent.length; + + /* requested chunks exist in chunk_map ? */ + for (auto &p : obc->obs.oi.manifest.chunk_map) { + if (p.first <= cursor && p.first + p.second.length > cursor) { + if (!p.second.is_missing()) { + return false; + } + if (p.second.length >= remain) { + remain = 0; + break; + } else { + remain = remain - p.second.length; + } + cursor += p.second.length; + } + } + + if (remain) { + dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl; + return false; + } + continue; + } + default: + return false; + } + } + return ret; +} + +void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + + map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid); + if (p == proxywrite_ops.end()) { + dout(10) << __func__ << " no proxywrite_op found" << dendl; + return; + } + ProxyWriteOpRef pwop = p->second; + ceph_assert(tid == pwop->objecter_tid); + ceph_assert(oid == pwop->soid); + + proxywrite_ops.erase(tid); + + map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid); + if (q == in_progress_proxy_ops.end()) { + dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl; + delete pwop->ctx; + pwop->ctx = NULL; + return; + } + list<OpRequestRef>& in_progress_op = q->second; + ceph_assert(in_progress_op.size()); + list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(), + in_progress_op.end(), + pwop->op); + ceph_assert(it != in_progress_op.end()); + in_progress_op.erase(it); + if (in_progress_op.size() == 0) { + in_progress_proxy_ops.erase(oid); + } else if (std::find(in_progress_op.begin(), + in_progress_op.end(), + pwop->op) != in_progress_op.end()) { + if (pwop->ctx) + delete pwop->ctx; + pwop->ctx = NULL; + dout(20) << __func__ << " " << oid << " tid " << tid + << " in_progress_op size: " + << in_progress_op.size() << dendl; + return; + } + + osd->logger->inc(l_osd_tier_proxy_write); + + auto m = pwop->op->get_req<MOSDOp>(); + ceph_assert(m != NULL); + + if (!pwop->sent_reply) { + // send commit. + assert(pwop->ctx->reply == nullptr); + MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0, + true /* we claim it below */); + reply->set_reply_versions(eversion_t(), pwop->user_version); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + reply->claim_op_out_data(pwop->ops); + dout(10) << " sending commit on " << pwop << " " << reply << dendl; + osd->send_message_osd_client(reply, m->get_connection()); + pwop->sent_reply = true; + pwop->ctx->op->mark_commit_sent(); + } + + delete pwop->ctx; + pwop->ctx = NULL; +} + +void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop, + vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << " " << pwop->soid << dendl; + pwop->canceled = true; + + // cancel objecter op, if we can + if (pwop->objecter_tid) { + tids->push_back(pwop->objecter_tid); + delete pwop->ctx; + pwop->ctx = NULL; + proxywrite_ops.erase(pwop->objecter_tid); + pwop->objecter_tid = 0; + } +} + +class PromoteCallback: public PrimaryLogPG::CopyCallback { + ObjectContextRef obc; + PrimaryLogPG *pg; + utime_t start; +public: + PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_) + : obc(obc_), + pg(pg_), + start(ceph_clock_now()) {} + + void finish(PrimaryLogPG::CopyCallbackResults results) override { + PrimaryLogPG::CopyResults *results_data = results.get<1>(); + int r = results.get<0>(); + if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) { + pg->finish_promote_manifest(r, results_data, obc); + } else { + pg->finish_promote(r, results_data, obc); + } + pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start); + } +}; + +class PromoteManifestCallback: public PrimaryLogPG::CopyCallback { + ObjectContextRef obc; + PrimaryLogPG *pg; + utime_t start; + PrimaryLogPG::OpContext *ctx; + PrimaryLogPG::CopyCallbackResults promote_results; +public: + PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx) + : obc(obc_), + pg(pg_), + start(ceph_clock_now()), ctx(ctx) {} + + void finish(PrimaryLogPG::CopyCallbackResults results) override { + PrimaryLogPG::CopyResults *results_data = results.get<1>(); + int r = results.get<0>(); + promote_results = results; + if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) { + ctx->user_at_version = results_data->user_version; + } + if (r >= 0) { + ctx->pg->execute_ctx(ctx); + } else { + if (r != -ECANCELED) { + if (ctx->op) + ctx->pg->osd->reply_op_error(ctx->op, r); + } else if (results_data->should_requeue) { + if (ctx->op) + ctx->pg->requeue_op(ctx->op); + } + ctx->pg->close_op_ctx(ctx); + } + pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start); + } + friend struct PromoteFinisher; +}; + +struct PromoteFinisher : public PrimaryLogPG::OpFinisher { + PromoteManifestCallback *promote_callback; + + explicit PromoteFinisher(PromoteManifestCallback *promote_callback) + : promote_callback(promote_callback) { + } + + int execute() override { + if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) { + promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(), + promote_callback->promote_results.get<1>(), + promote_callback->obc); + } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) { + promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(), + promote_callback->promote_results.get<1>(), + promote_callback->obc); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + return 0; + } +}; + +void PrimaryLogPG::promote_object(ObjectContextRef obc, + const hobject_t& missing_oid, + const object_locator_t& oloc, + OpRequestRef op, + ObjectContextRef *promote_obc) +{ + hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid; + ceph_assert(hoid != hobject_t()); + if (m_scrubber->write_blocked_by_scrub(hoid)) { + dout(10) << __func__ << " " << hoid + << " blocked by scrub" << dendl; + if (op) { + waiting_for_scrub.push_back(op); + op->mark_delayed("waiting for scrub"); + dout(10) << __func__ << " " << hoid + << " placing op in waiting_for_scrub" << dendl; + } else { + dout(10) << __func__ << " " << hoid + << " no op, dropping on the floor" << dendl; + } + return; + } + if (op && !check_laggy_requeue(op)) { + return; + } + if (!obc) { // we need to create an ObjectContext + ceph_assert(missing_oid != hobject_t()); + obc = get_object_context(missing_oid, true); + } + if (promote_obc) + *promote_obc = obc; + + /* + * Before promote complete, if there are proxy-reads for the object, + * for this case we don't use DONTNEED. + */ + unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL; + map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid); + if (q == in_progress_proxy_ops.end()) { + src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED; + } + + CopyCallback *cb; + object_locator_t my_oloc; + hobject_t src_hoid; + if (!obc->obs.oi.has_manifest()) { + my_oloc = oloc; + my_oloc.pool = pool.info.tier_of; + src_hoid = obc->obs.oi.soid; + cb = new PromoteCallback(obc, this); + } else { + if (obc->obs.oi.manifest.is_chunked()) { + src_hoid = obc->obs.oi.soid; + cb = new PromoteCallback(obc, this); + } else if (obc->obs.oi.manifest.is_redirect()) { + object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target); + my_oloc = src_oloc; + src_hoid = obc->obs.oi.manifest.redirect_target; + cb = new PromoteCallback(obc, this); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } + + unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | + CEPH_OSD_COPY_FROM_FLAG_RWORDERED; + start_copy(cb, obc, src_hoid, my_oloc, 0, flags, + obc->obs.oi.soid.snap == CEPH_NOSNAP, + src_fadvise_flags, 0); + + ceph_assert(obc->is_blocked()); + + if (op) + wait_for_blocked_object(obc->obs.oi.soid, op); + + recovery_state.update_stats( + [](auto &history, auto &stats) { + stats.stats.sum.num_promote++; + return false; + }); +} + +void PrimaryLogPG::execute_ctx(OpContext *ctx) +{ + FUNCTRACE(cct); + dout(10) << __func__ << " " << ctx << dendl; + ctx->reset_obs(ctx->obc); + ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx + OpRequestRef op = ctx->op; + auto m = op->get_req<MOSDOp>(); + ObjectContextRef obc = ctx->obc; + const hobject_t& soid = obc->obs.oi.soid; + + // this method must be idempotent since we may call it several times + // before we finally apply the resulting transaction. + ctx->op_t.reset(new PGTransaction); + + if (op->may_write() || op->may_cache()) { + // snap + if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) && + pool.info.is_pool_snaps_mode()) { + // use pool's snapc + ctx->snapc = pool.snapc; + } else { + // client specified snapc + ctx->snapc.seq = m->get_snap_seq(); + ctx->snapc.snaps = m->get_snaps(); + filter_snapc(ctx->snapc.snaps); + } + if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) && + ctx->snapc.seq < obc->ssc->snapset.seq) { + dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq + << " < snapset seq " << obc->ssc->snapset.seq + << " on " << obc->obs.oi.soid << dendl; + reply_ctx(ctx, -EOLDSNAPC); + return; + } + + // version + ctx->at_version = get_next_version(); + ctx->mtime = m->get_mtime(); + + dout(10) << __func__ << " " << soid << " " << *ctx->ops + << " ov " << obc->obs.oi.version << " av " << ctx->at_version + << " snapc " << ctx->snapc + << " snapset " << obc->ssc->snapset + << dendl; + } else { + dout(10) << __func__ << " " << soid << " " << *ctx->ops + << " ov " << obc->obs.oi.version + << dendl; + } + + if (!ctx->user_at_version) + ctx->user_at_version = obc->obs.oi.user_version; + dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl; + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = ctx->op->get_reqid(); +#endif + tracepoint(osd, prepare_tx_enter, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + + int result = prepare_transaction(ctx); + + { +#ifdef WITH_LTTNG + osd_reqid_t reqid = ctx->op->get_reqid(); +#endif + tracepoint(osd, prepare_tx_exit, reqid.name._type, + reqid.name._num, reqid.tid, reqid.inc); + } + + bool pending_async_reads = !ctx->pending_async_reads.empty(); + if (result == -EINPROGRESS || pending_async_reads) { + // come back later. + if (pending_async_reads) { + ceph_assert(pool.info.is_erasure()); + in_progress_async_reads.push_back(make_pair(op, ctx)); + ctx->start_async_reads(this); + } + return; + } + + if (result == -EAGAIN) { + // clean up after the ctx + close_op_ctx(ctx); + return; + } + + bool ignore_out_data = false; + if (!ctx->op_t->empty() && + op->may_write() && + result >= 0) { + // successful update + if (ctx->op->allows_returnvec()) { + // enforce reasonable bound on the return buffer sizes + for (auto& i : *ctx->ops) { + if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) { + dout(10) << __func__ << " op " << i << " outdata overflow" << dendl; + result = -EOVERFLOW; // overall result is overflow + i.rval = -EOVERFLOW; + i.outdata.clear(); + } + } + } else { + // legacy behavior -- zero result and return data etc. + ignore_out_data = true; + result = 0; + } + } + + // prepare the reply + ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0, + ignore_out_data); + dout(20) << __func__ << " alloc reply " << ctx->reply + << " result " << result << dendl; + + // read or error? + if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) { + // finish side-effects + if (result >= 0) + do_osd_op_effects(ctx, m->get_connection()); + + complete_read_ctx(result, ctx); + return; + } + + ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version); + + ceph_assert(op->may_write() || op->may_cache()); + + // trim log? + recovery_state.update_trim_to(); + + // verify that we are doing this in order? + if (cct->_conf->osd_debug_op_order && m->get_source().is_client() && + !pool.info.is_tier() && !pool.info.has_tiers()) { + map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid]; + ceph_tid_t t = m->get_tid(); + client_t n = m->get_source().num(); + map<client_t,ceph_tid_t>::iterator p = cm.find(n); + if (p == cm.end()) { + dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl; + cm[n] = t; + } else { + dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl; + if (p->second > t) { + derr << "bad op order, already applied " << p->second << " > this " << t << dendl; + ceph_abort_msg("out of order op"); + } + p->second = t; + } + } + + if (ctx->update_log_only) { + if (result >= 0) + do_osd_op_effects(ctx, m->get_connection()); + + dout(20) << __func__ << " update_log_only -- result=" << result << dendl; + // save just what we need from ctx + MOSDOpReply *reply = ctx->reply; + ctx->reply = nullptr; + reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0); + + if (result == -ENOENT) { + reply->set_enoent_reply_versions(info.last_update, + info.last_user_version); + } + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + // append to pg log for dup detection - don't save buffers for now + record_write_error(op, soid, reply, result, + ctx->op->allows_returnvec() ? ctx : nullptr); + close_op_ctx(ctx); + return; + } + + // no need to capture PG ref, repop cancel will handle that + // Can capture the ctx by pointer, it's owned by the repop + ctx->register_on_commit( + [m, ctx, this](){ + if (ctx->op) + log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read); + + if (m && !ctx->sent_reply) { + MOSDOpReply *reply = ctx->reply; + ctx->reply = nullptr; + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + dout(10) << " sending reply on " << *m << " " << reply << dendl; + osd->send_message_osd_client(reply, m->get_connection()); + ctx->sent_reply = true; + ctx->op->mark_commit_sent(); + } + }); + ctx->register_on_success( + [ctx, this]() { + do_osd_op_effects( + ctx, + ctx->op ? ctx->op->get_req()->get_connection() : + ConnectionRef()); + }); + ctx->register_on_finish( + [ctx]() { + delete ctx; + }); + + // issue replica writes + ceph_tid_t rep_tid = osd->get_tid(); + + RepGather *repop = new_repop(ctx, rep_tid); + + issue_repop(repop, ctx); + eval_repop(repop); + repop->put(); +} + +void PrimaryLogPG::close_op_ctx(OpContext *ctx) { + release_object_locks(ctx->lock_manager); + + ctx->op_t.reset(); + + for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end(); + ctx->on_finish.erase(p++)) { + (*p)(); + } + delete ctx; +} + +void PrimaryLogPG::reply_ctx(OpContext *ctx, int r) +{ + if (ctx->op) + osd->reply_op_error(ctx->op, r); + close_op_ctx(ctx); +} + +void PrimaryLogPG::log_op_stats(const OpRequest& op, + const uint64_t inb, + const uint64_t outb) +{ + auto m = op.get_req<MOSDOp>(); + const utime_t now = ceph_clock_now(); + + const utime_t latency = now - m->get_recv_stamp(); + const utime_t process_latency = now - op.get_dequeued_time(); + + osd->logger->inc(l_osd_op); + + osd->logger->inc(l_osd_op_outb, outb); + osd->logger->inc(l_osd_op_inb, inb); + osd->logger->tinc(l_osd_op_lat, latency); + osd->logger->tinc(l_osd_op_process_lat, process_latency); + + if (op.may_read() && op.may_write()) { + osd->logger->inc(l_osd_op_rw); + osd->logger->inc(l_osd_op_rw_inb, inb); + osd->logger->inc(l_osd_op_rw_outb, outb); + osd->logger->tinc(l_osd_op_rw_lat, latency); + osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb); + osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb); + osd->logger->tinc(l_osd_op_rw_process_lat, process_latency); + } else if (op.may_read()) { + osd->logger->inc(l_osd_op_r); + osd->logger->inc(l_osd_op_r_outb, outb); + osd->logger->tinc(l_osd_op_r_lat, latency); + osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb); + osd->logger->tinc(l_osd_op_r_process_lat, process_latency); + } else if (op.may_write() || op.may_cache()) { + osd->logger->inc(l_osd_op_w); + osd->logger->inc(l_osd_op_w_inb, inb); + osd->logger->tinc(l_osd_op_w_lat, latency); + osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb); + osd->logger->tinc(l_osd_op_w_process_lat, process_latency); + } else { + ceph_abort(); + } + + dout(15) << "log_op_stats " << *m + << " inb " << inb + << " outb " << outb + << " lat " << latency << dendl; + + if (m_dynamic_perf_stats.is_enabled()) { + m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency); + } +} + +void PrimaryLogPG::set_dynamic_perf_stats_queries( + const std::list<OSDPerfMetricQuery> &queries) +{ + m_dynamic_perf_stats.set_queries(queries); +} + +void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats) +{ + std::swap(m_dynamic_perf_stats, *stats); +} + +void PrimaryLogPG::do_scan( + OpRequestRef op, + ThreadPool::TPHandle &handle) +{ + auto m = op->get_req<MOSDPGScan>(); + ceph_assert(m->get_type() == MSG_OSD_PG_SCAN); + dout(10) << "do_scan " << *m << dendl; + + op->mark_started(); + + switch (m->op) { + case MOSDPGScan::OP_SCAN_GET_DIGEST: + { + auto dpp = get_dpp(); + if (osd->check_backfill_full(dpp)) { + dout(1) << __func__ << ": Canceling backfill: Full." << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::BackfillTooFull()))); + return; + } + + BackfillInterval bi; + bi.begin = m->begin; + // No need to flush, there won't be any in progress writes occuring + // past m->begin + scan_range( + cct->_conf->osd_backfill_scan_min, + cct->_conf->osd_backfill_scan_max, + &bi, + handle); + MOSDPGScan *reply = new MOSDPGScan( + MOSDPGScan::OP_SCAN_DIGEST, + pg_whoami, + get_osdmap_epoch(), m->query_epoch, + spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end); + encode(bi.objects, reply->get_data()); + osd->send_message_osd_cluster(reply, m->get_connection()); + } + break; + + case MOSDPGScan::OP_SCAN_DIGEST: + { + pg_shard_t from = m->from; + + // Check that from is in backfill_targets vector + ceph_assert(is_backfill_target(from)); + + BackfillInterval& bi = peer_backfill_info[from]; + bi.begin = m->begin; + bi.end = m->end; + auto p = m->get_data().cbegin(); + + // take care to preserve ordering! + bi.clear_objects(); + decode_noclear(bi.objects, p); + dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end + << " bi.objects.size()=" << bi.objects.size() << dendl; + + if (waiting_on_backfill.erase(from)) { + if (waiting_on_backfill.empty()) { + ceph_assert( + peer_backfill_info.size() == + get_backfill_targets().size()); + finish_recovery_op(hobject_t::get_max()); + } + } else { + // we canceled backfill for a while due to a too full, and this + // is an extra response from a non-too-full peer + dout(20) << __func__ << " canceled backfill (too full?)" << dendl; + } + } + break; + } +} + +void PrimaryLogPG::do_backfill(OpRequestRef op) +{ + auto m = op->get_req<MOSDPGBackfill>(); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL); + dout(10) << "do_backfill " << *m << dendl; + + op->mark_started(); + + switch (m->op) { + case MOSDPGBackfill::OP_BACKFILL_FINISH: + { + ceph_assert(cct->_conf->osd_kill_backfill_at != 1); + + MOSDPGBackfill *reply = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + get_osdmap_epoch(), + m->query_epoch, + spg_t(info.pgid.pgid, get_primary().shard)); + reply->set_priority(recovery_state.get_recovery_op_priority()); + osd->send_message_osd_cluster(reply, m->get_connection()); + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + RecoveryDone()))); + } + // fall-thru + + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + { + ceph_assert(cct->_conf->osd_kill_backfill_at != 2); + + ObjectStore::Transaction t; + recovery_state.update_backfill_progress( + m->last_backfill, + m->stats, + m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS, + t); + + int tr = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(tr == 0); + } + break; + + case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK: + { + ceph_assert(is_primary()); + ceph_assert(cct->_conf->osd_kill_backfill_at != 3); + finish_recovery_op(hobject_t::get_max()); + } + break; + } +} + +void PrimaryLogPG::do_backfill_remove(OpRequestRef op) +{ + const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>( + op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE); + dout(7) << __func__ << " " << m->ls << dendl; + + op->mark_started(); + + ObjectStore::Transaction t; + for (auto& p : m->ls) { + if (is_remote_backfilling()) { + struct stat st; + int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN, + pg_whoami.shard) , &st); + if (r == 0) { + sub_local_num_bytes(st.st_size); + int64_t usersize; + if (pool.info.is_erasure()) { + bufferlist bv; + int r = osd->store->getattr( + ch, + ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + usersize = oi.size * pgbackend->get_ec_data_chunk_count(); + } else { + dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) + << " can't get object info" << dendl; + usersize = 0; + } + } else { + usersize = st.st_size; + } + sub_num_bytes(usersize); + dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) + << " sub actual data by " << st.st_size + << " sub num_bytes by " << usersize + << dendl; + } + } + remove_snap_mapped_object(t, p.first); + } + int r = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(r == 0); +} + +int PrimaryLogPG::trim_object( + bool first, const hobject_t &coid, snapid_t snap_to_trim, + PrimaryLogPG::OpContextUPtr *ctxp) +{ + *ctxp = NULL; + + // load clone info + bufferlist bl; + ObjectContextRef obc = get_object_context(coid, false, NULL); + if (!obc || !obc->ssc || !obc->ssc->exists) { + osd->clog->error() << __func__ << ": Can not trim " << coid + << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)"); + return -ENOENT; + } + + hobject_t head_oid = coid.get_head(); + ObjectContextRef head_obc = get_object_context(head_oid, false); + if (!head_obc) { + osd->clog->error() << __func__ << ": Can not trim " << coid + << " repair needed, no snapset obc for " << head_oid; + return -ENOENT; + } + + SnapSet& snapset = obc->ssc->snapset; + + object_info_t &coi = obc->obs.oi; + auto citer = snapset.clone_snaps.find(coid.snap); + if (citer == snapset.clone_snaps.end()) { + osd->clog->error() << "No clone_snaps in snapset " << snapset + << " for object " << coid << "\n"; + return -ENOENT; + } + set<snapid_t> old_snaps(citer->second.begin(), citer->second.end()); + if (old_snaps.empty()) { + osd->clog->error() << "No object info snaps for object " << coid; + return -ENOENT; + } + + dout(10) << coid << " old_snaps " << old_snaps + << " old snapset " << snapset << dendl; + if (snapset.seq == 0) { + osd->clog->error() << "No snapset.seq for object " << coid; + return -ENOENT; + } + + set<snapid_t> new_snaps; + const OSDMapRef& osdmap = get_osdmap(); + for (set<snapid_t>::iterator i = old_snaps.begin(); + i != old_snaps.end(); + ++i) { + if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) && + *i != snap_to_trim) { + new_snaps.insert(*i); + } + } + + vector<snapid_t>::iterator p = snapset.clones.end(); + + if (new_snaps.empty()) { + p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap); + if (p == snapset.clones.end()) { + osd->clog->error() << "Snap " << coid.snap << " not in clones"; + return -ENOENT; + } + } + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->head_obc = head_obc; + + if (!ctx->lock_manager.get_snaptrimmer_write( + coid, + obc, + first)) { + close_op_ctx(ctx.release()); + dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl; + return -ENOLCK; + } + + if (!ctx->lock_manager.get_snaptrimmer_write( + head_oid, + head_obc, + first)) { + close_op_ctx(ctx.release()); + dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl; + return -ENOLCK; + } + + ctx->at_version = get_next_version(); + + PGTransaction *t = ctx->op_t.get(); + + int64_t num_objects_before_trim = ctx->delta_stats.num_objects; + + if (new_snaps.empty()) { + // remove clone + dout(10) << coid << " snaps " << old_snaps << " -> " + << new_snaps << " ... deleting" << dendl; + + // ...from snapset + ceph_assert(p != snapset.clones.end()); + + snapid_t last = coid.snap; + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last); + + if (p != snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + vector<snapid_t>::iterator n = p - 1; + hobject_t prev_coid = coid; + prev_coid.snap = *n; + bool adjust_prev_bytes = is_present_clone(prev_coid); + + if (adjust_prev_bytes) + ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n); + + snapset.clone_overlap[*n].intersection_of( + snapset.clone_overlap[*p]); + + if (adjust_prev_bytes) + ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n); + } + ctx->delta_stats.num_objects--; + if (coi.is_dirty()) + ctx->delta_stats.num_objects_dirty--; + if (coi.is_omap()) + ctx->delta_stats.num_objects_omap--; + if (coi.is_whiteout()) { + dout(20) << __func__ << " trimming whiteout on " << coid << dendl; + ctx->delta_stats.num_whiteouts--; + } + ctx->delta_stats.num_object_clones--; + if (coi.is_cache_pinned()) + ctx->delta_stats.num_objects_pinned--; + if (coi.has_manifest()) { + dec_all_refcount_manifest(coi, ctx.get()); + ctx->delta_stats.num_objects_manifest--; + } + obc->obs.exists = false; + + snapset.clones.erase(p); + snapset.clone_overlap.erase(last); + snapset.clone_size.erase(last); + snapset.clone_snaps.erase(last); + + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::DELETE, + coid, + ctx->at_version, + ctx->obs->oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + t->remove(coid); + t->update_snaps( + coid, + old_snaps, + new_snaps); + + coi = object_info_t(coid); + + ctx->at_version.version++; + } else { + // save adjusted snaps for this object + dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl; + snapset.clone_snaps[coid.snap] = + vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend()); + // we still do a 'modify' event on this object just to trigger a + // snapmapper.update ... :( + + coi.prior_version = coi.version; + coi.version = ctx->at_version; + bl.clear(); + encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + t->setattr(coid, OI_ATTR, bl); + + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + coid, + coi.version, + coi.prior_version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + ctx->at_version.version++; + + t->update_snaps( + coid, + old_snaps, + new_snaps); + } + + // save head snapset + dout(10) << coid << " new snapset " << snapset << " on " + << head_obc->obs.oi << dendl; + if (snapset.clones.empty() && + (head_obc->obs.oi.is_whiteout() && + !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) && + !head_obc->obs.oi.is_cache_pinned())) { + // NOTE: this arguably constitutes minor interference with the + // tiering agent if this is a cache tier since a snap trim event + // is effectively evicting a whiteout we might otherwise want to + // keep around. + dout(10) << coid << " removing " << head_oid << dendl; + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::DELETE, + head_oid, + ctx->at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + dout(10) << "removing snap head" << dendl; + object_info_t& oi = head_obc->obs.oi; + ctx->delta_stats.num_objects--; + if (oi.is_dirty()) { + ctx->delta_stats.num_objects_dirty--; + } + if (oi.is_omap()) + ctx->delta_stats.num_objects_omap--; + if (oi.is_whiteout()) { + dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl; + ctx->delta_stats.num_whiteouts--; + } + if (oi.is_cache_pinned()) { + ctx->delta_stats.num_objects_pinned--; + } + if (oi.has_manifest()) { + ctx->delta_stats.num_objects_manifest--; + dec_all_refcount_manifest(oi, ctx.get()); + } + head_obc->obs.exists = false; + head_obc->obs.oi = object_info_t(head_oid); + t->remove(head_oid); + } else { + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + // filter SnapSet::snaps for the benefit of pre-octopus + // peers. This is perhaps overly conservative in that I'm not + // certain they need this, but let's be conservative here. + dout(10) << coid << " filtering snapset on " << head_oid << dendl; + snapset.filter(pool.info); + } else { + snapset.snaps.clear(); + } + dout(10) << coid << " writing updated snapset on " << head_oid + << ", snapset is " << snapset << dendl; + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + head_oid, + ctx->at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + + head_obc->obs.oi.prior_version = head_obc->obs.oi.version; + head_obc->obs.oi.version = ctx->at_version; + + map <string, bufferlist, less<>> attrs; + bl.clear(); + encode(snapset, bl); + attrs[SS_ATTR] = std::move(bl); + + bl.clear(); + encode(head_obc->obs.oi, bl, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR] = std::move(bl); + t->setattrs(head_oid, attrs); + } + + // Stats reporting - Set number of objects trimmed + if (num_objects_before_trim > ctx->delta_stats.num_objects) { + int64_t num_objects_trimmed = + num_objects_before_trim - ctx->delta_stats.num_objects; + add_objects_trimmed_count(num_objects_trimmed); + } + + *ctxp = std::move(ctx); + return 0; +} + +void PrimaryLogPG::kick_snap_trim() +{ + ceph_assert(is_active()); + ceph_assert(is_primary()); + if (is_clean() && + !state_test(PG_STATE_PREMERGE) && + !snap_trimq.empty()) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) { + dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl; + } else { + dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl; + reset_objects_trimmed(); + set_snaptrim_begin_stamp(); + snap_trimmer_machine.process_event(KickTrim()); + } + } +} + +void PrimaryLogPG::snap_trimmer_scrub_complete() +{ + if (is_primary() && is_active() && is_clean() && !snap_trimq.empty()) { + dout(10) << "scrub finished - requeuing snap_trimmer" << dendl; + snap_trimmer_machine.process_event(ScrubComplete()); + } +} + +void PrimaryLogPG::snap_trimmer(epoch_t queued) +{ + if (recovery_state.is_deleting() || pg_has_reset_since(queued)) { + return; + } + + ceph_assert(is_primary()); + + dout(10) << "snap_trimmer posting" << dendl; + snap_trimmer_machine.process_event(DoSnapWork()); + dout(10) << "snap_trimmer complete" << dendl; + return; +} + +namespace { + +template<typename U, typename V> +int do_cmp_xattr(int op, const U& lhs, const V& rhs) +{ + switch (op) { + case CEPH_OSD_CMPXATTR_OP_EQ: + return lhs == rhs; + case CEPH_OSD_CMPXATTR_OP_NE: + return lhs != rhs; + case CEPH_OSD_CMPXATTR_OP_GT: + return lhs > rhs; + case CEPH_OSD_CMPXATTR_OP_GTE: + return lhs >= rhs; + case CEPH_OSD_CMPXATTR_OP_LT: + return lhs < rhs; + case CEPH_OSD_CMPXATTR_OP_LTE: + return lhs <= rhs; + default: + return -EINVAL; + } +} + +} // anonymous namespace + +int PrimaryLogPG::do_xattr_cmp_u64(int op, uint64_t v1, bufferlist& xattr) +{ + uint64_t v2; + + if (xattr.length()) { + const char* first = xattr.c_str(); + if (auto [p, ec] = std::from_chars(first, first + xattr.length(), v2); + ec != std::errc()) { + return -EINVAL; + } + } else { + v2 = 0; + } + dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl; + return do_cmp_xattr(op, v1, v2); +} + +int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr) +{ + string_view v2s(xattr.c_str(), xattr.length()); + dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl; + return do_cmp_xattr(op, v1s, v2s); +} + +int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op) +{ + ceph_osd_op& op = osd_op.op; + vector<OSDOp> write_ops(1); + OSDOp& write_op = write_ops[0]; + uint64_t write_length = op.writesame.length; + int result = 0; + + if (!write_length) + return 0; + + if (!op.writesame.data_length || write_length % op.writesame.data_length) + return -EINVAL; + + if (op.writesame.data_length != osd_op.indata.length()) { + derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl; + return -EINVAL; + } + + while (write_length) { + write_op.indata.append(osd_op.indata); + write_length -= op.writesame.data_length; + } + + write_op.op.op = CEPH_OSD_OP_WRITE; + write_op.op.extent.offset = op.writesame.offset; + write_op.op.extent.length = op.writesame.length; + result = do_osd_ops(ctx, write_ops); + if (result < 0) + derr << "do_writesame do_osd_ops failed " << result << dendl; + + return result; +} + +// ======================================================================== +// low level osd ops + +int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags) +{ + dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl; + bufferlist header, vals; + int r = _get_tmap(ctx, &header, &vals); + if (r < 0) { + if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK)) + r = 0; + return r; + } + + vector<OSDOp> ops(3); + + ops[0].op.op = CEPH_OSD_OP_TRUNCATE; + ops[0].op.extent.offset = 0; + ops[0].op.extent.length = 0; + + ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER; + ops[1].indata = std::move(header); + + ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS; + ops[2].indata = std::move(vals); + + return do_osd_ops(ctx, ops); +} + +int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp, + OSDOp& osd_op, bufferlist& bl) +{ + // decode + bufferlist header; + map<string, bufferlist> m; + if (bl.length()) { + auto p = bl.cbegin(); + decode(header, p); + decode(m, p); + ceph_assert(p.end()); + } + + // do the update(s) + while (!bp.end()) { + __u8 op; + string key; + decode(op, bp); + + switch (op) { + case CEPH_OSD_TMAP_SET: // insert key + { + decode(key, bp); + bufferlist data; + decode(data, bp); + m[key] = data; + } + break; + case CEPH_OSD_TMAP_RM: // remove key + decode(key, bp); + if (!m.count(key)) { + return -ENOENT; + } + m.erase(key); + break; + case CEPH_OSD_TMAP_RMSLOPPY: // remove key + decode(key, bp); + m.erase(key); + break; + case CEPH_OSD_TMAP_HDR: // update header + { + decode(header, bp); + } + break; + default: + return -EINVAL; + } + } + + // reencode + bufferlist obl; + encode(header, obl); + encode(m, obl); + + // write it out + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = obl.length(); + newop.indata = obl; + do_osd_ops(ctx, nops); + return 0; +} + +int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op) +{ + bufferlist::const_iterator orig_bp = bp; + int result = 0; + if (bp.end()) { + dout(10) << "tmapup is a no-op" << dendl; + } else { + // read the whole object + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_READ; + newop.op.extent.offset = 0; + newop.op.extent.length = 0; + result = do_osd_ops(ctx, nops); + + dout(10) << "tmapup read " << newop.outdata.length() << dendl; + + dout(30) << " starting is \n"; + newop.outdata.hexdump(*_dout); + *_dout << dendl; + + auto ip = newop.outdata.cbegin(); + bufferlist obl; + + dout(30) << "the update command is: \n"; + osd_op.indata.hexdump(*_dout); + *_dout << dendl; + + // header + bufferlist header; + __u32 nkeys = 0; + if (newop.outdata.length()) { + decode(header, ip); + decode(nkeys, ip); + } + dout(10) << "tmapup header " << header.length() << dendl; + + if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) { + ++bp; + decode(header, bp); + dout(10) << "tmapup new header " << header.length() << dendl; + } + + encode(header, obl); + + dout(20) << "tmapup initial nkeys " << nkeys << dendl; + + // update keys + bufferlist newkeydata; + string nextkey, last_in_key; + bufferlist nextval; + bool have_next = false; + if (!ip.end()) { + have_next = true; + decode(nextkey, ip); + decode(nextval, ip); + } + while (!bp.end() && !result) { + __u8 op; + string key; + try { + decode(op, bp); + decode(key, bp); + } + catch (ceph::buffer::error& e) { + return -EINVAL; + } + if (key < last_in_key) { + dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key + << "', falling back to an inefficient (unsorted) update" << dendl; + bp = orig_bp; + return do_tmapup_slow(ctx, bp, osd_op, newop.outdata); + } + last_in_key = key; + + dout(10) << "tmapup op " << (int)op << " key " << key << dendl; + + // skip existing intervening keys + bool key_exists = false; + while (have_next && !key_exists) { + dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl; + if (nextkey > key) + break; + if (nextkey < key) { + // copy untouched. + encode(nextkey, newkeydata); + encode(nextval, newkeydata); + dout(20) << " keep " << nextkey << " " << nextval.length() << dendl; + } else { + // don't copy; discard old value. and stop. + dout(20) << " drop " << nextkey << " " << nextval.length() << dendl; + key_exists = true; + nkeys--; + } + if (!ip.end()) { + decode(nextkey, ip); + decode(nextval, ip); + } else { + have_next = false; + } + } + + if (op == CEPH_OSD_TMAP_SET) { + bufferlist val; + try { + decode(val, bp); + } + catch (ceph::buffer::error& e) { + return -EINVAL; + } + encode(key, newkeydata); + encode(val, newkeydata); + dout(20) << " set " << key << " " << val.length() << dendl; + nkeys++; + } else if (op == CEPH_OSD_TMAP_CREATE) { + if (key_exists) { + return -EEXIST; + } + bufferlist val; + try { + decode(val, bp); + } + catch (ceph::buffer::error& e) { + return -EINVAL; + } + encode(key, newkeydata); + encode(val, newkeydata); + dout(20) << " create " << key << " " << val.length() << dendl; + nkeys++; + } else if (op == CEPH_OSD_TMAP_RM) { + // do nothing. + if (!key_exists) { + return -ENOENT; + } + } else if (op == CEPH_OSD_TMAP_RMSLOPPY) { + // do nothing + } else { + dout(10) << " invalid tmap op " << (int)op << dendl; + return -EINVAL; + } + } + + // copy remaining + if (have_next) { + encode(nextkey, newkeydata); + encode(nextval, newkeydata); + dout(20) << " keep " << nextkey << " " << nextval.length() << dendl; + } + if (!ip.end()) { + bufferlist rest; + rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off()); + dout(20) << " keep trailing " << rest.length() + << " at " << newkeydata.length() << dendl; + newkeydata.claim_append(rest); + } + + // encode final key count + key data + dout(20) << "tmapup final nkeys " << nkeys << dendl; + encode(nkeys, obl); + obl.claim_append(newkeydata); + + if (0) { + dout(30) << " final is \n"; + obl.hexdump(*_dout); + *_dout << dendl; + + // sanity check + auto tp = obl.cbegin(); + bufferlist h; + decode(h, tp); + map<string,bufferlist> d; + decode(d, tp); + ceph_assert(tp.end()); + dout(0) << " **** debug sanity check, looks ok ****" << dendl; + } + + // write it out + if (!result) { + dout(20) << "tmapput write " << obl.length() << dendl; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = obl.length(); + newop.indata = obl; + do_osd_ops(ctx, nops); + } + } + return result; +} + +static int check_offset_and_length(uint64_t offset, uint64_t length, + uint64_t max, DoutPrefixProvider *dpp) +{ + if (offset >= max || + length > max || + offset + length > max) { + ldpp_dout(dpp, 10) << __func__ << " " + << "osd_max_object_size: " << max + << "; Hard limit of object size is 4GB." << dendl; + return -EFBIG; + } + + return 0; +} + +struct FillInVerifyExtent : public Context { + ceph_le64 *r; + int32_t *rval; + bufferlist *outdatap; + std::optional<uint32_t> maybe_crc; + uint64_t size; + OSDService *osd; + hobject_t soid; + uint32_t flags; + FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp, + std::optional<uint32_t> mc, uint64_t size, + OSDService *osd, hobject_t soid, uint32_t flags) : + r(r), rval(rv), outdatap(blp), maybe_crc(mc), + size(size), osd(osd), soid(soid), flags(flags) {} + void finish(int len) override { + if (len < 0) { + *rval = len; + return; + } + *r = len; + *rval = 0; + + // whole object? can we verify the checksum? + if (maybe_crc && *r == size) { + uint32_t crc = outdatap->crc32c(-1); + if (maybe_crc != crc) { + osd->clog->error() << std::hex << " full-object read crc 0x" << crc + << " != expected 0x" << *maybe_crc + << std::dec << " on " << soid; + if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) { + *rval = -EIO; + *r = 0; + } + } + } + } +}; + +struct ToSparseReadResult : public Context { + int* result; + bufferlist* data_bl; + uint64_t data_offset; + ceph_le64* len; + ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset, + ceph_le64* len) + : result(result), data_bl(bl), data_offset(offset),len(len) {} + void finish(int r) override { + if (r < 0) { + *result = r; + return; + } + *result = 0; + *len = r; + bufferlist outdata; + map<uint64_t, uint64_t> extents = {{data_offset, r}}; + encode(extents, outdata); + encode_destructively(*data_bl, outdata); + data_bl->swap(outdata); + } +}; + +template<typename V> +static string list_keys(const map<string, V>& m) { + string s; + for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) { + if (!s.empty()) { + s.push_back(','); + } + s.append(itr->first); + } + return s; +} + +template<typename T> +static string list_entries(const T& m) { + string s; + for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) { + if (!s.empty()) { + s.push_back(','); + } + s.append(*itr); + } + return s; +} + +void PrimaryLogPG::maybe_create_new_object( + OpContext *ctx, + bool ignore_transaction) +{ + ObjectState& obs = ctx->new_obs; + if (!obs.exists) { + ctx->delta_stats.num_objects++; + obs.exists = true; + ceph_assert(!obs.oi.is_whiteout()); + obs.oi.new_object(); + if (!ignore_transaction) + ctx->op_t->create(obs.oi.soid); + } else if (obs.oi.is_whiteout()) { + dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + --ctx->delta_stats.num_whiteouts; + } +} + +struct ReadFinisher : public PrimaryLogPG::OpFinisher { + OSDOp& osd_op; + + explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) { + } + + int execute() override { + return osd_op.rval; + } +}; + +struct C_ChecksumRead : public Context { + PrimaryLogPG *primary_log_pg; + OSDOp &osd_op; + Checksummer::CSumType csum_type; + bufferlist init_value_bl; + ceph_le64 read_length; + bufferlist read_bl; + Context *fill_extent_ctx; + + C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op, + Checksummer::CSumType csum_type, bufferlist &&init_value_bl, + std::optional<uint32_t> maybe_crc, uint64_t size, + OSDService *osd, hobject_t soid, uint32_t flags) + : primary_log_pg(primary_log_pg), osd_op(osd_op), + csum_type(csum_type), init_value_bl(std::move(init_value_bl)), + fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval, + &read_bl, maybe_crc, size, + osd, soid, flags)) { + } + ~C_ChecksumRead() override { + delete fill_extent_ctx; + } + + void finish(int r) override { + fill_extent_ctx->complete(r); + fill_extent_ctx = nullptr; + + if (osd_op.rval >= 0) { + bufferlist::const_iterator init_value_bl_it = init_value_bl.begin(); + osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type, + &init_value_bl_it, read_bl); + } + } +}; + +int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op, + bufferlist::const_iterator *bl_it) +{ + dout(20) << __func__ << dendl; + + auto& op = osd_op.op; + if (op.checksum.chunk_size > 0) { + if (op.checksum.length == 0) { + dout(10) << __func__ << ": length required when chunk size provided" + << dendl; + return -EINVAL; + } + if (op.checksum.length % op.checksum.chunk_size != 0) { + dout(10) << __func__ << ": length not aligned to chunk size" << dendl; + return -EINVAL; + } + } + + auto& oi = ctx->new_obs.oi; + if (op.checksum.offset == 0 && op.checksum.length == 0) { + // zeroed offset+length implies checksum whole object + op.checksum.length = oi.size; + } else if (op.checksum.offset >= oi.size) { + // read size was trimmed to zero, do nothing + // see PrimaryLogPG::do_read + return 0; + } else if (op.extent.offset + op.extent.length > oi.size) { + op.extent.length = oi.size - op.extent.offset; + if (op.checksum.chunk_size > 0 && + op.checksum.length % op.checksum.chunk_size != 0) { + dout(10) << __func__ << ": length (trimmed to 0x" + << std::hex << op.checksum.length + << ") not aligned to chunk size 0x" + << op.checksum.chunk_size << std::dec + << dendl; + return -EINVAL; + } + } + + Checksummer::CSumType csum_type; + switch (op.checksum.type) { + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32: + csum_type = Checksummer::CSUM_XXHASH32; + break; + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64: + csum_type = Checksummer::CSUM_XXHASH64; + break; + case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C: + csum_type = Checksummer::CSUM_CRC32C; + break; + default: + dout(10) << __func__ << ": unknown crc type (" + << static_cast<uint32_t>(op.checksum.type) << ")" << dendl; + return -EINVAL; + } + + size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type); + if (bl_it->get_remaining() < csum_init_value_size) { + dout(10) << __func__ << ": init value not provided" << dendl; + return -EINVAL; + } + + bufferlist init_value_bl; + init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(), + csum_init_value_size); + *bl_it += csum_init_value_size; + + if (pool.info.is_erasure() && op.checksum.length > 0) { + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. + std::optional<uint32_t> maybe_crc; + if (oi.is_data_digest() && op.checksum.offset == 0 && + op.checksum.length >= oi.size) { + maybe_crc = oi.data_digest; + } + + // async read + auto& soid = oi.soid; + auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type, + std::move(init_value_bl), maybe_crc, + oi.size, osd, soid, op.flags); + + ctx->pending_async_reads.push_back({ + {op.checksum.offset, op.checksum.length, op.flags}, + {&checksum_ctx->read_bl, checksum_ctx}}); + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + return -EINPROGRESS; + } + + // sync read + std::vector<OSDOp> read_ops(1); + auto& read_op = read_ops[0]; + if (op.checksum.length > 0) { + read_op.op.op = CEPH_OSD_OP_READ; + read_op.op.flags = op.flags; + read_op.op.extent.offset = op.checksum.offset; + read_op.op.extent.length = op.checksum.length; + read_op.op.extent.truncate_size = 0; + read_op.op.extent.truncate_seq = 0; + + int r = do_osd_ops(ctx, read_ops); + if (r < 0) { + derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl; + return r; + } + } + + bufferlist::const_iterator init_value_bl_it = init_value_bl.begin(); + return finish_checksum(osd_op, csum_type, &init_value_bl_it, + read_op.outdata); +} + +int PrimaryLogPG::finish_checksum(OSDOp& osd_op, + Checksummer::CSumType csum_type, + bufferlist::const_iterator *init_value_bl_it, + const bufferlist &read_bl) { + dout(20) << __func__ << dendl; + + auto& op = osd_op.op; + + if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) { + derr << __func__ << ": bytes read " << read_bl.length() << " != " + << op.checksum.length << dendl; + return -EINVAL; + } + + size_t csum_chunk_size = (op.checksum.chunk_size != 0 ? + op.checksum.chunk_size : read_bl.length()); + uint32_t csum_count = (csum_chunk_size > 0 ? + read_bl.length() / csum_chunk_size : 0); + + bufferlist csum; + bufferptr csum_data; + if (csum_count > 0) { + size_t csum_value_size = Checksummer::get_csum_value_size(csum_type); + csum_data = ceph::buffer::create(csum_value_size * csum_count); + csum_data.zero(); + csum.append(csum_data); + + switch (csum_type) { + case Checksummer::CSUM_XXHASH32: + { + Checksummer::xxhash32::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate<Checksummer::xxhash32>( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + case Checksummer::CSUM_XXHASH64: + { + Checksummer::xxhash64::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate<Checksummer::xxhash64>( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + case Checksummer::CSUM_CRC32C: + { + Checksummer::crc32c::init_value_t init_value; + decode(init_value, *init_value_bl_it); + Checksummer::calculate<Checksummer::crc32c>( + init_value, csum_chunk_size, 0, read_bl.length(), read_bl, + &csum_data); + } + break; + default: + break; + } + } + + encode(csum_count, osd_op.outdata); + osd_op.outdata.claim_append(csum); + return 0; +} + +struct C_ExtentCmpRead : public Context { + PrimaryLogPG *primary_log_pg; + OSDOp &osd_op; + ceph_le64 read_length{}; + bufferlist read_bl; + Context *fill_extent_ctx; + + C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op, + std::optional<uint32_t> maybe_crc, uint64_t size, + OSDService *osd, hobject_t soid, uint32_t flags) + : primary_log_pg(primary_log_pg), osd_op(osd_op), + fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval, + &read_bl, maybe_crc, size, + osd, soid, flags)) { + } + ~C_ExtentCmpRead() override { + delete fill_extent_ctx; + } + + void finish(int r) override { + if (r == -ENOENT) { + osd_op.rval = 0; + read_bl.clear(); + delete fill_extent_ctx; + } else { + fill_extent_ctx->complete(r); + } + fill_extent_ctx = nullptr; + + if (osd_op.rval >= 0) { + osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl); + } + } +}; + +int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op) +{ + dout(20) << __func__ << dendl; + ceph_osd_op& op = osd_op.op; + + auto& oi = ctx->new_obs.oi; + uint64_t size = oi.size; + if ((oi.truncate_seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size)) { + size = op.extent.truncate_size; + } + + if (op.extent.offset >= size) { + op.extent.length = 0; + } else if (op.extent.offset + op.extent.length > size) { + op.extent.length = size - op.extent.offset; + } + + if (op.extent.length == 0) { + dout(20) << __func__ << " zero length extent" << dendl; + return finish_extent_cmp(osd_op, bufferlist{}); + } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) { + dout(20) << __func__ << " object DNE" << dendl; + return finish_extent_cmp(osd_op, {}); + } else if (pool.info.is_erasure()) { + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. + std::optional<uint32_t> maybe_crc; + if (oi.is_data_digest() && op.checksum.offset == 0 && + op.checksum.length >= oi.size) { + maybe_crc = oi.data_digest; + } + + // async read + auto& soid = oi.soid; + auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size, + osd, soid, op.flags); + ctx->pending_async_reads.push_back({ + {op.extent.offset, op.extent.length, op.flags}, + {&extent_cmp_ctx->read_bl, extent_cmp_ctx}}); + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + return -EINPROGRESS; + } + + // sync read + vector<OSDOp> read_ops(1); + OSDOp& read_op = read_ops[0]; + + read_op.op.op = CEPH_OSD_OP_SYNC_READ; + read_op.op.extent.offset = op.extent.offset; + read_op.op.extent.length = op.extent.length; + read_op.op.extent.truncate_seq = op.extent.truncate_seq; + read_op.op.extent.truncate_size = op.extent.truncate_size; + + int result = do_osd_ops(ctx, read_ops); + if (result < 0) { + derr << __func__ << " failed " << result << dendl; + return result; + } + return finish_extent_cmp(osd_op, read_op.outdata); +} + +int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl) +{ + for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) { + char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0); + if (osd_op.indata[idx] != read_byte) { + return (-MAX_ERRNO - idx); + } + } + + return 0; +} + +int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) { + dout(20) << __func__ << dendl; + auto& op = osd_op.op; + auto& oi = ctx->new_obs.oi; + auto& soid = oi.soid; + __u32 seq = oi.truncate_seq; + uint64_t size = oi.size; + bool trimmed_read = false; + + dout(30) << __func__ << " oi.size: " << oi.size << dendl; + dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl; + dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl; + dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl; + + // are we beyond truncate_size? + if ( (seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size) && + (size > op.extent.truncate_size) ) + size = op.extent.truncate_size; + + if (op.extent.length == 0) //length is zero mean read the whole object + op.extent.length = size; + + if (op.extent.offset >= size) { + op.extent.length = 0; + trimmed_read = true; + } else if (op.extent.offset + op.extent.length > size) { + op.extent.length = size - op.extent.offset; + trimmed_read = true; + } + + dout(30) << __func__ << "op.extent.length is now " << op.extent.length << dendl; + + // read into a buffer + int result = 0; + if (trimmed_read && op.extent.length == 0) { + // read size was trimmed to zero and it is expected to do nothing + // a read operation of 0 bytes does *not* do nothing, this is why + // the trimmed_read boolean is needed + } else if (pool.info.is_erasure()) { + // The initialisation below is required to silence a false positive + // -Wmaybe-uninitialized warning + std::optional<uint32_t> maybe_crc; + // If there is a data digest and it is possible we are reading + // entire object, pass the digest. FillInVerifyExtent will + // will check the oi.size again. + if (oi.is_data_digest() && op.extent.offset == 0 && + op.extent.length >= oi.size) + maybe_crc = oi.data_digest; + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(op.extent.offset, op.extent.length, op.flags), + make_pair(&osd_op.outdata, + new FillInVerifyExtent(&op.extent.length, &osd_op.rval, + &osd_op.outdata, maybe_crc, oi.size, + osd, soid, op.flags)))); + dout(10) << " async_read noted for " << soid << dendl; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + } else { + int r = pgbackend->objects_read_sync( + soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata); + // whole object? can we verify the checksum? + if (r >= 0 && op.extent.offset == 0 && + (uint64_t)r == oi.size && oi.is_data_digest()) { + uint32_t crc = osd_op.outdata.crc32c(-1); + if (oi.data_digest != crc) { + osd->clog->error() << info.pgid << std::hex + << " full-object read crc 0x" << crc + << " != expected 0x" << oi.data_digest + << std::dec << " on " << soid; + r = -EIO; // try repair later + } + } + if (r == -EIO) { + r = rep_repair_primary_object(soid, ctx); + } + if (r >= 0) + op.extent.length = r; + else if (r == -EAGAIN) { + result = -EAGAIN; + } else { + result = r; + op.extent.length = 0; + } + dout(10) << " read got " << r << " / " << op.extent.length + << " bytes from obj " << soid << dendl; + } + if (result >= 0) { + ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10); + ctx->delta_stats.num_rd++; + } + return result; +} + +int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { + dout(20) << __func__ << dendl; + auto& op = osd_op.op; + auto& oi = ctx->new_obs.oi; + auto& soid = oi.soid; + uint64_t size = oi.size; + uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + + // are we beyond truncate_size? + if ((oi.truncate_seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size) && + (size > op.extent.truncate_size)) { + size = op.extent.truncate_size; + } + + if (offset > size) { + length = 0; + } else if (offset + length > size) { + length = size - offset; + } + + ++ctx->num_read; + if (pool.info.is_erasure()) { + // translate sparse read to a normal one if not supported + + if (length > 0) { + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(offset, length, op.flags), + make_pair( + &osd_op.outdata, + new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset, + &op.extent.length)))); + dout(10) << " async_read (was sparse_read) noted for " << soid << dendl; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + } else { + dout(10) << " sparse read ended up empty for " << soid << dendl; + map<uint64_t, uint64_t> extents; + encode(extents, osd_op.outdata); + } + } else { + // read into a buffer + map<uint64_t, uint64_t> m; + int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN, + info.pgid.shard), + offset, length, m); + if (r < 0) { + return r; + } + + bufferlist data_bl; + r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl); + if (r == -EIO) { + r = rep_repair_primary_object(soid, ctx); + } + if (r < 0) { + return r; + } + + // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read. + // Maybe at first, there is no much whole objects. With continued use, more + // and more whole object exist. So from this point, for spare-read add + // checksum make sense. + if ((uint64_t)r == oi.size && oi.is_data_digest()) { + uint32_t crc = data_bl.crc32c(-1); + if (oi.data_digest != crc) { + osd->clog->error() << info.pgid << std::hex + << " full-object read crc 0x" << crc + << " != expected 0x" << oi.data_digest + << std::dec << " on " << soid; + r = rep_repair_primary_object(soid, ctx); + if (r < 0) { + return r; + } + } + } + + op.extent.length = r; + + encode(m, osd_op.outdata); // re-encode since it might be modified + ::encode_destructively(data_bl, osd_op.outdata); + + dout(10) << " sparse_read got " << r << " bytes from object " + << soid << dendl; + } + + ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10); + ctx->delta_stats.num_rd++; + return 0; +} + +int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) +{ + int result = 0; + SnapSetContext *ssc = ctx->obc->ssc; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + const bool skip_data_digest = osd->store->has_builtin_csum() && + osd->osd_skip_data_digest; + + PGTransaction* t = ctx->op_t.get(); + + dout(10) << "do_osd_op " << soid << " " << ops << dendl; + + ctx->current_osd_subop_num = 0; + for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) { + OSDOp& osd_op = *p; + ceph_osd_op& op = osd_op.op; + + OpFinisher* op_finisher = nullptr; + { + auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num); + if (op_finisher_it != ctx->op_finishers.end()) { + op_finisher = op_finisher_it->second.get(); + } + } + + // TODO: check endianness (ceph_le32 vs uint32_t, etc.) + // The fields in ceph_osd_op are little-endian (according to the definition in rados.h), + // but the code in this function seems to treat them as native-endian. What should the + // tracepoints do? + tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags); + + dout(10) << "do_osd_op " << osd_op << dendl; + + auto bp = osd_op.indata.cbegin(); + + // user-visible modifcation? + switch (op.op) { + // non user-visible modifications + case CEPH_OSD_OP_WATCH: + case CEPH_OSD_OP_CACHE_EVICT: + case CEPH_OSD_OP_CACHE_FLUSH: + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + case CEPH_OSD_OP_UNDIRTY: + case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly + case CEPH_OSD_OP_COPY_FROM2: + case CEPH_OSD_OP_CACHE_PIN: + case CEPH_OSD_OP_CACHE_UNPIN: + case CEPH_OSD_OP_SET_REDIRECT: + case CEPH_OSD_OP_SET_CHUNK: + case CEPH_OSD_OP_TIER_PROMOTE: + case CEPH_OSD_OP_TIER_FLUSH: + case CEPH_OSD_OP_TIER_EVICT: + break; + default: + if (op.op & CEPH_OSD_OP_MODE_WR) + ctx->user_modify = true; + } + + // munge -1 truncate to 0 truncate + if (ceph_osd_op_uses_extent(op.op) && + op.extent.truncate_seq == 1 && + op.extent.truncate_size == (-1ULL)) { + op.extent.truncate_size = 0; + op.extent.truncate_seq = 0; + } + + // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes) + if (op.op == CEPH_OSD_OP_ZERO && + obs.exists && + op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) && + op.extent.length >= 1 && + op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) && + op.extent.offset + op.extent.length >= oi.size) { + if (op.extent.offset >= oi.size) { + // no-op + goto fail; + } + dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length + << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl; + op.op = CEPH_OSD_OP_TRUNCATE; + } + + switch (op.op) { + + // --- READS --- + + case CEPH_OSD_OP_CMPEXT: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + + if (op_finisher == nullptr) { + result = do_extent_cmp(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_SYNC_READ: + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + // fall through + case CEPH_OSD_OP_READ: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + if (op_finisher == nullptr) { + if (!ctx->data_off) { + ctx->data_off = op.extent.offset; + } + result = do_read(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_CHECKSUM: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type, + op.checksum.offset, op.checksum.length, + op.checksum.chunk_size); + + if (op_finisher == nullptr) { + result = do_checksum(ctx, osd_op, &bp); + } else { + result = op_finisher->execute(); + } + } + break; + + /* map extents */ + case CEPH_OSD_OP_MAPEXT: + tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_read; + { + // read into a buffer + bufferlist bl; + int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN, + info.pgid.shard), + op.extent.offset, op.extent.length, bl); + osd_op.outdata = std::move(bl); + if (r < 0) + result = r; + else + ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + ctx->delta_stats.num_rd++; + dout(10) << " map_extents done on object " << soid << dendl; + } + break; + + /* map extents */ + case CEPH_OSD_OP_SPARSE_READ: + tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), + soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, + op.extent.length, op.extent.truncate_size, + op.extent.truncate_seq); + if (op_finisher == nullptr) { + result = do_sparse_read(ctx, osd_op); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_CALL: + { + string cname, mname; + bufferlist indata; + try { + bp.copy(op.cls.class_len, cname); + bp.copy(op.cls.method_len, mname); + bp.copy(op.cls.indata_len, indata); + } catch (ceph::buffer::error& e) { + dout(10) << "call unable to decode class + method + indata" << dendl; + dout(30) << "in dump: "; + osd_op.indata.hexdump(*_dout); + *_dout << dendl; + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???"); + break; + } + tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str()); + + ClassHandler::ClassData *cls; + result = ClassHandler::get_instance().open_class(cname, &cls); + ceph_assert(result == 0); // init_op_flags() already verified this works. + + ClassHandler::ClassMethod *method = cls->get_method(mname); + if (!method) { + dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl; + result = -EOPNOTSUPP; + break; + } + + int flags = method->get_flags(); + if (flags & CLS_METHOD_WR) + ctx->user_modify = true; + + bufferlist outdata; + dout(10) << "call method " << cname << "." << mname << dendl; + int prev_rd = ctx->num_read; + int prev_wr = ctx->num_write; + result = method->exec((cls_method_context_t)&ctx, indata, outdata); + + if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) { + derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl; + result = -EIO; + break; + } + if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) { + derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl; + result = -EIO; + break; + } + + dout(10) << "method called response length=" << outdata.length() << dendl; + op.extent.length = outdata.length(); + osd_op.outdata.claim_append(outdata); + dout(30) << "out dump: "; + osd_op.outdata.hexdump(*_dout); + *_dout << dendl; + } + break; + + case CEPH_OSD_OP_STAT: + // note: stat does not require RD + { + tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val); + + if (obs.exists && !oi.is_whiteout()) { + encode(oi.size, osd_op.outdata); + encode(oi.mtime, osd_op.outdata); + dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl; + } else { + result = -ENOENT; + dout(10) << "stat oi object does not exist" << dendl; + } + + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_ISDIRTY: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val); + bool is_dirty = obs.oi.is_dirty(); + encode(is_dirty, osd_op.outdata); + ctx->delta_stats.num_rd++; + result = 0; + } + break; + + case CEPH_OSD_OP_UNDIRTY: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val); + if (oi.is_dirty()) { + ctx->undirty = true; // see make_writeable() + ctx->modify = true; + ctx->delta_stats.num_wr++; + } + } + break; + + case CEPH_OSD_OP_CACHE_TRY_FLUSH: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val); + if (ctx->lock_type != RWState::RWNONE) { + dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl; + result = -EINVAL; + break; + } + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + if (oi.is_dirty()) { + result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + } + break; + + case CEPH_OSD_OP_CACHE_FLUSH: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val); + if (ctx->lock_type == RWState::RWNONE) { + dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl; + result = -EINVAL; + break; + } + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + hobject_t missing; + if (oi.is_dirty()) { + result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + // Check special return value which has set missing_return + if (result == -ENOENT) { + dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl; + ceph_assert(!missing.is_min()); + wait_for_unreadable_object(missing, ctx->op); + // Error code which is used elsewhere when wait_for_unreadable_object() is used + result = -EAGAIN; + } + } + break; + + case CEPH_OSD_OP_CACHE_EVICT: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = 0; + break; + } + if (oi.is_cache_pinned()) { + dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl; + result = -EPERM; + break; + } + if (oi.is_dirty()) { + result = -EBUSY; + break; + } + if (!oi.watchers.empty()) { + result = -EBUSY; + break; + } + if (soid.snap == CEPH_NOSNAP) { + result = _verify_no_head_clones(soid, ssc->snapset); + if (result < 0) + break; + } + result = _delete_oid(ctx, true, false); + if (result >= 0) { + // mark that this is a cache eviction to avoid triggering normal + // make_writeable() clone creation in finish_ctx() + ctx->cache_operation = true; + } + osd->logger->inc(l_osd_tier_evict); + } + break; + + case CEPH_OSD_OP_GETXATTR: + ++ctx->num_read; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + int r = getattr_maybe_cache( + ctx->obc, + name, + &(osd_op.outdata)); + if (r >= 0) { + op.xattr.value_len = osd_op.outdata.length(); + result = 0; + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + } else + result = r; + + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_GETXATTRS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val); + map<string, bufferlist,less<>> out; + result = getattrs_maybe_cache( + ctx->obc, + &out); + + bufferlist bl; + encode(out, bl); + ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + ctx->delta_stats.num_rd++; + osd_op.outdata.claim_append(bl); + } + break; + + case CEPH_OSD_OP_CMPXATTR: + ++ctx->num_read; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + name[op.xattr.name_len + 1] = 0; + + bufferlist xattr; + result = getattr_maybe_cache( + ctx->obc, + name, + &xattr); + if (result < 0 && result != -EEXIST && result != -ENODATA) + break; + + ctx->delta_stats.num_rd++; + ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10); + + switch (op.xattr.cmp_mode) { + case CEPH_OSD_CMPXATTR_MODE_STRING: + { + string val; + bp.copy(op.xattr.value_len, val); + val[op.xattr.value_len] = 0; + dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val + << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl; + result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr); + } + break; + + case CEPH_OSD_CMPXATTR_MODE_U64: + { + uint64_t u64val; + try { + decode(u64val, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val + << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl; + result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr); + } + break; + + default: + dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl; + result = -EINVAL; + } + + if (!result) { + dout(10) << "comparison returned false" << dendl; + result = -ECANCELED; + break; + } + if (result < 0) { + dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl; + break; + } + + dout(10) << "comparison returned true" << dendl; + } + break; + + case CEPH_OSD_OP_ASSERT_VER: + ++ctx->num_read; + { + uint64_t ver = op.assert_ver.ver; + tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver); + if (!ver) { + result = -EINVAL; + } else if (ver < oi.user_version) { + result = -ERANGE; + } else if (ver > oi.user_version) { + result = -EOVERFLOW; + } + } + break; + + case CEPH_OSD_OP_LIST_WATCHERS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val); + obj_list_watch_response_t resp; + + map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter; + for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end(); + ++oi_iter) { + dout(20) << "key cookie=" << oi_iter->first.first + << " entity=" << oi_iter->first.second << " " + << oi_iter->second << dendl; + ceph_assert(oi_iter->first.first == oi_iter->second.cookie); + ceph_assert(oi_iter->first.second.is_client()); + + watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie, + oi_iter->second.timeout_seconds, oi_iter->second.addr); + resp.entries.push_back(wi); + } + + resp.encode(osd_op.outdata, ctx->get_features()); + result = 0; + + ctx->delta_stats.num_rd++; + break; + } + + case CEPH_OSD_OP_LIST_SNAPS: + ++ctx->num_read; + { + tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val); + obj_list_snap_response_t resp; + + if (!ssc) { + ssc = ctx->obc->ssc = get_snapset_context(soid, false); + } + ceph_assert(ssc); + dout(20) << " snapset " << ssc->snapset << dendl; + + int clonecount = ssc->snapset.clones.size(); + clonecount++; // for head + resp.clones.reserve(clonecount); + for (auto clone_iter = ssc->snapset.clones.begin(); + clone_iter != ssc->snapset.clones.end(); ++clone_iter) { + clone_info ci; + ci.cloneid = *clone_iter; + + hobject_t clone_oid = soid; + clone_oid.snap = *clone_iter; + + auto p = ssc->snapset.clone_snaps.find(*clone_iter); + if (p == ssc->snapset.clone_snaps.end()) { + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_snaps found for oid " + << soid << " clone " << *clone_iter + << " snapset " << ssc->snapset; + result = -EINVAL; + break; + } + for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) { + ci.snaps.push_back(*q); + } + + dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl; + + map<snapid_t, interval_set<uint64_t> >::const_iterator coi; + coi = ssc->snapset.clone_overlap.find(ci.cloneid); + if (coi == ssc->snapset.clone_overlap.end()) { + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_overlap found for oid " + << soid << " clone " << *clone_iter; + result = -EINVAL; + break; + } + const interval_set<uint64_t> &o = coi->second; + ci.overlap.reserve(o.num_intervals()); + for (interval_set<uint64_t>::const_iterator r = o.begin(); + r != o.end(); ++r) { + ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(), + r.get_len())); + } + + map<snapid_t, uint64_t>::const_iterator si; + si = ssc->snapset.clone_size.find(ci.cloneid); + if (si == ssc->snapset.clone_size.end()) { + osd->clog->error() << "osd." << osd->whoami + << ": inconsistent clone_size found for oid " + << soid << " clone " << *clone_iter; + result = -EINVAL; + break; + } + ci.size = si->second; + + resp.clones.push_back(ci); + } + if (result < 0) { + break; + } + if (!ctx->obc->obs.oi.is_whiteout()) { + ceph_assert(obs.exists); + clone_info ci; + ci.cloneid = CEPH_NOSNAP; + + //Size for HEAD is oi.size + ci.size = oi.size; + + resp.clones.push_back(ci); + } + resp.seq = ssc->snapset.seq; + + resp.encode(osd_op.outdata); + result = 0; + + ctx->delta_stats.num_rd++; + break; + } + + case CEPH_OSD_OP_NOTIFY: + ++ctx->num_read; + { + uint32_t timeout; + bufferlist bl; + + try { + uint32_t ver; // obsolete + decode(ver, bp); + decode(timeout, bp); + decode(bl, bp); + } catch (const ceph::buffer::error &e) { + timeout = 0; + } + tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout); + if (!timeout) + timeout = cct->_conf->osd_default_notify_timeout; + + notify_info_t n; + n.timeout = timeout; + n.notify_id = osd->get_next_id(get_osdmap_epoch()); + n.cookie = op.notify.cookie; + n.bl = bl; + ctx->notifies.push_back(n); + + // return our unique notify id to the client + encode(n.notify_id, osd_op.outdata); + } + break; + + case CEPH_OSD_OP_NOTIFY_ACK: + ++ctx->num_read; + { + try { + uint64_t notify_id = 0; + uint64_t watch_cookie = 0; + decode(notify_id, bp); + decode(watch_cookie, bp); + bufferlist reply_bl; + if (!bp.end()) { + decode(reply_bl, bp); + } + tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y"); + OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl); + ctx->notify_acks.push_back(ack); + } catch (const ceph::buffer::error &e) { + tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N"); + OpContext::NotifyAck ack( + // op.watch.cookie is actually the notify_id for historical reasons + op.watch.cookie + ); + ctx->notify_acks.push_back(ack); + } + } + break; + + case CEPH_OSD_OP_SETALLOCHINT: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size); + maybe_create_new_object(ctx); + oi.expected_object_size = op.alloc_hint.expected_object_size; + oi.expected_write_size = op.alloc_hint.expected_write_size; + oi.alloc_hint_flags = op.alloc_hint.flags; + t->set_alloc_hint(soid, op.alloc_hint.expected_object_size, + op.alloc_hint.expected_write_size, + op.alloc_hint.flags); + } + break; + + + // --- WRITES --- + + // -- object data -- + + case CEPH_OSD_OP_WRITE: + ++ctx->num_write; + result = 0; + { // write + __u32 seq = oi.truncate_seq; + tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + if (op.extent.length != osd_op.indata.length()) { + result = -EINVAL; + break; + } + + if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED)) + op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + if (pool.info.requires_aligned_append() && + (op.extent.offset % pool.info.required_alignment() != 0)) { + result = -EOPNOTSUPP; + break; + } + + if (!obs.exists) { + if (pool.info.requires_aligned_append() && op.extent.offset) { + result = -EOPNOTSUPP; + break; + } + } else if (op.extent.offset != oi.size && + pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + + if (seq && (seq > op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > oi.size)) { + // old write, arrived after trimtrunc + op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset); + dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq + << ", adjusting write length to " << op.extent.length << dendl; + bufferlist t; + t.substr_of(osd_op.indata, 0, op.extent.length); + osd_op.indata.swap(t); + } + if (op.extent.truncate_seq > seq) { + // write arrives before trimtrunc + if (obs.exists && !oi.is_whiteout()) { + dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq + << ", truncating to " << op.extent.truncate_size << dendl; + t->truncate(soid, op.extent.truncate_size); + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + if (oi.size > op.extent.truncate_size) { + interval_set<uint64_t> trim; + trim.insert(op.extent.truncate_size, + oi.size - op.extent.truncate_size); + ctx->modified_ranges.union_of(trim); + ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size); + oi.clear_data_digest(); + } + if (op.extent.truncate_size != oi.size) { + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.truncate_size); + } + } else { + dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq + << ", but object is new" << dendl; + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + } + } + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + maybe_create_new_object(ctx); + + if (op.extent.length == 0) { + if (op.extent.offset > oi.size) { + t->truncate( + soid, op.extent.offset); + truncate_update_size_and_usage(ctx->delta_stats, oi, + op.extent.offset); + } else { + t->nop(soid); + } + } else { + t->write( + soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags); + } + + if (op.extent.offset == 0 && op.extent.length >= oi.size + && !skip_data_digest) { + obs.oi.set_data_digest(osd_op.indata.crc32c(-1)); + } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) { + if (skip_data_digest) { + obs.oi.clear_data_digest(); + } else { + obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest)); + } + } else { + obs.oi.clear_data_digest(); + } + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + op.extent.offset, op.extent.length); + ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length); + dout(10) << "clean_regions modified" << ctx->clean_regions << dendl; + } + break; + + case CEPH_OSD_OP_WRITEFULL: + ++ctx->num_write; + result = 0; + { // write full object + tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length); + + if (op.extent.length != osd_op.indata.length()) { + result = -EINVAL; + break; + } + result = check_offset_and_length( + 0, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED)) + op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + maybe_create_new_object(ctx); + if (pool.info.is_erasure()) { + t->truncate(soid, 0); + } else if (obs.exists && op.extent.length < oi.size) { + t->truncate(soid, op.extent.length); + } + if (op.extent.length) { + t->write(soid, 0, op.extent.length, osd_op.indata, op.flags); + } + if (!skip_data_digest) { + obs.oi.set_data_digest(osd_op.indata.crc32c(-1)); + } else { + obs.oi.clear_data_digest(); + } + ctx->clean_regions.mark_data_region_dirty(0, + std::max((uint64_t)op.extent.length, oi.size)); + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + 0, op.extent.length, true); + } + break; + + case CEPH_OSD_OP_WRITESAME: + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length); + result = do_writesame(ctx, osd_op); + break; + + case CEPH_OSD_OP_ROLLBACK : + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val); + result = _rollback_to(ctx, osd_op); + break; + + case CEPH_OSD_OP_ZERO: + tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length); + if (pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + { // zero + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + if (op.extent.length && obs.exists && !oi.is_whiteout()) { + t->zero(soid, op.extent.offset, op.extent.length); + interval_set<uint64_t> ch; + ch.insert(op.extent.offset, op.extent.length); + ctx->modified_ranges.union_of(ch); + ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length); + ctx->delta_stats.num_wr++; + oi.clear_data_digest(); + } else { + // no-op + } + } + break; + case CEPH_OSD_OP_CREATE: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val); + if (obs.exists && !oi.is_whiteout() && + (op.flags & CEPH_OSD_OP_FLAG_EXCL)) { + result = -EEXIST; /* this is an exclusive create */ + } else { + if (osd_op.indata.length()) { + auto p = osd_op.indata.cbegin(); + string category; + try { + decode(category, p); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + // category is no longer implemented. + } + maybe_create_new_object(ctx); + t->nop(soid); + } + } + break; + + case CEPH_OSD_OP_TRIMTRUNC: + op.extent.offset = op.extent.truncate_size; + // falling through + + case CEPH_OSD_OP_TRUNCATE: + tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + if (pool.info.requires_aligned_append()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + // truncate + if (!obs.exists || oi.is_whiteout()) { + dout(10) << " object dne, truncate is a no-op" << dendl; + break; + } + + result = check_offset_and_length( + op.extent.offset, op.extent.length, + static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + if (result < 0) + break; + + if (op.extent.truncate_seq) { + ceph_assert(op.extent.offset == op.extent.truncate_size); + if (op.extent.truncate_seq <= oi.truncate_seq) { + dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq + << ", no-op" << dendl; + break; // old + } + dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq + << ", truncating" << dendl; + oi.truncate_seq = op.extent.truncate_seq; + oi.truncate_size = op.extent.truncate_size; + } + + maybe_create_new_object(ctx); + t->truncate(soid, op.extent.offset); + if (oi.size > op.extent.offset) { + interval_set<uint64_t> trim; + trim.insert(op.extent.offset, oi.size-op.extent.offset); + ctx->modified_ranges.union_of(trim); + ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset); + } else if (oi.size < op.extent.offset) { + ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size); + } + if (op.extent.offset != oi.size) { + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.offset); + } + ctx->delta_stats.num_wr++; + // do no set exists, or we will break above DELETE -> TRUNCATE munging. + + oi.clear_data_digest(); + } + break; + + case CEPH_OSD_OP_DELETE: + ++ctx->num_write; + result = 0; + tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val); + { + result = _delete_oid(ctx, false, ctx->ignore_cache); + } + break; + + case CEPH_OSD_OP_WATCH: + ++ctx->num_write; + result = 0; + { + tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val, + op.watch.cookie, op.watch.op); + if (!obs.exists) { + result = -ENOENT; + break; + } + result = 0; + uint64_t cookie = op.watch.cookie; + entity_name_t entity = ctx->reqid.name; + ObjectContextRef obc = ctx->obc; + + dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op) + << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie + << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl; + dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl; + dout(10) << "watch: peer_addr=" + << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl; + + uint32_t timeout = cct->_conf->osd_client_watch_timeout; + if (op.watch.timeout != 0) { + timeout = op.watch.timeout; + } + + watch_info_t w(cookie, timeout, + ctx->op->get_req()->get_connection()->get_peer_addr()); + if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH || + op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) { + if (oi.watchers.count(make_pair(cookie, entity))) { + dout(10) << " found existing watch " << w << " by " << entity << dendl; + } else { + dout(10) << " registered new watch " << w << " by " << entity << dendl; + oi.watchers[make_pair(cookie, entity)] = w; + t->nop(soid); // make sure update the object_info on disk! + } + bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH); + ctx->watch_connects.push_back(make_pair(w, will_ping)); + } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) { + if (!oi.watchers.count(make_pair(cookie, entity))) { + result = -ENOTCONN; + break; + } + dout(10) << " found existing watch " << w << " by " << entity << dendl; + ctx->watch_connects.push_back(make_pair(w, true)); + } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) { + /* Note: WATCH with PING doesn't cause may_write() to return true, + * so if there is nothing else in the transaction, this is going + * to run do_osd_op_effects, but not write out a log entry */ + if (!oi.watchers.count(make_pair(cookie, entity))) { + result = -ENOTCONN; + break; + } + map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p = + obc->watchers.find(make_pair(cookie, entity)); + if (p == obc->watchers.end() || + !p->second->is_connected()) { + // client needs to reconnect + result = -ETIMEDOUT; + break; + } + dout(10) << " found existing watch " << w << " by " << entity << dendl; + p->second->got_ping(ceph_clock_now()); + result = 0; + } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) { + map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter = + oi.watchers.find(make_pair(cookie, entity)); + if (oi_iter != oi.watchers.end()) { + dout(10) << " removed watch " << oi_iter->second << " by " + << entity << dendl; + oi.watchers.erase(oi_iter); + t->nop(soid); // update oi on disk + ctx->watch_disconnects.push_back( + watch_disconnect_t(cookie, entity, false)); + } else { + dout(10) << " can't remove: no watch by " << entity << dendl; + } + } + } + break; + + case CEPH_OSD_OP_CACHE_PIN: + tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val); + if ((!pool.info.is_tier() || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) { + result = -EINVAL; + dout(10) << " pin object is only allowed on the cache tier " << dendl; + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + + if (!oi.is_cache_pinned()) { + oi.set_flag(object_info_t::FLAG_CACHE_PIN); + ctx->modify = true; + ctx->delta_stats.num_objects_pinned++; + ctx->delta_stats.num_wr++; + } + } + break; + + case CEPH_OSD_OP_CACHE_UNPIN: + tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val); + if ((!pool.info.is_tier() || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) { + result = -EINVAL; + dout(10) << " pin object is only allowed on the cache tier " << dendl; + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + + if (oi.is_cache_pinned()) { + oi.clear_flag(object_info_t::FLAG_CACHE_PIN); + ctx->modify = true; + ctx->delta_stats.num_objects_pinned--; + ctx->delta_stats.num_wr++; + } + } + break; + + case CEPH_OSD_OP_SET_REDIRECT: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + + object_t target_name; + object_locator_t target_oloc; + snapid_t target_snapid = (uint64_t)op.copy_from.snapid; + version_t target_version = op.copy_from.src_version; + try { + decode(target_name, bp); + decode(target_oloc, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + pg_t raw_pg; + result = get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg); + if (result < 0) { + dout(5) << " pool information is invalid: " << result << dendl; + break; + } + hobject_t target(target_name, target_oloc.key, target_snapid, + raw_pg.ps(), raw_pg.pool(), + target_oloc.nspace); + if (target == soid) { + dout(20) << " set-redirect self is invalid" << dendl; + result = -EINVAL; + break; + } + + bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE); + bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + if (has_reference) { + result = -EINVAL; + dout(5) << " the object is already a manifest " << dendl; + break; + } + if (op_finisher == nullptr && need_reference) { + // start + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new SetManifestFinisher(osd_op)); + ManifestOpRef mop = std::make_shared<ManifestOp>(ctx->obc, new RefCountCallback(ctx, osd_op)); + auto* fin = new C_SetManifestRefCountDone(this, soid, 0); + ceph_tid_t tid = refcount_manifest(soid, target, + refcount_t::INCREMENT_REF, fin, std::nullopt); + fin->tid = tid; + mop->num_chunks++; + mop->tids[0] = tid; + manifest_ops[soid] = mop; + ctx->obc->start_block(); + result = -EINPROGRESS; + } else { + // finish + if (op_finisher) { + result = op_finisher->execute(); + ceph_assert(result == 0); + } + + if (!oi.has_manifest() && !oi.manifest.is_redirect()) + ctx->delta_stats.num_objects_manifest++; + + oi.set_flag(object_info_t::FLAG_MANIFEST); + oi.manifest.redirect_target = target; + oi.manifest.type = object_manifest_t::TYPE_REDIRECT; + t->truncate(soid, 0); + ctx->clean_regions.mark_data_region_dirty(0, oi.size); + if (oi.is_omap() && pool.info.supports_omap()) { + t->omap_clear(soid); + obs.oi.clear_omap_digest(); + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + ctx->clean_regions.mark_omap_dirty(); + } + write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, + 0, oi.size, false); + ctx->delta_stats.num_bytes -= oi.size; + oi.size = 0; + oi.new_object(); + oi.user_version = target_version; + ctx->user_at_version = target_version; + /* rm_attrs */ + map<string,bufferlist,less<>> rmattrs; + result = getattrs_maybe_cache(ctx->obc, &rmattrs); + if (result < 0) { + dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl; + return result; + } + map<string, bufferlist>::iterator iter; + for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) { + const string& name = iter->first; + t->rmattr(soid, name); + } + if (!has_reference && need_reference) { + oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + } + dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl; + if (op_finisher) { + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + } + + break; + + case CEPH_OSD_OP_SET_CHUNK: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + if (oi.manifest.is_redirect()) { + result = -EINVAL; + goto fail; + } + + object_locator_t tgt_oloc; + uint64_t src_offset, src_length, tgt_offset; + object_t tgt_name; + try { + decode(src_offset, bp); + decode(src_length, bp); + decode(tgt_oloc, bp); + decode(tgt_name, bp); + decode(tgt_offset, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + + if (!src_length) { + result = -EINVAL; + goto fail; + } + if (src_offset + src_length > oi.size) { + result = -ERANGE; + goto fail; + } + if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) { + result = -EOPNOTSUPP; + break; + } + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + + for (auto &p : oi.manifest.chunk_map) { + interval_set<uint64_t> chunk; + chunk.insert(p.first, p.second.length); + if (chunk.intersects(src_offset, src_length)) { + dout(20) << __func__ << " overlapped !! offset: " << src_offset << " length: " << src_length + << " chunk_info: " << p << dendl; + result = -EOPNOTSUPP; + goto fail; + } + } + + pg_t raw_pg; + chunk_info_t chunk_info; + result = get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg); + if (result < 0) { + dout(5) << " pool information is invalid: " << result << dendl; + break; + } + hobject_t target(tgt_name, tgt_oloc.key, snapid_t(), + raw_pg.ps(), raw_pg.pool(), + tgt_oloc.nspace); + bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) && + (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE)); + if (has_reference) { + result = -EINVAL; + dout(5) << " the object is already a manifest " << dendl; + break; + } + chunk_info.oid = target; + chunk_info.offset = tgt_offset; + chunk_info.length = src_length; + if (op_finisher == nullptr) { + // start + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new SetManifestFinisher(osd_op)); + object_manifest_t set_chunk; + bool need_inc_ref = false; + set_chunk.chunk_map[src_offset] = chunk_info; + need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op); + if (need_inc_ref) { + result = -EINPROGRESS; + break; + } + } + if (op_finisher) { + result = op_finisher->execute(); + ceph_assert(result == 0); + } + + oi.manifest.chunk_map[src_offset] = chunk_info; + if (!oi.has_manifest() && !oi.manifest.is_chunked()) + ctx->delta_stats.num_objects_manifest++; + oi.set_flag(object_info_t::FLAG_MANIFEST); + oi.manifest.type = object_manifest_t::TYPE_CHUNKED; + if (!has_reference) { + oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE); + } + ctx->modify = true; + ctx->cache_operation = true; + + dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version + << " chunk_info: " << chunk_info << dendl; + if (op_finisher) { + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + + break; + + case CEPH_OSD_OP_TIER_PROMOTE: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + if (!obs.oi.has_manifest()) { + result = 0; + break; + } + + if (op_finisher == nullptr) { + PromoteManifestCallback *cb; + object_locator_t my_oloc; + hobject_t src_hoid; + + if (obs.oi.manifest.is_chunked()) { + src_hoid = obs.oi.soid; + } else if (obs.oi.manifest.is_redirect()) { + object_locator_t src_oloc(obs.oi.manifest.redirect_target); + my_oloc = src_oloc; + src_hoid = obs.oi.manifest.redirect_target; + } else { + ceph_abort_msg("unrecognized manifest type"); + } + cb = new PromoteManifestCallback(ctx->obc, this, ctx); + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new PromoteFinisher(cb)); + unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | + CEPH_OSD_COPY_FROM_FLAG_RWORDERED; + unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL; + start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags, + obs.oi.soid.snap == CEPH_NOSNAP, + src_fadvise_flags, 0); + + dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl; + result = -EINPROGRESS; + } else { + result = op_finisher->execute(); + ceph_assert(result == 0); + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + + break; + + case CEPH_OSD_OP_TIER_FLUSH: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + result = -EOPNOTSUPP; + break; + } + + if (oi.is_dirty() || !obs.oi.has_manifest()) { + result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt, true); + if (result == -EINPROGRESS) + result = -EAGAIN; + } else { + result = 0; + } + } + + break; + + case CEPH_OSD_OP_TIER_EVICT: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + result = -EOPNOTSUPP; + break; + } + if (!obs.oi.has_manifest()) { + result = -EINVAL; + break; + } + + // The chunks already has a reference, so it is just enough to invoke truncate if necessary + for (auto &p : obs.oi.manifest.chunk_map) { + p.second.set_flag(chunk_info_t::FLAG_MISSING); + // punch hole + t->zero(soid, p.first, p.second.length); + interval_set<uint64_t> ch; + ch.insert(p.first, p.second.length); + ctx->modified_ranges.union_of(ch); + ctx->clean_regions.mark_data_region_dirty(p.first, p.second.length); + } + oi.clear_data_digest(); + ctx->delta_stats.num_wr++; + ctx->cache_operation = true; + ctx->undirty = true; + osd->logger->inc(l_osd_tier_evict); + } + + break; + + case CEPH_OSD_OP_UNSET_MANIFEST: + ++ctx->num_write; + result = 0; + { + if (pool.info.is_tier()) { + result = -EINVAL; + break; + } + if (!obs.exists) { + result = -ENOENT; + break; + } + if (!oi.has_manifest()) { + result = -EOPNOTSUPP; + break; + } + if (get_osdmap()->require_osd_release < ceph_release_t::luminous) { + result = -EOPNOTSUPP; + break; + } + + dec_all_refcount_manifest(oi, ctx); + + oi.clear_flag(object_info_t::FLAG_MANIFEST); + oi.manifest = object_manifest_t(); + ctx->delta_stats.num_objects_manifest--; + ctx->delta_stats.num_wr++; + ctx->modify = true; + } + + break; + + // -- object attrs -- + + case CEPH_OSD_OP_SETXATTR: + ++ctx->num_write; + result = 0; + { + if (cct->_conf->osd_max_attr_size > 0 && + op.xattr.value_len > cct->_conf->osd_max_attr_size) { + tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???"); + result = -EFBIG; + break; + } + unsigned max_name_len = + std::min<uint64_t>(osd->store->get_max_attr_name_length(), + cct->_conf->osd_max_attr_name_len); + if (op.xattr.name_len > max_name_len) { + result = -ENAMETOOLONG; + break; + } + maybe_create_new_object(ctx); + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + string name = "_" + aname; + bufferlist bl; + bp.copy(op.xattr.value_len, bl); + t->setattr(soid, name, bl); + ctx->delta_stats.num_wr++; + } + break; + + case CEPH_OSD_OP_RMXATTR: + ++ctx->num_write; + result = 0; + { + string aname; + bp.copy(op.xattr.name_len, aname); + tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str()); + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + string name = "_" + aname; + t->rmattr(soid, name); + ctx->delta_stats.num_wr++; + } + break; + + + // -- fancy writers -- + case CEPH_OSD_OP_APPEND: + { + tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + // just do it inline; this works because we are happy to execute + // fancy op on replicas as well. + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITE; + newop.op.extent.offset = oi.size; + newop.op.extent.length = op.extent.length; + newop.op.extent.truncate_seq = oi.truncate_seq; + newop.indata = osd_op.indata; + result = do_osd_ops(ctx, nops); + osd_op.outdata = std::move(newop.outdata); + } + break; + + case CEPH_OSD_OP_STARTSYNC: + result = 0; + t->nop(soid); + break; + + // -- trivial map -- + case CEPH_OSD_OP_TMAPGET: + tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + { + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_SYNC_READ; + newop.op.extent.offset = 0; + newop.op.extent.length = 0; + result = do_osd_ops(ctx, nops); + osd_op.outdata = std::move(newop.outdata); + } + break; + + case CEPH_OSD_OP_TMAPPUT: + tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + { + //_dout_lock.Lock(); + //osd_op.data.hexdump(*_dout); + //_dout_lock.Unlock(); + + // verify sort order + bool unsorted = false; + if (true) { + bufferlist header; + decode(header, bp); + uint32_t n; + decode(n, bp); + string last_key; + while (n--) { + string key; + decode(key, bp); + dout(10) << "tmapput key " << key << dendl; + bufferlist val; + decode(val, bp); + if (key < last_key) { + dout(10) << "TMAPPUT is unordered; resorting" << dendl; + unsorted = true; + break; + } + last_key = key; + } + } + + // write it + vector<OSDOp> nops(1); + OSDOp& newop = nops[0]; + newop.op.op = CEPH_OSD_OP_WRITEFULL; + newop.op.extent.offset = 0; + newop.op.extent.length = osd_op.indata.length(); + newop.indata = osd_op.indata; + + if (unsorted) { + bp = osd_op.indata.begin(); + bufferlist header; + map<string, bufferlist> m; + decode(header, bp); + decode(m, bp); + ceph_assert(bp.end()); + bufferlist newbl; + encode(header, newbl); + encode(m, newbl); + newop.indata = newbl; + } + result = do_osd_ops(ctx, nops); + ceph_assert(result == 0); + } + break; + + case CEPH_OSD_OP_TMAPUP: + tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val); + if (pool.info.is_erasure()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = do_tmapup(ctx, bp, osd_op); + break; + + case CEPH_OSD_OP_TMAP2OMAP: + ++ctx->num_write; + tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val); + result = do_tmap2omap(ctx, op.tmap2omap.flags); + break; + + // OMAP Read ops + case CEPH_OSD_OP_OMAPGETKEYS: + ++ctx->num_read; + { + string start_after; + uint64_t max_return; + try { + decode(start_after, bp); + decode(max_return, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0); + goto fail; + } + if (max_return > cct->_conf->osd_max_omap_entries_per_request) { + max_return = cct->_conf->osd_max_omap_entries_per_request; + } + tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return); + + bufferlist bl; + uint32_t num = 0; + bool truncated = false; + if (oi.is_omap()) { + ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( + ch, ghobject_t(soid) + ); + ceph_assert(iter); + iter->upper_bound(start_after); + for (num = 0; iter->valid(); ++num, iter->next()) { + if (num >= max_return || + bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(iter->key(), bl); + } + } // else return empty out_set + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(bl); + encode(truncated, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETVALS: + ++ctx->num_read; + { + string start_after; + uint64_t max_return; + string filter_prefix; + try { + decode(start_after, bp); + decode(max_return, bp); + decode(filter_prefix, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???"); + goto fail; + } + if (max_return > cct->_conf->osd_max_omap_entries_per_request) { + max_return = cct->_conf->osd_max_omap_entries_per_request; + } + tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str()); + + uint32_t num = 0; + bool truncated = false; + bufferlist bl; + if (oi.is_omap()) { + ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( + ch, ghobject_t(soid) + ); + if (!iter) { + result = -ENOENT; + goto fail; + } + iter->upper_bound(start_after); + if (filter_prefix > start_after) iter->lower_bound(filter_prefix); + for (num = 0; + iter->valid() && + iter->key().substr(0, filter_prefix.size()) == filter_prefix; + ++num, iter->next()) { + dout(20) << "Found key " << iter->key() << dendl; + if (num >= max_return || + bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(iter->key(), bl); + encode(iter->value(), bl); + } + } // else return empty out_set + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(bl); + encode(truncated, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETHEADER: + tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val); + if (!oi.is_omap()) { + // return empty header + break; + } + ++ctx->num_read; + { + osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + ++ctx->num_read; + { + set<string> keys_to_get; + try { + decode(keys_to_get, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???"); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str()); + map<string, bufferlist> out; + if (oi.is_omap()) { + osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out); + } // else return empty omap entries + encode(out, osd_op.outdata); + ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + ctx->delta_stats.num_rd++; + } + break; + + case CEPH_OSD_OP_OMAP_CMP: + ++ctx->num_read; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???"); + break; + } + map<string, pair<bufferlist, int> > assertions; + try { + decode(assertions, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???"); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str()); + + map<string, bufferlist> out; + + if (oi.is_omap()) { + set<string> to_get; + for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin(); + i != assertions.end(); + ++i) + to_get.insert(i->first); + int r = osd->store->omap_get_values(ch, ghobject_t(soid), + to_get, &out); + if (r < 0) { + result = r; + break; + } + } // else leave out empty + + //Should set num_rd_kb based on encode length of map + ctx->delta_stats.num_rd++; + + int r = 0; + bufferlist empty; + for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin(); + i != assertions.end(); + ++i) { + auto out_entry = out.find(i->first); + bufferlist &bl = (out_entry != out.end()) ? + out_entry->second : empty; + switch (i->second.second) { + case CEPH_OSD_CMPXATTR_OP_EQ: + if (!(bl == i->second.first)) { + r = -ECANCELED; + } + break; + case CEPH_OSD_CMPXATTR_OP_LT: + if (!(bl < i->second.first)) { + r = -ECANCELED; + } + break; + case CEPH_OSD_CMPXATTR_OP_GT: + if (!(bl > i->second.first)) { + r = -ECANCELED; + } + break; + default: + r = -EINVAL; + break; + } + if (r < 0) + break; + } + if (r < 0) { + result = r; + } + } + break; + + // OMAP Write ops + case CEPH_OSD_OP_OMAPSETVALS: + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + break; + } + ++ctx->num_write; + result = 0; + { + maybe_create_new_object(ctx); + bufferlist to_set_bl; + try { + decode_str_str_map_to_bl(bp, &to_set_bl); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val); + if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) { + dout(20) << "setting vals: " << dendl; + map<string,bufferlist> to_set; + bufferlist::const_iterator pt = to_set_bl.begin(); + decode(to_set, pt); + for (map<string, bufferlist>::iterator i = to_set.begin(); + i != to_set.end(); + ++i) { + dout(20) << "\t" << i->first << dendl; + } + } + t->omap_setkeys(soid, to_set_bl); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10); + } + obs.oi.set_flag(object_info_t::FLAG_OMAP); + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPSETHEADER: + tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + maybe_create_new_object(ctx); + t->omap_setheader(soid, osd_op.indata); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + } + obs.oi.set_flag(object_info_t::FLAG_OMAP); + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPCLEAR: + tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + if (oi.is_omap()) { + t->omap_clear(soid); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + obs.oi.clear_omap_digest(); + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + } + break; + + case CEPH_OSD_OP_OMAPRMKEYS: + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + break; + } + bufferlist to_rm_bl; + try { + decode_str_set_to_bl(bp, &to_rm_bl); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + goto fail; + } + tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val); + t->omap_rmkeys(soid, to_rm_bl); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + } + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_OMAPRMKEYRANGE: + tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val); + if (!pool.info.supports_omap()) { + result = -EOPNOTSUPP; + break; + } + ++ctx->num_write; + result = 0; + { + if (!obs.exists || oi.is_whiteout()) { + result = -ENOENT; + break; + } + std::string key_begin, key_end; + try { + decode(key_begin, bp); + decode(key_end, bp); + } catch (ceph::buffer::error& e) { + result = -EINVAL; + goto fail; + } + t->omap_rmkeyrange(soid, key_begin, key_end); + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + } + obs.oi.clear_omap_digest(); + break; + + case CEPH_OSD_OP_COPY_GET: + ++ctx->num_read; + tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(), + soid.snap.val); + if (op_finisher == nullptr) { + result = do_copy_get(ctx, bp, osd_op, ctx->obc); + } else { + result = op_finisher->execute(); + } + break; + + case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: + ++ctx->num_write; + result = 0; + { + object_t src_name; + object_locator_t src_oloc; + uint32_t truncate_seq = 0; + uint64_t truncate_size = 0; + bool have_truncate = false; + snapid_t src_snapid = (uint64_t)op.copy_from.snapid; + version_t src_version = op.copy_from.src_version; + + if ((op.op == CEPH_OSD_OP_COPY_FROM2) && + (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) { + dout(20) << "invalid copy-from2 flags 0x" + << std::hex << (int)op.copy_from.flags << std::dec << dendl; + result = -EINVAL; + break; + } + try { + decode(src_name, bp); + decode(src_oloc, bp); + // check if client sent us truncate_seq and truncate_size + if ((op.op == CEPH_OSD_OP_COPY_FROM2) && + (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) { + decode(truncate_seq, bp); + decode(truncate_size, bp); + have_truncate = true; + } + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + tracepoint(osd, + do_osd_op_pre_copy_from, + soid.oid.name.c_str(), + soid.snap.val, + "???", + 0, + "???", + "???", + 0, + src_snapid, + src_version); + goto fail; + } + tracepoint(osd, + do_osd_op_pre_copy_from, + soid.oid.name.c_str(), + soid.snap.val, + src_name.name.c_str(), + src_oloc.pool, + src_oloc.key.c_str(), + src_oloc.nspace.c_str(), + src_oloc.hash, + src_snapid, + src_version); + if (op_finisher == nullptr) { + // start + pg_t raw_pg; + get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg); + hobject_t src(src_name, src_oloc.key, src_snapid, + raw_pg.ps(), raw_pg.pool(), + src_oloc.nspace); + if (src == soid) { + dout(20) << " copy from self is invalid" << dendl; + result = -EINVAL; + break; + } + CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op); + if (have_truncate) + cb->set_truncate(truncate_seq, truncate_size); + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new CopyFromFinisher(cb)); + start_copy(cb, ctx->obc, src, src_oloc, src_version, + op.copy_from.flags, + false, + op.copy_from.src_fadvise_flags, + op.flags); + result = -EINPROGRESS; + } else { + // finish + result = op_finisher->execute(); + ceph_assert(result == 0); + + // COPY_FROM cannot be executed multiple times -- it must restart + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + break; + + default: + tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op)); + dout(1) << "unrecognized osd op " << op.op + << " " << ceph_osd_op_name(op.op) + << dendl; + result = -EOPNOTSUPP; + } + + fail: + osd_op.rval = result; + tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result); + if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) && + result != -EAGAIN && result != -EINPROGRESS) + result = 0; + + if (result < 0) + break; + } + if (result < 0) { + dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl; + } + return result; +} + +int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals) +{ + if (ctx->new_obs.oi.size == 0) { + dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl; + return -ENODATA; + } + vector<OSDOp> nops(1); + OSDOp &newop = nops[0]; + newop.op.op = CEPH_OSD_OP_TMAPGET; + do_osd_ops(ctx, nops); + try { + bufferlist::const_iterator i = newop.outdata.begin(); + decode(*header, i); + (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining()); + } catch (...) { + dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid + << dendl; + return -EINVAL; + } + dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid + << dendl; + return 0; +} + +int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid, + const SnapSet& ss) +{ + // verify that all clones have been evicted + dout(20) << __func__ << " verifying clones are absent " + << ss << dendl; + for (vector<snapid_t>::const_iterator p = ss.clones.begin(); + p != ss.clones.end(); + ++p) { + hobject_t clone_oid = soid; + clone_oid.snap = *p; + if (is_missing_object(clone_oid)) + return -EBUSY; + ObjectContextRef clone_obc = get_object_context(clone_oid, false); + if (clone_obc && clone_obc->obs.exists) { + dout(10) << __func__ << " cannot evict head before clone " + << clone_oid << dendl; + return -EBUSY; + } + if (copy_ops.count(clone_oid)) { + dout(10) << __func__ << " cannot evict head, pending promote on clone " + << clone_oid << dendl; + return -EBUSY; + } + } + return 0; +} + +inline int PrimaryLogPG::_delete_oid( + OpContext *ctx, + bool no_whiteout, // no whiteouts, no matter what. + bool try_no_whiteout) // try not to whiteout +{ + SnapSet& snapset = ctx->new_snapset; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + PGTransaction* t = ctx->op_t.get(); + + // cache: cache: set whiteout on delete? + bool whiteout = false; + if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE + && !no_whiteout + && !try_no_whiteout) { + whiteout = true; + } + + // in luminous or later, we can't delete the head if there are + // clones. we trust the caller passing no_whiteout has already + // verified they don't exist. + if (!snapset.clones.empty() || + (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) { + if (no_whiteout) { + dout(20) << __func__ << " has or will have clones but no_whiteout=1" + << dendl; + } else { + dout(20) << __func__ << " has or will have clones; will whiteout" + << dendl; + whiteout = true; + } + } + dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout + << " no_whiteout=" << (int)no_whiteout + << " try_no_whiteout=" << (int)try_no_whiteout + << dendl; + if (!obs.exists || (obs.oi.is_whiteout() && whiteout)) + return -ENOENT; + + t->remove(soid); + + if (oi.size > 0) { + interval_set<uint64_t> ch; + ch.insert(0, oi.size); + ctx->modified_ranges.union_of(ch); + ctx->clean_regions.mark_data_region_dirty(0, oi.size); + } + + ctx->clean_regions.mark_omap_dirty(); + ctx->delta_stats.num_wr++; + if (soid.is_snap()) { + ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap)); + ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap); + } else { + ctx->delta_stats.num_bytes -= oi.size; + } + oi.size = 0; + oi.new_object(); + + // disconnect all watchers + for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p = + oi.watchers.begin(); + p != oi.watchers.end(); + ++p) { + dout(20) << __func__ << " will disconnect watcher " << p->first << dendl; + ctx->watch_disconnects.push_back( + watch_disconnect_t(p->first.first, p->first.second, true)); + } + oi.watchers.clear(); + + if (whiteout) { + dout(20) << __func__ << " setting whiteout on " << soid << dendl; + oi.set_flag(object_info_t::FLAG_WHITEOUT); + ctx->delta_stats.num_whiteouts++; + t->create(soid); + osd->logger->inc(l_osd_tier_whiteout); + return 0; + } + + if (oi.has_manifest()) { + ctx->delta_stats.num_objects_manifest--; + dec_all_refcount_manifest(oi, ctx); + } + + // delete the head + ctx->delta_stats.num_objects--; + if (soid.is_snap()) + ctx->delta_stats.num_object_clones--; + if (oi.is_whiteout()) { + dout(20) << __func__ << " deleting whiteout on " << soid << dendl; + ctx->delta_stats.num_whiteouts--; + oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + if (oi.is_cache_pinned()) { + ctx->delta_stats.num_objects_pinned--; + } + obs.exists = false; + return 0; +} + +int PrimaryLogPG::_rollback_to(OpContext *ctx, OSDOp& op) +{ + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + snapid_t snapid = (uint64_t)op.op.snap.snapid; + hobject_t missing_oid; + + dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl; + + ObjectContextRef rollback_to; + + int ret = find_object_context( + hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(), + soid.get_namespace()), + &rollback_to, false, false, &missing_oid); + if (ret == -EAGAIN) { + /* clone must be missing */ + ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid)); + dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone " + << missing_oid << " (requested snapid: ) " << snapid << dendl; + block_write_on_degraded_snap(missing_oid, ctx->op); + return ret; + } + { + ObjectContextRef promote_obc; + cache_result_t tier_mode_result; + if (obs.exists && obs.oi.has_manifest()) { + /* + * In the case of manifest object, the object_info exists on the base tier at all time, + * so promote_obc should be equal to rollback_to + * */ + promote_obc = rollback_to; + tier_mode_result = + maybe_handle_manifest_detail( + ctx->op, + true, + rollback_to); + } else { + tier_mode_result = + maybe_handle_cache_detail( + ctx->op, + true, + rollback_to, + ret, + missing_oid, + true, + false, + &promote_obc); + } + switch (tier_mode_result) { + case cache_result_t::NOOP: + break; + case cache_result_t::BLOCKED_PROMOTE: + ceph_assert(promote_obc); + block_write_on_snap_rollback(soid, promote_obc, ctx->op); + return -EAGAIN; + case cache_result_t::BLOCKED_FULL: + block_write_on_full_cache(soid, ctx->op); + return -EAGAIN; + case cache_result_t::REPLIED_WITH_EAGAIN: + ceph_abort_msg("this can't happen, no rollback on replica"); + default: + ceph_abort_msg("must promote was set, other values are not valid"); + return -EAGAIN; + } + } + + if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) { + // there's no snapshot here, or there's no object. + // if there's no snapshot, we delete the object; otherwise, do nothing. + dout(20) << "_rollback_to deleting head on " << soid.oid + << " because got ENOENT|whiteout on find_object_context" << dendl; + if (ctx->obc->obs.oi.watchers.size()) { + // Cannot delete an object with watchers + ret = -EBUSY; + } else { + _delete_oid(ctx, false, false); + ret = 0; + } + } else if (ret) { + // ummm....huh? It *can't* return anything else at time of writing. + ceph_abort_msg("unexpected error code in _rollback_to"); + } else { //we got our context, let's use it to do the rollback! + hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid; + if (is_degraded_or_backfilling_object(rollback_to_sobject) || + is_degraded_on_async_recovery_target(rollback_to_sobject)) { + dout(20) << "_rollback_to attempted to roll back to a degraded object " + << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl; + block_write_on_degraded_snap(rollback_to_sobject, ctx->op); + ret = -EAGAIN; + } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) { + // rolling back to the head; we just need to clone it. + ctx->modify = true; + } else { + if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) { + /* + * looking at the following case, the foo head needs the reference of chunk4 and chunk5 + * in case snap[1] is removed. + * + * Before rollback to snap[1]: + * + * foo snap[1]: [chunk4] [chunk5] + * foo snap[0]: [ chunk2 ] + * foo head : [chunk1] [chunk3] + * + * After: + * + * foo snap[1]: [chunk4] [chunk5] + * foo snap[0]: [ chunk2 ] + * foo head : [chunk4] [chunk5] + * + */ + OpFinisher* op_finisher = nullptr; + auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num); + if (op_finisher_it != ctx->op_finishers.end()) { + op_finisher = op_finisher_it->second.get(); + } + if (!op_finisher) { + bool need_inc_ref = inc_refcount_by_set(ctx, rollback_to->obs.oi.manifest, op); + if (need_inc_ref) { + ceph_assert(op_finisher_it == ctx->op_finishers.end()); + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new SetManifestFinisher(op)); + return -EINPROGRESS; + } + } else { + op_finisher->execute(); + ctx->op_finishers.erase(ctx->current_osd_subop_num); + } + } + _do_rollback_to(ctx, rollback_to, op); + } + } + return ret; +} + +void PrimaryLogPG::_do_rollback_to(OpContext *ctx, ObjectContextRef rollback_to, + OSDOp& op) +{ + SnapSet& snapset = ctx->new_snapset; + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + PGTransaction* t = ctx->op_t.get(); + snapid_t snapid = (uint64_t)op.op.snap.snapid; + hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid; + + /* 1) Delete current head + * 2) Clone correct snapshot into head + * 3) Calculate clone_overlaps by following overlaps + * forward from rollback snapshot */ + dout(10) << "_do_rollback_to deleting " << soid.oid + << " and rolling back to old snap" << dendl; + + if (obs.exists) { + t->remove(soid); + if (obs.oi.has_manifest()) { + dec_all_refcount_manifest(obs.oi, ctx); + oi.manifest.clear(); + oi.manifest.type = object_manifest_t::TYPE_NONE; + oi.clear_flag(object_info_t::FLAG_MANIFEST); + ctx->delta_stats.num_objects_manifest--; + ctx->cache_operation = true; // do not trigger to call ref function to calculate refcount + } + } + t->clone(soid, rollback_to_sobject); + t->add_obc(rollback_to); + + map<snapid_t, interval_set<uint64_t> >::iterator iter = + snapset.clone_overlap.lower_bound(snapid); + ceph_assert(iter != snapset.clone_overlap.end()); + interval_set<uint64_t> overlaps = iter->second; + for ( ; + iter != snapset.clone_overlap.end(); + ++iter) + overlaps.intersection_of(iter->second); + + if (obs.oi.size > 0) { + interval_set<uint64_t> modified; + modified.insert(0, obs.oi.size); + overlaps.intersection_of(modified); + modified.subtract(overlaps); + ctx->modified_ranges.union_of(modified); + } + + // Adjust the cached objectcontext + maybe_create_new_object(ctx, true); + ctx->delta_stats.num_bytes -= obs.oi.size; + ctx->delta_stats.num_bytes += rollback_to->obs.oi.size; + ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size)); + ctx->clean_regions.mark_omap_dirty(); + obs.oi.size = rollback_to->obs.oi.size; + if (rollback_to->obs.oi.is_data_digest()) + obs.oi.set_data_digest(rollback_to->obs.oi.data_digest); + else + obs.oi.clear_data_digest(); + if (rollback_to->obs.oi.is_omap_digest()) + obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest); + else + obs.oi.clear_omap_digest(); + + if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) { + obs.oi.set_flag(object_info_t::FLAG_MANIFEST); + obs.oi.manifest.type = rollback_to->obs.oi.manifest.type; + obs.oi.manifest.chunk_map = rollback_to->obs.oi.manifest.chunk_map; + ctx->cache_operation = true; + ctx->delta_stats.num_objects_manifest++; + } + + if (rollback_to->obs.oi.is_omap()) { + dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl; + obs.oi.set_flag(object_info_t::FLAG_OMAP); + } else { + dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } +} + +void PrimaryLogPG::_make_clone( + OpContext *ctx, + PGTransaction* t, + ObjectContextRef clone_obc, + const hobject_t& head, const hobject_t& coid, + object_info_t *poi) +{ + bufferlist bv; + encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + t->clone(coid, head); + setattr_maybe_cache(clone_obc, t, OI_ATTR, bv); + rmattr_maybe_cache(clone_obc, t, SS_ATTR); +} + +void PrimaryLogPG::make_writeable(OpContext *ctx) +{ + const hobject_t& soid = ctx->obs->oi.soid; + SnapContext& snapc = ctx->snapc; + + // clone? + ceph_assert(soid.snap == CEPH_NOSNAP); + dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset + << " snapc=" << snapc << dendl; + + bool was_dirty = ctx->obc->obs.oi.is_dirty(); + if (ctx->new_obs.exists) { + // we will mark the object dirty + if (ctx->undirty && was_dirty) { + dout(20) << " clearing DIRTY flag" << dendl; + ceph_assert(ctx->new_obs.oi.is_dirty()); + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + osd->logger->inc(l_osd_tier_clean); + } else if (!was_dirty && !ctx->undirty) { + dout(20) << " setting DIRTY flag" << dendl; + ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY); + ++ctx->delta_stats.num_objects_dirty; + osd->logger->inc(l_osd_tier_dirty); + } + } else { + if (was_dirty) { + dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + } + } + + if ((ctx->new_obs.exists && + ctx->new_obs.oi.is_omap()) && + (!ctx->obc->obs.exists || + !ctx->obc->obs.oi.is_omap())) { + ++ctx->delta_stats.num_objects_omap; + } + if ((!ctx->new_obs.exists || + !ctx->new_obs.oi.is_omap()) && + (ctx->obc->obs.exists && + ctx->obc->obs.oi.is_omap())) { + --ctx->delta_stats.num_objects_omap; + } + + if (ctx->new_snapset.seq > snapc.seq) { + dout(10) << " op snapset is old" << dendl; + } + + if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed) + snapc.snaps.size() && // there are snaps + !ctx->cache_operation && + snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old + // clone + hobject_t coid = soid; + coid.snap = snapc.seq; + + const auto snaps = [&] { + auto last = find_if_not( + begin(snapc.snaps), end(snapc.snaps), + [&](snapid_t snap_id) { return snap_id > ctx->new_snapset.seq; }); + return vector<snapid_t>{begin(snapc.snaps), last}; + }(); + + // prepare clone + object_info_t static_snap_oi(coid); + object_info_t *snap_oi; + if (is_primary()) { + ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid); + ctx->clone_obc->destructor_callback = + new C_PG_ObjectContext(this, ctx->clone_obc.get()); + ctx->clone_obc->obs.oi = static_snap_oi; + ctx->clone_obc->obs.exists = true; + ctx->clone_obc->ssc = ctx->obc->ssc; + ctx->clone_obc->ssc->ref++; + if (pool.info.is_erasure()) + ctx->clone_obc->attr_cache = ctx->obc->attr_cache; + snap_oi = &ctx->clone_obc->obs.oi; + if (ctx->obc->obs.oi.has_manifest()) { + if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && + ctx->obc->obs.oi.manifest.is_redirect()) { + snap_oi->set_flag(object_info_t::FLAG_MANIFEST); + snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT; + snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target; + } else if (ctx->obc->obs.oi.manifest.is_chunked()) { + snap_oi->set_flag(object_info_t::FLAG_MANIFEST); + snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED; + snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map; + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } + bool got = ctx->lock_manager.get_write_greedy( + coid, + ctx->clone_obc, + ctx->op); + ceph_assert(got); + dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl; + } else { + snap_oi = &static_snap_oi; + } + snap_oi->version = ctx->at_version; + snap_oi->prior_version = ctx->obs->oi.version; + snap_oi->copy_user_bits(ctx->obs->oi); + + _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi); + + ctx->delta_stats.num_objects++; + if (snap_oi->is_dirty()) { + ctx->delta_stats.num_objects_dirty++; + osd->logger->inc(l_osd_tier_dirty); + } + if (snap_oi->is_omap()) + ctx->delta_stats.num_objects_omap++; + if (snap_oi->is_cache_pinned()) + ctx->delta_stats.num_objects_pinned++; + if (snap_oi->has_manifest()) + ctx->delta_stats.num_objects_manifest++; + ctx->delta_stats.num_object_clones++; + ctx->new_snapset.clones.push_back(coid.snap); + ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size; + ctx->new_snapset.clone_snaps[coid.snap] = snaps; + + // clone_overlap should contain an entry for each clone + // (an empty interval_set if there is no overlap) + ctx->new_snapset.clone_overlap[coid.snap]; + if (ctx->obs->oi.size) { + ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size); + } + + // log clone + dout(10) << " cloning v " << ctx->obs->oi.version + << " to " << coid << " v " << ctx->at_version + << " snaps=" << snaps + << " snapset=" << ctx->new_snapset << dendl; + ctx->log.push_back(pg_log_entry_t( + pg_log_entry_t::CLONE, coid, ctx->at_version, + ctx->obs->oi.version, + ctx->obs->oi.user_version, + osd_reqid_t(), ctx->new_obs.oi.mtime, 0)); + encode(snaps, ctx->log.back().snaps); + + ctx->at_version.version++; + } + + // update most recent clone_overlap and usage stats + if (ctx->new_snapset.clones.size() > 0) { + // the clone_overlap is difference of range between head and clones. + // we need to check whether the most recent clone exists, if it's + // been evicted, it's not included in the stats, but the clone_overlap + // is still exist in the snapset, so we should update the + // clone_overlap to make it sense. + hobject_t last_clone_oid = soid; + last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first; + interval_set<uint64_t> &newest_overlap = + ctx->new_snapset.clone_overlap.rbegin()->second; + ctx->modified_ranges.intersection_of(newest_overlap); + if (is_present_clone(last_clone_oid)) { + // modified_ranges is still in use by the clone + ctx->delta_stats.num_bytes += ctx->modified_ranges.size(); + } + newest_overlap.subtract(ctx->modified_ranges); + } + + if (snapc.seq > ctx->new_snapset.seq) { + // update snapset with latest snap context + ctx->new_snapset.seq = snapc.seq; + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + ctx->new_snapset.snaps = snapc.snaps; + } else { + ctx->new_snapset.snaps.clear(); + } + } + dout(20) << "make_writeable " << soid + << " done, snapset=" << ctx->new_snapset << dendl; +} + + +void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi, + interval_set<uint64_t>& modified, uint64_t offset, + uint64_t length, bool write_full) +{ + interval_set<uint64_t> ch; + if (write_full) { + if (oi.size) + ch.insert(0, oi.size); + } else if (length) + ch.insert(offset, length); + modified.union_of(ch); + if (write_full || + (offset + length > oi.size && length)) { + uint64_t new_size = offset + length; + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += new_size; + oi.size = new_size; + } + + delta_stats.num_wr++; + delta_stats.num_wr_kb += shift_round_up(length, 10); +} + +void PrimaryLogPG::truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size) +{ + if (oi.size != truncate_size) { + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += truncate_size; + oi.size = truncate_size; + } +} + +void PrimaryLogPG::complete_disconnect_watches( + ObjectContextRef obc, + const list<watch_disconnect_t> &to_disconnect) +{ + for (list<watch_disconnect_t>::const_iterator i = + to_disconnect.begin(); + i != to_disconnect.end(); + ++i) { + pair<uint64_t, entity_name_t> watcher(i->cookie, i->name); + auto watchers_entry = obc->watchers.find(watcher); + if (watchers_entry != obc->watchers.end()) { + WatchRef watch = watchers_entry->second; + dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl; + obc->watchers.erase(watcher); + watch->remove(i->send_disconnect); + } else { + dout(10) << "do_osd_op_effects disconnect failed to find watcher " + << watcher << dendl; + } + } +} + +void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn) +{ + entity_name_t entity = ctx->reqid.name; + dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl; + + // disconnects first + complete_disconnect_watches(ctx->obc, ctx->watch_disconnects); + + ceph_assert(conn); + + auto session = conn->get_priv(); + if (!session) + return; + + for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin(); + i != ctx->watch_connects.end(); + ++i) { + pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity); + dout(15) << "do_osd_op_effects applying watch connect on session " + << session.get() << " watcher " << watcher << dendl; + WatchRef watch; + if (ctx->obc->watchers.count(watcher)) { + dout(15) << "do_osd_op_effects found existing watch watcher " << watcher + << dendl; + watch = ctx->obc->watchers[watcher]; + } else { + dout(15) << "do_osd_op_effects new watcher " << watcher + << dendl; + watch = Watch::makeWatchRef( + this, osd, ctx->obc, i->first.timeout_seconds, + i->first.cookie, entity, conn->get_peer_addr()); + ctx->obc->watchers.insert( + make_pair( + watcher, + watch)); + } + watch->connect(conn, i->second); + } + + for (list<notify_info_t>::iterator p = ctx->notifies.begin(); + p != ctx->notifies.end(); + ++p) { + dout(10) << "do_osd_op_effects, notify " << *p << dendl; + ConnectionRef conn(ctx->op->get_req()->get_connection()); + NotifyRef notif( + Notify::makeNotifyRef( + conn, + ctx->reqid.name.num(), + p->bl, + p->timeout, + p->cookie, + p->notify_id, + ctx->obc->obs.oi.user_version, + osd)); + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i = + ctx->obc->watchers.begin(); + i != ctx->obc->watchers.end(); + ++i) { + dout(10) << "starting notify on watch " << i->first << dendl; + i->second->start_notify(notif); + } + notif->init(); + } + + for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin(); + p != ctx->notify_acks.end(); + ++p) { + if (p->watch_cookie) + dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl; + else + dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl; + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i = + ctx->obc->watchers.begin(); + i != ctx->obc->watchers.end(); + ++i) { + if (i->first.second != entity) continue; + if (p->watch_cookie && + *(p->watch_cookie) != i->first.first) continue; + dout(10) << "acking notify on watch " << i->first << dendl; + i->second->notify_ack(p->notify_id, p->reply_bl); + } + } +} + +hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target) +{ + ostringstream ss; + ss << "temp_" << info.pgid << "_" << get_role() + << "_" << osd->monc->get_global_id() << "_" << (++temp_seq); + hobject_t hoid = target.make_temp_hobject(ss.str()); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +hobject_t PrimaryLogPG::get_temp_recovery_object( + const hobject_t& target, + eversion_t version) +{ + ostringstream ss; + ss << "temp_recovering_" << info.pgid // (note this includes the shardid) + << "_" << version + << "_" << info.history.same_interval_since + << "_" << target.snap; + // pgid + version + interval + snapid is unique, and short + hobject_t hoid = target.make_temp_hobject(ss.str()); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +int PrimaryLogPG::prepare_transaction(OpContext *ctx) +{ + ceph_assert(!ctx->ops->empty()); + + // valid snap context? + if (!ctx->snapc.is_valid()) { + dout(10) << " invalid snapc " << ctx->snapc << dendl; + return -EINVAL; + } + + // prepare the actual mutation + int result = do_osd_ops(ctx, *ctx->ops); + if (result < 0) { + if (ctx->op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + // need to save the error code in the pg log, to detect dup ops, + // but do nothing else + ctx->update_log_only = true; + } + return result; + } + + // read-op? write-op noop? done? + if (ctx->op_t->empty() && !ctx->modify) { + if (ctx->pending_async_reads.empty()) + unstable_stats.add(ctx->delta_stats); + if (ctx->op->may_write() && + get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + ctx->update_log_only = true; + } + return result; + } + + // check for full + if ((ctx->delta_stats.num_bytes > 0 || + ctx->delta_stats.num_objects > 0) && // FIXME: keys? + pool.info.has_flag(pg_pool_t::FLAG_FULL)) { + auto m = ctx->op->get_req<MOSDOp>(); + if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now + m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS" + << dendl; + } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. + dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl; + return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC; + } else { + // drop request + dout(20) << __func__ << " full, dropping request (bad client)" << dendl; + return -EAGAIN; + } + } + + const hobject_t& soid = ctx->obs->oi.soid; + // clone, if necessary + if (soid.snap == CEPH_NOSNAP) + make_writeable(ctx); + + finish_ctx(ctx, + ctx->new_obs.exists ? pg_log_entry_t::MODIFY : + pg_log_entry_t::DELETE, + result); + + return result; +} + +void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result) +{ + const hobject_t& soid = ctx->obs->oi.soid; + dout(20) << __func__ << " " << soid << " " << ctx + << " op " << pg_log_entry_t::get_op_name(log_op_type) + << dendl; + utime_t now = ceph_clock_now(); + + + // Drop the reference if deduped chunk is modified + if (ctx->new_obs.oi.is_dirty() && + (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) && + !ctx->cache_operation && + log_op_type != pg_log_entry_t::PROMOTE) { + update_chunk_map_by_dirty(ctx); + // If a clone is creating, ignore dropping the reference for manifest object + if (!ctx->delta_stats.num_object_clones) { + dec_refcount_by_dirty(ctx); + } + } + + // finish and log the op. + if (ctx->user_modify) { + // update the user_version for any modify ops, except for the watch op + ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1; + /* In order for new clients and old clients to interoperate properly + * when exchanging versions, we need to lower bound the user_version + * (which our new clients pay proper attention to) + * by the at_version (which is all the old clients can ever see). */ + if (ctx->at_version.version > ctx->user_at_version) + ctx->user_at_version = ctx->at_version.version; + ctx->new_obs.oi.user_version = ctx->user_at_version; + } + ctx->bytes_written = ctx->op_t->get_bytes_written(); + + if (ctx->new_obs.exists) { + ctx->new_obs.oi.version = ctx->at_version; + ctx->new_obs.oi.prior_version = ctx->obs->oi.version; + ctx->new_obs.oi.last_reqid = ctx->reqid; + if (ctx->mtime != utime_t()) { + ctx->new_obs.oi.mtime = ctx->mtime; + dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl; + ctx->new_obs.oi.local_mtime = now; + } else { + dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl; + } + + // object_info_t + map <string, bufferlist, less<>> attrs; + bufferlist bv(sizeof(ctx->new_obs.oi)); + encode(ctx->new_obs.oi, bv, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR] = std::move(bv); + + // snapset + if (soid.snap == CEPH_NOSNAP) { + dout(10) << " final snapset " << ctx->new_snapset + << " in " << soid << dendl; + bufferlist bss; + encode(ctx->new_snapset, bss); + attrs[SS_ATTR] = std::move(bss); + } else { + dout(10) << " no snapset (this is a clone)" << dendl; + } + ctx->op_t->setattrs(soid, attrs); + } else { + // reset cached oi + ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid); + } + + // append to log + ctx->log.push_back( + pg_log_entry_t(log_op_type, soid, ctx->at_version, + ctx->obs->oi.version, + ctx->user_at_version, ctx->reqid, + ctx->mtime, + (ctx->op && ctx->op->allows_returnvec()) ? result : 0)); + if (ctx->op && ctx->op->allows_returnvec()) { + // also the per-op values + ctx->log.back().set_op_returns(*ctx->ops); + dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns + << dendl; + } + + ctx->log.back().clean_regions = ctx->clean_regions; + dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl; + + if (soid.snap < CEPH_NOSNAP) { + switch (log_op_type) { + case pg_log_entry_t::MODIFY: + case pg_log_entry_t::PROMOTE: + case pg_log_entry_t::CLEAN: + dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset + << dendl; + encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps); + break; + default: + break; + } + } + + if (!ctx->extra_reqids.empty()) { + dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " " + << ctx->extra_reqid_return_codes << dendl; + ctx->log.back().extra_reqids.swap(ctx->extra_reqids); + ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes); + } + + // apply new object state. + ctx->obc->obs = ctx->new_obs; + + if (soid.is_head() && !ctx->obc->obs.exists) { + ctx->obc->ssc->exists = false; + ctx->obc->ssc->snapset = SnapSet(); + } else { + ctx->obc->ssc->exists = true; + ctx->obc->ssc->snapset = ctx->new_snapset; + } +} + +void PrimaryLogPG::apply_stats( + const hobject_t &soid, + const object_stat_sum_t &delta_stats) { + + recovery_state.apply_op_stats(soid, delta_stats); + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + const pg_info_t& pinfo = recovery_state.get_peer_info(bt); + if (soid > pinfo.last_backfill && soid <= last_backfill_started) { + pending_backfill_updates[soid].stats.add(delta_stats); + } + } + + m_scrubber->stats_of_handled_objects(delta_stats, soid); +} + +void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx) +{ + auto m = ctx->op->get_req<MOSDOp>(); + ceph_assert(ctx->async_reads_complete()); + + for (auto p = ctx->ops->begin(); + p != ctx->ops->end() && result >= 0; ++p) { + if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = p->rval; + break; + } + ctx->bytes_read += p->outdata.length(); + } + ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0); + + MOSDOpReply *reply = ctx->reply; + ctx->reply = nullptr; + + if (result >= 0) { + if (!ctx->ignore_log_op_stats) { + log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read); + + publish_stats_to_osd(); + } + + // on read, return the current object version + if (ctx->obs) { + reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version); + } else { + reply->set_reply_versions(eversion_t(), ctx->user_at_version); + } + } else if (result == -ENOENT) { + // on ENOENT, set a floor for what the next user version will be. + reply->set_enoent_reply_versions(info.last_update, info.last_user_version); + } + + reply->set_result(result); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + osd->send_message_osd_client(reply, m->get_connection()); + close_op_ctx(ctx); +} + +// ======================================================================== +// copyfrom + +struct C_Copyfrom : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive + C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::CopyOpRef& c) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), cop(c) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock l{*pg}; + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->process_copy_chunk(oid, tid, r); + cop.reset(); + } + } +}; + +struct C_CopyFrom_AsyncReadCb : public Context { + OSDOp *osd_op; + object_copy_data_t reply_obj; + uint64_t features; + size_t len; + C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) : + osd_op(osd_op), features(features), len(0) {} + void finish(int r) override { + osd_op->rval = r; + if (r < 0) { + return; + } + + ceph_assert(len > 0); + ceph_assert(len <= reply_obj.data.length()); + bufferlist bl; + bl.substr_of(reply_obj.data, 0, len); + reply_obj.data.swap(bl); + encode(reply_obj, osd_op->outdata, features); + } +}; + +struct C_CopyChunk : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive + uint64_t offset = 0; + C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr, + const PrimaryLogPG::CopyOpRef& c) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), cop(c) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock l{*pg}; + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->process_copy_chunk_manifest(oid, tid, r, offset); + cop.reset(); + } + } +}; + +int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp, + OSDOp& osd_op, ObjectContextRef &obc) +{ + object_info_t& oi = obc->obs.oi; + hobject_t& soid = oi.soid; + int result = 0; + object_copy_cursor_t cursor; + uint64_t out_max; + try { + decode(cursor, bp); + decode(out_max, bp); + } + catch (ceph::buffer::error& e) { + result = -EINVAL; + return result; + } + + const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req()); + uint64_t features = op->get_features(); + + bool async_read_started = false; + object_copy_data_t _reply_obj; + C_CopyFrom_AsyncReadCb *cb = nullptr; + if (pool.info.is_erasure()) { + cb = new C_CopyFrom_AsyncReadCb(&osd_op, features); + } + object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj; + // size, mtime + reply_obj.size = oi.size; + reply_obj.mtime = oi.mtime; + ceph_assert(obc->ssc); + if (soid.snap < CEPH_NOSNAP) { + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn? + reply_obj.snaps = p->second; + } else { + reply_obj.snap_seq = obc->ssc->snapset.seq; + } + if (oi.is_data_digest()) { + reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST; + reply_obj.data_digest = oi.data_digest; + } + if (oi.is_omap_digest()) { + reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST; + reply_obj.omap_digest = oi.omap_digest; + } + reply_obj.truncate_seq = oi.truncate_seq; + reply_obj.truncate_size = oi.truncate_size; + + // attrs + map<string,bufferlist,less<>>& out_attrs = reply_obj.attrs; + if (!cursor.attr_complete) { + result = getattrs_maybe_cache( + ctx->obc, + &out_attrs); + if (result < 0) { + if (cb) { + delete cb; + } + return result; + } + cursor.attr_complete = true; + dout(20) << " got attrs" << dendl; + } + + int64_t left = out_max - osd_op.outdata.length(); + + // data + bufferlist& bl = reply_obj.data; + if (left > 0 && !cursor.data_complete) { + if (cursor.data_offset < oi.size) { + uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left); + if (cb) { + async_read_started = true; + ctx->pending_async_reads.push_back( + make_pair( + boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags), + make_pair(&bl, cb))); + cb->len = max_read; + + ctx->op_finishers[ctx->current_osd_subop_num].reset( + new ReadFinisher(osd_op)); + result = -EINPROGRESS; + + dout(10) << __func__ << ": async_read noted for " << soid << dendl; + } else { + result = pgbackend->objects_read_sync( + oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl); + if (result < 0) + return result; + } + left -= max_read; + cursor.data_offset += max_read; + } + if (cursor.data_offset == oi.size) { + cursor.data_complete = true; + dout(20) << " got data" << dendl; + } + ceph_assert(cursor.data_offset <= oi.size); + } + + // omap + uint32_t omap_keys = 0; + if (!pool.info.supports_omap() || !oi.is_omap()) { + cursor.omap_complete = true; + } else { + if (left > 0 && !cursor.omap_complete) { + ceph_assert(cursor.data_complete); + if (cursor.omap_offset.empty()) { + osd->store->omap_get_header(ch, ghobject_t(oi.soid), + &reply_obj.omap_header); + } + bufferlist omap_data; + ObjectMap::ObjectMapIterator iter = + osd->store->get_omap_iterator(ch, ghobject_t(oi.soid)); + ceph_assert(iter); + iter->upper_bound(cursor.omap_offset); + for (; iter->valid(); iter->next()) { + ++omap_keys; + encode(iter->key(), omap_data); + encode(iter->value(), omap_data); + left -= iter->key().length() + 4 + iter->value().length() + 4; + if (left <= 0) + break; + } + if (omap_keys) { + encode(omap_keys, reply_obj.omap_data); + reply_obj.omap_data.claim_append(omap_data); + } + if (iter->valid()) { + cursor.omap_offset = iter->key(); + } else { + cursor.omap_complete = true; + dout(20) << " got omap" << dendl; + } + } + } + + if (cursor.is_complete()) { + // include reqids only in the final step. this is a bit fragile + // but it works... + recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, + &reply_obj.reqids, + &reply_obj.reqid_return_codes); + dout(20) << " got reqids" << dendl; + } + + dout(20) << " cursor.is_complete=" << cursor.is_complete() + << " " << out_attrs.size() << " attrs" + << " " << bl.length() << " bytes" + << " " << reply_obj.omap_header.length() << " omap header bytes" + << " " << reply_obj.omap_data.length() << " omap data bytes in " + << omap_keys << " keys" + << " " << reply_obj.reqids.size() << " reqids" + << dendl; + reply_obj.cursor = cursor; + if (!async_read_started) { + encode(reply_obj, osd_op.outdata, features); + } + if (cb && !async_read_started) { + delete cb; + } + + if (result > 0) { + result = 0; + } + return result; +} + +void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid, + OSDOp& osd_op) +{ + const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req()); + uint64_t features = m->get_features(); + object_copy_data_t reply_obj; + + recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids, + &reply_obj.reqid_return_codes); + dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl; + encode(reply_obj, osd_op.outdata, features); + osd_op.rval = -ENOENT; + MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false); + reply->set_result(-ENOENT); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + osd->send_message_osd_client(reply, m->get_connection()); +} + +void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc, + hobject_t src, object_locator_t oloc, + version_t version, unsigned flags, + bool mirror_snapset, + unsigned src_obj_fadvise_flags, + unsigned dest_obj_fadvise_flags) +{ + const hobject_t& dest = obc->obs.oi.soid; + dout(10) << __func__ << " " << dest + << " from " << src << " " << oloc << " v" << version + << " flags " << flags + << (mirror_snapset ? " mirror_snapset" : "") + << dendl; + + ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP); + + // cancel a previous in-progress copy? + if (copy_ops.count(dest)) { + // FIXME: if the src etc match, we could avoid restarting from the + // beginning. + CopyOpRef cop = copy_ops[dest]; + vector<ceph_tid_t> tids; + cancel_copy(cop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + } + + CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags, + mirror_snapset, src_obj_fadvise_flags, + dest_obj_fadvise_flags)); + copy_ops[dest] = cop; + dout(20) << fmt::format("{}: blocking {}", __func__, dest) << dendl; + obc->start_block(); + + if (!obc->obs.oi.has_manifest()) { + _copy_some(obc, cop); + } else { + if (obc->obs.oi.manifest.is_redirect()) { + _copy_some(obc, cop); + } else if (obc->obs.oi.manifest.is_chunked()) { + auto p = obc->obs.oi.manifest.chunk_map.begin(); + _copy_some_manifest(obc, cop, p->first); + } else { + ceph_abort_msg("unrecognized manifest type"); + } + } +} + +void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop) +{ + dout(10) << __func__ << " " << *obc << " " << cop << dendl; + + unsigned flags = 0; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH) + flags |= CEPH_OSD_FLAG_FLUSH; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE) + flags |= CEPH_OSD_FLAG_IGNORE_CACHE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY) + flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE) + flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED) + flags |= CEPH_OSD_FLAG_RWORDERED; + + C_GatherBuilder gather(cct); + + if (cop->cursor.is_initial() && cop->mirror_snapset) { + // list snaps too. + ceph_assert(cop->src.snap == CEPH_NOSNAP); + ObjectOperation op; + op.list_snaps(&cop->results.snapset, NULL); + ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, + CEPH_SNAPDIR, NULL, + flags, gather.new_sub(), NULL); + cop->objecter_tid2 = tid; + } + + ObjectOperation op; + if (cop->results.user_version) { + op.assert_version(cop->results.user_version); + } else { + // we should learn the version after the first chunk, if we didn't know + // it already! + ceph_assert(cop->cursor.is_initial()); + } + op.copy_get(&cop->cursor, get_copy_chunk_size(), + &cop->results.object_size, &cop->results.mtime, + &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data, + &cop->results.snaps, &cop->results.snap_seq, + &cop->results.flags, + &cop->results.source_data_digest, + &cop->results.source_omap_digest, + &cop->results.reqids, + &cop->results.reqid_return_codes, + &cop->results.truncate_seq, + &cop->results.truncate_size, + &cop->rval); + op.set_last_op_flags(cop->src_obj_fadvise_flags); + + C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid, + get_last_peering_reset(), cop); + gather.set_finisher(new C_OnFinisher(fin, + osd->get_objecter_finisher(get_pg_shard()))); + + ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, + cop->src.snap, NULL, + flags, + gather.new_sub(), + // discover the object version if we don't know it yet + cop->results.user_version ? NULL : &cop->results.user_version); + fin->tid = tid; + cop->objecter_tid = tid; + gather.activate(); +} + +void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset) +{ + dout(10) << __func__ << " " << *obc << " " << cop << dendl; + + unsigned flags = 0; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH) + flags |= CEPH_OSD_FLAG_FLUSH; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE) + flags |= CEPH_OSD_FLAG_IGNORE_CACHE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY) + flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE) + flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE; + if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED) + flags |= CEPH_OSD_FLAG_RWORDERED; + + int num_chunks = 0; + uint64_t last_offset = 0, chunks_size = 0; + object_manifest_t *manifest = &obc->obs.oi.manifest; + map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset); + for (;iter != manifest->chunk_map.end(); ++iter) { + num_chunks++; + chunks_size += iter->second.length; + last_offset = iter->first; + if (get_copy_chunk_size() < chunks_size) { + break; + } + } + + cop->num_chunk = num_chunks; + cop->start_offset = start_offset; + cop->last_offset = last_offset; + dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks + << " start_offset: " << start_offset << " chunks_size: " << chunks_size + << " last_offset: " << last_offset << dendl; + + iter = manifest->chunk_map.find(start_offset); + for (;iter != manifest->chunk_map.end(); ++iter) { + uint64_t obj_offset = iter->first; + uint64_t length = manifest->chunk_map[iter->first].length; + hobject_t soid = manifest->chunk_map[iter->first].oid; + object_locator_t oloc(soid); + CopyCallback * cb = NULL; + CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc, + cop->results.user_version, cop->flags, cop->mirror_snapset, + cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags)); + sub_cop->cursor.data_offset = obj_offset; + cop->chunk_cops[obj_offset] = sub_cop; + + int s = sub_cop->chunk_ops.size(); + sub_cop->chunk_ops.resize(s+1); + sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ; + sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset; + sub_cop->chunk_ops[s].op.extent.length = length; + + ObjectOperation op; + op.dup(sub_cop->chunk_ops); + + if (cop->results.user_version) { + op.assert_version(cop->results.user_version); + } else { + // we should learn the version after the first chunk, if we didn't know + // it already! + ceph_assert(cop->cursor.is_initial()); + } + op.set_last_op_flags(cop->src_obj_fadvise_flags); + + C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid, + get_last_peering_reset(), cop); + fin->offset = obj_offset; + + ceph_tid_t tid = osd->objecter->read( + soid.oid, oloc, op, + sub_cop->src.snap, NULL, + flags, + new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())), + // discover the object version if we don't know it yet + sub_cop->results.user_version ? NULL : &sub_cop->results.user_version); + fin->tid = tid; + sub_cop->objecter_tid = tid; + + dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: " + << manifest->chunk_map[iter->first].offset + << " length: " << length << " pool id: " << oloc.pool + << " tid: " << tid << dendl; + + if (last_offset <= iter->first) { + break; + } + } +} + +void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid); + if (p == copy_ops.end()) { + dout(10) << __func__ << " no copy_op found" << dendl; + return; + } + CopyOpRef cop = p->second; + if (tid != cop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != cop " << cop + << " tid " << cop->objecter_tid << dendl; + return; + } + + if (cop->omap_data.length() || cop->omap_header.length()) + cop->results.has_omap = true; + + if (r >= 0 && !pool.info.supports_omap() && + (cop->omap_data.length() || cop->omap_header.length())) { + r = -EOPNOTSUPP; + } + cop->objecter_tid = 0; + cop->objecter_tid2 = 0; // assume this ordered before us (if it happened) + ObjectContextRef& cobc = cop->obc; + + if (r < 0) + goto out; + + ceph_assert(cop->rval >= 0); + + if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) { + // verify snap hasn't been deleted + vector<snapid_t>::iterator p = cop->results.snaps.begin(); + while (p != cop->results.snaps.end()) { + // make best effort to sanitize snaps/clones. + if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) { + dout(10) << __func__ << " clone snap " << *p << " has been deleted" + << dendl; + for (vector<snapid_t>::iterator q = p + 1; + q != cop->results.snaps.end(); + ++q) + *(q - 1) = *q; + cop->results.snaps.resize(cop->results.snaps.size() - 1); + } else { + ++p; + } + } + if (cop->results.snaps.empty()) { + dout(10) << __func__ << " no more snaps for " << oid << dendl; + r = -ENOENT; + goto out; + } + } + + ceph_assert(cop->rval >= 0); + + if (!cop->temp_cursor.data_complete) { + cop->results.data_digest = cop->data.crc32c(cop->results.data_digest); + } + if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) { + if (cop->omap_header.length()) { + cop->results.omap_digest = + cop->omap_header.crc32c(cop->results.omap_digest); + } + if (cop->omap_data.length()) { + bufferlist keys; + keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4); + cop->results.omap_digest = keys.crc32c(cop->results.omap_digest); + } + } + + if (!cop->temp_cursor.attr_complete) { + for (map<string,bufferlist>::iterator p = cop->attrs.begin(); + p != cop->attrs.end(); + ++p) { + cop->results.attrs[string("_") + p->first] = p->second; + } + cop->attrs.clear(); + } + + if (!cop->cursor.is_complete()) { + // write out what we have so far + if (cop->temp_cursor.is_initial()) { + ceph_assert(!cop->results.started_temp_obj); + cop->results.started_temp_obj = true; + cop->results.temp_oid = generate_temp_object(oid); + dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl; + } + ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true); + OpContextUPtr ctx = simple_opc_create(tempobc); + if (cop->temp_cursor.is_initial()) { + ctx->new_temp_oid = cop->results.temp_oid; + } + _write_copy_chunk(cop, ctx->op_t.get()); + simple_opc_submit(std::move(ctx)); + dout(10) << __func__ << " fetching more" << dendl; + _copy_some(cobc, cop); + return; + } + + // verify digests? + if (cop->results.is_data_digest() || cop->results.is_omap_digest()) { + dout(20) << __func__ << std::hex + << " got digest: rx data 0x" << cop->results.data_digest + << " omap 0x" << cop->results.omap_digest + << ", source: data 0x" << cop->results.source_data_digest + << " omap 0x" << cop->results.source_omap_digest + << std::dec + << " flags " << cop->results.flags + << dendl; + } + if (cop->results.is_data_digest() && + cop->results.data_digest != cop->results.source_data_digest) { + derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest + << " != source 0x" << cop->results.source_data_digest << std::dec + << dendl; + osd->clog->error() << info.pgid << " copy from " << cop->src + << " to " << cop->obc->obs.oi.soid << std::hex + << " data digest 0x" << cop->results.data_digest + << " != source 0x" << cop->results.source_data_digest + << std::dec; + r = -EIO; + goto out; + } + if (cop->results.is_omap_digest() && + cop->results.omap_digest != cop->results.source_omap_digest) { + derr << __func__ << std::hex + << " omap digest 0x" << cop->results.omap_digest + << " != source 0x" << cop->results.source_omap_digest + << std::dec << dendl; + osd->clog->error() << info.pgid << " copy from " << cop->src + << " to " << cop->obc->obs.oi.soid << std::hex + << " omap digest 0x" << cop->results.omap_digest + << " != source 0x" << cop->results.source_omap_digest + << std::dec; + r = -EIO; + goto out; + } + if (cct->_conf->osd_debug_inject_copyfrom_error) { + derr << __func__ << " injecting copyfrom failure" << dendl; + r = -EIO; + goto out; + } + + cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>( + [this, &cop /* avoid ref cycle */](PGTransaction *t) { + ObjectState& obs = cop->obc->obs; + if (cop->temp_cursor.is_initial()) { + dout(20) << "fill_in_final_tx: writing " + << "directly to final object" << dendl; + // write directly to final object + cop->results.temp_oid = obs.oi.soid; + _write_copy_chunk(cop, t); + } else { + // finish writing to temp object, then move into place + dout(20) << "fill_in_final_tx: writing to temp object" << dendl; + if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) { + /* In redirect manifest case, the object exists in the upper tier. + * So, to avoid a conflict when rename() is called, remove existing + * object first + */ + t->remove(obs.oi.soid); + } + _write_copy_chunk(cop, t); + t->rename(obs.oi.soid, cop->results.temp_oid); + } + t->setattrs(obs.oi.soid, cop->results.attrs); + }); + + dout(20) << __func__ << " success; committing" << dendl; + + out: + dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl; + CopyCallbackResults results(r, &cop->results); + cop->cb->complete(results); + + copy_ops.erase(cobc->obs.oi.soid); + cobc->stop_block(); + + if (r < 0 && cop->results.started_temp_obj) { + dout(10) << __func__ << " deleting partial temp object " + << cop->results.temp_oid << dendl; + ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true); + OpContextUPtr ctx = simple_opc_create(tempobc); + ctx->op_t->remove(cop->results.temp_oid); + ctx->discard_temp_oid = cop->results.temp_oid; + simple_opc_submit(std::move(ctx)); + } + + // cancel and requeue proxy ops on this object + if (!r) { + cancel_and_requeue_proxy_ops(cobc->obs.oi.soid); + } + + kick_object_context_blocked(cobc); +} + +void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid); + if (p == copy_ops.end()) { + dout(10) << __func__ << " no copy_op found" << dendl; + return; + } + CopyOpRef obj_cop = p->second; + CopyOpRef chunk_cop = obj_cop->chunk_cops[offset]; + + if (tid != chunk_cop->objecter_tid) { + dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop + << " tid " << chunk_cop->objecter_tid << dendl; + return; + } + + if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) { + r = -EOPNOTSUPP; + } + + chunk_cop->objecter_tid = 0; + chunk_cop->objecter_tid2 = 0; // assume this ordered before us (if it happened) + ObjectContextRef& cobc = obj_cop->obc; + OSDOp &chunk_data = chunk_cop->chunk_ops[0]; + + if (r < 0) { + obj_cop->failed = true; + goto out; + } + + if (obj_cop->failed) { + return; + } + if (!chunk_data.outdata.length()) { + r = -EIO; + obj_cop->failed = true; + goto out; + } + + obj_cop->num_chunk--; + + /* check all of the copyop are completed */ + if (obj_cop->num_chunk) { + dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl; + return; + } + + { + OpContextUPtr ctx = simple_opc_create(obj_cop->obc); + if (!ctx->lock_manager.take_write_lock( + obj_cop->obc->obs.oi.soid, + obj_cop->obc)) { + // recovery op can take read lock. + // so need to wait for recovery completion + r = -EAGAIN; + obj_cop->failed = true; + close_op_ctx(ctx.release()); + goto out; + } + dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl; + + PGTransaction *t = ctx->op_t.get(); + ObjectState& obs = ctx->new_obs; + for (auto p : obj_cop->chunk_cops) { + OSDOp &sub_chunk = p.second->chunk_ops[0]; + t->write(cobc->obs.oi.soid, + p.second->cursor.data_offset, + sub_chunk.outdata.length(), + sub_chunk.outdata, + p.second->dest_obj_fadvise_flags); + dout(20) << __func__ << " offset: " << p.second->cursor.data_offset + << " length: " << sub_chunk.outdata.length() << dendl; + write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges, + p.second->cursor.data_offset, sub_chunk.outdata.length()); + obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING); + ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length()); + sub_chunk.outdata.clear(); + } + obs.oi.clear_data_digest(); + ctx->at_version = get_next_version(); + finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE); + simple_opc_submit(std::move(ctx)); + obj_cop->chunk_cops.clear(); + + auto p = cobc->obs.oi.manifest.chunk_map.rbegin(); + /* check remaining work */ + if (p != cobc->obs.oi.manifest.chunk_map.rend()) { + if (obj_cop->last_offset < p->first) { + for (auto &en : cobc->obs.oi.manifest.chunk_map) { + if (obj_cop->last_offset < en.first) { + _copy_some_manifest(cobc, obj_cop, en.first); + return; + } + } + } + } + } + + out: + dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl; + CopyCallbackResults results(r, &obj_cop->results); + obj_cop->cb->complete(results); + + copy_ops.erase(cobc->obs.oi.soid); + cobc->stop_block(); + + // cancel and requeue proxy ops on this object + if (!r) { + cancel_and_requeue_proxy_ops(cobc->obs.oi.soid); + } + + kick_object_context_blocked(cobc); +} + +void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) { + vector<ceph_tid_t> tids; + for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin(); + it != proxyread_ops.end();) { + if (it->second->soid == oid) { + cancel_proxy_read((it++)->second, &tids); + } else { + ++it; + } + } + for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin(); + it != proxywrite_ops.end();) { + if (it->second->soid == oid) { + cancel_proxy_write((it++)->second, &tids); + } else { + ++it; + } + } + osd->objecter->op_cancel(tids, -ECANCELED); + kick_proxy_ops_blocked(oid); +} + +void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t) +{ + dout(20) << __func__ << " " << cop + << " " << cop->attrs.size() << " attrs" + << " " << cop->data.length() << " bytes" + << " " << cop->omap_header.length() << " omap header bytes" + << " " << cop->omap_data.length() << " omap data bytes" + << dendl; + if (!cop->temp_cursor.attr_complete) { + t->create(cop->results.temp_oid); + } + if (!cop->temp_cursor.data_complete) { + ceph_assert(cop->data.length() + cop->temp_cursor.data_offset == + cop->cursor.data_offset); + if (pool.info.required_alignment() && + !cop->cursor.data_complete) { + /** + * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset + * to pick it up on the next pass. + */ + ceph_assert(cop->temp_cursor.data_offset % + pool.info.required_alignment() == 0); + if (cop->data.length() % pool.info.required_alignment() != 0) { + uint64_t to_trim = + cop->data.length() % pool.info.required_alignment(); + bufferlist bl; + bl.substr_of(cop->data, 0, cop->data.length() - to_trim); + cop->data.swap(bl); + cop->cursor.data_offset -= to_trim; + ceph_assert(cop->data.length() + cop->temp_cursor.data_offset == + cop->cursor.data_offset); + } + } + if (cop->data.length()) { + t->write( + cop->results.temp_oid, + cop->temp_cursor.data_offset, + cop->data.length(), + cop->data, + cop->dest_obj_fadvise_flags); + } + cop->data.clear(); + } + if (pool.info.supports_omap()) { + if (!cop->temp_cursor.omap_complete) { + if (cop->omap_header.length()) { + t->omap_setheader( + cop->results.temp_oid, + cop->omap_header); + cop->omap_header.clear(); + } + if (cop->omap_data.length()) { + map<string,bufferlist> omap; + bufferlist::const_iterator p = cop->omap_data.begin(); + decode(omap, p); + t->omap_setkeys(cop->results.temp_oid, omap); + cop->omap_data.clear(); + } + } + } else { + ceph_assert(cop->omap_header.length() == 0); + ceph_assert(cop->omap_data.length() == 0); + } + cop->temp_cursor = cop->cursor; +} + +void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb) +{ + OpContext *ctx = cb->ctx; + dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl; + + ObjectState& obs = ctx->new_obs; + if (obs.exists) { + dout(20) << __func__ << ": exists, removing" << dendl; + ctx->op_t->remove(obs.oi.soid); + } else { + ctx->delta_stats.num_objects++; + obs.exists = true; + } + if (cb->is_temp_obj_used()) { + ctx->discard_temp_oid = cb->results->temp_oid; + } + cb->results->fill_in_final_tx(ctx->op_t.get()); + + // CopyFromCallback fills this in for us + obs.oi.user_version = ctx->user_at_version; + + if (cb->results->is_data_digest()) { + obs.oi.set_data_digest(cb->results->data_digest); + } else { + obs.oi.clear_data_digest(); + } + if (cb->results->is_omap_digest()) { + obs.oi.set_omap_digest(cb->results->omap_digest); + } else { + obs.oi.clear_omap_digest(); + } + + obs.oi.truncate_seq = cb->truncate_seq; + obs.oi.truncate_size = cb->truncate_size; + + obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime); + ctx->mtime = utime_t(); + + ctx->extra_reqids = cb->results->reqids; + ctx->extra_reqid_return_codes = cb->results->reqid_return_codes; + + // cache: clear whiteout? + if (obs.oi.is_whiteout()) { + dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + --ctx->delta_stats.num_whiteouts; + } + + if (cb->results->has_omap) { + dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl; + obs.oi.set_flag(object_info_t::FLAG_OMAP); + ctx->clean_regions.mark_omap_dirty(); + } else { + dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl; + obs.oi.clear_flag(object_info_t::FLAG_OMAP); + } + + interval_set<uint64_t> ch; + if (obs.oi.size > 0) + ch.insert(0, obs.oi.size); + ctx->modified_ranges.union_of(ch); + ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size())); + + if (cb->get_data_size() != obs.oi.size) { + ctx->delta_stats.num_bytes -= obs.oi.size; + obs.oi.size = cb->get_data_size(); + ctx->delta_stats.num_bytes += obs.oi.size; + } + ctx->delta_stats.num_wr++; + ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10); + + osd->logger->inc(l_osd_copyfrom); +} + +void PrimaryLogPG::finish_promote(int r, CopyResults *results, + ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + dout(10) << __func__ << " " << soid << " r=" << r + << " uv" << results->user_version << dendl; + + if (r == -ECANCELED) { + return; + } + + if (r != -ENOENT && soid.is_snap()) { + if (results->snaps.empty()) { + // we must have read "snap" content from the head object in the + // base pool. use snap_seq to construct what snaps should be + // for this clone (what is was before we evicted the clean clone + // from this pool, and what it will be when we flush and the + // clone eventually happens in the base pool). we want to use + // snaps in (results->snap_seq,soid.snap] + SnapSet& snapset = obc->ssc->snapset; + for (auto p = snapset.clone_snaps.rbegin(); + p != snapset.clone_snaps.rend(); + ++p) { + for (auto snap : p->second) { + if (snap > soid.snap) { + continue; + } + if (snap <= results->snap_seq) { + break; + } + results->snaps.push_back(snap); + } + } + } + + dout(20) << __func__ << " snaps " << results->snaps << dendl; + filter_snapc(results->snaps); + + dout(20) << __func__ << " filtered snaps " << results->snaps << dendl; + if (results->snaps.empty()) { + dout(20) << __func__ + << " snaps are empty, clone is invalid," + << " setting r to ENOENT" << dendl; + r = -ENOENT; + } + } + + if (r < 0 && results->started_temp_obj) { + dout(10) << __func__ << " abort; will clean up partial work" << dendl; + ObjectContextRef tempobc = get_object_context(results->temp_oid, false); + ceph_assert(tempobc); + OpContextUPtr ctx = simple_opc_create(tempobc); + ctx->op_t->remove(results->temp_oid); + simple_opc_submit(std::move(ctx)); + results->started_temp_obj = false; + } + + if (r == -ENOENT && soid.is_snap()) { + dout(10) << __func__ + << ": enoent while trying to promote clone, " << soid + << " must have been trimmed, removing from snapset" + << dendl; + hobject_t head(soid.get_head()); + ObjectContextRef obc = get_object_context(head, false); + ceph_assert(obc); + + OpContextUPtr tctx = simple_opc_create(obc); + tctx->at_version = get_next_version(); + if (get_osdmap()->require_osd_release < ceph_release_t::octopus) { + filter_snapc(tctx->new_snapset.snaps); + } else { + tctx->new_snapset.snaps.clear(); + } + vector<snapid_t> new_clones; + map<snapid_t, vector<snapid_t>> new_clone_snaps; + for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin(); + i != tctx->new_snapset.clones.end(); + ++i) { + if (*i != soid.snap) { + new_clones.push_back(*i); + auto p = tctx->new_snapset.clone_snaps.find(*i); + if (p != tctx->new_snapset.clone_snaps.end()) { + new_clone_snaps[*i] = p->second; + } + } + } + tctx->new_snapset.clones.swap(new_clones); + tctx->new_snapset.clone_overlap.erase(soid.snap); + tctx->new_snapset.clone_size.erase(soid.snap); + tctx->new_snapset.clone_snaps.swap(new_clone_snaps); + + // take RWWRITE lock for duration of our local write. ignore starvation. + if (!tctx->lock_manager.take_write_lock( + head, + obc)) { + ceph_abort_msg("problem!"); + } + dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl; + + finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE); + + simple_opc_submit(std::move(tctx)); + return; + } + + bool whiteout = false; + if (r == -ENOENT) { + ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above + dout(10) << __func__ << " whiteout " << soid << dendl; + whiteout = true; + } + + if (r < 0 && !whiteout) { + derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl; + // pass error to everyone blocked on this object + // FIXME: this is pretty sloppy, but at this point we got + // something unexpected and don't have many other options. + map<hobject_t,list<OpRequestRef>>::iterator blocked_iter = + waiting_for_blocked_object.find(soid); + if (blocked_iter != waiting_for_blocked_object.end()) { + while (!blocked_iter->second.empty()) { + osd->reply_op_error(blocked_iter->second.front(), r); + blocked_iter->second.pop_front(); + } + waiting_for_blocked_object.erase(blocked_iter); + } + return; + } + + osd->promote_finish(results->object_size); + + OpContextUPtr tctx = simple_opc_create(obc); + tctx->at_version = get_next_version(); + + if (!obc->obs.oi.has_manifest()) { + ++tctx->delta_stats.num_objects; + } + if (soid.snap < CEPH_NOSNAP) + ++tctx->delta_stats.num_object_clones; + tctx->new_obs.exists = true; + + tctx->extra_reqids = results->reqids; + tctx->extra_reqid_return_codes = results->reqid_return_codes; + + if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) { + tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE; + tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE); + tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST); + tctx->new_obs.oi.manifest.redirect_target = hobject_t(); + tctx->delta_stats.num_objects_manifest--; + if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) { + dec_all_refcount_manifest(obc->obs.oi, tctx.get()); + } + } + + if (whiteout) { + // create a whiteout + tctx->op_t->create(soid); + tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT); + ++tctx->delta_stats.num_whiteouts; + dout(20) << __func__ << " creating whiteout on " << soid << dendl; + osd->logger->inc(l_osd_tier_whiteout); + } else { + if (results->has_omap) { + dout(10) << __func__ << " setting omap flag on " << soid << dendl; + tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP); + ++tctx->delta_stats.num_objects_omap; + } + + results->fill_in_final_tx(tctx->op_t.get()); + if (results->started_temp_obj) { + tctx->discard_temp_oid = results->temp_oid; + } + tctx->new_obs.oi.size = results->object_size; + tctx->new_obs.oi.user_version = results->user_version; + tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime); + tctx->mtime = utime_t(); + if (results->is_data_digest()) { + tctx->new_obs.oi.set_data_digest(results->data_digest); + } else { + tctx->new_obs.oi.clear_data_digest(); + } + if (results->object_size) + tctx->clean_regions.mark_data_region_dirty(0, results->object_size); + if (results->is_omap_digest()) { + tctx->new_obs.oi.set_omap_digest(results->omap_digest); + } else { + tctx->new_obs.oi.clear_omap_digest(); + } + if (results->has_omap) + tctx->clean_regions.mark_omap_dirty(); + tctx->new_obs.oi.truncate_seq = results->truncate_seq; + tctx->new_obs.oi.truncate_size = results->truncate_size; + + if (soid.snap != CEPH_NOSNAP) { + ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap)); + ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap)); + ceph_assert(obc->ssc->snapset.clone_size[soid.snap] == + results->object_size); + ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap)); + + tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap); + } else { + tctx->delta_stats.num_bytes += results->object_size; + } + } + + if (results->mirror_snapset) { + ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP); + tctx->new_snapset.from_snap_set( + results->snapset, + get_osdmap()->require_osd_release < ceph_release_t::luminous); + } + dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl; + + // take RWWRITE lock for duration of our local write. ignore starvation. + if (!tctx->lock_manager.take_write_lock( + obc->obs.oi.soid, + obc)) { + ceph_abort_msg("problem!"); + } + dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl; + + finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE); + + simple_opc_submit(std::move(tctx)); + + osd->logger->inc(l_osd_tier_promote); + + if (agent_state && + agent_state->is_idle()) + agent_choose_mode(); +} + +void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results, + ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + dout(10) << __func__ << " " << soid << " r=" << r + << " uv" << results->user_version << dendl; + + if (r == -ECANCELED || r == -EAGAIN) { + return; + } + + if (r < 0) { + derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl; + // pass error to everyone blocked on this object + // FIXME: this is pretty sloppy, but at this point we got + // something unexpected and don't have many other options. + map<hobject_t,list<OpRequestRef>>::iterator blocked_iter = + waiting_for_blocked_object.find(soid); + if (blocked_iter != waiting_for_blocked_object.end()) { + while (!blocked_iter->second.empty()) { + osd->reply_op_error(blocked_iter->second.front(), r); + blocked_iter->second.pop_front(); + } + waiting_for_blocked_object.erase(blocked_iter); + } + return; + } + + osd->promote_finish(results->object_size); + osd->logger->inc(l_osd_tier_promote); + + if (agent_state && + agent_state->is_idle()) + agent_choose_mode(); +} + +void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue, + vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << " " << cop->obc->obs.oi.soid + << " from " << cop->src << " " << cop->oloc + << " v" << cop->results.user_version << dendl; + + // cancel objecter op, if we can + if (cop->objecter_tid) { + tids->push_back(cop->objecter_tid); + cop->objecter_tid = 0; + if (cop->objecter_tid2) { + tids->push_back(cop->objecter_tid2); + cop->objecter_tid2 = 0; + } + } + + copy_ops.erase(cop->obc->obs.oi.soid); + cop->obc->stop_block(); + + kick_object_context_blocked(cop->obc); + cop->results.should_requeue = requeue; + CopyCallbackResults result(-ECANCELED, &cop->results); + cop->cb->complete(result); + + // There may still be an objecter callback referencing this copy op. + // That callback will not need the obc since it's been canceled, and + // we need the obc reference to go away prior to flush. + cop->obc = ObjectContextRef(); +} + +void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin(); + while (p != copy_ops.end()) { + // requeue this op? can I queue up all of them? + cancel_copy((p++)->second, requeue, tids); + } +} + +struct C_gather : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + OSDOp *osd_op; + C_gather(PrimaryLogPG *pg_, hobject_t oid_, epoch_t lpr_, OSDOp *osd_op_) : + pg(pg_), oid(oid_), last_peering_reset(lpr_), osd_op(osd_op_) {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock locker{*pg}; + auto p = pg->cls_gather_ops.find(oid); + if (p == pg->cls_gather_ops.end()) { + // op was cancelled + return; + } + if (last_peering_reset != pg->get_last_peering_reset()) { + return; + } + osd_op->rval = r; + PrimaryLogPG::OpContext *ctx = p->second.ctx; + pg->cls_gather_ops.erase(p); + pg->execute_ctx(ctx); + } +}; + +int PrimaryLogPG::start_cls_gather(OpContext *ctx, std::map<std::string, bufferlist> *src_obj_buffs, const std::string& pool, + const char *cls, const char *method, bufferlist& inbl) +{ + OpRequestRef op = ctx->op; + MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req()); + + auto pool_id = osd->objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name), pool); + object_locator_t oloc(pool_id); + + ObjectState& obs = ctx->new_obs; + object_info_t& oi = obs.oi; + const hobject_t& soid = oi.soid; + + ObjectContextRef obc = get_object_context(soid, false); + C_GatherBuilder gather(cct); + + auto [iter, inserted] = cls_gather_ops.emplace(soid, CLSGatherOp(ctx, obc, op)); + ceph_assert(inserted); + auto &cgop = iter->second; + for (std::map<std::string, bufferlist>::iterator it = src_obj_buffs->begin(); it != src_obj_buffs->end(); it++) { + std::string oid = it->first; + ObjectOperation obj_op; + obj_op.call(cls, method, inbl); + uint32_t flags = 0; + ceph_tid_t tid = osd->objecter->read( + object_t(oid), oloc, obj_op, + m->get_snapid(), &it->second, + flags, gather.new_sub()); + cgop.objecter_tids.push_back(tid); + dout(10) << __func__ << " src=" << oid << ", tgt=" << soid << dendl; + } + + C_gather *fin = new C_gather(this, soid, get_last_peering_reset(), &(*ctx->ops)[ctx->current_osd_subop_num]); + gather.set_finisher(new C_OnFinisher(fin, + osd->get_objecter_finisher(get_pg_shard()))); + gather.activate(); + + return -EINPROGRESS; +} + +// ======================================================================== +// flush +// +// Flush a dirty object in the cache tier by writing it back to the +// base tier. The sequence looks like: +// +// * send a copy-from operation to the base tier to copy the current +// version of the object +// * base tier will pull the object via (perhaps multiple) copy-get(s) +// * on completion, we check if the object has been modified. if so, +// just reply with -EAGAIN. +// * try to take a write lock so we can clear the dirty flag. if this +// fails, wait and retry +// * start a repop that clears the bit. +// +// If we have to wait, we will retry by coming back through the +// start_flush method. We check if a flush is already in progress +// and, if so, try to finish it by rechecking the version and trying +// to clear the dirty bit. +// +// In order for the cache-flush (a write op) to not block the copy-get +// from reading the object, the client *must* set the SKIPRWLOCKS +// flag. +// +// NOTE: normally writes are strictly ordered for the client, but +// flushes are special in that they can be reordered with respect to +// other writes. In particular, we can't have a flush request block +// an update to the cache pool object! + +struct C_Flush : public Context { + PrimaryLogPGRef pg; + hobject_t oid; + epoch_t last_peering_reset; + ceph_tid_t tid; + utime_t start; + C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr) + : pg(p), oid(o), last_peering_reset(lpr), + tid(0), start(ceph_clock_now()) + {} + void finish(int r) override { + if (r == -ECANCELED) + return; + std::scoped_lock locker{*pg}; + if (last_peering_reset == pg->get_last_peering_reset()) { + pg->finish_flush(oid, tid, r); + pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start); + } + } +}; + +int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc) +{ + const object_info_t& oi = obc->obs.oi; + const hobject_t& soid = oi.soid; + + ceph_assert(obc->is_blocked()); + if (oi.size == 0) { + // evicted + return 0; + } + if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) { + dout(0) << " fingerprint algorithm is not set " << dendl; + return -EINVAL; + } + if (pool.info.get_dedup_tier() <= 0) { + dout(10) << " dedup tier is not set " << dendl; + return -EINVAL; + } + + /* + * The operations to make dedup chunks are tracked by a ManifestOp. + * This op will be finished if all the operations are completed. + */ + ManifestOpRef mop(std::make_shared<ManifestOp>(obc, nullptr)); + + // cdc + std::map<uint64_t, bufferlist> chunks; + int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks); + if (r < 0) { + return r; + } + if (!chunks.size()) { + return 0; + } + + // chunks issued here are different with chunk_map newly generated + // because the same chunks in previous snap will not be issued + // So, we need two data structures; the first is the issued chunk list to track + // issued operations, and the second is the new chunk_map to update chunk_map after + // all operations are finished + object_ref_delta_t refs; + ObjectContextRef obc_l, obc_g; + get_adjacent_clones(obc, obc_l, obc_g); + // skip if the same content exits in prev snap at same offset + mop->new_manifest.calc_refs_to_inc_on_set( + obc_l ? &(obc_l->obs.oi.manifest) : nullptr, + obc_g ? &(obc_g->obs.oi.manifest) : nullptr, + refs); + + for (auto p : chunks) { + hobject_t target = mop->new_manifest.chunk_map[p.first].oid; + if (refs.find(target) == refs.end()) { + continue; + } + C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first); + ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF, + fin, std::move(chunks[p.first])); + mop->chunks[target] = make_pair(p.first, p.second.length()); + mop->num_chunks++; + mop->tids[p.first] = tid; + fin->tid = tid; + dout(10) << __func__ << " oid: " << soid << " tid: " << tid + << " target: " << target << " offset: " << p.first + << " length: " << p.second.length() << dendl; + } + + if (mop->tids.size()) { + manifest_ops[soid] = mop; + manifest_ops[soid]->op = op; + } else { + // size == 0 + return 0; + } + + return -EINPROGRESS; +} + +int PrimaryLogPG::do_cdc(const object_info_t& oi, + std::map<uint64_t, chunk_info_t>& chunk_map, + std::map<uint64_t, bufferlist>& chunks) +{ + string chunk_algo = pool.info.get_dedup_chunk_algorithm_name(); + int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size(); + uint64_t total_length = 0; + + std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1); + if (!cdc) { + dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl; + return -EINVAL; + } + + bufferlist bl; + /** + * We disable EC pool as a base tier of distributed dedup. + * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync(). + * Therefore, we should change the current implementation totally to make EC pool compatible. + * As s result, we leave this as a future work. + */ + int r = pgbackend->objects_read_sync( + oi.soid, 0, oi.size, 0, &bl); + if (r < 0) { + dout(0) << __func__ << " read fail " << oi.soid + << " len: " << oi.size << " r: " << r << dendl; + return r; + } + if (bl.length() != oi.size) { + dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: " + << oi.size << " during chunking " << dendl; + return -EIO; + } + + dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length() + << " oi.size: " << oi.size + << " chunk_size: " << chunk_size << dendl; + + vector<pair<uint64_t, uint64_t>> cdc_chunks; + cdc->calc_chunks(bl, &cdc_chunks); + + // get fingerprint + for (auto p : cdc_chunks) { + bufferlist chunk; + chunk.substr_of(bl, p.first, p.second); + auto [ret, target] = get_fpoid_from_chunk(oi.soid, chunk); + if (ret < 0) { + return ret; + } + chunks[p.first] = std::move(chunk); + chunk_map[p.first] = chunk_info_t(0, p.second, target); + total_length += p.second; + } + return total_length; +} + +std::pair<int, hobject_t> PrimaryLogPG::get_fpoid_from_chunk( + const hobject_t soid, bufferlist& chunk) +{ + pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type(); + if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) { + return make_pair(-EINVAL, hobject_t()); + } + object_t fp_oid = [&fp_algo, &chunk]() -> string { + switch (fp_algo) { + case pg_pool_t::TYPE_FINGERPRINT_SHA1: + return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str(); + case pg_pool_t::TYPE_FINGERPRINT_SHA256: + return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str(); + case pg_pool_t::TYPE_FINGERPRINT_SHA512: + return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str(); + default: + assert(0 == "unrecognized fingerprint type"); + return {}; + } + }(); + + pg_t raw_pg; + object_locator_t oloc(soid); + oloc.pool = pool.info.get_dedup_tier(); + // check if dedup_tier isn't set + ceph_assert(oloc.pool > 0); + int ret = get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg); + if (ret < 0) { + return make_pair(ret, hobject_t()); + } + hobject_t target(fp_oid, oloc.key, snapid_t(), + raw_pg.ps(), raw_pg.pool(), + oloc.nspace); + return make_pair(0, target); +} + +int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid); + if (p == manifest_ops.end()) { + dout(10) << __func__ << " no manifest_op found" << dendl; + return -EINVAL; + } + ManifestOpRef mop = p->second; + mop->results[offset] = r; + if (r < 0) { + // if any failure occurs, put a mark on the results to recognize the failure + mop->results[0] = r; + } + if (mop->num_chunks != mop->results.size()) { + // there are on-going works + return -EINPROGRESS; + } + ObjectContextRef obc = mop->obc; + ceph_assert(obc); + ceph_assert(obc->is_blocked()); + obc->stop_block(); + kick_object_context_blocked(obc); + if (mop->results[0] < 0) { + // check if the previous op returns fail + ceph_assert(mop->num_chunks == mop->results.size()); + manifest_ops.erase(oid); + osd->reply_op_error(mop->op, mop->results[0]); + return -EIO; + } + + if (mop->chunks.size()) { + OpContextUPtr ctx = simple_opc_create(obc); + ceph_assert(ctx); + if (ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + oid, + obc, + mop->op)) { + dout(20) << __func__ << " took write lock" << dendl; + } else if (mop->op) { + dout(10) << __func__ << " waiting on write lock " << mop->op << dendl; + close_op_ctx(ctx.release()); + return -EAGAIN; + } + + ctx->at_version = get_next_version(); + ctx->new_obs = obc->obs; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + if (!ctx->obs->oi.has_manifest()) { + ctx->delta_stats.num_objects_manifest++; + ctx->new_obs.oi.set_flag(object_info_t::FLAG_MANIFEST); + ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_CHUNKED; + } + + /* + * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head. + * head: [0, 2) aaa <-- tier_flush() + * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc + * + * In this case, if the new chunk_map is as follows, + * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc + * we should drop aaa from head by using calc_refs_to_drop_on_removal(). + * So, the precedure is + * 1. calc_refs_to_drop_on_removal() + * 2. register old references to drop after tier_flush() is committed + * 3. update new chunk_map + */ + + ObjectCleanRegions c_regions = ctx->clean_regions; + ObjectContextRef cobc = get_prev_clone_obc(obc); + c_regions.mark_fully_dirty(); + // CDC was done on entire range of manifest object, + // so the first thing we should do here is to drop the reference to old chunks + ObjectContextRef obc_l, obc_g; + get_adjacent_clones(obc, obc_l, obc_g); + // clear all old references + object_ref_delta_t refs; + ctx->obs->oi.manifest.calc_refs_to_drop_on_removal( + obc_l ? &(obc_l->obs.oi.manifest) : nullptr, + obc_g ? &(obc_g->obs.oi.manifest) : nullptr, + refs); + if (!refs.is_empty()) { + ctx->register_on_commit( + [oid, this, refs](){ + dec_refcount(oid, refs); + }); + } + + // set new references + ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map; + + finish_ctx(ctx.get(), pg_log_entry_t::CLEAN); + simple_opc_submit(std::move(ctx)); + } + if (mop->op) + osd->reply_op_error(mop->op, r); + + manifest_ops.erase(oid); + return 0; +} + +int PrimaryLogPG::finish_set_manifest_refcount(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid); + if (p == manifest_ops.end()) { + dout(10) << __func__ << " no manifest_op found" << dendl; + return -EINVAL; + } + ManifestOpRef mop = p->second; + mop->results[offset] = r; + if (r < 0) { + // if any failure occurs, put a mark on the results to recognize the failure + mop->results[0] = r; + } + if (mop->num_chunks != mop->results.size()) { + // there are on-going works + return -EINPROGRESS; + } + + if (mop->cb) { + mop->cb->complete(r); + } + + manifest_ops.erase(p); + mop.reset(); + + return 0; +} + +int PrimaryLogPG::start_flush( + OpRequestRef op, ObjectContextRef obc, + bool blocking, hobject_t *pmissing, + std::optional<std::function<void()>> &&on_flush, + bool force_dedup) +{ + const object_info_t& oi = obc->obs.oi; + const hobject_t& soid = oi.soid; + dout(10) << __func__ << " " << soid + << " v" << oi.version + << " uv" << oi.user_version + << " " << (blocking ? "blocking" : "non-blocking/best-effort") + << dendl; + + bool preoctopus_compat = + get_osdmap()->require_osd_release < ceph_release_t::octopus; + SnapSet snapset; + if (preoctopus_compat) { + // for pre-octopus compatibility, filter SnapSet::snaps. not + // certain we need this, but let's be conservative. + snapset = obc->ssc->snapset.get_filtered(pool.info); + } else { + // NOTE: change this to a const ref when we remove this compat code + snapset = obc->ssc->snapset; + } + + if ((obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) + || force_dedup) { + // current dedup tier only supports blocking operation + if (!blocking) { + return -EOPNOTSUPP; + } + } + + // verify there are no (older) check for dirty clones + { + dout(20) << " snapset " << snapset << dendl; + vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin(); + while (p != snapset.clones.rend() && *p >= soid.snap) + ++p; + if (p != snapset.clones.rend()) { + hobject_t next = soid; + next.snap = *p; + ceph_assert(next.snap < soid.snap); + if (recovery_state.get_pg_log().get_missing().is_missing(next)) { + dout(10) << __func__ << " missing clone is " << next << dendl; + if (pmissing) + *pmissing = next; + return -ENOENT; + } + ObjectContextRef older_obc = get_object_context(next, false); + if (older_obc) { + dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi + << dendl; + if (older_obc->obs.oi.is_dirty()) { + dout(10) << __func__ << " next oldest clone is dirty: " + << older_obc->obs.oi << dendl; + return -EBUSY; + } + } else { + dout(20) << __func__ << " next oldest clone " << next + << " is not present; implicitly clean" << dendl; + } + } else { + dout(20) << __func__ << " no older clones" << dendl; + } + } + + if (blocking) { + dout(20) << fmt::format("{}: blocking {}", __func__, soid) << dendl; + obc->start_block(); + } + + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid); + if (p != flush_ops.end()) { + FlushOpRef fop = p->second; + if (fop->op == op) { + // we couldn't take the write lock on a cache-try-flush before; + // now we are trying again for the lock. + return try_flush_mark_clean(fop); + } + if (fop->flushed_version == obc->obs.oi.user_version && + (fop->blocking || !blocking)) { + // nonblocking can join anything + // blocking can only join a blocking flush + dout(20) << __func__ << " piggybacking on existing flush " << dendl; + if (op) + fop->dup_ops.push_back(op); + return -EAGAIN; // clean up this ctx; op will retry later + } + + // cancel current flush since it will fail anyway, or because we + // are blocking and the existing flush is nonblocking. + dout(20) << __func__ << " canceling previous flush; it will fail" << dendl; + if (fop->op) + osd->reply_op_error(fop->op, -EBUSY); + while (!fop->dup_ops.empty()) { + osd->reply_op_error(fop->dup_ops.front(), -EBUSY); + fop->dup_ops.pop_front(); + } + vector<ceph_tid_t> tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + } + + if ((obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) + || force_dedup) { + int r = start_dedup(op, obc); + if (r != -EINPROGRESS) { + if (blocking) + obc->stop_block(); + } + return r; + } + + /** + * In general, we need to send a delete and a copyfrom. + * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)] + * where 4 is marked as clean. To flush 10, we have to: + * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4 + * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8 + * + * There is a complicating case. Supposed there had been a clone 7 + * for snaps [7, 6] which has been trimmed since they no longer exist. + * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit + * the delete, the snap will be promoted to 5, and the head will become + * a whiteout. When the copy-from goes through, we'll end up with + * 8:[8,4,3,2]:[4(4,3,2)]+head. + * + * Another complication is the case where there is an interval change + * after doing the delete and the flush but before marking the object + * clean. We'll happily delete head and then recreate it at the same + * sequence number, which works out ok. + */ + + SnapContext snapc, dsnapc; + if (snapset.seq != 0) { + if (soid.snap == CEPH_NOSNAP) { + snapc = snapset.get_ssc_as_of(snapset.seq); + } else { + snapid_t min_included_snap; + auto p = snapset.clone_snaps.find(soid.snap); + ceph_assert(p != snapset.clone_snaps.end()); + min_included_snap = p->second.back(); + snapc = snapset.get_ssc_as_of(min_included_snap - 1); + } + + snapid_t prev_snapc = 0; + for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin(); + citer != snapset.clones.rend(); + ++citer) { + if (*citer < soid.snap) { + prev_snapc = *citer; + break; + } + } + + dsnapc = snapset.get_ssc_as_of(prev_snapc); + } + + object_locator_t base_oloc(soid); + base_oloc.pool = pool.info.tier_of; + + if (dsnapc.seq < snapc.seq) { + ObjectOperation o; + o.remove(); + osd->objecter->mutate( + soid.oid, + base_oloc, + o, + dsnapc, + ceph::real_clock::from_ceph_timespec(oi.mtime), + (CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_ENFORCE_SNAPC), + NULL /* no callback, we'll rely on the ordering w.r.t the next op */); + } + + FlushOpRef fop(std::make_shared<FlushOp>()); + fop->obc = obc; + fop->flushed_version = oi.user_version; + fop->blocking = blocking; + fop->on_flush = std::move(on_flush); + fop->op = op; + + ObjectOperation o; + if (oi.is_whiteout()) { + fop->removal = true; + o.remove(); + } else { + object_locator_t oloc(soid); + o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version, + CEPH_OSD_COPY_FROM_FLAG_FLUSH | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE, + LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + + //mean the base tier don't cache data after this + if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) + o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED); + } + C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset()); + + ceph_tid_t tid = osd->objecter->mutate( + soid.oid, base_oloc, o, snapc, + ceph::real_clock::from_ceph_timespec(oi.mtime), + CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC, + new C_OnFinisher(fin, + osd->get_objecter_finisher(get_pg_shard()))); + /* we're under the pg lock and fin->finish() is grabbing that */ + fin->tid = tid; + fop->objecter_tid = tid; + + flush_ops[soid] = fop; + + recovery_state.update_stats( + [&oi](auto &history, auto &stats) { + stats.stats.sum.num_flush++; + stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10); + return false; + }); + return -EINPROGRESS; +} + +void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r) +{ + dout(10) << __func__ << " " << oid << " tid " << tid + << " " << cpp_strerror(r) << dendl; + map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid); + if (p == flush_ops.end()) { + dout(10) << __func__ << " no flush_op found" << dendl; + return; + } + FlushOpRef fop = p->second; + if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) { + dout(10) << __func__ << " tid " << tid << " != fop " << fop + << " tid " << fop->objecter_tid << dendl; + return; + } + ObjectContextRef obc = fop->obc; + fop->objecter_tid = 0; + + if (r < 0 && !(r == -ENOENT && fop->removal)) { + if (fop->op) + osd->reply_op_error(fop->op, -EBUSY); + if (fop->blocking) { + obc->stop_block(); + kick_object_context_blocked(obc); + } + + if (!fop->dup_ops.empty()) { + dout(20) << __func__ << " requeueing dups" << dendl; + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + flush_ops.erase(oid); + return; + } + + r = try_flush_mark_clean(fop); + if (r == -EBUSY && fop->op) { + osd->reply_op_error(fop->op, r); + } +} + +int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop) +{ + ObjectContextRef obc = fop->obc; + const hobject_t& oid = obc->obs.oi.soid; + + if (fop->blocking) { + obc->stop_block(); + kick_object_context_blocked(obc); + } + + if (fop->flushed_version != obc->obs.oi.user_version || + !obc->obs.exists) { + if (obc->obs.exists) + dout(10) << __func__ << " flushed_version " << fop->flushed_version + << " != current " << obc->obs.oi.user_version + << dendl; + else + dout(10) << __func__ << " object no longer exists" << dendl; + + if (!fop->dup_ops.empty()) { + dout(20) << __func__ << " requeueing dups" << dendl; + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + flush_ops.erase(oid); + if (fop->blocking) + osd->logger->inc(l_osd_tier_flush_fail); + else + osd->logger->inc(l_osd_tier_try_flush_fail); + return -EBUSY; + } + + if (!fop->blocking && + m_scrubber->write_blocked_by_scrub(oid)) { + if (fop->op) { + dout(10) << __func__ << " blocked by scrub" << dendl; + requeue_op(fop->op); + requeue_ops(fop->dup_ops); + return -EAGAIN; // will retry + } else { + osd->logger->inc(l_osd_tier_try_flush_fail); + vector<ceph_tid_t> tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + return -ECANCELED; + } + } + + // successfully flushed, can we evict this object? + if (!obc->obs.oi.has_manifest() && !fop->op && + agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && + agent_maybe_evict(obc, true)) { + osd->logger->inc(l_osd_tier_clean); + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + flush_ops.erase(oid); + return 0; + } + + dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl; + OpContextUPtr ctx = simple_opc_create(fop->obc); + + // successfully flushed; can we clear the dirty bit? + // try to take the lock manually, since we don't + // have a ctx yet. + if (ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + oid, + obc, + fop->op)) { + dout(20) << __func__ << " took write lock" << dendl; + } else if (fop->op) { + dout(10) << __func__ << " waiting on write lock " << fop->op << " " + << fop->dup_ops << dendl; + // fop->op is now waiting on the lock; get fop->dup_ops to wait too. + for (auto op : fop->dup_ops) { + bool locked = ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + oid, + obc, + op); + ceph_assert(!locked); + } + close_op_ctx(ctx.release()); + return -EAGAIN; // will retry + } else { + dout(10) << __func__ << " failed write lock, no op; failing" << dendl; + close_op_ctx(ctx.release()); + osd->logger->inc(l_osd_tier_try_flush_fail); + vector<ceph_tid_t> tids; + cancel_flush(fop, false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + return -ECANCELED; + } + + if (fop->on_flush) { + ctx->register_on_finish(*(fop->on_flush)); + fop->on_flush = std::nullopt; + } + + ctx->at_version = get_next_version(); + + ctx->new_obs = obc->obs; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY); + --ctx->delta_stats.num_objects_dirty; + if (fop->obc->obs.oi.has_manifest()) { + ceph_assert(obc->obs.oi.manifest.is_chunked()); + PGTransaction* t = ctx->op_t.get(); + uint64_t chunks_size = 0; + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + chunks_size += p.second.length; + } + if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) { + t->omap_clear(oid); + ctx->new_obs.oi.clear_omap_digest(); + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP); + ctx->clean_regions.mark_omap_dirty(); + } + if (obc->obs.oi.size == chunks_size) { + t->truncate(oid, 0); + interval_set<uint64_t> trim; + trim.insert(0, ctx->new_obs.oi.size); + ctx->modified_ranges.union_of(trim); + truncate_update_size_and_usage(ctx->delta_stats, + ctx->new_obs.oi, + 0); + ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size); + ctx->new_obs.oi.new_object(); + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + p.second.set_flag(chunk_info_t::FLAG_MISSING); + } + } else { + for (auto &p : ctx->new_obs.oi.manifest.chunk_map) { + dout(20) << __func__ << " offset: " << p.second.offset + << " length: " << p.second.length << dendl; + p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN + } + } + } + + finish_ctx(ctx.get(), pg_log_entry_t::CLEAN); + + osd->logger->inc(l_osd_tier_clean); + + if (!fop->dup_ops.empty() || fop->op) { + dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl; + list<OpRequestRef> ls; + if (fop->op) + ls.push_back(fop->op); + ls.splice(ls.end(), fop->dup_ops); + requeue_ops(ls); + } + + simple_opc_submit(std::move(ctx)); + + flush_ops.erase(oid); + + if (fop->blocking) + osd->logger->inc(l_osd_tier_flush); + else + osd->logger->inc(l_osd_tier_try_flush); + + return -EINPROGRESS; +} + +void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue, + vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid " + << fop->objecter_tid << dendl; + if (fop->objecter_tid) { + tids->push_back(fop->objecter_tid); + fop->objecter_tid = 0; + } + if (fop->io_tids.size()) { + for (auto &p : fop->io_tids) { + tids->push_back(p.second); + p.second = 0; + } + } + if (fop->blocking && fop->obc->is_blocked()) { + fop->obc->stop_block(); + kick_object_context_blocked(fop->obc); + } + if (requeue) { + if (fop->op) + requeue_op(fop->op); + requeue_ops(fop->dup_ops); + } + if (fop->on_flush) { + (*(fop->on_flush))(); + fop->on_flush = std::nullopt; + } + flush_ops.erase(fop->obc->obs.oi.soid); +} + +void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin(); + while (p != flush_ops.end()) { + cancel_flush((p++)->second, requeue, tids); + } +} + +bool PrimaryLogPG::is_present_clone(hobject_t coid) +{ + if (!pool.info.allow_incomplete_clones()) + return true; + if (is_missing_object(coid)) + return true; + ObjectContextRef obc = get_object_context(coid, false); + return obc && obc->obs.exists; +} + +// ======================================================================== +// cls gather +// + +void PrimaryLogPG::cancel_cls_gather(map<hobject_t,CLSGatherOp>::iterator iter, bool requeue, + vector<ceph_tid_t> *tids) +{ + auto &cgop = iter->second; + for (std::vector<ceph_tid_t>::iterator p = cgop.objecter_tids.begin(); p != cgop.objecter_tids.end(); p++) { + tids->push_back(*p); + dout(10) << __func__ << " " << cgop.obc->obs.oi.soid << " tid " << *p << dendl; + } + cgop.objecter_tids.clear(); + close_op_ctx(cgop.ctx); + cgop.ctx = NULL; + if (requeue) { + if (cgop.op) + requeue_op(cgop.op); + } + cls_gather_ops.erase(iter); +} + +void PrimaryLogPG::cancel_cls_gather_ops(bool requeue, vector<ceph_tid_t> *tids) +{ + dout(10) << __func__ << dendl; + map<hobject_t,CLSGatherOp>::iterator p = cls_gather_ops.begin(); + while (p != cls_gather_ops.end()) { + cancel_cls_gather(p++, requeue, tids); + } +} + +// ======================================================================== +// rep op gather + +class C_OSD_RepopCommit : public Context { + PrimaryLogPGRef pg; + boost::intrusive_ptr<PrimaryLogPG::RepGather> repop; +public: + C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop) + : pg(pg), repop(repop) {} + void finish(int) override { + pg->repop_all_committed(repop.get()); + } +}; + +void PrimaryLogPG::repop_all_committed(RepGather *repop) +{ + dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed " + << dendl; + repop->all_committed = true; + if (!repop->rep_aborted) { + if (repop->v != eversion_t()) { + recovery_state.complete_write(repop->v, repop->pg_local_last_complete); + } + eval_repop(repop); + } +} + +void PrimaryLogPG::op_applied(const eversion_t &applied_version) +{ + dout(10) << "op_applied version " << applied_version << dendl; + ceph_assert(applied_version != eversion_t()); + ceph_assert(applied_version <= info.last_update); + recovery_state.local_write_applied(applied_version); + + if (is_primary() && m_scrubber) { + // if there's a scrub operation waiting for the selected chunk to be fully updated - + // allow it to continue + m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied()); + } +} + +void PrimaryLogPG::eval_repop(RepGather *repop) +{ + dout(10) << "eval_repop " << *repop + << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl; + + // ondisk? + if (repop->all_committed) { + dout(10) << " commit: " << *repop << dendl; + for (auto p = repop->on_committed.begin(); + p != repop->on_committed.end(); + repop->on_committed.erase(p++)) { + (*p)(); + } + // send dup commits, in order + auto it = waiting_for_ondisk.find(repop->v); + if (it != waiting_for_ondisk.end()) { + ceph_assert(waiting_for_ondisk.begin()->first == repop->v); + for (auto& i : it->second) { + int return_code = repop->r; + if (return_code >= 0) { + return_code = std::get<2>(i); + } + osd->reply_op_error(std::get<0>(i), return_code, repop->v, + std::get<1>(i), std::get<3>(i)); + } + waiting_for_ondisk.erase(it); + } + + publish_stats_to_osd(); + + dout(10) << " removing " << *repop << dendl; + ceph_assert(!repop_queue.empty()); + dout(20) << " q front is " << *repop_queue.front() << dendl; + if (repop_queue.front() == repop) { + RepGather *to_remove = nullptr; + while (!repop_queue.empty() && + (to_remove = repop_queue.front())->all_committed) { + repop_queue.pop_front(); + for (auto p = to_remove->on_success.begin(); + p != to_remove->on_success.end(); + to_remove->on_success.erase(p++)) { + (*p)(); + } + remove_repop(to_remove); + } + } + } +} + +void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx) +{ + FUNCTRACE(cct); + const hobject_t& soid = ctx->obs->oi.soid; + dout(7) << "issue_repop rep_tid " << repop->rep_tid + << " o " << soid + << dendl; + + + repop->v = ctx->at_version; + + ctx->op_t->add_obc(ctx->obc); + if (ctx->clone_obc) { + ctx->op_t->add_obc(ctx->clone_obc); + } + if (ctx->head_obc) { + ctx->op_t->add_obc(ctx->head_obc); + } + + Context *on_all_commit = new C_OSD_RepopCommit(this, repop); + if (!(ctx->log.empty())) { + ceph_assert(ctx->at_version >= projected_last_update); + projected_last_update = ctx->at_version; + } + for (auto &&entry: ctx->log) { + projected_log.add(entry); + } + + recovery_state.pre_submit_op( + soid, + ctx->log, + ctx->at_version); + pgbackend->submit_transaction( + soid, + ctx->delta_stats, + ctx->at_version, + std::move(ctx->op_t), + recovery_state.get_pg_trim_to(), + recovery_state.get_min_last_complete_ondisk(), + std::move(ctx->log), + ctx->updated_hset_history, + on_all_commit, + repop->rep_tid, + ctx->reqid, + ctx->op); +} + +PrimaryLogPG::RepGather *PrimaryLogPG::new_repop( + OpContext *ctx, + ceph_tid_t rep_tid) +{ + if (ctx->op) + dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl; + else + dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl; + + RepGather *repop = new RepGather( + ctx, rep_tid, info.last_complete); + + repop->start = ceph_clock_now(); + + repop_queue.push_back(&repop->queue_item); + repop->get(); + + osd->logger->inc(l_osd_op_wip); + + dout(10) << __func__ << ": " << *repop << dendl; + return repop; +} + +boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop( + eversion_t version, + int r, + ObcLockManager &&manager, + OpRequestRef &&op, + std::optional<std::function<void(void)> > &&on_complete) +{ + RepGather *repop = new RepGather( + std::move(manager), + std::move(op), + std::move(on_complete), + osd->get_tid(), + info.last_complete, + r); + repop->v = version; + + repop->start = ceph_clock_now(); + + repop_queue.push_back(&repop->queue_item); + + osd->logger->inc(l_osd_op_wip); + + dout(10) << __func__ << ": " << *repop << dendl; + return boost::intrusive_ptr<RepGather>(repop); +} + +void PrimaryLogPG::remove_repop(RepGather *repop) +{ + dout(20) << __func__ << " " << *repop << dendl; + + for (auto p = repop->on_finish.begin(); + p != repop->on_finish.end(); + repop->on_finish.erase(p++)) { + (*p)(); + } + + release_object_locks( + repop->lock_manager); + repop->put(); + + osd->logger->dec(l_osd_op_wip); +} + +PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc) +{ + dout(20) << __func__ << " " << obc->obs.oi.soid << dendl; + ceph_tid_t rep_tid = osd->get_tid(); + osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); + OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this)); + ctx->op_t.reset(new PGTransaction()); + ctx->mtime = ceph_clock_now(); + return ctx; +} + +void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx) +{ + RepGather *repop = new_repop(ctx.get(), ctx->reqid.tid); + dout(20) << __func__ << " " << repop << dendl; + issue_repop(repop, ctx.get()); + eval_repop(repop); + recovery_state.update_trim_to(); + repop->put(); +} + + +void PrimaryLogPG::submit_log_entries( + const mempool::osd_pglog::list<pg_log_entry_t> &entries, + ObcLockManager &&manager, + std::optional<std::function<void(void)> > &&_on_complete, + OpRequestRef op, + int r) +{ + dout(10) << __func__ << " " << entries << dendl; + ceph_assert(is_primary()); + + eversion_t version; + if (!entries.empty()) { + ceph_assert(entries.rbegin()->version >= projected_last_update); + version = projected_last_update = entries.rbegin()->version; + } + + boost::intrusive_ptr<RepGather> repop; + std::optional<std::function<void(void)> > on_complete; + if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) { + repop = new_repop( + version, + r, + std::move(manager), + std::move(op), + std::move(_on_complete)); + } else { + on_complete = std::move(_on_complete); + } + + pgbackend->call_write_ordered( + [this, entries, repop, on_complete]() { + ObjectStore::Transaction t; + eversion_t old_last_update = info.last_update; + recovery_state.merge_new_log_entries( + entries, t, recovery_state.get_pg_trim_to(), + recovery_state.get_min_last_complete_ondisk()); + + set<pg_shard_t> waiting_on; + for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin(); + i != get_acting_recovery_backfill().end(); + ++i) { + pg_shard_t peer(*i); + if (peer == pg_whoami) continue; + ceph_assert(recovery_state.get_peer_missing().count(peer)); + ceph_assert(recovery_state.has_peer_info(peer)); + if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) { + ceph_assert(repop); + MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing( + entries, + spg_t(info.pgid.pgid, i->shard), + pg_whoami.shard, + get_osdmap_epoch(), + get_last_peering_reset(), + repop->rep_tid, + recovery_state.get_pg_trim_to(), + recovery_state.get_min_last_complete_ondisk()); + osd->send_message_osd_cluster( + peer.osd, m, get_osdmap_epoch()); + waiting_on.insert(peer); + } else { + MOSDPGLog *m = new MOSDPGLog( + peer.shard, pg_whoami.shard, + info.last_update.epoch, + info, get_last_peering_reset()); + m->log.log = entries; + m->log.tail = old_last_update; + m->log.head = info.last_update; + osd->send_message_osd_cluster( + peer.osd, m, get_osdmap_epoch()); + } + } + ceph_tid_t rep_tid = repop->rep_tid; + waiting_on.insert(pg_whoami); + log_entry_update_waiting_on.insert( + make_pair( + rep_tid, + LogUpdateCtx{std::move(repop), std::move(waiting_on)} + )); + struct OnComplete : public Context { + PrimaryLogPGRef pg; + ceph_tid_t rep_tid; + epoch_t epoch; + OnComplete( + PrimaryLogPGRef pg, + ceph_tid_t rep_tid, + epoch_t epoch) + : pg(pg), rep_tid(rep_tid), epoch(epoch) {} + void finish(int) override { + std::scoped_lock l{*pg}; + if (!pg->pg_has_reset_since(epoch)) { + auto it = pg->log_entry_update_waiting_on.find(rep_tid); + ceph_assert(it != pg->log_entry_update_waiting_on.end()); + auto it2 = it->second.waiting_on.find(pg->pg_whoami); + ceph_assert(it2 != it->second.waiting_on.end()); + it->second.waiting_on.erase(it2); + if (it->second.waiting_on.empty()) { + pg->repop_all_committed(it->second.repop.get()); + pg->log_entry_update_waiting_on.erase(it); + } + } + } + }; + t.register_on_commit( + new OnComplete{this, rep_tid, get_osdmap_epoch()}); + int r = osd->store->queue_transaction(ch, std::move(t), NULL); + ceph_assert(r == 0); + op_applied(info.last_update); + }); + + recovery_state.update_trim_to(); +} + +void PrimaryLogPG::cancel_log_updates() +{ + // get rid of all the LogUpdateCtx so their references to repops are + // dropped + log_entry_update_waiting_on.clear(); +} + +// ------------------------------------------------------- + +void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls) +{ + std::scoped_lock l{*this}; + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) { + ObjectContextRef obc(i.second); + get_obc_watchers(obc, *ls); + } +} + +void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers) +{ + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = + obc->watchers.begin(); + j != obc->watchers.end(); + ++j) { + obj_watch_item_t owi; + + owi.obj = obc->obs.oi.soid; + owi.wi.addr = j->second->get_peer_addr(); + owi.wi.name = j->second->get_entity(); + owi.wi.cookie = j->second->get_cookie(); + owi.wi.timeout_seconds = j->second->get_timeout(); + + dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr + << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl; + + pg_watchers.push_back(owi); + } +} + +void PrimaryLogPG::check_blocklisted_watchers() +{ + dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl; + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) + check_blocklisted_obc_watchers(i.second); +} + +void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc) +{ + dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl; + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k = + obc->watchers.begin(); + k != obc->watchers.end(); + ) { + //Advance iterator now so handle_watch_timeout() can erase element + map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++; + dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl; + entity_addr_t ea = j->second->get_peer_addr(); + dout(30) << "watch: Check entity_addr_t " << ea << dendl; + if (get_osdmap()->is_blocklisted(ea)) { + dout(10) << "watch: Found blocklisted watcher for " << ea << dendl; + ceph_assert(j->second->get_pg() == this); + j->second->unregister_cb(); + handle_watch_timeout(j->second); + } + } +} + +void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc) +{ + ceph_assert(is_primary() && is_active()); + auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid); + ceph_assert((recovering.count(obc->obs.oi.soid) || + !is_missing_object(obc->obs.oi.soid)) || + (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary() + it_objects->second->op == + pg_log_entry_t::LOST_REVERT && + it_objects->second->reverting_to == + obc->obs.oi.version)); + + dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl; + ceph_assert(obc->watchers.empty()); + // populate unconnected_watchers + for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p = + obc->obs.oi.watchers.begin(); + p != obc->obs.oi.watchers.end(); + ++p) { + utime_t expire = info.stats.last_became_active; + expire += p->second.timeout_seconds; + dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl; + WatchRef watch( + Watch::makeWatchRef( + this, osd, obc, p->second.timeout_seconds, p->first.first, + p->first.second, p->second.addr)); + watch->disconnect(); + obc->watchers.insert( + make_pair( + make_pair(p->first.first, p->first.second), + watch)); + } + // Look for watchers from blocklisted clients and drop + check_blocklisted_obc_watchers(obc); +} + +void PrimaryLogPG::handle_watch_timeout(WatchRef watch) +{ + ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref + dout(10) << "handle_watch_timeout obc " << *obc << dendl; + + if (!is_active()) { + dout(10) << "handle_watch_timeout not active, no-op" << dendl; + return; + } + if (!obc->obs.exists) { + dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl; + return; + } + if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) { + callbacks_for_degraded_object[obc->obs.oi.soid].push_back( + watch->get_delayed_cb() + ); + dout(10) << "handle_watch_timeout waiting for degraded on obj " + << obc->obs.oi.soid + << dendl; + return; + } + + if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) { + dout(10) << "handle_watch_timeout waiting for scrub on obj " + << obc->obs.oi.soid + << dendl; + m_scrubber->add_callback( + watch->get_delayed_cb() // This callback! + ); + return; + } + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->at_version = get_next_version(); + + object_info_t& oi = ctx->new_obs.oi; + oi.watchers.erase(make_pair(watch->get_cookie(), + watch->get_entity())); + + list<watch_disconnect_t> watch_disconnects = { + watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true) + }; + ctx->register_on_success( + [this, obc, watch_disconnects]() { + complete_disconnect_watches(obc, watch_disconnects); + }); + + + PGTransaction *t = ctx->op_t.get(); + ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid, + ctx->at_version, + oi.version, + 0, + osd_reqid_t(), ctx->mtime, 0)); + + oi.prior_version = obc->obs.oi.version; + oi.version = ctx->at_version; + bufferlist bl; + encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + t->setattr(obc->obs.oi.soid, OI_ATTR, bl); + + // apply new object state. + ctx->obc->obs = ctx->new_obs; + + // no ctx->delta_stats + simple_opc_submit(std::move(ctx)); +} + +ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi, + SnapSetContext *ssc) +{ + ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid)); + ceph_assert(obc->destructor_callback == NULL); + obc->destructor_callback = new C_PG_ObjectContext(this, obc.get()); + obc->obs.oi = oi; + obc->obs.exists = false; + obc->ssc = ssc; + if (ssc) + register_snapset_context(ssc); + dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl; + if (is_active()) + populate_obc_watchers(obc); + return obc; +} + +ObjectContextRef PrimaryLogPG::get_object_context( + const hobject_t& soid, + bool can_create, + const map<string, bufferlist, less<>> *attrs) +{ + auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid); + ceph_assert( + attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) || + // or this is a revert... see recover_primary() + (it_objects != recovery_state.get_pg_log().get_log().objects.end() && + it_objects->second->op == + pg_log_entry_t::LOST_REVERT)); + ObjectContextRef obc = object_contexts.lookup(soid); + osd->logger->inc(l_osd_object_ctx_cache_total); + if (obc) { + osd->logger->inc(l_osd_object_ctx_cache_hit); + dout(10) << __func__ << ": found obc in cache: " << *obc + << dendl; + } else { + dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl; + // check disk + bufferlist bv; + if (attrs) { + auto it_oi = attrs->find(OI_ATTR); + ceph_assert(it_oi != attrs->end()); + bv = it_oi->second; + } else { + int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv); + if (r < 0) { + if (!can_create) { + dout(10) << __func__ << ": no obc for soid " + << soid << " and !can_create" + << dendl; + return ObjectContextRef(); // -ENOENT! + } + + dout(10) << __func__ << ": no obc for soid " + << soid << " but can_create" + << dendl; + // new object. + object_info_t oi(soid); + SnapSetContext *ssc = get_snapset_context( + soid, true, 0, false); + ceph_assert(ssc); + obc = create_object_context(oi, ssc); + dout(10) << __func__ << ": " << *obc + << " oi: " << obc->obs.oi + << " " << *obc->ssc << dendl; + return obc; + } + } + + object_info_t oi; + try { + bufferlist::const_iterator bliter = bv.begin(); + decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": obc corrupt: " << soid << dendl; + return ObjectContextRef(); // -ENOENT! + } + + ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool()); + + obc = object_contexts.lookup_or_create(oi.soid); + obc->destructor_callback = new C_PG_ObjectContext(this, obc.get()); + obc->obs.oi = oi; + obc->obs.exists = true; + + obc->ssc = get_snapset_context( + soid, true, + soid.has_snapset() ? attrs : 0); + + if (is_primary() && is_active()) + populate_obc_watchers(obc); + + if (pool.info.is_erasure()) { + if (attrs) { + obc->attr_cache = *attrs; + } else { + int r = pgbackend->objects_get_attrs( + soid, + &obc->attr_cache); + ceph_assert(r == 0); + } + } + + dout(10) << __func__ << ": creating obc from disk: " << *obc + << dendl; + } + + // XXX: Caller doesn't expect this + if (obc->ssc == NULL) { + derr << __func__ << ": obc->ssc not available, not returning context" << dendl; + return ObjectContextRef(); // -ENOENT! + } + + dout(10) << __func__ << ": " << *obc + << " oi: " << obc->obs.oi + << " exists: " << (int)obc->obs.exists + << " " << *obc->ssc << dendl; + return obc; +} + +void PrimaryLogPG::context_registry_on_change() +{ + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) { + ObjectContextRef obc(i.second); + if (obc) { + for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = + obc->watchers.begin(); + j != obc->watchers.end(); + obc->watchers.erase(j++)) { + j->second->discard(); + } + } + } +} + + +/* + * If we return an error, and set *pmissing, then promoting that + * object may help. + * + * If we return -EAGAIN, we will always set *pmissing to the missing + * object to wait for. + * + * If we return an error but do not set *pmissing, then we know the + * object does not exist. + */ +int PrimaryLogPG::find_object_context(const hobject_t& oid, + ObjectContextRef *pobc, + bool can_create, + bool map_snapid_to_clone, + hobject_t *pmissing) +{ + FUNCTRACE(cct); + ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool())); + // want the head? + if (oid.snap == CEPH_NOSNAP) { + ObjectContextRef obc = get_object_context(oid, can_create); + if (!obc) { + if (pmissing) + *pmissing = oid; + return -ENOENT; + } + dout(10) << __func__ << " " << oid + << " @" << oid.snap + << " oi=" << obc->obs.oi + << dendl; + *pobc = obc; + + return 0; + } + + // we want a snap + + hobject_t head = oid.get_head(); + SnapSetContext *ssc = get_snapset_context(oid, can_create); + if (!ssc || !(ssc->exists || can_create)) { + dout(20) << __func__ << " " << oid << " no snapset" << dendl; + if (pmissing) + *pmissing = head; // start by getting the head + if (ssc) + put_snapset_context(ssc); + return -ENOENT; + } + + if (map_snapid_to_clone) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " map_snapid_to_clone=true" << dendl; + if (oid.snap > ssc->snapset.seq) { + // already must be readable + ObjectContextRef obc = get_object_context(head, false); + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to head" << dendl; + *pobc = obc; + put_snapset_context(ssc); + return (obc && obc->obs.exists) ? 0 : -ENOENT; + } else { + vector<snapid_t>::const_iterator citer = std::find( + ssc->snapset.clones.begin(), + ssc->snapset.clones.end(), + oid.snap); + if (citer == ssc->snapset.clones.end()) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to nothing" << dendl; + put_snapset_context(ssc); + return -ENOENT; + } + + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " maps to " << oid << dendl; + + if (recovery_state.get_pg_log().get_missing().is_missing(oid)) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " is missing" << dendl; + if (pmissing) + *pmissing = oid; + put_snapset_context(ssc); + return -EAGAIN; + } + + ObjectContextRef obc = get_object_context(oid, false); + if (!obc || !obc->obs.exists) { + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " is not present" << dendl; + if (pmissing) + *pmissing = oid; + put_snapset_context(ssc); + return -ENOENT; + } + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset + << " " << oid << " HIT" << dendl; + *pobc = obc; + put_snapset_context(ssc); + return 0; + } + ceph_abort(); //unreachable + } + + dout(10) << __func__ << " " << oid << " @" << oid.snap + << " snapset " << ssc->snapset << dendl; + + // head? + if (oid.snap > ssc->snapset.seq) { + ObjectContextRef obc = get_object_context(head, false); + dout(10) << __func__ << " " << head + << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq + << " -- HIT " << obc->obs + << dendl; + if (!obc->ssc) + obc->ssc = ssc; + else { + ceph_assert(ssc == obc->ssc); + put_snapset_context(ssc); + } + *pobc = obc; + return 0; + } + + // which clone would it be? + unsigned k = 0; + while (k < ssc->snapset.clones.size() && + ssc->snapset.clones[k] < oid.snap) + k++; + if (k == ssc->snapset.clones.size()) { + dout(10) << __func__ << " no clones with last >= oid.snap " + << oid.snap << " -- DNE" << dendl; + put_snapset_context(ssc); + return -ENOENT; + } + hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(), + info.pgid.pool(), oid.get_namespace()); + + if (recovery_state.get_pg_log().get_missing().is_missing(soid)) { + dout(20) << __func__ << " " << soid << " missing, try again later" + << dendl; + if (pmissing) + *pmissing = soid; + put_snapset_context(ssc); + return -EAGAIN; + } + + ObjectContextRef obc = get_object_context(soid, false); + if (!obc || !obc->obs.exists) { + if (pmissing) + *pmissing = soid; + put_snapset_context(ssc); + if (is_primary()) { + if (is_degraded_or_backfilling_object(soid)) { + dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl; + return -EAGAIN; + } else if (is_degraded_on_async_recovery_target(soid)) { + dout(20) << __func__ << " clone is recovering " << soid << dendl; + return -EAGAIN; + } else { + dout(20) << __func__ << " missing clone " << soid << dendl; + return -ENOENT; + } + } else { + dout(20) << __func__ << " replica missing clone" << soid << dendl; + return -ENOENT; + } + } + + if (!obc->ssc) { + obc->ssc = ssc; + } else { + ceph_assert(obc->ssc == ssc); + put_snapset_context(ssc); + } + ssc = 0; + + // clone + dout(20) << __func__ << " " << soid + << " snapset " << obc->ssc->snapset + << dendl; + snapid_t first, last; + auto p = obc->ssc->snapset.clone_snaps.find(soid.snap); + ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); + if (p->second.empty()) { + dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl; + ceph_assert(!cct->_conf->osd_debug_verify_snaps); + return -ENOENT; + } + if (std::find(p->second.begin(), p->second.end(), oid.snap) == + p->second.end()) { + dout(20) << __func__ << " " << soid << " clone_snaps " << p->second + << " does not contain " << oid.snap << " -- DNE" << dendl; + return -ENOENT; + } + if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) { + dout(20) << __func__ << " " << soid << " snap " << oid.snap + << " in removed_snaps_queue" << " -- DNE" << dendl; + return -ENOENT; + } + dout(20) << __func__ << " " << soid << " clone_snaps " << p->second + << " contains " << oid.snap << " -- HIT " << obc->obs << dendl; + *pobc = obc; + return 0; +} + +void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc) +{ + if (obc->ssc) + put_snapset_context(obc->ssc); +} + +void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat) +{ + object_info_t& oi = obc->obs.oi; + + dout(10) << __func__ << " " << oi.soid << dendl; + ceph_assert(!oi.soid.is_snapdir()); + + object_stat_sum_t stat; + stat.num_objects++; + if (oi.is_dirty()) + stat.num_objects_dirty++; + if (oi.is_whiteout()) + stat.num_whiteouts++; + if (oi.is_omap()) + stat.num_objects_omap++; + if (oi.is_cache_pinned()) + stat.num_objects_pinned++; + if (oi.has_manifest()) + stat.num_objects_manifest++; + + if (oi.soid.is_snap()) { + stat.num_object_clones++; + + if (!obc->ssc) + obc->ssc = get_snapset_context(oi.soid, false); + ceph_assert(obc->ssc); + stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap); + } else { + stat.num_bytes += oi.size; + } + + // add it in + pgstat->stats.sum.add(stat); +} + +void PrimaryLogPG::requeue_op_blocked_by_object(const hobject_t &soid) { + map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid); + if (p != waiting_for_blocked_object.end()) { + list<OpRequestRef>& ls = p->second; + dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl; + requeue_ops(ls); + waiting_for_blocked_object.erase(p); + } +} + +void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc) +{ + const hobject_t& soid = obc->obs.oi.soid; + if (obc->is_blocked()) { + dout(10) << __func__ << " " << soid << " still blocked" << dendl; + return; + } + + requeue_op_blocked_by_object(soid); + + map<hobject_t, ObjectContextRef>::iterator i = + objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head()); + if (i != objects_blocked_on_snap_promotion.end()) { + ceph_assert(i->second == obc); + ObjectContextRef head_obc = get_object_context(i->first, false); + head_obc->stop_block(); + // kick blocked ops (head) + requeue_op_blocked_by_object(i->first); + objects_blocked_on_snap_promotion.erase(i); + } + + if (obc->requeue_scrub_on_unblock) { + + obc->requeue_scrub_on_unblock = false; + + dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl; + + // only requeue if we are still active: we may be unblocking + // because we are resetting for a new peering interval + if (is_active()) { + osd->queue_scrub_unblocking(this, is_scrub_blocking_ops()); + } + } +} + +SnapSetContext *PrimaryLogPG::get_snapset_context( + const hobject_t& oid, + bool can_create, + const map<string, bufferlist, less<>> *attrs, + bool oid_existed) +{ + std::lock_guard l(snapset_contexts_lock); + SnapSetContext *ssc; + map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find( + oid.get_snapdir()); + if (p != snapset_contexts.end()) { + if (can_create || p->second->exists) { + ssc = p->second; + } else { + return NULL; + } + } else { + bufferlist bv; + if (!attrs) { + int r = -ENOENT; + if (!(oid.is_head() && !oid_existed)) { + r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv); + } + if (r < 0 && !can_create) + return NULL; + } else { + auto it_ss = attrs->find(SS_ATTR); + ceph_assert(it_ss != attrs->end()); + bv = it_ss->second; + } + ssc = new SnapSetContext(oid.get_snapdir()); + _register_snapset_context(ssc); + if (bv.length()) { + bufferlist::const_iterator bvp = bv.begin(); + try { + ssc->snapset.decode(bvp); + } catch (const ceph::buffer::error& e) { + dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl; + return NULL; + } + ssc->exists = true; + } else { + ssc->exists = false; + } + } + ceph_assert(ssc); + ssc->ref++; + return ssc; +} + +void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc) +{ + std::lock_guard l(snapset_contexts_lock); + --ssc->ref; + if (ssc->ref == 0) { + if (ssc->registered) + snapset_contexts.erase(ssc->oid); + delete ssc; + } +} + +/* + * Return values: + * NONE - didn't pull anything + * YES - pulled what the caller wanted + * HEAD - needed to pull head first + */ +enum { PULL_NONE, PULL_HEAD, PULL_YES }; + +int PrimaryLogPG::recover_missing( + const hobject_t &soid, eversion_t v, + int priority, + PGBackend::RecoveryHandle *h) +{ + dout(10) << __func__ << " sar: " << scrub_after_recovery << dendl; + + if (recovery_state.get_missing_loc().is_unfound(soid)) { + dout(7) << __func__ << " " << soid + << " v " << v + << " but it is unfound" << dendl; + return PULL_NONE; + } + + if (recovery_state.get_missing_loc().is_deleted(soid)) { + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, ObjectContextRef())); + epoch_t cur_epoch = get_osdmap_epoch(); + remove_missing_object(soid, v, new LambdaContext( + [=, this](int) { + std::scoped_lock locker{*this}; + if (!pg_has_reset_since(cur_epoch)) { + bool object_missing = false; + for (const auto& shard : get_acting_recovery_backfill()) { + if (shard == pg_whoami) + continue; + if (recovery_state.get_peer_missing(shard).is_missing(soid)) { + dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl; + object_missing = true; + break; + } + } + if (!object_missing) { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + if (scrub_after_recovery) + stat_diff.num_objects_repaired = 1; + on_global_recover(soid, stat_diff, true); + } else { + auto recovery_handle = pgbackend->open_recovery_op(); + pgbackend->recover_delete_object(soid, v, recovery_handle); + pgbackend->run_recovery_op(recovery_handle, priority); + } + } + })); + return PULL_YES; + } + + // is this a snapped object? if so, consult the snapset.. we may not need the entire object! + ObjectContextRef obc; + ObjectContextRef head_obc; + if (soid.snap && soid.snap < CEPH_NOSNAP) { + // do we have the head? + hobject_t head = soid.get_head(); + if (recovery_state.get_pg_log().get_missing().is_missing(head)) { + if (recovering.count(head)) { + dout(10) << " missing but already recovering head " << head << dendl; + return PULL_NONE; + } else { + int r = recover_missing( + head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority, + h); + if (r != PULL_NONE) + return PULL_HEAD; + return PULL_NONE; + } + } + head_obc = get_object_context( + head, + false, + 0); + ceph_assert(head_obc); + } + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, obc)); + int r = pgbackend->recover_object( + soid, + v, + head_obc, + obc, + h); + // This is only a pull which shouldn't return an error + ceph_assert(r >= 0); + return PULL_YES; +} + +void PrimaryLogPG::remove_missing_object(const hobject_t &soid, + eversion_t v, Context *on_complete) +{ + dout(20) << __func__ << " " << soid << " " << v << dendl; + ceph_assert(on_complete != nullptr); + // delete locally + ObjectStore::Transaction t; + remove_snap_mapped_object(t, soid); + + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = v; + + epoch_t cur_epoch = get_osdmap_epoch(); + t.register_on_complete(new LambdaContext( + [=, this](int) { + std::unique_lock locker{*this}; + if (!pg_has_reset_since(cur_epoch)) { + ObjectStore::Transaction t2; + on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2); + t2.register_on_complete(on_complete); + int r = osd->store->queue_transaction(ch, std::move(t2), nullptr); + ceph_assert(r == 0); + locker.unlock(); + } else { + locker.unlock(); + on_complete->complete(-EAGAIN); + } + })); + int r = osd->store->queue_transaction(ch, std::move(t), nullptr); + ceph_assert(r == 0); +} + +void PrimaryLogPG::finish_degraded_object(const hobject_t oid) +{ + dout(10) << __func__ << " " << oid << dendl; + if (callbacks_for_degraded_object.count(oid)) { + list<Context*> contexts; + contexts.swap(callbacks_for_degraded_object[oid]); + callbacks_for_degraded_object.erase(oid); + for (list<Context*>::iterator i = contexts.begin(); + i != contexts.end(); + ++i) { + (*i)->complete(0); + } + } + map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find( + oid.get_head()); + if (i != objects_blocked_on_degraded_snap.end() && + i->second == oid.snap) + objects_blocked_on_degraded_snap.erase(i); +} + +void PrimaryLogPG::_committed_pushed_object( + epoch_t epoch, eversion_t last_complete) +{ + std::scoped_lock locker{*this}; + if (!pg_has_reset_since(epoch)) { + recovery_state.recovery_committed_to(last_complete); + } else { + dout(10) << __func__ + << " pg has changed, not touching last_complete_ondisk" << dendl; + } +} + +void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc) +{ + dout(20) << __func__ << dendl; + if (obc) { + dout(20) << "obc = " << *obc << dendl; + } + ceph_assert(active_pushes >= 1); + --active_pushes; + + // requeue an active chunky scrub waiting on recovery ops + if (!recovery_state.is_deleting() && active_pushes == 0 && + is_scrub_active()) { + + osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops()); + } +} + +void PrimaryLogPG::_applied_recovered_object_replica() +{ + dout(20) << __func__ << dendl; + ceph_assert(active_pushes >= 1); + --active_pushes; + + // requeue an active scrub waiting on recovery ops + if (!recovery_state.is_deleting() && active_pushes == 0 && + is_scrub_active()) { + + osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority()); + } +} + +void PrimaryLogPG::on_failed_pull( + const set<pg_shard_t> &from, + const hobject_t &soid, + const eversion_t &v) +{ + dout(20) << __func__ << ": " << soid << dendl; + ceph_assert(recovering.count(soid)); + auto obc = recovering[soid]; + if (obc) { + list<OpRequestRef> blocked_ops; + obc->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + recovering.erase(soid); + for (auto&& i : from) { + if (i != pg_whoami) { // we'll get it below in primary_error + recovery_state.force_object_missing(i, soid, v); + } + } + + dout(0) << __func__ << " " << soid << " from shard " << from + << ", reps on " << recovery_state.get_missing_loc().get_locations(soid) + << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid) + << dendl; + finish_recovery_op(soid); // close out this attempt, + finish_degraded_object(soid); + + if (from.count(pg_whoami)) { + dout(0) << " primary missing oid " << soid << " version " << v << dendl; + primary_error(soid, v); + backfills_in_flight.erase(soid); + } +} + +eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid) +{ + eversion_t v; + pg_missing_item pmi; + bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi); + ceph_assert(is_missing); + v = pmi.have; + dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl; + + ceph_assert(!get_acting_recovery_backfill().empty()); + for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin(); + i != get_acting_recovery_backfill().end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + if (!recovery_state.get_peer_missing(peer).is_missing(oid)) { + continue; + } + eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have; + dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl; + if (h > v) + v = h; + } + + dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl; + return v; +} + +void PrimaryLogPG::do_update_log_missing(OpRequestRef &op) +{ + const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>( + op->get_req()); + ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING); + ObjectStore::Transaction t; + std::optional<eversion_t> op_trim_to, op_roll_forward_to; + if (m->pg_trim_to != eversion_t()) + op_trim_to = m->pg_trim_to; + if (m->pg_roll_forward_to != eversion_t()) + op_roll_forward_to = m->pg_roll_forward_to; + + dout(20) << __func__ + << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl; + + recovery_state.append_log_entries_update_missing( + m->entries, t, op_trim_to, op_roll_forward_to); + eversion_t new_lcod = info.last_complete; + + Context *complete = new LambdaContext( + [=, this](int) { + const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>( + op->get_req()); + std::scoped_lock locker{*this}; + if (!pg_has_reset_since(msg->get_epoch())) { + update_last_complete_ondisk(new_lcod); + MOSDPGUpdateLogMissingReply *reply = + new MOSDPGUpdateLogMissingReply( + spg_t(info.pgid.pgid, primary_shard().shard), + pg_whoami.shard, + msg->get_epoch(), + msg->min_epoch, + msg->get_tid(), + new_lcod); + reply->set_priority(CEPH_MSG_PRIO_HIGH); + msg->get_connection()->send_message(reply); + } + }); + + if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) { + t.register_on_commit(complete); + } else { + /* Hack to work around the fact that ReplicatedBackend sends + * ack+commit if commit happens first + * + * This behavior is no longer necessary, but we preserve it so old + * primaries can keep their repops in order */ + if (pool.info.is_erasure()) { + t.register_on_complete(complete); + } else { + t.register_on_commit(complete); + } + } + int tr = osd->store->queue_transaction( + ch, + std::move(t), + nullptr); + ceph_assert(tr == 0); + op_applied(info.last_update); +} + +void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op) +{ + const MOSDPGUpdateLogMissingReply *m = + static_cast<const MOSDPGUpdateLogMissingReply*>( + op->get_req()); + dout(20) << __func__ << " got reply from " + << m->get_from() << dendl; + + auto it = log_entry_update_waiting_on.find(m->get_tid()); + if (it != log_entry_update_waiting_on.end()) { + if (it->second.waiting_on.count(m->get_from())) { + it->second.waiting_on.erase(m->get_from()); + if (m->last_complete_ondisk != eversion_t()) { + update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk); + } + } else { + osd->clog->error() + << info.pgid << " got reply " + << *m << " from shard we are not waiting for " + << m->get_from(); + } + + if (it->second.waiting_on.empty()) { + repop_all_committed(it->second.repop.get()); + log_entry_update_waiting_on.erase(it); + } + } else { + osd->clog->error() + << info.pgid << " got reply " + << *m << " on unknown tid " << m->get_tid(); + } +} + +/* Mark all unfound objects as lost. + */ +void PrimaryLogPG::mark_all_unfound_lost( + int what, + std::function<void(int,const std::string&,bufferlist&)> on_finish) +{ + dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl; + list<hobject_t> oids; + + dout(30) << __func__ << ": log before:\n"; + recovery_state.get_pg_log().get_log().print(*_dout); + *_dout << dendl; + + mempool::osd_pglog::list<pg_log_entry_t> log_entries; + + utime_t mtime = ceph_clock_now(); + map<hobject_t, pg_missing_item>::const_iterator m = + recovery_state.get_missing_loc().get_needs_recovery().begin(); + map<hobject_t, pg_missing_item>::const_iterator mend = + recovery_state.get_missing_loc().get_needs_recovery().end(); + + ObcLockManager manager; + eversion_t v = get_next_version(); + v.epoch = get_osdmap_epoch(); + uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound(); + while (m != mend) { + const hobject_t &oid(m->first); + if (!recovery_state.get_missing_loc().is_unfound(oid)) { + // We only care about unfound objects + ++m; + continue; + } + + ObjectContextRef obc; + eversion_t prev; + + switch (what) { + case pg_log_entry_t::LOST_MARK: + ceph_abort_msg("actually, not implemented yet!"); + break; + + case pg_log_entry_t::LOST_REVERT: + prev = pick_newest_available(oid); + if (prev > eversion_t()) { + // log it + pg_log_entry_t e( + pg_log_entry_t::LOST_REVERT, oid, v, + m->second.need, 0, osd_reqid_t(), mtime, 0); + e.reverting_to = prev; + e.mark_unrollbackable(); + log_entries.push_back(e); + dout(10) << e << dendl; + + // we are now missing the new version; recovery code will sort it out. + ++v.version; + ++m; + break; + } + + case pg_log_entry_t::LOST_DELETE: + { + pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need, + 0, osd_reqid_t(), mtime, 0); + if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) { + if (pool.info.require_rollback()) { + e.mod_desc.try_rmobject(v.version); + } else { + e.mark_unrollbackable(); + } + } // otherwise, just do what we used to do + dout(10) << e << dendl; + log_entries.push_back(e); + oids.push_back(oid); + + // If context found mark object as deleted in case + // of racing with new creation. This can happen if + // object lost and EIO at primary. + obc = object_contexts.lookup(oid); + if (obc) + obc->obs.exists = false; + + ++v.version; + ++m; + } + break; + + default: + ceph_abort(); + } + } + + recovery_state.update_stats( + [](auto &history, auto &stats) { + stats.stats_invalid = true; + return false; + }); + + submit_log_entries( + log_entries, + std::move(manager), + std::optional<std::function<void(void)> >( + [this, oids, num_unfound, on_finish]() { + if (recovery_state.perform_deletes_during_peering()) { + for (auto oid : oids) { + // clear old locations - merge_new_log_entries will have + // handled rebuilding missing_loc for each of these + // objects if we have the RECOVERY_DELETES flag + recovery_state.object_recovered(oid, object_stat_sum_t()); + } + } + + if (is_recovery_unfound()) { + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery()))); + } else if (is_backfill_unfound()) { + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } else { + queue_recovery(); + } + + stringstream ss; + ss << "pg has " << num_unfound + << " objects unfound and apparently lost marking"; + string rs = ss.str(); + dout(0) << "do_command r=" << 0 << " " << rs << dendl; + osd->clog->info() << rs; + bufferlist empty; + on_finish(0, rs, empty); + }), + OpRequestRef()); +} + +void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits) +{ + ceph_assert(repop_queue.empty()); +} + +/* + * pg status change notification + */ + +void PrimaryLogPG::apply_and_flush_repops(bool requeue) +{ + list<OpRequestRef> rq; + + // apply all repops + while (!repop_queue.empty()) { + RepGather *repop = repop_queue.front(); + repop_queue.pop_front(); + dout(10) << " canceling repop tid " << repop->rep_tid << dendl; + repop->rep_aborted = true; + repop->on_committed.clear(); + repop->on_success.clear(); + + if (requeue) { + if (repop->op) { + dout(10) << " requeuing " << *repop->op->get_req() << dendl; + rq.push_back(repop->op); + repop->op = OpRequestRef(); + } + + // also requeue any dups, interleaved into position + auto p = waiting_for_ondisk.find(repop->v); + if (p != waiting_for_ondisk.end()) { + dout(10) << " also requeuing ondisk waiters " << p->second << dendl; + for (auto& i : p->second) { + rq.push_back(std::get<0>(i)); + } + waiting_for_ondisk.erase(p); + } + } + + remove_repop(repop); + } + + ceph_assert(repop_queue.empty()); + + if (requeue) { + requeue_ops(rq); + if (!waiting_for_ondisk.empty()) { + for (auto& i : waiting_for_ondisk) { + for (auto& j : i.second) { + derr << __func__ << ": op " << *(std::get<0>(j)->get_req()) + << " waiting on " << i.first << dendl; + } + } + ceph_assert(waiting_for_ondisk.empty()); + } + } + + waiting_for_ondisk.clear(); +} + +void PrimaryLogPG::on_flushed() +{ + requeue_ops(waiting_for_flush); + if (!is_peered() || !is_primary()) { + pair<hobject_t, ObjectContextRef> i; + while (object_contexts.get_next(i.first, &i)) { + derr << __func__ << ": object " << i.first << " obc still alive" << dendl; + } + ceph_assert(object_contexts.empty()); + } +} + +void PrimaryLogPG::on_removal(ObjectStore::Transaction &t) +{ + dout(10) << __func__ << dendl; + + on_shutdown(); + + t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch())); +} + +void PrimaryLogPG::clear_async_reads() +{ + dout(10) << __func__ << dendl; + for(auto& i : in_progress_async_reads) { + dout(10) << "clear ctx: " + << "OpRequestRef " << i.first + << " OpContext " << i.second + << dendl; + close_op_ctx(i.second); + } +} + +void PrimaryLogPG::clear_cache() +{ + object_contexts.clear(); +} + +void PrimaryLogPG::on_shutdown() +{ + dout(10) << __func__ << dendl; + + if (recovery_queued) { + recovery_queued = false; + osd->clear_queued_recovery(this); + } + + m_scrubber->scrub_clear_state(); + m_scrubber->rm_from_osd_scrubbing(); + + vector<ceph_tid_t> tids; + cancel_copy_ops(false, &tids); + cancel_flush_ops(false, &tids); + cancel_proxy_ops(false, &tids); + cancel_manifest_ops(false, &tids); + cancel_cls_gather_ops(false, &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + + apply_and_flush_repops(false); + cancel_log_updates(); + // we must remove PGRefs, so do this this prior to release_backoffs() callers + clear_backoffs(); + // clean up snap trim references + snap_trimmer_machine.process_event(Reset()); + + pgbackend->on_change(); + + context_registry_on_change(); + object_contexts.clear(); + + clear_async_reads(); + + osd->remote_reserver.cancel_reservation(info.pgid); + osd->local_reserver.cancel_reservation(info.pgid); + + clear_primary_state(); + cancel_recovery(); + + if (is_primary()) { + osd->clear_ready_to_merge(this); + } +} + +void PrimaryLogPG::on_activate_complete() +{ + check_local(); + // waiters + if (!recovery_state.needs_flush()) { + requeue_ops(waiting_for_peered); + } else if (!waiting_for_peered.empty()) { + dout(10) << __func__ << " flushes in progress, moving " + << waiting_for_peered.size() + << " items to waiting_for_flush" + << dendl; + ceph_assert(waiting_for_flush.empty()); + waiting_for_flush.swap(waiting_for_peered); + } + + + // all clean? + if (needs_recovery()) { + dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery()))); + } else if (needs_backfill()) { + dout(10) << "activate queueing backfill" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } else { + dout(10) << "activate all replicas clean, no recovery" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered()))); + } + + publish_stats_to_osd(); + + if (get_backfill_targets().size()) { + last_backfill_started = recovery_state.earliest_backfill(); + new_backfill = true; + ceph_assert(!last_backfill_started.is_max()); + dout(5) << __func__ << ": bft=" << get_backfill_targets() + << " from " << last_backfill_started << dendl; + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + dout(5) << "target shard " << *i + << " from " << recovery_state.get_peer_info(*i).last_backfill + << dendl; + } + } + + hit_set_setup(); + agent_setup(); +} + +void PrimaryLogPG::on_change(ObjectStore::Transaction &t) +{ + dout(10) << __func__ << dendl; + + if (hit_set && hit_set->insert_count() == 0) { + dout(20) << " discarding empty hit_set" << dendl; + hit_set_clear(); + } + + if (recovery_queued) { + recovery_queued = false; + osd->clear_queued_recovery(this); + } + + // requeue everything in the reverse order they should be + // reexamined. + requeue_ops(waiting_for_peered); + requeue_ops(waiting_for_flush); + requeue_ops(waiting_for_active); + requeue_ops(waiting_for_readable); + + vector<ceph_tid_t> tids; + cancel_copy_ops(is_primary(), &tids); + cancel_flush_ops(is_primary(), &tids); + cancel_proxy_ops(is_primary(), &tids); + cancel_manifest_ops(is_primary(), &tids); + cancel_cls_gather_ops(is_primary(), &tids); + osd->objecter->op_cancel(tids, -ECANCELED); + + // requeue object waiters + for (auto& p : waiting_for_unreadable_object) { + release_backoffs(p.first); + } + if (is_primary()) { + requeue_object_waiters(waiting_for_unreadable_object); + } else { + waiting_for_unreadable_object.clear(); + } + for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin(); + p != waiting_for_degraded_object.end(); + waiting_for_degraded_object.erase(p++)) { + release_backoffs(p->first); + if (is_primary()) + requeue_ops(p->second); + else + p->second.clear(); + finish_degraded_object(p->first); + } + + // requeues waiting_for_scrub + m_scrubber->scrub_clear_state(); + + for (auto p = waiting_for_blocked_object.begin(); + p != waiting_for_blocked_object.end(); + waiting_for_blocked_object.erase(p++)) { + if (is_primary()) + requeue_ops(p->second); + else + p->second.clear(); + } + for (auto i = callbacks_for_degraded_object.begin(); + i != callbacks_for_degraded_object.end(); + ) { + finish_degraded_object((i++)->first); + } + ceph_assert(callbacks_for_degraded_object.empty()); + + if (is_primary()) { + requeue_ops(waiting_for_cache_not_full); + } else { + waiting_for_cache_not_full.clear(); + } + objects_blocked_on_cache_full.clear(); + + for (list<pair<OpRequestRef, OpContext*> >::iterator i = + in_progress_async_reads.begin(); + i != in_progress_async_reads.end(); + in_progress_async_reads.erase(i++)) { + close_op_ctx(i->second); + if (is_primary()) + requeue_op(i->first); + } + + // this will requeue ops we were working on but didn't finish, and + // any dups + apply_and_flush_repops(is_primary()); + cancel_log_updates(); + + // do this *after* apply_and_flush_repops so that we catch any newly + // registered watches. + context_registry_on_change(); + + pgbackend->on_change_cleanup(&t); + m_scrubber->cleanup_store(&t); + pgbackend->on_change(); + + // clear snap_trimmer state + snap_trimmer_machine.process_event(Reset()); + + debug_op_order.clear(); + unstable_stats.clear(); + + // we don't want to cache object_contexts through the interval change + // NOTE: we actually assert that all currently live references are dead + // by the time the flush for the next interval completes. + object_contexts.clear(); + + // should have been cleared above by finishing all of the degraded objects + ceph_assert(objects_blocked_on_degraded_snap.empty()); +} + +void PrimaryLogPG::plpg_on_role_change() +{ + dout(10) << __func__ << dendl; + if (get_role() != 0 && hit_set) { + dout(10) << " clearing hit set" << dendl; + hit_set_clear(); + } +} + +void PrimaryLogPG::plpg_on_pool_change() +{ + dout(10) << __func__ << dendl; + // requeue cache full waiters just in case the cache_mode is + // changing away from writeback mode. note that if we are not + // active the normal requeuing machinery is sufficient (and properly + // ordered). + if (is_active() && + pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK && + !waiting_for_cache_not_full.empty()) { + dout(10) << __func__ << " requeuing full waiters (not in writeback) " + << dendl; + requeue_ops(waiting_for_cache_not_full); + objects_blocked_on_cache_full.clear(); + } + hit_set_setup(); + agent_setup(); +} + +// clear state. called on recovery completion AND cancellation. +void PrimaryLogPG::_clear_recovery_state() +{ +#ifdef DEBUG_RECOVERY_OIDS + recovering_oids.clear(); +#endif + dout(15) << __func__ << " flags: " << m_planned_scrub << dendl; + + last_backfill_started = hobject_t(); + set<hobject_t>::iterator i = backfills_in_flight.begin(); + while (i != backfills_in_flight.end()) { + backfills_in_flight.erase(i++); + } + + list<OpRequestRef> blocked_ops; + for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin(); + i != recovering.end(); + recovering.erase(i++)) { + if (i->second) { + i->second->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + } + ceph_assert(backfills_in_flight.empty()); + pending_backfill_updates.clear(); + ceph_assert(recovering.empty()); + pgbackend->clear_recovery_state(); +} + +void PrimaryLogPG::cancel_pull(const hobject_t &soid) +{ + dout(20) << __func__ << ": " << soid << dendl; + ceph_assert(recovering.count(soid)); + ObjectContextRef obc = recovering[soid]; + if (obc) { + list<OpRequestRef> blocked_ops; + obc->drop_recovery_read(&blocked_ops); + requeue_ops(blocked_ops); + } + recovering.erase(soid); + finish_recovery_op(soid); + release_backoffs(soid); + if (waiting_for_degraded_object.count(soid)) { + dout(20) << " kicking degraded waiters on " << soid << dendl; + requeue_ops(waiting_for_degraded_object[soid]); + waiting_for_degraded_object.erase(soid); + } + if (waiting_for_unreadable_object.count(soid)) { + dout(20) << " kicking unreadable waiters on " << soid << dendl; + requeue_ops(waiting_for_unreadable_object[soid]); + waiting_for_unreadable_object.erase(soid); + } + if (is_missing_object(soid)) + recovery_state.set_last_requested(0); + finish_degraded_object(soid); +} + +void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap) +{ + pgbackend->check_recovery_sources(osdmap); +} + +bool PrimaryLogPG::start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, + uint64_t *ops_started) +{ + uint64_t& started = *ops_started; + started = 0; + bool work_in_progress = false; + bool recovery_started = false; + ceph_assert(is_primary()); + ceph_assert(is_peered()); + ceph_assert(!recovery_state.is_deleting()); + + ceph_assert(recovery_queued); + recovery_queued = false; + + if (!state_test(PG_STATE_RECOVERING) && + !state_test(PG_STATE_BACKFILLING)) { + /* TODO: I think this case is broken and will make do_recovery() + * unhappy since we're returning false */ + dout(10) << "recovery raced and were queued twice, ignoring!" << dendl; + return have_unfound(); + } + + const auto &missing = recovery_state.get_pg_log().get_missing(); + + uint64_t num_unfound = get_num_unfound(); + + if (!recovery_state.have_missing()) { + recovery_state.local_recovery_complete(); + } + + if (!missing.have_missing() || // Primary does not have missing + // or all of the missing objects are unfound. + recovery_state.all_missing_unfound()) { + // Recover the replicas. + started = recover_replicas(max, handle, &recovery_started); + } + if (!started) { + // We still have missing objects that we should grab from replicas. + started += recover_primary(max, handle); + } + if (!started && num_unfound != get_num_unfound()) { + // second chance to recovery replicas + started = recover_replicas(max, handle, &recovery_started); + } + + if (started || recovery_started) + work_in_progress = true; + + bool deferred_backfill = false; + if (recovering.empty() && + state_test(PG_STATE_BACKFILLING) && + !get_backfill_targets().empty() && started < max && + missing.num_missing() == 0 && + waiting_on_backfill.empty()) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) { + dout(10) << "deferring backfill due to NOBACKFILL" << dendl; + deferred_backfill = true; + } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) && + !is_degraded()) { + dout(10) << "deferring backfill due to NOREBALANCE" << dendl; + deferred_backfill = true; + } else if (!recovery_state.is_backfill_reserved()) { + /* DNMNOTE I think this branch is dead */ + dout(10) << "deferring backfill due to !backfill_reserved" << dendl; + if (!backfill_reserving) { + dout(10) << "queueing RequestBackfill" << dendl; + backfill_reserving = true; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } + deferred_backfill = true; + } else { + started += recover_backfill(max - started, handle, &work_in_progress); + } + } + + dout(10) << " started " << started << dendl; + osd->logger->inc(l_osd_rop, started); + + if (!recovering.empty() || + work_in_progress || recovery_ops_active > 0 || deferred_backfill) + return !work_in_progress && have_unfound(); + + ceph_assert(recovering.empty()); + ceph_assert(recovery_ops_active == 0); + + dout(10) << __func__ << " needs_recovery: " + << recovery_state.get_missing_loc().get_needs_recovery() + << dendl; + dout(10) << __func__ << " missing_loc: " + << recovery_state.get_missing_loc().get_missing_locs() + << dendl; + int unfound = get_num_unfound(); + if (unfound) { + dout(10) << " still have " << unfound << " unfound" << dendl; + return true; + } + + if (missing.num_missing() > 0) { + // this shouldn't happen! + osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with " + << missing.num_missing() << ": " << missing.get_items(); + return false; + } + + if (needs_recovery()) { + // this shouldn't happen! + // We already checked num_missing() so we must have missing replicas + osd->clog->error() << info.pgid + << " Unexpected Error: recovery ending with missing replicas"; + return false; + } + + if (state_test(PG_STATE_RECOVERING)) { + state_clear(PG_STATE_RECOVERING); + state_clear(PG_STATE_FORCED_RECOVERY); + if (needs_backfill()) { + dout(10) << "recovery done, queuing backfill" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill()))); + } else { + dout(10) << "recovery done, no backfill" << dendl; + state_clear(PG_STATE_FORCED_BACKFILL); + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered()))); + } + } else { // backfilling + state_clear(PG_STATE_BACKFILLING); + state_clear(PG_STATE_FORCED_BACKFILL); + state_clear(PG_STATE_FORCED_RECOVERY); + dout(10) << "recovery done, backfill done" << dendl; + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::Backfilled()))); + } + + return false; +} + +/** + * do one recovery op. + * return true if done, false if nothing left to do. + */ +uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle) +{ + ceph_assert(is_primary()); + + const auto &missing = recovery_state.get_pg_log().get_missing(); + + dout(10) << __func__ << " recovering " << recovering.size() + << " in pg," + << " missing " << missing << dendl; + + dout(25) << __func__ << " " << missing.get_items() << dendl; + + // look at log! + pg_log_entry_t *latest = 0; + unsigned started = 0; + int skipped = 0; + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + map<version_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested); + while (p != missing.get_rmissing().end()) { + handle.reset_tp_timeout(); + hobject_t soid; + version_t v = p->first; + + auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second); + if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) { + latest = it_objects->second; + ceph_assert(latest->is_update() || latest->is_delete()); + soid = latest->soid; + } else { + latest = 0; + soid = p->second; + } + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + + hobject_t head = soid.get_head(); + + eversion_t need = item.need; + + dout(10) << __func__ << " " + << soid << " " << item.need + << (missing.is_missing(soid) ? " (missing)":"") + << (missing.is_missing(head) ? " (missing head)":"") + << (recovering.count(soid) ? " (recovering)":"") + << (recovering.count(head) ? " (recovering head)":"") + << dendl; + + if (latest) { + switch (latest->op) { + case pg_log_entry_t::CLONE: + /* + * Handling for this special case removed for now, until we + * can correctly construct an accurate SnapSet from the old + * one. + */ + break; + + case pg_log_entry_t::LOST_REVERT: + { + if (item.have == latest->reverting_to) { + ObjectContextRef obc = get_object_context(soid, true); + + if (obc->obs.oi.version == latest->version) { + // I'm already reverting + dout(10) << " already reverting " << soid << dendl; + } else { + dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl; + obc->obs.oi.version = latest->version; + + ObjectStore::Transaction t; + bufferlist b2; + obc->obs.oi.encode( + b2, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + ceph_assert(!pool.info.require_rollback()); + t.setattr(coll, ghobject_t(soid), OI_ATTR, b2); + + recovery_state.recover_got( + soid, + latest->version, + false, + t); + + ++active_pushes; + + t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc)); + t.register_on_commit(new C_OSD_CommittedPushedObject( + this, + get_osdmap_epoch(), + info.last_complete)); + osd->store->queue_transaction(ch, std::move(t)); + continue; + } + } else { + /* + * Pull the old version of the object. Update missing_loc here to have the location + * of the version we want. + * + * This doesn't use the usual missing_loc paths, but that's okay: + * - if we have it locally, we hit the case above, and go from there. + * - if we don't, we always pass through this case during recovery and set up the location + * properly. + * - this way we don't need to mangle the missing code to be general about needing an old + * version... + */ + eversion_t alternate_need = latest->reverting_to; + dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl; + + set<pg_shard_t> good_peers; + for (auto p = recovery_state.get_peer_missing().begin(); + p != recovery_state.get_peer_missing().end(); + ++p) { + if (p->second.is_missing(soid, need) && + p->second.get_items().at(soid).have == alternate_need) { + good_peers.insert(p->first); + } + } + recovery_state.set_revert_with_targets( + soid, + good_peers); + dout(10) << " will pull " << alternate_need << " or " << need + << " from one of " + << recovery_state.get_missing_loc().get_locations(soid) + << dendl; + } + } + break; + } + } + + if (!recovering.count(soid)) { + if (recovering.count(head)) { + ++skipped; + } else { + int r = recover_missing( + soid, need, recovery_state.get_recovery_op_priority(), h); + switch (r) { + case PULL_YES: + ++started; + break; + case PULL_HEAD: + ++started; + case PULL_NONE: + ++skipped; + break; + default: + ceph_abort(); + } + if (started >= max) + break; + } + } + + // only advance last_requested if we haven't skipped anything + if (!skipped) + recovery_state.set_last_requested(v); + } + + pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority()); + return started; +} + +bool PrimaryLogPG::primary_error( + const hobject_t& soid, eversion_t v) +{ + recovery_state.force_object_missing(pg_whoami, soid, v); + bool uhoh = recovery_state.get_missing_loc().is_unfound(soid); + if (uhoh) + osd->clog->error() << info.pgid << " missing primary copy of " + << soid << ", unfound"; + else + osd->clog->error() << info.pgid << " missing primary copy of " + << soid + << ", will try copies on " + << recovery_state.get_missing_loc().get_locations(soid); + return uhoh; +} + +int PrimaryLogPG::prep_object_replica_deletes( + const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started) +{ + ceph_assert(is_primary()); + dout(10) << __func__ << ": on " << soid << dendl; + + ObjectContextRef obc = get_object_context(soid, false); + if (obc) { + if (!obc->get_recovery_read()) { + dout(20) << "replica delete delayed on " << soid + << "; could not get rw_manager lock" << dendl; + *work_started = true; + return 0; + } else { + dout(20) << "replica delete got recovery read lock on " << soid + << dendl; + } + } + + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + if (!obc) + recovering.insert(make_pair(soid, ObjectContextRef())); + else + recovering.insert(make_pair(soid, obc)); + + pgbackend->recover_delete_object(soid, v, h); + return 1; +} + +int PrimaryLogPG::prep_object_replica_pushes( + const hobject_t& soid, eversion_t v, + PGBackend::RecoveryHandle *h, + bool *work_started) +{ + ceph_assert(is_primary()); + dout(10) << __func__ << ": on " << soid << dendl; + + if (soid.snap && soid.snap < CEPH_NOSNAP) { + // do we have the head and/or snapdir? + hobject_t head = soid.get_head(); + if (recovery_state.get_pg_log().get_missing().is_missing(head)) { + if (recovering.count(head)) { + dout(10) << " missing but already recovering head " << head << dendl; + return 0; + } else { + int r = recover_missing( + head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, + recovery_state.get_recovery_op_priority(), h); + if (r != PULL_NONE) + return 1; + return 0; + } + } + } + + // NOTE: we know we will get a valid oloc off of disk here. + ObjectContextRef obc = get_object_context(soid, false); + if (!obc) { + primary_error(soid, v); + return 0; + } + + if (!obc->get_recovery_read()) { + dout(20) << "recovery delayed on " << soid + << "; could not get rw_manager lock" << dendl; + *work_started = true; + return 0; + } else { + dout(20) << "recovery got recovery read lock on " << soid + << dendl; + } + + start_recovery_op(soid); + ceph_assert(!recovering.count(soid)); + recovering.insert(make_pair(soid, obc)); + + int r = pgbackend->recover_object( + soid, + v, + ObjectContextRef(), + obc, // has snapset context + h); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl; + on_failed_pull({ pg_whoami }, soid, v); + return 0; + } + return 1; +} + +uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle, + bool *work_started) +{ + dout(10) << __func__ << "(" << max << ")" << dendl; + uint64_t started = 0; + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + + // this is FAR from an optimal recovery order. pretty lame, really. + ceph_assert(!get_acting_recovery_backfill().empty()); + // choose replicas to recover, replica has the shortest missing list first + // so we can bring it back to normal ASAP + std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing, + async_by_num_missing; + replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1); + for (auto &p: get_acting_recovery_backfill()) { + if (p == get_primary()) { + continue; + } + auto pm = recovery_state.get_peer_missing().find(p); + ceph_assert(pm != recovery_state.get_peer_missing().end()); + auto nm = pm->second.num_missing(); + if (nm != 0) { + if (is_async_recovery_target(p)) { + async_by_num_missing.push_back(make_pair(nm, p)); + } else { + replicas_by_num_missing.push_back(make_pair(nm, p)); + } + } + } + // sort by number of missing objects, in ascending order. + auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs, + const std::pair<unsigned int, pg_shard_t> &rhs) { + return lhs.first < rhs.first; + }; + // acting goes first + std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func); + // then async_recovery_targets + std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func); + replicas_by_num_missing.insert(replicas_by_num_missing.end(), + async_by_num_missing.begin(), async_by_num_missing.end()); + for (auto &replica: replicas_by_num_missing) { + pg_shard_t &peer = replica.second; + ceph_assert(peer != get_primary()); + auto pm = recovery_state.get_peer_missing().find(peer); + ceph_assert(pm != recovery_state.get_peer_missing().end()); + size_t m_sz = pm->second.num_missing(); + + dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl; + dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl; + + // oldest first! + const pg_missing_t &m(pm->second); + for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin(); + p != m.get_rmissing().end() && started < max; + ++p) { + handle.reset_tp_timeout(); + const hobject_t soid(p->second); + + if (recovery_state.get_missing_loc().is_unfound(soid)) { + dout(10) << __func__ << ": " << soid << " still unfound" << dendl; + continue; + } + + const pg_info_t &pi = recovery_state.get_peer_info(peer); + if (soid > pi.last_backfill) { + if (!recovering.count(soid)) { + derr << __func__ << ": object " << soid << " last_backfill " + << pi.last_backfill << dendl; + derr << __func__ << ": object added to missing set for backfill, but " + << "is not in recovering, error!" << dendl; + ceph_abort(); + } + continue; + } + + if (recovering.count(soid)) { + dout(10) << __func__ << ": already recovering " << soid << dendl; + continue; + } + + if (recovery_state.get_missing_loc().is_deleted(soid)) { + dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl; + map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid); + started += prep_object_replica_deletes(soid, r->second.need, h, work_started); + continue; + } + + if (soid.is_snap() && + recovery_state.get_pg_log().get_missing().is_missing( + soid.get_head())) { + dout(10) << __func__ << ": " << soid.get_head() + << " still missing on primary" << dendl; + continue; + } + + if (recovery_state.get_pg_log().get_missing().is_missing(soid)) { + dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl; + continue; + } + + dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl; + map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid); + started += prep_object_replica_pushes(soid, r->second.need, h, work_started); + } + } + + pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority()); + return started; +} + +hobject_t PrimaryLogPG::earliest_peer_backfill() const +{ + hobject_t e = hobject_t::get_max(); + for (const pg_shard_t& peer : get_backfill_targets()) { + const auto iter = peer_backfill_info.find(peer); + ceph_assert(iter != peer_backfill_info.end()); + e = std::min(e, iter->second.begin); + } + return e; +} + +bool PrimaryLogPG::all_peer_done() const +{ + // Primary hasn't got any more objects + ceph_assert(backfill_info.empty()); + + for (const pg_shard_t& bt : get_backfill_targets()) { + const auto piter = peer_backfill_info.find(bt); + ceph_assert(piter != peer_backfill_info.end()); + const BackfillInterval& pbi = piter->second; + // See if peer has more to process + if (!pbi.extends_to_end() || !pbi.empty()) + return false; + } + return true; +} + +/** + * recover_backfill + * + * Invariants: + * + * backfilled: fully pushed to replica or present in replica's missing set (both + * our copy and theirs). + * + * All objects on a backfill_target in + * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed + * objects have been actually deleted and all logically-valid objects are replicated. + * There may be PG objects in this interval yet to be backfilled. + * + * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all + * backfill_targets. There may be objects on backfill_target(s) yet to be deleted. + * + * For a backfill target, all objects < std::min(peer_backfill_info[target].begin, + * backfill_info.begin) in PG are backfilled. No deleted objects in this + * interval remain on the backfill target. + * + * For a backfill target, all objects <= peer_info[target].last_backfill + * have been backfilled to target + * + * There *MAY* be missing/outdated objects between last_backfill_started and + * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client + * io created objects since the last scan. For this reason, we call + * update_range() again before continuing backfill. + */ +uint64_t PrimaryLogPG::recover_backfill( + uint64_t max, + ThreadPool::TPHandle &handle, bool *work_started) +{ + dout(10) << __func__ << " (" << max << ")" + << " bft=" << get_backfill_targets() + << " last_backfill_started " << last_backfill_started + << (new_backfill ? " new_backfill":"") + << dendl; + ceph_assert(!get_backfill_targets().empty()); + + // Initialize from prior backfill state + if (new_backfill) { + // on_activate() was called prior to getting here + ceph_assert(last_backfill_started == recovery_state.earliest_backfill()); + new_backfill = false; + + // initialize BackfillIntervals + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + peer_backfill_info[*i].reset( + recovery_state.get_peer_info(*i).last_backfill); + } + backfill_info.reset(last_backfill_started); + + backfills_in_flight.clear(); + pending_backfill_updates.clear(); + } + + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + dout(10) << "peer osd." << *i + << " info " << recovery_state.get_peer_info(*i) + << " interval " << peer_backfill_info[*i].begin + << "-" << peer_backfill_info[*i].end + << " " << peer_backfill_info[*i].objects.size() << " objects" + << dendl; + } + + // update our local interval to cope with recent changes + backfill_info.begin = last_backfill_started; + update_range(&backfill_info, handle); + + unsigned ops = 0; + vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove; + set<hobject_t> add_to_stat; + + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + peer_backfill_info[*i].trim_to( + std::max( + recovery_state.get_peer_info(*i).last_backfill, + last_backfill_started)); + } + backfill_info.trim_to(last_backfill_started); + + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + while (ops < max) { + if (backfill_info.begin <= earliest_peer_backfill() && + !backfill_info.extends_to_end() && backfill_info.empty()) { + hobject_t next = backfill_info.end; + backfill_info.reset(next); + backfill_info.end = hobject_t::get_max(); + update_range(&backfill_info, handle); + backfill_info.trim(); + } + + dout(20) << " my backfill interval " << backfill_info << dendl; + + bool sent_scan = false; + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + + dout(20) << " peer shard " << bt << " backfill " << pbi << dendl; + if (pbi.begin <= backfill_info.begin && + !pbi.extends_to_end() && pbi.empty()) { + dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl; + epoch_t e = get_osdmap_epoch(); + MOSDPGScan *m = new MOSDPGScan( + MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(), + spg_t(info.pgid.pgid, bt.shard), + pbi.end, hobject_t()); + + if (cct->_conf->osd_op_queue == "mclock_scheduler") { + /* This guard preserves legacy WeightedPriorityQueue behavior for + * now, but should be removed after Reef */ + m->set_priority(recovery_state.get_recovery_op_priority()); + } + osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch()); + ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end()); + waiting_on_backfill.insert(bt); + sent_scan = true; + } + } + + // Count simultaneous scans as a single op and let those complete + if (sent_scan) { + ops++; + start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end + break; + } + + if (backfill_info.empty() && all_peer_done()) { + dout(10) << " reached end for both local and all peers" << dendl; + break; + } + + // Get object within set of peers to operate on and + // the set of targets for which that object applies. + hobject_t check = earliest_peer_backfill(); + + if (check < backfill_info.begin) { + + set<pg_shard_t> check_targets; + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + if (pbi.begin == check) + check_targets.insert(bt); + } + ceph_assert(!check_targets.empty()); + + dout(20) << " BACKFILL removing " << check + << " from peers " << check_targets << dendl; + for (set<pg_shard_t>::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + ceph_assert(pbi.begin == check); + + to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt)); + pbi.pop_front(); + } + + last_backfill_started = check; + + // Don't increment ops here because deletions + // are cheap and not replied to unlike real recovery_ops, + // and we can't increment ops without requeueing ourself + // for recovery. + } else { + eversion_t& obj_v = backfill_info.objects.begin()->second; + + vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs; + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + // Find all check peers that have the wrong version + if (check == backfill_info.begin && check == pbi.begin) { + if (pbi.objects.begin()->second != obj_v) { + need_ver_targs.push_back(bt); + } else { + keep_ver_targs.push_back(bt); + } + } else { + const pg_info_t& pinfo = recovery_state.get_peer_info(bt); + + // Only include peers that we've caught up to their backfill line + // otherwise, they only appear to be missing this object + // because their pbi.begin > backfill_info.begin. + if (backfill_info.begin > pinfo.last_backfill) + missing_targs.push_back(bt); + else + skip_targs.push_back(bt); + } + } + + if (!keep_ver_targs.empty()) { + // These peers have version obj_v + dout(20) << " BACKFILL keeping " << check + << " with ver " << obj_v + << " on peers " << keep_ver_targs << dendl; + //assert(!waiting_for_degraded_object.count(check)); + } + if (!need_ver_targs.empty() || !missing_targs.empty()) { + ObjectContextRef obc = get_object_context(backfill_info.begin, false); + ceph_assert(obc); + if (obc->get_recovery_read()) { + if (!need_ver_targs.empty()) { + dout(20) << " BACKFILL replacing " << check + << " with ver " << obj_v + << " to peers " << need_ver_targs << dendl; + } + if (!missing_targs.empty()) { + dout(20) << " BACKFILL pushing " << backfill_info.begin + << " with ver " << obj_v + << " to peers " << missing_targs << dendl; + } + vector<pg_shard_t> all_push = need_ver_targs; + all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end()); + + handle.reset_tp_timeout(); + int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h); + if (r < 0) { + *work_started = true; + dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl; + break; + } + ops++; + } else { + *work_started = true; + dout(20) << "backfill blocking on " << backfill_info.begin + << "; could not get rw_manager lock" << dendl; + break; + } + } + dout(20) << "need_ver_targs=" << need_ver_targs + << " keep_ver_targs=" << keep_ver_targs << dendl; + dout(20) << "backfill_targets=" << get_backfill_targets() + << " missing_targs=" << missing_targs + << " skip_targs=" << skip_targs << dendl; + + last_backfill_started = backfill_info.begin; + add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes? + backfill_info.pop_front(); + vector<pg_shard_t> check_targets = need_ver_targs; + check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end()); + for (vector<pg_shard_t>::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; + BackfillInterval& pbi = peer_backfill_info[bt]; + pbi.pop_front(); + } + } + } + + for (set<hobject_t>::iterator i = add_to_stat.begin(); + i != add_to_stat.end(); + ++i) { + ObjectContextRef obc = get_object_context(*i, false); + ceph_assert(obc); + pg_stat_t stat; + add_object_context_to_pg_stat(obc, &stat); + pending_backfill_updates[*i] = stat; + } + map<pg_shard_t,MOSDPGBackfillRemove*> reqs; + for (unsigned i = 0; i < to_remove.size(); ++i) { + handle.reset_tp_timeout(); + const hobject_t& oid = to_remove[i].get<0>(); + eversion_t v = to_remove[i].get<1>(); + pg_shard_t peer = to_remove[i].get<2>(); + MOSDPGBackfillRemove *m; + auto it = reqs.find(peer); + if (it != reqs.end()) { + m = it->second; + } else { + m = reqs[peer] = new MOSDPGBackfillRemove( + spg_t(info.pgid.pgid, peer.shard), + get_osdmap_epoch()); + if (cct->_conf->osd_op_queue == "mclock_scheduler") { + /* This guard preserves legacy WeightedPriorityQueue behavior for + * now, but should be removed after Reef */ + m->set_priority(recovery_state.get_recovery_op_priority()); + } + } + m->ls.push_back(make_pair(oid, v)); + + if (oid <= last_backfill_started) + pending_backfill_updates[oid]; // add empty stat! + } + for (auto p : reqs) { + osd->send_message_osd_cluster(p.first.osd, p.second, + get_osdmap_epoch()); + } + + pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority()); + + hobject_t backfill_pos = + std::min(backfill_info.begin, earliest_peer_backfill()); + dout(5) << "backfill_pos is " << backfill_pos << dendl; + for (set<hobject_t>::iterator i = backfills_in_flight.begin(); + i != backfills_in_flight.end(); + ++i) { + dout(20) << *i << " is still in flight" << dendl; + } + + hobject_t next_backfill_to_complete = backfills_in_flight.empty() ? + backfill_pos : *(backfills_in_flight.begin()); + hobject_t new_last_backfill = recovery_state.earliest_backfill(); + dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl; + for (map<hobject_t, pg_stat_t>::iterator i = + pending_backfill_updates.begin(); + i != pending_backfill_updates.end() && + i->first < next_backfill_to_complete; + pending_backfill_updates.erase(i++)) { + dout(20) << " pending_backfill_update " << i->first << dendl; + ceph_assert(i->first > new_last_backfill); + // carried from a previous round – if we are here, then we had to + // be requeued (by e.g. on_global_recover()) and those operations + // are done. + recovery_state.update_complete_backfill_object_stats( + i->first, + i->second); + new_last_backfill = i->first; + } + dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl; + + ceph_assert(!pending_backfill_updates.empty() || + new_last_backfill == last_backfill_started); + if (pending_backfill_updates.empty() && + backfill_pos.is_max()) { + ceph_assert(backfills_in_flight.empty()); + new_last_backfill = backfill_pos; + last_backfill_started = backfill_pos; + } + dout(10) << "final new_last_backfill at " << new_last_backfill << dendl; + + // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to + // all the backfill targets. Otherwise, we will move last_backfill up on + // those targets need it and send OP_BACKFILL_PROGRESS to them. + for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin(); + i != get_backfill_targets().end(); + ++i) { + pg_shard_t bt = *i; + const pg_info_t& pinfo = recovery_state.get_peer_info(bt); + + if (new_last_backfill > pinfo.last_backfill) { + recovery_state.update_peer_last_backfill(bt, new_last_backfill); + epoch_t e = get_osdmap_epoch(); + MOSDPGBackfill *m = NULL; + if (pinfo.last_backfill.is_max()) { + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH, + e, + get_last_peering_reset(), + spg_t(info.pgid.pgid, bt.shard)); + // Use default priority here, must match sub_op priority + start_recovery_op(hobject_t::get_max()); + } else { + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_PROGRESS, + e, + get_last_peering_reset(), + spg_t(info.pgid.pgid, bt.shard)); + // Use default priority here, must match sub_op priority + } + m->last_backfill = pinfo.last_backfill; + m->stats = pinfo.stats; + + if (cct->_conf->osd_op_queue == "mclock_scheduler") { + /* This guard preserves legacy WeightedPriorityQueue behavior for + * now, but should be removed after Reef */ + m->set_priority(recovery_state.get_recovery_op_priority()); + } + + osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch()); + dout(10) << " peer " << bt + << " num_objects now " << pinfo.stats.stats.sum.num_objects + << " / " << info.stats.stats.sum.num_objects << dendl; + } + } + + if (ops) + *work_started = true; + return ops; +} + +int PrimaryLogPG::prep_backfill_object_push( + hobject_t oid, eversion_t v, + ObjectContextRef obc, + vector<pg_shard_t> peers, + PGBackend::RecoveryHandle *h) +{ + dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl; + ceph_assert(!peers.empty()); + + backfills_in_flight.insert(oid); + recovery_state.prepare_backfill_for_missing(oid, v, peers); + + ceph_assert(!recovering.count(oid)); + + start_recovery_op(oid); + recovering.insert(make_pair(oid, obc)); + + int r = pgbackend->recover_object( + oid, + v, + ObjectContextRef(), + obc, + h); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl; + on_failed_pull({ pg_whoami }, oid, v); + } + return r; +} + +void PrimaryLogPG::update_range( + BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; + + if (bi->version < info.log_tail) { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + bi->version = info.last_update; + scan_range(local_min, local_max, bi, handle); + } + + if (bi->version >= projected_last_update) { + dout(10) << __func__<< ": bi is current " << dendl; + ceph_assert(bi->version == projected_last_update); + } else if (bi->version >= info.log_tail) { + if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) { + /* Because we don't move log_tail on split, the log might be + * empty even if log_tail != last_update. However, the only + * way to get here with an empty log is if log_tail is actually + * eversion_t(), because otherwise the entry which changed + * last_update since the last scan would have to be present. + */ + ceph_assert(bi->version == eversion_t()); + return; + } + + dout(10) << __func__<< ": bi is old, (" << bi->version + << ") can be updated with log to projected_last_update " + << projected_last_update << dendl; + + auto func = [&](const pg_log_entry_t &e) { + dout(10) << __func__ << ": updating from version " << e.version + << dendl; + const hobject_t &soid = e.soid; + if (soid >= bi->begin && + soid < bi->end) { + if (e.is_update()) { + dout(10) << __func__ << ": " << e.soid << " updated to version " + << e.version << dendl; + bi->objects.erase(e.soid); + bi->objects.insert( + make_pair( + e.soid, + e.version)); + } else if (e.is_delete()) { + dout(10) << __func__ << ": " << e.soid << " removed" << dendl; + bi->objects.erase(e.soid); + } + } + }; + dout(10) << "scanning pg log first" << dendl; + recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func); + dout(10) << "scanning projected log" << dendl; + projected_log.scan_log_after(bi->version, func); + bi->version = projected_last_update; + } else { + ceph_abort_msg("scan_range should have raised bi->version past log_tail"); + } +} + +void PrimaryLogPG::scan_range( + int min, int max, BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + ceph_assert(is_locked()); + dout(10) << "scan_range from " << bi->begin << dendl; + bi->clear_objects(); + + vector<hobject_t> ls; + ls.reserve(max); + int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end); + ceph_assert(r >= 0); + dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; + dout(20) << ls << dendl; + + for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) { + handle.reset_tp_timeout(); + ObjectContextRef obc; + if (is_primary()) + obc = object_contexts.lookup(*p); + if (obc) { + if (!obc->obs.exists) { + /* If the object does not exist here, it must have been removed + * between the collection_list_partial and here. This can happen + * for the first item in the range, which is usually last_backfill. + */ + continue; + } + bi->objects[*p] = obc->obs.oi.version; + dout(20) << " " << *p << " " << obc->obs.oi.version << dendl; + } else { + bufferlist bl; + int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); + /* If the object does not exist here, it must have been removed + * between the collection_list_partial and here. This can happen + * for the first item in the range, which is usually last_backfill. + */ + if (r == -ENOENT) + continue; + + ceph_assert(r >= 0); + object_info_t oi(bl); + bi->objects[*p] = oi.version; + dout(20) << " " << *p << " " << oi.version << dendl; + } + } +} + + +/** check_local + * + * verifies that stray objects have been deleted + */ +void PrimaryLogPG::check_local() +{ + dout(10) << __func__ << dendl; + + ceph_assert( + info.last_update >= + recovery_state.get_pg_log().get_tail()); // otherwise we need some help! + + if (!cct->_conf->osd_debug_verify_stray_on_activate) + return; + + // just scan the log. + set<hobject_t> did; + for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin(); + p != recovery_state.get_pg_log().get_log().log.rend(); + ++p) { + if (did.count(p->soid)) + continue; + did.insert(p->soid); + + if (p->is_delete() && !is_missing_object(p->soid)) { + dout(10) << " checking " << p->soid + << " at " << p->version << dendl; + struct stat st; + int r = osd->store->stat( + ch, + ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard), + &st); + if (r != -ENOENT) { + derr << __func__ << " " << p->soid << " exists, but should have been " + << "deleted" << dendl; + ceph_abort_msg("erroneously present object"); + } + } else { + // ignore old(+missing) objects + } + } +} + + + +// =========================== +// hit sets + +hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp) +{ + ostringstream ss; + ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp; + hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", + info.pgid.ps(), info.pgid.pool(), + cct->_conf->osd_hit_set_namespace); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt) +{ + ostringstream ss; + ss << "hit_set_" << info.pgid.pgid << "_archive_"; + if (using_gmt) { + start.gmtime(ss, true /* legacy pre-octopus form */) << "_"; + end.gmtime(ss, true /* legacy pre-octopus form */); + } else { + start.localtime(ss, true /* legacy pre-octopus form */) << "_"; + end.localtime(ss, true /* legacy pre-octopus form */); + } + hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", + info.pgid.ps(), info.pgid.pool(), + cct->_conf->osd_hit_set_namespace); + dout(20) << __func__ << " " << hoid << dendl; + return hoid; +} + +void PrimaryLogPG::hit_set_clear() +{ + dout(20) << __func__ << dendl; + hit_set.reset(); + hit_set_start_stamp = utime_t(); +} + +void PrimaryLogPG::hit_set_setup() +{ + if (!is_active() || + !is_primary()) { + hit_set_clear(); + return; + } + + if (is_active() && is_primary() && + (!pool.info.hit_set_count || + !pool.info.hit_set_period || + pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) { + hit_set_clear(); + + // only primary is allowed to remove all the hit set objects + hit_set_remove_all(); + return; + } + + // FIXME: discard any previous data for now + hit_set_create(); + + // include any writes we know about from the pg log. this doesn't + // capture reads, but it is better than nothing! + hit_set_apply_log(); +} + +void PrimaryLogPG::hit_set_remove_all() +{ + // If any archives are degraded we skip this + for (auto p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + // Once we hit a degraded object just skip + if (is_degraded_or_backfilling_object(aoid)) + return; + if (m_scrubber->write_blocked_by_scrub(aoid)) + return; + } + + if (!info.hit_set.history.empty()) { + auto p = info.hit_set.history.rbegin(); + ceph_assert(p != info.hit_set.history.rend()); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + ceph_assert(!is_degraded_or_backfilling_object(oid)); + ObjectContextRef obc = get_object_context(oid, false); + ceph_assert(obc); + + OpContextUPtr ctx = simple_opc_create(obc); + ctx->at_version = get_next_version(); + ctx->updated_hset_history = info.hit_set; + utime_t now = ceph_clock_now(); + ctx->mtime = now; + hit_set_trim(ctx, 0); + simple_opc_submit(std::move(ctx)); + } + + recovery_state.update_hset(pg_hit_set_history_t()); + if (agent_state) { + agent_state->discard_hit_sets(); + } +} + +void PrimaryLogPG::hit_set_create() +{ + utime_t now = ceph_clock_now(); + // make a copy of the params to modify + HitSet::Params params(pool.info.hit_set_params); + + dout(20) << __func__ << " " << params << dendl; + if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) { + BloomHitSet::Params *p = + static_cast<BloomHitSet::Params*>(params.impl.get()); + + // convert false positive rate so it holds up across the full period + p->set_fpp(p->get_fpp() / pool.info.hit_set_count); + if (p->get_fpp() <= 0.0) + p->set_fpp(.01); // fpp cannot be zero! + + // if we don't have specified size, estimate target size based on the + // previous bin! + if (p->target_size == 0 && hit_set) { + utime_t dur = now - hit_set_start_stamp; + unsigned unique = hit_set->approx_unique_insert_count(); + dout(20) << __func__ << " previous set had approx " << unique + << " unique items over " << dur << " seconds" << dendl; + p->target_size = (double)unique * (double)pool.info.hit_set_period + / (double)dur; + } + if (p->target_size < + static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size)) + p->target_size = cct->_conf->osd_hit_set_min_size; + + if (p->target_size + > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size)) + p->target_size = cct->_conf->osd_hit_set_max_size; + + p->seed = now.sec(); + + dout(10) << __func__ << " target_size " << p->target_size + << " fpp " << p->get_fpp() << dendl; + } + hit_set.reset(new HitSet(params)); + hit_set_start_stamp = now; +} + +/** + * apply log entries to set + * + * this would only happen after peering, to at least capture writes + * during an interval that was potentially lost. + */ +bool PrimaryLogPG::hit_set_apply_log() +{ + if (!hit_set) + return false; + + eversion_t to = info.last_update; + eversion_t from = info.hit_set.current_last_update; + if (to <= from) { + dout(20) << __func__ << " no update" << dendl; + return false; + } + + dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl; + list<pg_log_entry_t>::const_reverse_iterator p = + recovery_state.get_pg_log().get_log().log.rbegin(); + while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to) + ++p; + while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) { + hit_set->insert(p->soid); + ++p; + } + + return true; +} + +void PrimaryLogPG::hit_set_persist() +{ + dout(10) << __func__ << dendl; + bufferlist bl; + unsigned max = pool.info.hit_set_count; + + utime_t now = ceph_clock_now(); + hobject_t oid; + + // If any archives are degraded we skip this persist request + // account for the additional entry being added below + for (auto p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + // Once we hit a degraded object just skip further trim + if (is_degraded_or_backfilling_object(aoid)) + return; + if (m_scrubber->write_blocked_by_scrub(aoid)) + return; + } + + // If backfill is in progress and we could possibly overlap with the + // hit_set_* objects, back off. Since these all have + // hobject_t::hash set to pgid.ps(), and those sort first, we can + // look just at that. This is necessary because our transactions + // may include a modify of the new hit_set *and* a delete of the + // old one, and this may span the backfill boundary. + for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin(); + p != get_backfill_targets().end(); + ++p) { + const pg_info_t& pi = recovery_state.get_peer_info(*p); + if (pi.last_backfill == hobject_t() || + pi.last_backfill.get_hash() == info.pgid.ps()) { + dout(10) << __func__ << " backfill target osd." << *p + << " last_backfill has not progressed past pgid ps" + << dendl; + return; + } + } + + + pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset); + new_hset.begin = hit_set_start_stamp; + new_hset.end = now; + oid = get_hit_set_archive_object( + new_hset.begin, + new_hset.end, + new_hset.using_gmt); + + // If the current object is degraded we skip this persist request + if (m_scrubber->write_blocked_by_scrub(oid)) + return; + + hit_set->seal(); + encode(*hit_set, bl); + dout(20) << __func__ << " archive " << oid << dendl; + + if (agent_state) { + agent_state->add_hit_set(new_hset.begin, hit_set); + uint32_t size = agent_state->hit_set_map.size(); + if (size >= pool.info.hit_set_count) { + size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0; + } + hit_set_in_memory_trim(size); + } + + ObjectContextRef obc = get_object_context(oid, true); + OpContextUPtr ctx = simple_opc_create(obc); + + ctx->at_version = get_next_version(); + ctx->updated_hset_history = info.hit_set; + pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history); + + updated_hit_set_hist.current_last_update = info.last_update; + new_hset.version = ctx->at_version; + + updated_hit_set_hist.history.push_back(new_hset); + hit_set_create(); + + // fabricate an object_info_t and SnapSet + obc->obs.oi.version = ctx->at_version; + obc->obs.oi.mtime = now; + obc->obs.oi.size = bl.length(); + obc->obs.exists = true; + obc->obs.oi.set_data_digest(bl.crc32c(-1)); + + ctx->new_obs = obc->obs; + + ctx->new_snapset = obc->ssc->snapset; + + ctx->delta_stats.num_objects++; + ctx->delta_stats.num_objects_hit_set_archive++; + + ctx->delta_stats.num_bytes += bl.length(); + ctx->delta_stats.num_bytes_hit_set_archive += bl.length(); + + bufferlist bss; + encode(ctx->new_snapset, bss); + bufferlist boi(sizeof(ctx->new_obs.oi)); + encode(ctx->new_obs.oi, boi, + get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + + ctx->op_t->create(oid); + if (bl.length()) { + ctx->op_t->write(oid, 0, bl.length(), bl, 0); + write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges, + 0, bl.length()); + ctx->clean_regions.mark_data_region_dirty(0, bl.length()); + } + map<string, bufferlist, std::less<>> attrs = { + {OI_ATTR, std::move(boi)}, + {SS_ATTR, std::move(bss)} + }; + setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs); + ctx->log.push_back( + pg_log_entry_t( + pg_log_entry_t::MODIFY, + oid, + ctx->at_version, + eversion_t(), + 0, + osd_reqid_t(), + ctx->mtime, + 0) + ); + ctx->log.back().clean_regions = ctx->clean_regions; + + hit_set_trim(ctx, max); + + simple_opc_submit(std::move(ctx)); +} + +void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max) +{ + ceph_assert(ctx->updated_hset_history); + pg_hit_set_history_t &updated_hit_set_hist = + *(ctx->updated_hset_history); + for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) { + list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin(); + ceph_assert(p != updated_hit_set_hist.history.end()); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + + ceph_assert(!is_degraded_or_backfilling_object(oid)); + + dout(20) << __func__ << " removing " << oid << dendl; + ++ctx->at_version.version; + ctx->log.push_back( + pg_log_entry_t(pg_log_entry_t::DELETE, + oid, + ctx->at_version, + p->version, + 0, + osd_reqid_t(), + ctx->mtime, + 0)); + + ctx->op_t->remove(oid); + updated_hit_set_hist.history.pop_front(); + + ObjectContextRef obc = get_object_context(oid, false); + ceph_assert(obc); + --ctx->delta_stats.num_objects; + --ctx->delta_stats.num_objects_hit_set_archive; + ctx->delta_stats.num_bytes -= obc->obs.oi.size; + ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size; + } +} + +void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory) +{ + while (agent_state->hit_set_map.size() > max_in_memory) { + agent_state->remove_oldest_hit_set(); + } +} + + +// ======================================= +// cache agent + +void PrimaryLogPG::agent_setup() +{ + ceph_assert(is_locked()); + if (!is_active() || + !is_primary() || + state_test(PG_STATE_PREMERGE) || + pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || + pool.info.tier_of < 0 || + !get_osdmap()->have_pg_pool(pool.info.tier_of)) { + agent_clear(); + return; + } + if (!agent_state) { + agent_state.reset(new TierAgentState); + + // choose random starting position + agent_state->position = hobject_t(); + agent_state->position.pool = info.pgid.pool(); + agent_state->position.set_hash(pool.info.get_random_pg_position( + info.pgid.pgid, + rand())); + agent_state->start = agent_state->position; + + dout(10) << __func__ << " allocated new state, position " + << agent_state->position << dendl; + } else { + dout(10) << __func__ << " keeping existing state" << dendl; + } + + if (info.stats.stats_invalid) { + osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate"; + } + + agent_choose_mode(); +} + +void PrimaryLogPG::agent_clear() +{ + agent_stop(); + agent_state.reset(NULL); +} + +// Return false if no objects operated on since start of object hash space +bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota) +{ + std::scoped_lock locker{*this}; + if (!agent_state) { + dout(10) << __func__ << " no agent state, stopping" << dendl; + return true; + } + + ceph_assert(!recovery_state.is_deleting()); + + if (agent_state->is_idle()) { + dout(10) << __func__ << " idle, stopping" << dendl; + return true; + } + + osd->logger->inc(l_osd_agent_wake); + + dout(10) << __func__ + << " max " << start_max + << ", flush " << agent_state->get_flush_mode_name() + << ", evict " << agent_state->get_evict_mode_name() + << ", pos " << agent_state->position + << dendl; + ceph_assert(is_primary()); + ceph_assert(is_active()); + + agent_load_hit_sets(); + + const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of); + ceph_assert(base_pool); + + int ls_min = 1; + int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size; + + // list some objects. this conveniently lists clones (oldest to + // newest) before heads... the same order we want to flush in. + // + // NOTE: do not flush the Sequencer. we will assume that the + // listing we get back is imprecise. + vector<hobject_t> ls; + hobject_t next; + int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max, + &ls, &next); + ceph_assert(r >= 0); + dout(20) << __func__ << " got " << ls.size() << " objects" << dendl; + int started = 0; + for (vector<hobject_t>::iterator p = ls.begin(); + p != ls.end(); + ++p) { + if (p->nspace == cct->_conf->osd_hit_set_namespace) { + dout(20) << __func__ << " skip (hit set) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (is_degraded_or_backfilling_object(*p)) { + dout(20) << __func__ << " skip (degraded) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (is_missing_object(p->get_head())) { + dout(20) << __func__ << " skip (missing head) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + ObjectContextRef obc = get_object_context(*p, false, NULL); + if (!obc) { + // we didn't flush; we may miss something here. + dout(20) << __func__ << " skip (no obc) " << *p << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (!obc->obs.exists) { + dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid, + obc->obs.oi.soid.get_head())) { + dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (obc->is_blocked()) { + dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + if (obc->is_request_pending()) { + dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + + // be careful flushing omap to an EC pool. + if (!base_pool->supports_omap() && + obc->obs.oi.is_omap()) { + dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + continue; + } + + if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE && + agent_maybe_evict(obc, false)) + ++started; + else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE && + agent_flush_quota > 0 && agent_maybe_flush(obc)) { + ++started; + --agent_flush_quota; + } + if (started >= start_max) { + // If finishing early, set "next" to the next object + if (++p != ls.end()) + next = *p; + break; + } + } + + if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) { + dout(20) << __func__ << " resetting atime and temp histograms" << dendl; + agent_state->hist_age = 0; + agent_state->temp_hist.decay(); + } + + // Total objects operated on so far + int total_started = agent_state->started + started; + bool need_delay = false; + + dout(20) << __func__ << " start pos " << agent_state->position + << " next start pos " << next + << " started " << total_started << dendl; + + // See if we've made a full pass over the object hash space + // This might check at most ls_max objects a second time to notice that + // we've checked every objects at least once. + if (agent_state->position < agent_state->start && + next >= agent_state->start) { + dout(20) << __func__ << " wrap around " << agent_state->start << dendl; + if (total_started == 0) + need_delay = true; + else + total_started = 0; + agent_state->start = next; + } + agent_state->started = total_started; + + // See if we are starting from beginning + if (next.is_max()) + agent_state->position = hobject_t(); + else + agent_state->position = next; + + // Discard old in memory HitSets + hit_set_in_memory_trim(pool.info.hit_set_count); + + if (need_delay) { + ceph_assert(agent_state->delaying == false); + agent_delay(); + return false; + } + agent_choose_mode(); + return true; +} + +void PrimaryLogPG::agent_load_hit_sets() +{ + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) { + return; + } + + if (agent_state->hit_set_map.size() < info.hit_set.history.size()) { + dout(10) << __func__ << dendl; + for (auto p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); ++p) { + if (agent_state->hit_set_map.count(p->begin.sec()) == 0) { + dout(10) << __func__ << " loading " << p->begin << "-" + << p->end << dendl; + if (!pool.info.is_replicated()) { + // FIXME: EC not supported here yet + derr << __func__ << " on non-replicated pool" << dendl; + break; + } + + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); + if (is_unreadable_object(oid)) { + dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl; + break; + } + + ObjectContextRef obc = get_object_context(oid, false); + if (!obc) { + derr << __func__ << ": could not load hitset " << oid << dendl; + break; + } + + bufferlist bl; + { + int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl); + ceph_assert(r >= 0); + } + HitSetRef hs(new HitSet); + bufferlist::const_iterator pbl = bl.begin(); + decode(*hs, pbl); + agent_state->add_hit_set(p->begin.sec(), hs); + } + } + } +} + +bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc) +{ + if (!obc->obs.oi.is_dirty()) { + dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + if (obc->obs.oi.is_cache_pinned()) { + dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + utime_t now = ceph_clock_now(); + utime_t ob_local_mtime; + if (obc->obs.oi.local_mtime != utime_t()) { + ob_local_mtime = obc->obs.oi.local_mtime; + } else { + ob_local_mtime = obc->obs.oi.mtime; + } + bool evict_mode_full = + (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL); + if (!evict_mode_full && + obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay + (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) { + dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + if (osd->agent_is_active_oid(obc->obs.oi.soid)) { + dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + dout(10) << __func__ << " flushing " << obc->obs.oi << dendl; + + // FIXME: flush anything dirty, regardless of what distribution of + // ages we expect. + + hobject_t oid = obc->obs.oi.soid; + osd->agent_start_op(oid); + // no need to capture a pg ref, can't outlive fop or ctx + std::function<void()> on_flush = [this, oid]() { + osd->agent_finish_op(oid); + }; + + int result = start_flush( + OpRequestRef(), obc, false, NULL, + on_flush); + if (result != -EINPROGRESS) { + on_flush(); + dout(10) << __func__ << " start_flush() failed " << obc->obs.oi + << " with " << result << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + + osd->logger->inc(l_osd_agent_flush); + return true; +} + +bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush) +{ + const hobject_t& soid = obc->obs.oi.soid; + if (!after_flush && obc->obs.oi.is_dirty()) { + dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl; + return false; + } + // This is already checked by agent_work() which passes after_flush = false + if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) { + dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl; + return false; + } + if (!obc->obs.oi.watchers.empty()) { + dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl; + return false; + } + if (obc->is_blocked()) { + dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl; + return false; + } + if (obc->obs.oi.is_cache_pinned()) { + dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl; + return false; + } + + if (soid.snap == CEPH_NOSNAP) { + int result = _verify_no_head_clones(soid, obc->ssc->snapset); + if (result < 0) { + dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl; + return false; + } + } + + if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) { + // is this object old than cache_min_evict_age? + utime_t now = ceph_clock_now(); + utime_t ob_local_mtime; + if (obc->obs.oi.local_mtime != utime_t()) { + ob_local_mtime = obc->obs.oi.local_mtime; + } else { + ob_local_mtime = obc->obs.oi.mtime; + } + if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) { + dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl; + osd->logger->inc(l_osd_agent_skip); + return false; + } + // is this object old and/or cold enough? + int temp = 0; + uint64_t temp_upper = 0, temp_lower = 0; + if (hit_set) + agent_estimate_temp(soid, &temp); + agent_state->temp_hist.add(temp); + agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper); + + dout(20) << __func__ + << " temp " << temp + << " pos " << temp_lower << "-" << temp_upper + << ", evict_effort " << agent_state->evict_effort + << dendl; + dout(30) << "agent_state:\n"; + auto f = Formatter::create_unique(""); + f->open_object_section("agent_state"); + agent_state->dump(f.get()); + f->close_section(); + f->flush(*_dout); + *_dout << dendl; + + if (1000000 - temp_upper >= agent_state->evict_effort) + return false; + } + + dout(10) << __func__ << " evicting " << obc->obs.oi << dendl; + OpContextUPtr ctx = simple_opc_create(obc); + + auto null_op_req = OpRequestRef(); + if (!ctx->lock_manager.get_lock_type( + RWState::RWWRITE, + obc->obs.oi.soid, + obc, + null_op_req)) { + close_op_ctx(ctx.release()); + dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl; + return false; + } + + osd->agent_start_evict_op(); + ctx->register_on_finish( + [this]() { + osd->agent_finish_evict_op(); + }); + + ctx->at_version = get_next_version(); + ceph_assert(ctx->new_obs.exists); + int r = _delete_oid(ctx.get(), true, false); + if (obc->obs.oi.is_omap()) + ctx->delta_stats.num_objects_omap--; + ctx->delta_stats.num_evict++; + ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10); + if (obc->obs.oi.is_dirty()) + --ctx->delta_stats.num_objects_dirty; + ceph_assert(r == 0); + finish_ctx(ctx.get(), pg_log_entry_t::DELETE); + simple_opc_submit(std::move(ctx)); + osd->logger->inc(l_osd_tier_evict); + osd->logger->inc(l_osd_agent_evict); + return true; +} + +void PrimaryLogPG::agent_stop() +{ + dout(20) << __func__ << dendl; + if (agent_state && !agent_state->is_idle()) { + agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE; + agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE; + osd->agent_disable_pg(this, agent_state->evict_effort); + } +} + +void PrimaryLogPG::agent_delay() +{ + dout(20) << __func__ << dendl; + if (agent_state && !agent_state->is_idle()) { + ceph_assert(agent_state->delaying == false); + agent_state->delaying = true; + osd->agent_disable_pg(this, agent_state->evict_effort); + } +} + +void PrimaryLogPG::agent_choose_mode_restart() +{ + dout(20) << __func__ << dendl; + std::scoped_lock locker{*this}; + if (agent_state && agent_state->delaying) { + agent_state->delaying = false; + agent_choose_mode(true); + } +} + +bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op) +{ + bool requeued = false; + // Let delay play out + if (agent_state->delaying) { + dout(20) << __func__ << " " << this << " delaying, ignored" << dendl; + return requeued; + } + + TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE; + TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE; + unsigned evict_effort = 0; + + if (info.stats.stats_invalid) { + // idle; stats can't be trusted until we scrub. + dout(20) << __func__ << " stats invalid (post-split), idle" << dendl; + goto skip_calc; + } + + { + uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid); + ceph_assert(divisor > 0); + + // adjust (effective) user objects down based on the number + // of HitSet objects, which should not count toward our total since + // they cannot be flushed. + uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive; + + // also exclude omap objects if ec backing pool + const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of); + ceph_assert(base_pool); + if (!base_pool->supports_omap()) + unflushable += info.stats.stats.sum.num_objects_omap; + + uint64_t num_user_objects = info.stats.stats.sum.num_objects; + if (num_user_objects > unflushable) + num_user_objects -= unflushable; + else + num_user_objects = 0; + + uint64_t num_user_bytes = info.stats.stats.sum.num_bytes; + uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive; + num_user_bytes -= unflushable_bytes; + uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects); + num_user_bytes += num_overhead_bytes; + + // also reduce the num_dirty by num_objects_omap + int64_t num_dirty = info.stats.stats.sum.num_objects_dirty; + if (!base_pool->supports_omap()) { + if (num_dirty > info.stats.stats.sum.num_objects_omap) + num_dirty -= info.stats.stats.sum.num_objects_omap; + else + num_dirty = 0; + } + + dout(10) << __func__ + << " flush_mode: " + << TierAgentState::get_flush_mode_name(agent_state->flush_mode) + << " evict_mode: " + << TierAgentState::get_evict_mode_name(agent_state->evict_mode) + << " num_objects: " << info.stats.stats.sum.num_objects + << " num_bytes: " << info.stats.stats.sum.num_bytes + << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty + << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap + << " num_dirty: " << num_dirty + << " num_user_objects: " << num_user_objects + << " num_user_bytes: " << num_user_bytes + << " num_overhead_bytes: " << num_overhead_bytes + << " pool.info.target_max_bytes: " << pool.info.target_max_bytes + << " pool.info.target_max_objects: " << pool.info.target_max_objects + << dendl; + + // get dirty, full ratios + uint64_t dirty_micro = 0; + uint64_t full_micro = 0; + if (pool.info.target_max_bytes && num_user_objects > 0) { + uint64_t avg_size = num_user_bytes / num_user_objects; + dirty_micro = + num_dirty * avg_size * 1000000 / + std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1); + full_micro = + num_user_objects * avg_size * 1000000 / + std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1); + } + if (pool.info.target_max_objects > 0) { + uint64_t dirty_objects_micro = + num_dirty * 1000000 / + std::max<uint64_t>(pool.info.target_max_objects / divisor, 1); + if (dirty_objects_micro > dirty_micro) + dirty_micro = dirty_objects_micro; + uint64_t full_objects_micro = + num_user_objects * 1000000 / + std::max<uint64_t>(pool.info.target_max_objects / divisor, 1); + if (full_objects_micro > full_micro) + full_micro = full_objects_micro; + } + dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0) + << " full " << ((float)full_micro / 1000000.0) + << dendl; + + // flush mode + uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro; + uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro; + uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop; + if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) { + flush_target += flush_slop; + flush_high_target += flush_slop; + } else { + flush_target -= std::min(flush_target, flush_slop); + flush_high_target -= std::min(flush_high_target, flush_slop); + } + + if (dirty_micro > flush_high_target) { + flush_mode = TierAgentState::FLUSH_MODE_HIGH; + } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) { + flush_mode = TierAgentState::FLUSH_MODE_LOW; + } + + // evict mode + uint64_t evict_target = pool.info.cache_target_full_ratio_micro; + uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop; + if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) + evict_target += evict_slop; + else + evict_target -= std::min(evict_target, evict_slop); + + if (full_micro > 1000000) { + // evict anything clean + evict_mode = TierAgentState::EVICT_MODE_FULL; + evict_effort = 1000000; + } else if (full_micro > evict_target) { + // set effort in [0..1] range based on where we are between + evict_mode = TierAgentState::EVICT_MODE_SOME; + uint64_t over = full_micro - evict_target; + uint64_t span = 1000000 - evict_target; + evict_effort = std::max(over * 1000000 / span, + uint64_t(1000000.0 * + cct->_conf->osd_agent_min_evict_effort)); + + // quantize effort to avoid too much reordering in the agent_queue. + uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000; + ceph_assert(inc > 0); + uint64_t was = evict_effort; + evict_effort -= evict_effort % inc; + if (evict_effort < inc) + evict_effort = inc; + ceph_assert(evict_effort >= inc && evict_effort <= 1000000); + dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl; + } + } + + skip_calc: + bool old_idle = agent_state->is_idle(); + if (flush_mode != agent_state->flush_mode) { + dout(5) << __func__ << " flush_mode " + << TierAgentState::get_flush_mode_name(agent_state->flush_mode) + << " -> " + << TierAgentState::get_flush_mode_name(flush_mode) + << dendl; + recovery_state.update_stats( + [=, this](auto &history, auto &stats) { + if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) { + osd->agent_inc_high_count(); + stats.stats.sum.num_flush_mode_high = 1; + } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) { + stats.stats.sum.num_flush_mode_low = 1; + } + if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) { + osd->agent_dec_high_count(); + stats.stats.sum.num_flush_mode_high = 0; + } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) { + stats.stats.sum.num_flush_mode_low = 0; + } + return false; + }); + agent_state->flush_mode = flush_mode; + } + if (evict_mode != agent_state->evict_mode) { + dout(5) << __func__ << " evict_mode " + << TierAgentState::get_evict_mode_name(agent_state->evict_mode) + << " -> " + << TierAgentState::get_evict_mode_name(evict_mode) + << dendl; + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL && + is_active()) { + if (op) + requeue_op(op); + requeue_ops(waiting_for_flush); + requeue_ops(waiting_for_active); + requeue_ops(waiting_for_readable); + requeue_ops(waiting_for_scrub); + requeue_ops(waiting_for_cache_not_full); + objects_blocked_on_cache_full.clear(); + requeued = true; + } + recovery_state.update_stats( + [=, this](auto &history, auto &stats) { + if (evict_mode == TierAgentState::EVICT_MODE_SOME) { + stats.stats.sum.num_evict_mode_some = 1; + } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) { + stats.stats.sum.num_evict_mode_full = 1; + } + if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) { + stats.stats.sum.num_evict_mode_some = 0; + } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) { + stats.stats.sum.num_evict_mode_full = 0; + } + return false; + }); + agent_state->evict_mode = evict_mode; + } + uint64_t old_effort = agent_state->evict_effort; + if (evict_effort != agent_state->evict_effort) { + dout(5) << __func__ << " evict_effort " + << ((float)agent_state->evict_effort / 1000000.0) + << " -> " + << ((float)evict_effort / 1000000.0) + << dendl; + agent_state->evict_effort = evict_effort; + } + + // NOTE: we are using evict_effort as a proxy for *all* agent effort + // (including flush). This is probably fine (they should be + // correlated) but it is not precisely correct. + if (agent_state->is_idle()) { + if (!restart && !old_idle) { + osd->agent_disable_pg(this, old_effort); + } + } else { + if (restart || old_idle) { + osd->agent_enable_pg(this, agent_state->evict_effort); + } else if (old_effort != agent_state->evict_effort) { + osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort); + } + } + return requeued; +} + +void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp) +{ + ceph_assert(hit_set); + ceph_assert(temp); + *temp = 0; + if (hit_set->contains(oid)) + *temp = 1000000; + unsigned i = 0; + int last_n = pool.info.hit_set_search_last_n; + for (map<time_t,HitSetRef>::reverse_iterator p = + agent_state->hit_set_map.rbegin(); last_n > 0 && + p != agent_state->hit_set_map.rend(); ++p, ++i) { + if (p->second->contains(oid)) { + *temp += pool.info.get_grade(i); + --last_n; + } + } +} + +// Dup op detection + +bool PrimaryLogPG::already_complete(eversion_t v) +{ + dout(20) << __func__ << ": " << v << dendl; + for (xlist<RepGather*>::iterator i = repop_queue.begin(); + !i.end(); + ++i) { + dout(20) << __func__ << ": " << **i << dendl; + // skip copy from temp object ops + if ((*i)->v == eversion_t()) { + dout(20) << __func__ << ": " << **i + << " version is empty" << dendl; + continue; + } + if ((*i)->v > v) { + dout(20) << __func__ << ": " << **i + << " (*i)->v past v" << dendl; + break; + } + if (!(*i)->all_committed) { + dout(20) << __func__ << ": " << **i + << " not committed, returning false" + << dendl; + return false; + } + } + dout(20) << __func__ << ": returning true" << dendl; + return true; +} + + +// ========================================================================================== +// SCRUB + +void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op) +{ + dout(15) << __func__ << " is scrub active? " << is_scrub_active() << dendl; + op->mark_started(); + + if (!is_scrub_active()) { + dout(10) << __func__ << " scrub isn't active" << dendl; + return; + } + m_scrubber->map_from_replica(op); +} + +bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin, + const hobject_t& end) +{ + pair<hobject_t, ObjectContextRef> next; + next.second = object_contexts.lookup(begin); + next.first = begin; + bool more = true; + while (more && next.first < end) { + if (next.second && next.second->is_blocked()) { + next.second->requeue_scrub_on_unblock = true; + dout(10) << __func__ << ": scrub delayed, " + << next.first << " is blocked" + << dendl; + return false; + } + more = object_contexts.get_next(next.first, &next); + } + return true; +} + + +int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx) +{ + OpRequestRef op = ctx->op; + // Only supports replicated pools + ceph_assert(!pool.info.is_erasure()); + ceph_assert(is_primary()); + + dout(10) << __func__ << " " << soid + << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl; + + if (!is_clean()) { + block_for_clean(soid, op); + return -EAGAIN; + } + + ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid)); + auto& oi = ctx->new_obs.oi; + eversion_t v = oi.version; + + if (primary_error(soid, v)) { + dout(0) << __func__ << " No other replicas available for " << soid << dendl; + // XXX: If we knew that there is no down osd which could include this + // object, it would be nice if we could return EIO here. + // If a "never fail" flag was available, that could be used + // for rbd to NOT return EIO until object marked lost. + + // Drop through to save this op in case an osd comes up with the object. + } + + // Restart the op after object becomes readable again + waiting_for_unreadable_object[soid].push_back(op); + op->mark_delayed("waiting for missing object"); + + ceph_assert(is_clean()); + state_set(PG_STATE_REPAIR); + state_clear(PG_STATE_CLEAN); + queue_peering_event( + PGPeeringEventRef( + std::make_shared<PGPeeringEvent>( + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery()))); + + return -EAGAIN; +} + +/*---SnapTrimmer Logging---*/ +#undef dout_prefix +#define dout_prefix pg->gen_prefix(*_dout) + +void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name) +{ + ldout(pg->cct, 20) << "enter " << state_name << dendl; +} + +void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time) +{ + ldout(pg->cct, 20) << "exit " << state_name << dendl; +} + +bool PrimaryLogPG::SnapTrimmer::permit_trim() { + return + pg->is_clean() && + !pg->is_scrub_queued_or_active() && + !pg->snap_trimq.empty(); +} + +/*---SnapTrimmer states---*/ +#undef dout_prefix +#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \ + << "SnapTrimmer state<" << get_state_name() << ">: ") + +/* NotTrimming */ +PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "NotTrimming") +{ + context< SnapTrimmer >().log_enter(state_name); +} + +void PrimaryLogPG::NotTrimming::exit() +{ + context< SnapTrimmer >().log_exit(state_name, enter_time); +} + +boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&) +{ + PrimaryLogPG *pg = context< SnapTrimmer >().pg; + ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl; + + if (!(pg->is_primary() && pg->is_active())) { + ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl; + return discard_event(); + } + if (!pg->is_clean() || + pg->snap_trimq.empty()) { + ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl; + return discard_event(); + } + if (pg->is_scrub_queued_or_active()) { + ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl; + return transit< WaitScrub >(); + } else { + return transit< Trimming >(); + } +} + +boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&) +{ + PrimaryLogPG *pg = context< SnapTrimmer >().pg; + ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl; + + pending = nullptr; + if (!context< SnapTrimmer >().can_trim()) { + post_event(KickTrim()); + return transit< NotTrimming >(); + } + + context<Trimming>().snap_to_trim = pg->snap_trimq.range_start(); + ldout(pg->cct, 10) << "NotTrimming: trimming " + << pg->snap_trimq.range_start() + << dendl; + return transit< AwaitAsyncWork >(); +} + +/* AwaitAsyncWork */ +PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx) + : my_base(ctx), + NamedState(nullptr, "Trimming/AwaitAsyncWork") +{ + auto *pg = context< SnapTrimmer >().pg; + context< SnapTrimmer >().log_enter(state_name); + context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg); + pg->state_set(PG_STATE_SNAPTRIM); + pg->state_clear(PG_STATE_SNAPTRIM_ERROR); + pg->publish_stats_to_osd(); +} + +boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&) +{ + PrimaryLogPGRef pg = context< SnapTrimmer >().pg; + snapid_t snap_to_trim = context<Trimming>().snap_to_trim; + auto &in_flight = context<Trimming>().in_flight; + ceph_assert(in_flight.empty()); + + ceph_assert(pg->is_primary() && pg->is_active()); + if (!context< SnapTrimmer >().can_trim()) { + ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl; + post_event(KickTrim()); + return transit< NotTrimming >(); + } + + ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl; + + vector<hobject_t> to_trim; + unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims; + // we need to look for at least 1 snaptrim, otherwise we'll misinterpret + // the ENOENT below and erase snap_to_trim. + ceph_assert(max > 0); + to_trim.reserve(max); + int r = pg->snap_mapper.get_next_objects_to_trim( + snap_to_trim, + max, + &to_trim); + if (r != 0 && r != -ENOENT) { + lderr(pg->cct) << "get_next_objects_to_trim returned " + << cpp_strerror(r) << dendl; + ceph_abort_msg("get_next_objects_to_trim returned an invalid code"); + } else if (r == -ENOENT) { + // Done! + ldout(pg->cct, 10) << "got ENOENT" << dendl; + + pg->snap_trimq.erase(snap_to_trim); + + if (pg->snap_trimq_repeat.count(snap_to_trim)) { + ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl; + pg->snap_trimq_repeat.erase(snap_to_trim); + } else { + ldout(pg->cct, 10) << "adding snap " << snap_to_trim + << " to purged_snaps" + << dendl; + ObjectStore::Transaction t; + pg->recovery_state.adjust_purged_snaps( + [snap_to_trim](auto &purged_snaps) { + purged_snaps.insert(snap_to_trim); + }); + pg->write_if_dirty(t); + + ldout(pg->cct, 10) << "purged_snaps now " + << pg->info.purged_snaps << ", snap_trimq now " + << pg->snap_trimq << dendl; + + int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL); + ceph_assert(tr == 0); + + pg->recovery_state.share_pg_info(); + } + post_event(KickTrim()); + pg->set_snaptrim_duration(); + return transit< NotTrimming >(); + } + ceph_assert(!to_trim.empty()); + + for (auto &&object: to_trim) { + // Get next + ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl; + OpContextUPtr ctx; + int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx); + if (error) { + if (error == -ENOLCK) { + ldout(pg->cct, 10) << "could not get write lock on obj " + << object << dendl; + } else { + pg->state_set(PG_STATE_SNAPTRIM_ERROR); + ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl; + } + if (!in_flight.empty()) { + ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl; + return transit< WaitRepops >(); + } + if (error == -ENOLCK) { + ldout(pg->cct, 10) << "waiting for it to clear" + << dendl; + return transit< WaitRWLock >(); + } else { + return transit< NotTrimming >(); + } + } + + in_flight.insert(object); + ctx->register_on_success( + [pg, object, &in_flight]() { + ceph_assert(in_flight.find(object) != in_flight.end()); + in_flight.erase(object); + if (in_flight.empty()) { + if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) { + pg->snap_trimmer_machine.process_event(Reset()); + } else { + pg->snap_trimmer_machine.process_event(RepopsComplete()); + } + } + }); + + pg->simple_opc_submit(std::move(ctx)); + } + + return transit< WaitRepops >(); +} + +void PrimaryLogPG::setattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key, + bufferlist &val) +{ + t->setattr(obc->obs.oi.soid, key, val); +} + +void PrimaryLogPG::setattrs_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + map<string, bufferlist, less<>> &attrs) +{ + t->setattrs(obc->obs.oi.soid, attrs); +} + +void PrimaryLogPG::rmattr_maybe_cache( + ObjectContextRef obc, + PGTransaction *t, + const string &key) +{ + t->rmattr(obc->obs.oi.soid, key); +} + +int PrimaryLogPG::getattr_maybe_cache( + ObjectContextRef obc, + const string &key, + bufferlist *val) +{ + if (pool.info.is_erasure()) { + map<string, bufferlist>::iterator i = obc->attr_cache.find(key); + if (i != obc->attr_cache.end()) { + if (val) + *val = i->second; + return 0; + } else { + if (obc->obs.exists) { + return -ENODATA; + } else { + return -ENOENT; + } + } + } + return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val); +} + +int PrimaryLogPG::getattrs_maybe_cache( + ObjectContextRef obc, + map<string, bufferlist, less<>> *out) +{ + int r = 0; + ceph_assert(out); + if (pool.info.is_erasure()) { + *out = obc->attr_cache; + } else { + r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out); + } + map<string, bufferlist, less<>> tmp; + for (auto& [key, val]: *out) { + if (key.size() > 1 && key[0] == '_') { + tmp[key.substr(1, key.size())] = std::move(val); + } + } + tmp.swap(*out); + return r; +} + +bool PrimaryLogPG::check_failsafe_full() { + return osd->check_failsafe_full(get_dpp()); +} + +bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid) +{ + return m_scrubber->write_blocked_by_scrub(oid); +} + +void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); } +void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); } + +#ifdef PG_DEBUG_REFS +uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); } +void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); } +#endif + +void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); } +void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); } |