summaryrefslogtreecommitdiffstats
path: root/src/osd/PGBackend.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/osd/PGBackend.h')
-rw-r--r--src/osd/PGBackend.h633
1 files changed, 633 insertions, 0 deletions
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
new file mode 100644
index 00000000..18ef7235
--- /dev/null
+++ b/src/osd/PGBackend.h
@@ -0,0 +1,633 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PGBACKEND_H
+#define PGBACKEND_H
+
+#include "osd_types.h"
+#include "common/WorkQueue.h"
+#include "include/Context.h"
+#include "os/ObjectStore.h"
+#include "common/LogClient.h"
+#include <string>
+#include "PGTransaction.h"
+
+namespace Scrub {
+ class Store;
+}
+struct shard_info_wrapper;
+struct inconsistent_obj_wrapper;
+
+//forward declaration
+class OSDMap;
+class PGLog;
+typedef std::shared_ptr<const OSDMap> OSDMapRef;
+
+ /**
+ * PGBackend
+ *
+ * PGBackend defines an interface for logic handling IO and
+ * replication on RADOS objects. The PGBackend implementation
+ * is responsible for:
+ *
+ * 1) Handling client operations
+ * 2) Handling object recovery
+ * 3) Handling object access
+ * 4) Handling scrub, deep-scrub, repair
+ */
+ class PGBackend {
+ public:
+ CephContext* cct;
+ protected:
+ ObjectStore *store;
+ const coll_t coll;
+ ObjectStore::CollectionHandle &ch;
+ public:
+ /**
+ * Provides interfaces for PGBackend callbacks
+ *
+ * The intention is that the parent calls into the PGBackend
+ * implementation holding a lock and that the callbacks are
+ * called under the same locks.
+ */
+ class Listener {
+ public:
+ /// Debugging
+ virtual DoutPrefixProvider *get_dpp() = 0;
+
+ /// Recovery
+
+ /**
+ * Called with the transaction recovering oid
+ */
+ virtual void on_local_recover(
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ bool is_delete,
+ ObjectStore::Transaction *t
+ ) = 0;
+
+ /**
+ * Called when transaction recovering oid is durable and
+ * applied on all replicas
+ */
+ virtual void on_global_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ bool is_delete
+ ) = 0;
+
+ /**
+ * Called when peer is recovered
+ */
+ virtual void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info
+ ) = 0;
+
+ virtual void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t oid) = 0;
+
+ virtual void failed_push(const list<pg_shard_t> &from,
+ const hobject_t &soid,
+ const eversion_t &need = eversion_t()) = 0;
+ virtual void finish_degraded_object(const hobject_t oid) = 0;
+ virtual void primary_failed(const hobject_t &soid) = 0;
+ virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0;
+ virtual void cancel_pull(const hobject_t &soid) = 0;
+
+ virtual void apply_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats) = 0;
+
+ /**
+ * Called when a read on the primary fails when pushing
+ */
+ virtual void on_primary_error(
+ const hobject_t &oid,
+ eversion_t v
+ ) = 0;
+
+ virtual void backfill_add_missing(
+ const hobject_t &oid,
+ eversion_t v
+ ) = 0;
+
+ virtual void remove_missing_object(const hobject_t &oid,
+ eversion_t v,
+ Context *on_complete) = 0;
+
+
+ /**
+ * Bless a context
+ *
+ * Wraps a context in whatever outer layers the parent usually
+ * uses to call into the PGBackend
+ */
+ virtual Context *bless_context(Context *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual void send_message(int to_osd, Message *m) = 0;
+ virtual void queue_transaction(
+ ObjectStore::Transaction&& t,
+ OpRequestRef op = OpRequestRef()
+ ) = 0;
+ virtual void queue_transactions(
+ vector<ObjectStore::Transaction>& tls,
+ OpRequestRef op = OpRequestRef()
+ ) = 0;
+ virtual epoch_t get_interval_start_epoch() const = 0;
+ virtual epoch_t get_last_peering_reset_epoch() const = 0;
+
+ virtual const set<pg_shard_t> &get_acting_recovery_backfill_shards() const = 0;
+ virtual const set<pg_shard_t> &get_acting_shards() const = 0;
+ virtual const set<pg_shard_t> &get_backfill_shards() const = 0;
+
+ virtual std::ostream& gen_dbg_prefix(std::ostream& out) const = 0;
+
+ virtual const map<hobject_t, set<pg_shard_t>> &get_missing_loc_shards()
+ const = 0;
+
+ virtual const pg_missing_tracker_t &get_local_missing() const = 0;
+ virtual void add_local_next_event(const pg_log_entry_t& e) = 0;
+ virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
+ const = 0;
+ virtual boost::optional<const pg_missing_const_i &> maybe_get_shard_missing(
+ pg_shard_t peer) const {
+ if (peer == primary_shard()) {
+ return get_local_missing();
+ } else {
+ map<pg_shard_t, pg_missing_t>::const_iterator i =
+ get_shard_missing().find(peer);
+ if (i == get_shard_missing().end()) {
+ return boost::optional<const pg_missing_const_i &>();
+ } else {
+ return i->second;
+ }
+ }
+ }
+ virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
+ auto m = maybe_get_shard_missing(peer);
+ ceph_assert(m);
+ return *m;
+ }
+
+ virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
+ virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
+ if (peer == primary_shard()) {
+ return get_info();
+ } else {
+ map<pg_shard_t, pg_info_t>::const_iterator i =
+ get_shard_info().find(peer);
+ ceph_assert(i != get_shard_info().end());
+ return i->second;
+ }
+ }
+
+ virtual const PGLog &get_log() const = 0;
+ virtual bool pgb_is_primary() const = 0;
+ virtual const OSDMapRef& pgb_get_osdmap() const = 0;
+ virtual epoch_t pgb_get_osdmap_epoch() const = 0;
+ virtual const pg_info_t &get_info() const = 0;
+ virtual const pg_pool_t &get_pool() const = 0;
+
+ virtual ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ const map<string, bufferlist> &attrs) = 0;
+
+ virtual bool try_lock_for_read(
+ const hobject_t &hoid,
+ ObcLockManager &manager) = 0;
+
+ virtual void release_locks(ObcLockManager &manager) = 0;
+
+ virtual void op_applied(
+ const eversion_t &applied_version) = 0;
+
+ virtual bool should_send_op(
+ pg_shard_t peer,
+ const hobject_t &hoid) = 0;
+
+ virtual bool pg_is_undersized() const = 0;
+ virtual bool pg_is_repair() const = 0;
+
+ virtual void log_operation(
+ const vector<pg_log_entry_t> &logv,
+ const boost::optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ bool transaction_applied,
+ ObjectStore::Transaction &t,
+ bool async = false) = 0;
+
+ virtual void pgb_set_object_snap_mapping(
+ const hobject_t &soid,
+ const set<snapid_t> &snaps,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void pgb_clear_object_snap_mapping(
+ const hobject_t &soid,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) = 0;
+
+ virtual void update_last_complete_ondisk(
+ eversion_t lcod) = 0;
+
+ virtual void update_stats(
+ const pg_stat_t &stat) = 0;
+
+ virtual void schedule_recovery_work(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual pg_shard_t whoami_shard() const = 0;
+ int whoami() const {
+ return whoami_shard().osd;
+ }
+ spg_t whoami_spg_t() const {
+ return get_info().pgid;
+ }
+
+ virtual spg_t primary_spg_t() const = 0;
+ virtual pg_shard_t primary_shard() const = 0;
+
+ virtual uint64_t min_upacting_features() const = 0;
+ virtual hobject_t get_temp_recovery_object(const hobject_t& target,
+ eversion_t version) = 0;
+
+ virtual void send_message_osd_cluster(
+ int peer, Message *m, epoch_t from_epoch) = 0;
+ virtual void send_message_osd_cluster(
+ Message *m, Connection *con) = 0;
+ virtual void send_message_osd_cluster(
+ Message *m, const ConnectionRef& con) = 0;
+ virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
+ virtual entity_name_t get_cluster_msgr_name() = 0;
+
+ virtual PerfCounters *get_logger() = 0;
+
+ virtual ceph_tid_t get_tid() = 0;
+
+ virtual LogClientTemp clog_error() = 0;
+ virtual LogClientTemp clog_warn() = 0;
+
+ virtual bool check_failsafe_full() = 0;
+
+ virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
+
+ virtual bool pg_is_repair() = 0;
+ virtual void inc_osd_stat_repaired() = 0;
+ virtual void set_osd_stat_repaired(int64_t) = 0;
+ virtual bool pg_is_remote_backfilling() = 0;
+ virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_add_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_sub_num_bytes(int64_t num_bytes) = 0;
+ virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
+ virtual ~Listener() {}
+ };
+ Listener *parent;
+ Listener *get_parent() const { return parent; }
+ PGBackend(CephContext* cct, Listener *l, ObjectStore *store, const coll_t &coll,
+ ObjectStore::CollectionHandle &ch) :
+ cct(cct),
+ store(store),
+ coll(coll),
+ ch(ch),
+ parent(l) {}
+ bool is_primary() const { return get_parent()->pgb_is_primary(); }
+ const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
+ epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
+ const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+ std::ostream& gen_prefix(std::ostream& out) const {
+ return parent->gen_dbg_prefix(out);
+ }
+
+ /**
+ * RecoveryHandle
+ *
+ * We may want to recover multiple objects in the same set of
+ * messages. RecoveryHandle is an interface for the opaque
+ * object used by the implementation to store the details of
+ * the pending recovery operations.
+ */
+ struct RecoveryHandle {
+ bool cache_dont_need;
+ map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > deletes;
+
+ RecoveryHandle(): cache_dont_need(false) {}
+ virtual ~RecoveryHandle() {}
+ };
+
+ /// Get a fresh recovery operation
+ virtual RecoveryHandle *open_recovery_op() = 0;
+
+ /// run_recovery_op: finish the operation represented by h
+ virtual void run_recovery_op(
+ RecoveryHandle *h, ///< [in] op to finish
+ int priority ///< [in] msg priority
+ ) = 0;
+
+ void recover_delete_object(const hobject_t &oid, eversion_t v,
+ RecoveryHandle *h);
+ void send_recovery_deletes(int prio,
+ const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes);
+
+ /**
+ * recover_object
+ *
+ * Triggers a recovery operation on the specified hobject_t
+ * onreadable must be called before onwriteable
+ *
+ * On each replica (primary included), get_parent()->on_not_missing()
+ * must be called when the transaction finalizing the recovery
+ * is queued. Similarly, get_parent()->on_readable() must be called
+ * when the transaction is applied in the backing store.
+ *
+ * get_parent()->on_not_degraded() should be called on the primary
+ * when writes can resume on the object.
+ *
+ * obc may be NULL if the primary lacks the object.
+ *
+ * head may be NULL only if the head/snapdir is missing
+ *
+ * @param missing [in] set of info, missing pairs for queried nodes
+ * @param overlaps [in] mapping of object to file offset overlaps
+ */
+ virtual int recover_object(
+ const hobject_t &hoid, ///< [in] object to recover
+ eversion_t v, ///< [in] version to recover
+ ObjectContextRef head, ///< [in] context of the head/snapdir object
+ ObjectContextRef obc, ///< [in] context of the object
+ RecoveryHandle *h ///< [in,out] handle to attach recovery op to
+ ) = 0;
+
+ /**
+ * true if PGBackend can handle this message while inactive
+ *
+ * If it returns true, handle_message *must* also return true
+ */
+ virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
+
+ /// gives PGBackend a crack at an incoming message
+ bool handle_message(
+ OpRequestRef op ///< [in] message received
+ ); ///< @return true if the message was handled
+
+ /// the variant of handle_message that is overridden by child classes
+ virtual bool _handle_message(OpRequestRef op) = 0;
+
+ virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
+
+
+ /**
+ * clean up any temporary on-disk state due to a pg interval change
+ */
+ void on_change_cleanup(ObjectStore::Transaction *t);
+ /**
+ * implementation should clear itself, contexts blessed prior to on_change
+ * won't be called after on_change()
+ */
+ virtual void on_change() = 0;
+ virtual void clear_recovery_state() = 0;
+
+ virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0;
+ virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
+ virtual int get_ec_data_chunk_count() const { return 0; };
+ virtual int get_ec_stripe_chunk_size() const { return 0; };
+
+ virtual void dump_recovery_info(Formatter *f) const = 0;
+
+ private:
+ set<hobject_t> temp_contents;
+ public:
+ // Track contents of temp collection, clear on reset
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void add_temp_objs(const set<hobject_t> &oids) {
+ temp_contents.insert(oids.begin(), oids.end());
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+ void clear_temp_objs(const set<hobject_t> &oids) {
+ for (set<hobject_t>::const_iterator i = oids.begin();
+ i != oids.end();
+ ++i) {
+ temp_contents.erase(*i);
+ }
+ }
+
+ virtual ~PGBackend() {}
+
+ /// execute implementation specific transaction
+ virtual void submit_transaction(
+ const hobject_t &hoid, ///< [in] object
+ const object_stat_sum_t &delta_stats,///< [in] stat change
+ const eversion_t &at_version, ///< [in] version
+ PGTransactionUPtr &&t, ///< [in] trans to execute (move)
+ const eversion_t &trim_to, ///< [in] trim log to here
+ const eversion_t &roll_forward_to, ///< [in] trim rollback info to here
+ const vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
+ /// [in] hitset history (if updated with this transaction)
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit, ///< [in] called when all commit
+ ceph_tid_t tid, ///< [in] tid
+ osd_reqid_t reqid, ///< [in] reqid
+ OpRequestRef op ///< [in] op
+ ) = 0;
+
+ /// submit callback to be called in order with pending writes
+ virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;
+
+ void try_stash(
+ const hobject_t &hoid,
+ version_t v,
+ ObjectStore::Transaction *t);
+
+ void rollback(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ friend class LRBTrimmer;
+ void rollforward(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ void trim(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ void remove(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t);
+
+ protected:
+
+ void handle_recovery_delete(OpRequestRef op);
+ void handle_recovery_delete_reply(OpRequestRef op);
+
+ /// Reapply old attributes
+ void rollback_setattrs(
+ const hobject_t &hoid,
+ map<string, boost::optional<bufferlist> > &old_attrs,
+ ObjectStore::Transaction *t);
+
+ /// Truncate object to rollback append
+ virtual void rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t);
+
+ /// Unstash object to rollback stash
+ void rollback_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t);
+
+ /// Unstash object to rollback stash
+ void rollback_try_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t);
+
+ /// Delete object to rollback create
+ void rollback_create(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t) {
+ remove(hoid, t);
+ }
+
+ /// Clone the extents back into place
+ void rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents,
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t);
+ public:
+
+ /// Trim object stashed at version
+ void trim_rollback_object(
+ const hobject_t &hoid,
+ version_t gen,
+ ObjectStore::Transaction *t);
+
+ /// List objects in collection
+ int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ vector<hobject_t> *ls,
+ hobject_t *next);
+
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ vector<hobject_t> *ls,
+ vector<ghobject_t> *gen_obs=0);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out);
+
+ virtual int objects_get_attrs(
+ const hobject_t &hoid,
+ map<string, bufferlist> *out);
+
+ virtual int objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ bufferlist *bl) = 0;
+
+ virtual void objects_read_async(
+ const hobject_t &hoid,
+ const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > &to_read,
+ Context *on_complete, bool fast_read = false) = 0;
+
+ virtual bool auto_repair_supported() const = 0;
+ int be_scan_list(
+ ScrubMap &map,
+ ScrubMapBuilder &pos);
+ bool be_compare_scrub_objects(
+ pg_shard_t auth_shard,
+ const ScrubMap::object &auth,
+ const object_info_t& auth_oi,
+ const ScrubMap::object &candidate,
+ shard_info_wrapper& shard_error,
+ inconsistent_obj_wrapper &result,
+ ostream &errorstream,
+ bool has_snapset);
+ map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
+ const hobject_t &obj,
+ const map<pg_shard_t,ScrubMap*> &maps,
+ object_info_t *auth_oi,
+ map<pg_shard_t, shard_info_wrapper> &shard_map,
+ bool &digest_match,
+ spg_t pgid,
+ ostream &errorstream);
+ void be_compare_scrubmaps(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ bool repair,
+ map<hobject_t, set<pg_shard_t>> &missing,
+ map<hobject_t, set<pg_shard_t>> &inconsistent,
+ map<hobject_t, list<pg_shard_t>> &authoritative,
+ map<hobject_t, pair<boost::optional<uint32_t>,
+ boost::optional<uint32_t>>> &missing_digest,
+ int &shallow_errors, int &deep_errors,
+ Scrub::Store *store,
+ const spg_t& pgid,
+ const vector<int> &acting,
+ ostream &errorstream);
+ virtual uint64_t be_get_ondisk_size(
+ uint64_t logical_size) = 0;
+ virtual int be_deep_scrub(
+ const hobject_t &oid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) = 0;
+ void be_omap_checks(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ omap_stat_t& omap_stats,
+ ostream &warnstream) const;
+
+ static PGBackend *build_pg_backend(
+ const pg_pool_t &pool,
+ const map<string,string>& profile,
+ Listener *l,
+ coll_t coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct);
+};
+
+#endif