diff options
Diffstat (limited to '')
-rw-r--r-- | src/os/ObjectStore.h | 1925 |
1 files changed, 1925 insertions, 0 deletions
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h new file mode 100644 index 00000000..1c120689 --- /dev/null +++ b/src/os/ObjectStore.h @@ -0,0 +1,1925 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_OBJECTSTORE_H +#define CEPH_OBJECTSTORE_H + +#include "include/Context.h" +#include "include/buffer.h" +#include "include/types.h" +#include "include/stringify.h" +#include "osd/osd_types.h" +#include "common/TrackedOp.h" +#include "common/WorkQueue.h" +#include "ObjectMap.h" + +#include <errno.h> +#include <sys/stat.h> +#include <vector> +#include <map> + +#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) +#include <sys/statvfs.h> +#else +#include <sys/vfs.h> /* or <sys/statfs.h> */ +#endif + +#define OPS_PER_PTR 32 + +class CephContext; + +using std::vector; +using std::string; +using std::map; + +namespace ceph { + class Formatter; +} + +/* + * low-level interface to the local OSD file system + */ + +class Logger; +class ContextQueue; + +static inline void encode(const map<string,bufferptr> *attrset, bufferlist &bl) { + encode(*attrset, bl); +} + +// this isn't the best place for these, but... +void decode_str_str_map_to_bl(bufferlist::const_iterator& p, bufferlist *out); +void decode_str_set_to_bl(bufferlist::const_iterator& p, bufferlist *out); + +// Flag bits +typedef uint32_t osflagbits_t; +const int SKIP_JOURNAL_REPLAY = 1 << 0; +const int SKIP_MOUNT_OMAP = 1 << 1; + +class ObjectStore { +protected: + string path; + +public: + CephContext* cct; + /** + * create - create an ObjectStore instance. + * + * This is invoked once at initialization time. + * + * @param type type of store. This is a string from the configuration file. + * @param data path (or other descriptor) for data + * @param journal path (or other descriptor) for journal (optional) + * @param flags which filestores should check if applicable + */ + static ObjectStore *create(CephContext *cct, + const string& type, + const string& data, + const string& journal, + osflagbits_t flags = 0); + + /** + * probe a block device to learn the uuid of the owning OSD + * + * @param cct cct + * @param path path to device + * @param fsid [out] osd uuid + */ + static int probe_block_device_fsid( + CephContext *cct, + const string& path, + uuid_d *fsid); + + /** + * Fetch Object Store statistics. + * + * Currently only latency of write and apply times are measured. + * + * This appears to be called with nothing locked. + */ + virtual objectstore_perf_stat_t get_cur_stats() = 0; + + /** + * Fetch Object Store performance counters. + * + * + * This appears to be called with nothing locked. + */ + virtual const PerfCounters* get_perf_counters() const = 0; + + /** + * a collection also orders transactions + * + * Any transactions queued under a given collection will be applied in + * sequence. Transactions queued under different collections may run + * in parallel. + * + * ObjectStore users my get collection handles with open_collection() (or, + * for bootstrapping a new collection, create_new_collection()). + */ + struct CollectionImpl : public RefCountedObject { + const coll_t cid; + + CollectionImpl(const coll_t& c) + : RefCountedObject(NULL, 0), + cid(c) {} + + /// wait for any queued transactions to apply + // block until any previous transactions are visible. specifically, + // collection_list and collection_empty need to reflect prior operations. + virtual void flush() = 0; + + /** + * Async flush_commit + * + * There are two cases: + * 1) collection is currently idle: the method returns true. c is + * not touched. + * 2) collection is not idle: the method returns false and c is + * called asynchronously with a value of 0 once all transactions + * queued on this collection prior to the call have been applied + * and committed. + */ + virtual bool flush_commit(Context *c) = 0; + + const coll_t &get_cid() { + return cid; + } + }; + typedef boost::intrusive_ptr<CollectionImpl> CollectionHandle; + + + /********************************* + * + * Object Contents and semantics + * + * All ObjectStore objects are identified as a named object + * (ghobject_t and hobject_t) in a named collection (coll_t). + * ObjectStore operations support the creation, mutation, deletion + * and enumeration of objects within a collection. Enumeration is + * in sorted key order (where keys are sorted by hash). Object names + * are globally unique. + * + * Each object has four distinct parts: byte data, xattrs, omap_header + * and omap entries. + * + * The data portion of an object is conceptually equivalent to a + * file in a file system. Random and Partial access for both read + * and write operations is required. The ability to have a sparse + * implementation of the data portion of an object is beneficial for + * some workloads, but not required. There is a system-wide limit on + * the maximum size of an object, which is typically around 100 MB. + * + * Xattrs are equivalent to the extended attributes of file + * systems. Xattrs are a set of key/value pairs. Sub-value access + * is not required. It is possible to enumerate the set of xattrs in + * key order. At the implementation level, xattrs are used + * exclusively internal to Ceph and the implementer can expect the + * total size of all of the xattrs on an object to be relatively + * small, i.e., less than 64KB. Much of Ceph assumes that accessing + * xattrs on temporally adjacent object accesses (recent past or + * near future) is inexpensive. + * + * omap_header is a single blob of data. It can be read or written + * in total. + * + * Omap entries are conceptually the same as xattrs + * but in a different address space. In other words, you can have + * the same key as an xattr and an omap entry and they have distinct + * values. Enumeration of xattrs doesn't include omap entries and + * vice versa. The size and access characteristics of omap entries + * are very different from xattrs. In particular, the value portion + * of an omap entry can be quite large (MBs). More importantly, the + * interface must support efficient range queries on omap entries even + * when there are a large numbers of entries. + * + *********************************/ + + /******************************* + * + * Collections + * + * A collection is simply a grouping of objects. Collections have + * names (coll_t) and can be enumerated in order. Like an + * individual object, a collection also has a set of xattrs. + * + * + */ + + + /********************************* + * transaction + * + * A Transaction represents a sequence of primitive mutation + * operations. + * + * Three events in the life of a Transaction result in + * callbacks. Any Transaction can contain any number of callback + * objects (Context) for any combination of the three classes of + * callbacks: + * + * on_applied_sync, on_applied, and on_commit. + * + * The "on_applied" and "on_applied_sync" callbacks are invoked when + * the modifications requested by the Transaction are visible to + * subsequent ObjectStore operations, i.e., the results are + * readable. The only conceptual difference between on_applied and + * on_applied_sync is the specific thread and locking environment in + * which the callbacks operate. "on_applied_sync" is called + * directly by an ObjectStore execution thread. It is expected to + * execute quickly and must not acquire any locks of the calling + * environment. Conversely, "on_applied" is called from the separate + * Finisher thread, meaning that it can contend for calling + * environment locks. NB, on_applied and on_applied_sync are + * sometimes called on_readable and on_readable_sync. + * + * The "on_commit" callback is also called from the Finisher thread + * and indicates that all of the mutations have been durably + * committed to stable storage (i.e., are now software/hardware + * crashproof). + * + * At the implementation level, each mutation primitive (and its + * associated data) can be serialized to a single buffer. That + * serialization, however, does not copy any data, but (using the + * bufferlist library) will reference the original buffers. This + * implies that the buffer that contains the data being submitted + * must remain stable until the on_commit callback completes. In + * practice, bufferlist handles all of this for you and this + * subtlety is only relevant if you are referencing an existing + * buffer via buffer::raw_static. + * + * Some implementations of ObjectStore choose to implement their own + * form of journaling that uses the serialized form of a + * Transaction. This requires that the encode/decode logic properly + * version itself and handle version upgrades that might change the + * format of the encoded Transaction. This has already happened a + * couple of times and the Transaction object contains some helper + * variables that aid in this legacy decoding: + * + * sobject_encoding detects an older/simpler version of oid + * present in pre-bobtail versions of ceph. use_pool_override + * also detects a situation where the pool of an oid can be + * overridden for legacy operations/buffers. For non-legacy + * implementations of ObjectStore, neither of these fields are + * relevant. + * + * + * TRANSACTION ISOLATION + * + * Except as noted above, isolation is the responsibility of the + * caller. In other words, if any storage element (storage element + * == any of the four portions of an object as described above) is + * altered by a transaction (including deletion), the caller + * promises not to attempt to read that element while the + * transaction is pending (here pending means from the time of + * issuance until the "on_applied_sync" callback has been + * received). Violations of isolation need not be detected by + * ObjectStore and there is no corresponding error mechanism for + * reporting an isolation violation (crashing would be the + * appropriate way to report an isolation violation if detected). + * + * Enumeration operations may violate transaction isolation as + * described above when a storage element is being created or + * deleted as part of a transaction. In this case, ObjectStore is + * allowed to consider the enumeration operation to either precede + * or follow the violating transaction element. In other words, the + * presence/absence of the mutated element in the enumeration is + * entirely at the discretion of ObjectStore. The arbitrary ordering + * applies independently to each transaction element. For example, + * if a transaction contains two mutating elements "create A" and + * "delete B". And an enumeration operation is performed while this + * transaction is pending. It is permissible for ObjectStore to + * report any of the four possible combinations of the existence of + * A and B. + * + */ + class Transaction { + public: + enum { + OP_NOP = 0, + OP_TOUCH = 9, // cid, oid + OP_WRITE = 10, // cid, oid, offset, len, bl + OP_ZERO = 11, // cid, oid, offset, len + OP_TRUNCATE = 12, // cid, oid, len + OP_REMOVE = 13, // cid, oid + OP_SETATTR = 14, // cid, oid, attrname, bl + OP_SETATTRS = 15, // cid, oid, attrset + OP_RMATTR = 16, // cid, oid, attrname + OP_CLONE = 17, // cid, oid, newoid + OP_CLONERANGE = 18, // cid, oid, newoid, offset, len + OP_CLONERANGE2 = 30, // cid, oid, newoid, srcoff, len, dstoff + + OP_TRIMCACHE = 19, // cid, oid, offset, len **DEPRECATED** + + OP_MKCOLL = 20, // cid + OP_RMCOLL = 21, // cid + OP_COLL_ADD = 22, // cid, oldcid, oid + OP_COLL_REMOVE = 23, // cid, oid + OP_COLL_SETATTR = 24, // cid, attrname, bl + OP_COLL_RMATTR = 25, // cid, attrname + OP_COLL_SETATTRS = 26, // cid, attrset + OP_COLL_MOVE = 8, // newcid, oldcid, oid + + OP_RMATTRS = 28, // cid, oid + OP_COLL_RENAME = 29, // cid, newcid + + OP_OMAP_CLEAR = 31, // cid + OP_OMAP_SETKEYS = 32, // cid, attrset + OP_OMAP_RMKEYS = 33, // cid, keyset + OP_OMAP_SETHEADER = 34, // cid, header + OP_SPLIT_COLLECTION = 35, // cid, bits, destination + OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination + doesn't create the destination */ + OP_OMAP_RMKEYRANGE = 37, // cid, oid, firstkey, lastkey + OP_COLL_MOVE_RENAME = 38, // oldcid, oldoid, newcid, newoid + + OP_SETALLOCHINT = 39, // cid, oid, object_size, write_size + OP_COLL_HINT = 40, // cid, type, bl + + OP_TRY_RENAME = 41, // oldcid, oldoid, newoid + + OP_COLL_SET_BITS = 42, // cid, bits + + OP_MERGE_COLLECTION = 43, // cid, destination + }; + + // Transaction hint type + enum { + COLL_HINT_EXPECTED_NUM_OBJECTS = 1, + }; + + struct Op { + ceph_le32 op; + ceph_le32 cid; + ceph_le32 oid; + ceph_le64 off; + ceph_le64 len; + ceph_le32 dest_cid; + ceph_le32 dest_oid; //OP_CLONE, OP_CLONERANGE + ceph_le64 dest_off; //OP_CLONERANGE + union { + struct { + ceph_le32 hint_type; //OP_COLL_HINT + }; + struct { + ceph_le32 alloc_hint_flags; //OP_SETALLOCHINT + }; + }; + ceph_le64 expected_object_size; //OP_SETALLOCHINT + ceph_le64 expected_write_size; //OP_SETALLOCHINT + ceph_le32 split_bits; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS, + //OP_MKCOLL + ceph_le32 split_rem; //OP_SPLIT_COLLECTION2 + } __attribute__ ((packed)) ; + + struct TransactionData { + ceph_le64 ops; + ceph_le32 largest_data_len; + ceph_le32 largest_data_off; + ceph_le32 largest_data_off_in_data_bl; + ceph_le32 fadvise_flags; + + TransactionData() noexcept : + ops(init_le64(0)), + largest_data_len(init_le32(0)), + largest_data_off(init_le32(0)), + largest_data_off_in_data_bl(init_le32(0)), + fadvise_flags(init_le32(0)) { } + + // override default move operations to reset default values + TransactionData(TransactionData&& other) noexcept : + ops(other.ops), + largest_data_len(other.largest_data_len), + largest_data_off(other.largest_data_off), + largest_data_off_in_data_bl(other.largest_data_off_in_data_bl), + fadvise_flags(other.fadvise_flags) { + other.ops = 0; + other.largest_data_len = 0; + other.largest_data_off = 0; + other.largest_data_off_in_data_bl = 0; + other.fadvise_flags = 0; + } + TransactionData& operator=(TransactionData&& other) noexcept { + ops = other.ops; + largest_data_len = other.largest_data_len; + largest_data_off = other.largest_data_off; + largest_data_off_in_data_bl = other.largest_data_off_in_data_bl; + fadvise_flags = other.fadvise_flags; + other.ops = 0; + other.largest_data_len = 0; + other.largest_data_off = 0; + other.largest_data_off_in_data_bl = 0; + other.fadvise_flags = 0; + return *this; + } + + TransactionData(const TransactionData& other) = default; + TransactionData& operator=(const TransactionData& other) = default; + + void encode(bufferlist& bl) const { + bl.append((char*)this, sizeof(TransactionData)); + } + void decode(bufferlist::const_iterator &bl) { + bl.copy(sizeof(TransactionData), (char*)this); + } + } __attribute__ ((packed)) ; + + private: + TransactionData data; + + map<coll_t, __le32> coll_index; + map<ghobject_t, __le32> object_index; + + __le32 coll_id {0}; + __le32 object_id {0}; + + bufferlist data_bl; + bufferlist op_bl; + + list<Context *> on_applied; + list<Context *> on_commit; + list<Context *> on_applied_sync; + + public: + Transaction() = default; + + explicit Transaction(bufferlist::const_iterator &dp) { + decode(dp); + } + explicit Transaction(bufferlist &nbl) { + auto dp = nbl.cbegin(); + decode(dp); + } + + // override default move operations to reset default values + Transaction(Transaction&& other) noexcept : + data(std::move(other.data)), + coll_index(std::move(other.coll_index)), + object_index(std::move(other.object_index)), + coll_id(other.coll_id), + object_id(other.object_id), + data_bl(std::move(other.data_bl)), + op_bl(std::move(other.op_bl)), + on_applied(std::move(other.on_applied)), + on_commit(std::move(other.on_commit)), + on_applied_sync(std::move(other.on_applied_sync)) { + other.coll_id = 0; + other.object_id = 0; + } + + Transaction& operator=(Transaction&& other) noexcept { + data = std::move(other.data); + coll_index = std::move(other.coll_index); + object_index = std::move(other.object_index); + coll_id = other.coll_id; + object_id = other.object_id; + data_bl = std::move(other.data_bl); + op_bl = std::move(other.op_bl); + on_applied = std::move(other.on_applied); + on_commit = std::move(other.on_commit); + on_applied_sync = std::move(other.on_applied_sync); + other.coll_id = 0; + other.object_id = 0; + return *this; + } + + Transaction(const Transaction& other) = default; + Transaction& operator=(const Transaction& other) = default; + + // expose object_index for FileStore::Op's benefit + const map<ghobject_t, __le32>& get_object_index() const { + return object_index; + } + + /* Operations on callback contexts */ + void register_on_applied(Context *c) { + if (!c) return; + on_applied.push_back(c); + } + void register_on_commit(Context *c) { + if (!c) return; + on_commit.push_back(c); + } + void register_on_applied_sync(Context *c) { + if (!c) return; + on_applied_sync.push_back(c); + } + void register_on_complete(Context *c) { + if (!c) return; + RunOnDeleteRef _complete (std::make_shared<RunOnDelete>(c)); + register_on_applied(new ContainerContext<RunOnDeleteRef>(_complete)); + register_on_commit(new ContainerContext<RunOnDeleteRef>(_complete)); + } + bool has_contexts() const { + return + !on_commit.empty() || + !on_applied.empty() || + !on_applied_sync.empty(); + } + + static void collect_contexts( + vector<Transaction>& t, + Context **out_on_applied, + Context **out_on_commit, + Context **out_on_applied_sync) { + ceph_assert(out_on_applied); + ceph_assert(out_on_commit); + ceph_assert(out_on_applied_sync); + list<Context *> on_applied, on_commit, on_applied_sync; + for (auto& i : t) { + on_applied.splice(on_applied.end(), i.on_applied); + on_commit.splice(on_commit.end(), i.on_commit); + on_applied_sync.splice(on_applied_sync.end(), i.on_applied_sync); + } + *out_on_applied = C_Contexts::list_to_context(on_applied); + *out_on_commit = C_Contexts::list_to_context(on_commit); + *out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync); + } + static void collect_contexts( + vector<Transaction>& t, + list<Context*> *out_on_applied, + list<Context*> *out_on_commit, + list<Context*> *out_on_applied_sync) { + ceph_assert(out_on_applied); + ceph_assert(out_on_commit); + ceph_assert(out_on_applied_sync); + for (auto& i : t) { + out_on_applied->splice(out_on_applied->end(), i.on_applied); + out_on_commit->splice(out_on_commit->end(), i.on_commit); + out_on_applied_sync->splice(out_on_applied_sync->end(), + i.on_applied_sync); + } + } + + Context *get_on_applied() { + return C_Contexts::list_to_context(on_applied); + } + Context *get_on_commit() { + return C_Contexts::list_to_context(on_commit); + } + Context *get_on_applied_sync() { + return C_Contexts::list_to_context(on_applied_sync); + } + + void set_fadvise_flags(uint32_t flags) { + data.fadvise_flags = flags; + } + void set_fadvise_flag(uint32_t flag) { + data.fadvise_flags = data.fadvise_flags | flag; + } + uint32_t get_fadvise_flags() { return data.fadvise_flags; } + + void swap(Transaction& other) noexcept { + std::swap(data, other.data); + std::swap(on_applied, other.on_applied); + std::swap(on_commit, other.on_commit); + std::swap(on_applied_sync, other.on_applied_sync); + + std::swap(coll_index, other.coll_index); + std::swap(object_index, other.object_index); + std::swap(coll_id, other.coll_id); + std::swap(object_id, other.object_id); + op_bl.swap(other.op_bl); + data_bl.swap(other.data_bl); + } + + void _update_op(Op* op, + vector<__le32> &cm, + vector<__le32> &om) { + + switch (op->op) { + case OP_NOP: + break; + + case OP_TOUCH: + case OP_REMOVE: + case OP_SETATTR: + case OP_SETATTRS: + case OP_RMATTR: + case OP_RMATTRS: + case OP_COLL_REMOVE: + case OP_OMAP_CLEAR: + case OP_OMAP_SETKEYS: + case OP_OMAP_RMKEYS: + case OP_OMAP_RMKEYRANGE: + case OP_OMAP_SETHEADER: + case OP_WRITE: + case OP_ZERO: + case OP_TRUNCATE: + case OP_SETALLOCHINT: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + break; + + case OP_CLONERANGE2: + case OP_CLONE: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + op->dest_oid = om[op->dest_oid]; + break; + + case OP_MKCOLL: + case OP_RMCOLL: + case OP_COLL_SETATTR: + case OP_COLL_RMATTR: + case OP_COLL_SETATTRS: + case OP_COLL_HINT: + case OP_COLL_SET_BITS: + ceph_assert(op->cid < cm.size()); + op->cid = cm[op->cid]; + break; + + case OP_COLL_ADD: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_cid < om.size()); + op->cid = cm[op->cid]; + op->dest_cid = cm[op->dest_cid]; + op->oid = om[op->oid]; + break; + + case OP_COLL_MOVE_RENAME: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_cid < cm.size()); + ceph_assert(op->dest_oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + op->dest_cid = cm[op->dest_cid]; + op->dest_oid = om[op->dest_oid]; + break; + + case OP_TRY_RENAME: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + op->dest_oid = om[op->dest_oid]; + break; + + case OP_SPLIT_COLLECTION2: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->dest_cid < cm.size()); + op->cid = cm[op->cid]; + op->dest_cid = cm[op->dest_cid]; + break; + + case OP_MERGE_COLLECTION: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->dest_cid < cm.size()); + op->cid = cm[op->cid]; + op->dest_cid = cm[op->dest_cid]; + break; + + default: + ceph_abort_msg("Unknown OP"); + } + } + void _update_op_bl( + bufferlist& bl, + vector<__le32> &cm, + vector<__le32> &om) { + for (auto& bp : bl.buffers()) { + ceph_assert(bp.length() % sizeof(Op) == 0); + + char* raw_p = const_cast<char*>(bp.c_str()); + char* raw_end = raw_p + bp.length(); + while (raw_p < raw_end) { + _update_op(reinterpret_cast<Op*>(raw_p), cm, om); + raw_p += sizeof(Op); + } + } + } + /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction + void append(Transaction& other) { + + data.ops = data.ops + other.data.ops; + if (other.data.largest_data_len > data.largest_data_len) { + data.largest_data_len = other.data.largest_data_len; + data.largest_data_off = other.data.largest_data_off; + data.largest_data_off_in_data_bl = data_bl.length() + other.data.largest_data_off_in_data_bl; + } + data.fadvise_flags = data.fadvise_flags | other.data.fadvise_flags; + on_applied.splice(on_applied.end(), other.on_applied); + on_commit.splice(on_commit.end(), other.on_commit); + on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync); + + //append coll_index & object_index + vector<__le32> cm(other.coll_index.size()); + map<coll_t, __le32>::iterator coll_index_p; + for (coll_index_p = other.coll_index.begin(); + coll_index_p != other.coll_index.end(); + ++coll_index_p) { + cm[coll_index_p->second] = _get_coll_id(coll_index_p->first); + } + + vector<__le32> om(other.object_index.size()); + map<ghobject_t, __le32>::iterator object_index_p; + for (object_index_p = other.object_index.begin(); + object_index_p != other.object_index.end(); + ++object_index_p) { + om[object_index_p->second] = _get_object_id(object_index_p->first); + } + + //the other.op_bl SHOULD NOT be changes during append operation, + //we use additional bufferlist to avoid this problem + bufferlist other_op_bl; + { + bufferptr other_op_bl_ptr(other.op_bl.length()); + other.op_bl.copy(0, other.op_bl.length(), other_op_bl_ptr.c_str()); + other_op_bl.append(std::move(other_op_bl_ptr)); + } + + //update other_op_bl with cm & om + //When the other is appended to current transaction, all coll_index and + //object_index in other.op_buffer should be updated by new index of the + //combined transaction + _update_op_bl(other_op_bl, cm, om); + + //append op_bl + op_bl.append(other_op_bl); + //append data_bl + data_bl.append(other.data_bl); + } + + /** Inquires about the Transaction as a whole. */ + + /// How big is the encoded Transaction buffer? + uint64_t get_encoded_bytes() { + //layout: data_bl + op_bl + coll_index + object_index + data + + // coll_index size, object_index size and sizeof(transaction_data) + // all here, so they may be computed at compile-time + size_t final_size = sizeof(__u32) * 2 + sizeof(data); + + // coll_index second and object_index second + final_size += (coll_index.size() + object_index.size()) * sizeof(__le32); + + // coll_index first + for (auto p = coll_index.begin(); p != coll_index.end(); ++p) { + final_size += p->first.encoded_size(); + } + + // object_index first + for (auto p = object_index.begin(); p != object_index.end(); ++p) { + final_size += p->first.encoded_size(); + } + + return data_bl.length() + + op_bl.length() + + final_size; + } + + /// Retain old version for regression testing purposes + uint64_t get_encoded_bytes_test() { + using ceph::encode; + //layout: data_bl + op_bl + coll_index + object_index + data + bufferlist bl; + encode(coll_index, bl); + encode(object_index, bl); + + return data_bl.length() + + op_bl.length() + + bl.length() + + sizeof(data); + } + + uint64_t get_num_bytes() { + return get_encoded_bytes(); + } + /// Size of largest data buffer to the "write" operation encountered so far + uint32_t get_data_length() { + return data.largest_data_len; + } + /// offset within the encoded buffer to the start of the largest data buffer that's encoded + uint32_t get_data_offset() { + if (data.largest_data_off_in_data_bl) { + return data.largest_data_off_in_data_bl + + sizeof(__u8) + // encode struct_v + sizeof(__u8) + // encode compat_v + sizeof(__u32) + // encode len + sizeof(__u32); // data_bl len + } + return 0; // none + } + /// offset of buffer as aligned to destination within object. + int get_data_alignment() { + if (!data.largest_data_len) + return 0; + return (0 - get_data_offset()) & ~CEPH_PAGE_MASK; + } + /// Is the Transaction empty (no operations) + bool empty() { + return !data.ops; + } + /// Number of operations in the transaction + int get_num_ops() { + return data.ops; + } + + /** + * iterator + * + * Helper object to parse Transactions. + * + * ObjectStore instances use this object to step down the encoded + * buffer decoding operation codes and parameters as we go. + * + */ + class iterator { + Transaction *t; + + uint64_t ops; + char* op_buffer_p; + + bufferlist::const_iterator data_bl_p; + + public: + vector<coll_t> colls; + vector<ghobject_t> objects; + + private: + explicit iterator(Transaction *t) + : t(t), + data_bl_p(t->data_bl.cbegin()), + colls(t->coll_index.size()), + objects(t->object_index.size()) { + + ops = t->data.ops; + op_buffer_p = t->op_bl.c_str(); + + map<coll_t, __le32>::iterator coll_index_p; + for (coll_index_p = t->coll_index.begin(); + coll_index_p != t->coll_index.end(); + ++coll_index_p) { + colls[coll_index_p->second] = coll_index_p->first; + } + + map<ghobject_t, __le32>::iterator object_index_p; + for (object_index_p = t->object_index.begin(); + object_index_p != t->object_index.end(); + ++object_index_p) { + objects[object_index_p->second] = object_index_p->first; + } + } + + friend class Transaction; + + public: + + bool have_op() { + return ops > 0; + } + Op* decode_op() { + ceph_assert(ops > 0); + + Op* op = reinterpret_cast<Op*>(op_buffer_p); + op_buffer_p += sizeof(Op); + ops--; + + return op; + } + string decode_string() { + using ceph::decode; + string s; + decode(s, data_bl_p); + return s; + } + void decode_bp(bufferptr& bp) { + using ceph::decode; + decode(bp, data_bl_p); + } + void decode_bl(bufferlist& bl) { + using ceph::decode; + decode(bl, data_bl_p); + } + void decode_attrset(map<string,bufferptr>& aset) { + using ceph::decode; + decode(aset, data_bl_p); + } + void decode_attrset(map<string,bufferlist>& aset) { + using ceph::decode; + decode(aset, data_bl_p); + } + void decode_attrset_bl(bufferlist *pbl) { + decode_str_str_map_to_bl(data_bl_p, pbl); + } + void decode_keyset(set<string> &keys){ + using ceph::decode; + decode(keys, data_bl_p); + } + void decode_keyset_bl(bufferlist *pbl){ + decode_str_set_to_bl(data_bl_p, pbl); + } + + const ghobject_t &get_oid(__le32 oid_id) { + ceph_assert(oid_id < objects.size()); + return objects[oid_id]; + } + const coll_t &get_cid(__le32 cid_id) { + ceph_assert(cid_id < colls.size()); + return colls[cid_id]; + } + uint32_t get_fadvise_flags() const { + return t->get_fadvise_flags(); + } + }; + + iterator begin() { + return iterator(this); + } + +private: + void _build_actions_from_tbl(); + + /** + * Helper functions to encode the various mutation elements of a + * transaction. These are 1:1 with the operation codes (see + * enumeration above). These routines ensure that the + * encoder/creator of a transaction gets the right data in the + * right place. Sadly, there's no corresponding version nor any + * form of seat belts for the decoder. + */ + Op* _get_next_op() { + if (op_bl.get_append_buffer_unused_tail_length() < sizeof(Op)) { + op_bl.reserve(sizeof(Op) * OPS_PER_PTR); + } + // append_hole ensures bptr merging. Even huge number of ops + // shouldn't result in overpopulating bl::_buffers. + char* const p = op_bl.append_hole(sizeof(Op)).c_str(); + memset(p, 0, sizeof(Op)); + return reinterpret_cast<Op*>(p); + } + __le32 _get_coll_id(const coll_t& coll) { + map<coll_t, __le32>::iterator c = coll_index.find(coll); + if (c != coll_index.end()) + return c->second; + + __le32 index_id = coll_id++; + coll_index[coll] = index_id; + return index_id; + } + __le32 _get_object_id(const ghobject_t& oid) { + map<ghobject_t, __le32>::iterator o = object_index.find(oid); + if (o != object_index.end()) + return o->second; + + __le32 index_id = object_id++; + object_index[oid] = index_id; + return index_id; + } + +public: + /// noop. 'nuf said + void nop() { + Op* _op = _get_next_op(); + _op->op = OP_NOP; + data.ops = data.ops + 1; + } + /** + * touch + * + * Ensure the existance of an object in a collection. Create an + * empty object if necessary + */ + void touch(const coll_t& cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_TOUCH; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /** + * Write data to an offset within an object. If the object is too + * small, it is expanded as needed. It is possible to specify an + * offset beyond the current end of an object and it will be + * expanded as needed. Simple implementations of ObjectStore will + * just zero the data between the old end of the object and the + * newly provided data. More sophisticated implementations of + * ObjectStore will omit the untouched data and store it as a + * "hole" in the file. + * + * Note that a 0-length write does not affect the size of the object. + */ + void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len, + const bufferlist& write_data, uint32_t flags = 0) { + using ceph::encode; + uint32_t orig_len = data_bl.length(); + Op* _op = _get_next_op(); + _op->op = OP_WRITE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->off = off; + _op->len = len; + encode(write_data, data_bl); + + ceph_assert(len == write_data.length()); + data.fadvise_flags = data.fadvise_flags | flags; + if (write_data.length() > data.largest_data_len) { + data.largest_data_len = write_data.length(); + data.largest_data_off = off; + data.largest_data_off_in_data_bl = orig_len + sizeof(__u32); // we are about to + } + data.ops = data.ops + 1; + } + /** + * zero out the indicated byte range within an object. Some + * ObjectStore instances may optimize this to release the + * underlying storage space. + * + * If the zero range extends beyond the end of the object, the object + * size is extended, just as if we were writing a buffer full of zeros. + * EXCEPT if the length is 0, in which case (just like a 0-length write) + * we do not adjust the object size. + */ + void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) { + Op* _op = _get_next_op(); + _op->op = OP_ZERO; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->off = off; + _op->len = len; + data.ops = data.ops + 1; + } + /// Discard all data in the object beyond the specified size. + void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) { + Op* _op = _get_next_op(); + _op->op = OP_TRUNCATE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->off = off; + data.ops = data.ops + 1; + } + /// Remove an object. All four parts of the object are removed. + void remove(const coll_t& cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_REMOVE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /// Set an xattr of an object + void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, bufferlist& val) { + string n(name); + setattr(cid, oid, n, val); + } + /// Set an xattr of an object + void setattr(const coll_t& cid, const ghobject_t& oid, const string& s, bufferlist& val) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_SETATTR; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(s, data_bl); + encode(val, data_bl); + data.ops = data.ops + 1; + } + /// Set multiple xattrs of an object + void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferptr>& attrset) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_SETATTRS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(attrset, data_bl); + data.ops = data.ops + 1; + } + /// Set multiple xattrs of an object + void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferlist>& attrset) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_SETATTRS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(attrset, data_bl); + data.ops = data.ops + 1; + } + /// remove an xattr from an object + void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) { + string n(name); + rmattr(cid, oid, n); + } + /// remove an xattr from an object + void rmattr(const coll_t& cid, const ghobject_t& oid, const string& s) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_RMATTR; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(s, data_bl); + data.ops = data.ops + 1; + } + /// remove all xattrs from an object + void rmattrs(const coll_t& cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_RMATTRS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /** + * Clone an object into another object. + * + * Low-cost (e.g., O(1)) cloning (if supported) is best, but + * fallback to an O(n) copy is allowed. All four parts of the + * object are cloned (data, xattrs, omap header, omap + * entries). + * + * The destination named object may already exist, in + * which case its previous contents are discarded. + */ + void clone(const coll_t& cid, const ghobject_t& oid, + const ghobject_t& noid) { + Op* _op = _get_next_op(); + _op->op = OP_CLONE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->dest_oid = _get_object_id(noid); + data.ops = data.ops + 1; + } + /** + * Clone a byte range from one object to another. + * + * The data portion of the destination object receives a copy of a + * portion of the data from the source object. None of the other + * three parts of an object is copied from the source. + * + * The destination object size may be extended to the dstoff + len. + * + * The source range *must* overlap with the source object data. If it does + * not the result is undefined. + */ + void clone_range(const coll_t& cid, const ghobject_t& oid, + const ghobject_t& noid, + uint64_t srcoff, uint64_t srclen, uint64_t dstoff) { + Op* _op = _get_next_op(); + _op->op = OP_CLONERANGE2; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->dest_oid = _get_object_id(noid); + _op->off = srcoff; + _op->len = srclen; + _op->dest_off = dstoff; + data.ops = data.ops + 1; + } + + /// Create the collection + void create_collection(const coll_t& cid, int bits) { + Op* _op = _get_next_op(); + _op->op = OP_MKCOLL; + _op->cid = _get_coll_id(cid); + _op->split_bits = bits; + data.ops = data.ops + 1; + } + + /** + * Give the collection a hint. + * + * @param cid - collection id. + * @param type - hint type. + * @param hint - the hint payload, which contains the customized + * data along with the hint type. + */ + void collection_hint(const coll_t& cid, uint32_t type, const bufferlist& hint) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_COLL_HINT; + _op->cid = _get_coll_id(cid); + _op->hint_type = type; + encode(hint, data_bl); + data.ops = data.ops + 1; + } + + /// remove the collection, the collection must be empty + void remove_collection(const coll_t& cid) { + Op* _op = _get_next_op(); + _op->op = OP_RMCOLL; + _op->cid = _get_coll_id(cid); + data.ops = data.ops + 1; + } + void collection_move(const coll_t& cid, const coll_t &oldcid, const ghobject_t& oid) + __attribute__ ((deprecated)) { + // NOTE: we encode this as a fixed combo of ADD + REMOVE. they + // always appear together, so this is effectively a single MOVE. + Op* _op = _get_next_op(); + _op->op = OP_COLL_ADD; + _op->cid = _get_coll_id(oldcid); + _op->oid = _get_object_id(oid); + _op->dest_cid = _get_coll_id(cid); + data.ops = data.ops + 1; + + _op = _get_next_op(); + _op->op = OP_COLL_REMOVE; + _op->cid = _get_coll_id(oldcid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + const coll_t &cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_COLL_MOVE_RENAME; + _op->cid = _get_coll_id(oldcid); + _op->oid = _get_object_id(oldoid); + _op->dest_cid = _get_coll_id(cid); + _op->dest_oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + void try_rename(const coll_t &cid, const ghobject_t& oldoid, + const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_TRY_RENAME; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oldoid); + _op->dest_oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + + /// Remove omap from oid + void omap_clear( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid ///< [in] Object from which to remove omap + ) { + Op* _op = _get_next_op(); + _op->op = OP_OMAP_CLEAR; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /// Set keys on oid omap. Replaces duplicate keys. + void omap_setkeys( + const coll_t& cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object to update + const map<string, bufferlist> &attrset ///< [in] Replacement keys and values + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_SETKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(attrset, data_bl); + data.ops = data.ops + 1; + } + + /// Set keys on an oid omap (bufferlist variant). + void omap_setkeys( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object to update + const bufferlist &attrset_bl ///< [in] Replacement keys and values + ) { + Op* _op = _get_next_op(); + _op->op = OP_OMAP_SETKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data_bl.append(attrset_bl); + data.ops = data.ops + 1; + } + + /// Remove keys from oid omap + void omap_rmkeys( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object from which to remove the omap + const set<string> &keys ///< [in] Keys to clear + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_RMKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(keys, data_bl); + data.ops = data.ops + 1; + } + + /// Remove keys from oid omap + void omap_rmkeys( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object from which to remove the omap + const bufferlist &keys_bl ///< [in] Keys to clear + ) { + Op* _op = _get_next_op(); + _op->op = OP_OMAP_RMKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data_bl.append(keys_bl); + data.ops = data.ops + 1; + } + + /// Remove key range from oid omap + void omap_rmkeyrange( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object from which to remove the omap keys + const string& first, ///< [in] first key in range + const string& last ///< [in] first key past range, range is [first,last) + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_RMKEYRANGE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(first, data_bl); + encode(last, data_bl); + data.ops = data.ops + 1; + } + + /// Set omap header + void omap_setheader( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object + const bufferlist &bl ///< [in] Header value + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_SETHEADER; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(bl, data_bl); + data.ops = data.ops + 1; + } + + /// Split collection based on given prefixes, objects matching the specified bits/rem are + /// moved to the new collection + void split_collection( + const coll_t &cid, + uint32_t bits, + uint32_t rem, + const coll_t &destination) { + Op* _op = _get_next_op(); + _op->op = OP_SPLIT_COLLECTION2; + _op->cid = _get_coll_id(cid); + _op->dest_cid = _get_coll_id(destination); + _op->split_bits = bits; + _op->split_rem = rem; + data.ops = data.ops + 1; + } + + /// Merge collection into another. + void merge_collection( + coll_t cid, + coll_t destination, + uint32_t bits) { + Op* _op = _get_next_op(); + _op->op = OP_MERGE_COLLECTION; + _op->cid = _get_coll_id(cid); + _op->dest_cid = _get_coll_id(destination); + _op->split_bits = bits; + data.ops = data.ops + 1; + } + + void collection_set_bits( + const coll_t &cid, + int bits) { + Op* _op = _get_next_op(); + _op->op = OP_COLL_SET_BITS; + _op->cid = _get_coll_id(cid); + _op->split_bits = bits; + data.ops = data.ops + 1; + } + + /// Set allocation hint for an object + /// make 0 values(expected_object_size, expected_write_size) noops for all implementations + void set_alloc_hint( + const coll_t &cid, + const ghobject_t &oid, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags + ) { + Op* _op = _get_next_op(); + _op->op = OP_SETALLOCHINT; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->expected_object_size = expected_object_size; + _op->expected_write_size = expected_write_size; + _op->alloc_hint_flags = flags; + data.ops = data.ops + 1; + } + + void encode(bufferlist& bl) const { + //layout: data_bl + op_bl + coll_index + object_index + data + ENCODE_START(9, 9, bl); + encode(data_bl, bl); + encode(op_bl, bl); + encode(coll_index, bl); + encode(object_index, bl); + data.encode(bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + DECODE_START(9, bl); + DECODE_OLDEST(9); + + decode(data_bl, bl); + decode(op_bl, bl); + decode(coll_index, bl); + decode(object_index, bl); + data.decode(bl); + coll_id = coll_index.size(); + object_id = object_index.size(); + + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f); + static void generate_test_instances(list<Transaction*>& o); + }; + + int queue_transaction(CollectionHandle& ch, + Transaction&& t, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) { + vector<Transaction> tls; + tls.push_back(std::move(t)); + return queue_transactions(ch, tls, op, handle); + } + + virtual int queue_transactions( + CollectionHandle& ch, vector<Transaction>& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) = 0; + + + public: + ObjectStore(CephContext* cct, + const std::string& path_) : path(path_), cct(cct) {} + virtual ~ObjectStore() {} + + // no copying + explicit ObjectStore(const ObjectStore& o) = delete; + const ObjectStore& operator=(const ObjectStore& o) = delete; + + // versioning + virtual int upgrade() { + return 0; + } + + virtual void get_db_statistics(Formatter *f) { } + virtual void generate_db_histogram(Formatter *f) { } + virtual int flush_cache(ostream *os = NULL) { return -1; } + virtual void dump_perf_counters(Formatter *f) {} + virtual void dump_cache_stats(Formatter *f) {} + virtual void dump_cache_stats(ostream& os) {} + + virtual string get_type() = 0; + + // mgmt + virtual bool test_mount_in_use() = 0; + virtual int mount() = 0; + virtual int umount() = 0; + virtual int fsck(bool deep) { + return -EOPNOTSUPP; + } + virtual int repair(bool deep) { + return -EOPNOTSUPP; + } + virtual int quick_fix() { + return -EOPNOTSUPP; + } + + virtual void set_cache_shards(unsigned num) { } + + /** + * Returns 0 if the hobject is valid, -error otherwise + * + * Errors: + * -ENAMETOOLONG: locator/namespace/name too large + */ + virtual int validate_hobject_key(const hobject_t &obj) const = 0; + + virtual unsigned get_max_attr_name_length() = 0; + virtual int mkfs() = 0; // wipe + virtual int mkjournal() = 0; // journal only + virtual bool needs_journal() = 0; //< requires a journal + virtual bool wants_journal() = 0; //< prefers a journal + virtual bool allows_journal() = 0; //< allows a journal + + /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda) + virtual int get_devices(std::set<string> *devls) { + return -EOPNOTSUPP; + } + + /// true if a txn is readable immediately after it is queued. + virtual bool is_sync_onreadable() const { + return true; + } + + /** + * is_rotational + * + * Check whether store is backed by a rotational (HDD) or non-rotational + * (SSD) device. + * + * This must be usable *before* the store is mounted. + * + * @return true for HDD, false for SSD + */ + virtual bool is_rotational() { + return true; + } + + /** + * is_journal_rotational + * + * Check whether journal is backed by a rotational (HDD) or non-rotational + * (SSD) device. + * + * + * @return true for HDD, false for SSD + */ + virtual bool is_journal_rotational() { + return true; + } + + virtual string get_default_device_class() { + return is_rotational() ? "hdd" : "ssd"; + } + + virtual int get_numa_node( + int *numa_node, + set<int> *nodes, + set<string> *failed) { + return -EOPNOTSUPP; + } + + + virtual bool can_sort_nibblewise() { + return false; // assume a backend cannot, unless it says otherwise + } + + virtual int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) = 0; + virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) = 0; + + virtual void collect_metadata(map<string,string> *pm) { } + + /** + * write_meta - write a simple configuration key out-of-band + * + * Write a simple key/value pair for basic store configuration + * (e.g., a uuid or magic number) to an unopened/unmounted store. + * The default implementation writes this to a plaintext file in the + * path. + * + * A newline is appended. + * + * @param key key name (e.g., "fsid") + * @param value value (e.g., a uuid rendered as a string) + * @returns 0 for success, or an error code + */ + virtual int write_meta(const std::string& key, + const std::string& value); + + /** + * read_meta - read a simple configuration key out-of-band + * + * Read a simple key value to an unopened/mounted store. + * + * Trailing whitespace is stripped off. + * + * @param key key name + * @param value pointer to value string + * @returns 0 for success, or an error code + */ + virtual int read_meta(const std::string& key, + std::string *value); + + /** + * get ideal max value for collection_list() + * + * default to some arbitrary values; the implementation will override. + */ + virtual int get_ideal_list_max() { return 64; } + + + /** + * get a collection handle + * + * Provide a trivial handle as a default to avoid converting legacy + * implementations. + */ + virtual CollectionHandle open_collection(const coll_t &cid) = 0; + + /** + * get a collection handle for a soon-to-be-created collection + * + * This handle must be used by queue_transaction that includes a + * create_collection call in order to become valid. It will become the + * reference to the created collection. + */ + virtual CollectionHandle create_new_collection(const coll_t &cid) = 0; + + /** + * set ContextQueue for a collection + * + * After that, oncommits of Transaction will queue into commit_queue. + * And osd ShardThread will call oncommits. + */ + virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0; + + /** + * Synchronous read operations + */ + + /** + * exists -- Test for existance of object + * + * @param cid collection for object + * @param oid oid of object + * @returns true if object exists, false otherwise + */ + virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0; + /** + * set_collection_opts -- set pool options for a collectioninformation for an object + * + * @param cid collection + * @param opts new collection options + * @returns 0 on success, negative error code on failure. + */ + virtual int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) = 0; + + /** + * stat -- get information for an object + * + * @param cid collection for object + * @param oid oid of object + * @param st output information for the object + * @param allow_eio if false, assert on -EIO operation failure + * @returns 0 on success, negative error code on failure. + */ + virtual int stat( + CollectionHandle &c, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false) = 0; + /** + * read -- read a byte range of data from an object + * + * Note: if reading from an offset past the end of the object, we + * return 0 (not, say, -EINVAL). + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output bufferlist + * @param op_flags is CEPH_OSD_OP_FLAG_* + * @returns number of bytes read on success, or negative error code on failure. + */ + virtual int read( + CollectionHandle &c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0) = 0; + + /** + * fiemap -- get extent map of data of an object + * + * Returns an encoded map of the extents of an object's data portion + * (map<offset,size>). + * + * A non-enlightened implementation is free to return the extent (offset, len) + * as the sole extent. + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output bufferlist for extent map information. + * @returns 0 on success, negative error code on failure. + */ + virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, + uint64_t offset, size_t len, bufferlist& bl) = 0; + virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, + uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) = 0; + + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. + */ + virtual int getattr(CollectionHandle &c, const ghobject_t& oid, + const char *name, bufferptr& value) = 0; + + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. + */ + int getattr( + CollectionHandle &c, const ghobject_t& oid, + const string& name, bufferlist& value) { + bufferptr bp; + int r = getattr(c, oid, name.c_str(), bp); + value.push_back(bp); + return r; + } + + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @returns 0 on success, negative error code on failure. + */ + virtual int getattrs(CollectionHandle &c, const ghobject_t& oid, + map<string,bufferptr>& aset) = 0; + + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @returns 0 on success, negative error code on failure. + */ + int getattrs(CollectionHandle &c, const ghobject_t& oid, + map<string,bufferlist>& aset) { + map<string,bufferptr> bmap; + int r = getattrs(c, oid, bmap); + for (map<string,bufferptr>::iterator i = bmap.begin(); + i != bmap.end(); + ++i) { + aset[i->first].append(i->second); + } + return r; + } + + + // collections + + /** + * list_collections -- get all of the collections known to this ObjectStore + * + * @param ls list of the collections in sorted order. + * @returns 0 on success, negative error code on failure. + */ + virtual int list_collections(vector<coll_t>& ls) = 0; + + /** + * does a collection exist? + * + * @param c collection + * @returns true if it exists, false otherwise + */ + virtual bool collection_exists(const coll_t& c) = 0; + + /** + * is a collection empty? + * + * @param c collection + * @param empty true if the specified collection is empty, false otherwise + * @returns 0 on success, negative error code on failure. + */ + virtual int collection_empty(CollectionHandle& c, bool *empty) = 0; + + /** + * return the number of significant bits of the coll_t::pgid. + * + * This should return what the last create_collection or split_collection + * set. A legacy backend may return -EAGAIN if the value is unavailable + * (because we upgraded from an older version, e.g., FileStore). + */ + virtual int collection_bits(CollectionHandle& c) = 0; + + + /** + * list contents of a collection that fall in the range [start, end) and no more than a specified many result + * + * @param c collection + * @param start list object that sort >= this value + * @param end list objects that sort < this value + * @param max return no more than this many results + * @param seq return no objects with snap < seq + * @param ls [out] result + * @param next [out] next item sorts >= this value + * @return zero on success, or negative error + */ + virtual int collection_list(CollectionHandle &c, + const ghobject_t& start, const ghobject_t& end, + int max, + vector<ghobject_t> *ls, ghobject_t *next) = 0; + + virtual int collection_list_legacy(CollectionHandle &c, + const ghobject_t& start, + const ghobject_t& end, int max, + std::vector<ghobject_t> *ls, + ghobject_t *next) { + return collection_list(c, start, end, max, ls, next); + } + + /// OMAP + /// Get omap contents + virtual int omap_get( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) = 0; + + /// Get omap header + virtual int omap_get_header( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio = false ///< [in] don't assert on eio + ) = 0; + + /// Get keys defined on oid + virtual int omap_get_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) = 0; + + /// Get key values + virtual int omap_get_values( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) = 0; + + /// Filters keys into out which are defined on oid + virtual int omap_check_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) = 0; + + /** + * Returns an object map iterator + * + * Warning! The returned iterator is an implicit lock on filestore + * operations in c. Do not use filestore methods on c while the returned + * iterator is live. (Filling in a transaction is no problem). + * + * @return iterator, null on error + */ + virtual ObjectMap::ObjectMapIterator get_omap_iterator( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) = 0; + + virtual int flush_journal() { return -EOPNOTSUPP; } + + virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; } + + virtual int snapshot(const string& name) { return -EOPNOTSUPP; } + + /** + * Set and get internal fsid for this instance. No external data is modified + */ + virtual void set_fsid(uuid_d u) = 0; + virtual uuid_d get_fsid() = 0; + + /** + * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store + * - num objects - total (including witeouts) object count to measure used space for. + */ + virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0; + + + // DEBUG + virtual void inject_data_error(const ghobject_t &oid) {} + virtual void inject_mdata_error(const ghobject_t &oid) {} + + virtual void compact() {} + virtual bool has_builtin_csum() const { + return false; + } +}; +WRITE_CLASS_ENCODER(ObjectStore::Transaction) +WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData) + +ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx); + +#endif |