From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/os/ObjectStore.h | 777 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 777 insertions(+) create mode 100644 src/os/ObjectStore.h (limited to 'src/os/ObjectStore.h') diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h new file mode 100644 index 000000000..bba8627f5 --- /dev/null +++ b/src/os/ObjectStore.h @@ -0,0 +1,777 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_OBJECTSTORE_H +#define CEPH_OBJECTSTORE_H + +#include "include/buffer.h" +#include "include/common_fwd.h" +#include "include/Context.h" +#include "include/interval_set.h" +#include "include/stringify.h" +#include "include/types.h" + +#include "osd/osd_types.h" +#include "common/TrackedOp.h" +#include "common/WorkQueue.h" +#include "ObjectMap.h" +#include "os/Transaction.h" + +#include +#include +#include +#include + +#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) || defined(_WIN32) +#include +#else +#include /* or */ +#endif + +namespace ceph { + class Formatter; +} + +/* + * low-level interface to the local OSD file system + */ + +class Logger; +class ContextQueue; + +static inline void encode(const std::map *attrset, ceph::buffer::list &bl) { + using ceph::encode; + encode(*attrset, bl); +} + +// Flag bits +typedef uint32_t osflagbits_t; +const int SKIP_JOURNAL_REPLAY = 1 << 0; +const int SKIP_MOUNT_OMAP = 1 << 1; + +class ObjectStore { +protected: + std::string path; + +public: + using Transaction = ceph::os::Transaction; + + CephContext* cct; + /** + * create - create an ObjectStore instance. + * + * This is invoked once at initialization time. + * + * @param type type of store. This is a std::string from the configuration file. + * @param data path (or other descriptor) for data + * @param journal path (or other descriptor) for journal (optional) + * @param flags which filestores should check if applicable + */ + static ObjectStore *create(CephContext *cct, + const std::string& type, + const std::string& data, + const std::string& journal, + osflagbits_t flags = 0); + + /** + * probe a block device to learn the uuid of the owning OSD + * + * @param cct cct + * @param path path to device + * @param fsid [out] osd uuid + */ + static int probe_block_device_fsid( + CephContext *cct, + const std::string& path, + uuid_d *fsid); + + /** + * Fetch Object Store statistics. + * + * Currently only latency of write and apply times are measured. + * + * This appears to be called with nothing locked. + */ + virtual objectstore_perf_stat_t get_cur_stats() = 0; + + /** + * Fetch Object Store performance counters. + * + * + * This appears to be called with nothing locked. + */ + virtual const PerfCounters* get_perf_counters() const = 0; + + /** + * a collection also orders transactions + * + * Any transactions queued under a given collection will be applied in + * sequence. Transactions queued under different collections may run + * in parallel. + * + * ObjectStore users may get collection handles with open_collection() (or, + * for bootstrapping a new collection, create_new_collection()). + */ + struct CollectionImpl : public RefCountedObject { + const coll_t cid; + + /// wait for any queued transactions to apply + // block until any previous transactions are visible. specifically, + // collection_list and collection_empty need to reflect prior operations. + virtual void flush() = 0; + + /** + * Async flush_commit + * + * There are two cases: + * 1) collection is currently idle: the method returns true. c is + * not touched. + * 2) collection is not idle: the method returns false and c is + * called asynchronously with a value of 0 once all transactions + * queued on this collection prior to the call have been applied + * and committed. + */ + virtual bool flush_commit(Context *c) = 0; + + const coll_t &get_cid() { + return cid; + } + protected: + CollectionImpl() = delete; + CollectionImpl(CephContext* cct, const coll_t& c) : RefCountedObject(cct), cid(c) {} + ~CollectionImpl() = default; + }; + using CollectionHandle = ceph::ref_t; + + + /********************************* + * + * Object Contents and semantics + * + * All ObjectStore objects are identified as a named object + * (ghobject_t and hobject_t) in a named collection (coll_t). + * ObjectStore operations support the creation, mutation, deletion + * and enumeration of objects within a collection. Enumeration is + * in sorted key order (where keys are sorted by hash). Object names + * are globally unique. + * + * Each object has four distinct parts: byte data, xattrs, omap_header + * and omap entries. + * + * The data portion of an object is conceptually equivalent to a + * file in a file system. Random and Partial access for both read + * and write operations is required. The ability to have a sparse + * implementation of the data portion of an object is beneficial for + * some workloads, but not required. There is a system-wide limit on + * the maximum size of an object, which is typically around 100 MB. + * + * Xattrs are equivalent to the extended attributes of file + * systems. Xattrs are a std::set of key/value pairs. Sub-value access + * is not required. It is possible to enumerate the std::set of xattrs in + * key order. At the implementation level, xattrs are used + * exclusively internal to Ceph and the implementer can expect the + * total size of all of the xattrs on an object to be relatively + * small, i.e., less than 64KB. Much of Ceph assumes that accessing + * xattrs on temporally adjacent object accesses (recent past or + * near future) is inexpensive. + * + * omap_header is a single blob of data. It can be read or written + * in total. + * + * Omap entries are conceptually the same as xattrs + * but in a different address space. In other words, you can have + * the same key as an xattr and an omap entry and they have distinct + * values. Enumeration of xattrs doesn't include omap entries and + * vice versa. The size and access characteristics of omap entries + * are very different from xattrs. In particular, the value portion + * of an omap entry can be quite large (MBs). More importantly, the + * interface must support efficient range queries on omap entries even + * when there are a large numbers of entries. + * + *********************************/ + + /******************************* + * + * Collections + * + * A collection is simply a grouping of objects. Collections have + * names (coll_t) and can be enumerated in order. Like an + * individual object, a collection also has a std::set of xattrs. + * + * + */ + + + int queue_transaction(CollectionHandle& ch, + Transaction&& t, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) { + std::vector tls; + tls.push_back(std::move(t)); + return queue_transactions(ch, tls, op, handle); + } + + virtual int queue_transactions( + CollectionHandle& ch, std::vector& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) = 0; + + + public: + ObjectStore(CephContext* cct, + const std::string& path_) : path(path_), cct(cct) {} + virtual ~ObjectStore() {} + + // no copying + explicit ObjectStore(const ObjectStore& o) = delete; + const ObjectStore& operator=(const ObjectStore& o) = delete; + + // versioning + virtual int upgrade() { + return 0; + } + + virtual void get_db_statistics(ceph::Formatter *f) { } + virtual void generate_db_histogram(ceph::Formatter *f) { } + virtual int flush_cache(std::ostream *os = NULL) { return -1; } + virtual void dump_perf_counters(ceph::Formatter *f) {} + virtual void dump_cache_stats(ceph::Formatter *f) {} + virtual void dump_cache_stats(std::ostream& os) {} + + virtual std::string get_type() = 0; + + // mgmt + virtual bool test_mount_in_use() = 0; + virtual int mount() = 0; + virtual int umount() = 0; + virtual int fsck(bool deep) { + return -EOPNOTSUPP; + } + virtual int repair(bool deep) { + return -EOPNOTSUPP; + } + virtual int quick_fix() { + return -EOPNOTSUPP; + } + + virtual void set_cache_shards(unsigned num) { } + + /** + * Returns 0 if the hobject is valid, -error otherwise + * + * Errors: + * -ENAMETOOLONG: locator/namespace/name too large + */ + virtual int validate_hobject_key(const hobject_t &obj) const = 0; + + virtual unsigned get_max_attr_name_length() = 0; + virtual int mkfs() = 0; // wipe + virtual int mkjournal() = 0; // journal only + virtual bool needs_journal() = 0; //< requires a journal + virtual bool wants_journal() = 0; //< prefers a journal + virtual bool allows_journal() = 0; //< allows a journal + + // return store min allocation size, if applicable + virtual uint64_t get_min_alloc_size() const { + return 0; + } + + /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda) + virtual int get_devices(std::set *devls) { + return -EOPNOTSUPP; + } + + /// true if a txn is readable immediately after it is queued. + virtual bool is_sync_onreadable() const { + return true; + } + + /** + * is_rotational + * + * Check whether store is backed by a rotational (HDD) or non-rotational + * (SSD) device. + * + * This must be usable *before* the store is mounted. + * + * @return true for HDD, false for SSD + */ + virtual bool is_rotational() { + return true; + } + + /** + * is_journal_rotational + * + * Check whether journal is backed by a rotational (HDD) or non-rotational + * (SSD) device. + * + * + * @return true for HDD, false for SSD + */ + virtual bool is_journal_rotational() { + return true; + } + + virtual std::string get_default_device_class() { + return is_rotational() ? "hdd" : "ssd"; + } + + virtual int get_numa_node( + int *numa_node, + std::set *nodes, + std::set *failed) { + return -EOPNOTSUPP; + } + + + virtual bool can_sort_nibblewise() { + return false; // assume a backend cannot, unless it says otherwise + } + + virtual int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) = 0; + virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, + bool *per_pool_omap) = 0; + + virtual void collect_metadata(std::map *pm) { } + + /** + * write_meta - write a simple configuration key out-of-band + * + * Write a simple key/value pair for basic store configuration + * (e.g., a uuid or magic number) to an unopened/unmounted store. + * The default implementation writes this to a plaintext file in the + * path. + * + * A newline is appended. + * + * @param key key name (e.g., "fsid") + * @param value value (e.g., a uuid rendered as a std::string) + * @returns 0 for success, or an error code + */ + virtual int write_meta(const std::string& key, + const std::string& value); + + /** + * read_meta - read a simple configuration key out-of-band + * + * Read a simple key value to an unopened/mounted store. + * + * Trailing whitespace is stripped off. + * + * @param key key name + * @param value pointer to value std::string + * @returns 0 for success, or an error code + */ + virtual int read_meta(const std::string& key, + std::string *value); + + /** + * get ideal max value for collection_list() + * + * default to some arbitrary values; the implementation will override. + */ + virtual int get_ideal_list_max() { return 64; } + + + /** + * get a collection handle + * + * Provide a trivial handle as a default to avoid converting legacy + * implementations. + */ + virtual CollectionHandle open_collection(const coll_t &cid) = 0; + + /** + * get a collection handle for a soon-to-be-created collection + * + * This handle must be used by queue_transaction that includes a + * create_collection call in order to become valid. It will become the + * reference to the created collection. + */ + virtual CollectionHandle create_new_collection(const coll_t &cid) = 0; + + /** + * std::set ContextQueue for a collection + * + * After that, oncommits of Transaction will queue into commit_queue. + * And osd ShardThread will call oncommits. + */ + virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0; + + /** + * Synchronous read operations + */ + + /** + * exists -- Test for existance of object + * + * @param cid collection for object + * @param oid oid of object + * @returns true if object exists, false otherwise + */ + virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0; + /** + * set_collection_opts -- std::set pool options for a collectioninformation for an object + * + * @param cid collection + * @param opts new collection options + * @returns 0 on success, negative error code on failure. + */ + virtual int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) = 0; + + /** + * stat -- get information for an object + * + * @param cid collection for object + * @param oid oid of object + * @param st output information for the object + * @param allow_eio if false, assert on -EIO operation failure + * @returns 0 on success, negative error code on failure. + */ + virtual int stat( + CollectionHandle &c, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false) = 0; + /** + * read -- read a byte range of data from an object + * + * Note: if reading from an offset past the end of the object, we + * return 0 (not, say, -EINVAL). + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output ceph::buffer::list + * @param op_flags is CEPH_OSD_OP_FLAG_* + * @returns number of bytes read on success, or negative error code on failure. + */ + virtual int read( + CollectionHandle &c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + ceph::buffer::list& bl, + uint32_t op_flags = 0) = 0; + + /** + * fiemap -- get extent std::map of data of an object + * + * Returns an encoded std::map of the extents of an object's data portion + * (std::map). + * + * A non-enlightened implementation is free to return the extent (offset, len) + * as the sole extent. + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output ceph::buffer::list for extent std::map information. + * @returns 0 on success, negative error code on failure. + */ + virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, + uint64_t offset, size_t len, ceph::buffer::list& bl) = 0; + virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, + uint64_t offset, size_t len, std::map& destmap) = 0; + + /** + * readv -- read specfic intervals from an object; + * caller must call fiemap to fill in the extent-map first. + * + * Note: if reading from an offset past the end of the object, we + * return 0 (not, say, -EINVAL). Also the default version of readv + * reads each extent separately synchronously, which can become horribly + * inefficient if the physical layout of the pushing object get massively + * fragmented and hence should be overridden by any real os that + * cares about the performance.. + * + * @param cid collection for object + * @param oid oid of object + * @param m intervals to be read + * @param bl output ceph::buffer::list + * @param op_flags is CEPH_OSD_OP_FLAG_* + * @returns number of bytes read on success, or negative error code on failure. + */ + virtual int readv( + CollectionHandle &c, + const ghobject_t& oid, + interval_set& m, + ceph::buffer::list& bl, + uint32_t op_flags = 0) { + int total = 0; + for (auto p = m.begin(); p != m.end(); p++) { + ceph::buffer::list t; + int r = read(c, oid, p.get_start(), p.get_len(), t, op_flags); + if (r < 0) + return r; + total += r; + // prune fiemap, if necessary + if (p.get_len() != t.length()) { + auto save = p++; + if (t.length() == 0) { + m.erase(save); // Remove this empty interval + } else { + save.set_len(t.length()); // fix interval length + bl.claim_append(t); + } + // Remove any other follow-up intervals present too + while (p != m.end()) { + save = p++; + m.erase(save); + } + break; + } + bl.claim_append(t); + } + return total; + } + + /** + * dump_onode -- dumps onode metadata in human readable form, + intended primiarily for debugging + * + * @param cid collection for object + * @param oid oid of object + * @param section_name section name to create and print under + * @param f Formatter class instance to print to + * @returns 0 on success, negative error code on failure. + */ + virtual int dump_onode( + CollectionHandle &c, + const ghobject_t& oid, + const std::string& section_name, + ceph::Formatter *f) { + return -ENOTSUP; + } + + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. + */ + virtual int getattr(CollectionHandle &c, const ghobject_t& oid, + const char *name, ceph::buffer::ptr& value) = 0; + + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. + */ + int getattr( + CollectionHandle &c, const ghobject_t& oid, + const std::string& name, ceph::buffer::list& value) { + ceph::buffer::ptr bp; + int r = getattr(c, oid, name.c_str(), bp); + value.push_back(bp); + return r; + } + + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @returns 0 on success, negative error code on failure. + */ + virtual int getattrs(CollectionHandle &c, const ghobject_t& oid, + std::map& aset) = 0; + + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @returns 0 on success, negative error code on failure. + */ + int getattrs(CollectionHandle &c, const ghobject_t& oid, + std::map& aset) { + std::map bmap; + int r = getattrs(c, oid, bmap); + for (auto i = bmap.begin(); i != bmap.end(); ++i) { + aset[i->first].append(i->second); + } + return r; + } + + + // collections + + /** + * list_collections -- get all of the collections known to this ObjectStore + * + * @param ls std::list of the collections in sorted order. + * @returns 0 on success, negative error code on failure. + */ + virtual int list_collections(std::vector& ls) = 0; + + /** + * does a collection exist? + * + * @param c collection + * @returns true if it exists, false otherwise + */ + virtual bool collection_exists(const coll_t& c) = 0; + + /** + * is a collection empty? + * + * @param c collection + * @param empty true if the specified collection is empty, false otherwise + * @returns 0 on success, negative error code on failure. + */ + virtual int collection_empty(CollectionHandle& c, bool *empty) = 0; + + /** + * return the number of significant bits of the coll_t::pgid. + * + * This should return what the last create_collection or split_collection + * std::set. A legacy backend may return -EAGAIN if the value is unavailable + * (because we upgraded from an older version, e.g., FileStore). + */ + virtual int collection_bits(CollectionHandle& c) = 0; + + + /** + * std::list contents of a collection that fall in the range [start, end) and no more than a specified many result + * + * @param c collection + * @param start list object that sort >= this value + * @param end list objects that sort < this value + * @param max return no more than this many results + * @param seq return no objects with snap < seq + * @param ls [out] result + * @param next [out] next item sorts >= this value + * @return zero on success, or negative error + */ + virtual int collection_list(CollectionHandle &c, + const ghobject_t& start, const ghobject_t& end, + int max, + std::vector *ls, ghobject_t *next) = 0; + + virtual int collection_list_legacy(CollectionHandle &c, + const ghobject_t& start, + const ghobject_t& end, int max, + std::vector *ls, + ghobject_t *next) { + return collection_list(c, start, end, max, ls, next); + } + + /// OMAP + /// Get omap contents + virtual int omap_get( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + std::map *out /// < [out] Key to value std::map + ) = 0; + + /// Get omap header + virtual int omap_get_header( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + bool allow_eio = false ///< [in] don't assert on eio + ) = 0; + + /// Get keys defined on oid + virtual int omap_get_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + std::set *keys ///< [out] Keys defined on oid + ) = 0; + + /// Get key values + virtual int omap_get_values( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to get + std::map *out ///< [out] Returned keys and values + ) = 0; + +#ifdef WITH_SEASTAR + virtual int omap_get_values( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::optional &start_after, ///< [in] Keys to get + std::map *out ///< [out] Returned keys and values + ) = 0; +#endif + + /// Filters keys into out which are defined on oid + virtual int omap_check_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to check + std::set *out ///< [out] Subset of keys defined on oid + ) = 0; + + /** + * Returns an object map iterator + * + * Warning! The returned iterator is an implicit lock on filestore + * operations in c. Do not use filestore methods on c while the returned + * iterator is live. (Filling in a transaction is no problem). + * + * @return iterator, null on error + */ + virtual ObjectMap::ObjectMapIterator get_omap_iterator( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) = 0; + + virtual int flush_journal() { return -EOPNOTSUPP; } + + virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; } + + virtual int snapshot(const std::string& name) { return -EOPNOTSUPP; } + + /** + * Set and get internal fsid for this instance. No external data is modified + */ + virtual void set_fsid(uuid_d u) = 0; + virtual uuid_d get_fsid() = 0; + + /** + * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store + * - num objects - total (including witeouts) object count to measure used space for. + */ + virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0; + + + // DEBUG + virtual void inject_data_error(const ghobject_t &oid) {} + virtual void inject_mdata_error(const ghobject_t &oid) {} + + virtual void compact() {} + virtual bool has_builtin_csum() const { + return false; + } +}; + +#endif -- cgit v1.2.3