summaryrefslogtreecommitdiffstats
path: root/src/include
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/include
parentInitial commit. (diff)
downloadceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--src/include/CMakeLists.txt35
-rw-r--r--src/include/CompatSet.h273
-rw-r--r--src/include/Context.h502
-rw-r--r--src/include/Distribution.h73
-rw-r--r--src/include/addr_parsing.h28
-rw-r--r--src/include/alloc_ptr.h91
-rw-r--r--src/include/any.h704
-rw-r--r--src/include/bitmapper.h48
-rw-r--r--src/include/blobhash.h47
-rw-r--r--src/include/btree_map.h63
-rw-r--r--src/include/buffer.h1331
-rw-r--r--src/include/buffer_fwd.h19
-rw-r--r--src/include/buffer_raw.h127
-rw-r--r--src/include/byteorder.h109
-rw-r--r--src/include/ceph_assert.h147
-rw-r--r--src/include/ceph_features.h279
-rw-r--r--src/include/ceph_frag.h109
-rw-r--r--src/include/ceph_fs.h982
-rw-r--r--src/include/ceph_fuse.h32
-rw-r--r--src/include/ceph_hash.h14
-rw-r--r--src/include/cephfs/ceph_ll_client.h144
-rwxr-xr-xsrc/include/cephfs/libcephfs.h1869
-rw-r--r--src/include/cmp.h205
-rw-r--r--src/include/color.h13
-rw-r--r--src/include/compact_map.h383
-rw-r--r--src/include/compact_set.h305
-rw-r--r--src/include/compat.h198
-rw-r--r--src/include/config-h.in.cmake366
-rw-r--r--src/include/coredumpctl.h105
-rw-r--r--src/include/counter.h56
-rw-r--r--src/include/cpp-btree/btree.h2396
-rw-r--r--src/include/cpp-btree/btree_container.h349
-rw-r--r--src/include/cpp-btree/btree_map.h130
-rw-r--r--src/include/cpp-btree/btree_set.h121
-rw-r--r--src/include/crc32c.h57
-rw-r--r--src/include/demangle.h48
-rw-r--r--src/include/denc.h1724
-rw-r--r--src/include/elist.h193
-rw-r--r--src/include/encoding.h1505
-rw-r--r--src/include/err.h29
-rw-r--r--src/include/error.h41
-rw-r--r--src/include/event_type.h24
-rw-r--r--src/include/filepath.h247
-rw-r--r--src/include/frag.h602
-rw-r--r--src/include/fs_types.h126
-rw-r--r--src/include/hash.h64
-rw-r--r--src/include/health.h70
-rw-r--r--src/include/inline_memory.h150
-rw-r--r--src/include/int_types.h65
-rw-r--r--src/include/intarith.h193
-rw-r--r--src/include/interval_set.h783
-rw-r--r--src/include/ipaddr.h48
-rw-r--r--src/include/krbd.h97
-rw-r--r--src/include/linux_fiemap.h73
-rw-r--r--src/include/lru.h243
-rw-r--r--src/include/mempool.h547
-rw-r--r--src/include/msgr.h254
-rw-r--r--src/include/object.h214
-rw-r--r--src/include/on_exit.h49
-rw-r--r--src/include/page.h18
-rw-r--r--src/include/rados.h681
l---------src/include/rados/buffer.h1
l---------src/include/rados/buffer_fwd.h1
l---------src/include/rados/crc32c.h1
l---------src/include/rados/inline_memory.h1
-rw-r--r--src/include/rados/librados.h4015
-rw-r--r--src/include/rados/librados.hpp1468
-rw-r--r--src/include/rados/librados_fwd.hpp32
-rw-r--r--src/include/rados/librgw.h36
-rw-r--r--src/include/rados/objclass.h177
l---------src/include/rados/page.h1
-rw-r--r--src/include/rados/rados_types.h29
-rw-r--r--src/include/rados/rados_types.hpp331
-rw-r--r--src/include/rados/rgw_file.h384
-rw-r--r--src/include/radosstriper/libradosstriper.h610
-rw-r--r--src/include/radosstriper/libradosstriper.hpp241
-rw-r--r--src/include/random.h289
-rw-r--r--src/include/rangeset.h250
-rw-r--r--src/include/rbd/features.h102
-rw-r--r--src/include/rbd/librbd.h1243
-rw-r--r--src/include/rbd/librbd.hpp686
-rw-r--r--src/include/rbd/object_map_types.h13
-rw-r--r--src/include/rbd_types.h159
-rw-r--r--src/include/rgw/librgw_admin_user.h63
-rw-r--r--src/include/scope_guard.h47
-rw-r--r--src/include/sock_compat.h43
-rw-r--r--src/include/spinlock.h92
-rw-r--r--src/include/stat.h145
-rw-r--r--src/include/statlite.h72
-rw-r--r--src/include/str_list.h129
-rw-r--r--src/include/str_map.h148
-rw-r--r--src/include/stringify.h33
-rw-r--r--src/include/timegm.h79
-rw-r--r--src/include/types.h604
-rw-r--r--src/include/unordered_map.h11
-rw-r--r--src/include/unordered_set.h10
-rw-r--r--src/include/util.h106
-rw-r--r--src/include/utime.h579
-rw-r--r--src/include/uuid.h83
-rw-r--r--src/include/xlist.h224
100 files changed, 32336 insertions, 0 deletions
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt
new file mode 100644
index 00000000..39cdc6b2
--- /dev/null
+++ b/src/include/CMakeLists.txt
@@ -0,0 +1,35 @@
+install(FILES
+ rados/librados.h
+ rados/rados_types.h
+ rados/rados_types.hpp
+ rados/librados_fwd.hpp
+ rados/librados.hpp
+ buffer.h
+ buffer_fwd.h
+ inline_memory.h
+ page.h
+ crc32c.h
+ rados/objclass.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
+if(WITH_LIBRADOSSTRIPER)
+ install(FILES
+ radosstriper/libradosstriper.h
+ radosstriper/libradosstriper.hpp
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/radosstriper)
+endif()
+
+if(WITH_RBD)
+ install(FILES
+ rbd/features.h
+ rbd/librbd.h
+ rbd/librbd.hpp
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rbd)
+endif()
+
+if(WITH_RADOSGW)
+ install(FILES
+ rados/librgw.h
+ rados/rgw_file.h
+ rgw/librgw_admin_user.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
+endif()
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
new file mode 100644
index 00000000..a9e15f76
--- /dev/null
+++ b/src/include/CompatSet.h
@@ -0,0 +1,273 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMPATSET_H
+#define CEPH_COMPATSET_H
+
+#include <iostream>
+#include <map>
+#include <string>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+
+struct CompatSet {
+
+ struct Feature {
+ uint64_t id;
+ std::string name;
+
+ Feature(uint64_t _id, const std::string& _name) : id(_id), name(_name) {}
+ };
+
+ class FeatureSet {
+ uint64_t mask;
+ std::map<uint64_t, std::string> names;
+
+ public:
+ friend struct CompatSet;
+ friend class CephCompatSet_AllSet_Test;
+ friend class CephCompatSet_other_Test;
+ friend class CephCompatSet_merge_Test;
+ friend std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs);
+ friend std::ostream& operator<<(std::ostream& out, const CompatSet& compat);
+ FeatureSet() : mask(1), names() {}
+ void insert(const Feature& f) {
+ ceph_assert(f.id > 0);
+ ceph_assert(f.id < 64);
+ mask |= ((uint64_t)1<<f.id);
+ names[f.id] = f.name;
+ }
+
+ bool contains(const Feature& f) const {
+ return names.count(f.id);
+ }
+ bool contains(uint64_t f) const {
+ return names.count(f);
+ }
+ /**
+ * Getter used instead of names[] so lookups work on a const FeatureSet
+ */
+ std::string get_name(uint64_t const f) const {
+ std::map<uint64_t, std::string>::const_iterator i = names.find(f);
+ ceph_assert(i != names.end());
+ return i->second;
+ }
+
+ void remove(uint64_t f) {
+ if (names.count(f)) {
+ names.erase(f);
+ mask &= ~((uint64_t)1<<f);
+ }
+ }
+ void remove(const Feature& f) {
+ remove(f.id);
+ }
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ /* See below, mask always has the lowest bit set in memory, but
+ * unset in the encoding */
+ encode(mask & (~(uint64_t)1), bl);
+ encode(names, bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(mask, bl);
+ decode(names, bl);
+ /**
+ * Previously, there was a bug where insert did
+ * mask |= f.id rather than mask |= (1 << f.id).
+ * In FeatureSets from those versions, mask always
+ * has the lowest bit set. Since then, masks always
+ * have the lowest bit unset.
+ *
+ * When we encounter such a FeatureSet, we have to
+ * reconstruct the mask from the names map.
+ */
+ if (mask & 1) {
+ mask = 1;
+ std::map<uint64_t, std::string> temp_names;
+ temp_names.swap(names);
+ for (auto i = temp_names.begin(); i != temp_names.end(); ++i) {
+ insert(Feature(i->first, i->second));
+ }
+ } else {
+ mask |= 1;
+ }
+ }
+
+ void dump(Formatter *f) const {
+ for (auto p = names.cbegin(); p != names.cend(); ++p) {
+ char s[18];
+ snprintf(s, sizeof(s), "feature_%llu", (unsigned long long)p->first);
+ f->dump_string(s, p->second);
+ }
+ }
+ };
+
+ // These features have no impact on the read / write status
+ FeatureSet compat;
+ // If any of these features are missing, read is possible ( as long
+ // as no incompat feature is missing ) but it is not possible to write
+ FeatureSet ro_compat;
+ // If any of these features are missing, read or write is not possible
+ FeatureSet incompat;
+
+ CompatSet(FeatureSet& _compat, FeatureSet& _ro_compat, FeatureSet& _incompat) :
+ compat(_compat), ro_compat(_ro_compat), incompat(_incompat) {}
+
+ CompatSet() : compat(), ro_compat(), incompat() { }
+
+
+ /* does this filesystem implementation have the
+ features required to read the other? */
+ bool readable(CompatSet const& other) const {
+ return !((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ }
+
+ /* does this filesystem implementation have the
+ features required to write the other? */
+ bool writeable(CompatSet const& other) const {
+ return readable(other) &&
+ !((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ }
+
+ /* Compare this CompatSet to another, using the feature masks only.
+ * CAREFULLY NOTE: This operation is NOT commutative.
+ * a > b DOES NOT imply that b < a.
+ * It returns:
+ * 0: The CompatSets have the same feature set.
+ * 1: This CompatSet's features are a strict superset of the other's.
+ * -1: This CompatSet is missing at least one feature
+ * described in the other. It may still have more features, though.
+ */
+ int compare(const CompatSet& other) const {
+   if ((other.compat.mask == compat.mask) &&
+       (other.ro_compat.mask == ro_compat.mask) &&
+       (other.incompat.mask == incompat.mask))
+     return 0;
+   // Not identical. writeable() already guarantees that we cover the
+   // other's incompat and ro_compat sets, so only compat is left to check.
+   const uint64_t missing_compat = other.compat.mask & ~compat.mask;
+   if (writeable(other) && !missing_compat)
+     return 1;
+   return -1; // not writeable, or a compat feature of other is missing
+ }
+
+ /* Get the features supported by other CompatSet but not this one,
+ * as a CompatSet. Neither this CompatSet nor other is modified.
+ */
+ CompatSet unsupported(const CompatSet& other) const {
+   CompatSet diff;
+   // Copy into `out` every feature that `theirs` has and `ours` lacks.
+   // Names are looked up with find(): the previous names[id] lookup
+   // inserted an empty entry into other's map when a name was absent.
+   // A mask bit without a name still yields a feature with an empty name.
+   auto collect = [](const FeatureSet& ours, const FeatureSet& theirs,
+                     FeatureSet& out) {
+     const uint64_t missing = theirs.mask & ~ours.mask;
+     for (int id = 1; id < 64; ++id) {
+       if (!(missing & ((uint64_t)1 << id)))
+         continue;
+       auto it = theirs.names.find(id);
+       out.insert(Feature(id, it == theirs.names.end() ? std::string()
+                                                        : it->second));
+     }
+   };
+   collect(compat, other.compat, diff.compat);
+   collect(ro_compat, other.ro_compat, diff.ro_compat);
+   collect(incompat, other.incompat, diff.incompat);
+   return diff;
+ }
+
+ /* Merge features supported by other CompatSet into this one.
+ * Return: true if some features were merged
+ */
+ bool merge(CompatSet const & other) {
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ if (!other_compat && !other_ro_compat && !other_incompat)
+ return false;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ compat.insert( Feature(id, other.compat.get_name(id)));
+ }
+ if (mask & other_ro_compat) {
+ ro_compat.insert(Feature(id, other.ro_compat.get_name(id)));
+ }
+ if (mask & other_incompat) {
+ incompat.insert( Feature(id, other.incompat.get_name(id)));
+ }
+ }
+ return true;
+ }
+
+ void encode(bufferlist& bl) const {
+ compat.encode(bl);
+ ro_compat.encode(bl);
+ incompat.encode(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ compat.decode(bl);
+ ro_compat.decode(bl);
+ incompat.decode(bl);
+ }
+
+ void dump(Formatter *f) const {
+ f->open_object_section("compat");
+ compat.dump(f);
+ f->close_section();
+ f->open_object_section("ro_compat");
+ ro_compat.dump(f);
+ f->close_section();
+ f->open_object_section("incompat");
+ incompat.dump(f);
+ f->close_section();
+ }
+
+ static void generate_test_instances(std::list<CompatSet*>& o) {
+ o.push_back(new CompatSet);
+ o.push_back(new CompatSet);
+ o.back()->compat.insert(Feature(1, "one"));
+ o.back()->compat.insert(Feature(2, "two"));
+ o.back()->ro_compat.insert(Feature(4, "four"));
+ o.back()->incompat.insert(Feature(3, "three"));
+ }
+};
+WRITE_CLASS_ENCODER(CompatSet)
+
+using ceph::operator <<;
+inline std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs)
+{
+ return out << fs.names;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const CompatSet& compat)
+{
+ return out << "compat=" << compat.compat
+ << ",rocompat=" << compat.ro_compat
+ << ",incompat=" << compat.incompat;
+}
+
+#endif
diff --git a/src/include/Context.h b/src/include/Context.h
new file mode 100644
index 00000000..b588b0f1
--- /dev/null
+++ b/src/include/Context.h
@@ -0,0 +1,502 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CONTEXT_H
+#define CEPH_CONTEXT_H
+
+#include "common/dout.h"
+
+#include <boost/function.hpp>
+#include <list>
+#include <set>
+#include <memory>
+
+#include "include/ceph_assert.h"
+#include "common/Mutex.h"
+
+#define mydout(cct, v) lgeneric_subdout(cct, context, v)
+
+/*
+ * GenContext - abstract callback class
+ */
+template <typename T>
+class GenContext {
+ GenContext(const GenContext& other);
+ const GenContext& operator=(const GenContext& other);
+
+ protected:
+ virtual void finish(T t) = 0;
+
+ public:
+ GenContext() {}
+ virtual ~GenContext() {} // we want a virtual destructor!!!
+
+ template <typename C>
+ void complete(C &&t) {
+ finish(std::forward<C>(t));
+ delete this;
+ }
+};
+
+template <typename T>
+using GenContextURef = std::unique_ptr<GenContext<T> >;
+
+/*
+ * Context - abstract callback class
+ */
+class Finisher;
+class Context {
+ Context(const Context& other);
+ const Context& operator=(const Context& other);
+
+ protected:
+ virtual void finish(int r) = 0;
+
+ // variant of finish that is safe to call "synchronously." override should
+ // return true.
+ virtual bool sync_finish(int r) {
+ return false;
+ }
+
+ public:
+ Context() {}
+ virtual ~Context() {} // we want a virtual destructor!!!
+ virtual void complete(int r) {
+ finish(r);
+ delete this;
+ }
+ virtual bool sync_complete(int r) {
+ if (sync_finish(r)) {
+ delete this;
+ return true;
+ }
+ return false;
+ }
+};
+
+/**
+ * Simple context holding a single object
+ */
+template<class T>
+class ContainerContext : public Context {
+ T obj;
+public:
+ ContainerContext(T &obj) : obj(obj) {}
+ void finish(int r) override {}
+};
+template <typename T>
+ContainerContext<T> *make_container_context(T &&t) {
+ return new ContainerContext<T>(std::forward<T>(t));
+}
+
+template <class T>
+struct Wrapper : public Context {
+ Context *to_run;
+ T val;
+ Wrapper(Context *to_run, T val) : to_run(to_run), val(val) {}
+ void finish(int r) override {
+ if (to_run)
+ to_run->complete(r);
+ }
+};
+struct RunOnDelete {
+ Context *to_run;
+ RunOnDelete(Context *to_run) : to_run(to_run) {}
+ ~RunOnDelete() {
+ if (to_run)
+ to_run->complete(0);
+ }
+};
+typedef std::shared_ptr<RunOnDelete> RunOnDeleteRef;
+
+template <typename T>
+struct LambdaContext : public Context {
+ T t;
+ LambdaContext(T &&t) : t(std::forward<T>(t)) {}
+ void finish(int) override {
+ t();
+ }
+};
+template <typename T>
+LambdaContext<T> *make_lambda_context(T &&t) {
+ return new LambdaContext<T>(std::move(t));
+}
+
+template <typename F, typename T>
+struct LambdaGenContext : GenContext<T> {
+ F f;
+ LambdaGenContext(F &&f) : f(std::forward<F>(f)) {}
+ void finish(T t) override {
+ f(std::forward<T>(t));
+ }
+};
+template <typename T, typename F>
+GenContextURef<T> make_gen_lambda_context(F &&f) {
+ return GenContextURef<T>(new LambdaGenContext<F, T>(std::move(f)));
+}
+
+/*
+ * finish and destroy a list of Contexts
+ */
+template<class C>
+inline void finish_contexts(CephContext *cct, C& finished, int result = 0)
+{
+ if (finished.empty())
+ return;
+
+ C ls;
+ ls.swap(finished); // swap out of place to avoid weird loops
+
+ if (cct)
+ mydout(cct,10) << ls.size() << " contexts to finish with " << result << dendl;
+ for (Context* c : ls) {
+ if (cct)
+ mydout(cct,10) << "---- " << c << dendl;
+ c->complete(result);
+ }
+}
+
+class C_NoopContext : public Context {
+public:
+ void finish(int r) override { }
+};
+
+
+struct C_Lock : public Context {
+ Mutex *lock;
+ Context *fin;
+ C_Lock(Mutex *l, Context *c) : lock(l), fin(c) {}
+ ~C_Lock() override {
+ delete fin;
+ }
+ void finish(int r) override {
+ if (fin) {
+ lock->Lock();
+ fin->complete(r);
+ fin = NULL;
+ lock->Unlock();
+ }
+ }
+};
+
+/*
+ * C_Contexts - set of Contexts
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructible.
+ */
+template <class ContextType, class ContextInstanceType, class Container = std::list<ContextType *>>
+class C_ContextsBase : public ContextInstanceType {
+public:
+ CephContext *cct;
+ Container contexts;
+
+ C_ContextsBase(CephContext *cct_)
+ : cct(cct_)
+ {
+ }
+ ~C_ContextsBase() override {
+ for (auto c : contexts) {
+ delete c;
+ }
+ }
+ void add(ContextType* c) {
+ contexts.push_back(c);
+ }
+ void take(Container& ls) {
+ Container c;
+ c.swap(ls);
+ if constexpr (std::is_same_v<Container, std::list<ContextType *>>) {
+ contexts.splice(contexts.end(), c);
+ } else {
+ contexts.insert(contexts.end(), c.begin(), c.end());
+ }
+ }
+ void complete(int r) override {
+ // Neuter any ContextInstanceType custom complete(), because although
+ // I want to look like it, I don't actually want to run its code.
+ Context::complete(r);
+ }
+ void finish(int r) override {
+ finish_contexts(cct, contexts, r);
+ }
+ bool empty() { return contexts.empty(); }
+
+ template<class C>
+ static ContextType *list_to_context(C& cs) {
+ if (cs.size() == 0) {
+ return 0;
+ } else if (cs.size() == 1) {
+ ContextType *c = cs.front();
+ cs.clear();
+ return c;
+ } else {
+ C_ContextsBase<ContextType, ContextInstanceType> *c(new C_ContextsBase<ContextType, ContextInstanceType>(0));
+ c->take(cs);
+ return c;
+ }
+ }
+};
+
+typedef C_ContextsBase<Context, Context> C_Contexts;
+
+/*
+ * C_Gather
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructible.
+ *
+ * BUG:? only reports the error from the first sub to return one; later errors are dropped
+ */
+template <class ContextType, class ContextInstanceType>
+class C_GatherBase {
+private:
+ CephContext *cct;
+ int result;
+ ContextType *onfinish;
+#ifdef DEBUG_GATHER
+ std::set<ContextType*> waitfor;
+#endif
+ int sub_created_count;
+ int sub_existing_count;
+ mutable Mutex lock;
+ bool activated;
+
+ void sub_finish(ContextType* sub, int r) {
+ lock.Lock();
+#ifdef DEBUG_GATHER
+ ceph_assert(waitfor.count(sub));
+ waitfor.erase(sub);
+#endif
+ --sub_existing_count;
+ mydout(cct,10) << "C_GatherBase " << this << ".sub_finish(r=" << r << ") " << sub
+#ifdef DEBUG_GATHER
+ << " (remaining " << waitfor << ")"
+#endif
+ << dendl;
+ if (r < 0 && result == 0)
+ result = r;
+ if ((activated == false) || (sub_existing_count != 0)) {
+ lock.Unlock();
+ return;
+ }
+ lock.Unlock();
+ delete_me();
+ }
+
+ void delete_me() {
+ if (onfinish) {
+ onfinish->complete(result);
+ onfinish = 0;
+ }
+ delete this;
+ }
+
+ class C_GatherSub : public ContextInstanceType {
+ C_GatherBase *gather;
+ public:
+ C_GatherSub(C_GatherBase *g) : gather(g) {}
+ void complete(int r) override {
+ // Cancel any customized complete() functionality
+ // from the Context subclass we're templated for,
+ // we only want to hit that in onfinish, not at each
+ // sub finish. e.g. MDSInternalContext.
+ Context::complete(r);
+ }
+ void finish(int r) override {
+ gather->sub_finish(this, r);
+ gather = 0;
+ }
+ ~C_GatherSub() override {
+ if (gather)
+ gather->sub_finish(this, 0);
+ }
+ };
+
+public:
+ C_GatherBase(CephContext *cct_, ContextType *onfinish_)
+ : cct(cct_), result(0), onfinish(onfinish_),
+ sub_created_count(0), sub_existing_count(0),
+ lock("C_GatherBase::lock", true, false), //disable lockdep
+ activated(false)
+ {
+ mydout(cct,10) << "C_GatherBase " << this << ".new" << dendl;
+ }
+ ~C_GatherBase() {
+ mydout(cct,10) << "C_GatherBase " << this << ".delete" << dendl;
+ }
+ void set_finisher(ContextType *onfinish_) {
+ Mutex::Locker l(lock);
+ ceph_assert(!onfinish);
+ onfinish = onfinish_;
+ }
+ void activate() {
+ lock.Lock();
+ ceph_assert(activated == false);
+ activated = true;
+ if (sub_existing_count != 0) {
+ lock.Unlock();
+ return;
+ }
+ lock.Unlock();
+ delete_me();
+ }
+ ContextType *new_sub() {
+ Mutex::Locker l(lock);
+ ceph_assert(activated == false);
+ sub_created_count++;
+ sub_existing_count++;
+ ContextType *s = new C_GatherSub(this);
+#ifdef DEBUG_GATHER
+ waitfor.insert(s);
+#endif
+ mydout(cct,10) << "C_GatherBase " << this << ".new_sub is " << sub_created_count << " " << s << dendl;
+ return s;
+ }
+
+ inline int get_sub_existing_count() const {
+ Mutex::Locker l(lock);
+ return sub_existing_count;
+ }
+
+ inline int get_sub_created_count() const {
+ Mutex::Locker l(lock);
+ return sub_created_count;
+ }
+};
+
+/*
+ * The C_GatherBuilder remembers each C_Context created by
+ * C_GatherBuilder.new_sub() in a C_Gather. When a C_Context created
+ * by new_sub() is complete(), C_Gather forgets about it. When
+ * C_GatherBuilder notices that there are no C_Context left in
+ * C_Gather, it calls complete() on the C_Context provided as the
+ * second argument of the constructor (finisher).
+ *
+ * How to use C_GatherBuilder:
+ *
+ * 1. Create a C_GatherBuilder on the stack
+ * 2. Call gather_bld.new_sub() as many times as you want to create new subs
+ * It is safe to call this 0 times, or 100, or anything in between.
+ * 3. If you didn't supply a finisher in the C_GatherBuilder constructor,
+ * set one with gather_bld.set_finisher(my_finisher)
+ * 4. Call gather_bld.activate()
+ *
+ * Example:
+ *
+ * C_SaferCond all_done;
+ * C_GatherBuilder gb(g_ceph_context, &all_done);
+ * j.submit_entry(1, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * j.submit_entry(2, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * gb.activate(); // consume C_Context as soon as they complete()
+ * all_done.wait(); // all_done is complete() after all new_sub() are complete()
+ *
+ * The finisher may be called at any point after step 4, including immediately
+ * from the activate() function.
+ * The finisher will never be called before activate().
+ *
+ * Note: Currently, subs must be manually freed by the caller (for some reason.)
+ */
+template <class ContextType, class GatherType>
+class C_GatherBuilderBase
+{
+public:
+ C_GatherBuilderBase(CephContext *cct_)
+ : cct(cct_), c_gather(NULL), finisher(NULL), activated(false)
+ {
+ }
+ C_GatherBuilderBase(CephContext *cct_, ContextType *finisher_)
+ : cct(cct_), c_gather(NULL), finisher(finisher_), activated(false)
+ {
+ }
+ ~C_GatherBuilderBase() {
+ if (c_gather) {
+ ceph_assert(activated); // Don't forget to activate your C_Gather!
+ }
+ else {
+ delete finisher;
+ }
+ }
+ ContextType *new_sub() {
+ if (!c_gather) {
+ c_gather = new GatherType(cct, finisher);
+ }
+ return c_gather->new_sub();
+ }
+ void activate() {
+ if (!c_gather)
+ return;
+ ceph_assert(finisher != NULL);
+ activated = true;
+ c_gather->activate();
+ }
+ void set_finisher(ContextType *finisher_) {
+ finisher = finisher_;
+ if (c_gather)
+ c_gather->set_finisher(finisher);
+ }
+ GatherType *get() const {
+ return c_gather;
+ }
+ bool has_subs() const {
+ return (c_gather != NULL);
+ }
+ int num_subs_created() {
+ ceph_assert(!activated);
+ if (c_gather == NULL)
+ return 0;
+ return c_gather->get_sub_created_count();
+ }
+ int num_subs_remaining() {
+ ceph_assert(!activated);
+ if (c_gather == NULL)
+ return 0;
+ return c_gather->get_sub_existing_count();
+ }
+
+private:
+ CephContext *cct;
+ GatherType *c_gather;
+ ContextType *finisher;
+ bool activated;
+};
+
+typedef C_GatherBase<Context, Context> C_Gather;
+typedef C_GatherBuilderBase<Context, C_Gather > C_GatherBuilder;
+
+class FunctionContext : public Context {
+public:
+ FunctionContext(boost::function<void(int)> &&callback)
+ : m_callback(std::move(callback))
+ {
+ }
+
+ void finish(int r) override {
+ m_callback(r);
+ }
+private:
+ boost::function<void(int)> m_callback;
+};
+
+template <class ContextType>
+class ContextFactory {
+public:
+ virtual ~ContextFactory() {}
+ virtual ContextType *build() = 0;
+};
+
+#undef mydout
+
+#endif
diff --git a/src/include/Distribution.h b/src/include/Distribution.h
new file mode 100644
index 00000000..e4f0b30b
--- /dev/null
+++ b/src/include/Distribution.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_DISTRIBUTION_H
+#define CEPH_DISTRIBUTION_H
+
+#include <vector>
+
+class Distribution {
+  // Parallel arrays: value v[i] carries weight p[i].
+  std::vector<float> p;
+  std::vector<int> v;
+
+ public:
+  // Number of (value, weight) entries.
+  unsigned get_width() const {
+    return p.size();
+  }
+
+  void clear() {
+    p.clear();
+    v.clear();
+  }
+  // Append value val with weight pr.
+  void add(int val, float pr) {
+    p.push_back(pr);
+    v.push_back(val);
+  }
+  // Replace every weight with a rand()-derived one, then divide by their sum.
+  void random() {
+    float sum = 0.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      p[i] = (float)(rand() % 10000);
+      sum += p[i];
+    }
+    for (unsigned i=0; i<p.size(); i++)
+      p[i] /= sum;
+  }
+  // Draw a value by walking the weights cumulatively; ceph_abort() is
+  // reached when no entry matches (e.g. the weights sum to less than s).
+  int sample() {
+    float s = (float)(rand() % 10000) / 10000.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      if (s < p[i]) return v[i];
+      s -= p[i];
+    }
+    ceph_abort();
+    return v[p.size() - 1]; // hmm. :/
+  }
+  // Divide every weight by the current sum; returns that sum.
+  float normalize() {
+    float s = 0.0;
+    for (unsigned i=0; i<p.size(); i++)
+      s += p[i];
+    for (unsigned i=0; i<p.size(); i++)
+      p[i] /= s;
+    return s;
+  }
+};
+
+#endif
diff --git a/src/include/addr_parsing.h b/src/include/addr_parsing.h
new file mode 100644
index 00000000..c205ac75
--- /dev/null
+++ b/src/include/addr_parsing.h
@@ -0,0 +1,28 @@
+/*
+ * addr_parsing.h
+ *
+ * Created on: Sep 14, 2010
+ * Author: gregf
+ * contains functions used by Ceph to convert named addresses
+ * (e.g. ceph.com) into IP addresses (e.g. 127.0.0.1).
+ */
+
+#ifndef ADDR_PARSING_H_
+#define ADDR_PARSING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int safe_cat(char **pstr, int *plen, int pos, const char *str2);
+
+/*
+ * returns a string allocated by malloc; caller must free
+ */
+char *resolve_addrs(const char *orig_str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ADDR_PARSING_H_ */
diff --git a/src/include/alloc_ptr.h b/src/include/alloc_ptr.h
new file mode 100644
index 00000000..258c5833
--- /dev/null
+++ b/src/include/alloc_ptr.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ALLOC_PTR_H
+#define CEPH_ALLOC_PTR_H
+
+#include <memory>
+
+template <class T>
+class alloc_ptr
+{
+public:
+  typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer;
+  typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type;
+  // Wraps std::unique_ptr; get(), * and -> allocate a default element if empty.
+  alloc_ptr() : ptr() {}
+  // Forwards u to the std::unique_ptr constructor.
+  template<class U>
+  alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {}
+  // Move-only: the special members name alloc_ptr itself, not alloc_ptr<pointer>.
+  alloc_ptr(alloc_ptr&& rhs) noexcept : ptr(std::move(rhs.ptr)) {}
+  alloc_ptr(const alloc_ptr& rhs) = delete;
+  alloc_ptr& operator=(alloc_ptr&& rhs) noexcept {
+    ptr = std::move(rhs.ptr);
+    return *this;
+  }
+  // A std::unique_ptr cannot be copied, so copy assignment is deleted too.
+  alloc_ptr& operator=(const alloc_ptr& rhs) = delete;
+
+  void swap (alloc_ptr& rhs) noexcept {
+    ptr.swap(rhs.ptr);
+  }
+  element_type* release() {
+    return ptr.release();
+  }
+  void reset(element_type *p = nullptr) {
+    ptr.reset(p);
+  }
+  element_type* get() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return ptr.get();
+  }
+  element_type& operator*() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return *ptr;
+  }
+  element_type* operator->() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return ptr.get();
+  }
+  operator bool() const {
+    return !!ptr;
+  }
+  // Comparisons act on the pointees; dereferencing allocates an empty side.
+  friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::less<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::less_equal<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::greater<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::greater_equal<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs == *rhs;
+  }
+  friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs != *rhs;
+  }
+private:
+  mutable std::unique_ptr<element_type> ptr;
+};
+
+#endif
diff --git a/src/include/any.h b/src/include/any.h
new file mode 100644
index 00000000..da59c88f
--- /dev/null
+++ b/src/include/any.h
@@ -0,0 +1,704 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef INCLUDE_STATIC_ANY
+#define INCLUDE_STATIC_ANY
+
+#include <any>
+#include <cstddef>
+#include <initializer_list>
+#include <memory>
+#include <typeinfo>
+#include <type_traits>
+
+#include <boost/smart_ptr/shared_ptr.hpp>
+#include <boost/smart_ptr/make_shared.hpp>
+
+namespace ceph {
+
+namespace _any {
+
+// Shared Functionality
+// --------------------
+//
+// Common implementation details. Most functionality is here. We
+// assume that destructors do not throw. Some of them might and
+// they'll invoke terminate and that's fine.
+//
+// We are using the Curiously Recurring Template Pattern! We require
+// that all classes inheriting from us provide:
+//
+// - `static constexpr size_t capacity`: Maximum capacity. No object
+// larger than this may be
+// stored. `dynamic` for dynamic.
+// - `void* ptr() const noexcept`: returns a pointer to storage.
+// (`alloc_storage` must have been called.
+// `free_storage` must not have been called
+// since.)
+// - `void* alloc_storage(const std::size_t)`: allocate storage
+// - `void free_storage() noexcept`: free storage. Must be idempotent.
+//
+// We provide most of the public interface, as well as the operator function,
+// cast_helper, and the type() call.
+
+// Set `capacity` to this value to indicate that there is no fixed
+// capacity.
+//
+inline constexpr std::size_t dynamic = ~0; // ~0 converted to std::size_t, i.e. its largest value
+
+// Driver Function
+// ---------------
+//
+// The usual type-erasure control function trick. This one is simpler
+// than usual since we punt on moving and copying. We could dispense
+// with this and just store a deleter and a pointer to a typeinfo, but
+// that would be twice the space.
+//
+// Moved out here so the type of `func_t` isn't dependent on the
+// enclosing class.
+//
+enum class op { type, destroy }; // requests handled by op_func: report the type_info / run the destructor
+// Type-erased driver for a stored object of type T.
+//
+// - op::type:    `p` must point at a `const std::type_info*`, which is set
+//                to the address of typeid(T).
+// - op::destroy: `p` must point at a live T, whose destructor is run in
+//                place (the storage itself is not released).
+template<typename T>
+inline void op_func(const op o, void* p) noexcept {
+  static const std::type_info& type = typeid(T);
+  if (o == op::type) {
+    auto out = reinterpret_cast<const std::type_info**>(p);
+    *out = &type;
+  } else if (o == op::destroy) {
+    auto obj = reinterpret_cast<T*>(p);
+    obj->~T();
+  }
+}
+using func_t = void (*)(const op, void* p) noexcept; // pointer type matching every op_func<T> instantiation
+
+// The base class
+// --------------
+//
+// The `storage_t` parameter gives the type of the value that manages
+// storage and allocation. We use it to create a protected data member
+// (named `storage`). This allows us to sidestep an initialization
+// order problem, where exposed constructors were trying to allocate
+// or free storage *before* the data members of the derived class
+// were initialized.
+//
+// Making storage_t a member type of the derived class won't work, due
+// to C++'s rules for nested types being *horrible*. Just downright
+// *horrible*.
+//
+// CRTP base class shared by the any-types in this header.
+//
+// `D` is the derived any-type; it supplies `capacity`, `ptr()`,
+// `alloc_storage()` and `free_storage()`. `storage_t` is the type of the
+// protected `storage` member those hooks manage. This class implements
+// value and in-place construction/assignment, `emplace`, `reset`, `swap`,
+// `has_value` and `type`, and befriends `cast_helper` for `any_cast`.
+//
+template<typename D, typename storage_t>
+class base {
+  // Make definitions from the derived class visible
+  // -----------------------------------------------
+  //
+  // And check that they fit the requirements. At least those that are
+  // statically checkable.
+  //
+  static constexpr std::size_t capacity = D::capacity;
+
+  void* ptr() const noexcept {
+    static_assert(
+      noexcept(static_cast<const D*>(this)->ptr()) &&
+      std::is_same_v<decltype(static_cast<const D*>(this)->ptr()), void*>,
+      "‘void* ptr() const noexcept’ missing from superclass");
+    return static_cast<const D*>(this)->ptr();
+  }
+
+  void* alloc_storage(const std::size_t z) {
+    static_assert(
+      std::is_same_v<decltype(static_cast<D*>(this)->alloc_storage(z)), void*>,
+      "‘void* alloc_storage(const size_t)’ missing from superclass.");
+    return static_cast<D*>(this)->alloc_storage(z);
+  }
+
+  void free_storage() noexcept {
+    static_assert(
+      noexcept(static_cast<D*>(this)->free_storage()) &&
+      std::is_void_v<decltype(static_cast<D*>(this)->free_storage())>,
+      "‘void free_storage() noexcept’ missing from superclass.");
+    static_cast<D*>(this)->free_storage();
+  }
+
+
+  // Pile O' Templates
+  // -----------------
+  //
+  // These are just verbose and better typed once than twice. They're
+  // used for SFINAE and declaring noexcept.
+  //
+  template<class T>
+  struct is_in_place_type_helper : std::false_type {};
+  template<class T>
+  struct is_in_place_type_helper<std::in_place_type_t<T>> : std::true_type {};
+
+  template<class T>
+  static constexpr bool is_in_place_type_v =
+    is_in_place_type_helper<std::decay_t<T>>::value;
+
+  // SFINAE condition for value initialized
+  // constructors/assigners. This is analogous to the standard's
+  // requirement that this overload only participate in overload
+  // resolution if std::decay_t<T> is not the same type as the
+  // any-type, nor a specialization of std::in_place_type_t
+  //
+  template<typename T>
+  using value_condition_t = std::enable_if_t<
+    !std::is_same_v<std::decay_t<T>, D> &&
+    !is_in_place_type_v<std::decay_t<T>>>;
+
+  // This `noexcept` condition for value construction lets
+  // `immobile_any`'s value constructor/assigner be noexcept, so long
+  // as the type's copy or move constructor cooperates.
+  //
+  template<typename T>
+  static constexpr bool value_noexcept_v =
+    std::is_nothrow_constructible_v<std::decay_t<T>, T> && capacity != dynamic;
+
+  // SFINAE condition for in-place constructors/assigners
+  //
+  template<typename T, typename... Args>
+  using in_place_condition_t = std::enable_if_t<std::is_constructible_v<
+    std::decay_t<T>, Args...>>;
+
+  // Analogous to the above. Give noexcept to immobile_any::emplace
+  // when possible.
+  //
+  template<typename T, typename... Args>
+  static constexpr bool in_place_noexcept_v =
+    std::is_nothrow_constructible_v<std::decay_t<T>, Args...> &&
+    capacity != dynamic;
+
+private:
+
+  // Functionality!
+  // --------------
+
+  // The driver function for the currently stored object. Whether this
+  // is null is the canonical way to know whether an instance has a
+  // value.
+  //
+  func_t func = nullptr;
+
+  // Construct an object within ourselves. Callers must have emptied
+  // *this first (every call site either starts from a fresh object or
+  // calls reset()).
+  //
+  // `func` is published only after the object has been constructed. If
+  // allocation or the constructor throws there is no object to destroy,
+  // so we merely release the storage and stay empty; running the driver's
+  // destroy operation at that point would invoke a destructor on memory
+  // that never held a Td (or on a null pointer if allocation failed).
+  //
+  template<typename T, typename ...Args>
+  std::decay_t<T>& construct(Args&& ...args) {
+    using Td = std::decay_t<T>;
+    static_assert(capacity == dynamic || sizeof(Td) <= capacity,
+                  "Supplied type is too large for this specialization.");
+    try {
+      Td* const obj =
+        new (alloc_storage(sizeof(Td))) Td(std::forward<Args>(args)...);
+      func = &op_func<Td>;
+      return *obj;
+    } catch (...) {
+      func = nullptr;
+      free_storage();
+      throw;
+    }
+  }
+
+protected:
+
+  // We hold the storage, even if the derived class manipulates it,
+  // so that its default initialization comes soon enough for us to
+  // use it in our constructors.
+  //
+  storage_t storage;
+
+public:
+
+  base() noexcept = default;
+  ~base() noexcept {
+    reset();
+  }
+
+protected:
+  // Since some of our derived classes /can/ be copied or moved.
+  //
+  // NOTE(review): copying duplicates `func` and (when storage_t is copy
+  // assignable) `storage`, and every instance runs reset() on
+  // destruction. With a shared storage_t this looks like the stored
+  // object's destructor runs once per copy -- confirm the intended
+  // semantics.
+  //
+  base(const base& rhs) noexcept : func(rhs.func) {
+    if constexpr (std::is_copy_assignable_v<storage_t>) {
+      storage = rhs.storage;
+    }
+  }
+  base& operator =(const base& rhs) noexcept {
+    reset();
+    func = rhs.func;
+    if constexpr (std::is_copy_assignable_v<storage_t>) {
+      storage = rhs.storage;
+    }
+    return *this;
+  }
+
+  // Moving steals the driver and the storage and leaves rhs empty.
+  //
+  base(base&& rhs) noexcept : func(std::move(rhs.func)) {
+    if constexpr (std::is_move_assignable_v<storage_t>) {
+      storage = std::move(rhs.storage);
+    }
+    rhs.func = nullptr;
+  }
+  base& operator =(base&& rhs) noexcept {
+    reset();
+    func = rhs.func;
+    if constexpr (std::is_move_assignable_v<storage_t>) {
+      storage = std::move(rhs.storage);
+    }
+    rhs.func = nullptr;
+    return *this;
+  }
+
+public:
+
+  // Value construct/assign
+  // ----------------------
+  //
+  template<typename T,
+           typename = value_condition_t<T>>
+  base(T&& t) noexcept(value_noexcept_v<T>) {
+    construct<T>(std::forward<T>(t));
+  }
+
+  // On exception, *this is set to empty.
+  //
+  template<typename T,
+           typename = value_condition_t<T>>
+  base& operator =(T&& t) noexcept(value_noexcept_v<T>) {
+    reset();
+    construct<T>(std::forward<T>(t));
+    return *this;
+  }
+
+  // In-place construct/assign
+  // -------------------------
+  //
+  // I really hate the way the C++ standard library treats references
+  // as if they were stepchildren in a Charles Dickens novel. I am
+  // quite upset that std::optional lacks a specialization for
+  // references. There's no legitimate reason for it. The whole
+  // 're-seat or refuse' debate is simply a canard. The optional is
+  // effectively a container, so of course it can be emptied or
+  // reassigned. No, pointers are not an acceptable substitute. A
+  // pointer gives an address in memory which may be null and which
+  // may represent an object or may be a location in which an object is
+  // to be created. An optional reference, on the other hand, is a
+  // reference to an initialized, live object or /empty/. This is an
+  // obvious difference that should be communicable to any programmer
+  // reading the code through the type system.
+  //
+  // `std::any`, even in the case of in-place construction,
+  // only stores the decayed type. I suspect this was to get around
+  // the question of whether, for a std::any holding a T&,
+  // std::any_cast<T> should return a copy or throw
+  // std::bad_any_cast.
+  //
+  // I think the appropriate response in that case would be to make a
+  // copy if the type supports it and fail otherwise. Once a concrete
+  // type is known the problem solves itself.
+  //
+  // If one were inclined, one could easily load the driver function
+  // with a heavy subset of the type traits (those that depend only on
+  // the type in question) and simply /ask/ whether it's a reference.
+  //
+  // At the moment, I'm maintaining compatibility with the standard
+  // library except for copy/move semantics.
+  //
+  template<typename T,
+           typename... Args,
+           typename = in_place_condition_t<T, Args...>>
+  base(std::in_place_type_t<T>,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, Args...>) {
+    construct<T>(std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  // The constraint checks constructibility from `Args...` (not default
+  // constructibility), so types without a default constructor can be
+  // emplaced and unsuitable argument lists are rejected by overload
+  // resolution.
+  //
+  template<typename T,
+           typename... Args,
+           typename = in_place_condition_t<T, Args...>>
+  std::decay_t<T>& emplace(Args&& ...args) noexcept(in_place_noexcept_v<
+                                                    T, Args...>) {
+    reset();
+    return construct<T>(std::forward<Args>(args)...);
+  }
+
+  template<typename T,
+           typename U,
+           typename... Args,
+           typename = in_place_condition_t<T, std::initializer_list<U>,
+                                           Args...>>
+  base(std::in_place_type_t<T>,
+       std::initializer_list<U> i,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, std::initializer_list<U>,
+                                Args...>) {
+    construct<T>(i, std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  template<typename T,
+           typename U,
+           typename... Args,
+           typename = in_place_condition_t<T, std::initializer_list<U>,
+                                           Args...>>
+  std::decay_t<T>& emplace(std::initializer_list<U> i,
+                           Args&& ...args) noexcept(in_place_noexcept_v<T,
+                                                    std::initializer_list<U>,
+                                                    Args...>) {
+    reset();
+    return construct<T>(i,std::forward<Args>(args)...);
+  }
+
+  // Empty ourselves, using the derived class to free any storage.
+  // The stored object (if any) is destroyed first; free_storage() is
+  // called unconditionally, which is why it must be idempotent.
+  //
+  void reset() noexcept {
+    if (has_value()) {
+      func(op::destroy, ptr());
+      func = nullptr;
+    }
+    free_storage();
+  }
+
+  // Exchange contents with rhs. Only participates in overload
+  // resolution when the storage type is swappable; the condition is
+  // phrased in terms of the template parameter `U` so that it is a
+  // genuine SFINAE constraint.
+  //
+  template<typename U = storage_t,
+           typename = std::enable_if_t<std::is_swappable_v<U>>>
+  void swap(base& rhs) {
+    using std::swap;
+    swap(func, rhs.func);
+    swap(storage, rhs.storage);
+  }
+
+  // All other functions should use this function to test emptiness
+  // rather than examining `func` directly.
+  //
+  bool has_value() const noexcept {
+    return !!func;
+  }
+
+  // Returns the type of the value stored, if any; typeid(void) when
+  // empty.
+  //
+  const std::type_info& type() const noexcept {
+    if (has_value()) {
+      const std::type_info* t;
+      func(op::type, reinterpret_cast<void*>(&t));
+      return *t;
+    } else {
+      return typeid(void);
+    }
+  }
+
+  template<typename T, typename U, typename V>
+  friend inline void* cast_helper(const base<U, V>& b) noexcept;
+};
+
+// Function used by all `any_cast` functions
+//
+// Returns a void* to the contents if they exist and match the
+// requested type, otherwise `nullptr`.
+//
+template<typename T, typename U, typename V>
+inline void* cast_helper(const base<U, V>& b) noexcept {
+  // Empty: nothing to hand out.
+  if (!b.func) {
+    return nullptr;
+  }
+  // Cheap check first: identical driver function means identical type.
+  // Fall back to comparing type_info only when the pointers differ.
+  const bool same_driver = (&op_func<T> == b.func);
+  if (same_driver || (b.type() == typeid(T))) {
+    return b.ptr();
+  }
+  return nullptr;
+}
+}
+
+// `any_cast`
+// ==========
+//
+// Just the usual gamut of `any_cast` overloads. These get a bit
+// repetitive and it would be nice to think of a way to collapse them
+// down a bit.
+//
+
+// The pointer pair!
+//
+// Pointer form: yields a pointer to the stored T, or nullptr when `a` is
+// null, empty, or holds a different type. Never throws.
+template<typename T, typename U, typename V>
+inline T* any_cast(_any::base<U, V>* a) noexcept {
+  return a ? static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a))
+           : nullptr;
+}
+
+// Const pointer form: yields a pointer-to-const to the stored T, or
+// nullptr when `a` is null, empty, or holds a different type.
+template<typename T, typename U, typename V>
+inline const T* any_cast(const _any::base<U, V>* a) noexcept {
+  return a ? static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a))
+           : nullptr;
+}
+
+// While we disallow copying the immobile any itself, we can allow
+// anything with an extracted value that the type supports.
+//
+// Lvalue form: returns the stored value converted to T (a reference or a
+// copy). Throws std::bad_any_cast when `a` is empty or holds another type.
+template<typename T, typename U, typename V>
+inline T any_cast(_any::base<U, V>& a) {
+  static_assert(std::is_reference_v<T> ||
+                std::is_copy_constructible_v<T>,
+                "The supplied type must be either a reference or "
+                "copy constructible.");
+  if (auto p = any_cast<std::decay_t<T>>(&a)) {
+    return static_cast<T>(*p);
+  }
+  throw std::bad_any_cast();
+}
+
+// Const lvalue form: returns the stored value converted to T (a reference
+// or a copy). Throws std::bad_any_cast when `a` is empty or holds another
+// type.
+template<typename T, typename U, typename V>
+inline T any_cast(const _any::base<U, V>& a) {
+  static_assert(std::is_reference_v<T> ||
+                std::is_copy_constructible_v<T>,
+                "The supplied type must be either a reference or "
+                "copy constructible.");
+  if (auto p = any_cast<std::decay_t<T>>(&a)) {
+    return static_cast<T>(*p);
+  }
+  throw std::bad_any_cast();
+}
+
+// Rvalue form for T that is not an rvalue reference: moves the stored
+// value out. Throws std::bad_any_cast when `a` is empty or holds another
+// type.
+template<typename T, typename U, typename V>
+inline std::enable_if_t<(std::is_move_constructible_v<T> ||
+                         std::is_copy_constructible_v<T>) &&
+                        !std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+  if (auto p = any_cast<std::decay_t<T>>(&a)) {
+    return std::move((*p));
+  }
+  throw std::bad_any_cast();
+}
+
+// Rvalue form for T that is an rvalue reference: casts the stored value to
+// T. Throws std::bad_any_cast when `a` is empty or holds another type.
+template<typename T, typename U, typename V>
+inline std::enable_if_t<std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+  if (auto p = any_cast<std::decay_t<T>>(&a)) {
+    return static_cast<T>(*p);
+  }
+  throw std::bad_any_cast();
+}
+
+// `immobile_any`
+// ==============
+//
+// Sometimes, uncopyable objects exist and I want to do things with
+// them. The C++ standard library is really quite keen on insisting
+// things be copyable before it deigns to work. I find this annoying.
+//
+// Also, the allocator, while useful, is really not considerate of
+// other people's time. Every time we go to visit it, it takes us
+// quite an awfully long time to get away again. As such, I've been
+// trying to avoid its company whenever it is convenient and seemly.
+//
+// We accept any type that will fit in the declared capacity. You may
+// store types with throwing destructors, but terminate will be
+// invoked when they throw.
+//
+template<std::size_t S>
+class immobile_any : public _any::base<immobile_any<S>,
+                                       std::aligned_storage_t<S>> {
+  using base = _any::base<immobile_any<S>, std::aligned_storage_t<S>>;
+  friend base;
+
+  using base::storage;
+
+  // Hooks required by the base class
+  // --------------------------------
+  //
+  // The storage is an in-object buffer of S bytes, so "allocating" just
+  // hands back its address and "freeing" does nothing.
+  //
+  static constexpr std::size_t capacity = S;
+
+  static_assert(capacity != _any::dynamic,
+                "That is not a valid size for an immobile_any.");
+
+  void* ptr() const noexcept {
+    const void* raw = &storage;
+    return const_cast<void*>(raw);
+  }
+  void* alloc_storage(std::size_t) noexcept {
+    return ptr();
+  }
+  void free_storage() noexcept {}
+
+public:
+
+  immobile_any() noexcept = default;
+
+  // Neither copyable nor movable: the value lives inside this object.
+  immobile_any(const immobile_any&) = delete;
+  immobile_any& operator =(const immobile_any&) = delete;
+  immobile_any(immobile_any&&) = delete;
+  immobile_any& operator =(immobile_any&&) = delete;
+
+  // Value / in-place constructors and value assignment come from the base.
+  using base::base;
+  using base::operator =;
+
+  void swap(immobile_any&) = delete;
+};
+
+// Build an immobile_any<S> holding a T constructed in place from `args`.
+// The result is returned as a prvalue, since immobile_any cannot be moved.
+template<typename T, std::size_t S, typename... Args>
+inline auto make_immobile_any(Args&& ...args) -> immobile_any<S> {
+  return immobile_any<S>(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// As above, for a T constructed from an initializer list followed by
+// `args`.
+template<typename T, std::size_t S, typename U, typename... Args>
+inline auto make_immobile_any(std::initializer_list<U> i, Args&& ...args)
+  -> immobile_any<S> {
+  return immobile_any<S>(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `unique_any`
+// ============
+//
+// Oh dear. Now we're getting back into allocation. You don't think
+// the allocator noticed all those mean things we said about it, do
+// you?
+//
+// Well. Okay, allocator. Sometimes when it's the middle of the night
+// and you're writing template code you say things you don't exactly
+// mean. If it weren't for you, we wouldn't have any memory to run all
+// our programs in at all. Really, I'm just being considerate of
+// *your* needs, trying to avoid having to run to you every time we
+// instantiate a type, making a few that can be self-sufficient…uh…
+//
+// **Anyway**, this is movable but not copyable, as you should expect
+// from anything with ‘unique’ in the name.
+//
+class unique_any : public _any::base<unique_any, std::unique_ptr<std::byte[]>> {
+  using base = _any::base<unique_any, std::unique_ptr<std::byte[]>>;
+  friend base;
+
+  using base::storage;
+
+  // Hooks required by the base class
+  // --------------------------------
+  //
+  // Our storage is a single chunk of RAM owned by a
+  // `std::unique_ptr`.
+  //
+  static constexpr std::size_t capacity = _any::dynamic;
+
+  // Address of the owned buffer; null when nothing is allocated.
+  void* ptr() const noexcept {
+    return static_cast<void*>(storage.get());
+  }
+
+  // Replace the buffer with a fresh one of `z` bytes (any previous buffer
+  // is released by unique_ptr::reset). May throw if `new` fails.
+  void* alloc_storage(const std::size_t z) {
+    storage.reset(new std::byte[z]);
+    return ptr();
+  }
+
+  // Release the buffer. Safe to call repeatedly.
+  void free_storage() noexcept {
+    storage.reset();
+  }
+
+public:
+
+  unique_any() noexcept = default;
+  ~unique_any() noexcept = default;
+
+  unique_any(const unique_any&) = delete;
+  unique_any& operator =(const unique_any&) = delete;
+
+  // We can rely on the behavior of `unique_ptr` and the base class to
+  // give us a default move constructor that does the right thing.
+  //
+  unique_any(unique_any&& rhs) noexcept = default;
+  unique_any& operator =(unique_any&& rhs) = default;
+
+  // Value / in-place constructors and value assignment come from the base.
+  using base::base;
+  using base::operator =;
+};
+
+inline void swap(unique_any& lhs, unique_any& rhs) noexcept { // free-function swap; forwards to the member swap
+ lhs.swap(rhs);
+}
+
+// Build a unique_any holding a T constructed in place from `args`.
+template<typename T, typename... Args>
+inline auto make_unique_any(Args&& ...args) -> unique_any {
+  return unique_any(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// As above, for a T constructed from an initializer list followed by
+// `args`.
+template<typename T, typename U, typename... Args>
+inline auto make_unique_any(std::initializer_list<U> i, Args&& ...args)
+  -> unique_any {
+  return unique_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `shared_any`
+// ============
+//
+// Once more with feeling!
+//
+// This is both copyable *and* movable. In case you need that sort of
+// thing. It seemed a reasonable completion.
+//
+class shared_any : public _any::base<shared_any, boost::shared_ptr<std::byte[]>> { // any-type whose buffer is held by a boost::shared_ptr
+ using base = _any::base<shared_any, boost::shared_ptr<std::byte[]>>;
+ friend base; // lets the CRTP base reach the private hooks below
+
+ using base::storage;
+
+ // Superclass requirements
+ // -----------------------
+ //
+ // Our storage is a single chunk of RAM allocated from the
+ // heap. This time it's owned by a `boost::shared_ptr` so we can use
+ // `boost::make_shared_noinit`. (This lets us get the optimization
+ // that allocates array and control block in one without wasting
+ // time on `memset`.)
+ //
+ static constexpr std::size_t capacity = _any::dynamic; // no fixed size limit
+ void* ptr() const noexcept { // address of the buffer; null when nothing is allocated
+ return static_cast<void*>(storage.get());
+ }
+
+ void* alloc_storage(std::size_t n) { // replaces `storage` with a new n-byte buffer
+ storage = boost::make_shared_noinit<std::byte[]>(n);
+ return ptr();
+ }
+
+ void free_storage() noexcept { // drops this instance's reference to the buffer
+ storage.reset();
+ }
+
+public:
+
+ shared_any() noexcept = default;
+ ~shared_any() noexcept = default;
+
+ shared_any(const shared_any& rhs) noexcept = default; // NOTE(review): the base copy duplicates `func` and shares `storage`, and each instance's destructor calls reset(); looks like the stored object is destroyed once per copy -- confirm
+ shared_any& operator =(const shared_any&) noexcept = default;
+
+ shared_any(shared_any&& rhs) noexcept = default;
+ shared_any& operator =(shared_any&& rhs) noexcept = default;
+
+ using base::base; // value and in-place constructors
+ using base::operator =; // value assignment
+};
+
+inline void swap(shared_any& lhs, shared_any& rhs) noexcept { // free-function swap; forwards to the member swap
+ lhs.swap(rhs);
+}
+
+// Build a shared_any holding a T constructed in place from `args`.
+template<typename T, typename... Args>
+inline auto make_shared_any(Args&& ...args) -> shared_any {
+  return shared_any(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// As above, for a T constructed from an initializer list followed by
+// `args`.
+template<typename T, typename U, typename... Args>
+inline auto make_shared_any(std::initializer_list<U> i, Args&& ...args)
+  -> shared_any {
+  return shared_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+}
+
+#endif // INCLUDE_STATIC_ANY
diff --git a/src/include/bitmapper.h b/src/include/bitmapper.h
new file mode 100644
index 00000000..5a65cc20
--- /dev/null
+++ b/src/include/bitmapper.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BITMAPPER_H
+#define CEPH_BITMAPPER_H
+
+// Bit-addressable view over a caller-supplied char buffer. The class only
+// stores the pointer and length it is given; it never allocates or frees
+// the buffer. Bit `b` lives in byte `b >> 3` at position `b & 7` (bit 0 is
+// the least significant). No bounds checking is performed.
+class bitmapper {
+  char *_data;
+  int _len;
+
+  // Index of the byte that holds bit b.
+  static int byte_of(int b) { return b >> 3; }
+  // Mask selecting bit b inside its byte.
+  static int mask_of(int b) { return 1 << (b & 7); }
+
+ public:
+  bitmapper() : _data(nullptr), _len(0) { }
+  bitmapper(char *data, int len) : _data(data), _len(len) { }
+
+  // Point the view at a different buffer of `len` bytes.
+  void set_data(char *data, int len) {
+    _data = data;
+    _len = len;
+  }
+
+  // Size of the viewed buffer, in bytes and in bits.
+  int bytes() const { return _len; }
+  int bits() const { return _len * 8; }
+
+  // Read bit b.
+  bool operator[](int b) const {
+    return get(b);
+  }
+  bool get(int b) const {
+    const int hit = _data[byte_of(b)] & mask_of(b);
+    return hit;
+  }
+  // Turn bit b on.
+  void set(int b) {
+    _data[byte_of(b)] |= mask_of(b);
+  }
+  // Turn bit b off.
+  void clear(int b) {
+    _data[byte_of(b)] &= ~mask_of(b);
+  }
+  // Flip bit b.
+  void toggle(int b) {
+    _data[byte_of(b)] ^= mask_of(b);
+  }
+};
+
+#endif
diff --git a/src/include/blobhash.h b/src/include/blobhash.h
new file mode 100644
index 00000000..597884e4
--- /dev/null
+++ b/src/include/blobhash.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BLOBHASH_H
+#define CEPH_BLOBHASH_H
+
+#include <cstdint>
+#include <cstring>
+
+#include "hash.h"
+
+/*
+- this is to make some of the STL types work with 64 bit values, string hash keys, etc.
+- added when i was using an old STL.. maybe try taking these out and see if things
+ compile now?
+*/
+
+// Hash functor for an arbitrary byte range: XOR-folds the bytes into one
+// 32-bit word and runs that word through rjhash<uint32_t> (declared
+// outside this block).
+class blobhash {
+public:
+  // Hash the `len` bytes starting at `p`.
+  uint32_t operator()(const char *p, unsigned len) {
+    static rjhash<uint32_t> H;
+    uint32_t acc = 0;
+    // Fold in whole 32-bit words in native byte order. The word is loaded
+    // with memcpy because `p` carries no alignment guarantee and char data
+    // must not be read through a uint32_t lvalue.
+    while (len >= sizeof(acc)) {
+      uint32_t word;
+      std::memcpy(&word, p, sizeof(word));
+      acc ^= word;
+      p += sizeof(word);
+      len -= sizeof(word);
+    }
+    // Fold in the remaining 0-3 bytes, each shifted to its own byte lane.
+    // NOTE: *p is a plain char, so where char is signed a byte >= 0x80 is
+    // sign-extended before the shift. This is left untouched so existing
+    // hash values do not change.
+    int sh = 0;
+    while (len) {
+      acc ^= (uint32_t)*p << sh;
+      sh += 8;
+      len--;
+      p++;
+    }
+    return H(acc);
+  }
+};
+
+
+#endif
diff --git a/src/include/btree_map.h b/src/include/btree_map.h
new file mode 100644
index 00000000..1f42ea41
--- /dev/null
+++ b/src/include/btree_map.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_BTREE_MAP_H
+#define CEPH_INCLUDE_BTREE_MAP_H
+
+#include "include/cpp-btree/btree.h"
+#include "include/cpp-btree/btree_map.h"
+#include "include/ceph_assert.h" // cpp-btree uses system assert, blech
+#include "include/encoding.h"
+
+// Encode a btree_map as a 32-bit entry count followed by each key/value
+// pair in iteration order.
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl)
+{
+  __u32 count = (__u32)(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware variant: the count is encoded plainly, while every key and
+// value is encoded with `features` passed through.
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl, uint64_t features)
+{
+  __u32 count = (__u32)(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Decode a btree_map written by encode(): read the 32-bit entry count,
+// empty `m`, then read that many key/value pairs. Each value is decoded
+// directly into the slot `m[key]`.
+template<class T, class U>
+inline void decode(btree::btree_map<T,U>& m, bufferlist::const_iterator& p)
+{
+  __u32 remaining;
+  decode(remaining, p);
+  m.clear();
+  for (; remaining != 0; --remaining) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+// Encode only the key/value pairs, without the leading entry count.
+template<class T, class U>
+inline void encode_nohead(const btree::btree_map<T,U>& m, bufferlist& bl)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Counterpart of encode_nohead(): empty `m`, then decode `n` key/value
+// pairs, the count being supplied by the caller instead of read from `p`.
+template<class T, class U>
+inline void decode_nohead(int n, btree::btree_map<T,U>& m, bufferlist::const_iterator& p)
+{
+  m.clear();
+  for (; n != 0; --n) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+
+#endif
diff --git a/src/include/buffer.h b/src/include/buffer.h
new file mode 100644
index 00000000..774ca052
--- /dev/null
+++ b/src/include/buffer.h
@@ -0,0 +1,1331 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_BUFFER_H
+#define CEPH_BUFFER_H
+
+#if defined(__linux__) || defined(__FreeBSD__)
+#include <stdlib.h>
+#endif
+#include <limits.h>
+
+#ifndef _XOPEN_SOURCE
+# define _XOPEN_SOURCE 600
+#endif
+
+#include <stdio.h>
+#include <sys/uio.h>
+
+#if defined(__linux__) // For malloc(2).
+#include <malloc.h>
+#endif
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifndef __CYGWIN__
+# include <sys/mman.h>
+#endif
+
+#include <iosfwd>
+#include <iomanip>
+#include <list>
+#include <vector>
+#include <string>
+#if __cplusplus >= 201703L
+#include <string_view>
+#endif // __cplusplus >= 201703L
+
+#include <exception>
+#include <type_traits>
+
+#include "page.h"
+#include "crc32c.h"
+#include "buffer_fwd.h"
+
+#ifdef __CEPH__
+# include "include/ceph_assert.h"
+#else
+# include <assert.h>
+#endif
+
+#include "inline_memory.h"
+
+#define CEPH_BUFFER_API
+
+#if defined(HAVE_XIO)
+struct xio_reg_mem;
+class XioDispatchHook;
+#endif
+#ifdef HAVE_SEASTAR
+namespace seastar {
+template <typename T> class temporary_buffer;
+namespace net {
+class packet;
+}
+}
+#endif // HAVE_SEASTAR
+class deleter;
+template<uint8_t S>
+struct sha_digest_t;
+using sha1_digest_t = sha_digest_t<20>;
+
+namespace ceph {
+
+// Deleter that deliberately does nothing: the pointer it is handed is left
+// untouched.
+template <class T>
+struct nop_delete {
+  void operator()(T* /* not freed */) {}
+};
+
+// This is not a unique_ptr-like smart pointer! It just signals ownership
+// but DOES NOT manage the resource. It WILL LEAK if not manually deleted.
+// It's rather a replacement for raw pointer than any other smart one.
+//
+// Considered options:
+// * unique_ptr with custom deleter implemented in .cc (would provide
+// the non-zero-cost resource management),
+// * GSL's owner<T*> (pretty neat but would impose an extra dependency),
+// * unique_ptr with nop deleter,
+// * raw pointer (doesn't embed ownership enforcement - std::move).
+template <class T>
+struct unique_leakable_ptr : public std::unique_ptr<T, ceph::nop_delete<T>> { // unique_ptr whose deleter is the no-op nop_delete
+ using std::unique_ptr<T, ceph::nop_delete<T>>::unique_ptr; // inherit all unique_ptr constructors
+};
+
+namespace buffer CEPH_BUFFER_API {
+inline namespace v14_2_0 {
+
+ /*
+ * exceptions
+ */
+
+ struct error : public std::exception{ // root of the buffer exception types declared below
+ const char *what() const throw () override; // declared only; the definition is not in this header section
+ };
+ struct bad_alloc : public error { // buffer::error subtype; presumably raised on allocation failure -- confirm at throw sites
+ const char *what() const throw () override; // declared only; the definition is not in this header section
+ };
+ struct end_of_buffer : public error { // thrown by ptr::iterator_impl::advance() when the position passes end_ptr
+ const char *what() const throw () override; // declared only; the definition is not in this header section
+ };
+ struct malformed_input : public error { // buffer::error subtype carrying a formatted message
+ explicit malformed_input(const std::string& w) { // stores "buffer::malformed_input: " followed by w
+ snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", w.c_str()); // snprintf truncates to sizeof(buf)
+ }
+ const char *what() const throw () override; // declared only; presumably returns buf -- confirm in the definition
+ private:
+ char buf[256]; // fixed-size message storage
+ };
+ struct error_code : public malformed_input { // malformed_input subtype that also exposes an integer code
+ explicit error_code(int error); // declared only; the definition is not in this header section
+ int code; // presumably the value passed to the constructor -- confirm in the definition
+ };
+
+
+ /// count of cached crc hits (matching input)
+ int get_cached_crc();
+ /// count of cached crc hits (mismatching input, required adjustment)
+ int get_cached_crc_adjusted();
+ /// count of crc cache misses
+ int get_missed_crc();
+ /// enable/disable tracking of cached crcs
+ void track_cached_crc(bool b);
+
+ /*
+ * an abstract raw buffer. with a reference count.
+ */
+ class raw;
+ class raw_malloc;
+ class raw_static;
+ class raw_posix_aligned;
+ class raw_hack_aligned;
+ class raw_char;
+ class raw_claimed_char;
+ class raw_unshareable; // diagnostic, unshareable char buffer
+ class raw_combined;
+ class raw_claim_buffer;
+
+
+ class xio_mempool;
+ class xio_msg_buffer;
+
+ /*
+ * named constructors
+ */
+ ceph::unique_leakable_ptr<raw> copy(const char *c, unsigned len);
+ ceph::unique_leakable_ptr<raw> create(unsigned len);
+ ceph::unique_leakable_ptr<raw> create_in_mempool(unsigned len, int mempool);
+ raw* claim_char(unsigned len, char *buf);
+ raw* create_malloc(unsigned len);
+ raw* claim_malloc(unsigned len, char *buf);
+ raw* create_static(unsigned len, char *buf);
+ ceph::unique_leakable_ptr<raw> create_aligned(unsigned len, unsigned align);
+ ceph::unique_leakable_ptr<raw> create_aligned_in_mempool(unsigned len, unsigned align, int mempool);
+ ceph::unique_leakable_ptr<raw> create_page_aligned(unsigned len);
+ ceph::unique_leakable_ptr<raw> create_small_page_aligned(unsigned len);
+ raw* create_unshareable(unsigned len);
+ raw* create_static(unsigned len, char *buf);
+ raw* claim_buffer(unsigned len, char *buf, deleter del);
+
+#ifdef HAVE_SEASTAR
+ /// create a raw buffer to wrap seastar cpu-local memory, using foreign_ptr to
+ /// make it safe to share between cpus
+ raw* create_foreign(seastar::temporary_buffer<char>&& buf);
+ /// create a raw buffer to wrap seastar cpu-local memory, without the safety
+ /// of foreign_ptr. the caller must otherwise guarantee that the buffer ptr is
+ /// destructed on this cpu
+ raw* create(seastar::temporary_buffer<char>&& buf);
+#endif
+#if defined(HAVE_XIO)
+ raw* create_msg(unsigned len, char *buf, XioDispatchHook *m_hook);
+#endif
+
+ /*
+ * a buffer pointer. references (a subsequence of) a raw buffer.
+ */
+ class CEPH_BUFFER_API ptr {
+ raw *_raw;
+ public: // dirty hack for testing; if it works, this will be abstracted
+ unsigned _off, _len;
+ private:
+
+ void release();
+
+ /// Cursor over the bytes referenced by a single ptr. It tracks a raw
+ /// position between bp->c_str() + offset and bp->end_c_str(); advancing
+ /// beyond the end throws end_of_buffer.
+ template<bool is_const>
+ class iterator_impl {
+   const ptr *bp;        ///< parent ptr
+   const char *start;    ///< starting pointer into bp->c_str()
+   const char *pos;      ///< pointer into bp->c_str()
+   const char *end_ptr;  ///< pointer to bp->end_c_str()
+   const bool deep;      ///< if true, do not allow shallow ptr copies
+
+   iterator_impl(typename std::conditional<is_const, const ptr*, ptr*>::type p,
+                 size_t offset, bool d)
+     : bp(p),
+       start(p->c_str() + offset),
+       pos(start),
+       end_ptr(p->end_c_str()),
+       deep(d)
+   {}
+
+   friend class ptr;
+
+ public:
+   using pointer = typename std::conditional<is_const, const char*, char *>::type;
+
+   /// Return the current position, then advance by n bytes.
+   /// Throws end_of_buffer if fewer than n bytes remain.
+   pointer get_pos_add(size_t n) {
+     auto r = pos;
+     advance(n);
+     return r;
+   }
+
+   /// Return a ptr covering the next len bytes and advance past them.
+   /// A deep iterator copies the bytes; a shallow one returns a sub-ptr of
+   /// the parent. Throws end_of_buffer if fewer than len bytes remain.
+   ptr get_ptr(size_t len) {
+     if (deep) {
+       return buffer::copy(get_pos_add(len), len);
+     } else {
+       size_t off = pos - bp->c_str();
+       advance(len);
+       return ptr(*bp, off, len);
+     }
+   }
+
+   /// Move the cursor forward by len bytes.
+   /// Throws end_of_buffer if that would pass end_ptr.
+   void advance(size_t len) {
+     // Check against the remaining span *before* touching pos: computing
+     // pos + len for an oversized len is undefined behaviour and may wrap
+     // around, which would let the old "pos > end_ptr" test pass. The
+     // "pos > end_ptr" term keeps the original behaviour for an iterator
+     // that was constructed with an offset beyond the end.
+     if (pos > end_ptr || len > static_cast<size_t>(end_ptr - pos))
+       throw end_of_buffer();
+     pos += len;
+   }
+
+   /// Current read position.
+   const char *get_pos() {
+     return pos;
+   }
+   /// One-past-the-last readable byte.
+   const char *get_end() {
+     return end_ptr;
+   }
+
+   /// Bytes consumed since the iterator was created.
+   size_t get_offset() {
+     return pos - start;
+   }
+
+   /// True when the cursor sits exactly at the end of the ptr.
+   bool end() const {
+     return pos == end_ptr;
+   }
+ };
+
+ public:
+ using const_iterator = iterator_impl<true>;
+ using iterator = iterator_impl<false>;
+
+ ptr() : _raw(nullptr), _off(0), _len(0) {}
+ // cppcheck-suppress noExplicitConstructor
+ ptr(raw* r);
+ ptr(ceph::unique_leakable_ptr<raw> r);
+ // cppcheck-suppress noExplicitConstructor
+ ptr(unsigned l);
+ ptr(const char *d, unsigned l);
+ ptr(const ptr& p);
+ ptr(ptr&& p) noexcept;
+ ptr(const ptr& p, unsigned o, unsigned l);
+ ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r);
+ ptr& operator= (const ptr& p);
+ ptr& operator= (ptr&& p) noexcept;
+ ~ptr() {
+ // BE CAREFUL: this destructor is called also for hypercombined ptr_node.
+ // After freeing underlying raw, `*this` can become inaccessible as well!
+ release();
+ }
+
+ /// True when this ptr references a raw buffer.
+ bool have_raw() const { return _raw != nullptr; }
+
+ ceph::unique_leakable_ptr<raw> clone();
+ void swap(ptr& other) noexcept;
+
+ /// Mutable cursor starting `offset` bytes into this ptr (shallow mode).
+ iterator begin(size_t offset=0) {
+   iterator cursor(this, offset, false);
+   return cursor;
+ }
+ /// Read-only cursor starting `offset` bytes into this ptr (shallow mode).
+ const_iterator begin(size_t offset=0) const {
+   const_iterator cursor(this, offset, false);
+   return cursor;
+ }
+ /// Read-only cursor at the first byte; same as begin() on a const ptr.
+ const_iterator cbegin() const {
+   return begin();
+ }
+ /// Read-only cursor whose get_ptr() copies bytes instead of sharing them.
+ const_iterator begin_deep(size_t offset=0) const {
+   const_iterator cursor(this, offset, true);
+   return cursor;
+ }
+
+ // misc
+ /// True when the first referenced byte lies on an `align`-byte boundary.
+ /// `align` is applied as a mask (align-1), so it should be a power of two.
+ bool is_aligned(unsigned align) const {
+   // uintptr_t can hold any object pointer. `long` is narrower than a
+   // pointer on LLP64 targets, where converting the address to it is not
+   // a valid cast.
+   return (reinterpret_cast<uintptr_t>(c_str()) & (align-1)) == 0;
+ }
+ bool is_page_aligned() const { return is_aligned(CEPH_PAGE_SIZE); }
+ /// True when length() is an exact multiple of `align`.
+ bool is_n_align_sized(unsigned align) const
+ {
+   const unsigned remainder = length() % align;
+   return remainder == 0;
+ }
+ bool is_n_page_sized() const { return is_n_align_sized(CEPH_PAGE_SIZE); }
+ /// True when a raw is attached and this ptr covers only part of it.
+ bool is_partial() const {
+   if (!have_raw()) {
+     return false;
+   }
+   return start() > 0 || end() < raw_length();
+ }
+
+ int get_mempool() const;
+ void reassign_to_mempool(int pool);
+ void try_assign_to_mempool(int pool);
+
+ // accessors
+ raw *get_raw() const { return _raw; }
+ const char *c_str() const;
+ char *c_str();
+ const char *end_c_str() const;
+ char *end_c_str();
+ unsigned length() const { return _len; }
+ unsigned offset() const { return _off; }
+ unsigned start() const { return _off; }
+ unsigned end() const { return _off + _len; }
+ unsigned unused_tail_length() const;
+ const char& operator[](unsigned n) const;
+ char& operator[](unsigned n);
+
+ const char *raw_c_str() const;
+ unsigned raw_length() const;
+ int raw_nref() const;
+
+ void copy_out(unsigned o, unsigned l, char *dest) const;
+
+ unsigned wasted() const;
+
+ int cmp(const ptr& o) const;
+ bool is_zero() const;
+
+ // modifiers
+ /// Set the start offset of this ptr within its raw buffer.
+ /// Asserts that `o` does not exceed raw_length(); ceph_assert is used when
+ /// __CEPH__ is defined, plain assert otherwise.
+ void set_offset(unsigned o) {
+#ifdef __CEPH__
+ ceph_assert(raw_length() >= o);
+#else
+ assert(raw_length() >= o);
+#endif
+ _off = o;
+ }
+ /// Set the number of bytes this ptr references.
+ /// Asserts that `l` does not exceed raw_length(); ceph_assert is used when
+ /// __CEPH__ is defined, plain assert otherwise.
+ /// NOTE(review): the check compares against raw_length() only and does not
+ /// take the current _off into account.
+ void set_length(unsigned l) {
+#ifdef __CEPH__
+ ceph_assert(raw_length() >= l);
+#else
+ assert(raw_length() >= l);
+#endif
+ _len = l;
+ }
+
+ unsigned append(char c);
+ unsigned append(const char *p, unsigned l);
+#if __cplusplus >= 201703L
+ inline unsigned append(std::string_view s) {
+ return append(s.data(), s.length());
+ }
+#endif // __cplusplus >= 201703L
+ void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset = true);
+ void zero(bool crc_reset = true);
+ void zero(unsigned o, unsigned l, bool crc_reset = true);
+ unsigned append_zeros(unsigned l);
+
+#ifdef HAVE_SEASTAR
+ /// create a temporary_buffer, copying the ptr as its deleter
+ operator seastar::temporary_buffer<char>() &;
+ /// convert to temporary_buffer, stealing the ptr as its deleter
+ operator seastar::temporary_buffer<char>() &&;
+#endif // HAVE_SEASTAR
+
+ };
+
+
+ /// Link field for the singly linked list of buffers; ptr_node derives from
+ /// it and list::buffers_t uses one instance as its root.
+ struct ptr_hook {
+ // mutable so the link can be rewritten through const iterators
+ // (buffers_t::erase_after / insert_after do this).
+ mutable ptr_hook* next;
+
+ // leaves `next` uninitialized
+ ptr_hook() = default;
+ ptr_hook(ptr_hook* const next)
+ : next(next) {
+ }
+ };
+
+ /// A ptr that can be linked into list::buffers_t (it is both a ptr_hook and
+ /// a ptr). Instances are obtained through the create() factories and
+ /// released through `disposer`; assignment and swap are disabled.
+ class ptr_node : public ptr_hook, public ptr {
+ public:
+ /// Functor producing a new node from an existing one (defined elsewhere).
+ struct cloner {
+ ptr_node* operator()(const ptr_node& clone_this);
+ };
+ /// Deleter for nodes: plain `delete` is used only when
+ /// dispose_if_hypercombined() reports that it did not handle the node.
+ struct disposer {
+ void operator()(ptr_node* const delete_this) {
+ if (!dispose_if_hypercombined(delete_this)) {
+ delete delete_this;
+ }
+ }
+ };
+
+ ~ptr_node() = default;
+
+ // Factories taking a raw go through create_hypercombined().
+ static std::unique_ptr<ptr_node, disposer>
+ create(ceph::unique_leakable_ptr<raw> r) {
+ return create_hypercombined(std::move(r));
+ }
+ static std::unique_ptr<ptr_node, disposer> create(raw* const r) {
+ return create_hypercombined(r);
+ }
+ // Allocates a raw of `l` bytes via buffer::create() first.
+ static std::unique_ptr<ptr_node, disposer> create(const unsigned l) {
+ return create_hypercombined(buffer::create(l));
+ }
+ // Any other argument list is forwarded to a ptr constructor and the node
+ // is allocated with `new`.
+ template <class... Args>
+ static std::unique_ptr<ptr_node, disposer> create(Args&&... args) {
+ return std::unique_ptr<ptr_node, disposer>(
+ new ptr_node(std::forward<Args>(args)...));
+ }
+
+ static ptr_node* copy_hypercombined(const ptr_node& copy_this);
+
+ private:
+ // Construction is private; callers must use create().
+ template <class... Args>
+ ptr_node(Args&&... args) : ptr(std::forward<Args>(args)...) {
+ }
+ ptr_node(const ptr_node&) = default;
+
+ // Assignment and swap inherited from ptr are disabled for nodes.
+ ptr& operator= (const ptr& p) = delete;
+ ptr& operator= (ptr&& p) noexcept = delete;
+ ptr_node& operator= (const ptr_node& p) = delete;
+ ptr_node& operator= (ptr_node&& p) noexcept = delete;
+ void swap(ptr& other) noexcept = delete;
+ void swap(ptr_node& other) noexcept = delete;
+
+ static bool dispose_if_hypercombined(ptr_node* delete_this);
+ static std::unique_ptr<ptr_node, disposer> create_hypercombined(
+ buffer::raw* r);
+ static std::unique_ptr<ptr_node, disposer> create_hypercombined(
+ ceph::unique_leakable_ptr<raw> r);
+ };
+ /*
+ * list - the useful bit!
+ */
+
+ class CEPH_BUFFER_API list {
+ public:
+ // this is the very low-level implementation of the singly linked list
+ // ceph::buffer::list is built on. We don't use the intrusive slist
+ // of Boost (or any other 3rd party) to save extra dependencies
+ // in our public headers.
+ class buffers_t {
+ // _root.next can be thought as _head
+ ptr_hook _root;
+ ptr_hook* _tail;
+ std::size_t _size;
+
+ public:
+ /// Forward iterator over the hooks of a buffers_t. T is ptr_node or
+ /// const ptr_node; the iterator stores a ptr_hook pointer and casts it
+ /// to T on dereference.
+ template <class T>
+ class buffers_iterator {
+ // const ptr_hook* when T is const, ptr_hook* otherwise
+ typename std::conditional<
+ std::is_const<T>::value, const ptr_hook*, ptr_hook*>::type cur;
+ template <class U> friend class buffers_iterator;
+ public:
+ using value_type = T;
+ using reference = typename std::add_lvalue_reference<T>::type;
+ using pointer = typename std::add_pointer<T>::type;
+ using difference_type = std::ptrdiff_t;
+ using iterator_category = std::forward_iterator_tag;
+
+ // implicit construction from a hook/node pointer
+ template <class U>
+ buffers_iterator(U* const p)
+ : cur(p) {
+ }
+ // conversion from an iterator over another element type
+ template <class U>
+ buffers_iterator(const buffers_iterator<U>& other)
+ : cur(other.cur) {
+ }
+ buffers_iterator() = default;
+
+ // The hook is reinterpreted as T. The root hook used for end() and
+ // before_begin() is a bare ptr_hook, so only its `next` may be touched.
+ T& operator*() const {
+ return *reinterpret_cast<T*>(cur);
+ }
+ T* operator->() const {
+ return reinterpret_cast<T*>(cur);
+ }
+
+ // step to the following hook
+ buffers_iterator& operator++() {
+ cur = cur->next;
+ return *this;
+ }
+ buffers_iterator operator++(int) {
+ const auto temp(*this);
+ ++*this;
+ return temp;
+ }
+
+ template <class U>
+ buffers_iterator& operator=(buffers_iterator<U>& other) {
+ cur = other.cur;
+ return *this;
+ }
+
+ // iterators are equal when they point at the same hook
+ bool operator==(const buffers_iterator& rhs) const {
+ return cur == rhs.cur;
+ }
+ bool operator!=(const buffers_iterator& rhs) const {
+ return !(*this==rhs);
+ }
+
+ // conversion to the const flavour of this iterator
+ using citer_t = buffers_iterator<typename std::add_const<T>::type>;
+ operator citer_t() const {
+ return citer_t(cur);
+ }
+ };
+
+ typedef buffers_iterator<const ptr_node> const_iterator;
+ typedef buffers_iterator<ptr_node> iterator;
+
+ typedef const ptr_node& const_reference;
+ typedef ptr_node& reference;
+
+ buffers_t()
+ : _root(&_root),
+ _tail(&_root),
+ _size(0) {
+ }
+ buffers_t(const buffers_t&) = delete;
+ /// Move constructor: takes over other's chain and leaves other empty.
+ /// Pointers that referred to other's root are redirected to this root.
+ buffers_t(buffers_t&& other)
+ : _root(other._root.next == &other._root ? &_root : other._root.next),
+ _tail(other._tail == &other._root ? &_root : other._tail),
+ _size(other._size) {
+ // reset the source to the empty state
+ other._root.next = &other._root;
+ other._tail = &other._root;
+ other._size = 0;
+
+ // close the ring: the last element must link back to *our* root
+ _tail->next = &_root;
+ }
+ /// Move assignment: dispose of the current nodes, then take other's via
+ /// swap(). Self-assignment is a no-op.
+ buffers_t& operator=(buffers_t&& other) {
+   if (this == &other) {
+     return *this;
+   }
+   clear_and_dispose();
+   swap(other);
+   return *this;
+ }
+
+ /// Link `item` after the current tail and make it the new tail.
+ void push_back(reference item) {
+   // the new last element always points back at the root
+   item.next = &_root;
+   // when the list is empty _tail is the root, so this also sets the head
+   ptr_hook* const previous_tail = _tail;
+   previous_tail->next = &item;
+   _tail = &item;
+   ++_size;
+ }
+
+ /// Link `item` directly after the root, making it the new head.
+ void push_front(reference item) {
+   const bool was_empty = (_tail == &_root);
+   item.next = _root.next;
+   _root.next = &item;
+   if (was_empty) {
+     // first element is also the last one
+     _tail = &item;
+   }
+   ++_size;
+ }
+
+ // *_after
+ /// Unlink the element following `it` (without disposing it) and return an
+ /// iterator to the element that now follows `it`.
+ iterator erase_after(const_iterator it) {
+ const auto* to_erase = it->next;
+
+ // `next` is mutable, so it can be rewritten through the const iterator
+ it->next = to_erase->next;
+ _root.next = _root.next == to_erase ? to_erase->next : _root.next;
+ // if the tail was removed, the predecessor becomes the new tail
+ _tail = _tail == to_erase ? (ptr_hook*)&*it : _tail;
+ _size--;
+ return it->next;
+ }
+
+ /// Link `item` directly after the element `it` points at.
+ void insert_after(const_iterator it, reference item) {
+ item.next = it->next;
+ it->next = &item;
+ // inserting after the root (end() is the root hook) makes item the head
+ _root.next = it == end() ? &item : _root.next;
+ // inserting after the tail makes item the new tail
+ _tail = const_iterator(_tail) == it ? &item : _tail;
+ _size++;
+ }
+
+ /// Move every element of `other` to the end of this list; `other` is
+ /// left empty. Does nothing when `other` has no elements.
+ void splice_back(buffers_t& other) {
+   if (!other._size) {
+     return;
+   }
+
+   ptr_hook* const first = other._root.next;
+   ptr_hook* const last = other._tail;
+
+   // the moved chain now terminates at our root
+   last->next = &_root;
+   // when we are empty _tail is the root, so this also sets our head
+   _tail->next = first;
+   _tail = last;
+   _size += other._size;
+
+   // reset the donor to the empty state
+   other._root.next = &other._root;
+   other._tail = &other._root;
+   other._size = 0;
+ }
+
+ std::size_t size() const { return _size; }
+ bool empty() const { return _tail == &_root; }
+
+ const_iterator begin() const {
+ return _root.next;
+ }
+ const_iterator before_begin() const {
+ return &_root;
+ }
+ const_iterator end() const {
+ return &_root;
+ }
+ iterator begin() {
+ return _root.next;
+ }
+ iterator before_begin() {
+ return &_root;
+ }
+ iterator end() {
+ return &_root;
+ }
+
+ reference front() {
+ return reinterpret_cast<reference>(*_root.next);
+ }
+ reference back() {
+ return reinterpret_cast<reference>(*_tail);
+ }
+ const_reference front() const {
+ return reinterpret_cast<const_reference>(*_root.next);
+ }
+ const_reference back() const {
+ return reinterpret_cast<const_reference>(*_tail);
+ }
+
+ /// Replace the contents with clones (ptr_node::cloner) of other's nodes,
+ /// in the same order.
+ void clone_from(const buffers_t& other) {
+   clear_and_dispose();
+   for (const_iterator src = other.begin(); src != other.end(); ++src) {
+     ptr_node* const copy = ptr_node::cloner()(*src);
+     push_back(*copy);
+   }
+ }
+ /// Hand every node to ptr_node::disposer and reset to the empty state.
+ void clear_and_dispose() {
+   iterator it = begin();
+   while (it != end()) {
+     ptr_node* const doomed = &*it;
+     // step first: the node must not be touched once it is disposed
+     ++it;
+     ptr_node::disposer()(doomed);
+   }
+   _root.next = &_root;
+   _tail = &_root;
+   _size = 0;
+ }
+ /// Unlink the element following `it`, dispose of it, and return an
+ /// iterator to the element that now follows `it`.
+ iterator erase_after_and_dispose(iterator it) {
+   ptr_node* const victim = &*std::next(it);
+   const iterator following = erase_after(it);
+   ptr_node::disposer()(victim);
+   return following;
+ }
+
+ /// Exchange contents with `other`. A head/tail that pointed at the
+ /// owner's own root (empty list) is translated to the new owner's root.
+ void swap(buffers_t& other) {
+ const auto copy_root = _root;
+ _root.next = \
+ other._root.next == &other._root ? &this->_root : other._root.next;
+ other._root.next = \
+ copy_root.next == &_root ? &other._root : copy_root.next;
+
+ const auto copy_tail = _tail;
+ _tail = other._tail == &other._root ? &this->_root : other._tail;
+ other._tail = copy_tail == &_root ? &other._root : copy_tail;
+
+ // re-close both rings on their new roots
+ _tail->next = &_root;
+ other._tail->next = &other._root;
+ std::swap(_size, other._size);
+ }
+ };
+
+ class iterator;
+
+ private:
+ // my private bits
+ buffers_t _buffers;
+
+ // track bufferptr we can modify (especially ::append() to). Not all bptrs
+ // bufferlist holds have this trait -- if somebody ::push_back(const ptr&),
+ // he expects it won't change.
+ ptr* _carriage;
+ unsigned _len;
+ unsigned _memcopy_count; //the total of memcopy using rebuild().
+
+ template <bool is_const>
+ class CEPH_BUFFER_API iterator_impl {
+ protected:
+ typedef typename std::conditional<is_const,
+ const list,
+ list>::type bl_t;
+ typedef typename std::conditional<is_const,
+ const buffers_t,
+ buffers_t >::type list_t;
+ typedef typename std::conditional<is_const,
+ typename buffers_t::const_iterator,
+ typename buffers_t::iterator>::type list_iter_t;
+ bl_t* bl;
+ list_t* ls; // meh.. just here to avoid an extra pointer dereference..
+ list_iter_t p;
+ unsigned off; // in bl
+ unsigned p_off; // in *p
+ friend class iterator_impl<true>;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = typename std::conditional<is_const, const char, char>::type;
+ using difference_type = std::ptrdiff_t;
+ using pointer = typename std::add_pointer<value_type>::type;
+ using reference = typename std::add_lvalue_reference<value_type>::type;
+
+ // constructor. position.
+ iterator_impl()
+ : bl(0), ls(0), off(0), p_off(0) {}
+ iterator_impl(bl_t *l, unsigned o=0);
+ iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+ : bl(l), ls(&bl->_buffers), p(ip), off(o), p_off(po) {}
+ iterator_impl(const list::iterator& i);
+
+ /// get current iterator offset in buffer::list
+ unsigned get_off() const { return off; }
+
+ /// get number of bytes remaining from iterator position to the end of the buffer::list
+ unsigned get_remaining() const { return bl->length() - off; }
+
+ /// true if iterator is at the end of the buffer::list
+ bool end() const {
+ return p == ls->end();
+ //return off == bl->length();
+ }
+
+ void advance(int o) = delete;
+ void advance(unsigned o);
+ void advance(size_t o) { advance(static_cast<unsigned>(o)); }
+ void seek(unsigned o);
+ char operator*() const;
+ iterator_impl& operator++();
+ ptr get_current_ptr() const;
+ bool is_pointing_same_raw(const ptr& other) const;
+
+ bl_t& get_bl() const { return *bl; }
+
+ // copy data out.
+ // note that these all _append_ to dest!
+ void copy(unsigned len, char *dest);
+ // deprecated, use copy_deep()
+ void copy(unsigned len, ptr &dest) __attribute__((deprecated));
+ void copy_deep(unsigned len, ptr &dest);
+ void copy_shallow(unsigned len, ptr &dest);
+ void copy(unsigned len, list &dest);
+ void copy(unsigned len, std::string &dest);
+ void copy_all(list &dest);
+
+ // get a pointer to the current iterator position, return the
+ // number of bytes we can read from that position (up to want),
+ // and advance the iterator by that amount.
+ size_t get_ptr_and_advance(size_t want, const char **p);
+
+ /// calculate crc from iterator position
+ uint32_t crc32c(size_t length, uint32_t crc);
+
+ /// Iterators are equal when they refer to the same list object and
+ /// have the same offset within it.
+ friend bool operator==(const iterator_impl& lhs,
+                        const iterator_impl& rhs) {
+   const bool same_list = (&lhs.get_bl() == &rhs.get_bl());
+   return same_list && lhs.get_off() == rhs.get_off();
+ }
+ /// Iterators differ when they refer to different list objects or have
+ /// different offsets.
+ friend bool operator!=(const iterator_impl& lhs,
+                        const iterator_impl& rhs) {
+   if (&lhs.get_bl() != &rhs.get_bl()) {
+     return true;
+   }
+   return lhs.get_off() != rhs.get_off();
+ }
+ };
+
+ public:
+ typedef iterator_impl<true> const_iterator;
+
+ class CEPH_BUFFER_API iterator : public iterator_impl<false> {
+ public:
+ iterator() = default;
+ iterator(bl_t *l, unsigned o=0);
+ iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po);
+ // copy data in
+ void copy_in(unsigned len, const char *src, bool crc_reset = true);
+ void copy_in(unsigned len, const list& otherl);
+ };
+
+ /// Result of obtain_contiguous_space(): a write position plus pointers to
+ /// two length counters. contiguous_appender writes at bp_data and, when it
+ /// flushes, adds the number of bytes written to both *bp_len and *bl_len.
+ struct reserve_t {
+ char* bp_data;
+ unsigned* bp_len;
+ unsigned* bl_len;
+ };
+
+ /// Appender that writes through a raw cursor into space obtained from
+ /// bufferlist::obtain_contiguous_space(). Bytes written become visible in
+ /// the list's lengths on flush_and_continue(), which also runs from the
+ /// destructor. Instances are created by list::get_contiguous_appender().
+ class contiguous_appender {
+   ceph::bufferlist& bl;
+   ceph::bufferlist::reserve_t space;
+   char* pos;
+   bool deep;
+
+   /// running count of bytes appended that are not reflected by @pos
+   size_t out_of_band_offset = 0;
+
+   contiguous_appender(bufferlist& bl, size_t len, bool d)
+     : bl(bl),
+       space(bl.obtain_contiguous_space(len)),
+       pos(space.bp_data),
+       deep(d) {
+   }
+
+   /// Publish the bytes written since the last flush into both length
+   /// counters and restart counting from the current position.
+   void flush_and_continue() {
+     const size_t written = pos - space.bp_data;
+     *space.bp_len += written;
+     *space.bl_len += written;
+     space.bp_data = pos;
+   }
+
+   friend class list;
+
+ public:
+   ~contiguous_appender() {
+     flush_and_continue();
+   }
+
+   size_t get_out_of_band_offset() const {
+     return out_of_band_offset;
+   }
+   /// Copy l bytes from p to the cursor and advance it.
+   void append(const char* __restrict__ p, size_t l) {
+     maybe_inline_memcpy(pos, p, l, 16);
+     pos += l;
+   }
+   /// Return the cursor, then advance it by len bytes without writing.
+   char *get_pos_add(size_t len) {
+     char* const before = pos;
+     pos += len;
+     return before;
+   }
+   char *get_pos() {
+     return pos;
+   }
+
+   /// Append a bufferptr. In deep mode its bytes are copied in-band;
+   /// otherwise the ptr is appended to the list itself and counted as
+   /// out-of-band data. Empty ptrs are ignored.
+   void append(const bufferptr& p) {
+     const auto plen = p.length();
+     if (plen == 0) {
+       return;
+     }
+     if (!deep) {
+       flush_and_continue();
+       bl.append(p);
+       space = bl.obtain_contiguous_space(0);
+       out_of_band_offset += plen;
+       return;
+     }
+     append(p.c_str(), plen);
+   }
+   /// Append a bufferlist. In deep mode every buffer is copied in-band;
+   /// otherwise the list is appended to the target list and counted as
+   /// out-of-band data.
+   void append(const bufferlist& l) {
+     if (!deep) {
+       flush_and_continue();
+       bl.append(l);
+       space = bl.obtain_contiguous_space(0);
+       out_of_band_offset += l.length();
+       return;
+     }
+     for (auto it = l._buffers.begin(); it != l._buffers.end(); ++it) {
+       append(it->c_str(), it->length());
+     }
+   }
+
+   /// Out-of-band bytes plus the in-band bytes written since the last flush.
+   size_t get_logical_offset() {
+     const size_t unflushed = pos - space.bp_data;
+     return out_of_band_offset + unflushed;
+   }
+ };
+
+ contiguous_appender get_contiguous_appender(size_t len, bool deep=false) {
+ return contiguous_appender(*this, len, deep);
+ }
+
+ /// Bare write cursor: a single char* that can be advanced or written
+ /// through. Only buffer::list can construct one.
+ class contiguous_filler {
+   friend buffer::list;
+   char* pos;
+
+   contiguous_filler(char* const pos) : pos(pos) {}
+
+ public:
+   /// Skip len bytes without writing.
+   void advance(const unsigned len) {
+     pos = pos + len;
+   }
+   /// Copy len bytes from src to the cursor, then move past them.
+   void copy_in(const unsigned len, const char* const src) {
+     memcpy(pos, src, len);
+     advance(len);
+   }
+   /// Current cursor position.
+   char* c_str() {
+     return pos;
+   }
+ };
+ // The contiguous_filler is supposed to be not costlier than a single
+ // pointer. Keep it dumb, please.
+ static_assert(sizeof(contiguous_filler) == sizeof(char*),
+ "contiguous_filler should be no costlier than pointer");
+
+ /// Appender that stages data in buffers obtained from
+ /// create_page_aligned() and appends the filled parts to the target list.
+ /// Each allocation is at least min_pages * CEPH_PAGE_SIZE bytes.
+ /// Instances are created by list::get_page_aligned_appender().
+ class page_aligned_appender {
+   bufferlist *pbl;
+   unsigned min_alloc;
+   ptr buffer;
+   char *pos, *end;
+
+   page_aligned_appender(list *l, unsigned min_pages)
+     : pbl(l),
+       min_alloc(min_pages * CEPH_PAGE_SIZE),
+       pos(nullptr), end(nullptr) {}
+
+   friend class list;
+
+ public:
+   ~page_aligned_appender() {
+     flush();
+   }
+
+   /// Append the bytes staged so far to the list and shrink the staging
+   /// ptr so that it covers only the still-unused remainder.
+   void flush() {
+     if (!pos) {
+       return;
+     }
+     char* const base = buffer.c_str();
+     if (pos == base) {
+       return;   // nothing staged since the last flush
+     }
+     const size_t len = pos - base;
+     pbl->append(buffer, 0, len);
+     buffer.set_length(buffer.length() - len);
+     buffer.set_offset(buffer.offset() + len);
+   }
+
+   /// Copy len bytes from buf, allocating new staging buffers as needed.
+   /// A staging buffer is appended to the list as soon as it is full.
+   void append(const char *buf, size_t len) {
+     while (len > 0) {
+       if (pos == nullptr) {
+         // round the request up to whole pages, honouring the minimum
+         size_t alloc = (len + CEPH_PAGE_SIZE - 1) & CEPH_PAGE_MASK;
+         if (alloc < min_alloc) {
+           alloc = min_alloc;
+         }
+         buffer = create_page_aligned(alloc);
+         pos = buffer.c_str();
+         end = buffer.end_c_str();
+       }
+       const size_t room = (size_t)(end - pos);
+       const size_t chunk = len > room ? room : len;
+       memcpy(pos, buf, chunk);
+       pos += chunk;
+       buf += chunk;
+       len -= chunk;
+       if (pos == end) {
+         pbl->append(buffer, 0, buffer.length());
+         pos = end = nullptr;
+       }
+     }
+   }
+ };
+
+ page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) {
+ return page_aligned_appender(this, min_pages);
+ }
+
+ private:
+ mutable iterator last_p;
+
+ // always_empty_bptr has no underlying raw but its _len is always 0.
+ // This is useful for e.g. get_append_buffer_unused_tail_length() as
+ // it allows to avoid conditionals on hot paths.
+ static ptr always_empty_bptr;
+ ptr_node& refill_append_space(const unsigned len);
+
+ public:
+ // cons/des
+ list()
+ : _carriage(&always_empty_bptr),
+ _len(0),
+ _memcopy_count(0),
+ last_p(this) {
+ }
+ // cppcheck-suppress noExplicitConstructor
+ // cppcheck-suppress noExplicitConstructor
+ list(unsigned prealloc)
+ : _carriage(&always_empty_bptr),
+ _len(0),
+ _memcopy_count(0),
+ last_p(this) {
+ reserve(prealloc);
+ }
+
+ list(const list& other)
+ : _carriage(&always_empty_bptr),
+ _len(other._len),
+ _memcopy_count(other._memcopy_count),
+ last_p(this) {
+ _buffers.clone_from(other._buffers);
+ }
+ list(list&& other) noexcept;
+
+ ~list() {
+ _buffers.clear_and_dispose();
+ }
+
+ /// Copy assignment: clones other's buffer nodes and takes over its length.
+ /// The append carriage is reset, so later appends do not write into a
+ /// cloned node. Self-assignment is a no-op.
+ /// NOTE(review): _memcopy_count is left untouched here, whereas the copy
+ /// constructor copies it - confirm this is intended.
+ list& operator= (const list& other) {
+   if (this == &other) {
+     return *this;
+   }
+   _carriage = &always_empty_bptr;
+   _buffers.clone_from(other._buffers);
+   _len = other._len;
+   last_p = begin();
+   return *this;
+ }
+ /// Move assignment: takes over other's buffers, carriage, length and
+ /// memcopy counter, then clears `other`.
+ list& operator= (list&& other) noexcept {
+ _buffers = std::move(other._buffers);
+ _carriage = other._carriage;
+ _len = other._len;
+ _memcopy_count = other._memcopy_count;
+ // the cached iterator must refer to the new contents
+ last_p = begin();
+ other.clear();
+ return *this;
+ }
+
+ uint64_t get_wasted_space() const;
+ unsigned get_num_buffers() const { return _buffers.size(); }
+ const ptr_node& front() const { return _buffers.front(); }
+ const ptr_node& back() const { return _buffers.back(); }
+
+ int get_mempool() const;
+ void reassign_to_mempool(int pool);
+ void try_assign_to_mempool(int pool);
+
+ size_t get_append_buffer_unused_tail_length() const {
+ return _carriage->unused_tail_length();
+ }
+
+ unsigned get_memcopy_count() const {return _memcopy_count; }
+ const buffers_t& buffers() const { return _buffers; }
+ void swap(list& other) noexcept;
+ /// Total number of bytes in the list, as tracked in _len.
+ unsigned length() const {
+ // The block below is compiled out. It recomputes the length from the
+ // individual buffers and still iterates with std::list<ptr> iterators,
+ // although _buffers is declared as buffers_t.
+#if 0
+ // DEBUG: verify _len
+ unsigned len = 0;
+ for (std::list<ptr>::const_iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++) {
+ len += (*it).length();
+ }
+#ifdef __CEPH__
+ ceph_assert(len == _len);
+#else
+ assert(len == _len);
+#endif // __CEPH__
+#endif
+ return _len;
+ }
+
+ bool contents_equal(const buffer::list& other) const;
+
+ bool is_provided_buffer(const char *dst) const;
+ bool is_aligned(unsigned align) const;
+ bool is_page_aligned() const;
+ bool is_n_align_sized(unsigned align) const;
+ bool is_n_page_sized() const;
+ bool is_aligned_size_and_memory(unsigned align_size,
+ unsigned align_memory) const;
+
+ bool is_zero() const;
+
+ // modifiers
+ /// Dispose of all buffers and reset the list to the empty state
+ /// (length, memcopy counter, append carriage and cached iterator).
+ void clear() noexcept {
+   _buffers.clear_and_dispose();
+   _carriage = &always_empty_bptr;
+   _memcopy_count = 0;
+   _len = 0;
+   last_p = begin();
+ }
+ /// Append a node constructed from `bp`; zero-length ptrs are ignored.
+ void push_back(const ptr& bp) {
+   const unsigned added = bp.length();
+   if (added == 0) {
+     return;
+   }
+   _buffers.push_back(*ptr_node::create(bp).release());
+   _len += added;
+ }
+ /// Append a node move-constructed from `bp`; zero-length ptrs are
+ /// ignored. The append carriage is reset afterwards.
+ void push_back(ptr&& bp) {
+   // read the length before bp is moved from
+   const unsigned added = bp.length();
+   if (added == 0) {
+     return;
+   }
+   _len += added;
+   _buffers.push_back(*ptr_node::create(std::move(bp)).release());
+   _carriage = &always_empty_bptr;
+ }
+ void push_back(const ptr_node&) = delete;
+ void push_back(ptr_node&) = delete;
+ void push_back(ptr_node&&) = delete;
+ /// Take ownership of a ready-made node, append it and make it the append
+ /// carriage. A zero-length node is not appended; it is released by the
+ /// unique_ptr's disposer when `bp` goes out of scope.
+ void push_back(std::unique_ptr<ptr_node, ptr_node::disposer> bp) {
+   const unsigned added = bp->length();
+   if (added == 0) {
+     return;
+   }
+   ptr_node* const node = bp.release();
+   _carriage = node;
+   _len += added;
+   _buffers.push_back(*node);
+ }
+ /// Wrap `r` in a new node, append it and make it the append carriage.
+ void push_back(raw* const r) {
+   _buffers.push_back(*ptr_node::create(r).release());
+   ptr_node& appended = _buffers.back();
+   _carriage = &appended;
+   _len += appended.length();
+ }
+ void push_back(ceph::unique_leakable_ptr<raw> r) {
+ push_back(r.release());
+ }
+
+ void zero();
+ void zero(unsigned o, unsigned l);
+
+ bool is_contiguous() const;
+ void rebuild();
+ void rebuild(std::unique_ptr<ptr_node, ptr_node::disposer> nb);
+ bool rebuild_aligned(unsigned align);
+ // max_buffers = 0 means _buffers.size() is not constrained; any other
+ // value requires _buffers.size() <= max_buffers after rebuilding.
+ bool rebuild_aligned_size_and_memory(unsigned align_size,
+ unsigned align_memory,
+ unsigned max_buffers = 0);
+ bool rebuild_page_aligned();
+
+ void reserve(size_t prealloc);
+
+ // assignment-op with move semantics
+ const static unsigned int CLAIM_DEFAULT = 0;
+ const static unsigned int CLAIM_ALLOW_NONSHAREABLE = 1;
+
+ void claim(list& bl, unsigned int flags = CLAIM_DEFAULT);
+ void claim_append(list& bl, unsigned int flags = CLAIM_DEFAULT);
+ // only for bl is bufferlist::page_aligned_appender
+ void claim_append_piecewise(list& bl);
+
+ // copy with explicit volatile-sharing semantics
+ /// Replace the contents with new nodes constructed from bl's nodes and
+ /// take over bl's length. Sharing a list with itself is a no-op.
+ void share(const list& bl)
+ {
+   if (this == &bl) {
+     return;
+   }
+   clear();
+   for (auto src = bl._buffers.begin(); src != bl._buffers.end(); ++src) {
+     _buffers.push_back(*ptr_node::create(*src).release());
+   }
+   _len = bl._len;
+ }
+
+#ifdef HAVE_SEASTAR
+ /// convert the bufferlist into a network packet
+ operator seastar::net::packet() &&;
+#endif
+
+ iterator begin() {
+ return iterator(this, 0);
+ }
+ iterator end() {
+ return iterator(this, _len, _buffers.end(), 0);
+ }
+
+ const_iterator begin() const {
+ return const_iterator(this, 0);
+ }
+ const_iterator cbegin() const {
+ return begin();
+ }
+ const_iterator end() const {
+ return const_iterator(this, _len, _buffers.end(), 0);
+ }
+
+ // crope lookalikes.
+ // **** WARNING: these are horribly inefficient for large bufferlists. ****
+ void copy(unsigned off, unsigned len, char *dest) const;
+ void copy(unsigned off, unsigned len, list &dest) const;
+ void copy(unsigned off, unsigned len, std::string& dest) const;
+ void copy_in(unsigned off, unsigned len, const char *src, bool crc_reset = true);
+ void copy_in(unsigned off, unsigned len, const list& src);
+
+ void append(char c);
+ void append(const char *data, unsigned len);
+ /// Append the bytes of `s`.
+ /// Taken by const reference: the string is only read, so passing it by
+ /// value would copy the whole payload for no benefit.
+ void append(const std::string& s) {
+   append(s.data(), s.length());
+ }
+#if __cplusplus >= 201703L
+ // To forcibly disambiguate between string and string_view in the
+ // case of arrays
+ template<std::size_t N>
+ void append(const char (&s)[N]) {
+ append(s, N);
+ }
+ void append(const char* s) {
+ append(s, strlen(s));
+ }
+ void append(std::string_view s) {
+ append(s.data(), s.length());
+ }
+#endif // __cplusplus >= 201703L
+ void append(const ptr& bp);
+ void append(ptr&& bp);
+ void append(const ptr& bp, unsigned off, unsigned len);
+ void append(const list& bl);
+ void append(std::istream& in);
+ contiguous_filler append_hole(unsigned len);
+ void append_zero(unsigned len);
+ void prepend_zero(unsigned len);
+
+ reserve_t obtain_contiguous_space(unsigned len);
+
+ /*
+ * get a char
+ */
+ const char& operator[](unsigned n) const;
+ char *c_str();
+ std::string to_str() const;
+
+ void substr_of(const list& other, unsigned off, unsigned len);
+
+ // funky modifier
+ void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */);
+ void write(int off, int len, std::ostream& out) const;
+
+ void encode_base64(list& o);
+ void decode_base64(list& o);
+
+ void write_stream(std::ostream &out) const;
+ void hexdump(std::ostream &out, bool trailing_newline = true) const;
+ int read_file(const char *fn, std::string *error);
+ ssize_t read_fd(int fd, size_t len);
+ int write_file(const char *fn, int mode=0644);
+ int write_fd(int fd) const;
+ int write_fd(int fd, uint64_t offset) const;
+ /// Fill *piov with one entry per buffer: iov_base is set to the buffer's
+ /// data pointer and iov_len to its length. Asserts that the number of
+ /// buffers does not exceed IOV_MAX. VectorT must provide resize() and
+ /// operator[] yielding elements with iov_base / iov_len members.
+ template<typename VectorT>
+ void prepare_iov(VectorT *piov) const {
+#ifdef __CEPH__
+ ceph_assert(_buffers.size() <= IOV_MAX);
+#else
+ assert(_buffers.size() <= IOV_MAX);
+#endif
+ piov->resize(_buffers.size());
+ unsigned n = 0;
+ for (auto& p : _buffers) {
+ // the cast drops const from c_str() because iov_base is assigned a void*
+ (*piov)[n].iov_base = (void *)p.c_str();
+ (*piov)[n].iov_len = p.length();
+ ++n;
+ }
+ }
+ uint32_t crc32c(uint32_t crc) const;
+ void invalidate_crc();
+ sha1_digest_t sha1();
+
+ // These functions return a bufferlist with a pointer to a single
+ // static buffer. They /must/ not outlive the memory they
+ // reference.
+ static list static_from_mem(char* c, size_t l);
+ static list static_from_cstring(char* c);
+ static list static_from_string(std::string& s);
+ };
+
+} // inline namespace v14_2_0
+
+ /*
+ * efficient hash of one or more bufferlists
+ */
+
+ /// Running crc32c over one or more bufferlists. Each update() feeds the
+ /// current value as the seed into bufferlist::crc32c() and stores the
+ /// result.
+ class hash {
+   uint32_t crc;
+
+ public:
+   /// Start from a zero seed.
+   hash() : crc(0) { }
+   /// Start from a caller-supplied seed (implicit conversion is intended).
+   // cppcheck-suppress noExplicitConstructor
+   hash(uint32_t init) : crc(init) { }
+
+   /// Fold the contents of `bl` into the running value.
+   void update(const buffer::list& bl) {
+     crc = bl.crc32c(crc);
+   }
+
+   /// Current value. Marked const so it can be read from a const hash.
+   uint32_t digest() const {
+     return crc;
+   }
+ };
+
+/// Lexicographic byte-wise "greater than". When one list is a prefix of the
+/// other, the longer one is greater. Takes const references because the
+/// lists are only read.
+inline bool operator>(const bufferlist& l, const bufferlist& r) {
+  const unsigned llen = l.length();
+  const unsigned rlen = r.length();
+  for (unsigned p = 0; ; p++) {
+    if (llen > p && rlen == p) return true;   // r is a proper prefix of l
+    if (llen == p) return false;              // l exhausted: equal or shorter
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+/// Lexicographic byte-wise "greater than or equal"; see operator>.
+inline bool operator>=(const bufferlist& l, const bufferlist& r) {
+  const unsigned llen = l.length();
+  const unsigned rlen = r.length();
+  for (unsigned p = 0; ; p++) {
+    if (llen > p && rlen == p) return true;   // r is a proper prefix of l
+    if (rlen == p && llen == p) return true;  // identical contents
+    if (llen == p && rlen > p) return false;  // l is a proper prefix of r
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+
+/// Byte-wise equality: same length and the same byte at every index.
+inline bool operator==(const bufferlist &l, const bufferlist &r) {
+  const unsigned len = l.length();
+  if (len != r.length())
+    return false;
+  for (unsigned p = 0; p < len; p++) {
+    if (l[p] != r[p])
+      return false;
+  }
+  return true;
+}
+/// Lexicographic "less than", expressed through operator>.
+inline bool operator<(const bufferlist& l, const bufferlist& r) {
+  return r > l;
+}
+/// Lexicographic "less than or equal", expressed through operator>=.
+inline bool operator<=(const bufferlist& l, const bufferlist& r) {
+  return r >= l;
+}
+
+
+std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
+
+std::ostream& operator<<(std::ostream& out, const buffer::raw &r);
+
+std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
+
+std::ostream& operator<<(std::ostream& out, const buffer::error& e);
+
+inline bufferhash& operator<<(bufferhash& l, const bufferlist &r) {
+ l.update(r);
+ return l;
+}
+
+} // namespace buffer
+
+#if defined(HAVE_XIO)
+xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
+#endif
+
+} // namespace ceph
+
+#endif
diff --git a/src/include/buffer_fwd.h b/src/include/buffer_fwd.h
new file mode 100644
index 00000000..7fac5963
--- /dev/null
+++ b/src/include/buffer_fwd.h
@@ -0,0 +1,19 @@
+#ifndef BUFFER_FWD_H
+#define BUFFER_FWD_H
+
+namespace ceph {
+ namespace buffer {
+ inline namespace v14_2_0 {
+ class ptr;
+ class list;
+ }
+ class hash;
+ }
+
+ using bufferptr = buffer::ptr;
+ using bufferlist = buffer::list;
+ using bufferhash = buffer::hash;
+}
+
+#endif
+
diff --git a/src/include/buffer_raw.h b/src/include/buffer_raw.h
new file mode 100644
index 00000000..7557795c
--- /dev/null
+++ b/src/include/buffer_raw.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BUFFER_RAW_H
+#define CEPH_BUFFER_RAW_H
+
+#include <atomic>
+#include <map>
+#include <utility>
+#include <type_traits>
+#include "include/buffer.h"
+#include "include/mempool.h"
+#include "include/spinlock.h"
+
+namespace ceph::buffer {
+inline namespace v14_2_0 {
+
+ /// Base class for the memory a buffer::ptr refers to. It holds the data
+ /// pointer and length, an atomic nref counter, the mempool the bytes are
+ /// accounted to, and a one-entry crc cache guarded by a spinlock.
+ /// Every change of len or mempool is mirrored into the mempool counters.
+ class raw {
+ public:
+   // In the future we might want to have a slab allocator here with few
+   // embedded slots. This would allow to avoid the "if" in dtor of ptr_node.
+   std::aligned_storage<sizeof(ptr_node),
+                        alignof(ptr_node)>::type bptr_storage;
+   char *data;
+   unsigned len;
+   std::atomic<unsigned> nref { 0 };
+   int mempool;
+
+   // range and value of the cached crc; max()/max() means "nothing cached"
+   std::pair<size_t, size_t> last_crc_offset {std::numeric_limits<size_t>::max(), std::numeric_limits<size_t>::max()};
+   std::pair<uint32_t, uint32_t> last_crc_val;
+
+   mutable ceph::spinlock crc_spinlock;
+
+   /// Create a raw of length l with no data pointer yet and account it
+   /// (one item, l bytes) to the given mempool.
+   explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon)
+     : data(nullptr), len(l), nref(0), mempool(mempool) {
+     mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
+   }
+   /// Create a raw describing existing memory c of length l and account it
+   /// to the given mempool.
+   raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon)
+     : data(c), len(l), nref(0), mempool(mempool) {
+     mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
+   }
+   /// Remove this buffer from its mempool's counters.
+   virtual ~raw() {
+     mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+       -1, -(int)len);
+   }
+
+   /// Change the recorded length, re-accounting the bytes in the mempool.
+   void _set_len(unsigned l) {
+     const auto index = mempool::pool_index_t(mempool);
+     mempool::get_pool(index).adjust_count(-1, -(int)len);
+     len = l;
+     mempool::get_pool(index).adjust_count(1, len);
+   }
+
+   /// Move the accounting of this buffer to another mempool; no-op when it
+   /// already belongs to `pool`.
+   void reassign_to_mempool(int pool) {
+     if (pool != mempool) {
+       mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+         -1, -(int)len);
+       mempool = pool;
+       mempool::get_pool(mempool::pool_index_t(pool)).adjust_count(1, len);
+     }
+   }
+
+   /// Reassign to `pool` only if the buffer is still in mempool_buffer_anon.
+   void try_assign_to_mempool(int pool) {
+     if (mempool != mempool::mempool_buffer_anon) {
+       return;
+     }
+     reassign_to_mempool(pool);
+   }
+
+ private:
+   // no copying.
+   // cppcheck-suppress noExplicitConstructor
+   raw(const raw &other) = delete;
+   const raw& operator=(const raw &other) = delete;
+ public:
+   char *get_data() {
+     return data;
+   }
+   /// Subclasses return a new raw to receive a copy of this one's bytes.
+   virtual raw* clone_empty() = 0;
+   /// Copy `len` bytes of this buffer into a raw obtained from clone_empty().
+   ceph::unique_leakable_ptr<raw> clone() {
+     raw* const duplicate = clone_empty();
+     memcpy(duplicate->data, data, len);
+     return ceph::unique_leakable_ptr<raw>(duplicate);
+   }
+   virtual bool is_shareable() const {
+     // true if safe to reference/share the existing buffer copy
+     // false if it is not safe to share the buffer, e.g., due to special
+     // and/or registered memory that is scarce
+     return true;
+   }
+   /// Look up the cached crc for the range `fromto`. On a hit *crc is
+   /// filled in and true is returned; otherwise *crc is left untouched.
+   bool get_crc(const std::pair<size_t, size_t> &fromto,
+                std::pair<uint32_t, uint32_t> *crc) const {
+     std::lock_guard lg(crc_spinlock);
+     const bool hit = (last_crc_offset == fromto);
+     if (hit) {
+       *crc = last_crc_val;
+     }
+     return hit;
+   }
+   /// Remember `crc` as the cached value for the range `fromto`.
+   void set_crc(const std::pair<size_t, size_t> &fromto,
+                const std::pair<uint32_t, uint32_t> &crc) {
+     std::lock_guard lg(crc_spinlock);
+     last_crc_offset = fromto;
+     last_crc_val = crc;
+   }
+   /// Drop the cached crc by resetting the cached range to max()/max().
+   void invalidate_crc() {
+     std::lock_guard lg(crc_spinlock);
+     const size_t none = std::numeric_limits<size_t>::max();
+     last_crc_offset.first = none;
+     last_crc_offset.second = none;
+   }
+ };
+
+} // inline namespace v14_2_0
+} // namespace ceph::buffer
+
+#endif // CEPH_BUFFER_RAW_H
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
new file mode 100644
index 00000000..85268543
--- /dev/null
+++ b/src/include/byteorder.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <type_traits>
+#include "acconfig.h"
+#include "int_types.h"
+
+
+// swab: return `val` with its bytes in reverse order.  One overload each for
+// 2-, 4- and 8-byte types, selected on sizeof(T).
+#ifdef __GNUC__
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
+  return __builtin_bswap16(val);
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
+  return __builtin_bswap32(val);
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
+  return __builtin_bswap64(val);
+}
+#else
+// Portable fallback.  The shifts are performed on the fixed-width unsigned
+// equivalent of the value: shifting a negative signed T right would smear
+// the sign bit into the result, and shifting it left would overflow.
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
+  const uint16_t u = static_cast<uint16_t>(val);
+  return static_cast<T>(static_cast<uint16_t>((u >> 8) | (u << 8)));
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
+  const uint32_t u = static_cast<uint32_t>(val);
+  return static_cast<T>(( u >> 24) |
+                        ((u >> 8) & 0xff00u) |
+                        ((u << 8) & 0xff0000u) |
+                        ((u << 24)));
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
+  const uint64_t u = static_cast<uint64_t>(val);
+  return static_cast<T>(( u >> 56) |
+                        ((u >> 40) & 0xff00ull) |
+                        ((u >> 24) & 0xff0000ull) |
+                        ((u >> 8) & 0xff000000ull) |
+                        ((u << 8) & 0xff00000000ull) |
+                        ((u << 24) & 0xff0000000000ull) |
+                        ((u << 40) & 0xff000000000000ull) |
+                        ((u << 56)));
+}
+#endif
+
+// mswab == "maybe swab": byte-swaps via swab() when CEPH_BIG_ENDIAN is
+// defined and returns the value unchanged otherwise.
+template<typename T>
+inline T mswab(T val) {
+#ifdef CEPH_BIG_ENDIAN
+  return swab(val);
+#else
+  return val;
+#endif
+}
+
+// Wrapper holding a value of type T in little-endian representation.
+// Assigning a T stores mswab(nv) in `v`; converting back to T returns
+// mswab(v).  `v` is the raw stored form.  The struct is declared packed.
+template<typename T>
+struct ceph_le {
+ T v;
+ ceph_le<T>& operator=(T nv) {
+ v = mswab(nv);
+ return *this;
+ }
+ operator T() const { return mswab(v); }
+} __attribute__ ((packed));
+
+// Equality on the stored representations (`v`); both operands hold the same
+// encoding, so no conversion is done before comparing.
+template<typename T>
+inline bool operator==(ceph_le<T> a, ceph_le<T> b) {
+  const bool same = (a.v == b.v);
+  return same;
+}
+
+using ceph_le64 = ceph_le<__u64>;
+using ceph_le32 = ceph_le<__u32>;
+using ceph_le16 = ceph_le<__u16>;
+
+// Build a ceph_le64 from x; the assignment operator performs the mswab.
+inline ceph_le64 init_le64(__u64 x) {
+  ceph_le64 le;
+  le = x;
+  return le;
+}
+// Build a ceph_le32 from x; the assignment operator performs the mswab.
+inline ceph_le32 init_le32(__u32 x) {
+  ceph_le32 le;
+  le = x;
+  return le;
+}
+// Build a ceph_le16 from x; the assignment operator performs the mswab.
+inline ceph_le16 init_le16(__u16 x) {
+  ceph_le16 le;
+  le = x;
+  return le;
+}
+
+ /*
+#define cpu_to_le64(x) (x)
+#define cpu_to_le32(x) (x)
+#define cpu_to_le16(x) (x)
+ */
+// Convert to a host-order integer by casting.  The argument is parenthesized
+// so the cast applies to the whole expression passed in rather than only to
+// its first operand (e.g. when the argument is a conditional expression).
+#define le64_to_cpu(x) ((uint64_t)(x))
+#define le32_to_cpu(x) ((__u32)(x))
+#define le16_to_cpu(x) ((__u16)(x))
diff --git a/src/include/ceph_assert.h b/src/include/ceph_assert.h
new file mode 100644
index 00000000..36d6c430
--- /dev/null
+++ b/src/include/ceph_assert.h
@@ -0,0 +1,147 @@
+#ifndef CEPH_ASSERT_H
+#define CEPH_ASSERT_H
+
+#include <cstdlib>
+#include <string>
+
+#if defined(__linux__)
+#include <features.h>
+
+#ifndef __STRING
+# define __STRING(x) #x
+#endif
+
+#elif defined(__FreeBSD__)
+#include <sys/cdefs.h>
+/* Alias: forwards both arguments, in order, to __GNUC_PREREQ__ (presumably
+ * supplied by the <sys/cdefs.h> included just above). */
+#define __GNUC_PREREQ(maj, min) __GNUC_PREREQ__(maj, min)
+#elif defined(__sun) || defined(_AIX)
+#include "include/compat.h"
+#include <assert.h>
+#endif
+
+#ifdef __CEPH__
+# include "acconfig.h"
+#endif
+
+class CephContext;
+
+namespace ceph {
+
+struct BackTrace;
+
+/*
+ * Select a function-name variable based on compiler tests, and any compiler
+ * specific overrides.
+ */
+#if defined(HAVE_PRETTY_FUNC)
+# define __CEPH_ASSERT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(HAVE_FUNC)
+# define __CEPH_ASSERT_FUNCTION __func__
+#else
+# define __CEPH_ASSERT_FUNCTION ((__const char *) 0)
+#endif
+
+extern void register_assert_context(CephContext *cct);
+
+// The four values describing one assertion site.  The non-ASan variants of
+// ceph_assert()/ceph_assert_always() in this header initialise one as a
+// function-local static and hand it to __ceph_assert_fail(const assert_data&).
+struct assert_data {
+ const char *assertion;
+ const char *file;
+ const int line;
+ const char *function;
+};
+
+extern void __ceph_assert_fail(const char *assertion, const char *file, int line, const char *function)
+ __attribute__ ((__noreturn__));
+extern void __ceph_assert_fail(const assert_data &ctx)
+ __attribute__ ((__noreturn__));
+
+extern void __ceph_assertf_fail(const char *assertion, const char *file, int line, const char *function, const char* msg, ...)
+ __attribute__ ((__noreturn__));
+extern void __ceph_assert_warn(const char *assertion, const char *file, int line, const char *function);
+
+[[noreturn]] void __ceph_abort(const char *file, int line, const char *func,
+ const std::string& msg);
+
+[[noreturn]] void __ceph_abortf(const char *file, int line, const char *func,
+ const char* msg, ...);
+
+#define _CEPH_ASSERT_VOID_CAST static_cast<void>
+
+#define assert_warn(expr) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assert_warn (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
+
+}
+
+using namespace ceph;
+
+
+/*
+ * ceph_abort aborts the program with a nice backtrace.
+ *
+ * Currently, it's the same as assert(0), but we may one day make assert a
+ * debug-only thing, like it is in many projects.
+ */
+// NOTE: any arguments given to ceph_abort() are discarded by the expansion;
+// the message passed to __ceph_abort is always "abort() called".  Use
+// ceph_abort_msg() or ceph_abort_msgf() to supply a message.
+#define ceph_abort(msg, ...) \
+ __ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, "abort() called")
+
+#define ceph_abort_msg(msg) \
+ __ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, msg)
+
+#define ceph_abort_msgf(...) \
+ __ceph_abortf( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)
+
+// ceph_assert(expr): evaluates expr once and, when it is false, calls
+// __ceph_assert_fail with the stringified expression, file, line and
+// function name.  Under __SANITIZE_ADDRESS__ the four values are passed as
+// separate arguments; otherwise they are kept in a function-local static
+// assert_data and passed by reference.
+#ifdef __SANITIZE_ADDRESS__
+#define ceph_assert(expr) \
+ do { \
+ ((expr)) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
+ } while (false)
+#else
+#define ceph_assert(expr) \
+ do { static const ceph::assert_data assert_data_ctx = \
+ {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assert_fail(assert_data_ctx)); } while(false)
+#endif
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assert currently doesn't either, but in the future it might.)
+#ifdef __SANITIZE_ADDRESS__
+#define ceph_assert_always(expr) \
+ do { \
+ ((expr)) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
+ } while(false)
+#else
+#define ceph_assert_always(expr) \
+ do { static const ceph::assert_data assert_data_ctx = \
+ {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assert_fail(assert_data_ctx)); } while(false)
+#endif
+
+// Named by analogy with printf. Along with an expression, takes a format
+// string and parameters which are printed if the assertion fails.
+#define assertf(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+#define ceph_assertf(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assertf currently doesn't either, but in the future it might.)
+#define ceph_assertf_always(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+#endif
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
new file mode 100644
index 00000000..6fec3a0c
--- /dev/null
+++ b/src/include/ceph_features.h
@@ -0,0 +1,279 @@
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+#include "sys/types.h"
+
+/*
+ * Each time we reclaim bits for reuse we need to specify another
+ * bitmask that, if all bits are set, indicates we have the new
+ * incarnation of that feature. Base case is 1 (first use)
+ */
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
+ const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored but still advertised by release *when*
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+ const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored by release *unused* and not advertised by
+// release *unadvertised*
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+// test for a feature. this test is safer than a typical mask against
+// the bit because it ensures that we have the bit AND the marker for the
+// bit's incarnation. this must be used in any case where the features
+// bits may include an old meaning of the bit.
+#define HAVE_FEATURE(x, name) \
+ (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * A *major* release is a release through which all upgrades must pass
+ * (e.g., jewel). For example, no pre-jewel server will ever talk to
+ * a post-jewel server (mon, osd, etc).
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ * - In the first phase we indicate that a feature is DEPRECATED as of
+ * a particular release. This is the first major release X (say,
+ * jewel) that does not depend on its peers advertising the feature.
+ * That is, it safely assumes its peers all have the feature. We
+ * indicate this with the DEPRECATED macro. For example,
+ *
+ * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
+ *
+ * because 10.2.z (jewel) did not care if its peers advertised this
+ * feature bit.
+ *
+ * - In the second phase we stop advertising the bit and call it
+ * RETIRED. This can normally be done in the *next* major release
+ * following the one in which we marked the feature DEPRECATED. In
+ * the above example, for 12.0.z (luminous) we can say:
+ *
+ * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+ *
+ * - The bit can be reused in the first post-luminous release, 13.0.z
+ * (m).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
+ */
+
+/*
+ * Notes on the kernel client:
+ *
+ * - "X" means that the feature bit has been advertised and supported
+ * since kernel X
+ *
+ * - "X req" means that the feature bit has been advertised and required
+ * since kernel X
+ *
+ * The remaining feature bits are not and have never been used by the
+ * kernel client.
+ */
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) // 2.6.35 req
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK) // 2.6.36
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) // 4.6 req
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) // 3.10 req
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) // 2.6.38
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64) // 3.9 req
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) // 3.9 req
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC) // 3.9 req
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(16, 3, SERVER_O)
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(17, 3, OS_PERF_STAT_NS)
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) // 3.6
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT)
+
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) // 4.13
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(22, 2, OSD_FIXED_COLLECTION_LIST)
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) // 3.19 req (unless nocephx_require_signatures)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(24, 2, RECOVERY_RESERVATION_2)
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) // 3.9
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) // 3.9
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC) // 4.7
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) // 3.9
+DEFINE_CEPH_FEATURE_DEPRECATED(31, 1, MON_SINGLE_PAXOS, NAUTILUS)
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) // 3.14
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) // 3.14
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) // 3.14
+DEFINE_CEPH_FEATURE_DEPRECATED(38, 1, OSD_ERASURE_CODES, MIMIC)
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) // 3.15
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) // 3.19
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) // 3.15
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) // 4.3 (for consistency)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) // 4.13
+DEFINE_CEPH_FEATURE_DEPRECATED(44, 1, ERASURE_CODE_PLUGINS_V2, MIMIC)
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) // 4.17
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) // 4.1
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE_DEPRECATED(50, 1, MON_METADATA, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(51, 1, OSD_BITWISE_HOBJ_SORT, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(53, 1, ERASURE_CODE_PLUGINS_V3, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(54, 1, OSD_HITSET_GMT, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(55, 1, HAMMER_0_94_4, MIMIC)
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) // 4.13 (for pg_pool_t >= v25)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) // 4.13
+DEFINE_CEPH_FEATURE_DEPRECATED(57, 1, MON_ROUTE_OSDMAP, MIMIC) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) // 4.5
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // 4.19, *do not share this bit*
+
+DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinel
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
+
+/*
+ * Features supported. Should be everything above.
+ */
+#define CEPH_FEATURES_ALL \
+ (CEPH_FEATURE_UID | \
+ CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_FLOCK | \
+ CEPH_FEATURE_SUBSCRIBE2 | \
+ CEPH_FEATURE_MONNAMES | \
+ CEPH_FEATURE_RECONNECT_SEQ | \
+ CEPH_FEATURE_DIRLAYOUTHASH | \
+ CEPH_FEATURE_OBJECTLOCATOR | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_INCSUBOSDMAP | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDREPLYMUX | \
+ CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_MONENC | \
+ CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_MSG_AUTH | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_CREATEPOOLID | \
+ CEPH_FEATURE_REPLY_CREATE_INODE | \
+ CEPH_FEATURE_MDSENC | \
+ CEPH_FEATURE_OSDHASHPSPOOL | \
+ CEPH_FEATURE_NEW_OSDOP_ENCODING | \
+ CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \
+ DEPRECATED_CEPH_FEATURE_MON_SINGLE_PAXOS | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_EXPORT_PEER | \
+ DEPRECATED_CEPH_FEATURE_OSD_ERASURE_CODES | \
+ CEPH_FEATURE_OSDMAP_ENC | \
+ CEPH_FEATURE_MDS_INLINE_DATA | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
+ CEPH_FEATURE_MSGR_KEEPALIVE2 | \
+ CEPH_FEATURE_OSD_POOLRESEND | \
+ DEPRECATED_CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 | \
+ CEPH_FEATURE_OSD_FADVISE_FLAGS | \
+ CEPH_FEATURE_MDS_QUOTA | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ DEPRECATED_CEPH_FEATURE_MON_METADATA | \
+ DEPRECATED_CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT | \
+ DEPRECATED_CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 | \
+ DEPRECATED_CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES | \
+ DEPRECATED_CEPH_FEATURE_OSD_HITSET_GMT | \
+ DEPRECATED_CEPH_FEATURE_HAMMER_0_94_4 | \
+ CEPH_FEATURE_MON_STATEFUL_SUB | \
+ DEPRECATED_CEPH_FEATURE_MON_ROUTE_OSDMAP | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_SERVER_JEWEL | \
+ CEPH_FEATURE_FS_FILE_LAYOUT_V2 | \
+ CEPH_FEATURE_SERVER_KRAKEN | \
+ CEPH_FEATURE_FS_BTIME | \
+ CEPH_FEATURE_FS_CHANGE_ATTR | \
+ CEPH_FEATURE_MSG_ADDR2 | \
+ CEPH_FEATURE_SERVER_LUMINOUS | \
+ CEPH_FEATURE_RESEND_ON_SPLIT | \
+ CEPH_FEATURE_RADOS_BACKOFF | \
+ CEPH_FEATURE_OSD_RECOVERY_DELETES | \
+ CEPH_FEATURE_SERVER_MIMIC | \
+ CEPH_FEATURE_RECOVERY_RESERVATION_2 | \
+ CEPH_FEATURE_SERVER_NAUTILUS | \
+ CEPH_FEATURE_CEPHX_V2 | \
+ CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \
+ CEPH_FEATURE_OSD_FIXED_COLLECTION_LIST | \
+ 0ULL)
+
+#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
+
+/*
+ * crush related features
+ */
+#define CEPH_FEATURES_CRUSH \
+ (CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS)
+
+/*
+ * make sure we don't try to use the reserved features
+ */
+/* Compile-time check: the array size is -1 (ill-formed) when x is false and
+ * 1 when x is true.  Size 1 is used for the passing case because a
+ * zero-length array is a compiler extension rather than standard C/C++. */
+#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 1]))
+
+/* The body is a single CEPH_STATIC_ASSERT: compilation fails if either
+ * reserved bit appears in CEPH_FEATURES_ALL. */
+static inline void ____build_time_check_for_reserved_bits(void) {
+  CEPH_STATIC_ASSERT(
+    (CEPH_FEATURES_ALL &
+     (DEPRECATED_CEPH_FEATURE_RESERVED_BROKEN |
+      CEPH_FEATURE_RESERVED)) == 0);
+}
+
+#endif
diff --git a/src/include/ceph_frag.h b/src/include/ceph_frag.h
new file mode 100644
index 00000000..5babb8e9
--- /dev/null
+++ b/src/include/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+/* Encode a frag: b goes into the upper 8 bits, and v is masked down to its
+ * b most significant bits within the lower 24-bit value field. */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+  const __u32 keep = (0xffffffu << (24-b)) & 0xffffffu;
+  return (b << 24) | (v & keep);
+}
+/* The "bits" field: the upper 8 bits of the encoded frag. */
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+  const __u32 bits = f >> 24;
+  return bits;
+}
+/* The "value" field: the lower 24 bits of the encoded frag. */
+static inline __u32 ceph_frag_value(__u32 f)
+{
+  const __u32 value = f & 0xffffffu;
+  return value;
+}
+/* 24-bit mask with the top ceph_frag_bits(f) bits set. */
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+  const __u32 shift = 24-ceph_frag_bits(f);
+  return (0xffffffu << shift) & 0xffffffu;
+}
+/* Count of low value bits not covered by the mask: 24 minus the bit count. */
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+  return 24 - bits;
+}
+
+/* Non-zero when v, reduced by f's mask, equals f's value. */
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+  const __u32 masked = v & ceph_frag_mask(f);
+  return masked == ceph_frag_value(f);
+}
+/* Non-zero when sub is at least as specific as f (has at least as many bits)
+ * and sub's value, reduced by f's mask, equals f's value. */
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+  if (ceph_frag_bits(sub) < ceph_frag_bits(f))
+    return 0;
+  return (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+/* Frag with one bit fewer than f, keeping the value bits selected by f's
+ * mask shifted left by one.
+ * NOTE(review): no guard for ceph_frag_bits(f) == 0 - callers presumably
+ * never ask for the parent of the root; confirm. */
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+  const __u32 parent_mask = ceph_frag_mask(f) << 1;
+  return ceph_frag_make(bits - 1, ceph_frag_value(f) & parent_mask);
+}
+/* Non-zero when f has at least one bit and the least significant of its
+ * significant value bits is clear. */
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+  if (bits == 0)
+    return 0;
+  return (ceph_frag_value(f) & (0x1000000 >> bits)) == 0;
+}
+/* Non-zero when f has at least one bit and the least significant of its
+ * significant value bits is set.  The masked value is either 0 or the bit
+ * itself (0x1000000 >> bits), so it is tested against zero; comparing it
+ * with 1 could only ever match when bits == 24. */
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+  return ceph_frag_bits(f) > 0 &&
+    (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
+}
+/* Same bit count as f, with the least significant of the significant value
+ * bits flipped. */
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+  const __u32 flip = 0x1000000 >> bits;
+  return ceph_frag_make(bits, ceph_frag_value(f) ^ flip);
+}
+/* One more bit than f, value unchanged (the newly significant bit is 0). */
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+  const __u32 child_bits = ceph_frag_bits(f)+1;
+  return ceph_frag_make(child_bits, ceph_frag_value(f));
+}
+/* One more bit than f, with the newly significant value bit set. */
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+  const __u32 new_bit = 0x1000000 >> (1+bits);
+  return ceph_frag_make(bits+1, ceph_frag_value(f) | new_bit);
+}
+/* Child of f that is `by` bits deeper: i is shifted into position just below
+ * the new bit count and OR-ed into f's value. */
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+  const int child_bits = ceph_frag_bits(f) + by;
+  const int placed = i << (24 - child_bits);
+  return ceph_frag_make(child_bits, ceph_frag_value(f) | placed);
+}
+/* Non-zero when f's value field is 0. */
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+  return !ceph_frag_value(f);
+}
+/* Non-zero when every bit selected by f's mask is set in f's value. */
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+  const __u32 all_ones = ceph_frag_mask(f);
+  return ceph_frag_value(f) == all_ones;
+}
+/* Same bit count as f, with the value advanced by one step at that depth
+ * (0x1000000 >> bits). */
+static inline __u32 ceph_frag_next(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+  const __u32 step = 0x1000000 >> bits;
+  return ceph_frag_make(bits, ceph_frag_value(f) + step);
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
new file mode 100644
index 00000000..1c73ff37
--- /dev/null
+++ b/src/include/ceph_fs.h
@@ -0,0 +1,982 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2.1
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * The data structures defined here are shared between Linux kernel and
+ * user space. Also, those data structures are maintained always in
+ * little-endian byte order, even on big-endian systems. This is handled
+ * differently in kernel vs. user space. For use as kernel headers, the
+ * little-endian fields need to use the __le16/__le32/__le64 types. These
+ * are markers that indicate endian conversion routines must be used
+ * whenever such fields are accessed, which can be verified by checker
+ * tools like "sparse". For use as user-space headers, the little-endian
+ * fields instead use types ceph_le16/ceph_le32/ceph_le64, which are C++
+ * classes that implement automatic endian conversion on every access.
+ * To still allow for header sharing, this file uses the __le types, but
+ * redefines those to the ceph_ types when compiled in user space.
+ */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * subprotocol versions. when specific messages types or high-level
+ * protocols change, bump the affected components. we rev the
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSDC_PROTOCOL 24 /* server/client */
+#define CEPH_MDSC_PROTOCOL 32 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+#define CEPH_INO_LOST_AND_FOUND 4 /* reserved ino for use in recovery */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+struct ceph_dir_layout {
+ __u8 dl_dir_hash; /* see ceph_hash.h for ids */
+ __u8 dl_unused1;
+ __u16 dl_unused2;
+ __u32 dl_unused3;
+} __attribute__ ((packed));
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN 0x0
+#define CEPH_CON_MODE_CRC 0x1
+#define CEPH_CON_MODE_SECURE 0x2
+
+extern const char *ceph_con_mode_name(int con_mode);
+
+/* For options with "_", like: GSS_GSS
+ which means: Mode/Protocol to validate "authentication_authorization",
+ where:
+ - Authentication: Verifying the identity of an entity.
+ - Authorization: Verifying that an authenticated entity has
+ the right to access a particular resource.
+*/
+#define CEPH_AUTH_GSS 0x4
+#define CEPH_AUTH_GSS_GSS CEPH_AUTH_GSS
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_MON_GET_OSDMAP 6
+#define CEPH_MSG_MON_METADATA 7
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+#define CEPH_MSG_MON_GET_VERSION 19
+#define CEPH_MSG_MON_GET_VERSION_REPLY 20
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_RECLAIM 27
+#define CEPH_MSG_CLIENT_RECLAIM_REPLY 28
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+#define CEPH_MSG_CLIENT_QUOTA 0x314
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY 48
+#define CEPH_MSG_POOLOP 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+#define CEPH_MSG_WATCH_NOTIFY 44
+#define CEPH_MSG_OSD_BACKOFF 61
+
+/* FSMap subscribers (see all MDS clusters at once) */
+#define CEPH_MSG_FS_MAP 45
+/* FSMapUser subscribers (get MDS clusters name->ID mapping) */
+#define CEPH_MSG_FS_MAP_USER 103
+
+/* watch-notify operations */
+enum {
+ CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
+};
+
+const char *ceph_watch_event_name(int o);
+
+/* pool operations */
+enum {
+ POOL_OP_CREATE = 0x01,
+ POOL_OP_DELETE = 0x02,
+ POOL_OP_AUID_CHANGE = 0x03,
+ POOL_OP_CREATE_SNAP = 0x11,
+ POOL_OP_DELETE_SNAP = 0x12,
+ POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
+ POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
+};
+
+/*
+ * Header embedded (as `monhdr`) at the start of the monitor message
+ * structs in this file: ceph_mon_statfs, ceph_mon_poolop,
+ * ceph_mon_poolop_reply, ceph_osd_getmap, ceph_mds_getmap and
+ * ceph_client_mount.  Declared packed; all fields are little-endian types.
+ */
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 pool;
+ __le32 op;
+ __le64 __old_auid; // obsolete
+ __le64 snapid;
+ __le32 name_len;
+} __attribute__ ((packed));
+
+/*
+ * Reply to a pool operation.  `data[0]` is a zero-length trailing array, so
+ * it adds nothing to sizeof() and only names the bytes that follow the
+ * fixed-size part.
+ * NOTE(review): `has_data` presumably says whether such trailing bytes are
+ * present - confirm against the encoder/decoder.
+ */
+struct ceph_mon_poolop_reply {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 reply_code;
+ __le32 epoch;
+ char has_data;
+ char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+ __le64 snapid;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
+
+struct ceph_mon_subscribe_item {
+ __le64 start;
+ __u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_NOT_JOINABLE (1<<0) /* standbys cannot join */
+#define CEPH_MDSMAP_DOWN (CEPH_MDSMAP_NOT_JOINABLE) /* backwards compat */
+#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */
+/* deprecated #define CEPH_MDSMAP_ALLOW_MULTIMDS (1<<2) cluster allowed to have >1 active MDS */
+/* deprecated #define CEPH_MDSMAP_ALLOW_DIRFRAGS (1<<3) cluster allowed to fragment directories */
+#define CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS (1<<4) /* cluster allowed to enable MULTIMDS
+ and SNAPS at the same time */
+#define CEPH_MDSMAP_ALLOW_STANDBY_REPLAY (1<<5) /* cluster allowed to enable STANDBY_REPLAY */
+
+#define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \
+ CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS)
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE -9 /* Legacy, unused */
+#define CEPH_MDS_STATE_NULL -10
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+#define CEPH_MDS_STATE_DAMAGED 15 /* rank not replayable, need repair */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DVERSION 1
+#define CEPH_LOCK_DN 2
+#define CEPH_LOCK_IVERSION 16 /* mds internal */
+#define CEPH_LOCK_ISNAP 32
+#define CEPH_LOCK_IFILE 64
+#define CEPH_LOCK_IAUTH 128
+#define CEPH_LOCK_ILINK 256
+#define CEPH_LOCK_IDFT 512 /* dir frag tree */
+#define CEPH_LOCK_INEST 1024 /* mds internal */
+#define CEPH_LOCK_IXATTR 2048
+#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
+#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
+#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN = 0,
+ CEPH_SESSION_OPEN = 1,
+ CEPH_SESSION_REQUEST_CLOSE = 2,
+ CEPH_SESSION_CLOSE = 3,
+ CEPH_SESSION_REQUEST_RENEWCAPS = 4,
+ CEPH_SESSION_RENEWCAPS = 5,
+ CEPH_SESSION_STALE = 6,
+ CEPH_SESSION_RECALL_STATE = 7,
+ CEPH_SESSION_FLUSHMSG = 8,
+ CEPH_SESSION_FLUSHMSG_ACK = 9,
+ CEPH_SESSION_FORCE_RO = 10,
+ // Sent in reply to REQUEST_OPEN: tells the client to permanently
+ // stop contacting the MDS
+ CEPH_SESSION_REJECT = 11,
+ CEPH_SESSION_REQUEST_FLUSH_MDLOG = 12
+};
+
+// flags for state reclaim
+#define CEPH_RECLAIM_RESET 1
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op; /* presumably one of the CEPH_SESSION_* ops -- confirm */
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed)); /* packed: no padding is inserted between the fields */
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ * & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+ CEPH_MDS_OP_LOOKUPINO = 0x00104,
+ CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+ CEPH_MDS_OP_SETFILELOCK= 0x01109,
+ CEPH_MDS_OP_GETFILELOCK= 0x00110,
+ CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x01301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+ CEPH_MDS_OP_RENAMESNAP = 0x01403,
+
+ // internal op
+ CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
+ CEPH_MDS_OP_EXPORTDIR = 0x01501,
+ CEPH_MDS_OP_FLUSH = 0x01502,
+ CEPH_MDS_OP_ENQUEUE_SCRUB = 0x01503,
+ CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504,
+ CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505,
+ CEPH_MDS_OP_UPGRADE_SNAPREALM = 0x01506
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+#ifndef CEPH_SETATTR_MODE
+#define CEPH_SETATTR_MODE (1 << 0)
+#define CEPH_SETATTR_UID (1 << 1)
+#define CEPH_SETATTR_GID (1 << 2)
+#define CEPH_SETATTR_MTIME (1 << 3)
+#define CEPH_SETATTR_ATIME (1 << 4)
+#define CEPH_SETATTR_SIZE (1 << 5)
+#define CEPH_SETATTR_CTIME (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME (1 << 9)
+#endif
+#define CEPH_SETATTR_KILL_SGUID (1 << 10)
+
+/*
+ * open request flags
+ */
+#define CEPH_O_RDONLY 00000000
+#define CEPH_O_WRONLY 00000001
+#define CEPH_O_RDWR 00000002
+#define CEPH_O_CREAT 00000100
+#define CEPH_O_EXCL 00000200
+#define CEPH_O_TRUNC 00001000
+#define CEPH_O_LAZY 00020000
+#define CEPH_O_DIRECTORY 00200000
+#define CEPH_O_NOFOLLOW 00400000
+
+int ceph_flags_sys2wire(int flags);
+
+/*
+ * Ceph setxattr request flags.
+ */
+#define CEPH_XATTR_CREATE (1 << 0)
+#define CEPH_XATTR_REPLACE (1 << 1)
+#define CEPH_XATTR_REMOVE (1 << 31) /* NOTE(review): shifts into the sign bit of a 32-bit int (undefined in C); 1U << 31 may be safer -- confirm against users */
+
+/*
+ * readdir request flags;
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
+#define CEPH_READDIR_HASH_ORDER (1<<9)
+#define CEPH_READDIR_OFFSET_HASH (1<<10)
+
+/* Note that this is embedded within ceph_mds_request_head_legacy. */
+union ceph_mds_request_args_legacy {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 pool; /* if >= 0 and CREATEPOOLID feature */
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size; /* if O_TRUNC */
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ __le32 osdmap_epoch; /* use for set file/dir layout */
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+
+struct ceph_mds_request_head_legacy {
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_legacy args;
+} __attribute__ ((packed));
+
+/*
+ * Note that this is embedded within ceph_mds_request_head. Also, compatibility
+ * with the ceph_mds_request_args_legacy must be maintained!
+ */
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ struct ceph_timespec btime;
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 pool; /* if >= 0 and CREATEPOOLID feature */
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size; /* if O_TRUNC */
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ __le32 osdmap_epoch; /* use for set file/dir layout */
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 snapid;
+ __le64 parent;
+ __le32 hash;
+ } __attribute__ ((packed)) lookupino;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_REQUEST_HEAD_VERSION 1
+
+/*
+ * Note that any change to this structure must ensure that it is compatible
+ * with ceph_mds_request_head_legacy.
+ */
+struct ceph_mds_request_head {
+ __le16 version;
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+static inline void /* fill *head from *legacy; head->version is left untouched */
+copy_from_legacy_head(struct ceph_mds_request_head *head,
+ const struct ceph_mds_request_head_legacy *legacy)
+{
+ memcpy(&(head->oldest_client_tid), legacy, sizeof(*legacy)); /* writes sizeof(*legacy) bytes from oldest_client_tid on; any part of head->args beyond that is not written */
+}
+
+static inline void /* fill *legacy from *head; head->version is not copied */
+copy_to_legacy_head(struct ceph_mds_request_head_legacy *legacy,
+ const struct ceph_mds_request_head *head)
+{
+ memcpy(legacy, &(head->oldest_client_tid), sizeof(*legacy)); /* reads sizeof(*legacy) bytes starting at oldest_client_tid */
+}
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* ask client to release the cap */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL 1
+#define CEPH_LOCK_FLOCK 2
+#define CEPH_LOCK_FCNTL_INTR 3
+#define CEPH_LOCK_FLOCK_INTR 4
+
+#define CEPH_LOCK_SHARED 1
+#define CEPH_LOCK_EXCL 2
+#define CEPH_LOCK_UNLOCK 4
+
+struct ceph_filelock {
+ __le64 start;/* file offset to start lock at */
+ __le64 length; /* num bytes to lock; 0 for all following start */
+ __le64 client; /* which client holds the lock */
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id holding the lock on the client */
+ __u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+/* inline data state */
+#define CEPH_INLINE_NONE ((__u64)-1)
+#define CEPH_INLINE_MAX_SIZE CEPH_MIN_STRIPE_UNIT
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+/* note: these definitions are duplicated in mds/locks.c */
+#define CEPH_CAP_GSHARED 1 /* client can read */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+#define CEPH_CAP_SIMPLE_BITS 2
+#define CEPH_CAP_FILE_BITS 8
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) ((x) << CEPH_CAP_SFILE) /* shift generic cap bits into the file field; x is parenthesized so expressions like (a | b) shift as a whole */
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+ CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT = 0, /* mds->client: grant */
+ CEPH_CAP_OP_REVOKE = 1, /* mds->client: revoke */
+ CEPH_CAP_OP_TRUNC = 2, /* mds->client: trunc notify */
+ CEPH_CAP_OP_EXPORT = 3, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT = 4, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE = 5, /* client->mds: update */
+ CEPH_CAP_OP_DROP = 6, /* client->mds: drop cap bits */
+ CEPH_CAP_OP_FLUSH = 7, /* client->mds: cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK = 8, /* mds->client: flushed */
+ CEPH_CAP_OP_FLUSHSNAP = 9, /* client->mds: flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK = 10, /* mds->client: flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE = 11, /* client->mds: release (clean) cap */
+ CEPH_CAP_OP_RENEW = 12, /* client->mds: renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/* extra info for cap import/export */
+struct ceph_mds_cap_peer {
+ __le64 cap_id;
+ __le32 seq;
+ __le32 mseq;
+ __le32 mds;
+ __u8 flags;
+} __attribute__ ((packed));
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps_head {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+} __attribute__ ((packed));
+
+struct ceph_mds_caps_body_legacy {
+ union {
+ /* all except export */
+ struct {
+ /* filelock */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout layout;
+ __le32 time_warp_seq;
+ };
+ /* export message */
+ struct ceph_mds_cap_peer peer;
+ };
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+ __le32 flock_len; /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE = 0, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE = 1,
+ CEPH_SNAP_OP_DESTROY = 2,
+ CEPH_SNAP_OP_SPLIT = 3,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+ __le32 num_split_realms; /* # child realms under new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/ceph_fuse.h b/src/include/ceph_fuse.h
new file mode 100644
index 00000000..45881930
--- /dev/null
+++ b/src/include/ceph_fuse.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#ifndef CEPH_FUSE_H
+#define CEPH_FUSE_H
+
+#define FUSE_USE_VERSION 30
+#include "acconfig.h"
+#include <fuse.h>
+
+/* Invoke a directory filler callback with the argument list that matches the FUSE version being built against. */
+static inline int filler_compat(fuse_fill_dir_t filler, void *buf, const char *name,
+ const struct stat *stbuf, off_t off)
+{
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ /* FUSE >= 3.0: the filler takes a trailing flags argument; pass 0. */
+ return filler(buf, name, stbuf, off, static_cast<enum fuse_fill_dir_flags>(0));
+#else
+ return filler(buf, name, stbuf, off);
+#endif
+}
+#endif /* CEPH_FUSE_H */
diff --git a/src/include/ceph_hash.h b/src/include/ceph_hash.h
new file mode 100644
index 00000000..f9d80ac3
--- /dev/null
+++ b/src/include/ceph_hash.h
@@ -0,0 +1,14 @@
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len); /* type: presumably one of CEPH_STR_HASH_* -- confirm */
+extern const char *ceph_str_hash_name(int type);
+extern bool ceph_str_hash_valid(int type); /* NOTE(review): bool is used but this header includes no <stdbool.h>; a C includer must provide it -- confirm */
+
+#endif
diff --git a/src/include/cephfs/ceph_ll_client.h b/src/include/cephfs/ceph_ll_client.h
new file mode 100644
index 00000000..4f3d4235
--- /dev/null
+++ b/src/include/cephfs/ceph_ll_client.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * scalable distributed file system
+ *
+ * Copyright (C) Jeff Layton <jlayton@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_CEPH_LL_CLIENT_H
+#define CEPH_CEPH_LL_CLIENT_H
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+
+class Fh;
+
+struct inodeno_t;
+struct vinodeno_t;
+typedef struct vinodeno_t vinodeno;
+
+#else /* __cplusplus */
+
+typedef struct Fh Fh;
+
+typedef struct inodeno_t {
+ uint64_t val;
+} inodeno_t;
+
+typedef struct _snapid_t {
+ uint64_t val;
+} snapid_t;
+
+typedef struct vinodeno_t {
+ inodeno_t ino;
+ snapid_t snapid;
+} vinodeno_t;
+
+#endif /* __cplusplus */
+
+/*
+ * Heavily borrowed from David Howells' draft statx patchset.
+ *
+ * Since the xstat patches are still a work in progress, we borrow its data
+ * structures and #defines to implement ceph_getattrx. Once the xstat stuff
+ * has been merged we should drop this and switch over to using that instead.
+ */
+struct ceph_statx {
+ uint32_t stx_mask; /* presumably CEPH_STATX_* bits marking which fields are wanted/valid -- confirm */
+ uint32_t stx_blksize;
+ uint32_t stx_nlink;
+ uint32_t stx_uid;
+ uint32_t stx_gid;
+ uint16_t stx_mode;
+ uint64_t stx_ino;
+ uint64_t stx_size;
+ uint64_t stx_blocks;
+ dev_t stx_dev; /* NOTE(review): dev_t and struct timespec are not declared by <stdint.h>, the only header this file includes; presumably the includer provides them -- confirm */
+ dev_t stx_rdev;
+ struct timespec stx_atime;
+ struct timespec stx_ctime;
+ struct timespec stx_mtime;
+ struct timespec stx_btime;
+ uint64_t stx_version;
+};
+
+#define CEPH_STATX_MODE 0x00000001U /* Want/got stx_mode */
+#define CEPH_STATX_NLINK 0x00000002U /* Want/got stx_nlink */
+#define CEPH_STATX_UID 0x00000004U /* Want/got stx_uid */
+#define CEPH_STATX_GID 0x00000008U /* Want/got stx_gid */
+#define CEPH_STATX_RDEV 0x00000010U /* Want/got stx_rdev */
+#define CEPH_STATX_ATIME 0x00000020U /* Want/got stx_atime */
+#define CEPH_STATX_MTIME 0x00000040U /* Want/got stx_mtime */
+#define CEPH_STATX_CTIME 0x00000080U /* Want/got stx_ctime */
+#define CEPH_STATX_INO 0x00000100U /* Want/got stx_ino */
+#define CEPH_STATX_SIZE 0x00000200U /* Want/got stx_size */
+#define CEPH_STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */
+#define CEPH_STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
+#define CEPH_STATX_BTIME 0x00000800U /* Want/got stx_btime */
+#define CEPH_STATX_VERSION 0x00001000U /* Want/got stx_version */
+#define CEPH_STATX_ALL_STATS 0x00001fffU /* All supported stats */
+
+/*
+ * Compatibility macros until these defines make their way into glibc
+ */
+#ifndef AT_NO_ATTR_SYNC
+#define AT_NO_ATTR_SYNC 0x4000 /* Don't sync attributes with the server */
+#endif
+
+/*
+ * The statx interfaces only allow these flags. In order to allow us to add
+ * others in the future, we disallow setting any that aren't recognized.
+ */
+#define CEPH_REQ_FLAG_MASK (AT_SYMLINK_NOFOLLOW|AT_NO_ATTR_SYNC)
+
+/* delegation recalls */
+typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv);
+
+/* inode data/metadata invalidation */
+typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino,
+ int64_t off, int64_t len);
+
+/* dentry invalidation */
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, const char *name,
+ size_t len);
+
+/* remount entire fs */
+typedef int (*client_remount_callback_t)(void *handle);
+
+/* lock request interrupted */
+typedef void (*client_switch_interrupt_callback_t)(void *handle, void *data);
+
+/* fetch umask of actor */
+typedef mode_t (*client_umask_callback_t)(void *handle);
+
+/* request that application release Inode references */
+typedef void (*client_ino_release_t)(void *handle, vinodeno_t ino);
+
+/*
+ * The handle is an opaque value that gets passed to some callbacks. Any fields
+ * set to NULL will be left alone. There is no way to unregister callbacks.
+ */
+struct ceph_client_callback_args {
+ void *handle;
+ client_ino_callback_t ino_cb;
+ client_dentry_callback_t dentry_cb;
+ client_switch_interrupt_callback_t switch_intr_cb;
+ client_remount_callback_t remount_cb;
+ client_umask_callback_t umask_cb;
+ client_ino_release_t ino_release_cb;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_CEPH_LL_CLIENT_H */
+
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
new file mode 100755
index 00000000..c1668769
--- /dev/null
+++ b/src/include/cephfs/libcephfs.h
@@ -0,0 +1,1869 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIB_H
+#define CEPH_LIB_H
+
+#if defined(__linux__)
+#include <features.h>
+#endif
+#include <utime.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/statvfs.h>
+#include <sys/socket.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#include "ceph_ll_client.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBCEPHFS_VER_MAJOR 10
+#define LIBCEPHFS_VER_MINOR 0
+#define LIBCEPHFS_VER_EXTRA 2
+
+#define LIBCEPHFS_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* packs maj/min/extra into one integer; each argument is parenthesized so expression arguments expand with the intended precedence */
+#define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA)
+
+/*
+ * If using glibc check that file offset is 64-bit.
+ */
+#if defined(__GLIBC__) && !defined(__USE_FILE_OFFSET64)
+# error libceph: glibc must define __USE_FILE_OFFSET64 or readdir results will be corrupted
+#endif
+
+/*
+ * XXXX redeclarations from ceph_fs.h, rados.h, etc. We need more of this
+ * in the interface, but shouldn't be re-typing it (and using different
+ * C data types).
+ */
+#ifndef __cplusplus
+
+#define CEPH_INO_ROOT 1
+#define CEPH_NOSNAP ((uint64_t)(-2))
+
+struct ceph_file_layout {
+ /* file -> object mapping */
+ uint32_t fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ uint32_t fl_stripe_count; /* over this many objects */
+ uint32_t fl_object_size; /* until objects are this big, then move to
+ new objects */
+ uint32_t fl_cas_hash; /* 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ uint32_t fl_object_stripe_unit; /* for per-object parity, if any */
+
+ /* object -> pg layout */
+ uint32_t fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+ uint32_t fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed)); /* packed: no padding inserted between the fields */
+
+#endif /* ! __cplusplus */
+
+struct UserPerm;
+typedef struct UserPerm UserPerm;
+
+struct Inode;
+typedef struct Inode Inode;
+
+struct ceph_mount_info;
+struct ceph_dir_result;
+struct CephContext;
+
+/* setattr mask bits */
+#ifndef CEPH_SETATTR_MODE
+# define CEPH_SETATTR_MODE 1
+# define CEPH_SETATTR_UID 2
+# define CEPH_SETATTR_GID 4
+# define CEPH_SETATTR_MTIME 8
+# define CEPH_SETATTR_ATIME 16
+# define CEPH_SETATTR_SIZE 32
+# define CEPH_SETATTR_CTIME 64
+# define CEPH_SETATTR_MTIME_NOW 128
+# define CEPH_SETATTR_ATIME_NOW 256
+# define CEPH_SETATTR_BTIME 512
+#endif
+
+/* define error codes for the mount function */
+# define CEPHFS_ERROR_MON_MAP_BUILD 1000
+# define CEPHFS_ERROR_NEW_CLIENT 1002
+# define CEPHFS_ERROR_MESSENGER_START 1003
+
+/**
+ * Create a UserPerm credential object.
+ *
+ * Some calls (most notably, the ceph_ll_* ones), take a credential object
+ * that represents the credentials that the calling program is using. This
+ * function creates a new credential object for this purpose. Returns a
+ * pointer to the object, or NULL if it can't be allocated.
+ *
+ * Note that the gidlist array is used directly and is not copied. It must
+ * remain valid over the lifetime of the created UserPerm object.
+ *
+ * @param uid uid to be used
+ * @param gid gid to be used
+ * @param ngids number of gids in supplemental grouplist
+ * @param gidlist array of gid_t's in the list of groups
+ */
+UserPerm *ceph_userperm_new(uid_t uid, gid_t gid, int ngids, gid_t *gidlist);
+
+/**
+ * Destroy a UserPerm credential object.
+ *
+ * @param perm pointer to object to be destroyed
+ *
+ * Currently this just frees the object. Note that the gidlist array is not
+ * freed. The caller must do so if it's necessary.
+ */
+void ceph_userperm_destroy(UserPerm *perm);
+
+/**
+ * Get a pointer to the default UserPerm object for the mount.
+ *
+ * @param cmount the mount info handle
+ *
+ * Every cmount has a default set of credentials. This returns a pointer to
+ * that object.
+ *
+ * Unlike with ceph_userperm_new, this object should not be freed.
+ */
+struct UserPerm *ceph_mount_perms(struct ceph_mount_info *cmount);
+
+/**
+ * Set cmount's default permissions
+ *
+ * @param cmount the mount info handle
+ * @param perm permissions to set to default for mount
+ *
+ * Every cmount has a default set of credentials. This does a deep copy of
+ * the given permissions to the ones in the cmount. Must be done after
+ * ceph_init but before ceph_mount.
+ *
+ * Returns 0 on success, and -EISCONN if the cmount is already mounted.
+ */
+int ceph_mount_perms_set(struct ceph_mount_info *cmount, UserPerm *perm);
+
+/**
+ * @defgroup libcephfs_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libcephfs.
+ *
+ * @{
+ */
+
+/**
+ * Get the version of libcephfs.
+ *
+ * The version number is major.minor.patch.
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param patch where to store the extra version number
+ */
+const char *ceph_version(int *major, int *minor, int *patch);
+
+/**
+ * Create a mount handle for interacting with Ceph. All libcephfs
+ * functions operate on a mount info handle.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param id the id of the client. This can be a unique id that identifies
+ * this client, and will get appended onto "client.". Callers can
+ * pass in NULL, and the id will be the process id of the client.
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create(struct ceph_mount_info **cmount, const char * const id);
+
+/**
+ * Create a mount handle from a CephContext, which holds the configuration
+ * for the ceph cluster. A CephContext can be acquired from an existing ceph_mount_info
+ * handle, using the @ref ceph_get_mount_context call. Note that using the same CephContext
+ * for two different mount handles results in the same client entity id being used.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param conf reuse this pre-existing CephContext config
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create_with_context(struct ceph_mount_info **cmount, struct CephContext *conf);
+
+
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif // VOIDPTR_RADOS_T
+
+/**
+ * Create a mount handle from a rados_t, for using libcephfs in the
+ * same process as librados.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param cluster reference to already-initialized librados handle
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create_from_rados(struct ceph_mount_info **cmount, rados_t cluster);
+
+/**
+ * Initialize the filesystem client (but do not mount the filesystem yet)
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_init(struct ceph_mount_info *cmount);
+
+/**
+ * Optionally set which filesystem to mount, before calling mount.
+ *
+ * An error will be returned if this libcephfs instance is already
+ * mounted. This function is an alternative to setting the global
+ * client_mds_namespace setting. Using this function enables multiple
+ * libcephfs instances in the same process to mount different filesystems.
+ *
+ * The filesystem name is *not* validated in this function. That happens
+ * during mount(), where an ENOENT error will result if a non-existent
+ * filesystem was specified here.
+ *
+ * @param cmount the mount info handle
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_select_filesystem(struct ceph_mount_info *cmount, const char *fs_name);
+
+
+/**
+ * Perform a mount using the path for the root of the mount.
+ *
+ * It is optional to call ceph_init before this. If ceph_init has
+ * not already been called, it will be called in the course of this operation.
+ *
+ * @param cmount the mount info handle
+ * @param root the path for the root of the mount. This can be an existing
+ * directory within the ceph cluster, but most likely it will
+ * be "/". Passing in NULL is equivalent to "/".
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_mount(struct ceph_mount_info *cmount, const char *root);
+
+/**
+ * Return cluster ID for a mounted ceph filesystem
+ *
+ * Every ceph filesystem has a filesystem ID associated with it. This
+ * function returns that value. If the ceph_mount_info does not refer to a
+ * mounted filesystem, this returns a negative error code.
+ */
+int64_t ceph_get_fs_cid(struct ceph_mount_info *cmount);
+
+/**
+ * Execute a management command remotely on an MDS.
+ *
+ * Must have called ceph_init or ceph_mount before calling this.
+ *
+ * @param mds_spec string representing rank, MDS name, GID or '*'
+ * @param cmd array of null-terminated strings
+ * @param cmdlen length of cmd array
+ * @param inbuf non-null-terminated input data to command
+ * @param inbuflen length in octets of inbuf
+ * @param outbuf populated with pointer to buffer (command output data)
+ * @param outbuflen length of allocated outbuf
+ * @param outs populated with pointer to buffer (command error strings)
+ * @param outslen length of allocated outs
+ *
+ * @return 0 on success, negative error code on failure
+ *
+ */
+int ceph_mds_command(struct ceph_mount_info *cmount,
+ const char *mds_spec,
+ const char **cmd,
+ size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * Free a buffer, such as those used for output arrays from ceph_mds_command
+ */
+void ceph_buffer_free(char *buf);
+
+/**
+ * Unmount a mount handle.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_unmount(struct ceph_mount_info *cmount);
+
+/**
+ * Abort mds connections
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_abort_conn(struct ceph_mount_info *cmount);
+
+/**
+ * Destroy the mount handle.
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure.
+ */
+int ceph_release(struct ceph_mount_info *cmount);
+
+/**
+ * Deprecated. Unmount and destroy the ceph mount handle. This should be
+ * called on completion of all libcephfs functions.
+ *
+ * Equivalent to ceph_unmount() + ceph_release() without error handling.
+ *
+ * @param cmount the mount handle to shutdown
+ */
+void ceph_shutdown(struct ceph_mount_info *cmount);
+
+/**
+ * Get a global id for current instance
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @returns instance global id
+ */
+uint64_t ceph_get_instance_id(struct ceph_mount_info *cmount);
+
+/**
+ * Extract the CephContext from the mount point handle.
+ *
+ * @param cmount the ceph mount handle to get the context from.
+ * @returns the CephContext associated with the mount handle.
+ */
+struct CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount);
+
+/**
+ * Check mount status.
+ *
+ * Return non-zero value if mounted. Otherwise, zero.
+ */
+int ceph_is_mounted(struct ceph_mount_info *cmount);
+
+/** @} init */
+
+/**
+ * @defgroup libcephfs_h_config Config
+ * Functions for manipulating the Ceph configuration at runtime.
+ *
+ * @{
+ */
+
+/**
+ * Load the ceph configuration from the specified config file.
+ *
+ * @param cmount the mount handle to load the configuration into.
+ * @param path_list the configuration file path
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list);
+
+/**
+ * Parse the command line arguments and load the configuration parameters.
+ *
+ * @param cmount the mount handle to load the configuration parameters into.
+ * @param argc count of the arguments in argv
+ * @param argv the argument list
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv);
+
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre ceph_mount() has not been called on the handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cmount handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var);
+
+/** Sets a configuration value from a string.
+ *
+ * @param cmount the mount handle to set the configuration value on
+ * @param option the configuration option to set
+ * @param value the value of the configuration option to set
+ *
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_conf_set(struct ceph_mount_info *cmount, const char *option, const char *value);
+
+/**
+ * Gets the configuration value as a string.
+ *
+ * @param cmount the mount handle to get the configuration value from
+ * @param option the config option to get
+ * @param buf the buffer to fill with the value
+ * @param len the length of the buffer.
+ * @returns the size of the buffer filled in with the value, or negative error code on failure
+ */
+int ceph_conf_get(struct ceph_mount_info *cmount, const char *option, char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * @defgroup libcephfs_h_fsops File System Operations.
+ * Functions for getting/setting file system wide information specific to a particular
+ * mount handle.
+ *
+ * @{
+ */
+
+/**
+ * Perform a statfs on the ceph file system. This call fills in file system wide statistics
+ * into the passed in buffer.
+ *
+ * @param cmount the ceph mount handle to use for performing the statfs.
+ * @param path can be any path within the mounted filesystem
+ * @param stbuf the file system statistics filled in by this function.
+ * @return 0 on success, negative error code otherwise.
+ */
+int ceph_statfs(struct ceph_mount_info *cmount, const char *path, struct statvfs *stbuf);
+
+/**
+ * Synchronize all filesystem data to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the sync_fs.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_sync_fs(struct ceph_mount_info *cmount);
+
+/**
+ * Get the current working directory.
+ *
+ * @param cmount the ceph mount to get the current working directory for.
+ * @returns the path to the current working directory
+ */
+const char* ceph_getcwd(struct ceph_mount_info *cmount);
+
+/**
+ * Change the current working directory.
+ *
+ * @param cmount the ceph mount to change the current working directory for.
+ * @param path the path to the working directory to change into.
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_chdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} fsops */
+
+/**
+ * @defgroup libcephfs_h_dir Directory Operations.
+ * Functions for manipulating and listing directories.
+ *
+ * @{
+ */
+
+/**
+ * Open the given directory.
+ *
+ * @param cmount the ceph mount handle to use to open the directory
+ * @param name the path name of the directory to open. Must be either an absolute path
+ * or a path relative to the current working directory.
+ * @param dirpp the directory result pointer structure to fill in.
+ * @returns 0 on success or negative error code otherwise.
+ */
+int ceph_opendir(struct ceph_mount_info *cmount, const char *name, struct ceph_dir_result **dirpp);
+
+/**
+ * Close the open directory.
+ *
+ * @param cmount the ceph mount handle to use for closing the directory
+ * @param dirp the directory result pointer (set by ceph_opendir) to close
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_closedir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the next entry in an open directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @returns the next directory entry or NULL if at the end of the directory (or the directory
+ * is empty). This pointer should not be freed by the caller, and is only safe to
+ * access between return and the next call to ceph_readdir or ceph_closedir.
+ */
+struct dirent * ceph_readdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * A safe version of ceph_readdir, where the directory entry struct is allocated by the caller.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ * and a negative error code on failure.
+ */
+int ceph_readdir_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de);
+
+/**
+ * A safe version of ceph_readdir that also returns the file statistics (readdir+stat).
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir_plus_r.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @param stx the stats of the file/directory of the entry returned
+ * @param want mask showing desired inode attrs for returned entry
+ * @param flags bitmask of flags to use when filling out attributes
+ * @param out optional returned Inode argument. If non-NULL, then a reference will be taken on
+ * the inode and the pointer set on success.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ * and a negative error code on failure.
+ */
+int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de,
+ struct ceph_statx *stx, unsigned want, unsigned flags, struct Inode **out);
+
+/**
+ * Gets multiple directory entries.
+ *
+ * @param cmount the ceph mount handle to use for performing the getdents.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry/entries to return.
+ * @param name an array of struct dirent that gets filled in with the returned directory entries.
+ * @param buflen the length of the buffer, which should be the number of dirent structs * sizeof(struct dirent).
+ * @returns the length of the buffer that was filled in, will always be multiples of sizeof(struct dirent), or a
+ * negative error code. If the buffer is not large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdents(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Gets multiple directory names.
+ *
+ * @param cmount the ceph mount handle to use for performing the getdnames.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry/entries to return.
+ * @param name a buffer to fill in with directory entry names.
+ * @param buflen the length of the buffer that can be filled in.
+ * @returns the length of the buffer filled in with entry names, or a negative error code on failure.
+ * If the buffer isn't large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdnames(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Rewind the directory stream to the beginning of the directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rewinddir.
+ * @param dirp the directory stream pointer to rewind.
+ */
+void ceph_rewinddir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the current position of a directory stream.
+ *
+ * @param cmount the ceph mount handle to use for performing the telldir.
+ * @param dirp the directory stream pointer to get the current position of.
+ * @returns the position of the directory stream. Note that the offsets returned
+ * by ceph_telldir do not have a particular order (cannot be compared with
+ * inequality).
+ */
+int64_t ceph_telldir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Move the directory stream to a position specified by the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the seekdir.
+ * @param dirp the directory stream pointer to move.
+ * @param offset the position to move the directory stream to. This offset should be
+ * a value returned by ceph_telldir. Note that this value does not refer to the nth
+ * entry in a directory, and can not be manipulated with plus or minus.
+ */
+void ceph_seekdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, int64_t offset);
+
+/**
+ * Create a directory.
+ *
+ * @param cmount the ceph mount handle to use for making the directory.
+ * @param path the path of the directory to create. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdir(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Create multiple directories at once.
+ *
+ * @param cmount the ceph mount handle to use for making the directories.
+ * @param path the full path of directories and sub-directories that should
+ * be created.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdirs(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Remove a directory.
+ *
+ * @param cmount the ceph mount handle to use for removing directories.
+ * @param path the path of the directory to remove.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_rmdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} dir */
+
+/**
+ * @defgroup libcephfs_h_links Links and Link Handling.
+ * Functions for creating and manipulating hard links and symbolic links.
+ *
+ * @{
+ */
+
+/**
+ * Create a link.
+ *
+ * @param cmount the ceph mount handle to use for creating the link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_link(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/**
+ * Read a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for reading the link.
+ * @param path the path to the symlink to read
+ * @param buf the buffer to hold the path of the file that the symlink points to.
+ * @param size the length of the buffer
+ * @returns number of bytes copied on success or negative error code on failure
+ */
+int ceph_readlink(struct ceph_mount_info *cmount, const char *path, char *buf, int64_t size);
+
+/**
+ * Creates a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for creating the symbolic link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on failure.
+ */
+int ceph_symlink(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/** @} links */
+
+/**
+ * @defgroup libcephfs_h_files File manipulation and handling.
+ * Functions for creating and manipulating files.
+ *
+ * @{
+ */
+
+/**
+ * Removes a file, link, or symbolic link. If the file/link has multiple links to it, the
+ * file will not disappear from the namespace until all references to it are removed.
+ *
+ * @param cmount the ceph mount handle to use for performing the unlink.
+ * @param path the path of the file or link to unlink.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_unlink(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Rename a file or directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rename.
+ * @param from the path to the existing file or directory.
+ * @param to the new name of the file or directory
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_rename(struct ceph_mount_info *cmount, const char *from, const char *to);
+
+/**
+ * Get an open file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (only AT_NO_ATTR_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fstatx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx,
+ unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (only AT_NO_ATTR_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statx(struct ceph_mount_info *cmount, const char *path, struct ceph_statx *stx,
+ unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_stat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf);
+
+/**
+ * Get a file's statistics and attributes, without following symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lstat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf);
+
+/**
+ * Get the open file's statistics.
+ *
+ * @param cmount the ceph mount handle to use for performing the fstat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stbuf the stat struct of the file's statistics, filled in by the
+ * function.
+ * @returns 0 on success or a negative error code on failure
+ */
+int ceph_fstat(struct ceph_mount_info *cmount, int fd, struct stat *stbuf);
+
+/**
+ * Set a file's attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param relpath the path to the file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct.
+ * @param flags mask of AT_* flags (only AT_SYMLINK_NOFOLLOW is respected for now)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_setattrx(struct ceph_mount_info *cmount, const char *relpath, struct ceph_statx *stx, int mask, int flags);
+
+/**
+ * Set a file's attributes (extended version).
+ *
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param fd the fd of the open file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fsetattrx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx, int mask);
+
+/**
+ * Change the mode bits (permissions) of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param path the path to the file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_chmod(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of an open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param fd the open file descriptor to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fchmod(struct ceph_mount_info *cmount, int fd, mode_t mode);
+
+/**
+ * Change the ownership of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param fd the fd of the open file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fchown(struct ceph_mount_info *cmount, int fd, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lchown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utime(struct ceph_mount_info *cmount, const char *path, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futime(struct ceph_mount_info *cmount, int fd, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lutimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimes(struct ceph_mount_info *cmount, int fd, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimens(struct ceph_mount_info *cmount, int fd, struct timespec times[2]);
+
+/**
+ * Apply or remove an advisory lock.
+ *
+ * @param cmount the ceph mount handle to use for performing the lock.
+ * @param fd the open file descriptor to change advisory lock.
+ * @param operation the advisory lock operation to be performed on the file
+ * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock),
+ * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a
+ * non-blocking operation.
+ * @param owner the user-supplied owner identifier (an arbitrary integer)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation,
+ uint64_t owner);
+
+/**
+ * Truncate the file to the given size. If this operation causes the
+ * file to expand, the empty bytes will be filled in with zeros.
+ *
+ * @param cmount the ceph mount handle to use for performing the truncate.
+ * @param path the path to the file to truncate.
+ * @param size the new size of the file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_truncate(struct ceph_mount_info *cmount, const char *path, int64_t size);
+
+/**
+ * Make a block or character special file.
+ *
+ * @param cmount the ceph mount handle to use for performing the mknod.
+ * @param path the path to the special file.
+ * @param mode the permissions to use and the type of special file. The type can be
+ * one of S_IFREG, S_IFCHR, S_IFBLK, S_IFIFO.
+ * @param rdev If the file type is S_IFCHR or S_IFBLK then this parameter specifies the
+ * major and minor numbers of the newly created device special file. Otherwise,
+ * it is ignored.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_mknod(struct ceph_mount_info *cmount, const char *path, mode_t mode, dev_t rdev);
+/**
+ * Create and/or open a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open(struct ceph_mount_info *cmount, const char *path, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file with a specific file layout.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @param stripe_unit the stripe unit size (optional, 0 for default)
+ * @param stripe_count the stripe count (optional, 0 for default)
+ * @param object_size the object size (optional, 0 for default)
+ * @param data_pool name of the target data pool (optional, NULL or empty string for default)
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open_layout(struct ceph_mount_info *cmount, const char *path, int flags,
+ mode_t mode, int stripe_unit, int stripe_count, int object_size,
+ const char *data_pool);
+
+/**
+ * Close the open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the close.
+ * @param fd the file descriptor referring to the open file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_close(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Reposition the open file stream based on the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the lseek.
+ * @param fd the open file descriptor referring to the open file and holding the
+ *        current position of the stream.
+ * @param offset the offset to set the stream to
+ * @param whence the flag to indicate what type of seeking to perform:
+ *        SEEK_SET: the offset is set to the given offset in the file.
+ *        SEEK_CUR: the offset is set to the current location plus @e offset bytes.
+ *        SEEK_END: the offset is set to the end of the file plus @e offset bytes.
+ * @returns presumably the resulting offset from the start of the file on success
+ *          (the return type is int64_t, mirroring POSIX lseek; NOTE(review): confirm
+ *          against the implementation), or a negative error code on failure.
+ */
+int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int whence);
+/**
+ * Read data from the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param buf the buffer to read data into
+ * @param size the initial size of the buffer
+ * @param offset the offset in the file to read from. If this value is negative, the
+ * function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into buf, or a negative error code on failure.
+ */
+int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset);
+
+/**
+ * Read data from the file.
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param iov the iov structure to read data into
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to read from. If this value is negative, the
+ * function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into the iov buffers, or a negative error code on failure.
+ */
+int ceph_preadv(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+ int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param buf the bytes to write to the file
+ * @param size the size of the buf array
+ * @param offset the offset in the file to write to. If this value is negative, the
+ * function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf, int64_t size,
+ int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param iov the iov structure holding the data to write
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to write to. If this value is negative, the
+ * function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_pwritev(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+ int64_t offset);
+
+/**
+ * Truncate a file to the given size.
+ *
+ * @param cmount the ceph mount handle to use for performing the ftruncate.
+ * @param fd the file descriptor of the file to truncate
+ * @param size the new size of the file
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_ftruncate(struct ceph_mount_info *cmount, int fd, int64_t size);
+
+/**
+ * Synchronize an open file to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param syncdataonly a boolean whether to synchronize metadata and data (0)
+ * or just data (1).
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fsync(struct ceph_mount_info *cmount, int fd, int syncdataonly);
+
+/**
+ * Preallocate or release disk space for the file for the byte range.
+ *
+ * @param cmount the ceph mount handle to use for performing the fallocate.
+ * @param fd the file descriptor of the file to fallocate.
+ * @param mode the flags that determine the operation to be performed on the given range.
+ * The default operation (0) allocates and zero-initializes the byte range of the file,
+ * and the file size will be changed if offset + length is greater than
+ * the file size. If the FALLOC_FL_KEEP_SIZE flag is specified in the mode,
+ * the file size will not be changed. If the FALLOC_FL_PUNCH_HOLE flag is
+ * specified in the mode, the operation deallocates space and zeroes the byte range.
+ * @param offset the byte range starting.
+ * @param length the length of the range.
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fallocate(struct ceph_mount_info *cmount, int fd, int mode,
+ int64_t offset, int64_t length);
+
+/**
+ * Enable/disable lazyio for the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the lazyio operation.
+ * @param fd the file descriptor of the file to enable or disable lazyio on.
+ * @param enable a boolean to enable lazyio or disable lazyio.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio(struct ceph_mount_info *cmount, int fd, int enable);
+
+
+/**
+ * Flushes the write buffer for the file, thereby propagating the buffered writes to the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the propagate.
+ * @param fd the file descriptor of the file whose buffered writes are flushed.
+ * @param offset presumably the starting offset of the byte range to propagate
+ *        (NOTE(review): confirm against the implementation).
+ * @param count presumably the length in bytes of the range starting at @e offset
+ *        (NOTE(review): confirm against the implementation).
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_propagate(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+
+/**
+ * Flushes the write buffer for the file and invalidates the read cache. This
+ * allows a subsequent read operation to read and cache data directly from the
+ * file, and hence everyone's propagated writes would be visible.
+ *
+ * @param cmount the ceph mount handle to use for performing the synchronize.
+ * @param fd the file descriptor of the file to synchronize.
+ * @param offset presumably the starting offset of the byte range to synchronize
+ *        (NOTE(review): confirm against the implementation).
+ * @param count presumably the length in bytes of the range starting at @e offset
+ *        (NOTE(review): confirm against the implementation).
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_synchronize(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+/** @} file */
+
+/**
+ * @defgroup libcephfs_h_xattr Extended Attribute manipulation and handling.
+ * Functions for creating and manipulating extended attributes on files.
+ *
+ * @{
+ */
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_getxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ void *value, size_t size);
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param fd the open file descriptor referring to the file to get extended attribute from.
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+ void *value, size_t size);
+
+/**
+ * Get an extended attribute without following symbolic links. This function is
+ * identical to ceph_getxattr, but if the path refers to a symbolic link,
+ * we get the extended attributes of the symlink itself rather than the attributes
+ * of the file it points to.
+ *
+ * @param cmount the ceph mount handle to use for performing the lgetxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ void *value, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param fd the open file descriptor referring to the file to list extended attributes on.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size);
+
+/**
+ * Get the list of extended attribute keys on a file, but do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the llistxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param fd the open file descriptor referring to the file to remove extended attribute from.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name);
+
+/**
+ * Remove the extended attribute from a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lremovexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lremovexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param fd the open file descriptor referring to the file to set extended attribute on.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+ const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lsetxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+/** @} xattr */
+
+/**
+ * @defgroup libcephfs_h_filelayout Control File Layout.
+ * Functions for setting and getting the file layout of existing files.
+ *
+ * @{
+ */
+
+/**
+ * Get the file striping unit from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping unit.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file striping count from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping count.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file object size from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file object size.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file pool information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file pool information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the name of the pool an open file is stored in.
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen);
+
+/**
+ * Get the name of a pool by id.
+ *
+ * Given a pool's numeric identifier, get the pool's alphanumeric name.
+ *
+ * @param cmount the ceph mount handle to use
+ * @param pool the numeric pool id
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough
+ */
+int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen);
+
+/**
+ * Get the name of the pool a file is stored in
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen);
+
+/**
+ * Get the default pool name of cephfs
+ * Write the name of the default pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ * @param cmount the ceph mount handle to use.
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_default_data_pool_name(struct ceph_mount_info *cmount, char *buf, size_t buflen);
+
+/**
+ * Get the file layout from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file layout.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file replication information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file replication information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the id of the named pool.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_name the name of the pool.
+ * @returns the pool id, or a negative error code on failure.
+ */
+int ceph_get_pool_id(struct ceph_mount_info *cmount, const char *pool_name);
+
+/**
+ * Get the pool replication factor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_id the pool id to look up
+ * @returns the replication factor, or a negative error code on failure.
+ */
+int ceph_get_pool_replication(struct ceph_mount_info *cmount, int pool_id);
+
+/**
+ * Get the OSD address where the primary copy of a file stripe is located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file to get the stripe address of.
+ * @param offset the offset into the file to specify the stripe. The offset can be
+ * anywhere within the stripe unit.
+ * @param addr the address of the OSD holding that stripe
+ * @param naddr the capacity of the address passed in.
+ * @returns the size of the addresses filled into the @e addr parameter, or a negative
+ * error code on failure.
+ */
+int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t offset,
+ struct sockaddr_storage *addr, int naddr);
+
+/**
+ * Get the list of OSDs where the objects containing a file offset are located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file.
+ * @param offset the offset within the file.
+ * @param length return the number of bytes between the offset and the end of
+ * the stripe unit (optional).
+ * @param osds an integer array to hold the OSD ids.
+ * @param nosds the size of the integer array.
+ * @returns the number of items stored in the output array, or -ERANGE if the
+ * array is not large enough.
+ */
+int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fd,
+ int64_t offset, int64_t *length, int *osds, int nosds);
+
+/**
+ * Get the fully qualified CRUSH location of an OSD.
+ *
+ * Returns (type, name) string pairs for each device in the CRUSH bucket
+ * hierarchy starting from the given osd to the root. Each pair element is
+ * separated by a NUL character.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param osd the OSD id.
+ * @param path buffer to store location.
+ * @param len size of buffer.
+ * @returns the amount of bytes written into the buffer, or -ERANGE if the
+ * array is not large enough.
+ */
+int ceph_get_osd_crush_location(struct ceph_mount_info *cmount,
+ int osd, char *path, size_t len);
+
+/**
+ * Get the network address of an OSD.
+ *
+ * @param cmount the ceph mount handle.
+ * @param osd the OSD id.
+ * @param addr the OSD network address.
+ * @returns zero on success, otherwise returns a negative error code.
+ */
+int ceph_get_osd_addr(struct ceph_mount_info *cmount, int osd,
+ struct sockaddr_storage *addr);
+
+/**
+ * Get the file layout stripe unit granularity.
+ * @param cmount the ceph mount handle.
+ * @returns the stripe unit granularity or a negative error code on failure.
+ */
+int ceph_get_stripe_unit_granularity(struct ceph_mount_info *cmount);
+
+/** @} filelayout */
+
+/**
+ * No longer available. Do not use.
+ * These functions will return -EOPNOTSUPP.
+ */
+int ceph_set_default_file_stripe_unit(struct ceph_mount_info *cmount, int stripe);
+int ceph_set_default_file_stripe_count(struct ceph_mount_info *cmount, int count);
+int ceph_set_default_object_size(struct ceph_mount_info *cmount, int size);
+int ceph_set_default_preferred_pg(struct ceph_mount_info *cmount, int osd);
+int ceph_set_default_file_replication(struct ceph_mount_info *cmount, int replication);
+
+/**
+ * Read from local replicas when possible.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param val a boolean to set (1) or clear (0) the option to favor local objects
+ * for reads.
+ * @returns 0
+ */
+int ceph_localize_reads(struct ceph_mount_info *cmount, int val);
+
+/**
+ * Get the osd id of the local osd (if any)
+ *
+ * @param cmount the ceph mount handle to use.
+ * @returns the osd (if any) local to the node where this call is made, otherwise
+ * -1 is returned.
+ */
+int ceph_get_local_osd(struct ceph_mount_info *cmount);
+
+/** @} default_filelayout */
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor to get the issued capabilities of
+ * @returns the current capabilities issued to this client
+ * for the open file
+ */
+int ceph_debug_get_fd_caps(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path to the file
+ * @returns the current capabilities issued to this client
+ * for the file
+ */
+int ceph_debug_get_file_caps(struct ceph_mount_info *cmount, const char *path);
+
+/* Low Level */
+struct Inode *ceph_ll_get_inode(struct ceph_mount_info *cmount,
+ vinodeno_t vino);
+int ceph_ll_lookup_inode(
+ struct ceph_mount_info *cmount,
+ struct inodeno_t ino,
+ Inode **inode);
+
+/**
+ * Get the root inode of the FS. This increases the reference count of the root Inode,
+ * so you must call ceph_ll_forget for it!
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param parent pointer to a pointer to an Inode struct; a pointer to the root inode will be returned in it.
+ * @returns 0 on success.
+ */
+int ceph_ll_lookup_root(struct ceph_mount_info *cmount,
+ Inode **parent);
+int ceph_ll_lookup(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, Inode **out, struct ceph_statx *stx,
+ unsigned want, unsigned flags, const UserPerm *perms); /* look up name under parent; out and stx are presumably outputs -- confirm */
+int ceph_ll_put(struct ceph_mount_info *cmount, struct Inode *in); /* NOTE(review): presumably drops one reference on in -- confirm */
+int ceph_ll_forget(struct ceph_mount_info *cmount, struct Inode *in,
+ int count); /* releases references on in (see ceph_ll_lookup_root); count is presumably the number to drop -- confirm */
+int ceph_ll_walk(struct ceph_mount_info *cmount, const char* name, Inode **i,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags,
+ const UserPerm *perms); /* presumably resolves the path name to an Inode returned through i -- confirm */
+int ceph_ll_getattr(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags,
+ const UserPerm *perms); /* attribute query on in; stx is presumably filled according to want -- confirm */
+int ceph_ll_setattr(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_statx *stx, int mask, const UserPerm *perms); /* attribute update on in; mask presumably selects which stx fields apply -- confirm */
+int ceph_ll_open(struct ceph_mount_info *cmount, struct Inode *in, int flags,
+ struct Fh **fh, const UserPerm *perms); /* open in; the handle is presumably returned through fh -- confirm */
+off_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ off_t offset, int whence); /* NOTE(review): presumably lseek(2)-like semantics for offset/whence -- confirm */
+int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ int64_t off, uint64_t len, char* buf); /* read on an open handle; note the int return although len is uint64_t */
+int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh,
+ int syncdataonly); /* sync by file handle */
+int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in,
+ int syncdataonly); /* sync by inode rather than by file handle */
+int ceph_ll_fallocate(struct ceph_mount_info *cmount, struct Fh *fh,
+ int mode, int64_t offset, int64_t length); /* NOTE(review): presumably fallocate(2)-like -- confirm */
+int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ int64_t off, uint64_t len, const char *data); /* write on an open handle; note the int return although len is uint64_t */
+int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
+ const struct iovec *iov, int iovcnt, int64_t off); /* vectored read over iovcnt entries of iov */
+int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh,
+ const struct iovec *iov, int iovcnt, int64_t off); /* vectored write over iovcnt entries of iov */
+int ceph_ll_close(struct ceph_mount_info *cmount, struct Fh* filehandle); /* close a handle obtained from the ceph_ll_* API (presumably ceph_ll_open/ceph_ll_create -- confirm) */
+int ceph_ll_iclose(struct ceph_mount_info *cmount, struct Inode *in, int mode); /* NOTE(review): inode-based close; meaning of mode not stated here -- confirm */
+/**
+ * Get xattr value by xattr name.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param in the inode whose extended attribute is read
+ * @param name name of attribute
+ * @param value pointer to the beginning of the buffer
+ * @param size buffer size
+ * @param perms pointer to UserPerm object
+ * @returns size of returned buffer. Negative number in error case
+ */
+int ceph_ll_getxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, void *value, size_t size,
+ const UserPerm *perms);
+int ceph_ll_setxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const void *value, size_t size,
+ int flags, const UserPerm *perms); /* set xattr name on in from size bytes at value; NOTE(review): flags presumably as in setxattr(2) -- confirm */
+int ceph_ll_listxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ char *list, size_t buf_size, size_t *list_size,
+ const UserPerm *perms); /* list xattr names into list (capacity buf_size); list_size is presumably an output length -- confirm */
+int ceph_ll_removexattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms); /* remove xattr name from in */
+int ceph_ll_create(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, int oflags, Inode **outp,
+ Fh **fhp, struct ceph_statx *stx, unsigned want,
+ unsigned lflags, const UserPerm *perms); /* create name under parent; outp/fhp/stx are presumably outputs -- confirm */
+int ceph_ll_mknod(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, dev_t rdev, Inode **out,
+ struct ceph_statx *stx, unsigned want, unsigned flags,
+ const UserPerm *perms); /* NOTE(review): presumably mknod(2)-like, with the new Inode returned through out -- confirm */
+int ceph_ll_mkdir(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, Inode **out,
+ struct ceph_statx *stx, unsigned want,
+ unsigned flags, const UserPerm *perms); /* make directory name under parent; out/stx presumably outputs -- confirm */
+int ceph_ll_link(struct ceph_mount_info *cmount, struct Inode *in,
+ struct Inode *newparent, const char *name,
+ const UserPerm *perms); /* presumably adds a hard link to in as newparent/name -- confirm */
+int ceph_ll_opendir(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_dir_result **dirpp, const UserPerm *perms); /* open directory in; handle presumably returned through dirpp -- confirm */
+int ceph_ll_releasedir(struct ceph_mount_info *cmount,
+ struct ceph_dir_result* dir); /* release a ceph_dir_result handle (presumably one obtained from ceph_ll_opendir -- confirm) */
+int ceph_ll_rename(struct ceph_mount_info *cmount, struct Inode *parent,
+ const char *name, struct Inode *newparent,
+ const char *newname, const UserPerm *perms); /* rename parent/name to newparent/newname */
+int ceph_ll_unlink(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms); /* NOTE(review): in is presumably the parent directory of name -- confirm */
+int ceph_ll_statfs(struct ceph_mount_info *cmount, struct Inode *in,
+ struct statvfs *stbuf); /* file system statistics; stbuf is presumably filled in -- confirm */
+int ceph_ll_readlink(struct ceph_mount_info *cmount, struct Inode *in,
+ char *buf, size_t bufsize, const UserPerm *perms); /* read the target of symlink in into buf (capacity bufsize) */
+int ceph_ll_symlink(struct ceph_mount_info *cmount,
+ Inode *in, const char *name, const char *value,
+ Inode **out, struct ceph_statx *stx,
+ unsigned want, unsigned flags,
+ const UserPerm *perms); /* NOTE(review): presumably creates symlink name (under in) pointing at value -- confirm */
+int ceph_ll_rmdir(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms); /* NOTE(review): in is presumably the parent directory of name -- confirm */
+uint32_t ceph_ll_stripe_unit(struct ceph_mount_info *cmount,
+ struct Inode *in); /* layout / block-level helpers start here; NOTE(review): presumably returns the stripe unit of in's layout -- confirm */
+uint32_t ceph_ll_file_layout(struct ceph_mount_info *cmount,
+ struct Inode *in,
+ struct ceph_file_layout *layout); /* layout is presumably filled with in's file layout -- confirm */
+uint64_t ceph_ll_snap_seq(struct ceph_mount_info *cmount,
+ struct Inode *in); /* NOTE(review): presumably the snapshot sequence number associated with in -- confirm */
+int ceph_ll_get_stripe_osd(struct ceph_mount_info *cmount,
+ struct Inode *in,
+ uint64_t blockno,
+ struct ceph_file_layout* layout); /* NOTE(review): presumably the osd id holding block blockno of in -- confirm */
+int ceph_ll_num_osds(struct ceph_mount_info *cmount); /* NOTE(review): presumably the number of osds known to this mount -- confirm */
+int ceph_ll_osdaddr(struct ceph_mount_info *cmount,
+ int osd, uint32_t *addr); /* address lookup for osd; addr is presumably an output (format not stated here) -- confirm */
+uint64_t ceph_ll_get_internal_offset(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockno); /* NOTE(review): meaning of the returned offset is not stated here -- confirm */
+int ceph_ll_read_block(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockid,
+ char* bl, uint64_t offset, uint64_t length,
+ struct ceph_file_layout* layout); /* block-granular read into bl; offset is presumably relative to the block -- confirm */
+int ceph_ll_write_block(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockid,
+ char* buf, uint64_t offset,
+ uint64_t length, struct ceph_file_layout* layout,
+ uint64_t snapseq, uint32_t sync); /* block-granular write from buf; NOTE(review): semantics of snapseq/sync not stated here -- confirm */
+int ceph_ll_commit_blocks(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t offset, uint64_t range); /* NOTE(review): presumably commits earlier block writes covering [offset, offset+range) -- confirm */
+
+
+int ceph_ll_getlk(struct ceph_mount_info *cmount,
+ Fh *fh, struct flock *fl, uint64_t owner); /* file-lock query on fh; NOTE(review): presumably F_GETLK-like, with fl updated in place -- confirm */
+int ceph_ll_setlk(struct ceph_mount_info *cmount,
+ Fh *fh, struct flock *fl, uint64_t owner, int sleep); /* file-lock set on fh; NOTE(review): sleep presumably selects blocking behaviour -- confirm */
+
+int ceph_ll_lazyio(struct ceph_mount_info *cmount, Fh *fh, int enable); /* toggle lazy I/O on fh according to enable (presumably nonzero enables -- confirm) */
+
+/*
+ * Delegation support
+ *
+ * Delegations are a way for an application to request exclusive or
+ * semi-exclusive access to an Inode. The client requests the delegation and
+ * if it's successful it can reliably cache file data and metadata until the
+ * delegation is recalled.
+ *
+ * Recalls are issued via a callback function, provided by the application.
+ * Callback functions should act something like signal handlers. You want to
+ * do as little as possible in the callback. Any major work should be deferred
+ * in some fashion as it's difficult to predict the context in which this
+ * function will be called.
+ *
+ * Once the delegation has been recalled, the application should return it as
+ * soon as possible. The application has client_deleg_timeout seconds to
+ * return it, after which the cmount structure is forcibly unmounted and
+ * further calls into it fail.
+ *
+ * The application can set the client_deleg_timeout config option to suit its
+ * needs, but it should take care to choose a value that allows it to avoid
+ * forcible eviction from the cluster in the event of an application bug.
+ */
+
+/* Commands for manipulating delegation state (the cmd argument of ceph_ll_delegation); skipped if CEPH_DELEGATION_NONE is already defined */
+#ifndef CEPH_DELEGATION_NONE
+# define CEPH_DELEGATION_NONE 0 /* NOTE(review): presumably "no delegation" / return the delegation -- confirm */
+# define CEPH_DELEGATION_RD 1 /* NOTE(review): presumably a read delegation -- confirm */
+# define CEPH_DELEGATION_WR 2 /* NOTE(review): presumably a write delegation -- confirm */
+#endif
+
+/**
+ * Get the amount of time that the client has to return caps
+ * @param cmount the ceph mount handle to use.
+ *
+ * In the event that a client does not return its caps, the MDS may blacklist
+ * it after this timeout. Applications should check this value and ensure
+ * that they set the delegation timeout to a value lower than this.
+ *
+ * @returns the cap return timeout (in seconds) for this cmount, or
+ * zero if it's not mounted.
+ */
+uint32_t ceph_get_cap_return_timeout(struct ceph_mount_info *cmount);
+
+/**
+ * Set the delegation timeout for the mount (thereby enabling delegations)
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the delegation timeout (in seconds)
+ *
+ * Since the client could end up blacklisted if it doesn't return delegations
+ * in time, we mandate that any application wanting to use delegations
+ * explicitly set the timeout beforehand. Until this call is done on the
+ * mount, attempts to set a delegation will return -ETIME.
+ *
+ * Once a delegation is recalled, if it is not returned in this amount of
+ * time, the cmount will be forcibly unmounted and further access attempts
+ * will fail (usually with -ENOTCONN errors).
+ *
+ * This value is further vetted against the cap return timeout. Delegations
+ * can be disabled again by setting the timeout to 0.
+ * @returns -EINVAL if the timeout value is too long (presumably 0 on success -- confirm).
+ */
+int ceph_set_deleg_timeout(struct ceph_mount_info *cmount, uint32_t timeout);
+
+/**
+ * Request a delegation on an open Fh
+ * @param cmount the ceph mount handle to use.
+ * @param fh file handle
+ * @param cmd CEPH_DELEGATION_* command
+ * @param cb callback function for recalling delegation
+ * @param priv opaque token passed back during recalls
+ *
+ * @returns 0 if the delegation was granted, -EAGAIN if there was a conflict
+ * and other error codes if there is a fatal error of some sort (e.g. -ENOMEM,
+ * -ETIME)
+ */
+int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh,
+ unsigned int cmd, ceph_deleg_cb_t cb, void *priv);
+
+mode_t ceph_umask(struct ceph_mount_info *cmount, mode_t mode); /* NOTE(review): presumably sets the mount's umask to mode and returns the previous value, like umask(2) -- confirm */
+
+/* state reclaim: flag accepted in the flags argument of ceph_start_reclaim */
+#define CEPH_RECLAIM_RESET 1
+
+/**
+ * Set the ceph client uuid
+ * @param cmount the ceph mount handle to use.
+ * @param uuid the uuid to set
+ *
+ * Must be called before mount.
+ */
+void ceph_set_uuid(struct ceph_mount_info *cmount, const char *uuid);
+
+/**
+ * Set the ceph client session timeout
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the timeout to set (NOTE(review): units not stated here, presumably seconds -- confirm)
+ *
+ * Must be called before mount.
+ */
+void ceph_set_session_timeout(struct ceph_mount_info *cmount, unsigned timeout);
+
+/**
+ * Start to reclaim the states of another client
+ * @param cmount the ceph mount handle to use.
+ * @param uuid uuid of client whose states need to be reclaimed
+ * @param flags flags that control how states get reclaimed
+ *
+ * @returns 0 on success, -EOPNOTSUPP if mds does not support the operation,
+ * -ENOENT if CEPH_RECLAIM_RESET is specified and there is no client
+ * with the given uuid, -ENOTRECOVERABLE in all other error cases.
+ */
+int ceph_start_reclaim(struct ceph_mount_info *cmount,
+ const char *uuid, unsigned flags);
+
+/**
+ * Finish reclaiming the states of another client
+ * @param cmount the ceph mount handle to use.
+ */
+void ceph_finish_reclaim(struct ceph_mount_info *cmount);
+
+/**
+ * Register a set of callbacks to be used with this cmount
+ * @param cmount the ceph mount handle on which the callbacks should be registered
+ * @param args callback arguments to register with the cmount
+ *
+ * Any fields set to NULL will be ignored. There currently is no way to
+ * unregister these callbacks, so this is a one-way change.
+ */
+void ceph_ll_register_callbacks(struct ceph_mount_info *cmount,
+ struct ceph_client_callback_args *args);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/cmp.h b/src/include/cmp.h
new file mode 100644
index 00000000..79372fde
--- /dev/null
+++ b/src/include/cmp.h
@@ -0,0 +1,205 @@
+#ifndef __CEPH_CMP_H
+#define __CEPH_CMP_H
+
+/*
+ * macros to define comparison operators for classes with small numbers of members.
+ */
+
+#define WRITE_EQ_OPERATORS_1(type, a) /* defines inline operator== and operator!= for `type`, comparing member `a` only */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a; \
+ }
+
+#define WRITE_CMP_OPERATORS_1(type, a) /* defines inline operator>, <, >= and <= for `type`, each forwarding to the same operator on member `a` */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a; \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a; \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a >= r.a; \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a <= r.a; \
+ }
+
+#define WRITE_EQ_OPERATORS_2(type, a, b) /* defines inline operator== (all of a, b equal) and operator!= (any of them differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b; \
+ }
+
+#define WRITE_CMP_OPERATORS_2(type, a, b) /* defines inline operator>, <, >=, <= for `type` as a lexicographic comparison: member a first, b breaks ties; the members need == as well as the relational operators */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b)); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b)); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b >= r.b)); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b <= r.b)); \
+ }
+
+
+#define WRITE_EQ_OPERATORS_3(type, a, b, c) /* defines inline operator== (all of a, b, c equal) and operator!= (any of them differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c; \
+ }
+
+#define WRITE_CMP_OPERATORS_3(type, a, b, c) /* defines inline operator>, <, >=, <= for `type` as a lexicographic comparison over members a, b, c (in that order); only the last member uses >= / <= */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c)))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c)))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c >= r.c)))); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c <= r.c)))); \
+ }
+
+#define WRITE_EQ_OPERATORS_4(type, a, b, c, d) /* defines inline operator== (all of a..d equal) and operator!= (any of them differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d; \
+ }
+
+#define WRITE_CMP_OPERATORS_4(type, a, b, c, d) /* defines inline operator>, <, >=, <= for `type` as a lexicographic comparison over members a, b, c, d (in that order) */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d)))))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d)))))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d >= r.d)))))); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d <= r.d)))))); \
+ }
+
+
+
+#define WRITE_EQ_OPERATORS_5(type, a, b, c, d, e) /* defines inline operator== (all of a..e equal) and operator!= (any of them differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e; \
+ }
+
+#define WRITE_CMP_OPERATORS_5(type, a, b, c, d, e) /* defines inline operator>, <, >=, <= for `type` as a lexicographic comparison over members a, b, c, d, e (in that order) */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && l.e > r.e))))))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && (l.e < r.e)))))))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && l.e >= r.e))))))); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && l.e <= r.e))))))); \
+ }
+
+#define WRITE_EQ_OPERATORS_7(type, a, b, c, d, e, f, g) /* defines inline operator== (all of a..g equal) and operator!= (any of them differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e && l.f == r.f && l.g == r.g; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e || l.f != r.f || l.g != r.g; \
+ }
+#define WRITE_CMP_OPERATORS_7(type, a, b, c, d, e, f, g) /* defines inline operator<=, >=, >, < for `type` as a lexicographic comparison over members a..g (in that order) */ \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && (l.e < r.e || \
+ (l.e == r.e && (l.f < r.f || \
+ (l.f == r.f && l.g <= r.g))))))))))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && (l.e > r.e || \
+ (l.e == r.e && (l.f > r.f || \
+ (l.f == r.f && l.g >= r.g))))))))))); \
+ } \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && (l.e > r.e || \
+ (l.e == r.e && (l.f > r.f || \
+ (l.f == r.f && l.g > r.g))))))))))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && (l.e < r.e || \
+ (l.e == r.e && (l.f < r.f || \
+ (l.f == r.f && l.g < r.g))))))))))); \
+ }
+#endif
diff --git a/src/include/color.h b/src/include/color.h
new file mode 100644
index 00000000..6c8df40e
--- /dev/null
+++ b/src/include/color.h
@@ -0,0 +1,13 @@
+#ifndef CEPH_COLOR_H
+#define CEPH_COLOR_H
+
+#define TEXT_NORMAL "\033[0m" /* ANSI SGR escape sequences (ESC [ ... m); this one resets all attributes */
+/*#define TEXT_HAZARD "\033[5;31m"*/
+#define TEXT_RED "\033[0;31m" /* reset, then red foreground */
+#define TEXT_GREEN "\033[0;32m" /* reset, then green foreground */
+#define TEXT_YELLOW "\033[0;33m" /* reset, then yellow foreground */
+#define TEXT_BLUE "\033[0;34m" /* reset, then blue foreground */
+#define TEXT_MAGENTA "\033[0;35m" /* reset, then magenta foreground */
+#define TEXT_CYAN "\033[0;36m" /* reset, then cyan foreground */
+
+#endif
diff --git a/src/include/compact_map.h b/src/include/compact_map.h
new file mode 100644
index 00000000..3ccb7982
--- /dev/null
+++ b/src/include/compact_map.h
@@ -0,0 +1,383 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_MAP_H
+#define CEPH_COMPACT_MAP_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <map>
+#include <memory>
+
+#include "include/encoding.h"
+
+template <class Key, class T, class Map>
+class compact_map_base { // wraps a Map that is heap-allocated on first insertion and freed again once erase()/clear()/decode() leave it empty
+protected:
+  std::unique_ptr<Map> map; // null when nothing is stored (see erase/clear/decode)
+  void alloc_internal() { // make sure the underlying Map exists
+    if (!map)
+      map.reset(new Map);
+  }
+  void free_internal() { // destroy the underlying Map
+    map.reset();
+  }
+  template <class It>
+  class const_iterator_base { // read-only iterator: owning container plus a position inside its Map
+    const compact_map_base *map;
+    It it;
+    const_iterator_base() : map(nullptr) { }
+    const_iterator_base(const compact_map_base* m) : map(m) { }
+    const_iterator_base(const compact_map_base *m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+    friend class iterator_base;
+  public:
+    const_iterator_base(const const_iterator_base& o) {
+      map = o.map;
+      it = o.it;
+    }
+    bool operator==(const const_iterator_base& o) const { // same container, and either no Map is allocated (every position is "end") or the positions match
+      return (map == o.map) && (!map || !map->map || it == o.it); // !map guard: two default-constructed iterators compare equal instead of dereferencing null
+    }
+    bool operator!=(const const_iterator_base& o) const {
+      return !(*this == o);
+    }
+    const_iterator_base& operator=(const const_iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    const_iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    const_iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const std::pair<const Key,T>& operator*() {
+      return *it;
+    }
+    const std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+  };
+  template <class It>
+  class iterator_base { // mutable iterator: owning container plus a position inside its Map
+  private:
+    const compact_map_base* map;
+    It it;
+    iterator_base() : map(nullptr) { }
+    iterator_base(compact_map_base* m) : map(m) { }
+    iterator_base(compact_map_base* m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+  public:
+    iterator_base(const iterator_base& o) {
+      map = o.map;
+      it = o.it;
+    }
+    bool operator==(const iterator_base& o) const { // same rule as const_iterator_base::operator==
+      return (map == o.map) && (!map || !map->map || it == o.it); // !map guard avoids a null dereference for default-constructed iterators
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    std::pair<const Key,T>& operator*() {
+      return *it;
+    }
+    std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+    operator const_iterator_base<It>() const { // conversion to the read-only iterator over the same It
+      return const_iterator_base<It>(map, it);
+    }
+  };
+
+public:
+  class iterator : public iterator_base<typename Map::iterator> {
+  public:
+    iterator() { }
+    iterator(const iterator_base<typename Map::iterator>& o)
+      : iterator_base<typename Map::iterator>(o) { }
+    iterator(compact_map_base* m) : iterator_base<typename Map::iterator>(m) { }
+    iterator(compact_map_base* m, const typename Map::iterator& i)
+      : iterator_base<typename Map::iterator>(m, i) { }
+  };
+  class const_iterator : public const_iterator_base<typename Map::const_iterator> {
+  public:
+    const_iterator() { }
+    const_iterator(const iterator_base<typename Map::const_iterator>& o) // o is converted through iterator_base's conversion operator
+      : const_iterator_base<typename Map::const_iterator>(o) { }
+    const_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_iterator>(m) { }
+    const_iterator(const compact_map_base* m, const typename Map::const_iterator& i)
+      : const_iterator_base<typename Map::const_iterator>(m, i) { }
+  };
+  class reverse_iterator : public iterator_base<typename Map::reverse_iterator> {
+  public:
+    reverse_iterator() { }
+    reverse_iterator(const iterator_base<typename Map::reverse_iterator>& o)
+      : iterator_base<typename Map::reverse_iterator>(o) { }
+    reverse_iterator(compact_map_base* m) : iterator_base<typename Map::reverse_iterator>(m) { }
+    reverse_iterator(compact_map_base* m, const typename Map::reverse_iterator& i)
+      : iterator_base<typename Map::reverse_iterator>(m, i) { }
+  };
+  class const_reverse_iterator : public const_iterator_base<typename Map::const_reverse_iterator> {
+  public:
+    const_reverse_iterator() { }
+    const_reverse_iterator(const iterator_base<typename Map::const_reverse_iterator>& o) // the initializer must name the real base (const_iterator_base); it used to name iterator_base, which is not a base of this class
+      : const_iterator_base<typename Map::const_reverse_iterator>(o) { }
+    const_reverse_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_reverse_iterator>(m) { }
+    const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i)
+      : const_iterator_base<typename Map::const_reverse_iterator>(m, i) { }
+  };
+  compact_map_base(const compact_map_base& o) { // deep copy; stays unallocated when o holds no Map
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    }
+  }
+  compact_map_base() {}
+  ~compact_map_base() {}
+
+  bool empty() const {
+    return !map || map->empty();
+  }
+  size_t size() const {
+    return map ? map->size() : 0;
+  }
+  bool operator==(const compact_map_base& o) const { // two empty containers are equal whether or not either has a Map allocated
+    return (empty() && o.empty()) || (map && o.map && *map == *o.map);
+  }
+  bool operator!=(const compact_map_base& o) const {
+    return !(*this == o);
+  }
+  size_t count (const Key& k) const {
+    return map ? map->count(k) : 0;
+  }
+  iterator erase (iterator p) { // erases the element at p; frees the Map (and returns an iterator without a position) when it becomes empty
+    if (map) {
+      ceph_assert(this == p.map);
+      auto it = map->erase(p.it);
+      if (map->empty()) {
+        free_internal();
+        return iterator(this);
+      } else {
+        return iterator(this, it);
+      }
+    } else {
+      return iterator(this);
+    }
+  }
+  size_t erase (const Key& k) { // returns the number of elements removed; frees the Map when it becomes empty
+    if (!map)
+      return 0;
+    size_t r = map->erase(k);
+    if (map->empty())
+      free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_map_base& o) {
+    map.swap(o.map);
+  }
+  compact_map_base& operator=(const compact_map_base& o) { // deep copy; drops our Map when o holds none
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    } else
+      free_internal();
+    return *this;
+  }
+  iterator insert(const std::pair<const Key, T>& val) { // NOTE(review): only well-formed when Map::insert(value) returns an iterator (e.g. std::multimap); std::map::insert returns pair<iterator,bool>
+    alloc_internal();
+    return iterator(this, map->insert(val));
+  }
+  template <class... Args>
+  std::pair<iterator,bool> emplace ( Args&&... args ) { // NOTE(review): expects Map::emplace to return pair<iterator,bool>, as std::map does
+    alloc_internal();
+    auto em = map->emplace(std::forward<Args>(args)...);
+    return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+  }
+  iterator begin() { // every accessor below returns a position-less iterator while no Map is allocated; such iterators all compare equal
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->begin());
+  }
+  iterator end() {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->end());
+  }
+  reverse_iterator rbegin() {
+    if (!map)
+      return reverse_iterator(this);
+    return reverse_iterator(this, map->rbegin());
+  }
+  reverse_iterator rend() {
+    if (!map)
+      return reverse_iterator(this);
+    return reverse_iterator(this, map->rend());
+  }
+  iterator find(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->find(k));
+  }
+  iterator lower_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->lower_bound(k));
+  }
+  iterator upper_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->upper_bound(k));
+  }
+  const_iterator begin() const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->begin());
+  }
+  const_iterator end() const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->end());
+  }
+  const_reverse_iterator rbegin() const {
+    if (!map)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, map->rbegin());
+  }
+  const_reverse_iterator rend() const {
+    if (!map)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, map->rend());
+  }
+  const_iterator find(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->find(k));
+  }
+  const_iterator lower_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->lower_bound(k));
+  }
+  const_iterator upper_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->upper_bound(k));
+  }
+  void encode(bufferlist &bl) const { // encodes the Map, or a lone uint32_t 0 when no Map is allocated
+    using ceph::encode;
+    if (map)
+      encode(*map, bl);
+    else
+      encode((uint32_t)0, bl);
+  }
+  void encode(bufferlist &bl, uint64_t features) const { // as above, passing features through to the Map encoder
+    using ceph::encode;
+    if (map)
+      encode(*map, bl, features);
+    else
+      encode((uint32_t)0, bl);
+  }
+  void decode(bufferlist::const_iterator& p) { // reads a uint32_t count n, then n entries via decode_nohead; n == 0 frees the Map
+    using ceph::decode;
+    using ceph::decode_nohead;
+    uint32_t n;
+    decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      decode_nohead(n, *map, p);
+    } else
+      free_internal();
+  }
+};
+
+template<class Key, class T, class Map> // free-function form of compact_map_base::encode(bl)
+inline void encode(const compact_map_base<Key, T, Map>& m, bufferlist& bl) {
+ m.encode(bl);
+}
+template<class Key, class T, class Map> // free-function form of compact_map_base::encode(bl, features)
+inline void encode(const compact_map_base<Key, T, Map>& m, bufferlist& bl,
+ uint64_t features) {
+ m.encode(bl, features);
+}
+template<class Key, class T, class Map> // free-function form of compact_map_base::decode(p)
+inline void decode(compact_map_base<Key, T, Map>& m, bufferlist::const_iterator& p) {
+ m.decode(p);
+}
+
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_map : public compact_map_base<Key, T, std::map<Key,T,Compare,Alloc> > {
+public:
+  T& operator[](const Key& k) { // subscript access: forwards to the underlying std::map's operator[]
+    this->alloc_internal(); // the underlying map may not exist yet
+    return this->map->operator[](k);
+  }
+};
+
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T, Compare, Alloc>& m)
+{
+  // Writes the map as "{k1=v1,k2=v2}"; a comma precedes every entry except the first.
+  bool need_comma = false;
+  out << "{";
+  for (const auto &entry : m) {
+    if (need_comma)
+      out << ",";
+    need_comma = true;
+    out << entry.first << "=" << entry.second;
+  }
+  return out << "}";
+}
+
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > // multimap flavour: compact_map_base over std::multimap, with no members of its own
+class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T,Compare,Alloc> > {
+};
+
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T, Compare, Alloc>& m)
+{
+  // Writes the multimap as "{{k1=v1,k2=v2}}"; a comma precedes every entry except the first.
+  bool need_comma = false;
+  out << "{{";
+  for (const auto &entry : m) {
+    if (need_comma)
+      out << ",";
+    need_comma = true;
+    out << entry.first << "=" << entry.second;
+  }
+  return out << "}}";
+}
+#endif
diff --git a/src/include/compact_set.h b/src/include/compact_set.h
new file mode 100644
index 00000000..ba743fb0
--- /dev/null
+++ b/src/include/compact_set.h
@@ -0,0 +1,305 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_SET_H
+#define CEPH_COMPACT_SET_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <memory>
+#include <set>
+
+/*
+ * Set wrapper that keeps its backing container behind a pointer.
+ *
+ * The backing Set is allocated on the first insert/emplace/decode and
+ * released again by clear() or when erase() removes the last element, so a
+ * set with no elements carries only a null pointer.
+ *
+ * T   - element type
+ * Set - backing container type (e.g. std::set<T, ...>)
+ */
+template <class T, class Set>
+class compact_set_base {
+protected:
+  // Backing container; null when nothing has been stored.
+  std::unique_ptr<Set> set;
+  // Create the backing container if it does not exist yet.
+  void alloc_internal() {
+    if (!set)
+      set.reset(new Set);
+  }
+  // Drop the backing container (and with it every element).
+  void free_internal() {
+    set.reset();
+  }
+  /*
+   * Common implementation of all four iterator flavours.
+   *
+   * Besides the underlying container iterator it remembers which
+   * compact_set_base it belongs to, because while the backing container is
+   * unallocated there is no underlying iterator to compare: in that state
+   * every iterator of the same compact_set_base compares equal
+   * (begin() == end()).
+   */
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_set_base* set;  // owner; null only when default-constructed
+    It it;                        // position in the owner's backing container
+    // `it` is value-initialized so that copying an iterator that was created
+    // without a position never reads an indeterminate value.
+    iterator_base() : set(nullptr), it() { }
+    iterator_base(const compact_set_base* s) : set(s), it() { }
+    iterator_base(const compact_set_base* s, const It& i) : set(s), it(i) { }
+    friend class compact_set_base;
+  public:
+    iterator_base(const iterator_base& o) : set(o.set), it(o.it) { }
+    bool operator==(const iterator_base& o) const {
+      if (set != o.set)
+        return false;
+      // Same owner: with no owner or no backing container there is only one
+      // possible position, otherwise compare the underlying iterators.
+      return !set || !set->set || it == o.it;
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      // Rebind this iterator to o's owner; the owner itself is not modified.
+      set = o.set;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const T& operator*() const {
+      return *it;
+    }
+  };
+public:
+  class const_iterator : public iterator_base<typename Set::const_iterator> {
+  public:
+    const_iterator() { }
+    const_iterator(const iterator_base<typename Set::const_iterator>& o)
+      : iterator_base<typename Set::const_iterator>(o) { }
+    const_iterator(const compact_set_base* s) : iterator_base<typename Set::const_iterator>(s) { }
+    const_iterator(const compact_set_base* s, const typename Set::const_iterator& i)
+      : iterator_base<typename Set::const_iterator>(s, i) { }
+  };
+  class iterator : public iterator_base<typename Set::iterator> {
+  public:
+    iterator() { }
+    iterator(const iterator_base<typename Set::iterator>& o)
+      : iterator_base<typename Set::iterator>(o) { }
+    iterator(compact_set_base* s) : iterator_base<typename Set::iterator>(s) { }
+    iterator(compact_set_base* s, const typename Set::iterator& i)
+      : iterator_base<typename Set::iterator>(s, i) { }
+    operator const_iterator() const {
+      return const_iterator(this->set, this->it);
+    }
+  };
+  class const_reverse_iterator : public iterator_base<typename Set::const_reverse_iterator> {
+  public:
+    const_reverse_iterator() { }
+    const_reverse_iterator(const iterator_base<typename Set::const_reverse_iterator>& o)
+      : iterator_base<typename Set::const_reverse_iterator>(o) { }
+    const_reverse_iterator(const compact_set_base* s) : iterator_base<typename Set::const_reverse_iterator>(s) { }
+    const_reverse_iterator(const compact_set_base* s, const typename Set::const_reverse_iterator& i)
+      : iterator_base<typename Set::const_reverse_iterator>(s, i) { }
+  };
+  class reverse_iterator : public iterator_base<typename Set::reverse_iterator> {
+  public:
+    reverse_iterator() { }
+    reverse_iterator(const iterator_base<typename Set::reverse_iterator>& o)
+      : iterator_base<typename Set::reverse_iterator>(o) { }
+    reverse_iterator(compact_set_base* s) : iterator_base<typename Set::reverse_iterator>(s) { }
+    reverse_iterator(compact_set_base* s, const typename Set::reverse_iterator& i)
+      : iterator_base<typename Set::reverse_iterator>(s, i) { }
+    // A reverse iterator converts to its const *reverse* counterpart; the
+    // forward const_iterator cannot be built from a reverse position.
+    operator const_reverse_iterator() const {
+      return const_reverse_iterator(this->set, this->it);
+    }
+  };
+
+  compact_set_base() {}
+  // Deep copy: the backing container is duplicated only if o has one.
+  compact_set_base(const compact_set_base& o) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    }
+  }
+  ~compact_set_base() {}
+
+
+  bool empty() const {
+    return !set || set->empty();
+  }
+  size_t size() const {
+    return set ? set->size() : 0;
+  }
+  // Two sets are equal when both are empty (allocated or not) or when both
+  // backing containers exist and compare equal.
+  bool operator==(const compact_set_base& o) const {
+    return (empty() && o.empty()) || (set && o.set && *set == *o.set);
+  }
+  bool operator!=(const compact_set_base& o) const {
+    return !(*this == o);
+  }
+  size_t count(const T& t) const {
+    return set ? set->count(t) : 0;
+  }
+  // Erase the element at p and return the iterator following it. p must
+  // belong to this object. If the set becomes empty the backing container
+  // is released and the position-less (end-equivalent) iterator is returned.
+  iterator erase (iterator p) {
+    if (set) {
+      ceph_assert(this == p.set);
+      auto it = set->erase(p.it);
+      if (set->empty()) {
+        free_internal();
+        return iterator(this);
+      } else {
+        return iterator(this, it);
+      }
+    } else {
+      return iterator(this);
+    }
+  }
+  // Erase by value; returns the number of elements removed and releases the
+  // backing container when nothing is left.
+  size_t erase (const T& t) {
+    if (!set)
+      return 0;
+    size_t r = set->erase(t);
+    if (set->empty())
+      free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_set_base& o) {
+    set.swap(o.set);
+  }
+  compact_set_base& operator=(const compact_set_base& o) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    } else
+      free_internal();
+    return *this;
+  }
+  std::pair<iterator,bool> insert(const T& t) {
+    alloc_internal();
+    std::pair<typename Set::iterator,bool> r = set->insert(t);
+    return std::make_pair(iterator(this, r.first), r.second);
+  }
+  template <class... Args>
+  std::pair<iterator,bool> emplace ( Args&&... args ) {
+    alloc_internal();
+    auto em = set->emplace(std::forward<Args>(args)...);
+    return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+  }
+
+  // All lookup/iteration accessors below return the position-less iterator
+  // for this object while the backing container is unallocated; such
+  // iterators all compare equal, so begin() == end() and find() == end().
+  iterator begin() {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->begin());
+  }
+  iterator end() {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->end());
+  }
+  reverse_iterator rbegin() {
+    if (!set)
+      return reverse_iterator(this);
+    return reverse_iterator(this, set->rbegin());
+  }
+  reverse_iterator rend() {
+    if (!set)
+      return reverse_iterator(this);
+    return reverse_iterator(this, set->rend());
+  }
+  iterator find(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->find(t));
+  }
+  iterator lower_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->lower_bound(t));
+  }
+  iterator upper_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->upper_bound(t));
+  }
+  const_iterator begin() const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->begin());
+  }
+  const_iterator end() const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->end());
+  }
+  const_reverse_iterator rbegin() const {
+    if (!set)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, set->rbegin());
+  }
+  const_reverse_iterator rend() const {
+    if (!set)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, set->rend());
+  }
+  const_iterator find(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->find(t));
+  }
+  const_iterator lower_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->lower_bound(t));
+  }
+  const_iterator upper_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->upper_bound(t));
+  }
+  // Encode the backing container, or a bare uint32_t zero when there is
+  // none; decode() reads that leading uint32_t back as the element count.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    if (set)
+      encode(*set, bl);
+    else
+      encode((uint32_t)0, bl);
+  }
+  // Read the uint32_t element count, then the elements. A count of zero
+  // releases the backing container.
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    uint32_t n;
+    decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      // NOTE(review): an already-populated backing container is not cleared
+      // before this call -- confirm decode_nohead() replaces the contents.
+      decode_nohead(n, *set, p);
+    } else
+      free_internal();
+  }
+};
+
+// Free-function encode overload for any compact_set_base instantiation.
+// Forwards to the member m.encode(bl).
+template<class T, class Set>
+inline void encode(const compact_set_base<T, Set>& m, bufferlist& bl) {
+ m.encode(bl);
+}
+// Free-function decode overload for any compact_set_base instantiation.
+// Forwards to the member m.decode(p), reading from the iterator p.
+template<class T, class Set>
+inline void decode(compact_set_base<T, Set>& m, bufferlist::const_iterator& p) {
+ m.decode(p);
+}
+
+// compact_set_base specialised on std::set. It adds no members of its own;
+// everything comes from the base class.
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+class compact_set : public compact_set_base<T, std::set<T, Compare, Alloc> > {
+};
+
+// Streams the elements of a compact_set separated by "," with no enclosing
+// brackets; an empty set writes nothing.
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+inline std::ostream& operator<<(std::ostream& out, const compact_set<T,Compare,Alloc>& s)
+{
+  auto pos = s.begin();
+  const auto stop = s.end();
+  if (pos != stop) {
+    // First element goes out bare, every later one is preceded by a comma.
+    out << *pos;
+    for (++pos; pos != stop; ++pos) {
+      out << ",";
+      out << *pos;
+    }
+  }
+  return out;
+}
+#endif
diff --git a/src/include/compat.h b/src/include/compat.h
new file mode 100644
index 00000000..7c75dac2
--- /dev/null
+++ b/src/include/compat.h
@@ -0,0 +1,198 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_COMPAT_H
+#define CEPH_COMPAT_H
+
+#include "acconfig.h"
+#include <sys/types.h>
+
+#if defined(__linux__)
+#define PROCPREFIX
+#endif
+
+#include <sys/stat.h>
+#ifndef ACCESSPERMS
+#define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO)
+#endif
+
+#if defined(__FreeBSD__)
+
+// FreeBSD supports Linux procfs with its compatibility module
+// And all compatibility stuff is standard mounted on this
+#define PROCPREFIX "/compat/linux"
+
+#ifndef MSG_MORE
+#define MSG_MORE 0
+#endif
+
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+/* And include the extra required include file */
+#include <pthread_np.h>
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#define cpu_set_t cpuset_t
+int sched_setaffinity(pid_t pid, size_t cpusetsize,
+ cpu_set_t *mask);
+
+#endif /* __FreeBSD__ */
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+/* Make sure that ENODATA is defined in the correct way */
+#ifdef ENODATA
+#if (ENODATA == 9919)
+// #warning ENODATA already defined to be 9919, redefining to fix
+// Silencing this warning because it fires at all files where compat.h
+// is included after boost files.
+//
+// This value stems from the definition in the boost library
+// And when this case occurs it is due to the fact that boost files
+// are included before this file. Redefinition might not help in this
+// case since already parsed code has evaluated to the wrong value.
+// This would warrant a definition that is actually evaluated
+// at the location of usage and reports a possible conflict.
+// This is left up to a future improvement
+#elif (ENODATA != 87)
+// #warning ENODATA already defined to a value different from 87 (ENOATTR), redefining to fix
+#endif
+#undef ENODATA
+#endif
+#define ENODATA ENOATTR
+
+// Fix clock accuracy
+#if !defined(CLOCK_MONOTONIC_COARSE)
+#if defined(CLOCK_MONOTONIC_FAST)
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST
+#else
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+#endif
+#endif
+#if !defined(CLOCK_REALTIME_COARSE)
+#if defined(CLOCK_REALTIME_FAST)
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME_FAST
+#else
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#endif
+#endif
+
+/* get PATH_MAX */
+#include <limits.h>
+
+#ifndef EUCLEAN
+#define EUCLEAN 117
+#endif
+#ifndef EREMOTEIO
+#define EREMOTEIO 121
+#endif
+#ifndef EKEYREJECTED
+#define EKEYREJECTED 129
+#endif
+#ifndef XATTR_CREATE
+#define XATTR_CREATE 1
+#endif
+
+#ifndef HOST_NAME_MAX
+#ifdef MAXHOSTNAMELEN
+#define HOST_NAME_MAX MAXHOSTNAMELEN
+#else
+#define HOST_NAME_MAX 255
+#endif
+#endif
+
+#endif /* __APPLE__ */
+
+/* O_LARGEFILE is not defined/required on OSX/FreeBSD */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/* Could be relevant for other platforms */
+#ifndef ERESTART
+#define ERESTART EINTR
+#endif
+
+/*
+ * Fallback for platforms that do not already provide TEMP_FAILURE_RETRY.
+ * Evaluates `expression` repeatedly for as long as it yields -1 with errno
+ * set to EINTR, and produces the final result as the value of the macro.
+ * Written as a ({ ... }) statement expression, so it needs a compiler that
+ * supports that extension; `expression` is re-evaluated on every retry.
+ */
+#ifndef TEMP_FAILURE_RETRY
+#define TEMP_FAILURE_RETRY(expression) ({ \
+ __typeof(expression) __result; \
+ do { \
+ __result = (expression); \
+ } while (__result == -1 && errno == EINTR); \
+ __result; })
+#endif
+
+/*
+ * TEMP_FAILURE_RETRY for callers that deliberately ignore the result.
+ * The C++ form discards the value with static_cast<void>; the C form uses a
+ * (void) cast wrapped in do { } while (0) so it behaves as one statement.
+ */
+#ifdef __cplusplus
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+ static_cast<void>(TEMP_FAILURE_RETRY(expression))
+#else
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+ do { (void)TEMP_FAILURE_RETRY(expression); } while (0)
+#endif
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#define lseek64(fd, offset, whence) lseek(fd, offset, whence)
+#endif
+
+/* Definitions supplied here for Solaris and AIX builds. */
+#if defined(__sun) || defined(_AIX)
+#define LOG_AUTHPRIV (10<<3)
+#define LOG_FTP (11<<3)
+/* Stringize the argument: __STRING(foo) must yield "foo". Macro parameters
+ * are not substituted inside string literals, so the stringizing operator
+ * is required here. */
+#define __STRING(x) #x
+/* Shift the file-type bits of a st_mode value (mask 0170000) down by 12. */
+#define IFTODT(mode) (((mode) & 0170000) >> 12)
+#endif
+
+#if defined(_AIX)
+#define MSG_DONTWAIT MSG_NONBLOCK
+#endif
+
+/*
+ * ceph_pthread_setname(thread, name): set a thread's name using whichever
+ * call the build configuration detected. The macro variants below evaluate
+ * to 0; the plain alias returns whatever pthread_setname_np returns.
+ */
+#if defined(HAVE_PTHREAD_SETNAME_NP)
+ #if defined(__APPLE__)
+ /* Here pthread_setname_np is called with the name only, so it is invoked
+ * just when `thread` is the calling thread; for any other thread the macro
+ * does nothing and yields 0. */
+ #define ceph_pthread_setname(thread, name) ({ \
+ int __result = 0; \
+ if (thread == pthread_self()) \
+ __result = pthread_setname_np(name); \
+ __result; })
+ #else
+ #define ceph_pthread_setname pthread_setname_np
+ #endif
+#elif defined(HAVE_PTHREAD_SET_NAME_NP)
+ /* Fix a small name diff and return 0 */
+ #define ceph_pthread_setname(thread, name) ({ \
+ pthread_set_name_np(thread, name); \
+ 0; })
+#else
+ /* compiler warning free success noop */
+ #define ceph_pthread_setname(thread, name) ({ \
+ int __i = 0; \
+ __i; })
+#endif
+
+/*
+ * ceph_pthread_getname(thread, name, len): read a thread's name using
+ * whichever call the build configuration detected. The macro variants
+ * evaluate to 0; the plain alias returns whatever pthread_getname_np returns.
+ */
+#if defined(HAVE_PTHREAD_GETNAME_NP)
+  #define ceph_pthread_getname pthread_getname_np
+#elif defined(HAVE_PTHREAD_GET_NAME_NP)
+  #define ceph_pthread_getname(thread, name, len) ({ \
+    pthread_get_name_np(thread, name, len); \
+    0; })
+#else
+  /* compiler warning free success noop: hands back an empty string.
+   * `name` is parenthesized so that an expression argument such as
+   * `buf + off` is dereferenced as a whole. */
+  #define ceph_pthread_getname(thread, name, len) ({ \
+    if ((name) != NULL) \
+      *(name) = '\0'; \
+    0; })
+#endif
+
+int ceph_posix_fallocate(int fd, off_t offset, off_t len);
+
+int pipe_cloexec(int pipefd[2]);
+
+#endif /* !CEPH_COMPAT_H */
diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake
new file mode 100644
index 00000000..acced696
--- /dev/null
+++ b/src/include/config-h.in.cmake
@@ -0,0 +1,366 @@
+/* config.h file expanded by Cmake for build */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+/* fallocate(2) is supported */
+#cmakedefine CEPH_HAVE_FALLOCATE
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#cmakedefine HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#cmakedefine HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `syncfs' function. */
+#cmakedefine HAVE_SYS_SYNCFS 1
+
+/* sync_file_range(2) is supported */
+#cmakedefine HAVE_SYNC_FILE_RANGE
+
+/* Define if you have mallinfo */
+#cmakedefine HAVE_MALLINFO
+
+/* Define to 1 if you have the `pwritev' function. */
+#cmakedefine HAVE_PWRITEV 1
+
+/* Define to 1 if you have the <sys/mount.h> header file. */
+#cmakedefine HAVE_SYS_MOUNT_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#cmakedefine HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#cmakedefine HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/vfs.h> header file. */
+#cmakedefine HAVE_SYS_VFS_H 1
+
+/* Define to 1 if you have the <execinfo.h> header file. */
+#cmakedefine HAVE_EXECINFO_H 1
+
+/* Define to 1 if the system has the type `__be16'. */
+#cmakedefine HAVE___BE16 1
+
+/* Define to 1 if the system has the type `__be32'. */
+#cmakedefine HAVE___BE32 1
+
+/* Define to 1 if the system has the type `__be64'. */
+#cmakedefine HAVE___BE64 1
+
+/* Define to 1 if the system has the type `__le16'. */
+#cmakedefine HAVE___LE16 1
+
+/* Define to 1 if the system has the type `__le32'. */
+#cmakedefine HAVE___LE32 1
+
+/* Define to 1 if the system has the type `__le64'. */
+#cmakedefine HAVE___LE64 1
+
+/* Define to 1 if the system has the type `__s16'. */
+#cmakedefine HAVE___S16 1
+
+/* Define to 1 if the system has the type `__s32'. */
+#cmakedefine HAVE___S32 1
+
+/* Define to 1 if the system has the type `__s64'. */
+#cmakedefine HAVE___S64 1
+
+/* Define to 1 if the system has the type `__s8'. */
+#cmakedefine HAVE___S8 1
+
+/* Define to 1 if the system has the type `__u16'. */
+#cmakedefine HAVE___U16 1
+
+/* Define to 1 if the system has the type `__u32'. */
+#cmakedefine HAVE___U32 1
+
+/* Define to 1 if the system has the type `__u64'. */
+#cmakedefine HAVE___U64 1
+
+/* Define to 1 if the system has the type `__u8'. */
+#cmakedefine HAVE___U8 1
+
+/* Define if you have res_nquery */
+#cmakedefine HAVE_RES_NQUERY
+
+/* Defined if you have LZ4 */
+#cmakedefine HAVE_LZ4
+
+/* Defined if you have BROTLI */
+#cmakedefine HAVE_BROTLI
+
+/* Defined if you have libaio */
+#cmakedefine HAVE_LIBAIO
+
+/* Defined if you have POSIX AIO */
+#cmakedefine HAVE_POSIXAIO
+
+/* Defined if OpenLDAP enabled */
+#cmakedefine HAVE_OPENLDAP
+
+/* Define if you have fuse */
+#cmakedefine HAVE_LIBFUSE
+
+/* Define to 1 if you have libxfs */
+#cmakedefine HAVE_LIBXFS 1
+
+/* SPDK conditional compilation */
+#cmakedefine HAVE_SPDK
+
+/* DPDK conditional compilation */
+#cmakedefine HAVE_DPDK
+
+/* PMEM conditional compilation */
+#cmakedefine HAVE_PMEM
+
+/* Defined if LevelDB supports bloom filters */
+#cmakedefine HAVE_LEVELDB_FILTER_POLICY
+
+/* Define if you have tcmalloc */
+#cmakedefine HAVE_LIBTCMALLOC
+
+/* Define if have curl_multi_wait() */
+#cmakedefine HAVE_CURL_MULTI_WAIT 1
+
+/* Define if using NSS. */
+#cmakedefine USE_NSS
+
+/* Define if using OpenSSL. */
+#cmakedefine USE_OPENSSL
+
+/* Accelio conditional compilation */
+#cmakedefine HAVE_XIO
+
+
+/* AsyncMessenger RDMA conditional compilation */
+#cmakedefine HAVE_RDMA
+
+/* ibverbs experimental conditional compilation */
+#cmakedefine HAVE_IBV_EXP
+
+/* define if bluestore enabled */
+#cmakedefine WITH_BLUESTORE
+
+/* define if cephfs enabled */
+#cmakedefine WITH_CEPHFS
+
+/* define if GSSAPI/KRB5 enabled */
+#cmakedefine HAVE_GSSAPI
+
+/* define if rbd enabled */
+#cmakedefine WITH_RBD
+
+/* define if kernel rbd enabled */
+#cmakedefine WITH_KRBD
+
+/* define if key-value-store is enabled */
+#cmakedefine WITH_KVS
+
+/* define if radosgw enabled */
+#cmakedefine WITH_RADOSGW
+
+/* define if radosgw's fcgi frontend enabled */
+#cmakedefine WITH_RADOSGW_FCGI_FRONTEND
+
+/* define if leveldb is enabled */
+#cmakedefine WITH_LEVELDB
+
+/* define if radosgw's beast frontend enabled */
+#cmakedefine WITH_RADOSGW_BEAST_FRONTEND
+
+/* define if radosgw has openssl support */
+#cmakedefine WITH_CURL_OPENSSL
+
+/* define if HAVE_THREAD_SAFE_RES_QUERY */
+#cmakedefine HAVE_THREAD_SAFE_RES_QUERY
+
+/* define if HAVE_REENTRANT_STRSIGNAL */
+#cmakedefine HAVE_REENTRANT_STRSIGNAL
+
+/* Define if you want to use LTTng */
+#cmakedefine WITH_LTTNG
+
+/* Define if you want OSD function instrumentation */
+#cmakedefine WITH_OSD_INSTRUMENT_FUNCTIONS
+
+/* Define if you want to use Babeltrace */
+#cmakedefine WITH_BABELTRACE
+
+/* Define to 1 if you have the <babeltrace/babeltrace.h> header file. */
+#cmakedefine HAVE_BABELTRACE_BABELTRACE_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/events.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_EVENTS_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/iterator.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_ITERATOR_H 1
+
+/* Define to 1 if you have the <arpa/nameser_compat.h> header file. */
+#cmakedefine HAVE_ARPA_NAMESER_COMPAT_H 1
+
+/* FastCGI headers are in /usr/include/fastcgi */
+#cmakedefine FASTCGI_INCLUDE_DIR
+
+/* splice(2) is supported */
+#cmakedefine CEPH_HAVE_SPLICE
+
+/* Define if you want C_Gather debugging */
+#cmakedefine DEBUG_GATHER
+
+/* Define to 1 if you have the `getgrouplist' function. */
+#cmakedefine HAVE_GETGROUPLIST 1
+
+/* LTTng is disabled, so define this macro to be nothing. */
+#cmakedefine tracepoint
+
+/* Define to 1 if you have fdatasync. */
+#cmakedefine HAVE_FDATASYNC 1
+
+/* Defined if you have librocksdb enabled */
+#cmakedefine HAVE_LIBROCKSDB
+
+/* Define to 1 if you have the <valgrind/helgrind.h> header file. */
+#cmakedefine HAVE_VALGRIND_HELGRIND_H 1
+
+/* Define to 1 if you have the <sys/prctl.h> header file. */
+#cmakedefine HAVE_SYS_PRCTL_H 1
+
+/* Define to 1 if you have the <linux/types.h> header file. */
+#cmakedefine HAVE_LINUX_TYPES_H 1
+
+/* Define to 1 if you have the <linux/version.h> header file. */
+#cmakedefine HAVE_LINUX_VERSION_H 1
+
+/* Define to 1 if you have sched.h. */
+#cmakedefine HAVE_SCHED 1
+
+/* Define to 1 if you have sigdescr_np. */
+#cmakedefine HAVE_SIGDESCR_NP 1
+
+/* Support SSE (Streaming SIMD Extensions) instructions */
+#cmakedefine HAVE_SSE
+
+/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
+#cmakedefine HAVE_SSE2
+
+/* Define to 1 if you have the `pipe2' function. */
+#cmakedefine HAVE_PIPE2 1
+
+/* Support NEON instructions */
+#cmakedefine HAVE_NEON
+
+/* Define if you have pthread_spin_init */
+#cmakedefine HAVE_PTHREAD_SPINLOCK
+
+/* name_to_handle_at exists */
+#cmakedefine HAVE_NAME_TO_HANDLE_AT
+
+/* we have a recent yasm and are x86_64 */
+#cmakedefine HAVE_GOOD_YASM_ELF64
+
+/* yasm can also build the isa-l */
+#cmakedefine HAVE_BETTER_YASM_ELF64
+
+/* Define to 1 if strerror_r returns char *. */
+#cmakedefine STRERROR_R_CHAR_P 1
+
+/* Defined if you have libzfs enabled */
+#cmakedefine HAVE_LIBZFS
+
+/* Define if the C compiler supports __func__ */
+#cmakedefine HAVE_FUNC
+
+/* Define if the C compiler supports __PRETTY_FUNCTION__ */
+#cmakedefine HAVE_PRETTY_FUNC
+
+/* Have eventfd extension. */
+#cmakedefine HAVE_EVENTFD
+
+/* Define if enabling coverage. */
+#cmakedefine ENABLE_COVERAGE
+
+/* Defined if you want pg ref debugging */
+#cmakedefine PG_DEBUG_REFS
+
+/* Support ARMv8 CRC instructions */
+#cmakedefine HAVE_ARMV8_CRC
+
+/* Support ARMv8 CRYPTO instructions */
+#cmakedefine HAVE_ARMV8_CRYPTO
+
+/* Support ARMv8 CRC and CRYPTO intrinsics */
+#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
+/* Define if you have struct stat.st_mtimespec.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIMESPEC_TV_NSEC
+
+/* Define if you have struct stat.st_mtim.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIM_TV_NSEC
+
+/* Define if compiler supports static_cast<> */
+#cmakedefine HAVE_STATIC_CAST
+
+/* Version number of package */
+#cmakedefine VERSION "@VERSION@"
+
+/* Defined if pthread_setname_np() is available */
+#cmakedefine HAVE_PTHREAD_SETNAME_NP 1
+
+/* Defined if pthread_rwlockattr_setkind_np() is available */
+#cmakedefine HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP
+
+/* Defined if blkin enabled */
+#cmakedefine WITH_BLKIN
+
+/* Defined if pthread_set_name_np() is available */
+#cmakedefine HAVE_PTHREAD_SET_NAME_NP
+
+/* Defined if pthread_getname_np() is available */
+#cmakedefine HAVE_PTHREAD_GETNAME_NP 1
+
+/* Support POWER8 instructions */
+#cmakedefine HAVE_POWER8
+
+/* Define if endian type is big endian */
+#cmakedefine CEPH_BIG_ENDIAN
+
+/* Define if endian type is little endian */
+#cmakedefine CEPH_LITTLE_ENDIAN
+
+#cmakedefine MGR_PYTHON_EXECUTABLE "@MGR_PYTHON_EXECUTABLE@"
+
+/* Define to 1 if you have the `getprogname' function. */
+#cmakedefine HAVE_GETPROGNAME 1
+
+/* Defined if getentropy() is available */
+#cmakedefine HAVE_GETENTROPY
+
+/* Defined if boost::context is available */
+#cmakedefine HAVE_BOOST_CONTEXT
+
+/* Defined if libradosstriper is enabled: */
+#cmakedefine WITH_LIBRADOSSTRIPER
+
+/* Defined if OpenSSL is available for the rgw beast frontend */
+#cmakedefine WITH_RADOSGW_BEAST_OPENSSL
+
+/* Defined if rabbitmq-c is available for rgw amqp push endpoint */
+#cmakedefine WITH_RADOSGW_AMQP_ENDPOINT
+
+/* Defined if librdkafka is available for rgw kafka push endpoint */
+#cmakedefine WITH_RADOSGW_KAFKA_ENDPOINT
+
+/* Defined if std::map::merge() is supported */
+#cmakedefine HAVE_STDLIB_MAP_SPLICING
+
+/* Defined if Intel QAT compress/decompress is supported */
+#cmakedefine HAVE_QATZIP
+
+/* Define if seastar is available. */
+#cmakedefine HAVE_SEASTAR
+
+/* Define if unit tests are built. */
+#cmakedefine UNIT_TESTS_BUILT
+
+#endif /* CONFIG_H */
diff --git a/src/include/coredumpctl.h b/src/include/coredumpctl.h
new file mode 100644
index 00000000..60fab432
--- /dev/null
+++ b/src/include/coredumpctl.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <iostream>
+#include <sys/prctl.h>
+#include "common/errno.h"
+
+/*
+ * Scoped guard around the process "dumpable" flag (prctl variant).
+ *
+ * The constructor switches the flag to new_state if it differs from the
+ * current value; the destructor puts the previous value back, but only when
+ * the constructor actually changed it.
+ */
+class PrCtl {
+  // Flag value to restore on destruction; negative means nothing to restore.
+  int saved_state = -1;
+  // Returns the current dumpable flag (>= 0), or a negative errno value after
+  // printing a warning when the query fails.
+  static int get_dumpable() {
+    int r = prctl(PR_GET_DUMPABLE);
+    if (r == -1) {
+      // Negated like set_dumpable() does, so callers can tell a failure
+      // apart from a valid (non-negative) flag value.
+      r = -errno;
+      std::cerr << "warning: unable to get dumpable flag: " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+  // Sets the dumpable flag; returns 0 on success or a negative errno value
+  // after printing a warning.
+  static int set_dumpable(bool new_state) {
+    int r = prctl(PR_SET_DUMPABLE, new_state);
+    if (r) {
+      r = -errno;
+      std::cerr << "warning: unable to " << (new_state ? "set" : "unset")
+                << " dumpable flag: " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+public:
+  PrCtl(int new_state = 0) {
+    int r = get_dumpable();
+    if (r < 0) {
+      // Current state unknown: leave the flag alone and restore nothing.
+      return;
+    }
+    if (r != new_state) {
+      if (!set_dumpable(new_state)) {
+        saved_state = r;
+      }
+    }
+  }
+  ~PrCtl() {
+    if (saved_state < 0) {
+      return;
+    }
+    set_dumpable(saved_state);
+  }
+};
+
+#else
+#include <sys/resource.h>
+#ifdef RLIMIT_CORE
+#include <iostream>
+#include <sys/resource.h>
+#include "common/errno.h"
+
+/*
+ * Scoped guard around the core-file size limit (RLIMIT_CORE variant, used
+ * when <sys/prctl.h> is not available).
+ *
+ * The constructor changes only the soft limit: 0 when new_state is zero,
+ * otherwise the current hard limit. The hard limit is left untouched so the
+ * destructor is able to put the saved soft limit back.
+ */
+class PrCtl {
+  rlimit saved_lim{};
+  // True once the constructor has changed the limit, i.e. saved_lim holds a
+  // valid value that the destructor has to restore.
+  bool restore = false;
+  // Reads RLIMIT_CORE into *saved; returns 0 on success or a negative errno
+  // value after printing a warning.
+  static int get_dumpable(rlimit* saved) {
+    int r = getrlimit(RLIMIT_CORE, saved);
+    if (r) {
+      r = -errno;
+      std::cerr << "warning: unable to getrlimit(): " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+  // Applies rlim to RLIMIT_CORE, printing a warning on failure.
+  static void set_dumpable(const rlimit& rlim) {
+    int r = setrlimit(RLIMIT_CORE, &rlim);
+    if (r) {
+      r = -errno;
+      std::cerr << "warning: unable to setrlimit(): " << cpp_strerror(r)
+                << std::endl;
+    }
+  }
+public:
+  PrCtl(int new_state = 0) {
+    if (get_dumpable(&saved_lim) < 0) {
+      // saved_lim is not meaningful; change nothing and restore nothing.
+      return;
+    }
+    // Start from the saved limits so rlim_max is always initialized, then
+    // adjust the soft limit only.
+    rlimit new_lim = saved_lim;
+    new_lim.rlim_cur = new_state ? saved_lim.rlim_max : 0;
+    if (new_lim.rlim_cur == saved_lim.rlim_cur) {
+      return;
+    }
+    set_dumpable(new_lim);
+    restore = true;
+  }
+  ~PrCtl() {
+    if (restore) {
+      set_dumpable(saved_lim);
+    }
+  }
+};
+#else
+// No-op stand-in used when neither prctl nor RLIMIT_CORE is available.
+struct PrCtl {
+  // to silence the Wunused-variable warning
+  // Accepts (and ignores) the same optional state argument as the other
+  // PrCtl variants, so callers that pass one build on every platform.
+  PrCtl(int = 0) {}
+};
+
+#endif // RLIMIT_CORE
+#endif
diff --git a/src/include/counter.h b/src/include/counter.h
new file mode 100644
index 00000000..61ed7409
--- /dev/null
+++ b/src/include/counter.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COUNTER_H
+#define CEPH_COUNTER_H
+
+#include <atomic>
+
+/*
+ * Instance counter mix-in, kept separately for every tag type T.
+ *
+ * Every constructed object -- default, copy or move constructed -- is
+ * counted once, and every destroyed object is subtracted once, so:
+ *   count()      number of objects currently alive
+ *   increments() total number of objects ever constructed
+ *   decrements() total number of objects destroyed so far
+ */
+template <typename T>
+class Counter {
+public:
+  Counter() {
+    _count()++;
+    _increments()++;
+  }
+  Counter(const Counter &rhs) {
+    _count()++;
+    _increments()++;
+  }
+  // A move still brings a new object into existence whose destructor will
+  // run later, so it has to be counted like any other construction;
+  // otherwise the unsigned live count underflows on destruction.
+  Counter(Counter &&rhs) {
+    _count()++;
+    _increments()++;
+  }
+  ~Counter() {
+    _count()--;
+  }
+  static uint64_t count() {
+    return _count();
+  }
+  static uint64_t increments() {
+    return _increments();
+  }
+  static uint64_t decrements() {
+    return increments()-count();
+  }
+
+private:
+  // The counters live in function-local statics so that this header-only
+  // template needs no out-of-line definitions.
+  static std::atomic<uint64_t> &_count() {
+    static std::atomic<uint64_t> c{0};
+    return c;
+  }
+  static std::atomic<uint64_t> &_increments() {
+    static std::atomic<uint64_t> i{0};
+    return i;
+  }
+};
+
+#endif
diff --git a/src/include/cpp-btree/btree.h b/src/include/cpp-btree/btree.h
new file mode 100644
index 00000000..0a40e0e1
--- /dev/null
+++ b/src/include/cpp-btree/btree.h
@@ -0,0 +1,2396 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree implementation of the STL set and map interfaces. A btree is both
+// smaller and faster than STL set/map. The red-black tree implementation of
+// STL set/map has an overhead of 3 pointers (left, right and parent) plus the
+// node color information for each stored value. So a set<int32> consumes 20
+// bytes for each value stored. This btree implementation stores multiple
+// values on fixed size nodes (usually 256 bytes) and doesn't store child
+// pointers for leaf nodes. The result is that a btree_set<int32> may use much
+// less memory per stored value. For the random insertion benchmark in
+// btree_test.cc, a btree_set<int32> with node-size of 256 uses 4.9 bytes per
+// stored value.
+//
+// The packing of multiple values on to each node of a btree has another effect
+// besides better space utilization: better cache locality due to fewer cache
+// lines being accessed. Better cache locality translates into faster
+// operations.
+//
+// CAVEATS
+//
+// Insertions and deletions on a btree can cause splitting, merging or
+// rebalancing of btree nodes. And even without these operations, insertions
+// and deletions on a btree will move values around within a node. In both
+// cases, the result is that insertions and deletions can invalidate iterators
+// pointing to values other than the one being inserted/deleted. This is
+// notably different from STL set/map which takes care to not invalidate
+// iterators on insert/erase except, of course, for iterators pointing to the
+// value being erased. A partial workaround when erasing is available:
+// erase() returns an iterator pointing to the item just after the one that was
+// erased (or end() if none exists). See also safe_btree.
+
+// PERFORMANCE
+//
+// btree_bench --benchmarks=. 2>&1 | ./benchmarks.awk
+//
+// Run on pmattis-warp.nyc (4 X 2200 MHz CPUs); 2010/03/04-15:23:06
+// Benchmark STL(ns) B-Tree(ns) @ <size>
+// --------------------------------------------------------
+// BM_set_int32_insert 1516 608 +59.89% <256> [40.0, 5.2]
+// BM_set_int32_lookup 1160 414 +64.31% <256> [40.0, 5.2]
+// BM_set_int32_fulllookup 960 410 +57.29% <256> [40.0, 4.4]
+// BM_set_int32_delete 1741 528 +69.67% <256> [40.0, 5.2]
+// BM_set_int32_queueaddrem 3078 1046 +66.02% <256> [40.0, 5.5]
+// BM_set_int32_mixedaddrem 3600 1384 +61.56% <256> [40.0, 5.3]
+// BM_set_int32_fifo 227 113 +50.22% <256> [40.0, 4.4]
+// BM_set_int32_fwditer 158 26 +83.54% <256> [40.0, 5.2]
+// BM_map_int32_insert 1551 636 +58.99% <256> [48.0, 10.5]
+// BM_map_int32_lookup 1200 508 +57.67% <256> [48.0, 10.5]
+// BM_map_int32_fulllookup 989 487 +50.76% <256> [48.0, 8.8]
+// BM_map_int32_delete 1794 628 +64.99% <256> [48.0, 10.5]
+// BM_map_int32_queueaddrem 3189 1266 +60.30% <256> [48.0, 11.6]
+// BM_map_int32_mixedaddrem 3822 1623 +57.54% <256> [48.0, 10.9]
+// BM_map_int32_fifo 151 134 +11.26% <256> [48.0, 8.8]
+// BM_map_int32_fwditer 161 32 +80.12% <256> [48.0, 10.5]
+// BM_set_int64_insert 1546 636 +58.86% <256> [40.0, 10.5]
+// BM_set_int64_lookup 1200 512 +57.33% <256> [40.0, 10.5]
+// BM_set_int64_fulllookup 971 487 +49.85% <256> [40.0, 8.8]
+// BM_set_int64_delete 1745 616 +64.70% <256> [40.0, 10.5]
+// BM_set_int64_queueaddrem 3163 1195 +62.22% <256> [40.0, 11.6]
+// BM_set_int64_mixedaddrem 3760 1564 +58.40% <256> [40.0, 10.9]
+// BM_set_int64_fifo 146 103 +29.45% <256> [40.0, 8.8]
+// BM_set_int64_fwditer 162 31 +80.86% <256> [40.0, 10.5]
+// BM_map_int64_insert 1551 720 +53.58% <256> [48.0, 20.7]
+// BM_map_int64_lookup 1214 612 +49.59% <256> [48.0, 20.7]
+// BM_map_int64_fulllookup 994 592 +40.44% <256> [48.0, 17.2]
+// BM_map_int64_delete 1778 764 +57.03% <256> [48.0, 20.7]
+// BM_map_int64_queueaddrem 3189 1547 +51.49% <256> [48.0, 20.9]
+// BM_map_int64_mixedaddrem 3779 1887 +50.07% <256> [48.0, 21.6]
+// BM_map_int64_fifo 147 145 +1.36% <256> [48.0, 17.2]
+// BM_map_int64_fwditer 162 41 +74.69% <256> [48.0, 20.7]
+// BM_set_string_insert 1989 1966 +1.16% <256> [64.0, 44.5]
+// BM_set_string_lookup 1709 1600 +6.38% <256> [64.0, 44.5]
+// BM_set_string_fulllookup 1573 1529 +2.80% <256> [64.0, 35.4]
+// BM_set_string_delete 2520 1920 +23.81% <256> [64.0, 44.5]
+// BM_set_string_queueaddrem 4706 4309 +8.44% <256> [64.0, 48.3]
+// BM_set_string_mixedaddrem 5080 4654 +8.39% <256> [64.0, 46.7]
+// BM_set_string_fifo 318 512 -61.01% <256> [64.0, 35.4]
+// BM_set_string_fwditer 182 93 +48.90% <256> [64.0, 44.5]
+// BM_map_string_insert 2600 2227 +14.35% <256> [72.0, 55.8]
+// BM_map_string_lookup 2068 1730 +16.34% <256> [72.0, 55.8]
+// BM_map_string_fulllookup 1859 1618 +12.96% <256> [72.0, 44.0]
+// BM_map_string_delete 3168 2080 +34.34% <256> [72.0, 55.8]
+// BM_map_string_queueaddrem 5840 4701 +19.50% <256> [72.0, 59.4]
+// BM_map_string_mixedaddrem 6400 5200 +18.75% <256> [72.0, 57.8]
+// BM_map_string_fifo 398 596 -49.75% <256> [72.0, 44.0]
+// BM_map_string_fwditer 243 113 +53.50% <256> [72.0, 55.8]
+
+#ifndef UTIL_BTREE_BTREE_H__
+#define UTIL_BTREE_BTREE_H__
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <type_traits>
+#include <new>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "include/ceph_assert.h"
+
+namespace btree {
+
+// Inside a btree method, if we just call swap(), it will choose the
+// btree::swap method, which we don't want. And we can't say ::swap
+// because then MSVC won't pick up any std::swap() implementations. We
+// can't just use std::swap() directly because then we don't get the
+// specialization for types outside the std namespace. So the solution
+// is to have a special swap helper function whose name doesn't
+// collide with other swap functions defined by the btree classes.
+//
+// Swaps a and b. The unqualified call below lets argument-dependent
+// lookup find a swap() declared alongside T; the using-declaration makes
+// std::swap available as the fallback.
+template <typename T>
+inline void btree_swap_helper(T &a, T &b) {
+ using std::swap;
+ swap(a, b);
+}
+
+// Compile-time type selector: if_<cond, A, B>::type names A when cond is
+// true and B when cond is false.
+template <bool cond, typename A, typename B>
+struct if_ : std::conditional<cond, A, B> {
+};
+
+// Types small_ and big_ are guaranteed to satisfy
+// sizeof(small_) < sizeof(big_).
+using small_ = char;
+
+struct big_ {
+  char dummy[2];
+};
+
+// A compile-time assertion.
+template <bool>
+struct CompileAssert {
+};
+
+// COMPILE_ASSERT(expr, msg) declares a typedef named msg for an array of
+// CompileAssert objects whose length is 1 when expr is true and -1 when
+// it is false. A negative array length is ill-formed, so a false expr
+// stops compilation at the line that uses the macro. msg becomes the
+// typedef name and therefore has to be a valid identifier.
+#define COMPILE_ASSERT(expr, msg) \
+ typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// A helper type used to indicate that a key-compare-to functor has been
+// provided. A user can specify a key-compare-to functor by doing:
+//
+// struct MyStringComparer
+// : public btree::btree_key_compare_to_tag {
+// int operator()(const string &a, const string &b) const {
+// return a.compare(b);
+// }
+// };
+//
+// Note that the return type is an int and not a bool. There is a
+// COMPILE_ASSERT which enforces this return type.
+struct btree_key_compare_to_tag {
+};
+
+// A helper class that indicates if the Compare parameter is derived from
+// btree_key_compare_to_tag. The check is expressed as
+// std::is_convertible<Compare, btree_key_compare_to_tag>, so ::value is
+// true whenever a Compare object converts to the tag type, which is the
+// case for a comparator that publicly derives from the tag.
+template <typename Compare>
+struct btree_is_key_compare_to
+ : public std::is_convertible<Compare, btree_key_compare_to_tag> {
+};
+
+// Lets the btree swap a boolean comparison functor for a three-way
+// "compare-to" functor, i.e. one returning a negative value for
+// less-than, zero for equality and a positive value for greater-than.
+// The primary template adds nothing to Compare and simply behaves like
+// it. The specializations for std::less<std::string> and
+// std::greater<std::string> derive from btree_key_compare_to_tag and use
+// std::string::compare(), so btree users automatically get the more
+// efficient compare-to code for those common functors.
+template <typename Compare>
+struct btree_key_compare_to_adapter : Compare {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const Compare &c) : Compare(c) {}
+  btree_key_compare_to_adapter(const btree_key_compare_to_adapter &other)
+      : Compare(other) {}
+};
+
+// Three-way replacement for std::less<std::string>.
+template <>
+struct btree_key_compare_to_adapter<std::less<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::less<std::string> &) {}
+  btree_key_compare_to_adapter(const btree_key_compare_to_adapter &) {}
+  int operator()(const std::string &a, const std::string &b) const {
+    const int order = a.compare(b);
+    return order;
+  }
+};
+
+// Three-way replacement for std::greater<std::string>: the operands are
+// swapped so that larger strings order first.
+template <>
+struct btree_key_compare_to_adapter<std::greater<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::greater<std::string> &) {}
+  btree_key_compare_to_adapter(const btree_key_compare_to_adapter &) {}
+  int operator()(const std::string &a, const std::string &b) const {
+    const int order = b.compare(a);
+    return order;
+  }
+};
+
+// Presents a key comparison functor as a plain boolean comparison. The
+// primary template is used when Compare is not a compare-to functor: it
+// forwards to Compare and returns its result as a bool.
+template <typename Key, typename Compare, bool HaveCompareTo>
+struct btree_key_comparer {
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
+    return comp(x, y);
+  }
+  bool operator()(const Key &x, const Key &y) const {
+    return comp(x, y);
+  }
+  Compare comp;
+};
+
+// Specialization for compare-to functors: the three-way result is turned
+// into a boolean by testing it against zero. Some parts of the btree
+// code, such as insert-with-hint, need this plain (boolean) comparison.
+template <typename Key, typename Compare>
+struct btree_key_comparer<Key, Compare, true> {
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
+    return comp(x, y) < 0;
+  }
+  bool operator()(const Key &x, const Key &y) const {
+    return comp(x, y) < 0;
+  }
+  Compare comp;
+};
+
+// A helper function to compare two keys using the specified compare
+// functor. The comparison is routed through the btree_key_comparer
+// variant that matches Compare: the compare-to variant when Compare is
+// derived from btree_key_compare_to_tag, the plain variant otherwise.
+template <typename Key, typename Compare>
+static bool btree_compare_keys(
+    const Compare &comp, const Key &x, const Key &y) {
+  using key_comparer = btree_key_comparer<
+      Key, Compare, btree_is_key_compare_to<Compare>::value>;
+  return key_comparer::bool_compare(comp, x, y);
+}
+
+// Type parameters shared by the map and set flavours of the btree.
+template <typename Key, typename Compare,
+          typename Alloc, int TargetNodeSize, int ValueSize>
+struct btree_common_params {
+  // If Compare is derived from btree_key_compare_to_tag then use it as the
+  // key_compare type. Otherwise, use btree_key_compare_to_adapter<> which
+  // will fall back to Compare if we don't have an appropriate
+  // specialization.
+  using key_compare = typename if_<
+      btree_is_key_compare_to<Compare>::value,
+      Compare, btree_key_compare_to_adapter<Compare> >::type;
+  // A type which indicates if we have a key-compare-to functor or a plain
+  // old key-compare functor.
+  using is_key_compare_to = btree_is_key_compare_to<key_compare>;
+
+  using allocator_type = Alloc;
+  using key_type = Key;
+  using size_type = ssize_t;
+  using difference_type = ptrdiff_t;
+
+  enum {
+    kTargetNodeSize = TargetNodeSize,
+
+    // Available space for values. This is largest for leaf nodes,
+    // which have an overhead of no fewer than two pointers.
+    kNodeValueSpace = TargetNodeSize - 2 * sizeof(void*),
+  };
+
+  // This is an integral type large enough to hold as many
+  // ValueSize-values as will fit a node of TargetNodeSize bytes.
+  using node_count_type = typename if_<
+      (kNodeValueSpace / ValueSize) >= 256,
+      uint16_t,
+      uint8_t>::type;
+};
+
+// A parameters structure for holding the type parameters for a btree_map.
+// Values are (key, data) pairs; the size used for node layout is
+// sizeof(Key) + sizeof(Data).
+template <typename Key, typename Data, typename Compare,
+          typename Alloc, int TargetNodeSize>
+struct btree_map_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key) + sizeof(Data)> {
+  using data_type = Data;
+  using mapped_type = Data;
+  using value_type = std::pair<const Key, data_type>;
+  using mutable_value_type = std::pair<Key, data_type>;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+
+  enum {
+    kValueSize = sizeof(Key) + sizeof(data_type),
+  };
+
+  // The key of a stored value is the first element of the pair.
+  static const Key& key(const value_type &x) { return x.first; }
+  static const Key& key(const mutable_value_type &x) { return x.first; }
+  // Swaps two stored values member by member.
+  static void swap(mutable_value_type *a, mutable_value_type *b) {
+    mutable_value_type &lhs = *a;
+    mutable_value_type &rhs = *b;
+    btree_swap_helper(lhs.first, rhs.first);
+    btree_swap_helper(lhs.second, rhs.second);
+  }
+};
+
+// A parameters structure for holding the type parameters for a btree_set.
+// The stored value is the key itself.
+template <typename Key, typename Compare, typename Alloc, int TargetNodeSize>
+struct btree_set_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key)> {
+  using data_type = std::false_type;
+  using mapped_type = std::false_type;
+  using value_type = Key;
+  using mutable_value_type = value_type;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+
+  enum {
+    kValueSize = sizeof(Key),
+  };
+
+  // A set value is its own key.
+  static const Key& key(const value_type &x) { return x; }
+  // Swaps two stored values.
+  static void swap(mutable_value_type *a, mutable_value_type *b) {
+    mutable_value_type &lhs = *a;
+    mutable_value_type &rhs = *b;
+    btree_swap_helper<mutable_value_type>(lhs, rhs);
+  }
+};
+
+// An adapter class that converts a lower-bound compare into an upper-bound
+// compare: operator()(a, b) is the negation of Compare applied to (b, a).
+template <typename Key, typename Compare>
+struct btree_upper_bound_adapter : public Compare {
+  btree_upper_bound_adapter(Compare c) : Compare(c) {}
+  bool operator()(const Key &a, const Key &b) const {
+    const Compare &lower = *this;
+    return !lower(b, a);
+  }
+};
+
+// Compare-to counterpart of the upper-bound adapter: operator()(a, b)
+// returns the result of CompareTo applied to the swapped pair (b, a).
+template <typename Key, typename CompareTo>
+struct btree_upper_bound_compare_to_adapter : public CompareTo {
+  btree_upper_bound_compare_to_adapter(CompareTo c) : CompareTo(c) {}
+  int operator()(const Key &a, const Key &b) const {
+    const CompareTo &base = *this;
+    return base(b, a);
+  }
+};
+
+// Dispatch helper class for using linear search with plain compare.
+// Both bounds scan the whole node, positions [0, n.count()).
+template <typename K, typename N, typename Compare>
+struct btree_linear_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp) {
+    return n.linear_search_plain_compare(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, Compare comp) {
+    using upper_compare = btree_upper_bound_adapter<K, Compare>;
+    const upper_compare upper(comp);
+    return n.linear_search_plain_compare(k, 0, n.count(), upper);
+  }
+};
+
+// Dispatch helper class for using linear search with compare-to.
+// lower_bound() uses the node's three-way search; upper_bound() views the
+// comparator as a boolean less-than via btree_key_comparer, inverts it
+// with btree_upper_bound_adapter and uses the node's plain search.
+template <typename K, typename N, typename CompareTo>
+struct btree_linear_search_compare_to {
+  static int lower_bound(const K &k, const N &n, CompareTo comp) {
+    return n.linear_search_compare_to(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, CompareTo comp) {
+    using upper_compare = btree_upper_bound_adapter<
+        K, btree_key_comparer<K, CompareTo, true> >;
+    const upper_compare upper(comp);
+    return n.linear_search_plain_compare(k, 0, n.count(), upper);
+  }
+};
+
+// Dispatch helper class for using binary search with plain compare.
+// Both bounds search the whole node, positions [0, n.count()).
+template <typename K, typename N, typename Compare>
+struct btree_binary_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp) {
+    return n.binary_search_plain_compare(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, Compare comp) {
+    using upper_compare = btree_upper_bound_adapter<K, Compare>;
+    const upper_compare upper(comp);
+    return n.binary_search_plain_compare(k, 0, n.count(), upper);
+  }
+};
+
+// Dispatch helper class for using binary search with compare-to.
+//
+// lower_bound() runs the node's three-way binary search with the caller's
+// comparator. upper_bound() needs a boolean predicate, so it views the
+// comparator as a less-than test through
+// btree_key_comparer<K, CompareTo, true>, inverts it with
+// btree_upper_bound_adapter and runs the node's plain binary search.
+template <typename K, typename N, typename CompareTo>
+struct btree_binary_search_compare_to {
+  static int lower_bound(const K &k, const N &n, CompareTo comp) {
+    // Search with the comparator we were handed. A default-constructed
+    // CompareTo would drop any state the comparator carries and would
+    // not compile at all for a comparator without a default constructor.
+    return n.binary_search_compare_to(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, CompareTo comp) {
+    typedef btree_upper_bound_adapter<K,
+        btree_key_comparer<K, CompareTo, true> > upper_compare;
+    // Binary rather than linear search, as in
+    // btree_binary_search_plain_compare::upper_bound. The keys of a node
+    // are ordered, so both searches find the same position.
+    return n.binary_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// A node in the btree holding values and, for internal nodes, child
+// pointers. The same node type is used for both internal
+// and leaf nodes in the btree, though the nodes are allocated in such a way
+// that the children array is only valid in internal nodes.
+template <typename Params>
+class btree_node {
+ public:
+ typedef Params params_type;
+ typedef btree_node<Params> self_type;
+ typedef typename Params::key_type key_type;
+ typedef typename Params::data_type data_type;
+ typedef typename Params::value_type value_type;
+ typedef typename Params::mutable_value_type mutable_value_type;
+ typedef typename Params::pointer pointer;
+ typedef typename Params::const_pointer const_pointer;
+ typedef typename Params::reference reference;
+ typedef typename Params::const_reference const_reference;
+ typedef typename Params::key_compare key_compare;
+ typedef typename Params::size_type size_type;
+ typedef typename Params::difference_type difference_type;
+ // Typedefs for the various types of node searches.
+ typedef btree_linear_search_plain_compare<
+ key_type, self_type, key_compare> linear_search_plain_compare_type;
+ typedef btree_linear_search_compare_to<
+ key_type, self_type, key_compare> linear_search_compare_to_type;
+ typedef btree_binary_search_plain_compare<
+ key_type, self_type, key_compare> binary_search_plain_compare_type;
+ typedef btree_binary_search_compare_to<
+ key_type, self_type, key_compare> binary_search_compare_to_type;
+ // If we have a valid key-compare-to type, use linear_search_compare_to,
+ // otherwise use linear_search_plain_compare.
+ typedef typename if_<
+ Params::is_key_compare_to::value,
+ linear_search_compare_to_type,
+ linear_search_plain_compare_type>::type linear_search_type;
+ // If we have a valid key-compare-to type, use binary_search_compare_to,
+ // otherwise use binary_search_plain_compare.
+ typedef typename if_<
+ Params::is_key_compare_to::value,
+ binary_search_compare_to_type,
+ binary_search_plain_compare_type>::type binary_search_type;
+ // If the key is an integral or floating point type, use linear search which
+ // is faster than binary search for such types. Might be wise to also
+ // configure linear search based on node-size.
+ typedef typename if_<
+ std::is_integral<key_type>::value ||
+ std::is_floating_point<key_type>::value,
+ linear_search_type, binary_search_type>::type search_type;
+
+ // Bookkeeping fields present in every node.
+ struct base_fields {
+ typedef typename Params::node_count_type field_type;
+
+ // A boolean indicating whether the node is a leaf or not.
+ bool leaf;
+ // The position of the node in the node's parent.
+ field_type position;
+ // The maximum number of values the node can hold.
+ field_type max_count;
+ // The count of the number of values in the node.
+ field_type count;
+ // A pointer to the node's parent.
+ btree_node *parent;
+ };
+
+ enum {
+ kValueSize = params_type::kValueSize,
+ kTargetNodeSize = params_type::kTargetNodeSize,
+
+ // Compute how many values we can fit onto a leaf node.
+ kNodeTargetValues = (kTargetNodeSize - sizeof(base_fields)) / kValueSize,
+ // We need a minimum of 3 values per internal node in order to perform
+ // splitting (1 value for the two nodes involved in the split and 1 value
+ // propagated to the parent as the delimiter for the split).
+ kNodeValues = kNodeTargetValues >= 3 ? kNodeTargetValues : 3,
+
+ // Flag bit OR-ed into the position returned by the compare-to searches
+ // below when the key was found exactly. kMatchMask is the mask of all
+ // bits below that flag.
+ kExactMatch = 1 << 30,
+ kMatchMask = kExactMatch - 1,
+ };
+
+ struct leaf_fields : public base_fields {
+ // The array of values. Only the first count of these values have been
+ // constructed and are valid.
+ mutable_value_type values[kNodeValues];
+ };
+
+ struct internal_fields : public leaf_fields {
+ // The array of child pointers. The keys in children_[i] are all less than
+ // key(i). The keys in children_[i + 1] are all greater than key(i). There
+ // are always count + 1 children.
+ btree_node *children[kNodeValues + 1];
+ };
+
+ // Extra fields that are only valid on the root node (see the rightmost()
+ // and size() accessors).
+ struct root_fields : public internal_fields {
+ btree_node *rightmost;
+ size_type size;
+ };
+
+ public:
+ // Getter for whether this is a leaf node or not. This value doesn't
+ // change after the node is created.
+ bool leaf() const { return fields_.leaf; }
+
+ // Getter/setter for the position of this node in its parent.
+ int position() const { return fields_.position; }
+ void set_position(int v) { fields_.position = v; }
+
+ // Getter/setter for the number of values stored in this node.
+ int count() const { return fields_.count; }
+ void set_count(int v) { fields_.count = v; }
+ int max_count() const { return fields_.max_count; }
+
+ // Getter for the parent of this node.
+ btree_node* parent() const { return fields_.parent; }
+ // Getter for whether the node is the root of the tree. The parent of the
+ // root of the tree is the leftmost node in the tree which is guaranteed to
+ // be a leaf.
+ bool is_root() const { return parent()->leaf(); }
+ // Turns this node into the root by adopting its current parent's parent
+ // pointer. The current parent must be the root.
+ void make_root() {
+ ceph_assert(parent()->is_root());
+ fields_.parent = fields_.parent->parent();
+ }
+
+ // Getter for the rightmost root node field. Only valid on the root node.
+ btree_node* rightmost() const { return fields_.rightmost; }
+ btree_node** mutable_rightmost() { return &fields_.rightmost; }
+
+ // Getter for the size root node field. Only valid on the root node.
+ size_type size() const { return fields_.size; }
+ size_type* mutable_size() { return &fields_.size; }
+
+ // Getters for the key/value at position i in the node.
+ const key_type& key(int i) const {
+ return params_type::key(fields_.values[i]);
+ }
+ // The values are stored as mutable_value_type and handed out as
+ // value_type references through a reinterpret_cast.
+ reference value(int i) {
+ return reinterpret_cast<reference>(fields_.values[i]);
+ }
+ const_reference value(int i) const {
+ return reinterpret_cast<const_reference>(fields_.values[i]);
+ }
+ mutable_value_type* mutable_value(int i) {
+ return &fields_.values[i];
+ }
+
+ // Swap value i in this node with value j in node x.
+ void value_swap(int i, btree_node *x, int j) {
+ params_type::swap(mutable_value(i), x->mutable_value(j));
+ }
+
+ // Getters/setter for the child at position i in the node.
+ btree_node* child(int i) const { return fields_.children[i]; }
+ btree_node** mutable_child(int i) { return &fields_.children[i]; }
+ // Stores c as child i and updates c's parent pointer and position to
+ // match.
+ void set_child(int i, btree_node *c) {
+ *mutable_child(i) = c;
+ c->fields_.parent = this;
+ c->fields_.position = i;
+ }
+
+ // Returns the position of the first value whose key is not less than k.
+ template <typename Compare>
+ int lower_bound(const key_type &k, const Compare &comp) const {
+ return search_type::lower_bound(k, *this, comp);
+ }
+ // Returns the position of the first value whose key is greater than k.
+ template <typename Compare>
+ int upper_bound(const key_type &k, const Compare &comp) const {
+ return search_type::upper_bound(k, *this, comp);
+ }
+
+ // Returns the position of the first value whose key is not less than k using
+ // linear search performed using plain compare. Only positions in [s, e)
+ // are examined; e is returned when every key in that range is less than k.
+ template <typename Compare>
+ int linear_search_plain_compare(
+ const key_type &k, int s, int e, const Compare &comp) const {
+ while (s < e) {
+ if (!btree_compare_keys(comp, key(s), k)) {
+ break;
+ }
+ ++s;
+ }
+ return s;
+ }
+
+ // Returns the position of the first value whose key is not less than k using
+ // linear search performed using compare-to. When a key equal to k is
+ // found, the returned position has the kExactMatch bit set.
+ template <typename Compare>
+ int linear_search_compare_to(
+ const key_type &k, int s, int e, const Compare &comp) const {
+ while (s < e) {
+ int c = comp(key(s), k);
+ if (c == 0) {
+ return s | kExactMatch;
+ } else if (c > 0) {
+ break;
+ }
+ ++s;
+ }
+ return s;
+ }
+
+ // Returns the position of the first value whose key is not less than k using
+ // binary search performed using plain compare. Only positions in [s, e)
+ // are examined.
+ template <typename Compare>
+ int binary_search_plain_compare(
+ const key_type &k, int s, int e, const Compare &comp) const {
+ while (s != e) {
+ int mid = (s + e) / 2;
+ if (btree_compare_keys(comp, key(mid), k)) {
+ s = mid + 1;
+ } else {
+ e = mid;
+ }
+ }
+ return s;
+ }
+
+ // Returns the position of the first value whose key is not less than k using
+ // binary search performed using compare-to. When a key equal to k is
+ // found, the returned position has the kExactMatch bit set.
+ template <typename CompareTo>
+ int binary_search_compare_to(
+ const key_type &k, int s, int e, const CompareTo &comp) const {
+ while (s != e) {
+ int mid = (s + e) / 2;
+ int c = comp(key(mid), k);
+ if (c < 0) {
+ s = mid + 1;
+ } else if (c > 0) {
+ e = mid;
+ } else {
+ // Need to return the first value whose key is not less than k, which
+ // requires continuing the binary search. Note that we are guaranteed
+ // that the result is an exact match because if "key(mid-1) < k" the
+ // call to binary_search_compare_to() will return "mid".
+ s = binary_search_compare_to(k, s, mid, comp);
+ return s | kExactMatch;
+ }
+ }
+ return s;
+ }
+
+ // Inserts the value x at position i, shifting all existing values and
+ // children at positions >= i to the right by 1.
+ void insert_value(int i, const value_type &x);
+
+ // Removes the value at position i, shifting all existing values and children
+ // at positions > i to the left by 1.
+ void remove_value(int i);
+
+ // Rebalances a node with its right sibling.
+ void rebalance_right_to_left(btree_node *sibling, int to_move);
+ void rebalance_left_to_right(btree_node *sibling, int to_move);
+
+ // Splits a node, moving a portion of the node's values to its right sibling.
+ void split(btree_node *sibling, int insert_position);
+
+ // Merges a node with its right sibling, moving all of the values and the
+ // delimiting key in the parent node onto itself.
+ void merge(btree_node *sibling);
+
+ // Swap the contents of "this" and "src".
+ void swap(btree_node *src);
+
+ // no_debug is true when NDEBUG is defined. The init routines below skip
+ // zero-filling the value and child arrays in that case.
+#ifdef NDEBUG
+ static constexpr auto no_debug = true;
+#else
+ static constexpr auto no_debug = false;
+#endif
+ // Node allocation/deletion routines.
+ // Initializes the storage f as an empty leaf with the given parent and
+ // capacity, and returns it viewed as a btree_node.
+ static btree_node* init_leaf(
+ leaf_fields *f, btree_node *parent, int max_count) {
+ btree_node *n = reinterpret_cast<btree_node*>(f);
+ f->leaf = 1;
+ f->position = 0;
+ f->max_count = max_count;
+ f->count = 0;
+ f->parent = parent;
+ if (!no_debug) {
+ memset(&f->values, 0, max_count * sizeof(value_type));
+ }
+ return n;
+ }
+ // Initializes f as an empty internal node: same as a leaf holding up to
+ // kNodeValues values, but with the leaf flag cleared.
+ static btree_node* init_internal(internal_fields *f, btree_node *parent) {
+ btree_node *n = init_leaf(f, parent, kNodeValues);
+ f->leaf = 0;
+ if (!no_debug) {
+ memset(f->children, 0, sizeof(f->children));
+ }
+ return n;
+ }
+ // Initializes f as a root node. The rightmost pointer starts out as
+ // parent and the size starts out as parent's value count.
+ static btree_node* init_root(root_fields *f, btree_node *parent) {
+ btree_node *n = init_internal(f, parent);
+ f->rightmost = parent;
+ f->size = parent->count();
+ return n;
+ }
+ // Runs the destructor of each of the count() constructed values. Only
+ // the values are destroyed here.
+ void destroy() {
+ for (int i = 0; i < count(); ++i) {
+ value_destroy(i);
+ }
+ }
+
+ private:
+ // Construct (default or copy of x) or destroy the value in slot i in
+ // place.
+ void value_init(int i) {
+ new (&fields_.values[i]) mutable_value_type;
+ }
+ void value_init(int i, const value_type &x) {
+ new (&fields_.values[i]) mutable_value_type(x);
+ }
+ void value_destroy(int i) {
+ fields_.values[i].~mutable_value_type();
+ }
+
+ private:
+ // Declared as root_fields, the layout that contains all the others.
+ root_fields fields_;
+
+ private:
+ // Copying is disabled: the copy constructor and copy assignment are
+ // private declarations.
+ btree_node(const btree_node&);
+ void operator=(const btree_node&);
+};
+
+// Bidirectional iterator over the values of a btree. A value is
+// identified by the node holding it and its position inside that node.
+template <typename Node, typename Reference, typename Pointer>
+struct btree_iterator {
+  using key_type = typename Node::key_type;
+  using size_type = typename Node::size_type;
+  using difference_type = typename Node::difference_type;
+  using params_type = typename Node::params_type;
+
+  using node_type = Node;
+  using normal_node = typename std::remove_const<Node>::type;
+  using const_node = const Node;
+  using value_type = typename params_type::value_type;
+  using normal_pointer = typename params_type::pointer;
+  using normal_reference = typename params_type::reference;
+  using const_pointer = typename params_type::const_pointer;
+  using const_reference = typename params_type::const_reference;
+
+  using pointer = Pointer;
+  using reference = Reference;
+  using iterator_category = std::bidirectional_iterator_tag;
+
+  using iterator =
+      btree_iterator<normal_node, normal_reference, normal_pointer>;
+  using const_iterator =
+      btree_iterator<const_node, const_reference, const_pointer>;
+  using self_type = btree_iterator<Node, Reference, Pointer>;
+
+  // A default-constructed iterator refers to no node.
+  btree_iterator() : node(nullptr), position(-1) {}
+  btree_iterator(Node *n, int p) : node(n), position(p) {}
+  // Construction from the mutable iterator type, so that an iterator
+  // converts to a const_iterator.
+  btree_iterator(const iterator &x) : node(x.node), position(x.position) {}
+
+  // Increment/decrement the iterator. Stepping to another slot of the
+  // same leaf is handled inline; everything else goes to the slow path.
+  void increment() {
+    if (!node->leaf() || ++position >= node->count()) {
+      increment_slow();
+    }
+  }
+  void increment_by(int count);
+  void increment_slow();
+
+  void decrement() {
+    if (!node->leaf() || --position < 0) {
+      decrement_slow();
+    }
+  }
+  void decrement_slow();
+
+  // Two iterators are equal when they refer to the same slot of the same
+  // node.
+  bool operator==(const const_iterator &x) const {
+    return node == x.node && position == x.position;
+  }
+  bool operator!=(const const_iterator &x) const {
+    return !(node == x.node && position == x.position);
+  }
+
+  // Accessors for the key/value the iterator is pointing at.
+  const key_type& key() const {
+    return node->key(position);
+  }
+  reference operator*() const {
+    return node->value(position);
+  }
+  pointer operator->() const {
+    return &node->value(position);
+  }
+
+  // Prefix forms step the iterator and return it.
+  self_type& operator++() {
+    increment();
+    return *this;
+  }
+  self_type& operator--() {
+    decrement();
+    return *this;
+  }
+  // Postfix forms step the iterator and return a copy of its old state.
+  self_type operator++(int) {
+    self_type before(*this);
+    increment();
+    return before;
+  }
+  self_type operator--(int) {
+    self_type before(*this);
+    decrement();
+    return before;
+  }
+
+  // The node in the tree the iterator is pointing at.
+  Node *node;
+  // The position within the node of the tree the iterator is pointing at.
+  int position;
+};
+
+// Dispatch helper class for using btree::internal_locate with plain
+// compare: forwards to t.internal_locate_plain_compare(k, iter).
+struct btree_internal_locate_plain_compare {
+  template <typename K, typename T, typename Iter>
+  static auto dispatch(const K &k, const T &t, Iter iter)
+      -> std::pair<Iter, int> {
+    return t.internal_locate_plain_compare(k, iter);
+  }
+};
+
+// Dispatch helper class for using btree::internal_locate with compare-to:
+// forwards to t.internal_locate_compare_to(k, iter).
+struct btree_internal_locate_compare_to {
+  template <typename K, typename T, typename Iter>
+  static auto dispatch(const K &k, const T &t, Iter iter)
+      -> std::pair<Iter, int> {
+    return t.internal_locate_compare_to(k, iter);
+  }
+};
+
+template <typename Params>
+class btree : public Params::key_compare {
+ typedef btree<Params> self_type;
+ typedef btree_node<Params> node_type;
+ typedef typename node_type::base_fields base_fields;
+ typedef typename node_type::leaf_fields leaf_fields;
+ typedef typename node_type::internal_fields internal_fields;
+ typedef typename node_type::root_fields root_fields;
+ typedef typename Params::is_key_compare_to is_key_compare_to;
+
+ friend class btree_internal_locate_plain_compare;
+ friend class btree_internal_locate_compare_to;
+ typedef typename if_<
+ is_key_compare_to::value,
+ btree_internal_locate_compare_to,
+ btree_internal_locate_plain_compare>::type internal_locate_type;
+
+ enum {
+ kNodeValues = node_type::kNodeValues,
+ kMinNodeValues = kNodeValues / 2,
+ kValueSize = node_type::kValueSize,
+ kExactMatch = node_type::kExactMatch,
+ kMatchMask = node_type::kMatchMask,
+ };
+
+ // Bundles a Base (here internal_allocator_type) with a Data member by
+ // deriving from Base, e.g.
+ // empty_base_handle<internal_allocator_type, node_type*>. When Base is
+ // 0-size the compiler need not reserve any space for it, so
+ // sizeof(empty_base_handle) is simply sizeof(Data). See "empty base class
+ // optimization" for details.
+ template <typename Base, typename Data>
+ struct empty_base_handle : public Base {
+   empty_base_handle(const Base &base, const Data &payload)
+       : Base(base), data(payload) {}
+
+   Data data;
+ };
+
+ // Running tally of leaf and internal nodes; tallies are combined with +=.
+ struct node_stats {
+   node_stats(ssize_t leaves, ssize_t internals)
+       : leaf_nodes(leaves), internal_nodes(internals) {}
+
+   node_stats& operator+=(const node_stats &other) {
+     internal_nodes += other.internal_nodes;
+     leaf_nodes += other.leaf_nodes;
+     return *this;
+   }
+
+   ssize_t leaf_nodes;
+   ssize_t internal_nodes;
+ };
+
+ public:
+ typedef Params params_type;
+ typedef typename Params::key_type key_type;
+ typedef typename Params::data_type data_type;
+ typedef typename Params::mapped_type mapped_type;
+ typedef typename Params::value_type value_type;
+ typedef typename Params::key_compare key_compare;
+ typedef typename Params::pointer pointer;
+ typedef typename Params::const_pointer const_pointer;
+ typedef typename Params::reference reference;
+ typedef typename Params::const_reference const_reference;
+ typedef typename Params::size_type size_type;
+ typedef typename Params::difference_type difference_type;
+ typedef btree_iterator<node_type, reference, pointer> iterator;
+ typedef typename iterator::const_iterator const_iterator;
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ typedef typename Params::allocator_type allocator_type;
+ typedef typename allocator_type::template rebind<char>::other
+ internal_allocator_type;
+
+ public:
+ // Default constructor.
+ btree(const key_compare &comp, const allocator_type &alloc);
+
+ // Copy constructor.
+ btree(const self_type &x);
+
+ // Destructor.
+ ~btree() {
+ clear();
+ }
+
+ // Iterator routines. begin() is slot 0 of the leftmost node; end() is one
+ // past the last slot of the rightmost node (a null node with position 0 when
+ // the tree is empty).
+ iterator begin() {
+   return iterator(leftmost(), 0);
+ }
+ const_iterator begin() const {
+   return const_iterator(leftmost(), 0);
+ }
+ iterator end() {
+   node_type *last = rightmost();
+   return iterator(last, last ? last->count() : 0);
+ }
+ const_iterator end() const {
+   const node_type *last = rightmost();
+   return const_iterator(last, last ? last->count() : 0);
+ }
+ // Reverse iteration is layered on top of the forward iterators.
+ reverse_iterator rbegin() {
+   return reverse_iterator(end());
+ }
+ const_reverse_iterator rbegin() const {
+   return const_reverse_iterator(end());
+ }
+ reverse_iterator rend() {
+   return reverse_iterator(begin());
+ }
+ const_reverse_iterator rend() const {
+   return const_reverse_iterator(begin());
+ }
+
+ // Finds the first element whose key is not less than key. A null-node
+ // result from the internal search is mapped to end().
+ iterator lower_bound(const key_type &key) {
+   iterator found = internal_lower_bound(key, iterator(root(), 0));
+   return internal_end(found);
+ }
+ const_iterator lower_bound(const key_type &key) const {
+   const_iterator found = internal_lower_bound(key, const_iterator(root(), 0));
+   return internal_end(found);
+ }
+
+ // Finds the first element whose key is greater than key. A null-node
+ // result from the internal search is mapped to end().
+ iterator upper_bound(const key_type &key) {
+   iterator found = internal_upper_bound(key, iterator(root(), 0));
+   return internal_end(found);
+ }
+ const_iterator upper_bound(const key_type &key) const {
+   const_iterator found = internal_upper_bound(key, const_iterator(root(), 0));
+   return internal_end(found);
+ }
+
+ // Finds the range of values which compare equal to key. The first member of
+ // the returned pair is equal to lower_bound(key); the second member is equal
+ // to upper_bound(key).
+ std::pair<iterator,iterator> equal_range(const key_type &key) {
+   return std::pair<iterator,iterator>(lower_bound(key), upper_bound(key));
+ }
+ std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+   return std::pair<const_iterator,const_iterator>(
+       lower_bound(key), upper_bound(key));
+ }
+
+ // Inserts a value into the btree only if it does not already exist. The
+ // boolean return value indicates whether insertion succeeded or failed. The
+ // ValuePointer type is used to avoid instantiating the value unless the key
+ // is being inserted. Value is not dereferenced if the key already exists in
+ // the btree. See btree_map::operator[].
+ template <typename ValuePointer>
+ std::pair<iterator,bool> insert_unique(const key_type &key, ValuePointer value);
+
+ // Inserts v unless an element with the same key is already present. The
+ // bool in the result tells whether the insertion took place.
+ std::pair<iterator,bool> insert_unique(const value_type &v) {
+   const value_type *source = &v;
+   return insert_unique(params_type::key(v), source);
+ }
+
+ // Insert with hint. Check to see if the value should be placed immediately
+ // before position in the tree. If it does, then the insertion will take
+ // amortized constant time. If not, the insertion will take amortized
+ // logarithmic time as if a call to insert_unique(v) were made.
+ iterator insert_unique(iterator position, const value_type &v);
+
+ // Insert a range of values into the btree.
+ template <typename InputIterator>
+ void insert_unique(InputIterator b, InputIterator e);
+
+ // Inserts a value into the btree. The ValuePointer type mirrors the
+ // insert_unique() overload (see btree_map::operator[]); because duplicate
+ // keys are allowed here, the value is always dereferenced and inserted.
+ template <typename ValuePointer>
+ iterator insert_multi(const key_type &key, ValuePointer value);
+
+ // Inserts v into the btree; duplicate keys are allowed.
+ iterator insert_multi(const value_type &v) {
+   const value_type *source = &v;
+   return insert_multi(params_type::key(v), source);
+ }
+
+ // Insert with hint. Check to see if the value should be placed immediately
+ // before position in the tree. If it does, then the insertion will take
+ // amortized constant time. If not, the insertion will take amortized
+ // logarithmic time as if a call to insert_multi(v) were made.
+ iterator insert_multi(iterator position, const value_type &v);
+
+ // Insert a range of values into the btree.
+ template <typename InputIterator>
+ void insert_multi(InputIterator b, InputIterator e);
+
+ void assign(const self_type &x);
+
+ // Erases the element the iterator points at from the btree. The iterator
+ // must be valid (i.e. not equal to end()). Returns an iterator pointing to
+ // the element after the one that was erased (or end() if none exists).
+ iterator erase(iterator iter);
+
+ // Erases range. Returns the number of keys erased.
+ int erase(iterator begin, iterator end);
+
+ // Erases the specified key from the btree. Returns 1 if an element was
+ // erased and 0 otherwise.
+ int erase_unique(const key_type &key);
+
+ // Erases all of the entries matching the specified key from the
+ // btree. Returns the number of elements erased.
+ int erase_multi(const key_type &key);
+
+ // Finds the iterator corresponding to a key, or returns end() if the key is
+ // not present (the internal searches report a miss with a null node).
+ iterator find_unique(const key_type &key) {
+   iterator hit = internal_find_unique(key, iterator(root(), 0));
+   return internal_end(hit);
+ }
+ const_iterator find_unique(const key_type &key) const {
+   const_iterator hit = internal_find_unique(key, const_iterator(root(), 0));
+   return internal_end(hit);
+ }
+ iterator find_multi(const key_type &key) {
+   iterator hit = internal_find_multi(key, iterator(root(), 0));
+   return internal_end(hit);
+ }
+ const_iterator find_multi(const key_type &key) const {
+   const_iterator hit = internal_find_multi(key, const_iterator(root(), 0));
+   return internal_end(hit);
+ }
+
+ // Returns how many times the key appears in a unique-key btree: 1 if it is
+ // present, 0 otherwise.
+ size_type count_unique(const key_type &key) const {
+   const_iterator hit = internal_find_unique(key, const_iterator(root(), 0));
+   // A null node means the key doesn't exist in the tree.
+   return hit.node ? 1 : 0;
+ }
+ // Returns a count of the number of times the key appears in the btree,
+ // i.e. the length of [lower_bound(key), upper_bound(key)).
+ size_type count_multi(const key_type &key) const {
+   // Qualified call: does not depend on argument-dependent lookup happening
+   // to find std::distance through the iterator's template arguments.
+   return std::distance(lower_bound(key), upper_bound(key));
+ }
+
+ // Clear the btree, deleting all of the values it contains.
+ void clear();
+
+ // Swap the contents of *this and x.
+ void swap(self_type &x);
+
+ // Assign the contents of x to *this. Assigning a tree to itself is a no-op.
+ self_type& operator=(const self_type &x) {
+   if (this != &x) {
+     assign(x);
+   }
+   return *this;
+ }
+
+ // The comparator is stored as a base class of the btree; these accessors
+ // expose that base sub-object.
+ key_compare* mutable_key_comp() {
+   return static_cast<key_compare*>(this);
+ }
+ const key_compare& key_comp() const {
+   return static_cast<const key_compare&>(*this);
+ }
+ // Compares two keys through btree_compare_keys() using this tree's
+ // comparator.
+ bool compare_keys(const key_type &x, const key_type &y) const {
+   const key_compare &comp = key_comp();
+   return btree_compare_keys(comp, x, y);
+ }
+
+ // Dump the btree to the specified ostream. Requires that operator<< is
+ // defined for Key and Value. Nothing is written for an empty tree.
+ void dump(std::ostream &os) const {
+   if (!root()) {
+     return;
+   }
+   internal_dump(os, root(), 0);
+ }
+
+ // Verifies the structure of the btree.
+ void verify() const;
+
+ // Size routines. Note that empty() is slightly faster than doing size()==0.
+ // A leaf root reports its own value count; an internal root reports the
+ // size it stores.
+ size_type size() const {
+   const node_type *r = root();
+   if (!r) return 0;
+   if (!r->leaf()) return r->size();
+   return r->count();
+ }
+ size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+ bool empty() const { return !root(); }
+
+ // The height of the btree. An empty tree will have height 0.
+ size_type height() const {
+   const node_type *const top = root();
+   if (!top) {
+     return 0;
+   }
+   // Follow parent() links starting at the root until we arrive back at the
+   // root. The root's parent is the leftmost node, so this walks the chain
+   // from the leftmost node up to the root; the number of steps equals the
+   // number of levels because of the circularity of that traversal.
+   size_type levels = 0;
+   const node_type *cur = top;
+   do {
+     ++levels;
+     cur = cur->parent();
+   } while (cur != top);
+   return levels;
+ }
+
+ // The number of leaf, internal and total nodes used by the btree. Each call
+ // walks the whole tree via internal_stats().
+ size_type leaf_nodes() const {
+   const node_stats stats = internal_stats(root());
+   return stats.leaf_nodes;
+ }
+ size_type internal_nodes() const {
+   const node_stats stats = internal_stats(root());
+   return stats.internal_nodes;
+ }
+ size_type nodes() const {
+   const node_stats stats = internal_stats(root());
+   return stats.leaf_nodes + stats.internal_nodes;
+ }
+
+ // The total number of bytes used by the btree, including the btree object
+ // itself.
+ size_type bytes_used() const {
+   const node_stats stats = internal_stats(root());
+   const bool lone_leaf_root =
+       stats.leaf_nodes == 1 && stats.internal_nodes == 0;
+   if (lone_leaf_root) {
+     // A lone leaf root only has room for max_count() values.
+     return sizeof(*this) +
+         sizeof(base_fields) + root()->max_count() * sizeof(value_type);
+   }
+   // Otherwise: every leaf and internal node at full size, with one internal
+   // node (the root) sized as root_fields instead of internal_fields.
+   return sizeof(*this) +
+       sizeof(root_fields) - sizeof(internal_fields) +
+       stats.leaf_nodes * sizeof(leaf_fields) +
+       stats.internal_nodes * sizeof(internal_fields);
+ }
+
+ // The average number of bytes used per value stored in the btree, estimated
+ // as the bytes per value of a leaf node that is 75% full. Experimentally,
+ // this matches up nicely with the computed number of bytes per value in
+ // trees that had their values inserted in random order.
+ static double average_bytes_per_value() {
+   const double values_in_three_quarter_full_leaf = kNodeValues * 0.75;
+   return sizeof(leaf_fields) / values_in_three_quarter_full_leaf;
+ }
+
+ // The fullness of the btree. Computed as the number of elements in the btree
+ // divided by the maximum number of elements a tree with the current number
+ // of nodes could hold (nodes() * kNodeValues). A value of 1 indicates
+ // perfect space utilization. Smaller values indicate space wastage. An empty
+ // tree has no nodes, so 0.0 is returned instead of evaluating 0 / 0 (NaN),
+ // matching what overhead() does for an empty tree.
+ double fullness() const {
+   if (empty()) {
+     return 0.0;
+   }
+   return double(size()) / (nodes() * kNodeValues);
+ }
+ // The overhead of the btree structure in bytes per element. Computed as the
+ // total number of bytes used by the btree minus the number of bytes used for
+ // storing elements, divided by the number of elements. An empty tree reports
+ // 0.0.
+ double overhead() const {
+   return empty()
+       ? 0.0
+       : (bytes_used() - size() * kValueSize) / double(size());
+ }
+
+ private:
+ // Internal accessor routines. The root pointer lives in root_.data.
+ node_type* root() { return root_.data; }
+ const node_type* root() const { return root_.data; }
+ node_type** mutable_root() { return &root_.data; }
+
+ // The rightmost node is stored in the root node. An empty tree yields NULL
+ // and a leaf root is its own rightmost node.
+ node_type* rightmost() {
+   node_type *r = root();
+   if (!r || r->leaf()) return r;
+   return r->rightmost();
+ }
+ const node_type* rightmost() const {
+   const node_type *r = root();
+   if (!r || r->leaf()) return r;
+   return r->rightmost();
+ }
+ node_type** mutable_rightmost() { return root()->mutable_rightmost(); }
+
+ // The leftmost node is stored as the parent of the root node.
+ node_type* leftmost() {
+   node_type *r = root();
+   return r ? r->parent() : NULL;
+ }
+ const node_type* leftmost() const {
+   const node_type *r = root();
+   return r ? r->parent() : NULL;
+ }
+
+ // The size of the tree is stored in the root node.
+ size_type* mutable_size() { return root()->mutable_size(); }
+
+ // Allocator routines. root_ derives from internal_allocator_type, so the
+ // allocator is reached by casting root_ to that base.
+ internal_allocator_type* mutable_internal_allocator() {
+   return static_cast<internal_allocator_type*>(&root_);
+ }
+ const internal_allocator_type& internal_allocator() const {
+   return *static_cast<const internal_allocator_type*>(&root_);
+ }
+
+ // Node creation routines. Each one obtains raw storage of the right size
+ // from the internal (char) allocator and hands it to the matching
+ // node_type::init_* function.
+ node_type* new_internal_node(node_type *parent) {
+   internal_fields *storage = reinterpret_cast<internal_fields*>(
+       mutable_internal_allocator()->allocate(sizeof(internal_fields)));
+   return node_type::init_internal(storage, parent);
+ }
+ // The new root is initialized with the current root's parent.
+ node_type* new_internal_root_node() {
+   root_fields *storage = reinterpret_cast<root_fields*>(
+       mutable_internal_allocator()->allocate(sizeof(root_fields)));
+   return node_type::init_root(storage, root()->parent());
+ }
+ node_type* new_leaf_node(node_type *parent) {
+   leaf_fields *storage = reinterpret_cast<leaf_fields*>(
+       mutable_internal_allocator()->allocate(sizeof(leaf_fields)));
+   return node_type::init_leaf(storage, parent, kNodeValues);
+ }
+ // A leaf root is sized for max_count values only and is its own parent.
+ node_type* new_leaf_root_node(int max_count) {
+   leaf_fields *storage = reinterpret_cast<leaf_fields*>(
+       mutable_internal_allocator()->allocate(
+           sizeof(base_fields) + max_count * sizeof(value_type)));
+   node_type *self_parent = reinterpret_cast<node_type*>(storage);
+   return node_type::init_leaf(storage, self_parent, max_count);
+ }
+ // Node deletion routines. Each one calls destroy() on the node and then
+ // returns its storage to the internal allocator, passing the same byte
+ // count that the matching new_*_node() routine allocated.
+ //
+ // Releases a non-root internal node (asserted); the root must go through
+ // delete_internal_root_node() because it is sized as root_fields.
+ void delete_internal_node(node_type *node) {
+ node->destroy();
+ ceph_assert(node != root());
+ mutable_internal_allocator()->deallocate(
+ reinterpret_cast<char*>(node), sizeof(internal_fields));
+ }
+ // Releases the current (internal) root node. Does not reset the root
+ // pointer itself.
+ void delete_internal_root_node() {
+ root()->destroy();
+ mutable_internal_allocator()->deallocate(
+ reinterpret_cast<char*>(root()), sizeof(root_fields));
+ }
+ // Releases a leaf node. The size is derived from node->max_count(), which is
+ // read after destroy() has run.
+ // NOTE(review): assumes destroy() leaves max_count() intact - confirm.
+ void delete_leaf_node(node_type *node) {
+ node->destroy();
+ mutable_internal_allocator()->deallocate(
+ reinterpret_cast<char*>(node),
+ sizeof(base_fields) + node->max_count() * sizeof(value_type));
+ }
+
+ // Rebalances or splits the node iter points to.
+ void rebalance_or_split(iterator *iter);
+
+ // Merges the values of left, right and the delimiting key on their parent
+ // onto left, removing the delimiting key and deleting right.
+ void merge_nodes(node_type *left, node_type *right);
+
+ // Tries to merge node with its left or right sibling, and failing that,
+ // rebalance with its left or right sibling. Returns true if a merge
+ // occurred, at which point it is no longer valid to access node. Returns
+ // false if no merging took place.
+ bool try_merge_or_rebalance(iterator *iter);
+
+ // Tries to shrink the height of the tree by 1.
+ void try_shrink();
+
+ // Maps the "not found" result of the internal_* searches (an iterator with
+ // a null node) to end(); any other iterator is returned unchanged.
+ iterator internal_end(iterator iter) {
+   if (iter.node) {
+     return iter;
+   }
+   return end();
+ }
+ const_iterator internal_end(const_iterator iter) const {
+   if (iter.node) {
+     return iter;
+   }
+   return end();
+ }
+
+ // Inserts a value into the btree immediately before iter. Requires that
+ // key(v) <= iter.key() and (--iter).key() <= key(v).
+ iterator internal_insert(iterator iter, const value_type &v);
+
+ // Returns an iterator pointing to the first value >= the value "iter" is
+ // pointing at. Note that "iter" might be pointing to an invalid location as
+ // iter.position == iter.node->count(). This routine simply moves iter up in
+ // the tree to a valid location.
+ template <typename IterType>
+ static IterType internal_last(IterType iter);
+
+ // Returns an iterator pointing to the leaf position at which key would
+ // reside in the tree. We provide 2 versions of internal_locate. The first
+ // version (internal_locate_plain_compare) always returns 0 for the second
+ // field of the pair. The second version (internal_locate_compare_to) is for
+ // the key-compare-to specialization and returns either kExactMatch (if the
+ // key was found in the tree) or -kExactMatch (if it wasn't) in the second
+ // field of the pair. The compare_to specialization allows the caller to
+ // avoid a subsequent comparison to determine if an exact match was made,
+ // speeding up string keys.
+ template <typename IterType>
+ std::pair<IterType, int> internal_locate(
+ const key_type &key, IterType iter) const;
+ template <typename IterType>
+ std::pair<IterType, int> internal_locate_plain_compare(
+ const key_type &key, IterType iter) const;
+ template <typename IterType>
+ std::pair<IterType, int> internal_locate_compare_to(
+ const key_type &key, IterType iter) const;
+
+ // Internal routine which implements lower_bound().
+ template <typename IterType>
+ IterType internal_lower_bound(
+ const key_type &key, IterType iter) const;
+
+ // Internal routine which implements upper_bound().
+ template <typename IterType>
+ IterType internal_upper_bound(
+ const key_type &key, IterType iter) const;
+
+ // Internal routine which implements find_unique().
+ template <typename IterType>
+ IterType internal_find_unique(
+ const key_type &key, IterType iter) const;
+
+ // Internal routine which implements find_multi().
+ template <typename IterType>
+ IterType internal_find_multi(
+ const key_type &key, IterType iter) const;
+
+ // Deletes a node and all of its children.
+ void internal_clear(node_type *node);
+
+ // Dumps a node and all of its children to the specified ostream.
+ void internal_dump(std::ostream &os, const node_type *node, int level) const;
+
+ // Verifies the tree structure of node.
+ int internal_verify(const node_type *node,
+ const key_type *lo, const key_type *hi) const;
+
+ // Counts the leaf and internal nodes in the subtree rooted at node. A null
+ // node contributes nothing and a leaf contributes one leaf node.
+ node_stats internal_stats(const node_type *node) const {
+   if (!node) return node_stats(0, 0);
+   if (node->leaf()) return node_stats(1, 0);
+
+   // An internal node counts itself plus the subtree of each of its
+   // count() + 1 children.
+   node_stats total(0, 1);
+   for (int c = 0; c <= node->count(); ++c) {
+     total += internal_stats(node->child(c));
+   }
+   return total;
+ }
+
+ private:
+ empty_base_handle<internal_allocator_type, node_type*> root_;
+
+ private:
+ // A never instantiated helper function that returns big_ if we have a
+ // key-compare-to functor or if R is bool and small_ otherwise.
+ template <typename R>
+ static typename if_<
+ if_<is_key_compare_to::value,
+ std::is_same<R, int>,
+ std::is_same<R, bool> >::type::value,
+ big_, small_>::type key_compare_checker(R);
+
+ // A never instantiated helper function that returns the key comparison
+ // functor.
+ static key_compare key_compare_helper();
+
+ // Verify that key_compare returns a bool (or an int when it is a
+ // key-compare-to functor). This is similar to the way is_convertible in
+ // base/type_traits.h works. Note that key_compare_checker is never actually
+ // invoked. The compiler will select which key_compare_checker() to
+ // instantiate and then figure out the size of the return type of
+ // key_compare_checker() at compile time, which we then check against the
+ // sizeof of big_.
+ COMPILE_ASSERT(
+ sizeof(key_compare_checker(key_compare_helper()(key_type(), key_type()))) ==
+ sizeof(big_),
+ key_comparison_function_must_return_bool);
+
+ // Note: We insist on kTargetValues, which is computed from
+ // Params::kTargetNodeSize, must fit the base_fields::field_type.
+ COMPILE_ASSERT(kNodeValues <
+ (1 << (8 * sizeof(typename base_fields::field_type))),
+ target_node_size_too_large);
+
+ // Test the assumption made in setting kNodeValueSpace.
+ COMPILE_ASSERT(sizeof(base_fields) >= 2 * sizeof(void*),
+ node_space_assumption_incorrect);
+};
+
+////
+// btree_node methods
+// Inserts x at slot i of this node, shifting later values (and, on internal
+// nodes, later children) one slot to the right. The child slot to the right
+// of the new value is left NULL for the caller to fill in.
+template <typename P>
+inline void btree_node<P>::insert_value(int i, const value_type &x) {
+  ceph_assert(i <= count());
+
+  // Construct the value in the first unused slot, then swap it down to slot i.
+  value_init(count(), x);
+  for (int slot = count(); slot > i; --slot) {
+    value_swap(slot, this, slot - 1);
+  }
+  set_count(count() + 1);
+
+  if (leaf()) {
+    return;
+  }
+
+  // Internal node: open a gap in the child array just right of the new value.
+  const int gap = i + 1;
+  for (int c = count(); c > gap; --c) {
+    *mutable_child(c) = child(c - 1);
+    child(c)->set_position(c);
+  }
+  *mutable_child(gap) = NULL;
+}
+
+// Removes the value at slot i, shifting later values one slot to the left.
+// On an internal node the child to the right of the removed value must
+// already be empty (asserted); its slot is closed up as well.
+template <typename P>
+inline void btree_node<P>::remove_value(int i) {
+  if (!leaf()) {
+    ceph_assert(child(i + 1)->count() == 0);
+    for (int c = i + 1; c < count(); ++c) {
+      *mutable_child(c) = child(c + 1);
+      child(c)->set_position(c);
+    }
+    *mutable_child(count()) = NULL;
+  }
+
+  // Swap the doomed value up to the last slot, then destroy it there.
+  set_count(count() - 1);
+  int slot = i;
+  while (slot < count()) {
+    value_swap(slot, this, slot + 1);
+    ++slot;
+  }
+  value_destroy(slot);
+}
+
+// Moves to_move values from the front of the right sibling src to the end of
+// this node, rotating them through the delimiting value stored in the shared
+// parent. Asserted preconditions: both nodes have the same parent, src is the
+// sibling immediately to the right, src holds at least as many values as this
+// node, and 1 <= to_move <= src->count().
+template <typename P>
+void btree_node<P>::rebalance_right_to_left(btree_node *src, int to_move) {
+ ceph_assert(parent() == src->parent());
+ ceph_assert(position() + 1 == src->position());
+ ceph_assert(src->count() >= count());
+ ceph_assert(to_move >= 1);
+ ceph_assert(to_move <= src->count());
+
+ // Make room in the left node for the new values.
+ for (int i = 0; i < to_move; ++i) {
+ value_init(i + count());
+ }
+
+ // Move the delimiting value to the left node and the new delimiting value
+ // from the right node.
+ value_swap(count(), parent(), position());
+ parent()->value_swap(position(), src, to_move - 1);
+
+ // Move the values from the right to the left node.
+ for (int i = 1; i < to_move; ++i) {
+ value_swap(count() + i, src, i - 1);
+ }
+ // Shift the values in the right node to their correct position.
+ for (int i = to_move; i < src->count(); ++i) {
+ src->value_swap(i - to_move, src, i);
+ }
+ // Destroy the to_move slots now left over at the tail of the right node.
+ for (int i = 1; i <= to_move; ++i) {
+ src->value_destroy(src->count() - i);
+ }
+
+ if (!leaf()) {
+ // Move the child pointers from the right to the left node.
+ for (int i = 0; i < to_move; ++i) {
+ set_child(1 + count() + i, src->child(i));
+ }
+ // Slide the right node's remaining children down and clear the vacated
+ // slots.
+ for (int i = 0; i <= src->count() - to_move; ++i) {
+ ceph_assert(i + to_move <= src->max_count());
+ src->set_child(i, src->child(i + to_move));
+ *src->mutable_child(i + to_move) = NULL;
+ }
+ }
+
+ // Fixup the counts on the src and dest nodes.
+ set_count(count() + to_move);
+ src->set_count(src->count() - to_move);
+}
+
+// Moves to_move values from the end of this node to the front of the right
+// sibling dest, rotating them through the delimiting value stored in the
+// shared parent. Asserted preconditions: both nodes have the same parent,
+// dest is the sibling immediately to the right, this node holds at least as
+// many values as dest, and 1 <= to_move <= count().
+template <typename P>
+void btree_node<P>::rebalance_left_to_right(btree_node *dest, int to_move) {
+ ceph_assert(parent() == dest->parent());
+ ceph_assert(position() + 1 == dest->position());
+ ceph_assert(count() >= dest->count());
+ ceph_assert(to_move >= 1);
+ ceph_assert(to_move <= count());
+
+ // Make room in the right node for the new values.
+ for (int i = 0; i < to_move; ++i) {
+ dest->value_init(i + dest->count());
+ }
+ // Shift the right node's existing values up by to_move slots, last first.
+ for (int i = dest->count() - 1; i >= 0; --i) {
+ dest->value_swap(i, dest, i + to_move);
+ }
+
+ // Move the delimiting value to the right node and the new delimiting value
+ // from the left node.
+ dest->value_swap(to_move - 1, parent(), position());
+ parent()->value_swap(position(), this, count() - to_move);
+ value_destroy(count() - to_move);
+
+ // Move the values from the left to the right node.
+ for (int i = 1; i < to_move; ++i) {
+ value_swap(count() - to_move + i, dest, i - 1);
+ value_destroy(count() - to_move + i);
+ }
+
+ if (!leaf()) {
+ // Move the child pointers from the left to the right node.
+ // First shift the right node's own children up by to_move slots.
+ for (int i = dest->count(); i >= 0; --i) {
+ dest->set_child(i + to_move, dest->child(i));
+ *dest->mutable_child(i) = NULL;
+ }
+ // Then hand over the last to_move children of the left node.
+ for (int i = 1; i <= to_move; ++i) {
+ dest->set_child(i - 1, child(count() - to_move + i));
+ *mutable_child(count() - to_move + i) = NULL;
+ }
+ }
+
+ // Fixup the counts on the src and dest nodes.
+ set_count(count() - to_move);
+ dest->set_count(dest->count() + to_move);
+}
+
+// Splits this node: the upper values (and, for internal nodes, the matching
+// children) move into dest, which must be empty (asserted). The largest value
+// remaining in this node is promoted into the parent as the delimiter, and
+// dest is installed as the child to its right. insert_position is the slot at
+// which the caller intends to insert and only biases how many values move.
+// Note: a default-constructed value_type is inserted into the parent as a
+// placeholder before being swapped with the split key.
+template <typename P>
+void btree_node<P>::split(btree_node *dest, int insert_position) {
+ ceph_assert(dest->count() == 0);
+
+ // We bias the split based on the position being inserted. If we're
+ // inserting at the beginning of the left node then bias the split to put
+ // more values on the right node. If we're inserting at the end of the
+ // right node then bias the split to put more values on the left node.
+ if (insert_position == 0) {
+ dest->set_count(count() - 1);
+ } else if (insert_position == max_count()) {
+ dest->set_count(0);
+ } else {
+ dest->set_count(count() / 2);
+ }
+ set_count(count() - dest->count());
+ ceph_assert(count() >= 1);
+
+ // Move values from the left sibling to the right sibling.
+ for (int i = 0; i < dest->count(); ++i) {
+ dest->value_init(i);
+ value_swap(count() + i, dest, i);
+ value_destroy(count() + i);
+ }
+
+ // The split key is the largest value in the left sibling.
+ set_count(count() - 1);
+ parent()->insert_value(position(), value_type());
+ value_swap(count(), parent(), position());
+ value_destroy(count());
+ parent()->set_child(position() + 1, dest);
+
+ if (!leaf()) {
+ // Hand the children that follow the split key over to dest.
+ for (int i = 0; i <= dest->count(); ++i) {
+ ceph_assert(child(count() + i + 1) != NULL);
+ dest->set_child(i, child(count() + i + 1));
+ *mutable_child(count() + i + 1) = NULL;
+ }
+ }
+}
+
+// Merges the right sibling src into this node: the delimiting value from the
+// parent and all of src's values (and children, for internal nodes) are
+// appended to this node, src is left with a count of 0, and the delimiting
+// value is removed from the parent. Asserted preconditions: both nodes have
+// the same parent and src is the sibling immediately to the right.
+template <typename P>
+void btree_node<P>::merge(btree_node *src) {
+ ceph_assert(parent() == src->parent());
+ ceph_assert(position() + 1 == src->position());
+
+ // Move the delimiting value to the left node.
+ value_init(count());
+ value_swap(count(), parent(), position());
+
+ // Move the values from the right to the left node.
+ for (int i = 0; i < src->count(); ++i) {
+ value_init(1 + count() + i);
+ value_swap(1 + count() + i, src, i);
+ src->value_destroy(i);
+ }
+
+ if (!leaf()) {
+ // Move the child pointers from the right to the left node.
+ for (int i = 0; i <= src->count(); ++i) {
+ set_child(1 + count() + i, src->child(i));
+ *src->mutable_child(i) = NULL;
+ }
+ }
+
+ // Fixup the counts on the src and dest nodes.
+ set_count(1 + count() + src->count());
+ src->set_count(0);
+
+ // Remove the value on the parent node.
+ parent()->remove_value(position());
+}
+
+// Exchanges the contents of this node and x: values, child pointers (for
+// internal nodes, including the children's parent links) and counts. Both
+// nodes must be of the same kind, leaf or internal (asserted).
+template <typename P>
+void btree_node<P>::swap(btree_node *x) {
+ ceph_assert(leaf() == x->leaf());
+
+ // Swap the values.
+ // The node with fewer values first gets its missing slots initialized so
+ // that every slot below max(count(), x->count()) can be swapped.
+ for (int i = count(); i < x->count(); ++i) {
+ value_init(i);
+ }
+ for (int i = x->count(); i < count(); ++i) {
+ x->value_init(i);
+ }
+ int n = std::max(count(), x->count());
+ for (int i = 0; i < n; ++i) {
+ value_swap(i, x, i);
+ }
+ // Destroy the surplus slots on whichever node ended up with fewer values.
+ for (int i = count(); i < x->count(); ++i) {
+ x->value_destroy(i);
+ }
+ for (int i = x->count(); i < count(); ++i) {
+ value_destroy(i);
+ }
+
+ if (!leaf()) {
+ // Swap the child pointers.
+ for (int i = 0; i <= n; ++i) {
+ btree_swap_helper(*mutable_child(i), *x->mutable_child(i));
+ }
+ // The counts have not been swapped yet, so count() still describes the
+ // children that now hang off x, and x->count() those that now hang off
+ // this node.
+ for (int i = 0; i <= count(); ++i) {
+ x->child(i)->fields_.parent = x;
+ }
+ for (int i = 0; i <= x->count(); ++i) {
+ child(i)->fields_.parent = this;
+ }
+ }
+
+ // Swap the counts.
+ btree_swap_helper(fields_.count, x->fields_.count);
+}
+
+////
+// btree_iterator methods
+// Advances the iterator in the cases that need a change of node: stepping
+// off the end of a leaf, or stepping from a value on an internal node.
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_slow() {
+  if (!node->leaf()) {
+    // On an internal node: descend to the first value of the subtree to the
+    // right of the current value.
+    ceph_assert(position < node->count());
+    node = node->child(position + 1);
+    while (!node->leaf()) {
+      node = node->child(0);
+    }
+    position = 0;
+    return;
+  }
+
+  // On a leaf, at or past its last slot: climb while we are at the end of
+  // the current node. If even the root has nothing further to the right,
+  // restore the state we started with.
+  ceph_assert(position >= node->count());
+  self_type original(*this);
+  while (position == node->count() && !node->is_root()) {
+    ceph_assert(node->parent()->child(node->position()) == node);
+    position = node->position();
+    node = node->parent();
+  }
+  if (position == node->count()) {
+    *this = original;
+  }
+}
+
+// Advances the iterator by count elements. Within a leaf the position is
+// bumped in a single step; crossing a node boundary goes through
+// increment_slow().
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_by(int count) {
+  while (count > 0) {
+    if (!node->leaf()) {
+      // Stepping off an internal value consumes exactly one element.
+      --count;
+    } else {
+      const int remaining = node->count() - position;
+      position += std::min(remaining, count);
+      count -= remaining;
+      if (position < node->count()) {
+        return;
+      }
+    }
+    increment_slow();
+  }
+}
+
+// Moves the iterator backwards in the cases that need a change of node:
+// stepping off the front of a leaf, or stepping from a value on an internal
+// node.
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::decrement_slow() {
+  if (!node->leaf()) {
+    // On an internal node: descend to the last value of the subtree to the
+    // left of the current value.
+    ceph_assert(position >= 0);
+    node = node->child(position);
+    while (!node->leaf()) {
+      node = node->child(node->count());
+    }
+    position = node->count() - 1;
+    return;
+  }
+
+  // On a leaf, before its first slot: climb while we are in front of the
+  // current node. If even the root has nothing further to the left, restore
+  // the state we started with.
+  ceph_assert(position <= -1);
+  self_type original(*this);
+  while (position < 0 && !node->is_root()) {
+    ceph_assert(node->parent()->child(node->position()) == node);
+    position = node->position() - 1;
+    node = node->parent();
+  }
+  if (position < 0) {
+    *this = original;
+  }
+}
+
+////
+// btree methods
+// Constructs an empty btree (NULL root) using the given comparator and
+// allocator.
+template <typename P>
+btree<P>::btree(const key_compare &comp, const allocator_type &alloc)
+    : key_compare(comp), root_(alloc, NULL) {}
+
+// Copy constructor: starts out empty with x's comparator and allocator, then
+// copies x's contents via assign().
+template <typename P>
+btree<P>::btree(const self_type &x)
+    : key_compare(x.key_comp()), root_(x.internal_allocator(), NULL) {
+  assign(x);
+}
+
+// Inserts *value under key unless the key is already present. Returns the
+// iterator to the inserted (or already existing) element and whether an
+// insertion took place. value is only dereferenced when inserting.
+template <typename P> template <typename ValuePointer>
+std::pair<typename btree<P>::iterator, bool>
+btree<P>::insert_unique(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);
+  }
+
+  std::pair<iterator, int> located = internal_locate(key, iterator(root(), 0));
+  iterator &where = located.first;
+  const int match = located.second;
+
+  if (match == kExactMatch) {
+    // The key already exists in the tree, do nothing.
+    return std::make_pair(internal_last(where), false);
+  }
+  if (!match) {
+    // No match information from the locate step: compare against the
+    // element at the located position explicitly.
+    iterator last = internal_last(where);
+    if (last.node && !compare_keys(key, last.key())) {
+      // The key already exists in the tree, do nothing.
+      return std::make_pair(last, false);
+    }
+  }
+
+  return std::make_pair(internal_insert(where, *value), true);
+}
+
+// Hinted unique insert. If v's key sorts strictly between the element before
+// position and position itself, or strictly between position and its
+// successor, v is inserted directly at that spot. If the key equals
+// position's key, position is returned and nothing is inserted. In every
+// other case (including an empty tree) this falls back to insert_unique(v).
+template <typename P>
+inline typename btree<P>::iterator
+btree<P>::insert_unique(iterator position, const value_type &v) {
+ if (!empty()) {
+ const key_type &key = params_type::key(v);
+ if (position == end() || compare_keys(key, position.key())) {
+ iterator prev = position;
+ // Note: --prev is only evaluated when position is not begin().
+ if (position == begin() || compare_keys((--prev).key(), key)) {
+ // prev.key() < key < position.key()
+ return internal_insert(position, v);
+ }
+ } else if (compare_keys(position.key(), key)) {
+ iterator next = position;
+ ++next;
+ if (next == end() || compare_keys(key, next.key())) {
+ // position.key() < key < next.key()
+ return internal_insert(next, v);
+ }
+ } else {
+ // position.key() == key
+ return position;
+ }
+ }
+ return insert_unique(v).first;
+}
+
+// Inserts every value in [b, e), using end() as the hint for each insertion.
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_unique(InputIterator b, InputIterator e) {
+  while (b != e) {
+    insert_unique(end(), *b);
+    ++b;
+  }
+}
+
+// Inserts *value at the upper bound of key, i.e. after any existing elements
+// with an equal key. Creates the root leaf first if the tree is empty.
+template <typename P> template <typename ValuePointer>
+typename btree<P>::iterator
+btree<P>::insert_multi(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);
+  }
+
+  // A null-node result means no element is greater than key: insert at end().
+  iterator where =
+      internal_end(internal_upper_bound(key, iterator(root(), 0)));
+  return internal_insert(where, *value);
+}
+
+// Hinted multi insert. If v's key fits between the element before position
+// and position itself (equal keys allowed on both sides), or between
+// position and its successor, v is inserted directly at that spot. Otherwise
+// (including an empty tree) this falls back to insert_multi(v).
+template <typename P>
+typename btree<P>::iterator
+btree<P>::insert_multi(iterator position, const value_type &v) {
+ if (!empty()) {
+ const key_type &key = params_type::key(v);
+ if (position == end() || !compare_keys(position.key(), key)) {
+ iterator prev = position;
+ // Note: --prev is only evaluated when position is not begin().
+ if (position == begin() || !compare_keys(key, (--prev).key())) {
+ // prev.key() <= key <= position.key()
+ return internal_insert(position, v);
+ }
+ } else {
+ iterator next = position;
+ ++next;
+ if (next == end() || !compare_keys(next.key(), key)) {
+ // position.key() < key <= next.key()
+ return internal_insert(next, v);
+ }
+ }
+ }
+ return insert_multi(v);
+}
+
+// Inserts every value in [b, e), using end() as the hint for each insertion.
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_multi(InputIterator b, InputIterator e) {
+  while (b != e) {
+    insert_multi(end(), *b);
+    ++b;
+  }
+}
+
+// Replaces the contents of *this with a copy of x, also adopting x's
+// comparator and allocator. Assigning a tree to itself is a no-op: without
+// the guard, clear() would empty the tree before anything could be copied
+// out of it.
+template <typename P>
+void btree<P>::assign(const self_type &x) {
+  if (&x == this) {
+    return;
+  }
+
+  clear();
+
+  *mutable_key_comp() = x.key_comp();
+  *mutable_internal_allocator() = x.internal_allocator();
+
+  // Assignment can avoid key comparisons because we know the order of the
+  // values is the same order we'll store them in.
+  for (const_iterator iter = x.begin(); iter != x.end(); ++iter) {
+    if (empty()) {
+      // The first value goes through insert_multi(), which creates the root.
+      insert_multi(*iter);
+    } else {
+      // If the btree is not empty, we can just insert the new value at the end
+      // of the tree!
+      internal_insert(end(), *iter);
+    }
+  }
+}
+
+// Erases the element iter points at and returns an iterator to the element
+// that followed it (end() if the tree became empty). The value is always
+// physically removed from a leaf: a value on an internal node is first
+// swapped with its in-order predecessor. Afterwards under-full nodes are
+// merged or rebalanced on the way back up towards the root.
+template <typename P>
+typename btree<P>::iterator btree<P>::erase(iterator iter) {
+ bool internal_delete = false;
+ if (!iter.node->leaf()) {
+ // Deletion of a value on an internal node. Swap the key with the largest
+ // value of our left child. This is easy, we just decrement iter.
+ iterator tmp_iter(iter--);
+ ceph_assert(iter.node->leaf());
+ ceph_assert(!compare_keys(tmp_iter.key(), iter.key()));
+ iter.node->value_swap(iter.position, tmp_iter.node, tmp_iter.position);
+ internal_delete = true;
+ --*mutable_size();
+ } else if (!root()->leaf()) {
+ // The stored size only exists when the root is an internal node.
+ --*mutable_size();
+ }
+
+ // Delete the key from the leaf.
+ iter.node->remove_value(iter.position);
+
+ // We want to return the next value after the one we just erased. If we
+ // erased from an internal node (internal_delete == true), then the next
+ // value is ++(++iter). If we erased from a leaf node (internal_delete ==
+ // false) then the next value is ++iter. Note that ++iter may point to an
+ // internal node and the value in the internal node may move to a leaf node
+ // (iter.node) when rebalancing is performed at the leaf level.
+
+ // Merge/rebalance as we walk back up the tree.
+ iterator res(iter);
+ for (;;) {
+ if (iter.node == root()) {
+ try_shrink();
+ if (empty()) {
+ return end();
+ }
+ break;
+ }
+ if (iter.node->count() >= kMinNodeValues) {
+ break;
+ }
+ bool merged = try_merge_or_rebalance(&iter);
+ // Only track the position while iter is still at the leaf level.
+ if (iter.node->leaf()) {
+ res = iter;
+ }
+ if (!merged) {
+ break;
+ }
+ iter.node = iter.node->parent();
+ }
+
+ // Adjust our return value. If we're pointing at the end of a node, advance
+ // the iterator.
+ if (res.position == res.node->count()) {
+ res.position = res.node->count() - 1;
+ ++res;
+ }
+ // If we erased from an internal node, advance the iterator.
+ if (internal_delete) {
+ ++res;
+ }
+ return res;
+}
+
+template <typename P>
+int btree<P>::erase(iterator begin, iterator end) {
+  // Erases [begin, end) and returns how many elements were removed.
+  // The count is taken up front; each erase() result becomes the next target.
+  const int total = distance(begin, end);
+  for (int left = total; left > 0; --left) begin = erase(begin);
+  return total;
+}
+
+template <typename P>
+int btree<P>::erase_unique(const key_type &key) {
+  // Returns the number of elements removed: 1 if key was found, otherwise 0.
+  iterator hit = internal_find_unique(key, iterator(root(), 0));
+  if (hit.node) {
+    erase(hit);
+    return 1;
+  }
+  return 0;  // a null node means the key is not in the tree
+}
+
+template <typename P>
+int btree<P>::erase_multi(const key_type &key) {
+  // Returns how many elements were erased.
+  const iterator root_start(root(), 0);
+  iterator first = internal_lower_bound(key, root_start);
+  // A null node means the lower-bound search found nothing to erase.
+  if (!first.node) return 0;
+
+  // Erase the half-open range [lower_bound(key), upper_bound(key)).
+  iterator last = internal_end(internal_upper_bound(key, root_start));
+  return erase(first, last);
+}
+
+template <typename P>
+void btree<P>::clear() {
+  // Release every node under the root (if there is one), then null the root.
+  node_type *top = root();
+  if (top) internal_clear(top);
+  *mutable_root() = NULL;
+}
+
+template <typename P>
+void btree<P>::swap(self_type &x) {  // Exchanges the state of *this and x.
+ std::swap(static_cast<key_compare&>(*this), static_cast<key_compare&>(x));  // swap the key_compare part of each tree
+ std::swap(root_, x.root_);  // swap the root_ members
+}
+
+template <typename P>
+void btree<P>::verify() const {
+  if (root() == NULL) {  // no root: size must be zero and no edge nodes cached
+    ceph_assert(size() == 0);
+    ceph_assert(leftmost() == NULL);
+    ceph_assert(rightmost() == NULL);
+    return;
+  }
+  ceph_assert(size() == internal_verify(root(), NULL, NULL));  // stored size == recursive recount
+  ceph_assert(leftmost() == (++const_iterator(root(), -1)).node);
+  ceph_assert(rightmost() == (--const_iterator(root(), root()->count())).node);
+  ceph_assert(leftmost()->leaf());
+  ceph_assert(rightmost()->leaf());
+}
+
+template <typename P>
+void btree<P>::rebalance_or_split(iterator *iter) {  // Makes room for one insertion at *iter, whose node is full; *iter is updated to the slot to use.
+ node_type *&node = iter->node;  // references: changes below are visible to the caller through *iter
+ int &insert_position = iter->position;
+ ceph_assert(node->count() == node->max_count());
+
+ // First try to make room on the node by rebalancing.
+ node_type *parent = node->parent();
+ if (node != root()) {
+ if (node->position() > 0) {
+ // Try rebalancing with our left sibling.
+ node_type *left = parent->child(node->position() - 1);
+ if (left->count() < left->max_count()) {
+ // We bias rebalancing based on the position being inserted. If we're
+ // inserting at the end of the right node then we bias rebalancing to
+ // fill up the left node.
+ int to_move = (left->max_count() - left->count()) /
+ (1 + (insert_position < left->max_count()));  // divisor is 2 unless inserting at the very end, then 1
+ to_move = std::max(1, to_move);
+
+ if (((insert_position - to_move) >= 0) ||
+ ((left->count() + to_move) < left->max_count())) {
+ left->rebalance_right_to_left(node, to_move);
+
+ ceph_assert(node->max_count() - node->count() == to_move);
+ insert_position = insert_position - to_move;
+ if (insert_position < 0) {  // the insertion point moved into the left sibling
+ insert_position = insert_position + left->count() + 1;
+ node = left;
+ }
+
+ ceph_assert(node->count() < node->max_count());
+ return;
+ }
+ }
+ }
+
+ if (node->position() < parent->count()) {
+ // Try rebalancing with our right sibling.
+ node_type *right = parent->child(node->position() + 1);
+ if (right->count() < right->max_count()) {
+ // We bias rebalancing based on the position being inserted. If we're
+ // inserting at the beginning of the left node then we bias rebalancing
+ // to fill up the right node.
+ int to_move = (right->max_count() - right->count()) /
+ (1 + (insert_position > 0));  // divisor is 2 unless inserting at position 0, then 1
+ to_move = std::max(1, to_move);
+
+ if ((insert_position <= (node->count() - to_move)) ||
+ ((right->count() + to_move) < right->max_count())) {
+ node->rebalance_left_to_right(right, to_move);
+
+ if (insert_position > node->count()) {  // the insertion point moved into the right sibling
+ insert_position = insert_position - node->count() - 1;
+ node = right;
+ }
+
+ ceph_assert(node->count() < node->max_count());
+ return;
+ }
+ }
+ }
+
+ // Rebalancing failed, make sure there is room on the parent node for a new
+ // value.
+ if (parent->count() == parent->max_count()) {
+ iterator parent_iter(node->parent(), node->position());
+ rebalance_or_split(&parent_iter);  // recurse on a local iterator; only the tree itself is modified
+ }
+ } else {
+ // Rebalancing not possible because this is the root node.
+ if (root()->leaf()) {
+ // The root node is currently a leaf node: create a new root node and set
+ // the current root node as the child of the new root.
+ parent = new_internal_root_node();
+ parent->set_child(0, root());
+ *mutable_root() = parent;
+ ceph_assert(*mutable_rightmost() == parent->child(0));
+ } else {
+ // The root node is an internal node. We do not want to create a new root
+ // node because the root node is special and holds the size of the tree
+ // and a pointer to the rightmost node. So we create a new internal node
+ // and move all of the items on the current root into the new node.
+ parent = new_internal_node(parent);  // NOTE(review): the argument is still root()->parent() at this point
+ parent->set_child(0, parent);  // NOTE(review): the new node is set as its own child 0 before the swap; confirm intent against node swap()
+ parent->swap(root());
+ node = parent;
+ }
+ }
+
+ // Split the node.
+ node_type *split_node;
+ if (node->leaf()) {
+ split_node = new_leaf_node(parent);
+ node->split(split_node, insert_position);
+ if (rightmost() == node) {  // keep the cached rightmost leaf current
+ *mutable_rightmost() = split_node;
+ }
+ } else {
+ split_node = new_internal_node(parent);
+ node->split(split_node, insert_position);
+ }
+
+ if (insert_position > node->count()) {  // the insertion point ended up in the newly created node
+ insert_position = insert_position - node->count() - 1;
+ node = split_node;
+ }
+}
+
+template <typename P>
+void btree<P>::merge_nodes(node_type *left, node_type *right) {
+  // Merge right into left via node merge(), then release the right node.
+  left->merge(right);
+  if (!right->leaf()) {
+    delete_internal_node(right);
+    return;
+  }
+  // Keep the cached rightmost leaf valid if it is the node being freed.
+  if (rightmost() == right) *mutable_rightmost() = left;
+  delete_leaf_node(right);
+}
+
+template <typename P>
+bool btree<P>::try_merge_or_rebalance(iterator *iter) {  // Repairs an under-full iter->node using a sibling; returns true only when a merge was performed.
+ node_type *parent = iter->node->parent();
+ if (iter->node->position() > 0) {
+ // Try merging with our left sibling.
+ node_type *left = parent->child(iter->node->position() - 1);
+ if ((1 + left->count() + iter->node->count()) <= left->max_count()) {  // both nodes plus one extra value must fit
+ iter->position += 1 + left->count();  // re-express the position relative to the merged (left) node
+ merge_nodes(left, iter->node);
+ iter->node = left;
+ return true;
+ }
+ }
+ if (iter->node->position() < parent->count()) {
+ // Try merging with our right sibling.
+ node_type *right = parent->child(iter->node->position() + 1);
+ if ((1 + iter->node->count() + right->count()) <= right->max_count()) {
+ merge_nodes(iter->node, right);  // iter->node survives, so iter needs no adjustment
+ return true;
+ }
+ // Try rebalancing with our right sibling. We don't perform rebalancing if
+ // we deleted the first element from iter->node and the node is not
+ // empty. This is a small optimization for the common pattern of deleting
+ // from the front of the tree.
+ if ((right->count() > kMinNodeValues) &&
+ ((iter->node->count() == 0) ||
+ (iter->position > 0))) {
+ int to_move = (right->count() - iter->node->count()) / 2;  // aim to even out the two nodes
+ to_move = std::min(to_move, right->count() - 1);  // always leave at least one value in right
+ iter->node->rebalance_right_to_left(right, to_move);
+ return false;
+ }
+ }
+ if (iter->node->position() > 0) {
+ // Try rebalancing with our left sibling. We don't perform rebalancing if
+ // we deleted the last element from iter->node and the node is not
+ // empty. This is a small optimization for the common pattern of deleting
+ // from the back of the tree.
+ node_type *left = parent->child(iter->node->position() - 1);
+ if ((left->count() > kMinNodeValues) &&
+ ((iter->node->count() == 0) ||
+ (iter->position < iter->node->count()))) {
+ int to_move = (left->count() - iter->node->count()) / 2;
+ to_move = std::min(to_move, left->count() - 1);  // always leave at least one value in left
+ left->rebalance_left_to_right(iter->node, to_move);
+ iter->position += to_move;  // values were added in front of the current position
+ return false;
+ }
+ }
+ return false;  // nothing could be merged or rebalanced
+}
+
+template <typename P>
+void btree<P>::try_shrink() {  // Reduces the height of the tree when the root holds no values; otherwise does nothing.
+ if (root()->count() > 0) {
+ return;
+ }
+ // Deleted the last item on the root node, shrink the height of the tree.
+ if (root()->leaf()) {
+ ceph_assert(size() == 0);
+ delete_leaf_node(root());
+ *mutable_root() = NULL;  // the tree is now empty
+ } else {
+ node_type *child = root()->child(0);  // the only child left under a value-less internal root
+ if (child->leaf()) {
+ // The child is a leaf node so simply make it the root node in the tree.
+ child->make_root();
+ delete_internal_root_node();
+ *mutable_root() = child;
+ } else {
+ // The child is an internal node. We want to keep the existing root node
+ // so we move all of the values from the child node into the existing
+ // (empty) root node.
+ child->swap(root());
+ delete_internal_node(child);  // child now holds the old, empty root contents
+ }
+ }
+}
+
+template <typename P> template <typename IterType>
+inline IterType btree<P>::internal_last(IterType iter) {
+  // Climb while iter is past its node's last value; landing on a leaf gives a null node.
+  for (;;) {
+    if (!iter.node || iter.position != iter.node->count()) break;
+    iter.position = iter.node->position();
+    iter.node = iter.node->parent();
+    if (iter.node->leaf()) iter.node = NULL;
+  }
+  return iter;
+}
+
+template <typename P>
+inline typename btree<P>::iterator
+btree<P>::internal_insert(iterator iter, const value_type &v) {  // Inserts v at iter without any key comparison; returns an iterator to the new value.
+ if (!iter.node->leaf()) {
+ // We can't insert on an internal node. Instead, we'll insert after the
+ // previous value which is guaranteed to be on a leaf node.
+ --iter;
+ ++iter.position;
+ }
+ if (iter.node->count() == iter.node->max_count()) {
+ // Make room in the leaf for the new item.
+ if (iter.node->max_count() < kNodeValues) {
+ // Insertion into the root where the root is smaller than the full node
+ // size. Simply grow the size of the root node.
+ ceph_assert(iter.node == root());
+ iter.node = new_leaf_root_node(
+ std::min<int>(kNodeValues, 2 * iter.node->max_count()));  // double the capacity, capped at kNodeValues
+ iter.node->swap(root());  // move the old root's contents into the larger node
+ delete_leaf_node(root());
+ *mutable_root() = iter.node;
+ } else {
+ rebalance_or_split(&iter);  // may redirect iter to a sibling or a newly split node
+ ++*mutable_size();
+ }
+ } else if (!root()->leaf()) {  // the explicit size counter is only touched while the root is an internal node
+ ++*mutable_size();
+ }
+ iter.node->insert_value(iter.position, v);
+ return iter;
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate(
+ const key_type &key, IterType iter) const {  // Locates key starting at iter; returns (position, match indicator).
+ return internal_locate_type::dispatch(key, *this, iter);  // internal_locate_type selects the implementation to run
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_plain_compare(
+    const key_type &key, IterType iter) const {
+  // Descend from iter.node to a leaf, taking the lower-bound slot at each
+  // level. The second member of the result is always 0.
+  iter.position = iter.node->lower_bound(key, key_comp());
+  while (!iter.node->leaf()) {
+    iter.node = iter.node->child(iter.position);
+    iter.position = iter.node->lower_bound(key, key_comp());
+  }
+  return std::make_pair(iter, 0);
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_compare_to(
+    const key_type &key, IterType iter) const {
+  // lower_bound() packs a position (kMatchMask bits) and an exact-match flag
+  // (kExactMatch) into one int. Stop at the first exact match; otherwise
+  // descend to a leaf and report -kExactMatch.
+  for (;; iter.node = iter.node->child(iter.position)) {
+    const int found = iter.node->lower_bound(key, key_comp());
+    iter.position = found & kMatchMask;
+    if (found & kExactMatch) {
+      return std::make_pair(iter, static_cast<int>(kExactMatch));
+    }
+    if (iter.node->leaf()) break;
+  }
+  return std::make_pair(iter, -kExactMatch);
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_lower_bound(
+    const key_type &key, IterType iter) const {
+  // A null starting node is handed back untouched.
+  if (!iter.node) {
+    return iter;
+  }
+  // Walk down to a leaf via the lower-bound slot of each node (masking the
+  // result with kMatchMask), then let internal_last() step past a node's end.
+  iter.position = iter.node->lower_bound(key, key_comp()) & kMatchMask;
+  while (!iter.node->leaf()) {
+    iter.node = iter.node->child(iter.position);
+    iter.position = iter.node->lower_bound(key, key_comp()) & kMatchMask;
+  }
+  return internal_last(iter);
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_upper_bound(
+    const key_type &key, IterType iter) const {
+  // A null starting node is handed back untouched.
+  if (!iter.node) {
+    return iter;
+  }
+  // Walk down to a leaf via each node's upper-bound slot, then normalize.
+  iter.position = iter.node->upper_bound(key, key_comp());
+  while (!iter.node->leaf()) {
+    iter.node = iter.node->child(iter.position);
+    iter.position = iter.node->upper_bound(key, key_comp());
+  }
+  return internal_last(iter);
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_unique(
+    const key_type &key, IterType iter) const {
+  IterType not_found(NULL, 0);
+  if (!iter.node) return not_found;
+
+  // second == kExactMatch: the locate step already established equality.
+  // second == 0: only a lower bound is known, so check that key is not
+  // less than the element found. Any other value means "no match".
+  const std::pair<IterType, int> located = internal_locate(key, iter);
+  if (located.second == kExactMatch) return located.first;
+  if (located.second != 0) return not_found;
+
+  iter = internal_last(located.first);
+  const bool hit = iter.node && !compare_keys(key, iter.key());
+  return hit ? iter : not_found;
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_multi(
+    const key_type &key, IterType iter) const {
+  // Returns the lower bound of key if it compares equal, else a null iterator.
+  if (iter.node) {
+    iter = internal_lower_bound(key, iter);
+    if (iter.node) iter = internal_last(iter);
+    // The lower bound is not less than key; it matches only if key is not less than it either.
+    if (iter.node && !compare_keys(key, iter.key())) {
+      return iter;
+    }
+  }
+  return IterType(NULL, 0);
+}
+
+template <typename P>
+void btree<P>::internal_clear(node_type *node) {
+  // Post-order teardown: a leaf is freed directly; an internal node first
+  // clears its count() + 1 children and is then freed itself.
+  if (node->leaf()) {
+    delete_leaf_node(node);
+    return;
+  }
+  for (int i = 0; i <= node->count(); ++i) internal_clear(node->child(i));
+  if (node == root()) {
+    delete_internal_root_node();
+  } else {
+    delete_internal_node(node);
+  }
+}
+
+template <typename P>
+void btree<P>::internal_dump(
+    std::ostream &os, const node_type *node, int level) const {
+  // In-order dump: every key goes on its own line, prefixed by `level`
+  // copies of " " and followed by " [level]"; children are dumped around it.
+  const bool has_children = !node->leaf();
+  const int n = node->count();
+  for (int i = 0; i < n; ++i) {
+    if (has_children) {
+      internal_dump(os, node->child(i), level + 1);
+    }
+    for (int pad = level; pad > 0; --pad) os << " ";
+    os << node->key(i) << " [" << level << "]\n";
+  }
+  if (has_children) internal_dump(os, node->child(n), level + 1);
+}
+
+template <typename P>
+int btree<P>::internal_verify(
+ const node_type *node, const key_type *lo, const key_type *hi) const {  // Checks the subtree at node and returns the number of values it holds; lo/hi (may be NULL) bound its keys.
+ ceph_assert(node->count() > 0);
+ ceph_assert(node->count() <= node->max_count());
+ if (lo) {
+ ceph_assert(!compare_keys(node->key(0), *lo));  // first key is not less than the lower bound
+ }
+ if (hi) {
+ ceph_assert(!compare_keys(*hi, node->key(node->count() - 1)));  // last key is not greater than the upper bound
+ }
+ for (int i = 1; i < node->count(); ++i) {
+ ceph_assert(!compare_keys(node->key(i), node->key(i - 1)));  // keys are in non-decreasing order
+ }
+ int count = node->count();
+ if (!node->leaf()) {
+ for (int i = 0; i <= node->count(); ++i) {
+ ceph_assert(node->child(i) != NULL);
+ ceph_assert(node->child(i)->parent() == node);
+ ceph_assert(node->child(i)->position() == i);
+ count += internal_verify(
+ node->child(i),
+ (i == 0) ? lo : &node->key(i - 1),  // child i is bounded below by key i - 1 (or the inherited lo)
+ (i == node->count()) ? hi : &node->key(i));  // and above by key i (or the inherited hi)
+ }
+ }
+ return count;
+}
+
+} // namespace btree
+
+#endif // UTIL_BTREE_BTREE_H__
diff --git a/src/include/cpp-btree/btree_container.h b/src/include/cpp-btree/btree_container.h
new file mode 100644
index 00000000..fb617abe
--- /dev/null
+++ b/src/include/cpp-btree/btree_container.h
@@ -0,0 +1,349 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_BTREE_BTREE_CONTAINER_H__
+#define UTIL_BTREE_BTREE_CONTAINER_H__
+
+#include <iosfwd>
+#include <utility>
+
+#include "btree.h"
+
+namespace btree {
+
+// A common base class for btree_set, btree_map, btree_multiset and
+// btree_multimap: it owns the Tree and forwards the operations they share.
+template <typename Tree>
+class btree_container {
+ typedef btree_container<Tree> self_type;
+
+ public:
+ typedef typename Tree::params_type params_type;
+ typedef typename Tree::key_type key_type;
+ typedef typename Tree::value_type value_type;
+ typedef typename Tree::key_compare key_compare;
+ typedef typename Tree::allocator_type allocator_type;
+ typedef typename Tree::pointer pointer;
+ typedef typename Tree::const_pointer const_pointer;
+ typedef typename Tree::reference reference;
+ typedef typename Tree::const_reference const_reference;
+ typedef typename Tree::size_type size_type;
+ typedef typename Tree::difference_type difference_type;
+ typedef typename Tree::iterator iterator;
+ typedef typename Tree::const_iterator const_iterator;
+ typedef typename Tree::reverse_iterator reverse_iterator;
+ typedef typename Tree::const_reverse_iterator const_reverse_iterator;
+
+ public:
+ // Constructs the underlying tree from a comparator and an allocator (no defaults at this level).
+ btree_container(const key_compare &comp, const allocator_type &alloc)
+ : tree_(comp, alloc) {
+ }
+
+ // Copy constructor: copy-constructs the underlying tree from x's tree.
+ btree_container(const self_type &x)
+ : tree_(x.tree_) {
+ }
+
+ // Iterator routines, forwarded to the tree.
+ iterator begin() { return tree_.begin(); }
+ const_iterator begin() const { return tree_.begin(); }
+ iterator end() { return tree_.end(); }
+ const_iterator end() const { return tree_.end(); }
+ reverse_iterator rbegin() { return tree_.rbegin(); }
+ const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+ reverse_iterator rend() { return tree_.rend(); }
+ const_reverse_iterator rend() const { return tree_.rend(); }
+
+ // Lookup routines, forwarded to the tree.
+ iterator lower_bound(const key_type &key) {
+ return tree_.lower_bound(key);
+ }
+ const_iterator lower_bound(const key_type &key) const {
+ return tree_.lower_bound(key);
+ }
+ iterator upper_bound(const key_type &key) {
+ return tree_.upper_bound(key);
+ }
+ const_iterator upper_bound(const key_type &key) const {
+ return tree_.upper_bound(key);
+ }
+ std::pair<iterator,iterator> equal_range(const key_type &key) {
+ return tree_.equal_range(key);
+ }
+ std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+ return tree_.equal_range(key);
+ }
+
+ // Utility routines, forwarded to the tree.
+ void clear() {
+ tree_.clear();
+ }
+ void swap(self_type &x) {
+ tree_.swap(x.tree_);
+ }
+ void dump(std::ostream &os) const {
+ tree_.dump(os);
+ }
+ void verify() const {
+ tree_.verify();
+ }
+
+ // Size and statistics routines, forwarded to the tree.
+ size_type size() const { return tree_.size(); }
+ size_type max_size() const { return tree_.max_size(); }
+ bool empty() const { return tree_.empty(); }
+ size_type height() const { return tree_.height(); }
+ size_type internal_nodes() const { return tree_.internal_nodes(); }
+ size_type leaf_nodes() const { return tree_.leaf_nodes(); }
+ size_type nodes() const { return tree_.nodes(); }
+ size_type bytes_used() const { return tree_.bytes_used(); }
+ static double average_bytes_per_value() {
+ return Tree::average_bytes_per_value();
+ }
+ double fullness() const { return tree_.fullness(); }
+ double overhead() const { return tree_.overhead(); }
+
+ bool operator==(const self_type& x) const {  // equal when sizes match and elements compare equal pairwise, in iteration order
+ if (size() != x.size()) {
+ return false;
+ }
+ for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi) {  // xi needs no end check: the sizes are equal
+ if (*i != *xi) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool operator!=(const self_type& other) const {
+ return !operator==(other);
+ }
+
+
+ protected:
+ Tree tree_;  // the underlying tree; derived containers reach it as this->tree_
+};
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const btree_container<T> &b) {  // Streams a container by calling its dump().
+ b.dump(os);
+ return os;  // returned so that insertions can be chained
+}
+
+// A common base class for btree_set and safe_btree_set (unique keys); btree_map_container derives from it too.
+template <typename Tree>
+class btree_unique_container : public btree_container<Tree> {
+ typedef btree_unique_container<Tree> self_type;
+ typedef btree_container<Tree> super_type;
+
+ public:
+ typedef typename Tree::key_type key_type;
+ typedef typename Tree::value_type value_type;
+ typedef typename Tree::size_type size_type;
+ typedef typename Tree::key_compare key_compare;
+ typedef typename Tree::allocator_type allocator_type;
+ typedef typename Tree::iterator iterator;
+ typedef typename Tree::const_iterator const_iterator;
+
+ public:
+ // Default constructor; comparator and allocator default to value-initialized instances.
+ btree_unique_container(const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ }
+
+ // Copy constructor.
+ btree_unique_container(const self_type &x)
+ : super_type(x) {
+ }
+
+ // Range constructor: builds an empty container, then inserts [b, e).
+ template <class InputIterator>
+ btree_unique_container(InputIterator b, InputIterator e,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ insert(b, e);
+ }
+
+ // Lookup routines, forwarded to the tree's *_unique operations.
+ iterator find(const key_type &key) {
+ return this->tree_.find_unique(key);
+ }
+ const_iterator find(const key_type &key) const {
+ return this->tree_.find_unique(key);
+ }
+ size_type count(const key_type &key) const {
+ return this->tree_.count_unique(key);
+ }
+
+ // Insertion routines, forwarded to the tree's insert_unique overloads.
+ std::pair<iterator,bool> insert(const value_type &x) {
+ return this->tree_.insert_unique(x);
+ }
+ iterator insert(iterator position, const value_type &x) {  // `position` is passed through to the tree's positional insert_unique
+ return this->tree_.insert_unique(position, x);
+ }
+ template <typename InputIterator>
+ void insert(InputIterator b, InputIterator e) {
+ this->tree_.insert_unique(b, e);
+ }
+
+ // Deletion routines. erase(key) returns the tree's erase_unique() result.
+ int erase(const key_type &key) {
+ return this->tree_.erase_unique(key);
+ }
+ // Erase the specified iterator from the btree. The iterator must be valid
+ // (i.e. not equal to end()). Return an iterator pointing to the node after
+ // the one that was erased (or end() if none exists).
+ iterator erase(const iterator &iter) {
+ return this->tree_.erase(iter);
+ }
+ void erase(const iterator &first, const iterator &last) {  // erases [first, last); the tree's count result is discarded
+ this->tree_.erase(first, last);
+ }
+};
+
+// A common base class for btree_map and safe_btree_map: adds operator[] on top of btree_unique_container.
+template <typename Tree>
+class btree_map_container : public btree_unique_container<Tree> {
+ typedef btree_map_container<Tree> self_type;
+ typedef btree_unique_container<Tree> super_type;
+
+ public:
+ typedef typename Tree::key_type key_type;
+ typedef typename Tree::data_type data_type;
+ typedef typename Tree::value_type value_type;
+ typedef typename Tree::mapped_type mapped_type;
+ typedef typename Tree::key_compare key_compare;
+ typedef typename Tree::allocator_type allocator_type;
+
+ private:
+ // A pointer-like object which only generates its value when
+ // dereferenced. Used by operator[] to avoid constructing an empty data_type
+ // if the key already exists in the map.
+ struct generate_value {
+ generate_value(const key_type &k)
+ : key(k) {
+ }
+ value_type operator*() const {  // builds (key, data_type()) on demand
+ return std::make_pair(key, data_type());
+ }
+ const key_type &key;  // held by reference: the referenced key must outlive this object
+ };
+
+ public:
+ // Default constructor; comparator and allocator default to value-initialized instances.
+ btree_map_container(const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ }
+
+ // Copy constructor.
+ btree_map_container(const self_type &x)
+ : super_type(x) {
+ }
+
+ // Range constructor: forwards [b, e) to the base class range constructor.
+ template <class InputIterator>
+ btree_map_container(InputIterator b, InputIterator e,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(b, e, comp, alloc) {
+ }
+
+ // Insertion routines. operator[] returns a reference to the data stored under key.
+ data_type& operator[](const key_type &key) {
+ return this->tree_.insert_unique(key, generate_value(key)).first->second;  // .first is the element's iterator, ->second its data
+ }
+};
+
+// A common base class for btree_multiset and btree_multimap; it forwards to the tree's *_multi operations.
+template <typename Tree>
+class btree_multi_container : public btree_container<Tree> {
+ typedef btree_multi_container<Tree> self_type;
+ typedef btree_container<Tree> super_type;
+
+ public:
+ typedef typename Tree::key_type key_type;
+ typedef typename Tree::value_type value_type;
+ typedef typename Tree::size_type size_type;
+ typedef typename Tree::key_compare key_compare;
+ typedef typename Tree::allocator_type allocator_type;
+ typedef typename Tree::iterator iterator;
+ typedef typename Tree::const_iterator const_iterator;
+
+ public:
+ // Default constructor; comparator and allocator default to value-initialized instances.
+ btree_multi_container(const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ }
+
+ // Copy constructor.
+ btree_multi_container(const self_type &x)
+ : super_type(x) {
+ }
+
+ // Range constructor: builds an empty container, then inserts [b, e).
+ template <class InputIterator>
+ btree_multi_container(InputIterator b, InputIterator e,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ insert(b, e);
+ }
+
+ // Lookup routines, forwarded to the tree's *_multi operations.
+ iterator find(const key_type &key) {
+ return this->tree_.find_multi(key);
+ }
+ const_iterator find(const key_type &key) const {
+ return this->tree_.find_multi(key);
+ }
+ size_type count(const key_type &key) const {
+ return this->tree_.count_multi(key);
+ }
+
+ // Insertion routines, forwarded to the tree's insert_multi overloads.
+ iterator insert(const value_type &x) {
+ return this->tree_.insert_multi(x);
+ }
+ iterator insert(iterator position, const value_type &x) {  // `position` is a hint; the tree falls back to a search if it does not fit
+ return this->tree_.insert_multi(position, x);
+ }
+ template <typename InputIterator>
+ void insert(InputIterator b, InputIterator e) {
+ this->tree_.insert_multi(b, e);
+ }
+
+ // Deletion routines. erase(key) returns the number of elements erased.
+ int erase(const key_type &key) {
+ return this->tree_.erase_multi(key);
+ }
+ // Erase the specified iterator from the btree. The iterator must be valid
+ // (i.e. not equal to end()). Return an iterator pointing to the node after
+ // the one that was erased (or end() if none exists).
+ iterator erase(const iterator &iter) {
+ return this->tree_.erase(iter);
+ }
+ void erase(const iterator &first, const iterator &last) {  // erases [first, last); the tree's count result is discarded
+ this->tree_.erase(first, last);
+ }
+};
+
+} // namespace btree
+
+#endif // UTIL_BTREE_BTREE_CONTAINER_H__
diff --git a/src/include/cpp-btree/btree_map.h b/src/include/cpp-btree/btree_map.h
new file mode 100644
index 00000000..b83489f0
--- /dev/null
+++ b/src/include/cpp-btree/btree_map.h
@@ -0,0 +1,130 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree_map<> implements the STL unique sorted associative container
+// interface and the pair associative container interface (a.k.a map<>) using a
+// btree. A btree_multimap<> implements the STL multiple sorted associative
+// container interface and the pair associative container interface (a.k.a
+// multimap<>) using a btree. See btree.h for details of the btree
+// implementation and caveats.
+
+#ifndef UTIL_BTREE_BTREE_MAP_H__
+#define UTIL_BTREE_BTREE_MAP_H__
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// The btree_map class is needed mainly for its constructors; behavior comes from btree_map_container.
+template <typename Key, typename Value,
+ typename Compare = std::less<Key>,
+ typename Alloc = std::allocator<std::pair<const Key, Value> >,
+ int TargetNodeSize = 256>
+class btree_map : public btree_map_container<
+ btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {  // all five template arguments are forwarded to btree_map_params
+
+ typedef btree_map<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
+ typedef btree_map_params<
+ Key, Value, Compare, Alloc, TargetNodeSize> params_type;
+ typedef btree<params_type> btree_type;
+ typedef btree_map_container<btree_type> super_type;
+
+ public:
+ typedef typename btree_type::key_compare key_compare;
+ typedef typename btree_type::allocator_type allocator_type;
+
+ public:
+ // Default constructor; comparator and allocator default to value-initialized instances.
+ btree_map(const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ }
+
+ // Copy constructor.
+ btree_map(const self_type &x)
+ : super_type(x) {
+ }
+
+ // Range constructor: forwards [b, e) to the base class range constructor.
+ template <class InputIterator>
+ btree_map(InputIterator b, InputIterator e,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(b, e, comp, alloc) {
+ }
+};
+
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_map<K, V, C, A, N> &x,
+ btree_map<K, V, C, A, N> &y) {  // Non-member swap for btree_map.
+ x.swap(y);  // delegates to the member swap
+}
+
+// The btree_multimap class is needed mainly for its constructors; behavior comes from btree_multi_container.
+template <typename Key, typename Value,
+ typename Compare = std::less<Key>,
+ typename Alloc = std::allocator<std::pair<const Key, Value> >,
+ int TargetNodeSize = 256>
+class btree_multimap : public btree_multi_container<
+ btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {  // uses the same btree_map_params as btree_map
+
+ typedef btree_multimap<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
+ typedef btree_map_params<
+ Key, Value, Compare, Alloc, TargetNodeSize> params_type;
+ typedef btree<params_type> btree_type;
+ typedef btree_multi_container<btree_type> super_type;
+
+ public:
+ typedef typename btree_type::key_compare key_compare;
+ typedef typename btree_type::allocator_type allocator_type;
+ typedef typename btree_type::data_type data_type;
+ typedef typename btree_type::mapped_type mapped_type;
+
+ public:
+ // Default constructor; comparator and allocator default to value-initialized instances.
+ btree_multimap(const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ }
+
+ // Copy constructor.
+ btree_multimap(const self_type &x)
+ : super_type(x) {
+ }
+
+ // Range constructor: forwards [b, e) to the base class range constructor.
+ template <class InputIterator>
+ btree_multimap(InputIterator b, InputIterator e,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(b, e, comp, alloc) {
+ }
+};
+
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_multimap<K, V, C, A, N> &x,
+ btree_multimap<K, V, C, A, N> &y) {  // Non-member swap for btree_multimap.
+ x.swap(y);  // delegates to the member swap
+}
+
+} // namespace btree
+
+#endif // UTIL_BTREE_BTREE_MAP_H__
diff --git a/src/include/cpp-btree/btree_set.h b/src/include/cpp-btree/btree_set.h
new file mode 100644
index 00000000..f9b2e75d
--- /dev/null
+++ b/src/include/cpp-btree/btree_set.h
@@ -0,0 +1,121 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree_set<> implements the STL unique sorted associative container
+// interface (a.k.a set<>) using a btree. A btree_multiset<> implements the STL
+// multiple sorted associative container interface (a.k.a multiset<>) using a
+// btree. See btree.h for details of the btree implementation and caveats.
+
+#ifndef UTIL_BTREE_BTREE_SET_H__
+#define UTIL_BTREE_BTREE_SET_H__
+
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// btree_set is a thin layer over btree_unique_container<btree<...>>;
+// it exists mainly to provide the constructors declared below.
+template <typename Key,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<Key>,
+          int TargetNodeSize = 256>
+class btree_set : public btree_unique_container<
+  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
+
+  using params_type = btree_set_params<Key, Compare, Alloc, TargetNodeSize>;
+  using btree_type = btree<params_type>;
+  using super_type = btree_unique_container<btree_type>;
+  using self_type = btree_set<Key, Compare, Alloc, TargetNodeSize>;
+
+ public:
+  using key_compare = typename btree_type::key_compare;
+  using allocator_type = typename btree_type::allocator_type;
+
+ public:
+  // Builds an empty set using the given comparator and allocator
+  // (both default-constructed when omitted).
+  btree_set(const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+    : super_type(comp, alloc) {
+  }
+
+  // Copies another btree_set of the same type.
+  btree_set(const self_type &x)
+    : super_type(x) {
+  }
+
+  // Builds a set from the iterator range [b, e).
+  template <class InputIterator>
+  btree_set(InputIterator b, InputIterator e,
+            const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+    : super_type(b, e, comp, alloc) {
+  }
+};
+
+// Free-function swap for btree_set: delegates to the member swap().
+template <typename K, typename C, typename A, int N>
+inline void swap(btree_set<K, C, A, N> &lhs, btree_set<K, C, A, N> &rhs) {
+  lhs.swap(rhs);
+}
+
+// btree_multiset is a thin layer over btree_multi_container<btree<...>>;
+// it exists mainly to provide the constructors declared below.
+template <typename Key,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<Key>,
+          int TargetNodeSize = 256>
+class btree_multiset : public btree_multi_container<
+  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
+
+  using params_type = btree_set_params<Key, Compare, Alloc, TargetNodeSize>;
+  using btree_type = btree<params_type>;
+  using super_type = btree_multi_container<btree_type>;
+  using self_type = btree_multiset<Key, Compare, Alloc, TargetNodeSize>;
+
+ public:
+  using key_compare = typename btree_type::key_compare;
+  using allocator_type = typename btree_type::allocator_type;
+
+ public:
+  // Builds an empty multiset using the given comparator and allocator
+  // (both default-constructed when omitted).
+  btree_multiset(const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+    : super_type(comp, alloc) {
+  }
+
+  // Copies another btree_multiset of the same type.
+  btree_multiset(const self_type &x)
+    : super_type(x) {
+  }
+
+  // Builds a multiset from the iterator range [b, e).
+  template <class InputIterator>
+  btree_multiset(InputIterator b, InputIterator e,
+                 const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+    : super_type(b, e, comp, alloc) {
+  }
+};
+
+// Free-function swap for btree_multiset: delegates to the member swap().
+template <typename K, typename C, typename A, int N>
+inline void swap(btree_multiset<K, C, A, N> &lhs,
+                 btree_multiset<K, C, A, N> &rhs) {
+  lhs.swap(rhs);
+}
+
+} // namespace btree
+
+#endif // UTIL_BTREE_BTREE_SET_H__
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
new file mode 100644
index 00000000..dd4ede66
--- /dev/null
+++ b/src/include/crc32c.h
@@ -0,0 +1,57 @@
+#ifndef CEPH_CRC32C_H
+#define CEPH_CRC32C_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * signature of a crc32c routine: takes the running crc, a pointer to the
+ * data and the data length, and returns a 32-bit crc value.
+ */
+typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
+
+/*
+ * this is a static global with the chosen crc32c implementation for
+ * the given architecture.
+ */
+extern ceph_crc32c_func_t ceph_crc32c_func;
+
+/*
+ * returns a pointer to a crc32c routine.
+ * NOTE(review): presumably this is what selects the value stored in
+ * ceph_crc32c_func -- confirm against the definition.
+ */
+extern ceph_crc32c_func_t ceph_choose_crc32(void);
+
+/**
+ * calculate crc32c for data that is entirely 0 (ZERO)
+ *
+ * Note: works the same as ceph_crc32c_func for data == nullptr,
+ * but faster than the optimized assembly on certain architectures.
+ * This is faster than intel optimized assembly, but not as fast as
+ * ppc64le optimized assembly.
+ *
+ * @param crc initial value
+ * @param length length of buffer
+ */
+uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
+
+/**
+ * calculate crc32c over a buffer
+ *
+ * Note: a NULL data pointer is treated as a zero-filled buffer of the
+ * given length. Unless HAVE_POWER8 is defined, such NULL inputs longer
+ * than 16 bytes are handed to ceph_crc32c_zeros(); everything else goes
+ * through ceph_crc32c_func.
+ *
+ * @param crc initial value
+ * @param data pointer to data buffer (may be NULL)
+ * @param length length of buffer
+ */
+static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
+{
+#ifndef HAVE_POWER8
+  if (data == NULL && length > 16) {
+    /* long run of implicit zeros: use the dedicated routine */
+    return ceph_crc32c_zeros(crc, length);
+  }
+#endif /* HAVE_POWER8 */
+
+  /* general case, including short NULL buffers */
+  return ceph_crc32c_func(crc, data, length);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/demangle.h b/src/include/demangle.h
new file mode 100644
index 00000000..9e46d952
--- /dev/null
+++ b/src/include/demangle.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INCLUDE_DEMANGLE
+#define CEPH_INCLUDE_DEMANGLE
+
+//// Stole this code from http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
+// std::string is used by both branches below, so include it unconditionally.
+#include <string>
+
+#ifdef __GNUG__
+#include <cstdlib>
+#include <memory>
+#include <cxxabi.h>
+
+// Demangle a C++ ABI symbol/type name (e.g. typeid(T).name()).
+// Returns the demangled text when abi::__cxa_demangle reports success
+// (status == 0); otherwise returns `name` unchanged.
+static std::string ceph_demangle(const char* name)
+{
+  int status = -4; // some arbitrary value to eliminate the compiler warning
+
+  // The demangler hands back a malloc()ed buffer; the unique_ptr with
+  // std::free as deleter releases it when we leave this function.
+  std::unique_ptr<char, void(*)(void*)> res {
+    abi::__cxa_demangle(name, nullptr, nullptr, &status),
+    std::free
+  };
+
+  return (status == 0) ? res.get() : name;
+}
+
+#else
+
+// does nothing if not g++: the name is returned as-is.
+// This fallback must carry the same name as the g++ version above so that
+// callers of ceph_demangle() build on every compiler.
+static std::string ceph_demangle(const char* name)
+{
+  return name;
+}
+
+// Previous spelling of the fallback; kept so that any existing non-g++
+// caller of demangle() keeps compiling.
+static std::string demangle(const char* name)
+{
+  return ceph_demangle(name);
+}
+
+#endif
+
+
+#endif
diff --git a/src/include/denc.h b/src/include/denc.h
new file mode 100644
index 00000000..a6a0fcaa
--- /dev/null
+++ b/src/include/denc.h
@@ -0,0 +1,1724 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// If you #include "include/encoding.h" you get the old-style *and*
+// the new-style definitions. (The old-style needs denc_traits<> in
+// order to disable the container helpers when new-style traits are
+// present.)
+
+// You can also just #include "include/denc.h" and get only the
+// new-style helpers. The eventual goal is to drop the legacy
+// definitions.
+
+#ifndef _ENC_DEC_H
+#define _ENC_DEC_H
+
+#include <array>
+#include <cstring>
+#include <map>
+#include <optional>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/optional.hpp>
+
+#include "include/ceph_assert.h" // boost clobbers this
+#include "include/intarith.h"
+#include "include/int_types.h"
+
+#include "buffer.h"
+#include "byteorder.h"
+
+#include "common/convenience.h"
+
+// Primary denc_traits template. It is the fallback chosen when no
+// specialization matches T, and it advertises the type as not supported.
+template<typename T, typename=void>
+struct denc_traits {
+  static constexpr bool need_contiguous = true;
+  static constexpr bool bounded = false;
+  static constexpr bool featured = false;
+  static constexpr bool supported = false;
+};
+
+// Shorthand for denc_traits<T>::supported.
+template<typename T>
+inline constexpr bool denc_supported = denc_traits<T>::supported;
+
+
+// hack for debug only; FIXME
+//#include <iostream>
+//using std::cout;
+
+// Define this to compile in a dump of all encoded objects to disk to
+// populate ceph-object-corpus. Note that there is an almost
+// identical implementation in encoding.h, but you only need to define
+// ENCODE_DUMP_PATH here.
+//
+// See src/test/encoding/generate-corpus-objects.sh.
+//
+//#define ENCODE_DUMP_PATH /tmp/something
+
+// When ENCODE_DUMP_PATH is defined, DENC_DUMP_PRE/DENC_DUMP_POST bracket an
+// encode and write the bytes produced in between to a file under that path.
+// Otherwise both macros expand to nothing (see the #else branch).
+#ifdef ENCODE_DUMP_PATH
+# include <cstdio>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+// Two-step stringification: ENCODE_STRINGIFY(x) expands x first and then
+// ENCODE_STR turns the expansion into a string literal.
+# define ENCODE_STR(x) #x
+# define ENCODE_STRINGIFY(x) ENCODE_STR(x)
+// Records the current position of the enclosing scope's `p` so that
+// DENC_DUMP_POST can tell where the encoded bytes start.
+# define DENC_DUMP_PRE(Type) \
+ char *__denc_dump_pre = p.get_pos();
+ // this hackery with bits below is just to get a semi-reasonable
+ // distribution across time. it is somewhat exponential but not
+ // quite.
+// DENC_DUMP_POST: bumps a per-expansion static counter `i`, counts the set
+// bits of `i`, and skips the dump when more than two bits are set. Otherwise
+// it writes the bytes between __denc_dump_pre and the current p.get_pos() to
+// "<ENCODE_DUMP_PATH>/<Type>__<pid>.<i in hex>". Note that `i` is
+// incremented a second time as the snprintf argument.
+// NOTE(review): PATH_MAX, getpid, ::write and ::close are not declared by the
+// headers included above -- presumably they arrive transitively; confirm.
+# define DENC_DUMP_POST(Type) \
+ do { \
+ static int i = 0; \
+ i++; \
+ int bits = 0; \
+ for (unsigned t = i; t; bits++) \
+ t &= t - 1; \
+ if (bits > 2) \
+ break; \
+ char fn[PATH_MAX]; \
+ snprintf(fn, sizeof(fn), \
+ ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #Type, \
+ getpid(), i++); \
+ int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644); \
+ if (fd >= 0) { \
+ size_t len = p.get_pos() - __denc_dump_pre; \
+ int r = ::write(fd, __denc_dump_pre, len); \
+ (void)r; \
+ ::close(fd); \
+ } \
+ } while (0)
+#else
+// Dumping disabled: both hooks compile away.
+# define DENC_DUMP_PRE(Type)
+# define DENC_DUMP_POST(Type)
+#endif
+
+
+/*
+
+ top level functions look like so
+ ================================
+
+ inline void denc(const T& o, size_t& p, uint64_t features=0);
+ inline void denc(const T& o, buffer::list::contiguous_appender& p,
+ uint64_t features=0);
+ inline void denc(T& o, buffer::ptr::const_iterator& p, uint64_t features=0);
+
+ or (for featured objects)
+
+ inline void denc(const T& o, size_t& p, uint64_t features);
+ inline void denc(const T& o, buffer::list::contiguous_appender& p,
+ uint64_t features);
+ inline void denc(T& o, buffer::ptr::const_iterator& p, uint64_t features);
+
+ - These are symmetrical, so that they can be used from the magic DENC
+ method of writing the bound_encode/encode/decode methods all in one go;
+ they differ only in the type of p.
+
+ - These are automatically fabricated via a template that calls into
+ the denc_traits<> methods (see below), provided denc_traits<T>::supported
+ is defined and true. They never need to be written explicitly.
+
+
+ static denc_traits<> definitions look like so
+ =============================================
+
+ template<>
+ struct denc_traits<T> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const T &o, size_t& p, uint64_t f=0);
+ static void encode(const T &o, buffer::list::contiguous_appender& p,
+ uint64_t f=0);
+ static void decode(T& o, buffer::ptr::const_iterator &p, uint64_t f=0);
+ };
+
+ or (for featured objects)
+
+ template<>
+ struct denc_traits<T> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = true;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const T &o, size_t& p, uint64_t f);
+ static void encode(const T &o, buffer::list::contiguous_appender& p,
+ uint64_t f);
+ static void decode(T& o, buffer::ptr::const_iterator &p, uint64_t f=0);
+ };
+
+ - denc_traits<T> is normally declared via the WRITE_CLASS_DENC(type) macro,
+ which is used in place of the old-style WRITE_CLASS_ENCODER(type) macro.
+ There are _FEATURED and _BOUNDED variants. The class traits simply call
+ into class methods of the same name (see below).
+
+ - denc_traits<T> can also be written explicitly for some type to indicate
+ how it should be encoded. This is the "source of truth" for how a type
+ is encoded.
+
+ - denc_traits<T> are declared for the base integer types, string, bufferptr,
+ and bufferlist base types.
+
+ - denc_traits<std::foo<T>>-like traits are declared for standard container
+ types.
+
+
+ class methods look like so
+ ==========================
+
+ void bound_encode(size_t& p) const;
+ void encode(buffer::list::contiguous_appender& p) const;
+ void decode(buffer::ptr::const_iterator &p);
+
+ or (for featured objects)
+
+ void bound_encode(size_t& p, uint64_t f) const;
+ void encode(buffer::list::contiguous_appender& p, uint64_t f) const;
+ void decode(buffer::ptr::const_iterator &p);
+
+ - These are normally invoked by the denc_traits<> methods that are
+ declared via WRITE_CLASS_DENC, although you can also invoke them explicitly
+ in your code.
+
+ - These methods are optimised for contiguous buffers, but denc() will try to
+ rebuild a contiguous one if the decoded bufferlist is segmented. If you are
+ concerned about the cost, you might want to define yet another method:
+
+ void decode(buffer::list::iterator &p);
+
+ - These can be defined either explicitly (as above), or can be "magically"
+ defined all in one go using the DENC macro and DENC_{START,FINISH} helpers
+ (which work like the legacy {ENCODE,DECODE}_{START,FINISH} macros):
+
+ class foo_t {
+ ...
+ DENC(foo_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.foo, p);
+ denc(v.bar, p);
+ denc(v.baz, p);
+ DENC_FINISH(p);
+ }
+ ...
+ };
+ WRITE_CLASS_DENC(foo_t)
+
+ */
+
+// ---------------------------------------------------------------------
+// raw types
+namespace _denc {
+// True when T is exactly the same type as at least one of Us...
+// (false for an empty Us pack).
+template<typename T, typename... Us>
+inline constexpr bool is_any_of =
+  std::disjunction_v<std::is_same<T, Us>...>;
+
+// underlying_type<T>::type is T itself for non-enum types ...
+template<typename T, typename=void> struct underlying_type {
+  using type = T;
+};
+// ... and std::underlying_type_t<T> when T is an enumeration.
+template<typename T>
+struct underlying_type<T, std::enable_if_t<std::is_enum_v<T>>> {
+  using type = std::underlying_type_t<T>;
+};
+// Convenience alias for underlying_type<T>::type.
+template<typename T>
+using underlying_type_t = typename underlying_type<T>::type;
+}
+
+// is_const_iterator<It>: true_type when It::pointer points to a const
+// object, false_type otherwise.
+template<class It>
+struct is_const_iterator
+  : std::bool_constant<
+      std::is_const_v<std::remove_pointer_t<typename It::pointer>>>
+{};
+// size_t (the bound_encode "iterator") has no ::pointer member and is
+// explicitly classified as non-const.
+template<>
+struct is_const_iterator<size_t> : std::false_type {};
+template<>
+struct is_const_iterator<buffer::list::contiguous_appender> : std::false_type {
+  // appender is used for *changing* the buffer
+};
+// Value shorthand for is_const_iterator<It>::value.
+template<class It>
+inline constexpr bool is_const_iterator_v = is_const_iterator<It>::value;
+
+// Advance a const iterator by sizeof(T) and return a const reference to the
+// T located at the position it had before advancing (as returned by
+// i.get_pos_add()).
+template<typename T, class It>
+std::enable_if_t<is_const_iterator_v<It>, const T&>
+get_pos_add(It& i) {
+  auto where = i.get_pos_add(sizeof(T));
+  return *reinterpret_cast<const T*>(where);
+}
+
+// Same as above for non-const iterators/appenders: the returned reference
+// is writable.
+template<typename T, class It>
+std::enable_if_t<!is_const_iterator_v<It>, T&>
+get_pos_add(It& i) {
+  auto where = i.get_pos_add(sizeof(T));
+  return *reinterpret_cast<T*>(where);
+}
+
+// denc_traits for "raw" types: T (or, for an enum, its underlying type) is
+// one of ceph_le64/ceph_le32/ceph_le16/uint8_t (plus int8_t unless
+// _CHAR_IS_SIGNED is defined). The object is copied to/from the buffer
+// as-is, sizeof(T) bytes.
+template<typename T>
+struct denc_traits<
+  T,
+  std::enable_if_t<
+    _denc::is_any_of<_denc::underlying_type_t<T>,
+                     ceph_le64, ceph_le32, ceph_le16, uint8_t
+#ifndef _CHAR_IS_SIGNED
+                     , int8_t
+#endif
+                     >>> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  // Encoded size is always exactly sizeof(T).
+  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
+    p += sizeof(T);
+  }
+  // Store o at the appender's current position and advance it.
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const T &o, It& p, uint64_t f=0) {
+    T& slot = get_pos_add<T>(p);
+    slot = o;
+  }
+  // Load o from the iterator's current position and advance it.
+  template<class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(T& o, It& p, uint64_t f=0) {
+    const T& slot = get_pos_add<T>(p);
+    o = slot;
+  }
+  // bufferlist iterator variant: copy sizeof(T) bytes straight into o.
+  static void decode(T& o, buffer::list::const_iterator &p) {
+    char* raw = reinterpret_cast<char*>(&o);
+    p.copy(sizeof(T), raw);
+  }
+};
+
+
+// -----------------------------------------------------------------------
+// integer types
+
+// itype == internal type
+// otype == external type, i.e., the type on the wire
+
+// NOTE: the overload resolution ensures that the legacy encode/decode methods
+// defined for int types are preferred to the ones defined using the specialized
+// template, and hence get selected. This machinery prevents these from
+// getting glued into the legacy encode/decode methods; the overhead of setting
+// up a contiguous_appender etc is likely to be slower.
+namespace _denc {
+
+// ExtType<T>::type is the on-the-wire type used for internal type T.
+// The primary template yields void, meaning "no mapping".
+template<typename T, typename=void> struct ExtType {
+  using type = void;
+};
+
+// 16-bit integers travel as ceph_le16.
+template<typename T>
+struct ExtType<T, std::enable_if_t<std::is_same_v<T, int16_t> ||
+                                   std::is_same_v<T, uint16_t>>> {
+  using type = ceph_le16;
+};
+
+// 32-bit integers travel as ceph_le32.
+template<typename T>
+struct ExtType<T, std::enable_if_t<std::is_same_v<T, int32_t> ||
+                                   std::is_same_v<T, uint32_t>>> {
+  using type = ceph_le32;
+};
+
+// 64-bit integers travel as ceph_le64.
+template<typename T>
+struct ExtType<T, std::enable_if_t<std::is_same_v<T, int64_t> ||
+                                   std::is_same_v<T, uint64_t>>> {
+  using type = ceph_le64;
+};
+
+// bool travels as a single uint8_t.
+template<>
+struct ExtType<bool> {
+  using type = uint8_t;
+};
+
+// Convenience alias for ExtType<T>::type.
+template<typename T>
+using ExtType_t = typename ExtType<T>::type;
+} // namespace _denc
+
+// denc_traits for every T that has a non-void _denc::ExtType_t<T>:
+// the value is converted to/from its wire type (etype) on encode/decode.
+template<typename T>
+struct denc_traits<T, std::enable_if_t<!std::is_void_v<_denc::ExtType_t<T>>>>
+{
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  using etype = _denc::ExtType_t<T>;
+  // Encoded size is always sizeof(etype).
+  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
+    p += sizeof(etype);
+  }
+  // Write o, converted to etype, at the appender's current position.
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const T &o, It& p, uint64_t f=0) {
+    etype& wire = get_pos_add<etype>(p);
+    wire = o;
+  }
+  // Read an etype at the iterator's current position and convert it to T.
+  template<class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(T& o, It &p, uint64_t f=0) {
+    const etype& wire = get_pos_add<etype>(p);
+    o = wire;
+  }
+  // bufferlist iterator variant: copy into a local etype, then convert.
+  static void decode(T& o, buffer::list::const_iterator &p) {
+    etype wire;
+    char* dst = reinterpret_cast<char*>(&wire);
+    p.copy(sizeof(etype), dst);
+    o = wire;
+  }
+};
+
+// varint
+//
+// high bit of each byte indicates another byte follows.
+// bound_encode for a varint: add the worst-case encoded length of a T to p.
+// Each output byte carries 7 payload bits, so a T needs at most
+// ceil(bits(T) / 7) bytes. (The previous sizeof(T) + 1 was one byte short
+// for 64-bit types, where a value with bit 63 set takes 10 bytes.)
+template<typename T>
+inline void denc_varint(T v, size_t& p) {
+  p += (sizeof(T) * 8 + 6) / 7;
+}
+
+// Append v as a varint: 7 bits per byte, least-significant group first,
+// with the high bit set on every byte except the last one.
+template<typename T>
+inline void denc_varint(T v, bufferlist::contiguous_appender& p) {
+  for (;;) {
+    uint8_t chunk = v & 0x7f;
+    v >>= 7;
+    if (!v) {
+      // final byte: no continuation bit
+      get_pos_add<__u8>(p) = chunk;
+      return;
+    }
+    chunk |= 0x80;
+    get_pos_add<__u8>(p) = chunk;
+  }
+}
+
+// Decode a varint into v: 7 payload bits per byte, least-significant group
+// first; a set high bit means another byte follows.
+//
+// Payload bits that would land at or beyond the width of T are dropped
+// rather than shifted in: shifting by >= the type width is undefined
+// behaviour, and an over-long (malformed) encoding could otherwise trigger
+// it. Continuation bytes are still consumed so the iterator ends up past
+// the whole varint.
+template<typename T>
+inline void denc_varint(T& v, bufferptr::const_iterator& p) {
+  constexpr int width = static_cast<int>(sizeof(T) * 8);
+  uint8_t byte = get_pos_add<__u8>(p);
+  v = byte & 0x7f;
+  int shift = 7;
+  while (byte & 0x80) {
+    byte = get_pos_add<__u8>(p);
+    if (shift < width) {
+      v |= (T)(byte & 0x7f) << shift;
+      shift += 7;
+    }
+  }
+}
+
+
+// signed varint encoding
+//
+// low bit = 1 = negative, 0 = positive
+// high bit of every byte indicates whether another byte follows.
+// bound_encode for a signed varint: reserves sizeof(int64_t) + 2 bytes
+// regardless of the value.
+inline void denc_signed_varint(int64_t v, size_t& p) {
+  constexpr size_t worst_case = sizeof(int64_t) + 2;
+  p += worst_case;
+}
+// Encode v as a signed varint: the magnitude is shifted left by one and the
+// low bit carries the sign (1 = negative), then the result is written as a
+// plain varint.
+//
+// The arithmetic is done on uint64_t: negating or left-shifting a large
+// int64_t overflows, and once bit 63 is set a signed `v >>= 7` inside
+// denc_varint never reaches zero. Values whose magnitude needs more than
+// 63 bits cannot be represented by this format; their top bit is lost.
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_signed_varint(int64_t v, It& p) {
+  uint64_t u;
+  if (v < 0) {
+    // 0 - (uint64_t)v is |v| without signed overflow
+    const uint64_t magnitude = uint64_t(0) - static_cast<uint64_t>(v);
+    u = (magnitude << 1) | 1;
+  } else {
+    u = static_cast<uint64_t>(v) << 1;
+  }
+  denc_varint(u, p);
+}
+
+// Decode a signed varint into v: read a plain varint, take the low bit as
+// the sign (1 = negative) and the remaining bits as the magnitude.
+//
+// The raw value is held in a uint64_t so that the `>> 1` is a logical
+// shift; with a signed accumulator an encoding with bit 63 set would be
+// sign-extended and yield the wrong magnitude.
+template<typename T, class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_signed_varint(T& v, It& p)
+{
+  uint64_t u = 0;
+  denc_varint(u, p);
+  const int64_t magnitude = static_cast<int64_t>(u >> 1);
+  if (u & 1) {
+    v = -magnitude;
+  } else {
+    v = magnitude;
+  }
+}
+
+// varint + lowz encoding
+//
+// first(low) 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 5 bits data in first byte, 7 bits data thereafter)
+// bound_encode for varint+lowz: reserves sizeof(uint64_t) + 2 bytes
+// regardless of the value.
+inline void denc_varint_lowz(uint64_t v, size_t& p) {
+  constexpr size_t worst_case = sizeof(uint64_t) + 2;
+  p += worst_case;
+}
+// Encode v as varint+lowz: strip up to 3 trailing zero nibbles, store how
+// many were stripped in the low 2 bits, and write the result as a varint.
+// Any bits pushed out of the top by the final `<< 2` are discarded.
+inline void denc_varint_lowz(uint64_t v, bufferlist::contiguous_appender& p) {
+  int nibbles = 0;
+  if (v) {
+    nibbles = ctz(v) / 4;
+    if (nibbles > 3) {
+      nibbles = 3;   // only 2 bits are available for the count
+    }
+  }
+  v >>= nibbles * 4;
+  v = (v << 2) | nibbles;
+  denc_varint(v, p);
+}
+
+// Decode varint+lowz into v: the low 2 bits of the varint give the number
+// of zero nibbles to restore below the remaining payload.
+template<typename T>
+inline void denc_varint_lowz(T& v, bufferptr::const_iterator& p)
+{
+  uint64_t raw = 0;
+  denc_varint(raw, p);
+  const int nibbles = (raw & 3);
+  uint64_t payload = raw >> 2;
+  payload <<= nibbles * 4;
+  v = payload;
+}
+
+// signed varint + lowz encoding
+//
+// first low bit = 1 for negative, 0 for positive
+// next 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 4 bits data in first byte, 7 bits data thereafter)
+// bound_encode for signed varint+lowz: reserves sizeof(int64_t) + 2 bytes
+// regardless of the value.
+inline void denc_signed_varint_lowz(int64_t v, size_t& p) {
+  constexpr size_t worst_case = sizeof(int64_t) + 2;
+  p += worst_case;
+}
+// Encode v as signed varint+lowz. Layout of the varint payload, low bits
+// first: 1 sign bit (1 = negative), 2 bits holding the number of stripped
+// trailing zero nibbles (0-3), then the remaining magnitude bits.
+//
+// The magnitude is handled as uint64_t: `-v` overflows for the most
+// negative int64_t, and a signed value whose bit 63 becomes set after the
+// `<< 3` would make the signed `v >>= 7` loop in denc_varint spin forever.
+// Magnitude bits pushed out of the top by the `<< 3` are discarded.
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_signed_varint_lowz(int64_t v, It& p) {
+  const bool negative = v < 0;
+  uint64_t u = negative ? (uint64_t(0) - static_cast<uint64_t>(v))
+                        : static_cast<uint64_t>(v);
+  unsigned lowznib = u ? (ctz(u) / 4) : 0u;
+  if (lowznib > 3)
+    lowznib = 3;   // only 2 bits are available for the count
+  u >>= lowznib * 4;
+  u <<= 3;
+  u |= lowznib << 1;
+  u |= negative ? 1u : 0u;
+  denc_varint(u, p);
+}
+
+// Decode signed varint+lowz into v: bit 0 of the varint is the sign
+// (1 = negative), bits 1-2 give the number of zero nibbles to restore, and
+// the rest is the magnitude.
+//
+// The raw value is kept unsigned so that the shifts are logical shifts; a
+// signed accumulator would be sign-extended when bit 63 is set.
+template<typename T, class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_signed_varint_lowz(T& v, It& p)
+{
+  uint64_t u = 0;
+  denc_varint(u, p);
+  const bool negative = u & 1;
+  const int lowznib = (u & 6) >> 1;
+  const int64_t magnitude =
+    static_cast<int64_t>((u >> 3) << (lowznib * 4));
+  if (negative) {
+    v = -magnitude;
+  } else {
+    v = magnitude;
+  }
+}
+
+
+// LBA
+//
+// first 1-3 bits = how many low zero bits
+// *0 = 12 (common 4 K alignment case)
+// *01 = 16
+// *011 = 20
+// *111 = byte
+// then 28-30 bits of data
+// then last bit = another byte follows
+// high bit of each subsequent byte = another byte follows
+// bound_encode for an LBA: reserves sizeof(uint64_t) + 2 bytes regardless
+// of the value.
+inline void denc_lba(uint64_t v, size_t& p) {
+  constexpr size_t worst_case = sizeof(uint64_t) + 2;
+  p += worst_case;
+}
+
+// Encode v in the LBA format described above: a 32-bit little-endian word
+// holding a 1-3 bit alignment tag, the low data bits and (in bit 31) a
+// "more bytes follow" flag, optionally followed by varint-style bytes with
+// the remaining high bits.
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_lba(uint64_t v, It& p) {
+ int low_zero_nibbles = v ? (int)(ctz(v) / 4) : 0;
+ int pos;
+ uint32_t word;
+ // t = trailing zero nibbles beyond the first three (i.e. beyond 12 bits)
+ int t = low_zero_nibbles - 3;
+ if (t < 0) {
+ // fewer than 12 zero bits: tag 111, v is stored unshifted
+ pos = 3;
+ word = 0x7;
+ } else if (t < 3) {
+ // 12, 16 or 20 zero bits: tag is t one-bits followed by a zero bit
+ v >>= (low_zero_nibbles * 4);
+ pos = t + 1;
+ word = (1 << t) - 1;
+ } else {
+ // more than 20 zero bits: drop exactly 20 and use the 20-bit tag (011)
+ v >>= 20;
+ pos = 3;
+ word = 0x3;
+ }
+ // data goes above the tag; bit 31 is kept free for the continuation flag
+ word |= (v << pos) & 0x7fffffff;
+ v >>= 31 - pos;
+ if (!v) {
+ // everything fit into the first word
+ *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
+ return;
+ }
+ // flag that more bytes follow, then emit the rest 7 bits at a time
+ word |= 0x80000000;
+ *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
+ uint8_t byte = v & 0x7f;
+ v >>= 7;
+ while (v) {
+ byte |= 0x80;
+ *(__u8*)p.get_pos_add(1) = byte;
+ byte = (v & 0x7f);
+ v >>= 7;
+ }
+ *(__u8*)p.get_pos_add(1) = byte;
+}
+
+// Decode an LBA written by the encoder above: read the 32-bit word, use its
+// low 1-3 tag bits to pick the alignment, then pull in any continuation
+// bytes.
+template<class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_lba(uint64_t& v, It& p) {
+ uint32_t word = *(ceph_le32*)p.get_pos_add(sizeof(uint32_t));
+ // shift = bit position in v where the first continuation byte lands
+ int shift;
+ switch (word & 7) {
+ case 0:
+ case 2:
+ case 4:
+ case 6:
+ // tag *0: 1 tag bit, 30 data bits, value aligned to 12 bits
+ v = (uint64_t)(word & 0x7ffffffe) << (12 - 1);
+ shift = 12 + 30;
+ break;
+ case 1:
+ case 5:
+ // tag *01: 2 tag bits, 29 data bits, value aligned to 16 bits
+ v = (uint64_t)(word & 0x7ffffffc) << (16 - 2);
+ shift = 16 + 29;
+ break;
+ case 3:
+ // tag *011: 3 tag bits, 28 data bits, value aligned to 20 bits
+ v = (uint64_t)(word & 0x7ffffff8) << (20 - 3);
+ shift = 20 + 28;
+ break;
+ case 7:
+ // tag *111: 3 tag bits, 28 data bits, no alignment
+ v = (uint64_t)(word & 0x7ffffff8) >> 3;
+ shift = 28;
+ }
+ // the top bit of the word's high byte is the "another byte follows" flag
+ uint8_t byte = word >> 24;
+ while (byte & 0x80) {
+ byte = *(__u8*)p.get_pos_add(1);
+ v |= (uint64_t)(byte & 0x7f) << shift;
+ shift += 7;
+ }
+}
+
+
+// ---------------------------------------------------------------------
+// denc top-level methods that call into denc_traits<T> methods
+
+// bound_encode entry point: forwards to traits::bound_encode, passing the
+// feature bits only when the traits are marked as featured.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported> denc(
+  const T& o,
+  size_t& p,
+  uint64_t f=0)
+{
+  if constexpr (!traits::featured) {
+    traits::bound_encode(o, p);
+  } else {
+    traits::bound_encode(o, p, f);
+  }
+}
+
+// encode entry point (selected for non-const iterators/appenders):
+// forwards to traits::encode, passing the feature bits only when the
+// traits are marked as featured.
+template<typename T, class It, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !is_const_iterator_v<It>>
+denc(const T& o,
+     It& p,
+     uint64_t features=0)
+{
+  if constexpr (!traits::featured) {
+    traits::encode(o, p);
+  } else {
+    traits::encode(o, p, features);
+  }
+}
+
+// decode entry point (selected for const iterators): forwards to
+// traits::decode, passing the feature bits only when the traits are marked
+// as featured.
+template<typename T, class It, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && is_const_iterator_v<It>>
+denc(T& o,
+     It& p,
+     uint64_t features=0)
+{
+  if constexpr (!traits::featured) {
+    traits::decode(o, p);
+  } else {
+    traits::decode(o, p, features);
+  }
+}
+
+namespace _denc {
+// has_legacy_denc<T>: detects whether T can be decoded straight from a
+// bufferlist::const_iterator, and if so provides a static decode() doing it.
+// Primary template: no such decoding available.
+template<typename T, typename = void>
+struct has_legacy_denc : std::false_type {};
+// Selected when the expression `t.decode(bufferlist::const_iterator&)` is
+// well-formed and its type is void (the second template argument must match
+// the primary template's default of void). decode() calls that member.
+template<typename T>
+struct has_legacy_denc<T, decltype(std::declval<T&>()
+ .decode(std::declval<
+ bufferlist::const_iterator&>()))>
+ : std::true_type {
+ static void decode(T& v, bufferlist::const_iterator& p) {
+ v.decode(p);
+ }
+};
+// Selected when denc_traits<T> does not require a contiguous buffer;
+// decode() then goes through denc_traits<T>::decode with the bufferlist
+// iterator.
+template<typename T>
+struct has_legacy_denc<T,
+ std::enable_if_t<
+ !denc_traits<T>::need_contiguous>> : std::true_type {
+ static void decode(T& v, bufferlist::const_iterator& p) {
+ denc_traits<T>::decode(v, p);
+ }
+};
+}
+
+// decode entry point for bufferlist iterators: only available when T has
+// supported traits and _denc::has_legacy_denc<T> found a way to decode from
+// a (possibly non-contiguous) bufferlist; the work is delegated to it.
+template<typename T,
+         typename traits=denc_traits<T>,
+         typename legacy=_denc::has_legacy_denc<T>>
+inline std::enable_if_t<traits::supported &&
+                        legacy::value> denc(
+  T& o,
+  buffer::list::const_iterator& p)
+{
+  legacy::decode(o, p);
+}
+
+// ---------------------------------------------------------------------
+// base types and containers
+
+//
+// std::string
+//
+// denc_traits for std::basic_string<char, std::char_traits<char>, A>.
+// Wire format: uint32_t length followed by the raw bytes; the *_nohead
+// variants handle the bytes only.
+template<typename A>
+struct denc_traits<std::basic_string<char,std::char_traits<char>,A>> {
+private:
+  using value_type = std::basic_string<char,std::char_traits<char>,A>;
+
+public:
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+
+  // length header plus payload
+  static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) {
+    p += sizeof(uint32_t) + s.size();
+  }
+  // write the 32-bit length, then copy the characters
+  template<class It>
+  static void encode(const value_type& s,
+                     It& p,
+                     uint64_t f=0) {
+    denc((uint32_t)s.size(), p);
+    const auto nbytes = s.size();
+    memcpy(p.get_pos_add(nbytes), s.data(), nbytes);
+  }
+  // read the 32-bit length, then the characters (contiguous iterator)
+  template<class It>
+  static void decode(value_type& s,
+                     It& p,
+                     uint64_t f=0) {
+    uint32_t nbytes;
+    denc(nbytes, p);
+    decode_nohead(nbytes, s, p);
+  }
+  // same, for a bufferlist iterator
+  static void decode(value_type& s, buffer::list::const_iterator& p)
+  {
+    uint32_t nbytes;
+    denc(nbytes, p);
+    decode_nohead(nbytes, s, p);
+  }
+  // replace s with the next len bytes of a contiguous iterator
+  template<class It>
+  static void decode_nohead(size_t len, value_type& s, It& p) {
+    s.clear();
+    if (len == 0) {
+      return;
+    }
+    s.append(p.get_pos_add(len), len);
+  }
+  // replace s with the next len bytes of a bufferlist iterator
+  static void decode_nohead(size_t len, value_type& s,
+                            buffer::list::const_iterator& p) {
+    if (len == 0) {
+      s.clear();
+      return;
+    }
+    if constexpr (std::is_same_v<value_type, std::string>) {
+      // plain std::string: use the iterator's copy-into-string overload
+      s.clear();
+      p.copy(len, s);
+    } else {
+      // other allocators: size the string and copy into its storage
+      s.resize(len);
+      p.copy(len, s.data());
+    }
+  }
+  // copy the characters only, without a length header
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode_nohead(const value_type& s, It& p) {
+    auto nbytes = s.length();
+    maybe_inline_memcpy(p.get_pos_add(nbytes), s.data(), nbytes, 16);
+  }
+};
+
+//
+// bufferptr
+//
+// denc_traits for bufferptr. Wire format: uint32_t length followed by the
+// raw bytes.
+template<>
+struct denc_traits<bufferptr> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+  // length header plus payload
+  static void bound_encode(const bufferptr& v, size_t& p, uint64_t f=0) {
+    p += sizeof(uint32_t) + v.length();
+  }
+  // write the 32-bit length, then hand the ptr to the appender
+  template <class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const bufferptr& v, It& p, uint64_t f=0) {
+    denc((uint32_t)v.length(), p);
+    p.append(v);
+  }
+  // contiguous decode: v becomes whatever p.get_ptr(len) returns
+  template <class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(bufferptr& v, It& p, uint64_t f=0) {
+    uint32_t len;
+    denc(len, p);
+    v = p.get_ptr(len);
+  }
+  // bufferlist decode: copy len bytes into a temporary list and take its
+  // single buffer if there is exactly one, otherwise flatten into a copy.
+  static void decode(bufferptr& v, buffer::list::const_iterator& p) {
+    uint32_t len;
+    denc(len, p);
+    bufferlist s;
+    p.copy(len, s);
+    if (len == 0) {
+      // An empty payload must still overwrite v, as the contiguous decode
+      // above does; previously v silently kept its old contents here.
+      v = bufferptr();
+      return;
+    }
+    if (s.get_num_buffers() == 1)
+      v = s.front();
+    else
+      v = buffer::copy(s.c_str(), s.length());
+  }
+};
+
+//
+// bufferlist
+//
+// denc_traits for bufferlist. Wire format: uint32_t length followed by the
+// raw bytes; the *_nohead variants handle the bytes only.
+template<>
+struct denc_traits<bufferlist> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+  // length header plus payload
+  static void bound_encode(const bufferlist& v, size_t& p, uint64_t f=0) {
+    p += v.length() + sizeof(uint32_t);
+  }
+  // write the 32-bit length, then append the list
+  static void encode(const bufferlist& v, buffer::list::contiguous_appender& p,
+                     uint64_t f=0) {
+    const uint32_t nbytes = (uint32_t)v.length();
+    denc(nbytes, p);
+    p.append(v);
+  }
+  // contiguous decode: v becomes a list holding p.get_ptr(len)
+  static void decode(bufferlist& v, buffer::ptr::const_iterator& p, uint64_t f=0) {
+    uint32_t nbytes;
+    denc(nbytes, p);
+    v.clear();
+    v.push_back(p.get_ptr(nbytes));
+  }
+  // bufferlist decode: copy len bytes from the iterator into v
+  static void decode(bufferlist& v, buffer::list::const_iterator& p) {
+    uint32_t nbytes;
+    denc(nbytes, p);
+    v.clear();
+    p.copy(nbytes, v);
+  }
+  // append the list without a length header
+  static void encode_nohead(const bufferlist& v,
+                            buffer::list::contiguous_appender& p) {
+    p.append(v);
+  }
+  // contiguous decode of exactly len bytes; v stays empty when len is 0
+  static void decode_nohead(size_t len, bufferlist& v,
+                            buffer::ptr::const_iterator& p) {
+    v.clear();
+    if (len == 0) {
+      return;
+    }
+    v.append(p.get_ptr(len));
+  }
+  // bufferlist decode of exactly len bytes
+  static void decode_nohead(size_t len, bufferlist& v,
+                            buffer::list::const_iterator& p) {
+    v.clear();
+    p.copy(len, v);
+  }
+};
+
+//
+// std::pair<A, B>
+//
+// denc_traits for std::pair<A, B>, available when both A and B are
+// denc-supported. first is encoded/decoded before second.
+template<typename A, typename B>
+struct denc_traits<
+  std::pair<A, B>,
+  std::enable_if_t<denc_supported<A> && denc_supported<B>>> {
+  using a_traits = denc_traits<A>;
+  using b_traits = denc_traits<B>;
+
+  static constexpr bool supported = true;
+  // featured if either member is; bounded only if both are;
+  // contiguous needed if either member needs it
+  static constexpr bool featured = a_traits::featured || b_traits::featured ;
+  static constexpr bool bounded = a_traits::bounded && b_traits::bounded;
+  static constexpr bool need_contiguous = (a_traits::need_contiguous ||
+                                           b_traits::need_contiguous);
+
+  static void bound_encode(const std::pair<A,B>& v, size_t& p, uint64_t f = 0) {
+    if constexpr (!featured) {
+      denc(v.first, p);
+      denc(v.second, p);
+    } else {
+      denc(v.first, p, f);
+      denc(v.second, p, f);
+    }
+  }
+
+  static void encode(const std::pair<A,B>& v, bufferlist::contiguous_appender& p,
+                     uint64_t f = 0) {
+    if constexpr (!featured) {
+      denc(v.first, p);
+      denc(v.second, p);
+    } else {
+      denc(v.first, p, f);
+      denc(v.second, p, f);
+    }
+  }
+
+  // contiguous decode; the feature bits are always passed through
+  static void decode(std::pair<A,B>& v, buffer::ptr::const_iterator& p, uint64_t f=0) {
+    denc(v.first, p, f);
+    denc(v.second, p, f);
+  }
+  // bufferlist decode; only offered when neither member needs a contiguous
+  // buffer (the AA parameter just makes the enable_if dependent)
+  template<typename AA=A>
+  static std::enable_if_t<!!sizeof(AA) && !need_contiguous>
+  decode(std::pair<A,B>& v, buffer::list::const_iterator& p,
+         uint64_t f = 0) {
+    denc(v.first, p);
+    denc(v.second, p);
+  }
+};
+
+namespace _denc {
+ // Shared denc_traits implementation for containers.
+ //   C       - the container template
+ //   Details - policy type providing T (the element type that is encoded),
+ //             reserve(container&, n) and insert(container&, T&&)
+ //   Ts...   - the template arguments of C
+ // Wire format: uint32_t element count followed by each element; the
+ // *_nohead variants handle the elements only.
+ template<template<class...> class C, typename Details, typename ...Ts>
+ struct container_base {
+ private:
+   using container = C<Ts...>;
+   using T = typename Details::T;
+
+ public:
+   using traits = denc_traits<T>;
+
+   static constexpr bool supported = true;
+   static constexpr bool featured = traits::featured;
+   static constexpr bool bounded = false;
+   static constexpr bool need_contiguous = traits::need_contiguous;
+
+   // Add an upper bound for the encoded size of s to p.
+   template<typename U=T>
+   static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+     // the element-count header; counted exactly once (the bounded branch
+     // below used to add it a second time)
+     p += sizeof(uint32_t);
+     if constexpr (traits::bounded) {
+       if (!s.empty()) {
+         // STL containers use weird element types like std::pair<const K, V>;
+         // cast to something we have denc_traits for.
+         // Elements are bounded, so the first element's bound is used for
+         // all of them.
+         size_t elem_size = 0;
+         if constexpr (traits::featured) {
+           denc(static_cast<const T&>(*s.begin()), elem_size, f);
+         } else {
+           denc(static_cast<const T&>(*s.begin()), elem_size);
+         }
+         p += elem_size * s.size();
+       }
+     } else {
+       for (const T& e : s) {
+         if constexpr (traits::featured) {
+           denc(e, p, f);
+         } else {
+           denc(e, p);
+         }
+       }
+     }
+   }
+
+   // Write the 32-bit element count, then the elements.
+   template<typename U=T>
+   static void encode(const container& s, buffer::list::contiguous_appender& p,
+                      uint64_t f = 0) {
+     denc((uint32_t)s.size(), p);
+     if constexpr (traits::featured) {
+       encode_nohead(s, p, f);
+     } else {
+       encode_nohead(s, p);
+     }
+   }
+   // Contiguous decode: read the count, then that many elements.
+   static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) {
+     uint32_t num;
+     denc(num, p);
+     decode_nohead(num, s, p, f);
+   }
+   // bufferlist decode; only offered when the element type does not need a
+   // contiguous buffer.
+   template<typename U=T>
+   static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+   decode(container& s, buffer::list::const_iterator& p) {
+     uint32_t num;
+     denc(num, p);
+     decode_nohead(num, s, p);
+   }
+
+   // nohead
+   // Write the elements only, without the count header.
+   static void encode_nohead(const container& s, buffer::list::contiguous_appender& p,
+                             uint64_t f = 0) {
+     for (const T& e : s) {
+       if constexpr (traits::featured) {
+         denc(e, p, f);
+       } else {
+         denc(e, p);
+       }
+     }
+   }
+   // Replace the contents of s with num elements decoded from p.
+   static void decode_nohead(size_t num, container& s,
+                             buffer::ptr::const_iterator& p, uint64_t f=0) {
+     s.clear();
+     Details::reserve(s, num);
+     while (num--) {
+       T t;
+       denc(t, p, f);
+       Details::insert(s, std::move(t));
+     }
+   }
+   // Same, for a bufferlist iterator.
+   template<typename U=T>
+   static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+   decode_nohead(size_t num, container& s,
+                 buffer::list::const_iterator& p) {
+     s.clear();
+     Details::reserve(s, num);
+     while (num--) {
+       T t;
+       denc(t, p);
+       Details::insert(s, std::move(t));
+     }
+   }
+ };
+
+ // Compile-time probe whose ::value is consulted by
+ // container_details_base::reserve() to decide whether to call reserve().
+ //
+ // NOTE(review): value is computed from test<denc_traits<T>>(0), i.e. U is
+ // denc_traits<T> rather than T, and the expected type is the plain function
+ // pointer T(*)(size_type), not a pointer to member function. None of the
+ // denc_traits specialisations visible in this file declare a matching
+ // static reserve(), so this presumably always yields false and reserve()
+ // is never called -- verify. Before "fixing" the probe, note that decode
+ // would then reserve() an element count taken straight from the input.
+ template<typename T>
+ class container_has_reserve {
+ template<typename U, U> struct SFINAE_match;
+ // chosen when &U::reserve is a valid constant of the expected type
+ template<typename U>
+ static std::true_type test(SFINAE_match<T(*)(typename T::size_type),
+ &U::reserve>*);
+
+ // fallback when the overload above is removed by substitution failure
+ template<typename U>
+ static std::false_type test(...);
+
+ public:
+ static constexpr bool value = decltype(
+ test<denc_traits<T>>(0))::value;
+ };
+ // Variable-template shorthand for container_has_reserve<T>::value.
+ template<typename T>
+ inline constexpr bool container_has_reserve_v =
+ container_has_reserve<T>::value;
+
+
+ // Common part of the Details policies handed to container_base: exposes the
+ // container's value_type as T and a reserve() hook.
+ template<typename Container>
+ struct container_details_base {
+ using T = typename Container::value_type;
+ // Calls c.reserve(s) only when container_has_reserve_v<Container> is true;
+ // otherwise this is a no-op.
+ static void reserve(Container& c, size_t s) {
+ if constexpr (container_has_reserve_v<Container>) {
+ c.reserve(s);
+ }
+ }
+ };
+
+ template<typename Container>
+ struct pushback_details : public container_details_base<Container> {
+ template<typename ...Args>
+ static void insert(Container& c, Args&& ...args) {
+ c.emplace_back(std::forward<Args>(args)...);
+ }
+ };
+}
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ std::list<T, Ts...>,
+ typename std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<std::list,
+ _denc::pushback_details<std::list<T, Ts...>>,
+ T, Ts...> {};
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ std::vector<T, Ts...>,
+ typename std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<std::vector,
+ _denc::pushback_details<std::vector<T, Ts...>>,
+ T, Ts...> {};
+
+namespace _denc {
+ // Details policy for set-like containers: decoded elements are inserted
+ // with emplace_hint() using cend() as the hint.
+ template<typename Container>
+ struct setlike_details : public container_details_base<Container> {
+ // same alias as the one inherited from container_details_base
+ using T = typename Container::value_type;
+ template<typename ...Args>
+ static void insert(Container& c, Args&& ...args) {
+ c.emplace_hint(c.cend(), std::forward<Args>(args)...);
+ }
+ };
+}
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ std::set<T, Ts...>,
+ std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<std::set,
+ _denc::setlike_details<std::set<T, Ts...>>,
+ T, Ts...> {};
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ boost::container::flat_set<T, Ts...>,
+ std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<
+ boost::container::flat_set,
+ _denc::setlike_details<boost::container::flat_set<T, Ts...>>,
+ T, Ts...> {};
+
+namespace _denc {
+ // Details policy for map-like containers. T is declared as
+ // std::pair<key_type, mapped_type> (key not const) instead of the
+ // container's value_type, so that container_base can declare a T and
+ // decode into it. Elements are inserted with emplace_hint() at cend().
+ template<typename Container>
+ struct maplike_details : public container_details_base<Container> {
+ using T = std::pair<typename Container::key_type,
+ typename Container::mapped_type>;
+ template<typename ...Args>
+ static void insert(Container& c, Args&& ...args) {
+ c.emplace_hint(c.cend(), std::forward<Args>(args)...);
+ }
+ };
+}
+
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+ std::map<A, B, Ts...>,
+ std::enable_if_t<denc_traits<A>::supported &&
+ denc_traits<B>::supported>>
+ : public _denc::container_base<std::map,
+ _denc::maplike_details<std::map<A, B, Ts...>>,
+ A, B, Ts...> {};
+
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+ boost::container::flat_map<A, B, Ts...>,
+ std::enable_if_t<denc_traits<A>::supported &&
+ denc_traits<B>::supported>>
+ : public _denc::container_base<
+ boost::container::flat_map,
+ _denc::maplike_details<boost::container::flat_map<
+ A, B, Ts...>>,
+ A, B, Ts...> {};
+
+// denc_traits for std::array<T, N>.
+// The element count is fixed by the type, so no length prefix is written:
+// encode()/decode() simply process the N elements in order. The array is
+// bounded/featured/need_contiguous exactly when its element type is.
+template<typename T, size_t N>
+struct denc_traits<
+ std::array<T, N>,
+ std::enable_if_t<denc_traits<T>::supported>> {
+private:
+ using container = std::array<T, N>;
+public:
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = traits::bounded;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ // Add a size estimate for encoding s to p. f is forwarded to the element
+ // traits only when they are featured.
+ static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+ if constexpr (traits::bounded) {
+ // Size one element and multiply. *s.begin() must not be evaluated for a
+ // zero-length array, so both the featured and the plain path are guarded
+ // (previously only the featured one was).
+ if constexpr (N > 0) {
+ size_t elem_size = 0;
+ if constexpr (traits::featured) {
+ denc(*s.begin(), elem_size, f);
+ } else {
+ denc(*s.begin(), elem_size);
+ }
+ p += elem_size * N;
+ }
+ } else {
+ // element size is not fixed: account for each element individually
+ for (const auto& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ }
+
+ // Write every element in order (no count header).
+ static void encode(const container& s, buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ for (const auto& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ // Decode into every slot of s in order.
+ static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) {
+ for (auto& e : s)
+ denc(e, p, f);
+ }
+ // bufferlist-iterator flavour; only available when the element traits do
+ // not need a contiguous buffer.
+ template<typename U=T>
+ static std::enable_if_t<!!sizeof(U) &&
+ !need_contiguous>
+ decode(container& s, buffer::list::const_iterator& p) {
+ for (auto& e : s) {
+ denc(e, p);
+ }
+ }
+};
+
+// denc_traits for std::tuple<Ts...>: the members are processed one after
+// another via ceph::for_each, with no header. The tuple is featured /
+// need_contiguous if any member is, and bounded only if all members are.
+template<typename... Ts>
+struct denc_traits<
+ std::tuple<Ts...>,
+ std::enable_if_t<(denc_traits<Ts>::supported && ...)>> {
+
+private:
+ static_assert(sizeof...(Ts) > 0,
+ "Zero-length tuples are not supported.");
+ using container = std::tuple<Ts...>;
+
+public:
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = (denc_traits<Ts>::featured || ...);
+ static constexpr bool bounded = (denc_traits<Ts>::bounded && ...);
+ static constexpr bool need_contiguous =
+ (denc_traits<Ts>::need_contiguous || ...);
+
+ // Featured tuple: f is required and is passed on only to the members whose
+ // own traits are featured.
+ template<typename U = container>
+ static std::enable_if_t<denc_traits<U>::featured>
+ bound_encode(const container& s, size_t& p, uint64_t f) {
+ ceph::for_each(s, [&p, f] (const auto& e) {
+ if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ });
+ }
+ // Non-featured tuple: no feature argument at all.
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::featured>
+ bound_encode(const container& s, size_t& p) {
+ ceph::for_each(s, [&p] (const auto& e) {
+ denc(e, p);
+ });
+ }
+
+ // Featured encode; same per-member dispatch as bound_encode above.
+ template<typename U = container>
+ static std::enable_if_t<denc_traits<U>::featured>
+ encode(const container& s, buffer::list::contiguous_appender& p, uint64_t f) {
+ ceph::for_each(s, [&p, f] (const auto& e) {
+ if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ });
+ }
+ // Non-featured encode.
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::featured>
+ encode(const container& s, buffer::list::contiguous_appender& p) {
+ ceph::for_each(s, [&p] (const auto& e) {
+ denc(e, p);
+ });
+ }
+
+ // Decode every member in order. f is accepted but not forwarded to the
+ // members.
+ static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) {
+ ceph::for_each(s, [&p] (auto& e) {
+ denc(e, p);
+ });
+ }
+
+ // bufferlist-iterator flavour; only available when no member needs a
+ // contiguous buffer. f is likewise not forwarded.
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::need_contiguous>
+ decode(container& s, buffer::list::const_iterator& p, uint64_t f = 0) {
+ ceph::for_each(s, [&p] (auto& e) {
+ denc(e, p);
+ });
+ }
+};
+
+//
+// boost::optional<T>
+//
+template<typename T>
+struct denc_traits<
+ boost::optional<T>,
+ std::enable_if_t<denc_traits<T>::supported>> {
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = false;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ static void bound_encode(const boost::optional<T>& v, size_t& p,
+ uint64_t f = 0) {
+ p += sizeof(bool);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void encode(const boost::optional<T>& v,
+ bufferlist::contiguous_appender& p,
+ uint64_t f = 0) {
+ denc((bool)v, p);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode(boost::optional<T>& v, buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ bool x;
+ denc(x, p, f);
+ if (x) {
+ v = T{};
+ denc(*v, p, f);
+ } else {
+ v = boost::none;
+ }
+ }
+
+ template<typename U = T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode(boost::optional<T>& v, buffer::list::const_iterator& p) {
+ bool x;
+ denc(x, p);
+ if (x) {
+ v = T{};
+ denc(*v, p);
+ } else {
+ v = boost::none;
+ }
+ }
+
+ template<typename U = T>
+ static void encode_nohead(const boost::optional<T>& v,
+ bufferlist::contiguous_appender& p,
+ uint64_t f = 0) {
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode_nohead(bool num, boost::optional<T>& v,
+ buffer::ptr::const_iterator& p, uint64_t f = 0) {
+ if (num) {
+ v = T();
+ denc(*v, p, f);
+ } else {
+ v = boost::none;
+ }
+ }
+};
+
+template<>
+struct denc_traits<boost::none_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+
+ static void bound_encode(const boost::none_t& v, size_t& p) {
+ p += sizeof(bool);
+ }
+
+ static void encode(const boost::none_t& v,
+ bufferlist::contiguous_appender& p) {
+ denc(false, p);
+ }
+};
+
+//
+// std::optional<T>
+//
+template<typename T>
+struct denc_traits<
+ std::optional<T>,
+ std::enable_if_t<denc_traits<T>::supported>> {
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = false;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ static void bound_encode(const std::optional<T>& v, size_t& p,
+ uint64_t f = 0) {
+ p += sizeof(bool);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void encode(const std::optional<T>& v,
+ bufferlist::contiguous_appender& p,
+ uint64_t f = 0) {
+ denc((bool)v, p);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode(std::optional<T>& v, buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ bool x;
+ denc(x, p, f);
+ if (x) {
+ v = T{};
+ denc(*v, p, f);
+ } else {
+ v = std::nullopt;
+ }
+ }
+
+ template<typename U = T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode(std::optional<T>& v, buffer::list::const_iterator& p) {
+ bool x;
+ denc(x, p);
+ if (x) {
+ v = T{};
+ denc(*v, p);
+ } else {
+ v = std::nullopt;
+ }
+ }
+
+ static void encode_nohead(const std::optional<T>& v,
+ bufferlist::contiguous_appender& p,
+ uint64_t f = 0) {
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode_nohead(bool num, std::optional<T>& v,
+ buffer::ptr::const_iterator& p, uint64_t f = 0) {
+ if (num) {
+ v = T();
+ denc(*v, p, f);
+ } else {
+ v = std::nullopt;
+ }
+ }
+};
+
+template<>
+struct denc_traits<std::nullopt_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+
+ static void bound_encode(const std::nullopt_t& v, size_t& p) {
+ p += sizeof(bool);
+ }
+
+ static void encode(const std::nullopt_t& v,
+ bufferlist::contiguous_appender& p) {
+ denc(false, p);
+ }
+};
+
+// ----------------------------------------------------------------------
+// class helpers
+
+// Write denc_traits<> for a class that defines bound_encode/encode/decode
+// methods.
+
+#define WRITE_CLASS_DENC(T) _DECLARE_CLASS_DENC(T, false)
+#define WRITE_CLASS_DENC_BOUNDED(T) _DECLARE_CLASS_DENC(T, true)
+#define _DECLARE_CLASS_DENC(T, b) \
+ template<> struct denc_traits<T> { \
+ static constexpr bool supported = true; \
+ static constexpr bool featured = false; \
+ static constexpr bool bounded = b; \
+ static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+ static void bound_encode(const T& v, size_t& p, uint64_t f=0) { \
+ v.bound_encode(p); \
+ } \
+ static void encode(const T& v, buffer::list::contiguous_appender& p, \
+ uint64_t f=0) { \
+ v.encode(p); \
+ } \
+ static void decode(T& v, buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ v.decode(p); \
+ } \
+ };
+
+#define WRITE_CLASS_DENC_FEATURED(T) _DECLARE_CLASS_DENC_FEATURED(T, false)
+#define WRITE_CLASS_DENC_FEATURED_BOUNDED(T) _DECLARE_CLASS_DENC_FEATURED(T, true)
+#define _DECLARE_CLASS_DENC_FEATURED(T, b) \
+ template<> struct denc_traits<T> { \
+ static constexpr bool supported = true; \
+ static constexpr bool featured = true; \
+ static constexpr bool bounded = b; \
+ static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+ static void bound_encode(const T& v, size_t& p, uint64_t f) { \
+ v.bound_encode(p, f); \
+ } \
+ static void encode(const T& v, buffer::list::contiguous_appender& p, \
+ uint64_t f) { \
+ v.encode(p, f); \
+ } \
+ static void decode(T& v, buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ v.decode(p, f); \
+ } \
+ };
+
+
+// ----------------------------------------------------------------------
+// encode/decode wrappers
+
+// These glue the new-style denc world into old-style calls to encode
+// and decode by calling into denc_traits<> methods (when present).
+
+namespace ceph {
+// encode() for types with non-featured denc_traits: size the object with
+// bound_encode(), obtain a contiguous appender of that size from bl and
+// encode into it. The features argument is accepted and ignored.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> encode(
+ const T& o,
+ bufferlist& bl,
+ uint64_t features_unused=0)
+{
+ size_t len = 0;
+ traits::bound_encode(o, len);
+ auto a = bl.get_contiguous_appender(len);
+ traits::encode(o, a);
+}
+
+// encode() for types with featured denc_traits: features is mandatory and is
+// passed to both bound_encode() and encode().
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::featured> encode(
+ const T& o, bufferlist& bl,
+ uint64_t features)
+{
+ size_t len = 0;
+ traits::bound_encode(o, len, features);
+ auto a = bl.get_contiguous_appender(len);
+ traits::encode(o, a, features);
+}
+
+// decode() for types whose traits can read from a bufferlist iterator.
+// Throws buffer::end_of_buffer if p is already at the end.
+template<typename T,
+ typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::need_contiguous> decode(
+ T& o,
+ bufferlist::const_iterator& p)
+{
+ if (p.end())
+ throw buffer::end_of_buffer();
+ const auto& bl = p.get_bl();
+ const auto remaining = bl.length() - p.get_off();
+ // it is expensive to rebuild a contiguous buffer and drop it, so avoid this.
+ if (!p.is_pointing_same_raw(bl.back()) && remaining > CEPH_PAGE_SIZE) {
+ traits::decode(o, p);
+ } else {
+ // ensure we get a contiguous buffer... until the end of the
+ // bufferlist. we don't really know how much we'll need here,
+ // unfortunately. hopefully it is already contiguous and we're just
+ // bumping the raw ref and initializing the ptr tmp fields.
+ bufferptr tmp;
+ auto t = p;
+ t.copy_shallow(remaining, tmp);
+ auto cp = std::cbegin(tmp);
+ traits::decode(o, cp);
+ // the decode ran on the copy t/tmp; move the caller's iterator forward by
+ // the amount actually consumed
+ p.advance(cp.get_offset());
+ }
+}
+
+// decode() for types whose traits require a contiguous buffer: always go
+// through a bufferptr covering the rest of the bufferlist.
+// Throws buffer::end_of_buffer if p is already at the end.
+template<typename T,
+ typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::need_contiguous> decode(
+ T& o,
+ bufferlist::const_iterator& p)
+{
+ if (p.end())
+ throw buffer::end_of_buffer();
+ // ensure we get a contiguous buffer... until the end of the
+ // bufferlist. we don't really know how much we'll need here,
+ // unfortunately. hopefully it is already contiguous and we're just
+ // bumping the raw ref and initializing the ptr tmp fields.
+ bufferptr tmp;
+ auto t = p;
+ t.copy_shallow(p.get_bl().length() - p.get_off(), tmp);
+ auto cp = std::cbegin(tmp);
+ traits::decode(o, cp);
+ p.advance(cp.get_offset());
+}
+
+// nohead variants
+// encode_nohead(): like encode() above but calls traits::encode_nohead().
+// The buffer is still sized with the regular bound_encode().
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported &&
+ !traits::featured> encode_nohead(
+ const T& o,
+ bufferlist& bl)
+{
+ size_t len = 0;
+ traits::bound_encode(o, len);
+ auto a = bl.get_contiguous_appender(len);
+ traits::encode_nohead(o, a);
+}
+
+// decode_nohead(): decode num elements into o; the count is supplied by the
+// caller instead of being read from p. Returns immediately (leaving o
+// untouched) when num is 0; throws buffer::end_of_buffer if p is at the end.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> decode_nohead(
+ size_t num,
+ T& o,
+ bufferlist::const_iterator& p)
+{
+ if (!num)
+ return;
+ if (p.end())
+ throw buffer::end_of_buffer();
+ if constexpr (traits::need_contiguous) {
+ bufferptr tmp;
+ auto t = p;
+ if constexpr (denc_traits<typename T::value_type>::bounded) {
+ // bounded elements: size a default-constructed element and take only
+ // num * that many bytes
+ size_t element_size = 0;
+ typename T::value_type v;
+ denc_traits<typename T::value_type>::bound_encode(v, element_size);
+ t.copy_shallow(num * element_size, tmp);
+ } else {
+ // unknown element size: take everything up to the end of the bufferlist
+ t.copy_shallow(p.get_bl().length() - p.get_off(), tmp);
+ }
+ auto cp = std::cbegin(tmp);
+ traits::decode_nohead(num, o, cp);
+ p.advance(cp.get_offset());
+ } else {
+ traits::decode_nohead(num, o, p);
+ }
+}
+}
+
+
+// ----------------------------------------------------------------
+// DENC
+
+// These are some class methods we need to do the version and length
+// wrappers for DENC_{START,FINISH} for inter-version
+// interoperability.
+
+#define DENC_HELPERS \
+ /* Three overload pairs of _denc_start/_denc_finish, selected by the */ \
+ /* type of p: size_t& (bound_encode), contiguous_appender& (encode) */ \
+ /* and ptr::const_iterator& (decode). The last two pointer arguments */ \
+ /* are scratch slots owned by DENC_START/DENC_FINISH. */ \
+ /* bound_encode */ \
+ static void _denc_start(size_t& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **, uint32_t *) { \
+ /* struct_v + struct_compat (1 byte each) + 4-byte length */ \
+ p += 2 + 4; \
+ } \
+ static void _denc_finish(size_t& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **, uint32_t *) { } \
+ /* encode */ \
+ static void _denc_start(bufferlist::contiguous_appender& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **len_pos, \
+ uint32_t *start_oob_off) { \
+ denc(*struct_v, p); \
+ denc(*struct_compat, p); \
+ /* leave room for the length; it is filled in by _denc_finish */ \
+ *len_pos = p.get_pos_add(4); \
+ *start_oob_off = p.get_out_of_band_offset(); \
+ } \
+ static void _denc_finish(bufferlist::contiguous_appender& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **len_pos, \
+ uint32_t *start_oob_off) { \
+ *(ceph_le32*)*len_pos = p.get_pos() - *len_pos - sizeof(uint32_t) + \
+ p.get_out_of_band_offset() - *start_oob_off; \
+ } \
+ /* decode */ \
+ static void _denc_start(buffer::ptr::const_iterator& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **start_pos, \
+ uint32_t *struct_len) { \
+ denc(*struct_v, p); \
+ denc(*struct_compat, p); \
+ denc(*struct_len, p); \
+ *start_pos = const_cast<char*>(p.get_pos()); \
+ } \
+ static void _denc_finish(buffer::ptr::const_iterator& p, \
+ __u8 *struct_v, __u8 *struct_compat, \
+ char **start_pos, \
+ uint32_t *struct_len) { \
+ const char *pos = p.get_pos(); \
+ char *end = *start_pos + *struct_len; \
+ /* struct_len was read from the encoded input, so an overrun means */ \
+ /* malformed data: report it as a decode error instead of aborting */ \
+ if (pos > end) { \
+ throw buffer::malformed_input( \
+ "DENC decode consumed more than the encoded struct length"); \
+ } \
+ /* skip trailing bytes this version of the struct does not know */ \
+ if (pos < end) { \
+ p.advance(end - pos); \
+ } \
+ }
+
+// Helpers for versioning the encoding. These correspond to the
+// {ENCODE,DECODE}_{START,FINISH} macros.
+
+#define DENC_START(v, compat, p) \
+ __u8 struct_v = v; \
+ __u8 struct_compat = compat; \
+ char *_denc_pchar; \
+ uint32_t _denc_u32; \
+ _denc_start(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); \
+ do {
+
+#define DENC_FINISH(p) \
+ } while (false); \
+ _denc_finish(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32);
+
+
+// ----------------------------------------------------------------------
+
+// Helpers for writing a unified bound_encode/encode/decode
+// implementation that won't screw up buffer size estimations.
+
+#define DENC(Type, v, p) \
+ DENC_HELPERS \
+ void bound_encode(size_t& p) const { \
+ _denc_friend(*this, p); \
+ } \
+ void encode(bufferlist::contiguous_appender& p) const { \
+ DENC_DUMP_PRE(Type); \
+ _denc_friend(*this, p); \
+ DENC_DUMP_POST(Type); \
+ } \
+ void decode(buffer::ptr::const_iterator& p) { \
+ _denc_friend(*this, p); \
+ } \
+ template<typename T, typename P> \
+ friend std::enable_if_t<std::is_same_v<T, Type> || \
+ std::is_same_v<T, const Type>> \
+ _denc_friend(T& v, P& p)
+
+#define DENC_FEATURED(Type, v, p, f) \
+ DENC_HELPERS \
+ void bound_encode(size_t& p, uint64_t f) const { \
+ _denc_friend(*this, p, f); \
+ } \
+ void encode(bufferlist::contiguous_appender& p, uint64_t f) const { \
+ DENC_DUMP_PRE(Type); \
+ _denc_friend(*this, p, f); \
+ DENC_DUMP_POST(Type); \
+ } \
+ void decode(buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ _denc_friend(*this, p, f); \
+ } \
+ template<typename T, typename P> \
+ friend std::enable_if_t<std::is_same_v<T, Type> || \
+ std::is_same_v<T, const Type>> \
+ _denc_friend(T& v, P& p, uint64_t f)
+
+#endif
diff --git a/src/include/elist.h b/src/include/elist.h
new file mode 100644
index 00000000..38be35db
--- /dev/null
+++ b/src/include/elist.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ELIST_H
+#define CEPH_ELIST_H
+
+/*
+ * elist: embedded list.
+ *
+ * requirements:
+ * - elist<T>::item be embedded in the parent class
+ * - items are _always_ added to the list via the same elist<T>::item at the same
+ * fixed offset in the class.
+ * - begin(), front(), back() methods take the member offset as an argument for traversal.
+ *
+ */
+
+#define member_offset(cls, member) ((size_t)(&((cls*)1)->member) - 1)
+
+/**
+ * Intrusive, circular, doubly-linked list.
+ *
+ * The list owns nothing: it only links elist<T>::item nodes together and
+ * turns a node back into a T by subtracting a byte offset from the node's
+ * address (see item::get_item()). T is cast to from a raw address, so it is
+ * presumably a pointer to the class that embeds the item.
+ */
+template<typename T>
+class elist {
+public:
+ /// Link node. An item that is not on any list points at itself.
+ struct item {
+ item *_prev, *_next;
+
+ // The argument is ignored; the node always starts out self-linked.
+ item(T i=0) : _prev(this), _next(this) {}
+ ~item() {
+ ceph_assert(!is_on_list());
+ }
+
+ item(const item& other) = delete;
+ const item& operator= (const item& right) = delete;
+
+
+ bool empty() const { return _prev == this; }
+ bool is_on_list() const { return !empty(); }
+
+ /// Unlink this node. Returns false if it was not linked to anything.
+ bool remove_myself() {
+ if (_next == this) {
+ ceph_assert(_prev == this);
+ return false;
+ }
+ _next->_prev = _prev;
+ _prev->_next = _next;
+ _prev = _next = this;
+ return true;
+ }
+
+ /// Link `other` (which must be unlinked) directly after this node.
+ void insert_after(item *other) {
+ ceph_assert(other->empty());
+ other->_prev = this;
+ other->_next = _next;
+ _next->_prev = other;
+ _next = other;
+ }
+ /// Link `other` (which must be unlinked) directly before this node.
+ void insert_before(item *other) {
+ ceph_assert(other->empty());
+ other->_next = this;
+ other->_prev = _prev;
+ _prev->_next = other;
+ _prev = other;
+ }
+
+ /// Address of this node minus `offset` bytes, cast to T. offset must be
+ /// non-zero.
+ T get_item(size_t offset) {
+ ceph_assert(offset);
+ return (T)(((char *)this) - offset);
+ }
+ };
+
+private:
+ item _head; // sentinel: _head._next is the front, _head._prev the back
+ size_t item_offset; // default offset used when a caller passes 0
+
+public:
+ // Not copyable. Deleted (as for item above) so that an accidental copy is
+ // rejected by the compiler rather than surfacing as an undefined symbol at
+ // link time.
+ elist(const elist& other) = delete;
+ const elist& operator=(const elist& other) = delete;
+
+ elist(size_t o) : _head(NULL), item_offset(o) {}
+ ~elist() {
+ ceph_assert(_head.empty());
+ }
+
+ bool empty() const {
+ return _head.empty();
+ }
+
+ /// Unlink every item; the items themselves are not destroyed.
+ void clear() {
+ while (!_head.empty())
+ pop_front();
+ }
+
+ /// Move `i` to the front, unlinking it first from wherever it currently is.
+ void push_front(item *i) {
+ if (!i->empty())
+ i->remove_myself();
+ _head.insert_after(i);
+ }
+ /// Move `i` to the back, unlinking it first from wherever it currently is.
+ void push_back(item *i) {
+ if (!i->empty())
+ i->remove_myself();
+ _head.insert_before(i);
+ }
+
+ /// First element; `o` overrides the list's item offset when non-zero.
+ /// The list must not be empty.
+ T front(size_t o=0) {
+ ceph_assert(!_head.empty());
+ return _head._next->get_item(o ? o : item_offset);
+ }
+ /// Last element; `o` overrides the list's item offset when non-zero.
+ /// The list must not be empty.
+ T back(size_t o=0) {
+ ceph_assert(!_head.empty());
+ return _head._prev->get_item(o ? o : item_offset);
+ }
+
+ void pop_front() {
+ ceph_assert(!empty());
+ _head._next->remove_myself();
+ }
+ void pop_back() {
+ ceph_assert(!empty());
+ _head._prev->remove_myself();
+ }
+
+ /// Same as clear().
+ void clear_list() {
+ clear();
+ }
+
+ /// How iterator::operator++ chooses the next node:
+ /// MAGIC follow cur->_next, unless cur has been unlinked, in which
+ /// case fall back to the cached next
+ /// CURRENT always follow cur->_next
+ /// CACHE_NEXT always use the next pointer cached at the previous step
+ enum mode_t {
+ MAGIC, CURRENT, CACHE_NEXT
+ };
+
+ /// Forward iterator; there is no end iterator to compare against, use
+ /// end() on the iterator itself.
+ class iterator {
+ private:
+ item *head;
+ item *cur, *next;
+ size_t item_offset;
+ mode_t mode;
+ public:
+ iterator(item *h, size_t o, mode_t m) :
+ head(h), cur(h->_next), next(cur->_next), item_offset(o),
+ mode(m) {
+ ceph_assert(item_offset > 0);
+ }
+ T operator*() {
+ return cur->get_item(item_offset);
+ }
+ iterator& operator++() {
+ ceph_assert(cur);
+ ceph_assert(cur != head);
+ if (mode == MAGIC) {
+ // if 'cur' appears to be valid, use that. otherwise,
+ // use cached 'next'.
+ // this is a bit magic, and probably a bad idea... :/
+ if (cur->empty())
+ cur = next;
+ else
+ cur = cur->_next;
+ } else if (mode == CURRENT)
+ cur = cur->_next;
+ else if (mode == CACHE_NEXT)
+ cur = next;
+ else
+ ceph_abort();
+ next = cur->_next;
+ return *this;
+ }
+ /// True once the iterator has wrapped around to the head sentinel.
+ bool end() const {
+ return cur == head;
+ }
+ };
+
+ iterator begin(size_t o=0) {
+ return iterator(&_head, o ? o : item_offset, MAGIC);
+ }
+ iterator begin_use_current(size_t o=0) {
+ return iterator(&_head, o ? o : item_offset, CURRENT);
+ }
+ iterator begin_cache_next(size_t o=0) {
+ return iterator(&_head, o ? o : item_offset, CACHE_NEXT);
+ }
+};
+
+
+#endif
diff --git a/src/include/encoding.h b/src/include/encoding.h
new file mode 100644
index 00000000..61219024
--- /dev/null
+++ b/src/include/encoding.h
@@ -0,0 +1,1505 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_ENCODING_H
+#define CEPH_ENCODING_H
+
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <vector>
+#include <boost/container/small_vector.hpp>
+#include <boost/optional/optional_io.hpp>
+#include <boost/tuple/tuple.hpp>
+
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+#include "common/ceph_time.h"
+
+#include "include/int_types.h"
+
+#include "common/convenience.h"
+
+#include "byteorder.h"
+#include "buffer.h"
+
+// pull in the new-style encoding so that we get the denc_traits<> definition.
+#include "denc.h"
+
+#include "assert.h"
+
+using namespace ceph;
+
+namespace ceph {
+
+/*
+ * Notes on feature encoding:
+ *
+ * - The default encode() methods have a features argument with a default parameter
+ * (which goes to zero).
+ * - Normal classes will use WRITE_CLASS_ENCODER, with that features=0 default.
+ * - Classes that _require_ features will use WRITE_CLASS_ENCODER_FEATURES, which
+ * does not define the default. Any caller must explicitly pass it in.
+ * - STL container macros have two encode variants: one with a features arg, and one
+ * without.
+ *
+ * The result:
+ * - A feature encode() method will fail to compile if a value is not
+ * passed in.
+ * - The feature variant of the STL templates will be used when the feature arg is
+ * provided. It will be passed through to any template arg types, but it will be
+ * ignored when not needed.
+ */
+
+// --------------------------------------
+// base types
+
+// Append the in-memory bytes of t to bl exactly as they are.
+template<class T>
+inline void encode_raw(const T& t, bufferlist& bl)
+{
+ const char *raw = (char*)&t;
+ bl.append(raw, sizeof(T));
+}
+// Copy sizeof(T) bytes from p straight into the storage of t.
+template<class T>
+inline void decode_raw(T& t, bufferlist::const_iterator &p)
+{
+ char *raw = (char*)&t;
+ p.copy(sizeof(T), raw);
+}
+
+#define WRITE_RAW_ENCODER(type) \
+ inline void encode(const type &v, ::ceph::bufferlist& bl, uint64_t features=0) { ::ceph::encode_raw(v, bl); } \
+ inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { ::ceph::decode_raw(v, p); }
+
+WRITE_RAW_ENCODER(__u8)
+#ifndef _CHAR_IS_SIGNED
+WRITE_RAW_ENCODER(__s8)
+#endif
+WRITE_RAW_ENCODER(char)
+WRITE_RAW_ENCODER(ceph_le64)
+WRITE_RAW_ENCODER(ceph_le32)
+WRITE_RAW_ENCODER(ceph_le16)
+
+// A bool is encoded as a single __u8.
+inline void encode(const bool &v, bufferlist& bl) {
+ const __u8 as_byte = v;
+ encode_raw(as_byte, bl);
+}
+// Read one __u8 and store it as a bool (any non-zero byte yields true).
+inline void decode(bool &v, bufferlist::const_iterator& p) {
+ __u8 as_byte;
+ decode_raw(as_byte, p);
+ v = (as_byte != 0);
+}
+
+
+// -----------------------------------
+// int types
+
+#define WRITE_INTTYPE_ENCODER(type, etype) \
+ inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+ ceph_##etype e; \
+ e = v; \
+ ::ceph::encode_raw(e, bl); \
+ } \
+ inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \
+ ceph_##etype e; \
+ ::ceph::decode_raw(e, p); \
+ v = e; \
+ }
+
+WRITE_INTTYPE_ENCODER(uint64_t, le64)
+WRITE_INTTYPE_ENCODER(int64_t, le64)
+WRITE_INTTYPE_ENCODER(uint32_t, le32)
+WRITE_INTTYPE_ENCODER(int32_t, le32)
+WRITE_INTTYPE_ENCODER(uint16_t, le16)
+WRITE_INTTYPE_ENCODER(int16_t, le16)
+
+// -----------------------------------
+// float types
+//
+// NOTE: The following code assumes all supported platforms use IEEE binary32
+// as float and IEEE binary64 as double floating-point format. The assumption
+// is verified by the assertions below.
+//
+// Under this assumption, we can use raw encoding of floating-point types
+// on little-endian machines, but we still need to perform a byte swap
+// on big-endian machines to ensure cross-architecture compatibility.
+// To achieve that, we reinterpret the values as integers first, which are
+// byte-swapped via the ceph_le types as above. The extra conversions
+// are optimized away on little-endian machines by the compiler.
+// The bit pattern is moved between the floating-point value and the integer
+// with memcpy; reading a float object through an integer pointer is not
+// permitted by the aliasing rules.
+#define WRITE_FLTTYPE_ENCODER(type, itype, etype) \
+ static_assert(sizeof(type) == sizeof(itype)); \
+ static_assert(std::numeric_limits<type>::is_iec559, \
+ "floating-point type not using IEEE754 format"); \
+ inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+ itype bits; \
+ std::memcpy(&bits, &v, sizeof(bits)); \
+ ceph_##etype e; \
+ e = bits; \
+ ::ceph::encode_raw(e, bl); \
+ } \
+ inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \
+ ceph_##etype e; \
+ ::ceph::decode_raw(e, p); \
+ const itype bits = e; \
+ std::memcpy(&v, &bits, sizeof(v)); \
+ }
+
+WRITE_FLTTYPE_ENCODER(float, uint32_t, le32)
+WRITE_FLTTYPE_ENCODER(double, uint64_t, le64)
+
+// see denc.h for ENCODE_DUMP_PATH discussion and definition.
+#ifdef ENCODE_DUMP_PATH
+# define ENCODE_DUMP_PRE() \
+ unsigned pre_off = bl.length()
+# define ENCODE_DUMP_POST(cl) \
+ do { \
+ static int i = 0; \
+ i++; \
+ int bits = 0; \
+ for (unsigned t = i; t; bits++) \
+ t &= t - 1; \
+ if (bits > 2) \
+ break; \
+ char fn[PATH_MAX]; \
+ snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #cl, getpid(), i++); \
+ int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644); \
+ if (fd >= 0) { \
+ ::ceph::bufferlist sub; \
+ sub.substr_of(bl, pre_off, bl.length() - pre_off); \
+ sub.write_fd(fd); \
+ ::close(fd); \
+ } \
+ } while (0)
+#else
+# define ENCODE_DUMP_PRE()
+# define ENCODE_DUMP_POST(cl)
+#endif
+
+
+#define WRITE_CLASS_ENCODER(cl) \
+ inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features=0) { \
+ ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+#define WRITE_CLASS_MEMBER_ENCODER(cl) \
+ inline void encode(const cl &c, ::ceph::bufferlist &bl) const { \
+ ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+#define WRITE_CLASS_ENCODER_FEATURES(cl) \
+ inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features) { \
+ ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+#define WRITE_CLASS_ENCODER_OPTIONAL_FEATURES(cl) \
+ inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features = 0) { \
+ ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+
+// string
+// Strings go out as a __u32 byte count followed by the raw characters;
+// nothing is appended after the count for an empty string.
+inline void encode(std::string_view s, bufferlist& bl, uint64_t features=0)
+{
+ const __u32 nbytes = s.length();
+ encode(nbytes, bl);
+ if (nbytes == 0)
+ return;
+ bl.append(s.data(), nbytes);
+}
+// std::string overload: forwards to the std::string_view encoder.
+inline void encode(const std::string& s, bufferlist& bl, uint64_t features=0)
+{
+  const std::string_view view(s);
+  encode(view, bl, features);
+}
+// Decode a string: read a 32-bit length, clear `s`, then copy that many
+// bytes from the iterator into `s`.
+inline void decode(std::string& s, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+  s.clear();
+  p.copy(nbytes, s);
+}
+
+// Append the raw bytes of `s` to `bl` without a length prefix.
+inline void encode_nohead(std::string_view s, bufferlist& bl)
+{
+  const auto nbytes = s.length();
+  bl.append(s.data(), nbytes);
+}
+// std::string overload: forwards to the std::string_view variant.
+inline void encode_nohead(const std::string& s, bufferlist& bl)
+{
+  const std::string_view view(s);
+  encode_nohead(view, bl);
+}
+// Decode a string without a length prefix: the caller supplies the byte
+// count in `len`. `s` is cleared before the copy.
+inline void decode_nohead(int len, std::string& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+  p.copy(len, s);
+}
+
+// const char* (encode only, string compatible)
+// Encode a NUL-terminated C string through the std::string_view encoder
+// (the terminator is not part of the encoded bytes).
+inline void encode(const char *s, bufferlist& bl)
+{
+  const std::string_view view(s);  // length is taken up to the terminator
+  encode(view, bl);
+}
+
+
+// -----------------------------
+// buffers
+
+// bufferptr (encapsulated)
+// Encode a buffer::ptr as a 32-bit length followed by its content; nothing
+// is appended after the length when the ptr is empty.
+inline void encode(const buffer::ptr& bp, bufferlist& bl)
+{
+  __u32 nbytes = bp.length();
+  encode(nbytes, bl);
+  if (nbytes == 0)
+    return;
+  bl.append(bp);
+}
+// Decode a length-prefixed buffer::ptr. The payload is first copied into a
+// temporary bufferlist; if it consists of a single buffer that buffer is
+// assigned directly, otherwise the payload is copied into a new buffer.
+// For a zero length `bp` is left unchanged.
+inline void decode(buffer::ptr& bp, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+
+  bufferlist payload;
+  p.copy(nbytes, payload);
+
+  if (nbytes == 0)
+    return;
+
+  if (payload.get_num_buffers() == 1) {
+    bp = payload.front();
+  } else {
+    bp = buffer::copy(payload.c_str(), payload.length());
+  }
+}
+
+// bufferlist (encapsulated)
+// Encode a bufferlist as a 32-bit length followed by its content.
+inline void encode(const bufferlist& s, bufferlist& bl)
+{
+  __u32 nbytes = s.length();
+  encode(nbytes, bl);
+  bl.append(s);
+}
+// Like encode(const bufferlist&, ...), but hands the content of `s` to `bl`
+// via claim_append() instead of append(); `s` is taken by non-const
+// reference for that purpose.
+inline void encode_destructively(bufferlist& s, bufferlist& bl)
+{
+  __u32 nbytes = s.length();
+  encode(nbytes, bl);
+  bl.claim_append(s);
+}
+// Decode a length-prefixed bufferlist: read a 32-bit length, clear `s`,
+// then copy that many bytes from the iterator into `s`.
+inline void decode(bufferlist& s, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+  s.clear();
+  p.copy(nbytes, s);
+}
+
+// Append the content of `s` to `bl` with no length prefix.
+inline void encode_nohead(const bufferlist& s, bufferlist& bl)
+{
+  bl.append(s);
+}
+// Decode a bufferlist without a length prefix: the caller supplies the byte
+// count in `len`. `s` is cleared before the copy.
+inline void decode_nohead(int len, bufferlist& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+  p.copy(len, s);
+}
+
+// Time, since the templates are defined in std::chrono
+
+// Encode a time_point of a clock that can convert to timespec as two 32-bit
+// values: seconds, then nanoseconds. tv_sec is narrowed to 32 bits here.
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void encode(const std::chrono::time_point<Clock, Duration>& t,
+            ceph::bufferlist &bl) {
+  const auto spec = Clock::to_timespec(t);
+  uint32_t secs = spec.tv_sec;
+  uint32_t nsecs = spec.tv_nsec;
+  encode(secs, bl);
+  encode(nsecs, bl);
+}
+
+// Decode a time_point written as 32-bit seconds followed by 32-bit
+// nanoseconds, rebuilding it through Clock::from_timespec().
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void decode(std::chrono::time_point<Clock, Duration>& t,
+            bufferlist::const_iterator& p) {
+  uint32_t secs;
+  uint32_t nsecs;
+  decode(secs, p);
+  decode(nsecs, p);
+
+  struct timespec spec = {};
+  spec.tv_sec = static_cast<time_t>(secs);
+  spec.tv_nsec = static_cast<long int>(nsecs);
+
+  t = Clock::from_timespec(spec);
+}
+
+// Encode an integral-rep duration as 32-bit whole seconds followed by the
+// 32-bit sub-second remainder in nanoseconds.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void encode(const std::chrono::duration<Rep, Period>& d,
+            ceph::bufferlist &bl) {
+  using namespace std::chrono;
+  const auto whole = duration_cast<seconds>(d);
+  const auto remainder = duration_cast<nanoseconds>(d) % seconds(1);
+  uint32_t s = whole.count();
+  uint32_t ns = remainder.count();
+  encode(s, bl);
+  encode(ns, bl);
+}
+
+// Decode an integral-rep duration written as 32-bit seconds followed by
+// 32-bit nanoseconds.
+// The sum is converted with duration_cast so that durations coarser than
+// nanoseconds (which the matching encoder accepts) can be decoded as well;
+// an implicit conversion from nanoseconds would not compile for them. For
+// nanosecond-or-finer targets the result is the same as plain assignment.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void decode(std::chrono::duration<Rep, Period>& d,
+            bufferlist::const_iterator& p) {
+  using namespace std::chrono;
+  uint32_t s;
+  uint32_t ns;
+  decode(s, p);
+  decode(ns, p);
+  d = duration_cast<duration<Rep, Period>>(seconds(s) + nanoseconds(ns));
+}
+
+// -----------------------------
+// STL container types
+
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl);
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp);
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl);
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+ !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+ !b_traits::supported>
+decode(std::pair<A,B> &pa, bufferlist::const_iterator &p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T, Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist& bl);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+// Forward declaration of the headerless std::set decoder. The iterator
+// parameter is a bufferlist::const_iterator so that this declaration (which
+// supplies the default for `traits`) matches the definition further down;
+// with a different iterator type the definition would be a separate template
+// whose `traits` parameter cannot be deduced.
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+ bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+ bufferlist::iterator& p);
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl,
+ uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist::const_iterator& p);
+// small_vector
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+// std::map
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported ||
+ !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+ uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+ uint64_t features);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p);
+
+// full bl decoder
+// Decode `o` from the start of `bl` and assert that the iterator has reached
+// the end afterwards, i.e. that the whole bufferlist was consumed.
+template<class T>
+inline void decode(T &o, const bufferlist& bl)
+{
+  auto cursor = bl.begin();
+  decode(o, cursor);
+  ceph_assert(cursor.end());
+}
+
+// boost optional
+// Encode a boost::optional as a one-byte presence flag, followed by the
+// contained value when one is present.
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl)
+{
+  const bool engaged = static_cast<bool>(p);
+  __u8 present = engaged;
+  encode(present, bl);
+  if (engaged)
+    encode(p.get(), bl);
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+// Decode a boost::optional: read the one-byte presence flag, then either
+// reset `p` to boost::none or assign a value-initialized T and decode into it.
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp)
+{
+  __u8 present;
+  decode(present, bp);
+  if (!present) {
+    p = boost::none;
+    return;
+  }
+  p = T{};
+  decode(p.get(), bp);
+}
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+// std::tuple
+// Encode every element of a std::tuple by passing an encoding callback to
+// ceph::for_each(); no count or header is written.
+template<typename... Ts>
+inline void encode(const std::tuple<Ts...> &t, bufferlist& bl)
+{
+  auto encode_one = [&bl](const auto& elem) {
+    encode(elem, bl);
+  };
+  ceph::for_each(t, encode_one);
+}
+// Decode every element of a std::tuple by passing a decoding callback to
+// ceph::for_each().
+template<typename... Ts>
+inline void decode(std::tuple<Ts...> &t, bufferlist::const_iterator &bp)
+{
+  auto decode_one = [&bp](auto& elem) {
+    decode(elem, bp);
+  };
+  ceph::for_each(t, decode_one);
+}
+
+//triple boost::tuple
+// Encode the three elements of a boost::tuple in order.
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl)
+{
+  const auto& first = boost::get<0>(t);
+  const auto& second = boost::get<1>(t);
+  const auto& third = boost::get<2>(t);
+  encode(first, bl);
+  encode(second, bl);
+  encode(third, bl);
+}
+// Decode the three elements of a boost::tuple in order.
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp)
+{
+  auto& first = boost::get<0>(t);
+  auto& second = boost::get<1>(t);
+  auto& third = boost::get<2>(t);
+  decode(first, bp);
+  decode(second, bp);
+  decode(third, bp);
+}
+
+// std::pair<A,B>
+// Encode a std::pair as first then second, forwarding `features` to both.
+// Enabled only when at least one of the two member types lacks denc support.
+template<class A, class B,
+         typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+  encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features)
+{
+  const auto& [head, tail] = p;
+  encode(head, bl, features);
+  encode(tail, bl, features);
+}
+// Encode a std::pair as first then second.
+// Enabled only when at least one of the two member types lacks denc support.
+template<class A, class B,
+         typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported ||
+                        !b_traits::supported>
+  encode(const std::pair<A,B> &p, bufferlist &bl)
+{
+  const auto& [head, tail] = p;
+  encode(head, bl);
+  encode(tail, bl);
+}
+// Decode a std::pair: first, then second.
+template<class A, class B, typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported ||
+                        !b_traits::supported>
+  decode(std::pair<A,B> &pa, bufferlist::const_iterator &p)
+{
+  auto& [head, tail] = pa;
+  decode(head, p);
+  decode(tail, p);
+}
+
+// std::list<T>
+// Encode a std::list as a 32-bit element count followed by each element.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::list<T, Alloc>& ls, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(ls.size());  // O(1) since C++11
+  encode(count, bl);
+  for (const auto& item : ls)
+    encode(item, bl);
+}
+// Encode a std::list as a 32-bit element count followed by each element,
+// forwarding `features` to the element encoder.
+// The count is written up front from ls.size(), which is O(1) since C++11,
+// matching the overload without `features`. Earlier code wrote a placeholder
+// and patched the count in afterwards; the resulting bytes are the same.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+  __u32 n = (__u32)(ls.size());
+  encode(n, bl);
+  for (const auto& item : ls)
+    encode(item, bl, features);
+}
+// Decode a std::list: read a 32-bit count, clear `ls`, then append a
+// default-constructed element and decode into it, once per counted element.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    auto& slot = ls.emplace_back();
+    decode(slot, p);
+  }
+}
+
+// std::list<std::shared_ptr<T>>
+// Encode a list of shared_ptr<T> as a 32-bit count followed by each pointee.
+// Every pointer is dereferenced without a null check.
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+                   bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(ls.size());  // O(1) since C++11
+  encode(count, bl);
+  for (auto it = ls.begin(); it != ls.end(); ++it)
+    encode(**it, bl);
+}
+// Encode a list of shared_ptr<T> as a 32-bit count followed by each pointee,
+// forwarding `features`. Every pointer is dereferenced without a null check.
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+                   bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(ls.size());  // O(1) since C++11
+  encode(count, bl);
+  for (auto it = ls.begin(); it != ls.end(); ++it)
+    encode(**it, bl, features);
+}
+// Decode a list of shared_ptr<T>: read a 32-bit count, clear `ls`, then for
+// each element allocate a T with make_shared, decode into it and append it.
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+                   bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    auto item = std::make_shared<T>();
+    decode(*item, p);
+    ls.emplace_back(std::move(item));  // appended only after decoding
+  }
+}
+
+// std::set<T>
+// Encode a std::set as a 32-bit element count followed by each element in
+// iteration order.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (const auto& item : s)
+    encode(item, bl);
+}
+// Decode a std::set written as a 32-bit element count followed by the
+// elements. `s` is cleared first; each decoded element is moved into the
+// set instead of being copied.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  while (n--) {
+    T v;
+    decode(v, p);
+    s.insert(std::move(v));
+  }
+}
+
+// Encode the elements of a std::set in iteration order, without a count.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  for (const auto& item : s)
+    encode(item, bl);
+}
+// Decode `len` elements into a std::set; the count comes from the caller
+// rather than from the stream. `s` is not cleared, and nothing is decoded
+// when `len` is zero or negative. Each decoded element is moved into the set
+// instead of being copied.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    s.insert(std::move(v));
+  }
+}
+
+// boost::container::flat_set<T>
+// Encode a boost flat_set as a 32-bit element count followed by each element
+// in iteration order.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (auto it = s.begin(); it != s.end(); ++it)
+    encode(*it, bl);
+}
+// Decode a boost flat_set written as a 32-bit element count followed by the
+// elements. `s` is cleared and capacity for the announced count is reserved
+// up front; each decoded element is moved into the set instead of copied.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  s.reserve(n);
+  while (n--) {
+    T v;
+    decode(v, p);
+    s.insert(std::move(v));
+  }
+}
+
+// Encode the elements of a boost flat_set in iteration order, without a
+// count.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+              bufferlist& bl)
+{
+  for (auto it = s.begin(); it != s.end(); ++it)
+    encode(*it, bl);
+}
+// Decode `len` elements into a boost flat_set; the count comes from the
+// caller. `s` is not cleared. Each decoded element is moved into the set
+// instead of being copied.
+// NOTE(review): this takes bufferlist::iterator while the sibling decoders
+// take bufferlist::const_iterator; confirm this is intended. It is kept as is
+// here because the forward declaration uses the same type.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+              bufferlist::iterator& p)
+{
+  s.reserve(len);
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    s.insert(std::move(v));
+  }
+}
+
+// multiset
+// Encode a std::multiset as a 32-bit element count followed by each element
+// in iteration order.
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (const auto& item : s)
+    encode(item, bl);
+}
+// Decode a std::multiset written as a 32-bit element count followed by the
+// elements. `s` is cleared first; each decoded element is moved into the
+// container instead of being copied.
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  while (n--) {
+    T v;
+    decode(v, p);
+    s.insert(std::move(v));
+  }
+}
+
+// Encode a std::vector as a 32-bit element count followed by each element,
+// forwarding `features` to the element encoder.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& item : v)
+    encode(item, bl, features);
+}
+// Encode a std::vector as a 32-bit element count followed by each element.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& item : v)
+    encode(item, bl);
+}
+// Decode a std::vector: read a 32-bit count, resize `v` to it, then decode
+// into each element in place.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.resize(count);
+  for (auto& slot : v)
+    decode(slot, p);
+}
+
+// Encode the elements of a std::vector in order, without a count.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  for (const auto& item : v)
+    encode(item, bl);
+}
+// Decode a std::vector whose element count is supplied by the caller:
+// resize `v` to `len`, then decode into each element in place.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto& slot : v)
+    decode(slot, p);
+}
+
+// small vector
+// Encode a boost small_vector as a 32-bit element count followed by each
+// element, forwarding `features` to the element encoder.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it)
+    encode(*it, bl, features);
+}
+// Encode a boost small_vector as a 32-bit element count followed by each
+// element.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it)
+    encode(*it, bl);
+}
+// Decode a boost small_vector: read a 32-bit count, resize `v` to it, then
+// decode into each element in place.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.resize(count);
+  for (auto it = v.begin(); it != v.end(); ++it)
+    decode(*it, p);
+}
+
+// Encode the elements of a boost small_vector in order, without a count.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  for (auto it = v.begin(); it != v.end(); ++it)
+    encode(*it, bl);
+}
+// Decode a boost small_vector whose element count is supplied by the caller:
+// resize `v` to `len`, then decode into each element in place.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto it = v.begin(); it != v.end(); ++it)
+    decode(*it, p);
+}
+
+
+// vector (shared_ptr)
+// Encode a vector of shared_ptr<T> as a 32-bit count followed by each
+// pointee, forwarding `features`. A null pointer is encoded as a
+// default-constructed T.
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+                   bufferlist& bl,
+                   uint64_t features)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (!*it) {
+      encode(T(), bl, features);
+      continue;
+    }
+    encode(**it, bl, features);
+  }
+}
+// Encode a vector of shared_ptr<T> as a 32-bit count followed by each
+// pointee. A null pointer is encoded as a default-constructed T.
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+                   bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (!*it) {
+      encode(T(), bl);
+      continue;
+    }
+    encode(**it, bl);
+  }
+}
+// Decode a vector of shared_ptr<T>: read a 32-bit count, clear `v` and
+// reserve room for the count, then for each element allocate a T with
+// make_shared, decode into it and append it.
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+                   bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.clear();
+  v.reserve(count);
+  for (__u32 i = 0; i < count; ++i) {
+    auto item = std::make_shared<T>();
+    decode(*item, p);
+    v.emplace_back(std::move(item));  // appended only after decoding
+  }
+}
+
+// map
+// Encode a std::map as a 32-bit entry count followed by key, value for each
+// entry in iteration order.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported ||
+                        !u_traits::supported>
+  encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& kv : m) {
+    encode(kv.first, bl);
+    encode(kv.second, bl);
+  }
+}
+// Encode a std::map as a 32-bit entry count followed by key, value for each
+// entry, forwarding `features` to both encoders.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& kv : m) {
+    encode(kv.first, bl, features);
+    encode(kv.second, bl, features);
+  }
+}
+// Decode a std::map: read a 32-bit count, clear `m`, then for each entry
+// decode the key and decode the value directly into m[key].
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+// Decode map entries into `m` without clearing it first: existing entries
+// stay, and a decoded key that already exists has its value decoded over.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  for (__u32 i = 0; i < count; ++i) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+// Encode key, value for each std::map entry in iteration order, without a
+// count.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  for (const auto& kv : m) {
+    encode(kv.first, bl);
+    encode(kv.second, bl);
+  }
+}
+// Encode key, value for each std::map entry in iteration order, without a
+// count, forwarding `features` to both encoders.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+  for (const auto& kv : m) {
+    encode(kv.first, bl, features);
+    encode(kv.second, bl, features);
+  }
+}
+// Decode `n` key/value entries into a std::map; the count comes from the
+// caller. `m` is cleared first. The loop runs until `n` reaches zero.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  m.clear();
+  for (; n != 0; --n) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+
+// boost::container::flat-map
+// Encode a boost flat_map as a 32-bit entry count followed by key, value for
+// each entry in iteration order.
+// Iterates with a range-for so the iterator type always belongs to the
+// container actually passed in; spelling it as flat_map<T,U,Comp> dropped
+// the Alloc parameter and named a different container type.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  __u32 n = (__u32)(m.size());
+  encode(n, bl);
+  for (const auto& kv : m) {
+    encode(kv.first, bl);
+    encode(kv.second, bl);
+  }
+}
+// Encode a boost flat_map as a 32-bit entry count followed by key, value for
+// each entry, forwarding `features` to both encoders.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+         uint64_t features)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& kv : m) {
+    encode(kv.first, bl, features);
+    encode(kv.second, bl, features);
+  }
+}
+// Decode a boost flat_map: read a 32-bit count, clear `m`, reserve room for
+// the count, then for each entry decode the key and decode the value
+// directly into m[key].
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  m.reserve(count);
+  for (__u32 i = 0; i < count; ++i) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+// Decode flat_map entries into `m` without clearing it first. Capacity is
+// reserved for the current size plus the announced count before decoding.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+                           bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.reserve(m.size() + count);
+  for (__u32 i = 0; i < count; ++i) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+// Encode key, value for each flat_map entry in iteration order, without a
+// count.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+                bufferlist& bl)
+{
+  for (const auto& kv : m) {
+    encode(kv.first, bl);
+    encode(kv.second, bl);
+  }
+}
+// Encode key, value for each flat_map entry in iteration order, without a
+// count, forwarding `features` to both encoders.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+                bufferlist& bl, uint64_t features)
+{
+  for (const auto& kv : m) {
+    encode(kv.first, bl, features);
+    encode(kv.second, bl, features);
+  }
+}
+// Decode `n` key/value entries into a boost flat_map; the count comes from
+// the caller. `m` is cleared first. The loop runs until `n` reaches zero.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+                bufferlist::const_iterator& p)
+{
+  m.clear();
+  for (; n != 0; --n) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+
+// multimap
+// Encode a std::multimap as a 32-bit entry count followed by key, value for
+// each entry in iteration order.
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& kv : m) {
+    encode(kv.first, bl);
+    encode(kv.second, bl);
+  }
+}
+// Decode a std::multimap: read a 32-bit count, clear `m`, then for each
+// entry decode the key into a value-initialized pair, insert that pair and
+// decode the value into the inserted element.
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    std::pair<T,U> entry{};
+    decode(entry.first, p);
+    auto pos = m.insert(entry);
+    decode(pos->second, p);
+  }
+}
+
+// ceph::unordered_map
+// Encode an unordered_map as a 32-bit entry count followed by key, value for
+// each entry in the container's iteration order, forwarding `features`.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+                   uint64_t features)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& kv : m) {
+    encode(kv.first, bl, features);
+    encode(kv.second, bl, features);
+  }
+}
+// Encode an unordered_map: a __u32 element count followed by each
+// key/value pair in the container's iteration order.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = (__u32)(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Decode an unordered_map: read the __u32 count, empty the map, then decode
+// each key and decode its value straight into m[key].
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    T key;
+    decode(key, p);
+    auto& slot = m[key];
+    decode(slot, p);
+  }
+}
+
+// ceph::unordered_set
+// Encode a ceph::unordered_set: a __u32 element count followed by every
+// element in the container's iteration order.
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = (__u32)(m.size());
+  encode(count, bl);
+  for (const auto& item : m) {
+    encode(item, bl);
+  }
+}
+// Decode a ceph::unordered_set: read the __u32 count, empty the set, then
+// decode and insert that many elements.
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    T item;
+    decode(item, p);
+    m.insert(item);
+  }
+}
+
+// deque
+// Feature-aware encoder for std::deque: a __u32 element count (encoded
+// without features) followed by every element encoded with features.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+  __u32 count = ls.size();
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl, features);
+  }
+}
+// Encode a std::deque: a __u32 element count followed by every element,
+// front to back.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl)
+{
+  __u32 count = ls.size();
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl);
+  }
+}
+// Decode a std::deque: read the __u32 count, empty the deque, then append a
+// default-constructed element and decode into it, count times.
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    ls.emplace_back();
+    auto& slot = ls.back();
+    decode(slot, p);
+  }
+}
+
+// std::array<T, N>
+// Feature-aware encoder for std::array: the N elements are written back to
+// back; no count is emitted because N is part of the type.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features)
+{
+  for (size_t i = 0; i < N; ++i) {
+    encode(v[i], bl, features);
+  }
+}
+// Encode a std::array: the N elements are written back to back; no count
+// is emitted because N is part of the type.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl)
+{
+  for (size_t i = 0; i < N; ++i) {
+    encode(v[i], bl);
+  }
+}
+// Decode a std::array in place: each of the N existing elements is
+// overwritten, in index order, with the next decoded value.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p)
+{
+  for (size_t i = 0; i < N; ++i) {
+    decode(v[i], p);
+  }
+}
+}
+
+/*
+ * guards
+ */
+
+/**
+ * start encoding block
+ *
+ * Declares the locals struct_v, struct_compat, struct_len, filler and
+ * starting_bl_len in the enclosing scope, reserves a hole in bl sized for
+ * the version/compat/length header, brings ::ceph::encode into scope and
+ * opens a `do {` that must be closed by ENCODE_FINISH or
+ * ENCODE_FINISH_NEW_COMPAT.
+ *
+ * @param v current (code) version of the encoding
+ * @param compat oldest code version that can decode it
+ * @param bl bufferlist to encode to
+ *
+ */
+#define ENCODE_START(v, compat, bl) \
+ __u8 struct_v = v; \
+ __u8 struct_compat = compat; \
+ ceph_le32 struct_len; \
+ auto filler = (bl).append_hole(sizeof(struct_v) + \
+ sizeof(struct_compat) + sizeof(struct_len)); \
+ const auto starting_bl_len = (bl).length(); \
+ using ::ceph::encode; \
+ do {
+
+/**
+ * finish encoding block
+ *
+ * Closes the `do {` opened by ENCODE_START, computes struct_len as the
+ * number of bytes appended to bl since ENCODE_START, and copies struct_v,
+ * struct_compat and struct_len (in that order) into the reserved hole.
+ * A new_struct_compat of 0 keeps the compat value given to ENCODE_START;
+ * ENCODE_FINISH(bl) is the shorthand for exactly that case.
+ *
+ * @param bl bufferlist we were encoding to
+ * @param new_struct_compat struct-compat value to use
+ */
+#define ENCODE_FINISH_NEW_COMPAT(bl, new_struct_compat) \
+ } while (false); \
+ if (new_struct_compat) { \
+ struct_compat = new_struct_compat; \
+ } \
+ struct_len = (bl).length() - starting_bl_len; \
+ filler.copy_in(sizeof(struct_v), (char *)&struct_v); \
+ filler.copy_in(sizeof(struct_compat), \
+ (char *)&struct_compat); \
+ filler.copy_in(sizeof(struct_len), (char *)&struct_len);
+
+#define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0)
+
+#define DECODE_ERR_OLDVERSION(func, v, compatv) \
+ (std::string(func) + " no longer understand old encoding version " #v " < " + std::to_string(compatv)) /* std::string message: func, the stringified text of the v argument, then the value of compatv */
+
+#define DECODE_ERR_PAST(func) \
+ (std::string(func) + " decode past end of struct encoding") /* std::string message used when decoding runs past the encoded struct */
+
+/**
+ * check for very old encoding
+ *
+ * If the encoded data is older than oldestv, raise an exception.
+ *
+ * Reads the struct_v local, so it must follow one of the DECODE_START*
+ * macros.  The `v` passed on to DECODE_ERR_OLDVERSION is not a parameter of
+ * this macro; it is only stringified there, so the message contains the
+ * literal text "v" rather than a version number.
+ *
+ * @param oldestv oldest version of the code we can successfully decode.
+ */
+#define DECODE_OLDEST(oldestv) \
+ if (struct_v < oldestv) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, oldestv));
+
+/**
+ * start a decoding block
+ *
+ * Declares struct_v and struct_compat, decodes both followed by the __u32
+ * struct_len, and records in struct_end the offset at which the encoded
+ * struct ends.  Throws ::ceph::buffer::malformed_input when the data needs
+ * a newer decoder (v < struct_compat) or when struct_len exceeds the
+ * remaining input.  Opens a `do {` that DECODE_FINISH closes.
+ *
+ * Both throws are fully qualified so the macro does not depend on how the
+ * name `buffer` resolves at the expansion site, and bl is parenthesised
+ * wherever a member is accessed on it.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param bl bufferlist::iterator for the encoded data
+ */
+#define DECODE_START(v, bl) \
+  __u8 struct_v, struct_compat; \
+  using ::ceph::decode; \
+  decode(struct_v, bl); \
+  decode(struct_compat, bl); \
+  if (v < struct_compat) \
+    throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+  __u32 struct_len; \
+  decode(struct_len, bl); \
+  if (struct_len > (bl).get_remaining()) \
+    throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+  unsigned struct_end = (bl).get_off() + struct_len; \
+  do {
+
+/* BEWARE: any change to this macro MUST be also reflected in the duplicative
+ * DECODE_START_LEGACY_COMPAT_LEN!
+ *
+ * Same as DECODE_START_LEGACY_COMPAT_LEN, except that when struct_v predates
+ * compatv and skip_v is non-zero, skip_v further bytes are skipped (after
+ * checking that they are available).  Throws are fully qualified and bl is
+ * parenthesised wherever a member is accessed on it. */
+#define __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, skip_v, bl) \
+  using ::ceph::decode; \
+  __u8 struct_v; \
+  decode(struct_v, bl); \
+  if (struct_v >= compatv) { \
+    __u8 struct_compat; \
+    decode(struct_compat, bl); \
+    if (v < struct_compat) \
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+  } else if (skip_v) { \
+    if ((bl).get_remaining() < skip_v) \
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    (bl).advance(skip_v); \
+  } \
+  unsigned struct_end = 0; \
+  if (struct_v >= lenv) { \
+    __u32 struct_len; \
+    decode(struct_len, bl); \
+    if (struct_len > (bl).get_remaining()) \
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    struct_end = (bl).get_off() + struct_len; \
+  } \
+  do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * The old encoding schemes had a __u8 struct_v only, or lacked either
+ * the compat version or length.  Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * struct_end stays 0 when the encoding has no length wrapper, which makes
+ * DECODE_FINISH skip its end-of-struct checks.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+
+/* BEWARE: this is duplication of __DECODE_START_LEGACY_COMPAT_LEN which
+ * MUST be changed altogether. For the rationale behind code duplication,
+ * please `git blame` and refer to the commit message. */
+#define DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, bl) \
+  using ::ceph::decode; \
+  __u8 struct_v; \
+  decode(struct_v, bl); \
+  if (struct_v >= compatv) { \
+    __u8 struct_compat; \
+    decode(struct_compat, bl); \
+    if (v < struct_compat) \
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION( \
+        __PRETTY_FUNCTION__, v, struct_compat)); \
+  } \
+  unsigned struct_end = 0; \
+  if (struct_v >= lenv) { \
+    __u32 struct_len; \
+    decode(struct_len, bl); \
+    if (struct_len > (bl).get_remaining()) \
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    struct_end = (bl).get_off() + struct_len; \
+  } \
+  do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * The _32 version of the macro assumes the legacy encoding had a 32 bit
+ * version: after the __u8 struct_v it skips 3 more bytes.  The _16 version
+ * skips 1 more byte instead.  In both cases the skip only happens when
+ * struct_v is older than compatv.
+ *
+ * The old encoding schemes had a __u8 struct_v only, or lacked either
+ * the compat version or length. Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+#define DECODE_START_LEGACY_COMPAT_LEN_32(v, compatv, lenv, bl) \
+ __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 3u, bl)
+
+#define DECODE_START_LEGACY_COMPAT_LEN_16(v, compatv, lenv, bl) \
+ __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 1u, bl)
+
+/**
+ * finish decode block
+ *
+ * Closes the `do {` opened by a DECODE_START* macro.  When struct_end is
+ * non-zero: throws ::ceph::buffer::malformed_input if the iterator has
+ * moved past struct_end, otherwise advances it to struct_end so that any
+ * trailing bytes the decoder did not consume are skipped.
+ *
+ * The throw is fully qualified so the macro does not depend on how the name
+ * `buffer` resolves at the expansion site, and bl is parenthesised wherever
+ * a member is accessed on it.
+ *
+ * @param bl bufferlist::iterator we were decoding from
+ */
+#define DECODE_FINISH(bl) \
+  } while (false); \
+  if (struct_end) { \
+    if ((bl).get_off() > struct_end) \
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    if ((bl).get_off() < struct_end) \
+      (bl).advance(struct_end - (bl).get_off()); \
+  }
+
+namespace ceph {
+
+/*
+ * Encoders/decoders to read from current offset in a file handle and
+ * encode/decode the data according to argument types.
+ */
+// Read one length-prefixed string from fd at its current offset.
+// sizeof(__u32) bytes are read into a scratch bufferlist and decoded as the
+// payload length, that many further bytes are read into the same
+// bufferlist, and str is decoded from it.  Returns the scratch bufferlist's
+// final length.  The results of read_fd() are not inspected here.
+inline ssize_t decode_file(int fd, std::string &str)
+{
+  bufferlist staged;
+  __u32 payload_len = 0;
+
+  staged.read_fd(fd, sizeof(payload_len));
+  decode(payload_len, staged);
+
+  staged.read_fd(fd, payload_len);
+  decode(str, staged);
+
+  return staged.length();
+}
+
+// Read one length-prefixed bufferptr from fd at its current offset.
+// sizeof(__u32) bytes are read into a scratch bufferlist and decoded as the
+// payload length, that many further bytes are read into the same
+// bufferlist, and bp is decoded through an iterator positioned at its
+// start.  Returns the scratch bufferlist's final length.  The results of
+// read_fd() are not inspected here.
+inline ssize_t decode_file(int fd, bufferptr &bp)
+{
+  bufferlist staged;
+  __u32 payload_len = 0;
+
+  staged.read_fd(fd, sizeof(payload_len));
+  decode(payload_len, staged);
+
+  staged.read_fd(fd, payload_len);
+  auto cursor = std::cbegin(staged);
+  decode(bp, cursor);
+
+  return staged.length();
+}
+}
+
+#endif
diff --git a/src/include/err.h b/src/include/err.h
new file mode 100644
index 00000000..ba4b32ae
--- /dev/null
+++ b/src/include/err.h
@@ -0,0 +1,29 @@
+#ifndef CEPH_ERR_H
+#define CEPH_ERR_H
+
+/*
+ * adapted from linux 2.6.24 include/linux/err.h
+ *
+ * MAX_ERRNO is the largest errno magnitude treated as an encoded error.
+ * IS_ERR_VALUE(x) is true when x is >= (unsigned long)-MAX_ERRNO, i.e. when
+ * x lies in the top MAX_ERRNO values of the unsigned long range.
+ */
+#define MAX_ERRNO 4095
+#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)
+
+#include <errno.h>
+
+/* this generates a warning in c++; caller can do the cast manually
+static inline void *ERR_PTR(long error)
+{
+ return (void *) error;
+}
+*/
+
+/* Return the pointer's value converted to long; no range check is done. */
+static inline long PTR_ERR(const void *ptr)
+{
+  const long value = (long) ptr;
+  return value;
+}
+
+/* Non-zero when the pointer's value, taken as unsigned long, satisfies
+ * IS_ERR_VALUE, i.e. it encodes an errno in the range 1..MAX_ERRNO. */
+static inline long IS_ERR(const void *ptr)
+{
+  const unsigned long bits = (unsigned long)ptr;
+  return IS_ERR_VALUE(bits);
+}
+
+#endif
diff --git a/src/include/error.h b/src/include/error.h
new file mode 100644
index 00000000..a548d975
--- /dev/null
+++ b/src/include/error.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) /* call syserror() with the file and line of the expansion site */
+
+#define ASSERT(c) \
+ ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) /* expression form: exiterror() is called only when c is false */
+
+/* print usage error message and exit
+ * (fmt is followed by variadic arguments; the role of `use` is not visible
+ * in this header -- presumably the usage text, confirm in the definition) */
+extern void userror(const char *use, const char *fmt, ...);
+
+/* print system error message and exit (fmt is followed by variadic arguments) */
+extern void syserror(const char *fmt, ...);
+
+/* print error message and exit (fmt is followed by variadic arguments) */
+extern void exiterror(const char *fmt, ...);
+
+/* print error message (fmt is followed by variadic arguments) */
+extern void error(const char *fmt, ...);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/src/include/event_type.h b/src/include/event_type.h
new file mode 100644
index 00000000..aa6ddedb
--- /dev/null
+++ b/src/include/event_type.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_EVENT_TYPE_H
+#define CEPH_COMMON_EVENT_TYPE_H
+
+#define EVENT_SOCKET_TYPE_NONE 0 /* NOTE(review): presumably "no notification socket" -- confirm against users */
+#define EVENT_SOCKET_TYPE_PIPE 1 /* NOTE(review): presumably a pipe-based notification -- confirm against users */
+#define EVENT_SOCKET_TYPE_EVENTFD 2 /* NOTE(review): presumably an eventfd-based notification -- confirm against users */
+
+#endif
diff --git a/src/include/filepath.h b/src/include/filepath.h
new file mode 100644
index 00000000..832016ac
--- /dev/null
+++ b/src/include/filepath.h
@@ -0,0 +1,247 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEPATH_H
+#define CEPH_FILEPATH_H
+
+/*
+ * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string.
+ * -> should it be different? how? should this[0] be "", with depth 4?
+ *
+ */
+
+
+#include <iosfwd>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "buffer.h"
+#include "encoding.h"
+#include "include/types.h"
+#include "include/fs_types.h"
+
+#include "common/Formatter.h"
+
+
+/**
+ * filepath -- a path given as a base inode plus a '/'-separated string.
+ *
+ * ino == 0 means purely relative and ino == 1 means absolute; any other
+ * non-zero ino is the inode the string is relative to.  The per-component
+ * vector `bits` is a lazily built cache of the string: every accessor that
+ * needs it goes through ensure_bits() first.
+ */
+class filepath {
+  inodeno_t ino;   // base inode.  ino=0 implies pure relative path.
+  string path;     // relative path.
+
+  /** bits - path segments
+   * this is ['a', 'b', 'c'] for both the absolute and relative case.
+   *
+   * NOTE: this value is LAZILY maintained... i.e. it's a cache
+   */
+  mutable vector<string> bits;
+  bool encoded;    // set by decode(); makes parse_bits() keep empty components
+
+  // regenerate the path string by joining bits with '/'
+  void rebuild_path() {
+    path.clear();
+    for (unsigned i=0; i<bits.size(); i++) {
+      if (i) path += "/";
+      path += bits[i];
+    }
+  }
+  // split the path string on '/' into bits
+  void parse_bits() const {
+    bits.clear();
+    int off = 0;
+    while (off < (int)path.length()) {
+      int nextslash = path.find('/', off);
+      if (nextslash < 0)
+        nextslash = path.length();  // no more slashes
+      if (((nextslash - off) > 0) || encoded) {
+        // skip empty components unless they were introduced deliberately
+        // see commit message for more detail
+        bits.push_back( path.substr(off,nextslash-off) );
+      }
+      off = nextslash+1;
+    }
+  }
+  // build the bits cache if it is empty while the path string is not
+  void ensure_bits() const {
+    if (bits.empty() && path.length() > 0)
+      parse_bits();
+  }
+
+ public:
+  filepath() : ino(0), encoded(false) { }
+  filepath(std::string_view s, inodeno_t i) : ino(i), path(s), encoded(false) { }
+  filepath(const string& s, inodeno_t i) : ino(i), path(s), encoded(false) { }
+  filepath(const char* s, inodeno_t i) : ino(i), path(s), encoded(false) { }
+  filepath(const filepath& o) = default;  // memberwise copy of ino, path, bits, encoded
+  filepath(inodeno_t i) : ino(i), encoded(false) { }
+
+  /*
+   * if we are fed a relative path as a string, either set ino=0 (strictly
+   * relative) or 1 (absolute).  throw out any leading '/'.
+   */
+  filepath(std::string_view s) : encoded(false) {
+    set_path(s);
+  }
+  filepath(const char *s) : encoded(false) {
+    set_path(std::string_view(s));
+  }
+
+  // replace both the path string and the base inode
+  void set_path(std::string_view s, inodeno_t b) {
+    path = s;
+    ino = b;
+    bits.clear();  // drop components cached for the previous path
+  }
+  // replace the path; a leading '/' selects ino=1 and is stripped
+  void set_path(std::string_view s) {
+    // the empty() test keeps s[0] from reading past an empty view
+    if (!s.empty() && s[0] == '/') {
+      path = s.substr(1);
+      ino = 1;
+    } else {
+      ino = 0;
+      path = s;
+    }
+    bits.clear();
+  }
+
+
+  // accessors
+  inodeno_t get_ino() const { return ino; }
+  const string& get_path() const { return path; }
+  const char *c_str() const { return path.c_str(); }
+
+  int length() const { return path.length(); }
+  unsigned depth() const {
+    ensure_bits();
+    return bits.size();
+  }
+  bool empty() const { return path.length() == 0 && ino == 0; }
+
+  bool absolute() const { return ino == 1; }
+  bool pure_relative() const { return ino == 0; }
+  bool ino_relative() const { return ino > 0; }
+
+  // i-th path component; i must be a valid index (not checked)
+  const string& operator[](int i) const {
+    ensure_bits();
+    return bits[i];
+  }
+
+  const string& last_dentry() const {
+    ensure_bits();
+    ceph_assert(!bits.empty());
+    return bits[ bits.size()-1 ];
+  }
+
+  // first s components, relative to the same base inode; s must not
+  // exceed depth() (not checked)
+  filepath prefixpath(int s) const {
+    ensure_bits();  // bits may not have been parsed yet
+    filepath t(ino);
+    for (int i=0; i<s; i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+  // components from index s onwards, as a purely relative path
+  filepath postfixpath(int s) const {
+    ensure_bits();  // bits may not have been parsed yet
+    filepath t;
+    for (unsigned i=s; i<bits.size(); i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+
+
+  // modifiers
+  //  string can be relative "a/b/c" (ino=0) or absolute "/a/b/c" (ino=1)
+  void _set_ino(inodeno_t i) { ino = i; }
+  void clear() {
+    ino = 0;
+    path = "";
+    bits.clear();
+  }
+
+  // remove the last component; the path must have at least one (not checked)
+  void pop_dentry() {
+    ensure_bits();
+    bits.pop_back();
+    rebuild_path();
+  }
+  void push_dentry(std::string_view s) {
+    ensure_bits();
+    if (!bits.empty())
+      path += "/";
+    path += s;
+    bits.emplace_back(s);
+  }
+  void push_dentry(const string& s) {
+    push_dentry(std::string_view(s));
+  }
+  void push_dentry(const char *cs) {
+    push_dentry(std::string_view(cs, strlen(cs)));
+  }
+  void push_front_dentry(const string& s) {
+    // parse first: otherwise rebuild_path() would rebuild the string from
+    // the new component alone and lose the existing path
+    ensure_bits();
+    bits.insert(bits.begin(), s);
+    rebuild_path();
+  }
+  void append(const filepath& a) {
+    ceph_assert(a.pure_relative());
+    for (unsigned i=0; i<a.depth(); i++)
+      push_dentry(a[i]);
+  }
+
+  // encoding
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    __u8 struct_v = 1;
+    encode(struct_v, bl);
+    encode(ino, bl);
+    encode(path, bl);
+  }
+  void decode(bufferlist::const_iterator& blp) {
+    using ceph::decode;
+    bits.clear();
+    __u8 struct_v;
+    decode(struct_v, blp);
+    decode(ino, blp);
+    decode(path, blp);
+    encoded = true;
+  }
+  void dump(Formatter *f) const {
+    f->dump_unsigned("base_ino", ino);
+    f->dump_string("relative_path", path);
+  }
+  static void generate_test_instances(list<filepath*>& o) {
+    o.push_back(new filepath);
+    o.push_back(new filepath("/usr/bin", 0));
+    o.push_back(new filepath("/usr/sbin", 1));
+    o.push_back(new filepath("var/log", 1));
+    o.push_back(new filepath("foo/bar", 101));
+  }
+
+  // true when the path has a last component and it is "." or ".."
+  bool is_last_dot_or_dotdot() const {
+    if (depth() > 0) {
+      const string& dname = last_dentry();
+      if (dname == "." || dname == "..") {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+
+WRITE_CLASS_ENCODER(filepath)
+
+// Print a filepath.  With a non-zero base inode the output is '#' followed
+// by the inode, then '/' if the path string is non-empty, then the path
+// string; with a zero base inode only the path string is written.
+inline ostream& operator<<(ostream& out, const filepath& path)
+{
+  if (!path.get_ino())
+    return out << path.get_path();
+
+  out << '#' << path.get_ino();
+  if (path.length())
+    out << '/';
+  return out << path.get_path();
+}
+
+#endif
diff --git a/src/include/frag.h b/src/include/frag.h
new file mode 100644
index 00000000..5e8b154f
--- /dev/null
+++ b/src/include/frag.h
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FRAG_H
+#define CEPH_FRAG_H
+
+#include <boost/container/small_vector.hpp>
+
+#include <iostream>
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "buffer.h"
+#include "compact_map.h"
+
+#include "ceph_frag.h"
+#include "include/encoding.h"
+#include "include/ceph_assert.h"
+
+#include "common/dout.h"
+
+/*
+ *
+ * the goal here is to use a binary split strategy to partition a namespace.
+ * frag_t represents a particular fragment. bits() tells you the size of the
+ * fragment, and value() its name. this is roughly analogous to an ip address
+ * and netmask.
+ *
+ * fragtree_t represents an entire namespace and its partition. it essentially
+ * tells you where fragments are split into other fragments, and by how much
+ * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
+ *
+ * this vaguely resembles a btree, in that when a fragment becomes large or small
+ * we can split or merge, except that there is no guarantee of being balanced.
+ *
+ * presumably we are partitioning the output of a (perhaps specialized) hash
+ * function.
+ */
+
+/**
+ * frag_t
+ *
+ * description of an individual fragment. that is, a particular piece
+ * of the overall namespace.
+ *
+ * this is conceptually analogous to an ip address and netmask.
+ *
+ * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
+ *
+ * we write it as v/b, where v is a value and b is the number of bits.
+ * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
+ * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
+ *
+ * this makes the right most bit of v the "most significant", which is the
+ * opposite of what we usually see.
+ */
+
+/*
+ * TODO:
+ * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
+ * iteration efficient (see, e.g., try_assimilate_children())
+ * - rework frag_t so that we mask the left-most (most significant) bits instead of
+ * the right-most (least significant) bits. just because it's more intuitive, and
+ * matches the network/netmask concept.
+ */
+
+class frag_t {
+  /*
+   * encoding is dictated by frag_* functions in ceph_fs.h.  use those
+   * helpers _exclusively_.
+   */
+public:
+  using _frag_t = uint32_t;
+
+  // constructors
+  frag_t() = default;
+  frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
+  frag_t(_frag_t e) : _enc(e) { }
+
+  // overwrite the raw encoded value
+  void from_unsigned(unsigned e) { _enc = e; }
+
+  // accessors (each delegates to the matching ceph_frag_* helper)
+  unsigned value() const { return ceph_frag_value(_enc); }
+  unsigned bits() const { return ceph_frag_bits(_enc); }
+  unsigned mask() const { return ceph_frag_mask(_enc); }
+  unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
+
+  operator _frag_t() const { return _enc; }
+
+  // tests
+  bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
+  bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
+  bool is_root() const { return bits() == 0; }
+  frag_t parent() const {
+    ceph_assert(bits() > 0);
+    return frag_t(ceph_frag_parent(_enc));
+  }
+
+  // splitting
+  // i-th child when this frag is split by nb bits; i must be < 2^nb
+  frag_t make_child(int i, int nb) const {
+    ceph_assert(i < (1<<nb));
+    return frag_t(ceph_frag_make_child(_enc, nb, i));
+  }
+  // append all 2^nb children of an nb-bit split to fragments
+  template<typename T>
+  void split(int nb, T& fragments) const {
+    ceph_assert(nb > 0);
+    unsigned nway = 1 << nb;
+    for (unsigned i=0; i<nway; i++)
+      fragments.push_back(make_child(i, nb));
+  }
+
+  // binary splitting
+  frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
+  frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
+
+  bool is_left() const { return ceph_frag_is_left_child(_enc); }
+  bool is_right() const { return ceph_frag_is_right_child(_enc); }
+  frag_t get_sibling() const {
+    ceph_assert(!is_root());
+    return frag_t(ceph_frag_sibling(_enc));
+  }
+
+  // sequencing
+  bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
+  bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
+  frag_t next() const {
+    ceph_assert(!is_rightmost());
+    return frag_t(ceph_frag_next(_enc));
+  }
+
+  // parse
+  // parse "<hex value>/<decimal bits>"; on success replaces *this and
+  // returns true, otherwise leaves *this untouched and returns false.
+  bool parse(const char *s) {
+    unsigned pvalue;  // %x stores through an unsigned int*
+    int pbits;
+    int r = sscanf(s, "%x/%d", &pvalue, &pbits);
+    if (r == 2) {
+      *this = frag_t(pvalue, pbits);
+      return true;
+    }
+    return false;
+  }
+
+  void encode(bufferlist& bl) const {
+    encode_raw(_enc, bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    __u32 v;
+    decode_raw(v, p);
+    _enc = v;
+  }
+
+private:
+  _frag_t _enc = 0;
+};
+
+// Print a frag_t as one '0'/'1' character per bit of the fragment, taken
+// from bit 23 of value() downwards, followed by '*'.  The root fragment
+// (bits() == 0) prints as just "*".
+inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
+{
+  //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
+  unsigned remaining = hb.bits();
+  if (remaining == 0)
+    return out << '*';
+
+  unsigned val = hb.value();
+  unsigned bit = 23;
+  while (remaining) {
+    out << ((val & (1<<bit)) ? '1':'0');
+    remaining--;
+    bit--;
+  }
+  return out << '*';
+}
+
+// free-function encoder: forwards to frag_t::encode
+inline void encode(const frag_t &f, bufferlist& bl)
+{
+  f.encode(bl);
+}
+// free-function decoder: forwards to frag_t::decode
+inline void decode(frag_t &f, bufferlist::const_iterator& p)
+{
+  f.decode(p);
+}
+
+using frag_vec_t = boost::container::small_vector<frag_t, 4>; // vector of frags with inline storage for 4 elements; used as the work stack in tree walks
+
+/**
+ * fragtree_t -- partition an entire namespace into one or more frag_t's.
+ *
+ * The tree is stored as a map from a split frag to the number of bits it is
+ * split by; a frag without an entry is not split.
+ */
+class fragtree_t {
+  // pairs <f, b>:
+  //  frag_t f is split by b bits.
+  //  if child frag_t does not appear, it is not split.
+public:
+  compact_map<frag_t,int32_t> _splits;
+
+public:
+  // -------------
+  // basics
+  void swap(fragtree_t& other) {
+    _splits.swap(other._splits);
+  }
+  void clear() {
+    _splits.clear();
+  }
+
+  // -------------
+  // accessors
+  bool empty() const {
+    return _splits.empty();
+  }
+  // number of bits hb is split by, or 0 if hb has no split entry
+  int get_split(const frag_t hb) const {
+    compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
+    if (p == _splits.end())
+      return 0;
+    else
+      return p->second;
+  }
+
+
+  // true when x itself is the only leaf found under x
+  bool is_leaf(frag_t x) const {
+    frag_vec_t s;
+    get_leaves_under(x, s);
+    //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
+    return s.size() == 1 && s.front() == x;
+  }
+
+  /**
+   * get_leaves -- list all leaves
+   */
+  template<typename T>
+  void get_leaves(T& c) const {
+    get_leaves_under_split(frag_t(), c);
+  }
+
+  /**
+   * get_leaves_under_split -- list all leaves under a known split point (or root)
+   */
+  template<typename T>
+  void get_leaves_under_split(frag_t under, T& c) const {
+    frag_vec_t s;
+    s.push_back(under);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb)
+        t.split(nb, s);   // queue up children
+      else
+        c.push_back(t);   // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * get_branch -- get branch point at OR above frag @a x
+   *  - may be @a x itself, if @a x is a split
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      if (get_split(x)) return x;   // found it!
+      x = x.parent();
+    }
+  }
+
+  /**
+   * get_branch_above -- get a branch point above frag @a x
+   *  - may be root (frag_t())
+   *  - may NOT be @a x, even if @a x is a split.
+   */
+  frag_t get_branch_above(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      x = x.parent();
+      if (get_split(x)) return x;   // found it!
+    }
+  }
+
+
+  /**
+   * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
+   *  - may be @a x itself, if @a x is a split or leaf
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch_or_leaf(frag_t x) const {
+    frag_t branch = get_branch(x);
+    int nb = get_split(branch);
+    if (nb > 0 &&                                  // if branch is a split, and
+        branch.bits() + nb <= x.bits())            // one of the children is or contains x
+      return frag_t(x.value(), branch.bits()+nb);  // then return that child (it's a leaf)
+    else
+      return branch;
+  }
+
+  /**
+   * get_leaves_under(x, ls) -- search for any leaves fully contained by x
+   */
+  template<typename T>
+  void get_leaves_under(frag_t x, T& c) const {
+    frag_vec_t s;
+    s.push_back(get_branch_or_leaf(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&  // if t is more specific than x, and
+          !x.contains(t))          // x does not contain t,
+        continue;                  // then skip
+      int nb = get_split(t);
+      if (nb)
+        t.split(nb, s);            // queue up children
+      else if (x.contains(t))
+        c.push_back(t);            // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * contains(fg) -- does fragtree contain the specific frag @a x
+   */
+  bool contains(frag_t x) const {
+    frag_vec_t s;
+    s.push_back(get_branch(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&  // if t is more specific than x, and
+          !x.contains(t))          // x does not contain t,
+        continue;                  // then skip
+      int nb = get_split(t);
+      if (nb) {
+        if (t == x) return false;  // it's split.
+        t.split(nb, s);            // queue up children
+      } else {
+        if (t == x) return true;   // it's there.
+      }
+    }
+    return false;
+  }
+
+  /**
+   * operator[] -- map a (hash?) value to a frag
+   */
+  frag_t operator[](unsigned v) const {
+    frag_t t;
+    while (1) {
+      ceph_assert(t.contains(v));
+      int nb = get_split(t);
+
+      // is this a leaf?
+      if (nb == 0) return t;  // done.
+
+      // pick appropriate child fragment.
+      unsigned nway = 1 << nb;
+      unsigned i;
+      for (i=0; i<nway; i++) {
+        frag_t n = t.make_child(i, nb);
+        if (n.contains(v)) {
+          t = n;
+          break;
+        }
+      }
+      ceph_assert(i < nway);
+    }
+  }
+
+
+  // ---------------
+  // modifiers
+  // split leaf x by b bits
+  void split(frag_t x, int b, bool simplify=true) {
+    ceph_assert(is_leaf(x));
+    _splits[x] = b;
+
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+  // undo a b-bit split of x
+  void merge(frag_t x, int b, bool simplify=true) {
+    ceph_assert(!is_leaf(x));
+    // look up without inserting: get_split() yields 0 for a missing entry
+    ceph_assert(get_split(x) == b);
+    _splits.erase(x);
+
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+
+  /*
+   * if all of a given split's children are identically split,
+   * then the children can be assimilated.
+   */
+  void try_assimilate_children(frag_t x) {
+    int nb = get_split(x);
+    if (!nb) return;
+    frag_vec_t children;
+    x.split(nb, children);
+    int childbits = 0;
+    for (auto& frag : children) {
+      int cb = get_split(frag);
+      if (!cb) return;  // nope.
+      if (childbits && cb != childbits) return;  // not the same
+      childbits = cb;
+    }
+    // all children are split with childbits!
+    for (auto& frag : children)
+      _splits.erase(frag);
+    _splits[x] += childbits;
+  }
+
+  // reshape the tree so that x becomes a leaf; returns false when x already
+  // was a leaf and true after the tree has been changed.
+  bool force_to_leaf(CephContext *cct, frag_t x) {
+    if (is_leaf(x))
+      return false;
+
+    lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
+
+    frag_t parent = get_branch_or_leaf(x);
+    ceph_assert(parent.bits() <= x.bits());
+    lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
+
+    // do we need to split from parent to x?
+    if (parent.bits() < x.bits()) {
+      int spread = x.bits() - parent.bits();
+      int nb = get_split(parent);
+      lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
+      if (nb == 0) {
+        // easy: split parent (a leaf) by the difference
+        lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
+        split(parent, spread);
+        ceph_assert(is_leaf(x));
+        return true;
+      }
+      ceph_assert(nb > spread);
+
+      // add an intermediary split
+      merge(parent, nb, false);
+      split(parent, spread, false);
+
+      frag_vec_t subs;
+      parent.split(spread, subs);
+      for (auto& frag : subs) {
+        lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl;
+        split(frag, nb - spread, false);
+      }
+    }
+
+    // x is now a leaf or split.
+    // hoover up any children.
+    frag_vec_t s;
+    s.push_back(x);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb) {
+        lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
+        merge(t, nb, false);  // merge this point, and
+        t.split(nb, s);       // queue up children
+      }
+    }
+
+    lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
+    ceph_assert(is_leaf(x));
+    return true;
+  }
+
+  // encoding
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(_splits, bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    decode(_splits, p);
+  }
+  // write the <frag, bits> pairs without a leading count
+  void encode_nohead(bufferlist& bl) const {
+    using ceph::encode;
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+         p != _splits.end();
+         ++p) {
+      encode(p->first, bl);
+      encode(p->second, bl);
+    }
+  }
+  // read n <frag, bits> pairs; n <= 0 leaves the tree empty
+  void decode_nohead(int n, bufferlist::const_iterator& p) {
+    using ceph::decode;
+    _splits.clear();
+    while (n-- > 0) {
+      frag_t f;
+      decode(f, p);
+      decode(_splits[f], p);
+    }
+  }
+
+  // multi-line dump of the tree; const so it also works on const trees
+  void print(std::ostream& out) const {
+    out << "fragtree_t(";
+    frag_vec_t s;
+    s.push_back(frag_t());
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      // newline + indent?
+      if (t.bits()) {
+        out << std::endl;
+        for (unsigned i=0; i<t.bits(); i++) out << ' ';
+      }
+      int nb = get_split(t);
+      if (nb) {
+        out << t << " %" << nb;
+        t.split(nb, s);   // queue up children
+      } else {
+        out << t;
+      }
+    }
+    out << ")";
+  }
+
+  void dump(Formatter *f) const {
+    f->open_array_section("splits");
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+         p != _splits.end();
+         ++p) {
+      f->open_object_section("split");
+      std::ostringstream frag_str;
+      frag_str << p->first;
+      f->dump_string("frag", frag_str.str());
+      f->dump_int("children", p->second);
+      f->close_section(); // split
+    }
+    f->close_section(); // splits
+  }
+};
+WRITE_CLASS_ENCODER(fragtree_t)
+
+// two fragtrees are equal when their split maps compare equal
+inline bool operator==(const fragtree_t& l, const fragtree_t& r)
+{
+  const bool same = (l._splits == r._splits);
+  return same;
+}
+// two fragtrees differ when their split maps compare unequal
+inline bool operator!=(const fragtree_t& l, const fragtree_t& r)
+{
+  const bool differ = (l._splits != r._splits);
+  return differ;
+}
+
+// One-line form of a fragtree: "fragtree_t(" then the space-separated
+// "<frag>^<bits>" entries of the split map, then ")".
+inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
+{
+  out << "fragtree_t(";
+
+  bool first = true;
+  for (auto p = ft._splits.begin(); p != ft._splits.end(); ++p) {
+    if (!first)
+      out << " ";
+    first = false;
+    out << p->first << "^" << p->second;
+  }
+  return out << ")";
+}
+
+
+/**
+ * fragset_t -- a set of fragments
+ *
+ * insert() keeps the set simplified: whenever a frag and its sibling are
+ * both present they are replaced by their parent.
+ */
+class fragset_t {
+  std::set<frag_t> _set;
+
+public:
+  const std::set<frag_t> &get() const { return _set; }
+  std::set<frag_t>::iterator begin() { return _set.begin(); }
+  std::set<frag_t>::iterator end() { return _set.end(); }
+
+  bool empty() const { return _set.empty(); }
+
+  // true when f or any of its ancestors is in the set
+  bool contains(frag_t f) const {
+    for (;;) {
+      if (_set.count(f))
+        return true;
+      if (f.bits() == 0)
+        return false;
+      f = f.parent();
+    }
+  }
+
+  void insert(frag_t f) {
+    _set.insert(f);
+    simplify();
+  }
+
+  // repeat full passes over the set until a pass merges nothing
+  void simplify() {
+    bool merged;
+    do {
+      merged = false;
+      std::set<frag_t>::iterator cur = _set.begin();
+      while (cur != _set.end()) {
+        if (cur->is_root() || !_set.count(cur->get_sibling())) {
+          cur++;
+          continue;
+        }
+        // sibling pair found: replace both with the parent
+        _set.erase(cur->get_sibling());
+        _set.insert(cur->parent());
+        _set.erase(cur++);
+        merged = true;
+      }
+    } while (merged);
+  }
+};
+
+// print a fragset as "fragset_t(" + the underlying std::set + ")"
+inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
+{
+  out << "fragset_t(" << fs.get() << ")";
+  return out;
+}
+
+#endif
diff --git a/src/include/fs_types.h b/src/include/fs_types.h
new file mode 100644
index 00000000..2132db9a
--- /dev/null
+++ b/src/include/fs_types.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_INCLUDE_FS_TYPES_H
+#define CEPH_INCLUDE_FS_TYPES_H
+
+#include "types.h"
+
+// --------------------------------------
+// ino
+
+typedef uint64_t _inodeno_t;
+
+// inodeno_t -- thin wrapper around a 64-bit inode number.
+//
+// Converts implicitly from and to the underlying integer, so it can be
+// used wherever a plain number is expected.
+struct inodeno_t {
+  _inodeno_t val;
+  inodeno_t() : val(0) {}
+  // cppcheck-suppress noExplicitConstructor
+  inodeno_t(_inodeno_t v) : val(v) {}
+  // Compound assignment returns a reference to *this (not a copy), like
+  // the built-in operator, so chained use acts on this object.
+  inodeno_t& operator+=(inodeno_t o) { val += o.val; return *this; }
+  operator _inodeno_t() const { return val; }
+
+  // Encode only the raw value.
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(val, bl);
+  }
+  // Decode the raw value written by encode().
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    decode(val, p);
+  }
+} __attribute__ ((__may_alias__));
+WRITE_CLASS_ENCODER(inodeno_t)
+
+// denc support for inodeno_t: every operation simply forwards to denc()
+// on the wrapped integer value.
+template<>
+struct denc_traits<inodeno_t> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = true;
+
+  static void bound_encode(const inodeno_t &ino, size_t& len) {
+    denc(ino.val, len);
+  }
+  static void encode(const inodeno_t &ino,
+                     buffer::list::contiguous_appender& app) {
+    denc(ino.val, app);
+  }
+  static void decode(inodeno_t& ino, buffer::ptr::const_iterator &it) {
+    denc(ino.val, it);
+  }
+};
+
+// Print an inode number as "0x" followed by its value in hex.  The stream
+// is switched back to decimal afterwards, whatever base it used before.
+inline ostream& operator<<(ostream& out, const inodeno_t& ino) {
+  out << hex << "0x" << ino.val;
+  out << dec;
+  return out;
+}
+
+namespace std {
+  // Hash an inodeno_t by feeding its numeric value to rjhash<uint64_t>.
+  template<> struct hash< inodeno_t >
+  {
+    size_t operator()( const inodeno_t& x ) const
+    {
+      return rjhash<uint64_t>()(x.val);
+    }
+  };
+} // namespace std
+
+
+// file modes
+
+// A file mode is read-only when the CEPH_FILE_MODE_WR bit is not set.
+inline bool file_mode_is_readonly(int mode) {
+  const int write_bits = mode & CEPH_FILE_MODE_WR;
+  return write_bits == 0;
+}
+
+
+// dentries
+#define MAX_DENTRY_LEN 255
+
+// --
+namespace ceph {
+ class Formatter;
+}
+void dump(const ceph_file_layout& l, ceph::Formatter *f);
+void dump(const ceph_dir_layout& l, ceph::Formatter *f);
+
+
+
+// file_layout_t
+
+struct file_layout_t {
+  // file -> object mapping
+  uint32_t stripe_unit;   ///< stripe unit, in bytes,
+  uint32_t stripe_count;  ///< over this many objects
+  uint32_t object_size;   ///< until objects are this big
+
+  int64_t pool_id;        ///< rados pool id
+  string pool_ns;         ///< rados pool namespace
+
+  /// Build a layout from its striping parameters; the pool id starts out
+  /// as -1 and the pool namespace is default-constructed.
+  file_layout_t(uint32_t su=0, uint32_t sc=0, uint32_t os=0)
+    : stripe_unit(su), stripe_count(sc), object_size(os), pool_id(-1)
+  {}
+
+  /// Layout with a 1<<22 byte stripe unit, one stripe, 1<<22 byte objects.
+  static file_layout_t get_default() {
+    const uint32_t default_bytes = 1<<22;
+    return file_layout_t(default_bytes, 1, default_bytes);
+  }
+
+  /// stripe_count * object_size, computed in 64 bits.
+  uint64_t get_period() const {
+    uint64_t period = stripe_count;
+    period *= object_size;
+    return period;
+  }
+
+  void from_legacy(const ceph_file_layout& fl);
+  void to_legacy(ceph_file_layout *fl) const;
+
+  bool is_valid() const;
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<file_layout_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(file_layout_t)
+
+WRITE_EQ_OPERATORS_5(file_layout_t, stripe_unit, stripe_count, object_size, pool_id, pool_ns);
+
+ostream& operator<<(ostream& out, const file_layout_t &layout);
+
+#endif
diff --git a/src/include/hash.h b/src/include/hash.h
new file mode 100644
index 00000000..2ab95448
--- /dev/null
+++ b/src/include/hash.h
@@ -0,0 +1,64 @@
+#ifndef CEPH_HASH_H
+#define CEPH_HASH_H
+
+#include "acconfig.h"
+
+// Robert Jenkins' function for mixing 32-bit values, see
+// http://burtleburtle.net/bob/hash/evahash.html -- a, b = random bits,
+// c = input and output; expands to bare statements assigning a, b and c
+
+#define hashmix(a,b,c) \
+ a=a-b; a=a-c; a=a^(c>>13); \
+ b=b-c; b=b-a; b=b^(a<<8); \
+ c=c-a; c=c-b; c=c^(b>>13); \
+ a=a-b; a=a-c; a=a^(c>>12); \
+ b=b-c; b=b-a; b=b^(a<<16); \
+ c=c-a; c=c-b; c=c^(b>>5); \
+ a=a-b; a=a-c; a=a^(c>>3); \
+ b=b-c; b=b-a; b=b^(a<<10); \
+ c=c-a; c=c-b; c=c^(b>>15);
+
+
+//namespace ceph {
+
+template <class _Key> struct rjhash { };
+
+// 64-bit integer mixing function: scrambles key through a fixed sequence
+// of shift/add/xor steps (all arithmetic wraps modulo 2^64).
+inline uint64_t rjhash64(uint64_t key) {
+  key = ~key + (key << 21);        // same as (key << 21) - key - 1
+  key ^= key >> 24;
+  key += (key << 3) + (key << 8);  // key * 265
+  key ^= key >> 14;
+  key += (key << 2) + (key << 4);  // key * 21
+  key ^= key >> 28;
+  key += key << 31;
+  return key;
+}
+
+// 32-bit integer mixing function: six add/xor/shift rounds with fixed
+// constants (all arithmetic wraps modulo 2^32).
+inline uint32_t rjhash32(uint32_t a) {
+  a += 0x7ed55d16 + (a << 12);
+  a ^= 0xc761c23c ^ (a >> 19);
+  a += 0x165667b1 + (a << 5);
+  a = (a + 0xd3a2646c) ^ (a << 9);
+  a += 0xfd7046c5 + (a << 3);
+  a ^= 0xb55a4f09 ^ (a >> 16);
+  return a;
+}
+
+
+// rjhash functor for 32-bit keys: forwards to rjhash32().
+template<> struct rjhash<uint32_t> {
+  inline size_t operator()(const uint32_t x) const {
+    const uint32_t mixed = rjhash32(x);
+    return mixed;
+  }
+};
+
+// rjhash functor for 64-bit keys: forwards to rjhash64().
+template<> struct rjhash<uint64_t> {
+  inline size_t operator()(const uint64_t x) const {
+    const uint64_t mixed = rjhash64(x);
+    return mixed;
+  }
+};
+
+//}
+
+
+
+#endif
diff --git a/src/include/health.h b/src/include/health.h
new file mode 100644
index 00000000..5c00225e
--- /dev/null
+++ b/src/include/health.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "include/encoding.h"
+
+// health_status_t
+enum health_status_t {
+ HEALTH_ERR = 0,
+ HEALTH_WARN = 1,
+ HEALTH_OK = 2,
+};
+
+// Encode a health status as a single byte holding its enum value.
+inline void encode(health_status_t hs, bufferlist& bl) {
+  using ceph::encode;
+  uint8_t raw = hs;
+  encode(raw, bl);
+}
+// Decode one byte and convert it back to a health_status_t.
+inline void decode(health_status_t& hs, bufferlist::const_iterator& p) {
+  using ceph::decode;
+  uint8_t raw;
+  decode(raw, p);
+  hs = health_status_t(raw);
+}
+// denc support for health_status_t: the status travels as a single byte.
+template<>
+struct denc_traits<health_status_t> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  // Encoded size is always one byte.  The first parameter is the value to
+  // be encoded, the same type encode()/decode() below operate on (it was
+  // previously declared as a bufferptr).
+  static void bound_encode(const health_status_t& v, size_t& p, uint64_t f=0) {
+    p++;
+  }
+  // Write the enum value as a uint8_t.
+  static void encode(const health_status_t& v,
+                     buffer::list::contiguous_appender& p,
+                     uint64_t f=0) {
+    ::denc((uint8_t)v, p);
+  }
+  // Read a uint8_t from a contiguous buffer and convert it to the enum.
+  static void decode(health_status_t& v, buffer::ptr::const_iterator& p,
+                     uint64_t f=0) {
+    uint8_t tmp;
+    ::denc(tmp, p);
+    v = health_status_t(tmp);
+  }
+  // Same as above, reading through a bufferlist iterator.
+  static void decode(health_status_t& v, buffer::list::const_iterator& p,
+                     uint64_t f=0) {
+    uint8_t tmp;
+    ::denc(tmp, p);
+    v = health_status_t(tmp);
+  }
+};
+
+// Print the symbolic name of a health status.  Values other than the
+// three enumerators print nothing.
+inline std::ostream& operator<<(std::ostream &oss, const health_status_t status) {
+  const char *name = nullptr;
+  switch (status) {
+  case HEALTH_ERR:
+    name = "HEALTH_ERR";
+    break;
+  case HEALTH_WARN:
+    name = "HEALTH_WARN";
+    break;
+  case HEALTH_OK:
+    name = "HEALTH_OK";
+    break;
+  }
+  if (name)
+    oss << name;
+  return oss;
+}
diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h
new file mode 100644
index 00000000..48d88976
--- /dev/null
+++ b/src/include/inline_memory.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_INLINE_MEMORY_H
+#define CEPH_INLINE_MEMORY_H
+
+#if defined(__GNUC__)
+
+// optimize for the common case, which is very small copies
+//
+// Copies l bytes from src to dest and returns dest.  Lengths above
+// inline_len go straight to memcpy(); lengths of 1, 2, 3, 4 and 8 use one
+// fixed-size __builtin_memcpy(); any other length is copied in 8-byte,
+// then 4-byte, then single-byte pieces.
+static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+                                        size_t inline_len)
+  __attribute__((always_inline));
+
+void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+                          size_t inline_len)
+{
+  if (l > inline_len) {
+    return memcpy(dest, src, l);
+  }
+
+  // sizes with a dedicated fixed-length copy
+  switch (l) {
+  case 8:
+    return __builtin_memcpy(dest, src, 8);
+  case 4:
+    return __builtin_memcpy(dest, src, 4);
+  case 3:
+    return __builtin_memcpy(dest, src, 3);
+  case 2:
+    return __builtin_memcpy(dest, src, 2);
+  case 1:
+    return __builtin_memcpy(dest, src, 1);
+  default:
+    break;
+  }
+
+  // everything else: widest pieces first, then the tail
+  char *out = (char*)dest;
+  const char *in = (const char*)src;
+  for (; l >= sizeof(uint64_t); l -= sizeof(uint64_t)) {
+    __builtin_memcpy(out, in, sizeof(uint64_t));
+    out += sizeof(uint64_t);
+    in += sizeof(uint64_t);
+  }
+  for (; l >= sizeof(uint32_t); l -= sizeof(uint32_t)) {
+    __builtin_memcpy(out, in, sizeof(uint32_t));
+    out += sizeof(uint32_t);
+    in += sizeof(uint32_t);
+  }
+  for (; l > 0; --l) {
+    *out++ = *in++;
+  }
+  return dest;
+}
+
+#else
+
+#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l)
+
+#endif
+
+
+#if defined(__GNUC__) && defined(__x86_64__)
+
+namespace ceph {
+typedef unsigned uint128_t __attribute__ ((mode (TI)));
+}
+using ceph::uint128_t;
+
+// Return true if all len bytes starting at data are zero (trivially true
+// for len == 0).
+static inline bool mem_is_zero(const char *data, size_t len)
+  __attribute__((always_inline));
+
+bool mem_is_zero(const char *data, size_t len)
+{
+  // we do have XMM registers in x86-64, so if we need to check at least
+  // 16 bytes, make use of them
+  if (len >= sizeof(uint128_t)) {
+    // align data pointer to 16 bytes, otherwise it'll segfault due to bug
+    // in (at least some) GCC versions (using MOVAPS instead of MOVUPS).
+    // check up to 15 first bytes while at it.
+    for (; ((unsigned long long)data) & 15; ++data, --len) {
+      if (*(uint8_t*)data != 0) {
+        return false;
+      }
+    }
+
+    // whole 16-byte words that remain after alignment
+    const size_t wide_len = (len / sizeof(uint128_t)) * sizeof(uint128_t);
+    for (const char *wide_end = data + wide_len; data < wide_end;
+         data += sizeof(uint128_t)) {
+      if (*(uint128_t*)data != 0) {
+        return false;
+      }
+    }
+    len -= wide_len;
+  }
+
+  // leftovers: four bytes at a time, then byte by byte
+  const char *end = data + len;
+  const char *end32 = data + (len / sizeof(uint32_t)) * sizeof(uint32_t);
+  for (; data < end32; data += sizeof(uint32_t)) {
+    if (*(uint32_t*)data != 0) {
+      return false;
+    }
+  }
+  for (; data < end; data += sizeof(uint8_t)) {
+    if (*(uint8_t*)data != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#else // gcc and x86_64
+
+// Return true if all len bytes starting at data are zero (trivially true
+// for len == 0).  data may have any alignment.
+static inline bool mem_is_zero(const char *data, size_t len) {
+  const char *end = data + len;
+  const char* end64 = data + (len / sizeof(uint64_t))*sizeof(uint64_t);
+
+  // Eight bytes at a time.  Each word is loaded with memcpy() rather than
+  // by dereferencing a casted pointer: data is not necessarily 8-byte
+  // aligned, and this branch is the one built for non-x86 targets, where
+  // a misaligned load may fault.
+  while (data < end64) {
+    uint64_t word;
+    memcpy(&word, data, sizeof(word));
+    if (word != 0) {
+      return false;
+    }
+    data += sizeof(word);
+  }
+
+  // remaining tail, byte by byte
+  while (data < end) {
+    if (*data != 0) {
+      return false;
+    }
+    ++data;
+  }
+  return true;
+}
+
+#endif // !x86_64
+
+#endif
diff --git a/src/include/int_types.h b/src/include/int_types.h
new file mode 100644
index 00000000..56b2723f
--- /dev/null
+++ b/src/include/int_types.h
@@ -0,0 +1,65 @@
+#ifndef CEPH_INTTYPES_H
+#define CEPH_INTTYPES_H
+
+#include "acconfig.h"
+
+#include <inttypes.h>
+
+#ifdef HAVE_LINUX_TYPES_H
+#include <linux/types.h>
+#else
+#ifndef HAVE___U8
+typedef uint8_t __u8;
+#endif
+
+#ifndef HAVE___S8
+typedef int8_t __s8;
+#endif
+
+#ifndef HAVE___U16
+typedef uint16_t __u16;
+#endif
+
+#ifndef HAVE___S16
+typedef int16_t __s16;
+#endif
+
+#ifndef HAVE___U32
+typedef uint32_t __u32;
+#endif
+
+#ifndef HAVE___S32
+typedef int32_t __s32;
+#endif
+
+#ifndef HAVE___U64
+typedef uint64_t __u64;
+#endif
+
+#ifndef HAVE___S64
+typedef int64_t __s64;
+#endif
+#endif /* LINUX_TYPES_H */
+
+#define __bitwise__
+
+typedef __u16 __bitwise__ __le16;
+typedef __u16 __bitwise__ __be16;
+typedef __u32 __bitwise__ __le32;
+typedef __u32 __bitwise__ __be32;
+typedef __u64 __bitwise__ __le64;
+typedef __u64 __bitwise__ __be64;
+
+#ifndef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#endif
+
+#ifndef BOOST_MPL_LIMIT_VECTOR_SIZE
+#define BOOST_MPL_LIMIT_VECTOR_SIZE 30 // or whatever you need
+#endif
+
+#ifndef BOOST_MPL_LIMIT_MAP_SIZE
+#define BOOST_MPL_LIMIT_MAP_SIZE 30 // or whatever you need
+#endif
+
+#endif
diff --git a/src/include/intarith.h b/src/include/intarith.h
new file mode 100644
index 00000000..e912cbe7
--- /dev/null
+++ b/src/include/intarith.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INTARITH_H
+#define CEPH_INTARITH_H
+
+#include <type_traits>
+
+// Divide n by d rounding up, computed as (n + d - 1) / d.
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> div_round_up(T n, U d) {
+  const auto biased = n + d - 1;
+  return biased / d;
+}
+
+
+// Round n up to the next multiple of d; n is returned unchanged when it
+// already is a multiple.
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> round_up_to(T n, U d) {
+  const auto rem = n % d;
+  if (rem) {
+    return n + d - rem;
+  }
+  return n;
+}
+
+// Divide x by 2^y, rounding up: (x + 2^y - 1) >> y.
+//
+// The power of two is built in T's own type rather than as the int
+// expression `1 << y`, which overflows for y >= 31 even when x is a
+// 64-bit value.
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> shift_round_up(T x, U y) {
+  return (x + (static_cast<T>(1) << y) - 1) >> y;
+}
+
+/*
+ * Wrapper to determine if value is a power of 2.
+ * Note that 0 is reported as a power of 2 as well.
+ */
+template<typename T>
+constexpr inline bool isp2(T x) {
+  return !(x & (x - 1));
+}
+
+/*
+ * Wrappers for various sorts of alignment and rounding. The "align" must
+ * be a power of 2. Often times it is a block, sector, or page.
+ */
+
+/*
+ * return x rounded down to an align boundary, by clearing the bits of x
+ * below align
+ * eg, p2align(1200, 1024) == 1024 (1*align)
+ * eg, p2align(1024, 1024) == 1024 (1*align)
+ * eg, p2align(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, p2align(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2align(T x, T align) {
+  // ~(align - 1) is the same bit pattern as -align
+  return x & ~(align - 1);
+}
+
+/*
+ * return x % (mod) align, i.e. the offset of x within its align block
+ * eg, p2phase(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, p2phase(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+template<typename T>
+constexpr inline T p2phase(T x, T align) {
+  const auto low_bits = align - 1;
+  return x & low_bits;
+}
+
+/*
+ * return how much space is left in this block (but if it's perfectly
+ * aligned, return 0).
+ * eg, p2nphase(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, p2nphase(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+template<typename T>
+constexpr inline T p2nphase(T x, T align) {
+  const auto negated = -x;
+  return negated & (align - 1);
+}
+
+/*
+ * return x rounded up to an align boundary
+ * eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, p2roundup(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2roundup(T x, T align) {
+  // round -x down to a boundary, then negate back
+  const auto neg_floor = -x & -align;
+  return -neg_floor;
+}
+
+// count trailing zeros.
+// NOTE: the builtin is nondeterministic on 0 input, so 0 is answered
+// explicitly with the bit width of T.
+//
+// Three overloads, selected by sizeof(T), dispatch to the builtin for
+// unsigned int, unsigned long and unsigned long long respectively.
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned> ctz(T v) {
+  return v == 0 ? static_cast<unsigned>(sizeof(v) * 8)
+                : static_cast<unsigned>(__builtin_ctz(v));
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned> ctz(T v) {
+  return v == 0 ? static_cast<unsigned>(sizeof(v) * 8)
+                : static_cast<unsigned>(__builtin_ctzl(v));
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned> ctz(T v) {
+  return v == 0 ? static_cast<unsigned>(sizeof(v) * 8)
+                : static_cast<unsigned>(__builtin_ctzll(v));
+}
+
+// count leading zeros
+// NOTE: the builtin is nondeterministic on 0 input, so 0 is answered
+// explicitly with the bit width of T.
+//
+// Three overloads, selected by sizeof(T), dispatch to the builtin for
+// unsigned int, unsigned long and unsigned long long respectively.
+template<class T>
+  inline typename std::enable_if<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned>::type clz(T v) {
+  if (v == 0)
+    return sizeof(v) * 8;
+  // __builtin_clz() works on a full unsigned int.  For a narrower T keep
+  // only T's own bits (dropping any sign extension from the promotion)
+  // and subtract the extra high-order bits the builtin counts, so that
+  // the result is relative to the width of T, like the zero case above.
+  constexpr unsigned extra = (sizeof(unsigned) - sizeof(T)) * 8;
+  const unsigned bits = static_cast<unsigned>(v) & (~0u >> extra);
+  return __builtin_clz(bits) - extra;
+}
+
+template<class T>
+  inline typename std::enable_if<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned>::type clz(T v) {
+  if (v == 0)
+    return sizeof(v) * 8;
+  return __builtin_clzl(v);
+}
+
+template<class T>
+  inline typename std::enable_if<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned>::type clz(T v) {
+  if (v == 0)
+    return sizeof(v) * 8;
+  return __builtin_clzll(v);
+}
+
+// count bits (set + any 0's that follow), i.e. the position of the
+// highest set bit counted from 1; 0 for a zero input.
+//
+// Three overloads, selected by sizeof(T), dispatch to the builtin for
+// unsigned int, unsigned long and unsigned long long respectively.
+template<class T>
+  inline typename std::enable_if<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned>::type cbits(T v) {
+  if (v == 0)
+    return 0;
+  // __builtin_clz() counts within a full unsigned int, so measure against
+  // that width, and look only at T's own bits so that the sign extension
+  // of a promoted narrow value does not inflate the count.
+  constexpr unsigned extra = (sizeof(unsigned) - sizeof(T)) * 8;
+  const unsigned bits = static_cast<unsigned>(v) & (~0u >> extra);
+  return (sizeof(unsigned) * 8) - __builtin_clz(bits);
+}
+
+template<class T>
+  inline typename std::enable_if<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned>::type cbits(T v) {
+  if (v == 0)
+    return 0;
+  return (sizeof(v) * 8) - __builtin_clzl(v);
+}
+
+template<class T>
+  inline typename std::enable_if<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned>::type cbits(T v) {
+  if (v == 0)
+    return 0;
+  return (sizeof(v) * 8) - __builtin_clzll(v);
+}
+
+#endif
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
new file mode 100644
index 00000000..4fb6be45
--- /dev/null
+++ b/src/include/interval_set.h
@@ -0,0 +1,783 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_INTERVAL_SET_H
+#define CEPH_INTERVAL_SET_H
+
+#include <iterator>
+#include <map>
+#include <ostream>
+
+#include "encoding.h"
+
+/*
+ * *** NOTE ***
+ *
+ * This class is written to work with a variety of map-like containers,
+ * *including* ones that invalidate iterators when they are modified (e.g.,
+ * flat_map and btree_map).
+ */
+
+template<typename T, typename Map = std::map<T,T>>
+class interval_set {
+ public:
+ using value_type = T;
+
+ class const_iterator;
+
+ class iterator : public std::iterator <std::forward_iterator_tag, T>
+ {
+ public:
+ explicit iterator(typename Map::iterator iter)
+ : _iter(iter)
+ { }
+
+ // For the copy constructor and assignment operator, the compiler-generated functions, which
+ // perform simple bitwise copying, should be fine.
+
+ bool operator==(const iterator& rhs) const {
+ return (_iter == rhs._iter);
+ }
+
+ bool operator!=(const iterator& rhs) const {
+ return (_iter != rhs._iter);
+ }
+
+ // Dereference this iterator to get a pair.
+ std::pair < T, T > &operator*() {
+ return *_iter;
+ }
+
+ // Return the interval start.
+ T get_start() const {
+ return _iter->first;
+ }
+
+ // Return the interval length.
+ T get_len() const {
+ return _iter->second;
+ }
+ T get_end() const {
+ return _iter->first + _iter->second;
+ }
+
+ // Set the interval length.
+ void set_len(T len) {
+ _iter->second = len;
+ }
+
+ // Preincrement
+ iterator &operator++()
+ {
+ ++_iter;
+ return *this;
+ }
+
+ // Postincrement
+ iterator operator++(int)
+ {
+ iterator prev(_iter);
+ ++_iter;
+ return prev;
+ }
+
+ friend class interval_set<T,Map>::const_iterator;
+
+ protected:
+ typename Map::iterator _iter;
+ friend class interval_set<T,Map>;
+ };
+
+ class const_iterator : public std::iterator <std::forward_iterator_tag, T>
+ {
+ public:
+ explicit const_iterator(typename Map::const_iterator iter)
+ : _iter(iter)
+ { }
+
+ const_iterator(const iterator &i)
+ : _iter(i._iter)
+ { }
+
+ // For the copy constructor and assignment operator, the compiler-generated functions, which
+ // perform simple bitwise copying, should be fine.
+
+ bool operator==(const const_iterator& rhs) const {
+ return (_iter == rhs._iter);
+ }
+
+ bool operator!=(const const_iterator& rhs) const {
+ return (_iter != rhs._iter);
+ }
+
+ // Dereference this iterator to get a pair.
+ std::pair < T, T > operator*() const {
+ return *_iter;
+ }
+
+ // Return the interval start.
+ T get_start() const {
+ return _iter->first;
+ }
+ T get_end() const {
+ return _iter->first + _iter->second;
+ }
+
+ // Return the interval length.
+ T get_len() const {
+ return _iter->second;
+ }
+
+ // Preincrement
+ const_iterator &operator++()
+ {
+ ++_iter;
+ return *this;
+ }
+
+ // Postincrement
+ const_iterator operator++(int)
+ {
+ const_iterator prev(_iter);
+ ++_iter;
+ return prev;
+ }
+
+ protected:
+ typename Map::const_iterator _iter;
+ };
+
+ interval_set() : _size(0) {}
+ // Take over the contents of other by swapping it into the (still empty)
+ // member map, then total up the interval lengths.
+ interval_set(Map& other) {
+   m.swap(other);
+   _size = 0;
+   for (auto it = m.begin(); it != m.end(); ++it) {
+     _size += it->second;
+   }
+ }
+
+ int num_intervals() const
+ {
+ return m.size();
+ }
+
+ typename interval_set<T,Map>::iterator begin() {
+ return typename interval_set<T,Map>::iterator(m.begin());
+ }
+
+ typename interval_set<T,Map>::iterator lower_bound(T start) {
+ return typename interval_set<T,Map>::iterator(find_inc_m(start));
+ }
+
+ typename interval_set<T,Map>::iterator end() {
+ return typename interval_set<T,Map>::iterator(m.end());
+ }
+
+ typename interval_set<T,Map>::const_iterator begin() const {
+ return typename interval_set<T,Map>::const_iterator(m.begin());
+ }
+
+ typename interval_set<T,Map>::const_iterator lower_bound(T start) const {
+ return typename interval_set<T,Map>::const_iterator(find_inc(start));
+ }
+
+ typename interval_set<T,Map>::const_iterator end() const {
+ return typename interval_set<T,Map>::const_iterator(m.end());
+ }
+
+ // helpers
+ private:
+ // Locate the interval containing start or, failing that, the first
+ // interval beginning after it (m.end() if there is none).
+ typename Map::const_iterator find_inc(T start) const {
+   typename Map::const_iterator p = m.lower_bound(start);  // p->first >= start
+   if (p == m.begin())
+     return p;                      // nothing in front that could overlap
+   if (p != m.end() && !(p->first > start))
+     return p;                      // an interval begins exactly at start
+   typename Map::const_iterator prev = p;
+   --prev;                          // might overlap?
+   if (prev->first + prev->second <= start)
+     return p;                      // it doesn't.
+   return prev;
+ }
+
+ // Mutable-iterator variant of find_inc(): the interval containing start
+ // or else the first one beginning after it (m.end() if there is none).
+ typename Map::iterator find_inc_m(T start) {
+   typename Map::iterator p = m.lower_bound(start);
+   if (p == m.begin())
+     return p;                      // nothing in front that could overlap
+   if (p != m.end() && !(p->first > start))
+     return p;                      // an interval begins exactly at start
+   typename Map::iterator prev = p;
+   --prev;                          // might overlap?
+   if (prev->first + prev->second <= start)
+     return p;                      // it doesn't.
+   return prev;
+ }
+
+ // Like find_inc(), but an interval that merely ends at start (touches
+ // it) is also accepted as a match.
+ typename Map::const_iterator find_adj(T start) const {
+   typename Map::const_iterator p = m.lower_bound(start);
+   if (p == m.begin())
+     return p;                      // nothing in front that could touch
+   if (p != m.end() && !(p->first > start))
+     return p;                      // an interval begins exactly at start
+   typename Map::const_iterator prev = p;
+   --prev;                          // might touch?
+   if (prev->first + prev->second < start)
+     return p;                      // it doesn't.
+   return prev;
+ }
+
+ // Mutable-iterator variant of find_adj(): an interval that merely ends
+ // at start (touches it) is also accepted as a match.
+ typename Map::iterator find_adj_m(T start) {
+   typename Map::iterator p = m.lower_bound(start);
+   if (p == m.begin())
+     return p;                      // nothing in front that could touch
+   if (p != m.end() && !(p->first > start))
+     return p;                      // an interval begins exactly at start
+   typename Map::iterator prev = p;
+   --prev;                          // might touch?
+   if (prev->first + prev->second < start)
+     return p;                      // it doesn't.
+   return prev;
+ }
+
+ // Add the intersection of s and l to this set.  s must not be empty.
+ // Each step looks the current offset up in l with find_inc() instead of
+ // stepping through l sequentially; s is walked in order.
+ // (A local flag that was set but never read has been dropped.)
+ void intersection_size_asym(const interval_set &s, const interval_set &l) {
+   typename decltype(m)::const_iterator ps = s.m.begin(), pl;
+   ceph_assert(ps != s.m.end());
+   T offset = ps->first;
+   typename decltype(m)::iterator mi = m.begin();  // insertion hint
+
+   while (1) {
+     // interval of l at or after the current offset
+     pl = l.find_inc(offset);
+     if (pl == l.m.end())
+       break;
+     // skip intervals of s that end before pl begins
+     while (ps != s.m.end() && ps->first + ps->second <= pl->first)
+       ++ps;
+     if (ps == s.m.end())
+       break;
+     offset = pl->first + pl->second;   // end of pl
+     if (offset <= ps->first) {
+       // pl ends before ps begins: look again from the start of ps
+       offset = ps->first;
+       continue;
+     }
+
+     // run of identical intervals: copy them over verbatim
+     if (*ps == *pl) {
+       do {
+         mi = m.insert(mi, *ps);
+         _size += ps->second;
+         ++ps;
+         ++pl;
+       } while (ps != s.m.end() && pl != l.m.end() && *ps == *pl);
+       if (ps == s.m.end())
+         break;
+       offset = ps->first;
+       continue;
+     }
+
+     // partial overlap: insert the common part of ps and pl
+     T start = std::max<T>(ps->first, pl->first);
+     T en = std::min<T>(ps->first + ps->second, offset);
+     ceph_assert(en > start);
+     typename decltype(m)::value_type i{start, en - start};
+     mi = m.insert(mi, i);
+     _size += i.second;
+     if (ps->first + ps->second <= offset) {
+       // ps is used up; continue from the next interval of s
+       ++ps;
+       if (ps == s.m.end())
+         break;
+       offset = ps->first;
+     }
+   }
+ }
+
+ bool subset_size_sym(const interval_set &b) const {
+ auto pa = m.begin(), pb = b.m.begin();
+ const auto a_end = m.end(), b_end = b.m.end();
+
+ while (pa != a_end && pb != b_end) {
+ while (pb->first + pb->second <= pa->first) {
+ ++pb;
+ if (pb == b_end)
+ return false;
+ }
+
+ if (*pa == *pb) {
+ do {
+ ++pa;
+ ++pb;
+ } while (pa != a_end && pb != b_end && *pa == *pb);
+ continue;
+ }
+
+ // interval begins before other
+ if (pa->first < pb->first)
+ return false;
+ // interval is longer than other
+ if (pa->first + pa->second > pb->first + pb->second)
+ return false;
+
+ ++pa;
+ }
+
+ return pa == a_end;
+ }
+
+ public:
+ bool operator==(const interval_set& other) const {
+ return _size == other._size && m == other.m;
+ }
+
+ int64_t size() const {
+ return _size;
+ }
+
+ void bound_encode(size_t& p) const {
+ denc_traits<Map>::bound_encode(m, p);
+ }
+ void encode(bufferlist::contiguous_appender& p) const {
+ denc(m, p);
+ }
+ void decode(bufferptr::const_iterator& p) {
+ denc(m, p);
+ _size = 0;
+ for (const auto& i : m) {
+ _size += i.second;
+ }
+ }
+ void decode(bufferlist::iterator& p) {
+ denc(m, p);
+ _size = 0;
+ for (const auto& i : m) {
+ _size += i.second;
+ }
+ }
+
+ void encode_nohead(bufferlist::contiguous_appender& p) const {
+ denc_traits<Map>::encode_nohead(m, p);
+ }
+ void decode_nohead(int n, bufferptr::const_iterator& p) {
+ denc_traits<Map>::decode_nohead(n, m, p);
+ _size = 0;
+ for (const auto& i : m) {
+ _size += i.second;
+ }
+ }
+
+ void clear() {
+ m.clear();
+ _size = 0;
+ }
+
+ // True if the value i lies inside some interval of the set.  On success
+ // the start and length of that interval are stored through pstart and
+ // plen when those pointers are non-null.
+ bool contains(T i, T *pstart=0, T *plen=0) const {
+   typename Map::const_iterator p = find_inc(i);
+   if (p == m.end() ||
+       p->first > i ||
+       p->first+p->second <= i) {
+     return false;
+   }
+   ceph_assert(p->first <= i && p->first+p->second > i);
+   if (pstart)
+     *pstart = p->first;
+   if (plen)
+     *plen = p->second;
+   return true;
+ }
+ // True if the whole range [start, start+len) lies inside one single
+ // interval of the set.
+ bool contains(T start, T len) const {
+   typename Map::const_iterator p = find_inc(start);
+   if (p == m.end() ||
+       p->first > start ||
+       p->first+p->second <= start) {
+     return false;
+   }
+   ceph_assert(p->first <= start && p->first+p->second > start);
+   return !(p->first+p->second < start+len);
+ }
+ // True if the range start~len shares at least one value with the set;
+ // decided by intersecting this set with a one-interval temporary set.
+ bool intersects(T start, T len) const {
+   interval_set probe;
+   probe.insert(start, len);
+   interval_set overlap;
+   overlap.intersection_of( *this, probe );
+   return !overlap.empty();
+ }
+
+ // outer range of set
+ bool empty() const {
+ return m.empty();
+ }
+ T range_start() const {
+ ceph_assert(!empty());
+ typename Map::const_iterator p = m.begin();
+ return p->first;
+ }
+ // End (start + length) of the last interval; the set must not be empty.
+ T range_end() const {
+   ceph_assert(!empty());
+   typename Map::const_iterator last = m.end();
+   --last;
+   return last->first + last->second;
+ }
+
+ // interval start after i (where i not in set)
+ bool starts_after(T i) const {
+   ceph_assert(!contains(i));
+   // i is not covered, so whatever find_inc() finds lies after i
+   typename Map::const_iterator next = find_inc(i);
+   return !(next == m.end());
+ }
+ T start_after(T i) const {
+ ceph_assert(!contains(i));
+ typename Map::const_iterator p = find_inc(i);
+ return p->first;
+ }
+
+ // interval end that contains start
+ T end_after(T start) const {
+ ceph_assert(contains(start));
+ typename Map::const_iterator p = find_inc(start);
+ return p->first+p->second;
+ }
+
+ void insert(T val) {
+ insert(val, 1);
+ }
+
+ void insert(T start, T len, T *pstart=0, T *plen=0) {
+ //cout << "insert " << start << "~" << len << endl;
+ ceph_assert(len > 0);
+ _size += len;
+ typename Map::iterator p = find_adj_m(start);
+ if (p == m.end()) {
+ m[start] = len; // new interval
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len;
+ } else {
+ if (p->first < start) {
+
+ if (p->first + p->second != start) {
+ //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl;
+ ceph_abort();
+ }
+
+ p->second += len; // append to end
+
+ typename Map::iterator n = p;
+ n++;
+ if (pstart)
+ *pstart = p->first;
+ if (n != m.end() &&
+ start+len == n->first) { // combine with next, too!
+ p->second += n->second;
+ if (plen)
+ *plen = p->second;
+ m.erase(n);
+ } else {
+ if (plen)
+ *plen = p->second;
+ }
+ } else {
+ if (start+len == p->first) {
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len + p->second;
+ T psecond = p->second;
+ m.erase(p);
+ m[start] = len + psecond; // append to front
+ } else {
+ ceph_assert(p->first > start+len);
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len;
+ m[start] = len; // new interval
+ }
+ }
+ }
+ }
+
+ void swap(interval_set<T,Map>& other) {
+ m.swap(other.m);
+ std::swap(_size, other._size);
+ }
+
+ void erase(iterator &i) {
+ _size -= i.get_len();
+ ceph_assert(_size >= 0);
+ m.erase(i._iter);
+ }
+
+ void erase(T val) {
+ erase(val, 1);
+ }
+
+ void erase(T start, T len,
+ std::function<bool(T, T)> claim = {}) {
+ typename Map::iterator p = find_inc_m(start);
+
+ _size -= len;
+ ceph_assert(_size >= 0);
+
+ ceph_assert(p != m.end());
+ ceph_assert(p->first <= start);
+
+ T before = start - p->first;
+ ceph_assert(p->second >= before+len);
+ T after = p->second - before - len;
+ if (before) {
+ if (claim && claim(p->first, before)) {
+ _size -= before;
+ m.erase(p);
+ } else {
+ p->second = before; // shorten bit before
+ }
+ } else {
+ m.erase(p);
+ }
+ if (after) {
+ if (claim && claim(start + len, after)) {
+ _size -= after;
+ } else {
+ m[start + len] = after;
+ }
+ }
+ }
+
+ // Erase every interval of a from this set, one erase() per interval.
+ void subtract(const interval_set &a) {
+   typename Map::const_iterator p = a.m.begin();
+   while (p != a.m.end()) {
+     erase(p->first, p->second);
+     p++;
+   }
+ }
+
+ // Insert every interval of a into this set, one insert() per interval.
+ void insert(const interval_set &a) {
+   typename Map::const_iterator p = a.m.begin();
+   while (p != a.m.end()) {
+     insert(p->first, p->second);
+     p++;
+   }
+ }
+
+
+ void intersection_of(const interval_set &a, const interval_set &b) {
+ ceph_assert(&a != this);
+ ceph_assert(&b != this);
+ clear();
+
+ const interval_set *s, *l;
+
+ if (a.size() < b.size()) {
+ s = &a;
+ l = &b;
+ } else {
+ s = &b;
+ l = &a;
+ }
+
+ if (!s->size())
+ return;
+
+ /*
+ * Use the lower_bound algorithm for larger size ratios
+ * where it performs better, but not for smaller size
+ * ratios where sequential search performs better.
+ */
+ if (l->size() / s->size() >= 10) {
+ intersection_size_asym(*s, *l);
+ return;
+ }
+
+ typename Map::const_iterator pa = a.m.begin();
+ typename Map::const_iterator pb = b.m.begin();
+ typename decltype(m)::iterator mi = m.begin();
+
+ while (pa != a.m.end() && pb != b.m.end()) {
+ // passing?
+ if (pa->first + pa->second <= pb->first)
+ { pa++; continue; }
+ if (pb->first + pb->second <= pa->first)
+ { pb++; continue; }
+
+ if (*pa == *pb) {
+ do {
+ mi = m.insert(mi, *pa);
+ _size += pa->second;
+ ++pa;
+ ++pb;
+ } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb);
+ continue;
+ }
+
+ T start = std::max(pa->first, pb->first);
+ T en = std::min(pa->first+pa->second, pb->first+pb->second);
+ ceph_assert(en > start);
+ typename decltype(m)::value_type i{start, en - start};
+ mi = m.insert(mi, i);
+ _size += i.second;
+ if (pa->first+pa->second > pb->first+pb->second)
+ pb++;
+ else
+ pa++;
+ }
+ }
+ void intersection_of(const interval_set& b) {
+ interval_set a;
+ swap(a);
+ intersection_of(a, b);
+ }
+
+ void union_of(const interval_set &a, const interval_set &b) {
+ ceph_assert(&a != this);
+ ceph_assert(&b != this);
+ clear();
+
+ //cout << "union_of" << endl;
+
+ // a
+ m = a.m;
+ _size = a._size;
+
+ // - (a*b)
+ interval_set ab;
+ ab.intersection_of(a, b);
+ subtract(ab);
+
+ // + b
+ insert(b);
+ return;
+ }
+ void union_of(const interval_set &b) {
+ interval_set a;
+ swap(a);
+ union_of(a, b);
+ }
+ void union_insert(T off, T len) {
+ interval_set a;
+ a.insert(off, len);
+ union_of(a);
+ }
+
+ // True if every interval of this set is contained in big.
+ bool subset_of(const interval_set &big) const {
+   // cheap answers first
+   if (!size())
+     return true;
+   if (size() > big.size())
+     return false;
+   if (range_end() > big.range_end())
+     return false;
+
+   /*
+    * Use the lower_bound algorithm for larger size ratios
+    * where it performs better, but not for smaller size
+    * ratios where sequential search performs better.
+    */
+   if (big.size() / size() < 10)
+     return subset_size_sym(big);
+
+   typename Map::const_iterator i = m.begin();
+   while (i != m.end()) {
+     if (!big.contains(i->first, i->second))
+       return false;
+     i++;
+   }
+   return true;
+ }
+
+ /*
+ * build a subset of @other, starting at or after @start, and including
+ * @len worth of values, skipping holes. e.g.,
+ * span_of([5~10,20~5], 8, 5) -> [8~2,20~3]
+ */
+ void span_of(const interval_set &other, T start, T len) {
+ clear();
+ typename Map::const_iterator p = other.find_inc(start);
+ if (p == other.m.end())
+ return;
+ if (p->first < start) {
+ if (p->first + p->second < start)
+ return;
+ if (p->first + p->second < start + len) {
+ T howmuch = p->second - (start - p->first);
+ insert(start, howmuch);
+ len -= howmuch;
+ p++;
+ } else {
+ insert(start, len);
+ return;
+ }
+ }
+ while (p != other.m.end() && len > 0) {
+ if (p->second < len) {
+ insert(p->first, p->second);
+ len -= p->second;
+ p++;
+ } else {
+ insert(p->first, len);
+ return;
+ }
+ }
+ }
+
+ /*
+  * Move contents of m into another Map. Use that instead of
+  * encoding interval_set into bufferlist then decoding it back into Map.
+  *
+  * Afterwards this set is empty: both the map and the cached size are
+  * reset, so size() and empty() agree with each other.
+  */
+ void move_into(Map& other) {
+   other = std::move(m);
+   // a moved-from map is only guaranteed to be in some valid state, and
+   // _size would otherwise still describe the old contents
+   clear();
+ }
+
+private:
+ // data
+ int64_t _size;
+ Map m; // map start -> len
+};
+
+// declare traits explicitly because (1) it's templatized, and (2) we
+// want to include _nohead variants.
+template<typename T, typename Map>
+struct denc_traits<interval_set<T,Map>> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = denc_traits<T,Map>::need_contiguous;
+ static void bound_encode(const interval_set<T,Map>& v, size_t& p) {
+ v.bound_encode(p);
+ }
+ static void encode(const interval_set<T,Map>& v,
+ bufferlist::contiguous_appender& p) {
+ v.encode(p);
+ }
+ static void decode(interval_set<T,Map>& v, bufferptr::const_iterator& p) {
+ v.decode(p);
+ }
+ template<typename U=T>
+ static typename std::enable_if<sizeof(U) && !need_contiguous>::type
+ decode(interval_set<T,Map>& v, bufferlist::iterator& p) {
+ v.decode(p);
+ }
+ static void encode_nohead(const interval_set<T,Map>& v,
+ bufferlist::contiguous_appender& p) {
+ v.encode_nohead(p);
+ }
+ static void decode_nohead(size_t n, interval_set<T,Map>& v,
+ bufferptr::const_iterator& p) {
+ v.decode_nohead(n, p);
+ }
+};
+
+
+// Print an interval_set as "[start~len,start~len,...]".
+template<class T, typename Map>
+inline std::ostream& operator<<(std::ostream& out, const interval_set<T,Map> &s) {
+  out << "[";
+  const char *sep = "";
+  typename interval_set<T,Map>::const_iterator i = s.begin();
+  while (i != s.end()) {
+    out << sep << i.get_start() << "~" << i.get_len();
+    sep = ",";
+    ++i;
+  }
+  out << "]";
+  return out;
+}
+
+
+#endif
diff --git a/src/include/ipaddr.h b/src/include/ipaddr.h
new file mode 100644
index 00000000..e8bed829
--- /dev/null
+++ b/src/include/ipaddr.h
@@ -0,0 +1,48 @@
+#ifndef CEPH_IPADDR_H
+#define CEPH_IPADDR_H
+
+class entity_addr_t;
+
+/*
+ * Find an IP address that is in the wanted subnet.
+ *
+ * If there are multiple matches, the first one is returned; this order
+ * is system-dependent and should not be relied on.
+ *
+ * addrs      - the struct ifaddrs entries to search
+ * net        - address of the wanted subnet
+ * prefix_len - prefix length of the wanted subnet (presumably in bits)
+ * numa_node  - defaults to -1; presumably -1 means "do not filter by
+ *              NUMA node" -- TODO confirm against the implementation
+ *
+ * NOTE(review): what is returned when nothing matches is not visible from
+ * this header (presumably a null pointer) -- confirm.
+ */
+const struct ifaddrs *find_ip_in_subnet(const struct ifaddrs *addrs,
+ const struct sockaddr *net,
+ unsigned int prefix_len,
+ int numa_node = -1);
+
+/*
+ * Validate and parse IPv4 or IPv6 network
+ *
+ * Given a network (e.g. "192.168.0.0/24") and pointers to an output
+ * address and an unsigned int:
+ *
+ * if the network string is valid, return true and populate the output
+ * address and prefix_len;
+ *
+ * if the network string is invalid, return false.
+ *
+ * Two overloads are declared; they differ only in the type of the output
+ * address: a struct sockaddr_storage or an entity_addr_t.
+ */
+bool parse_network(const char *s,
+ struct sockaddr_storage *network,
+ unsigned int *prefix_len);
+bool parse_network(const char *s,
+ entity_addr_t *network,
+ unsigned int *prefix_len);
+
+/*
+ * Netmask helpers, one per address family.
+ *
+ * NOTE(review): judging by the names and signatures only, each presumably
+ * stores in *out the address *addr with just its leading prefix_len bits
+ * kept -- confirm against the implementation.
+ */
+void netmask_ipv6(const struct in6_addr *addr,
+ unsigned int prefix_len,
+ struct in6_addr *out);
+
+void netmask_ipv4(const struct in_addr *addr,
+ unsigned int prefix_len,
+ struct in_addr *out);
+
+/*
+ * NOTE(review): presumably reports whether addr lies inside the subnet
+ * described by network/prefix_len -- confirm against the implementation.
+ *
+ * The parameters are written without an elaborated type specifier:
+ * entity_addr_t is forward-declared with the `class` key at the top of
+ * this header, so spelling it `struct entity_addr_t` here mixed class
+ * keys (-Wmismatched-tags).
+ */
+bool network_contains(
+  const entity_addr_t& network,
+  unsigned int prefix_len,
+  const entity_addr_t& addr);
+
+#endif
diff --git a/src/include/krbd.h b/src/include/krbd.h
new file mode 100644
index 00000000..977d45fe
--- /dev/null
+++ b/src/include/krbd.h
@@ -0,0 +1,97 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_KRBD_H
+#define CEPH_KRBD_H
+
+#include "rados/librados.h"
+
+/*
+ * Don't wait for udev add uevents in krbd_map() and udev remove
+ * uevents in krbd_unmap*(). Instead, make do with the respective
+ * kernel uevents and return as soon as they are received.
+ *
+ * systemd-udevd sends out udev uevents after it finishes processing
+ * the respective kernel uevents, which mostly boils down to executing
+ * all matching udev rules. With this flag set, on return from
+ * krbd_map() systemd-udevd may still be poking at the device: it
+ * may still be open with tools such as blkid and various ioctls to
+ * be run against it, none of the persistent symlinks to the device
+ * node may be there, etc. udev used to be responsible for creating
+ * the device node as well, but that has been handled by devtmpfs in
+ * the kernel for many years now, so the device node (as returned
+ * through @pdevnode) is guaranteed to be there.
+ *
+ * If set, krbd_map() and krbd_unmap*() can be invoked from any
+ * network namespace that is owned by the initial user namespace
+ * (which is a formality because things like loading kernel modules
+ * and creating block devices are not namespaced and require global
+ * privileges, i.e. capabilities in the initial user namespace).
+ * Otherwise, krbd_map() and krbd_unmap*() must be invoked from
+ * the initial network namespace.
+ *
+ * If set, krbd_unmap*() doesn't attempt to settle the udev queue
+ * before retrying unmap for the last time. Some EBUSY errors due
+ * to systemd-udevd poking at the device at the time krbd_unmap*()
+ * is invoked that are otherwise covered by the retry logic may be
+ * returned.
+ */
+#define KRBD_CTX_F_NOUDEV (1U << 0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque context type; this header only ever passes it by pointer. */
+struct krbd_ctx;
+
+/*
+ * Context lifetime.  krbd_create_from_context() presumably hands the new
+ * context back through @pctx, and @flags is presumably a mask of
+ * KRBD_CTX_F_* bits (KRBD_CTX_F_NOUDEV is the only one defined in this
+ * header) -- TODO confirm.
+ *
+ * NOTE(review): the meaning of the int return values used throughout this
+ * API is not visible here (presumably 0 / negative errno) -- confirm
+ * against the implementation.
+ */
+int krbd_create_from_context(rados_config_t cct, uint32_t flags,
+ struct krbd_ctx **pctx);
+void krbd_destroy(struct krbd_ctx *ctx);
+
+/*
+ * Map an image, or query an existing mapping.  Both calls name the image
+ * by pool, namespace, image and snapshot name.  For krbd_map() the device
+ * node is returned through @pdevnode (see the KRBD_CTX_F_NOUDEV comment
+ * above); krbd_is_mapped() takes the same kind of out parameter.
+ *
+ * NOTE(review): ownership of the string stored in *pdevnode is not
+ * visible here (presumably the caller frees it) -- confirm.
+ */
+int krbd_map(struct krbd_ctx *ctx,
+ const char *pool_name,
+ const char *nspace_name,
+ const char *image_name,
+ const char *snap_name,
+ const char *options,
+ char **pdevnode);
+int krbd_is_mapped(struct krbd_ctx *ctx,
+ const char *pool_name,
+ const char *nspace_name,
+ const char *image_name,
+ const char *snap_name,
+ char **pdevnode);
+
+/*
+ * Unmap a device, identified either by its device node or by the same
+ * pool/namespace/image/snapshot parameters that krbd_map() takes.
+ */
+int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
+ const char *options);
+int krbd_unmap_by_spec(struct krbd_ctx *ctx,
+ const char *pool_name,
+ const char *nspace_name,
+ const char *image_name,
+ const char *snap_name,
+ const char *options);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+/* Forward declaration only; the full Formatter definition is not needed here. */
+namespace ceph {
+ class Formatter;
+}
+
+/*
+ * C++-only entry point: it takes a ceph::Formatter, so it is declared
+ * under __cplusplus and outside the extern "C" block.
+ * NOTE(review): presumably reports the current mappings through f --
+ * confirm against the implementation.
+ */
+int krbd_showmapped(struct krbd_ctx *ctx, ceph::Formatter *f);
+
+#endif /* __cplusplus */
+
+#endif /* CEPH_KRBD_H */
diff --git a/src/include/linux_fiemap.h b/src/include/linux_fiemap.h
new file mode 100644
index 00000000..36046b5c
--- /dev/null
+++ b/src/include/linux_fiemap.h
@@ -0,0 +1,73 @@
+/*
+ * FS_IOC_FIEMAP ioctl infrastructure.
+ *
+ * Some portions copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Authors: Mark Fasheh <mfasheh@suse.com>
+ * Kalpak Shah <kalpak.shah@sun.com>
+ * Andreas Dilger <adilger@sun.com>
+ */
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+/*
+ * Pick the platform header for basic integer types (presumably for the
+ * __u32/__u64 names used below).  FreeBSD compilers predefine
+ * __FreeBSD__ with two trailing underscores; the previous spelling
+ * "__FreeBSD_" is never defined, so that branch could not be taken.
+ */
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+
+#include "include/int_types.h"
+
+/* One extent record; struct fiemap carries an array of these. */
+struct fiemap_extent {
+  /* logical offset in bytes for the start of the extent from the
+   * beginning of the file */
+  __u64 fe_logical;
+  /* physical offset in bytes for the start of the extent from the
+   * beginning of the disk */
+  __u64 fe_physical;
+  /* length in bytes for this extent */
+  __u64 fe_length;
+  __u64 fe_reserved64[2];
+  /* FIEMAP_EXTENT_* flags for this extent */
+  __u32 fe_flags;
+  __u32 fe_reserved[3];
+};
+
+/*
+ * Request/response header for the FIEMAP ioctl.  Fields are marked (in),
+ * (out) or (in/out) from the point of view of userspace.
+ */
+struct fiemap {
+  /* logical offset (inclusive) at which to start mapping (in) */
+  __u64 fm_start;
+  /* logical length of mapping which userspace wants (in) */
+  __u64 fm_length;
+  /* FIEMAP_FLAG_* flags for request (in/out) */
+  __u32 fm_flags;
+  /* number of extents that were mapped (out) */
+  __u32 fm_mapped_extents;
+  /* size of fm_extents array (in) */
+  __u32 fm_extent_count;
+  __u32 fm_reserved;
+  /* array of mapped extents (out) */
+  struct fiemap_extent fm_extents[0];
+};
+
+#define FIEMAP_MAX_OFFSET (~0ULL)
+
+#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
+
+#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
+ * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read
+ * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs.
+ * Sets EXTENT_NO_BYPASS. */
+#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be
+ * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but
+ * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively
+ * support extents. Result
+ * merged for efficiency. */
+#define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other
+ * files. */
+
+#endif /* _LINUX_FIEMAP_H */
diff --git a/src/include/lru.h b/src/include/lru.h
new file mode 100644
index 00000000..1e30cdfe
--- /dev/null
+++ b/src/include/lru.h
@@ -0,0 +1,243 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_LRU_H
+#define CEPH_LRU_H
+
+#include <math.h>
+#include <stdint.h>
+
+#include "common/config.h"
+#include "xlist.h"
+
+/*
+ * Base class for anything that can be tracked by an LRU.  The object
+ * remembers which LRU it is on ("lru" is null while it is on none) and
+ * carries its own pin flag.
+ */
+class LRUObject {
+public:
+  LRUObject() : lru(nullptr), lru_link(this), lru_pinned(false) {}
+  ~LRUObject();
+
+  // pin/unpin item in cache
+  void lru_pin();
+  void lru_unpin();
+  // an item can be expired only while it is not pinned
+  bool lru_is_expireable() const { return lru_pinned == false; }
+
+  friend class LRU;
+private:
+  class LRU *lru;                     // the LRU this item is on, or null
+  xlist<LRUObject *>::item lru_link;  // link into one of that LRU's lists
+  bool lru_pinned;
+};
+
+/*
+ * LRU bookkeeping built from three lists:
+ *
+ *   top     - items above the midpoint
+ *   bottom  - items below the midpoint
+ *   pintail - pinned items that lru_get_next_expire() met at the tail of
+ *             bottom/top and moved out of the way
+ *
+ * adjust() keeps the length of "top" at about
+ * midpoint * (number of items - number of pinned items).
+ *
+ * Every "move" below is written as a plain push onto the destination
+ * list; this presumably relies on xlist unlinking the item from the list
+ * it is currently on (xlist is not visible in this header).
+ */
+class LRU {
+public:
+  LRU() : num_pinned(0), midpoint(0.6) {}
+
+  // item counts: whole LRU, each list, and pinned items
+  uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); }
+  uint64_t lru_get_top() const { return top.size(); }
+  uint64_t lru_get_bot() const { return bottom.size(); }
+  uint64_t lru_get_pintail() const { return pintail.size(); }
+  uint64_t lru_get_num_pinned() const { return num_pinned; }
+
+  // set the top/bottom split; f is clamped to [0.0, 1.0]
+  void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); }
+
+  // remove every item from all three lists
+  void lru_clear() {
+    while (!top.empty()) {
+      lru_remove(top.front());
+    }
+    while (!bottom.empty()) {
+      lru_remove(bottom.front());
+    }
+    while (!pintail.empty()) {
+      lru_remove(pintail.front());
+    }
+    ceph_assert(num_pinned == 0);
+  }
+
+  // insert at top of lru
+  void lru_insert_top(LRUObject *o) {
+    ceph_assert(!o->lru);
+    o->lru = this;
+    top.push_front(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
+  }
+
+  // insert at mid point in lru
+  void lru_insert_mid(LRUObject *o) {
+    ceph_assert(!o->lru);
+    o->lru = this;
+    bottom.push_front(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
+  }
+
+  // insert at bottom of lru
+  void lru_insert_bot(LRUObject *o) {
+    ceph_assert(!o->lru);
+    o->lru = this;
+    bottom.push_back(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
+  }
+
+  // remove an item; a no-op for an item that is on no LRU.
+  // Always returns o.
+  LRUObject *lru_remove(LRUObject *o) {
+    if (!o->lru) return o;
+    auto list = o->lru_link.get_list();
+    ceph_assert(list == &top || list == &bottom || list == &pintail);
+    o->lru_link.remove_myself();
+    if (o->lru_pinned) num_pinned--;
+    o->lru = nullptr;
+    adjust();
+    return o;
+  }
+
+  // touch item -- move to head of lru (inserting it first if needed)
+  bool lru_touch(LRUObject *o) {
+    if (!o->lru) {
+      lru_insert_top(o);
+    } else {
+      ceph_assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      ceph_assert(list == &top || list == &bottom || list == &pintail);
+      top.push_front(&o->lru_link);
+      adjust();
+    }
+    return true;
+  }
+
+  // touch item -- move to midpoint (unless already higher).
+  // Returns false, leaving the item where it is, if it is already in "top".
+  bool lru_midtouch(LRUObject *o) {
+    if (!o->lru) {
+      lru_insert_mid(o);
+    } else {
+      ceph_assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      ceph_assert(list == &top || list == &bottom || list == &pintail);
+      if (list == &top) return false;
+      bottom.push_front(&o->lru_link);
+      adjust();
+    }
+    return true;
+  }
+
+  // touch item -- move to bottom
+  bool lru_bottouch(LRUObject *o) {
+    if (!o->lru) {
+      lru_insert_bot(o);
+    } else {
+      ceph_assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      ceph_assert(list == &top || list == &bottom || list == &pintail);
+      bottom.push_back(&o->lru_link);
+      adjust();
+    }
+    return true;
+  }
+
+  void lru_touch_entire_pintail() {
+    // promote entire pintail to the top lru
+    while (!pintail.empty()) {
+      top.push_back(&pintail.front()->lru_link);
+      adjust();
+    }
+  }
+
+  // expire -- find (but do not remove) the next item to expire.
+  // Pinned items met on the way are moved to the pintail.  Returns
+  // nullptr if no unpinned item is left in bottom or top.
+  LRUObject *lru_get_next_expire() {
+    adjust();
+    // look through tail of bot
+    while (!bottom.empty()) {
+      LRUObject *p = bottom.back();
+      if (!p->lru_pinned) return p;
+
+      // move to pintail
+      pintail.push_front(&p->lru_link);
+    }
+
+    // ok, try head then
+    while (!top.empty()) {
+      LRUObject *p = top.back();
+      if (!p->lru_pinned) return p;
+
+      // move to pintail
+      pintail.push_front(&p->lru_link);
+    }
+
+    // no luck!
+    return nullptr;
+  }
+
+  // expire a single item: remove and return the next candidate, or
+  // nullptr if there is none
+  LRUObject *lru_expire() {
+    LRUObject *p = lru_get_next_expire();
+    if (p)
+      return lru_remove(p);
+    return nullptr;
+  }
+
+  // currently a no-op; the debug output below is commented out
+  void lru_status() {
+    //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl;
+  }
+
+protected:
+  // adjust top/bot balance, as necessary
+  void adjust() {
+    uint64_t toplen = top.size();
+    uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned));
+    /* move items from below midpoint (bottom) to top: move midpoint forward */
+    for (uint64_t i = toplen; i < topwant; i++) {
+      top.push_back(&bottom.front()->lru_link);
+    }
+    /* or: move items from above midpoint (top) to bottom: move midpoint backwards */
+    for (uint64_t i = toplen; i > topwant; i--) {
+      bottom.push_front(&top.back()->lru_link);
+    }
+  }
+
+  uint64_t num_pinned;  // pinned items currently on this LRU
+  double midpoint;      // wanted share of unpinned items in "top"
+
+  friend class LRUObject;
+private:
+  typedef xlist<LRUObject *> LRUList;
+  LRUList top, bottom, pintail;
+};
+
+// An object that is still on an LRU takes itself off before it goes away.
+inline LRUObject::~LRUObject() {
+  if (lru != nullptr)
+    lru->lru_remove(this);
+}
+
+// Pin this item.  The LRU's pinned counter is bumped only when an item
+// that is on an LRU goes from unpinned to pinned.
+inline void LRUObject::lru_pin() {
+  const bool newly_pinned_on_lru = lru && !lru_pinned;
+  if (newly_pinned_on_lru)
+    lru->num_pinned++;
+  lru_pinned = true;
+}
+
+// Unpin this item.  If it was pinned while on an LRU, the LRU's pinned
+// counter is dropped, and an item sitting on the pintail is moved back
+// to the bottom list.
+inline void LRUObject::lru_unpin() {
+  if (lru && lru_pinned) {
+    lru->num_pinned--;
+
+    // move from pintail -> bot
+    const bool on_pintail = (lru_link.get_list() == &lru->pintail);
+    if (on_pintail)
+      lru->lru_bottouch(this);
+  }
+  lru_pinned = false;
+}
+
+#endif
diff --git a/src/include/mempool.h b/src/include/mempool.h
new file mode 100644
index 00000000..9cee3825
--- /dev/null
+++ b/src/include/mempool.h
@@ -0,0 +1,547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef _CEPH_INCLUDE_MEMPOOL_H
+#define _CEPH_INCLUDE_MEMPOOL_H
+
+#include <cstddef>
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <list>
+#include <mutex>
+#include <atomic>
+#include <typeinfo>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+
+#include <common/Formatter.h>
+#include "include/ceph_assert.h"
+#include "include/compact_map.h"
+#include "include/compact_set.h"
+
+
+/*
+
+Memory Pools
+============
+
+A memory pool is a method for accounting the consumption of memory of
+a set of containers.
+
+Memory pools are statically declared (see pool_index_t).
+
+Each memory pool tracks the number of bytes and items it contains.
+
+Allocators can be declared and associated with a type so that they are
+tracked independently of the pool total. This additional accounting
+is optional and only incurs an overhead if the debugging is enabled at
+runtime. This allows developers to see what types are consuming the
+pool resources.
+
+
+Declaring
+---------
+
+Using memory pools is very easy.
+
+To create a new memory pool, simply add a new name into the list of
+memory pools that's defined in "DEFINE_MEMORY_POOLS_HELPER". That's
+it. :)
+
+For each memory pool that's created a C++ namespace is also
+automatically created (name is same as in DEFINE_MEMORY_POOLS_HELPER).
+That namespace contains a set of common STL containers that are predefined
+with the appropriate allocators.
+
+Thus for mempool "osd" we have automatically available to us:
+
+ mempool::osd::map
+ mempool::osd::multimap
+ mempool::osd::set
+ mempool::osd::multiset
+ mempool::osd::list
+ mempool::osd::vector
+ mempool::osd::unordered_map
+
+
+Putting objects in a mempool
+----------------------------
+
+In order to use a memory pool with a particular type, a few additional
+declarations are needed.
+
+For a class:
+
+ struct Foo {
+ MEMPOOL_CLASS_HELPERS();
+ ...
+ };
+
+Then, in an appropriate .cc file,
+
+ MEMPOOL_DEFINE_OBJECT_FACTORY(Foo, foo, osd);
+
+The second argument can generally be identical to the first, except
+when the type contains a nested scope. For example, for
+BlueStore::Onode, we need to do
+
+ MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+ bluestore_meta);
+
+(This is just because we need to name some static variables and we
+can't use :: in a variable name.)
+
+XXX Note: the new operator hard-codes the allocation size to the size of the
+object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot
+incorporate mempools into a base class without also defining a helper/factory
+for the child class as well (as the base class is usually smaller than the
+child class).
+
+In order to use the STL containers, simply use the namespaced variant
+of the container type. For example,
+
+ mempool::osd::vector<int> myvec;
+
+Introspection
+-------------
+
+The simplest way to interrogate the process is with
+
+ Formatter *f = ...
+ mempool::dump(f);
+
+This will dump information about *all* memory pools. When debug mode
+is enabled, the runtime complexity of dump is O(num_shards *
+num_types). When debug mode is disabled it is O(num_shards).
+
+You can also interrogate a specific pool programmatically with
+
+ size_t bytes = mempool::unittest_2::allocated_bytes();
+ size_t items = mempool::unittest_2::allocated_items();
+
+The runtime complexity is O(num_shards).
+
+Note that you cannot easily query per-type, primarily because debug
+mode is optional and you should not rely on that information being
+available.
+
+*/
+
+namespace mempool {
+
+// --------------------------------------------------------------
+// define memory pools
+
+#define DEFINE_MEMORY_POOLS_HELPER(f) \
+ f(bloom_filter) \
+ f(bluestore_alloc) \
+ f(bluestore_cache_data) \
+ f(bluestore_cache_onode) \
+ f(bluestore_cache_meta) \
+ f(bluestore_cache_other) \
+ f(bluestore_Buffer) \
+ f(bluestore_Extent) \
+ f(bluestore_Blob) \
+ f(bluestore_SharedBlob) \
+ f(bluestore_inline_bl) \
+ f(bluestore_fsck) \
+ f(bluestore_txc) \
+ f(bluestore_writing_deferred) \
+ f(bluestore_writing) \
+ f(bluefs) \
+ f(bluefs_file_reader) \
+ f(bluefs_file_writer) \
+ f(buffer_anon) \
+ f(buffer_meta) \
+ f(osd) \
+ f(osd_mapbl) \
+ f(osd_pglog) \
+ f(osdmap) \
+ f(osdmap_mapping) \
+ f(pgmap) \
+ f(mds_co) \
+ f(unittest_1) \
+ f(unittest_2)
+
+
+// give them integer ids: P turns every pool name x handed to it by
+// DEFINE_MEMORY_POOLS_HELPER into an enumerator mempool_x, and num_pools
+// is declared after all of them.
+#define P(x) mempool_##x,
+enum pool_index_t {
+ DEFINE_MEMORY_POOLS_HELPER(P)
+ num_pools // Must be last.
+};
+#undef P
+
+extern bool debug_mode;
+extern void set_debug_mode(bool d);
+
+// --------------------------------------------------------------
+class pool_t;
+
+// we shard pool stats across many shard_t's to reduce the amount
+// of cacheline ping pong.
+enum {
+ num_shard_bits = 5
+};
+enum {
+ num_shards = 1 << num_shard_bits
+};
+
+// align shard to a cacheline: one slice of a pool's byte/item counters,
+// padded and aligned to 128 bytes (the static_assert below checks the
+// size) so that different shards do not end up in the same cacheline.
+struct shard_t {
+ std::atomic<size_t> bytes = {0};
+ std::atomic<size_t> items = {0};
+ char __padding[128 - sizeof(std::atomic<size_t>)*2];
+} __attribute__ ((aligned (128)));
+
+static_assert(sizeof(shard_t) == 128, "shard_t should be cacheline-sized");
+
+// Plain (non-atomic) item and byte totals.
+struct stats_t {
+  ssize_t items = 0;
+  ssize_t bytes = 0;
+
+  // write both totals to the formatter as integers "items" and "bytes"
+  void dump(ceph::Formatter *f) const {
+    f->dump_int("items", items);
+    f->dump_int("bytes", bytes);
+  }
+
+  // member-wise accumulation
+  stats_t& operator+=(const stats_t& o) {
+    bytes += o.bytes;
+    items += o.items;
+    return *this;
+  }
+};
+
+pool_t& get_pool(pool_index_t ix);
+const char *get_pool_name(pool_index_t ix);
+
+// Per-type accounting record.  pool_t::get_type() fills in type_name and
+// item_size when it creates the record; pool_allocator adds to and
+// subtracts from items when it has a type registered.
+struct type_t {
+ const char *type_name;
+ size_t item_size;
+ std::atomic<ssize_t> items = {0}; // signed
+};
+
+// Hash functor for std::type_info: exposes type_info::hash_code().
+struct type_info_hash {
+  std::size_t operator()(const std::type_info& ti) const {
+    const std::size_t code = ti.hash_code();
+    return code;
+  }
+};
+
+// Accounting state of one memory pool: byte/item counters spread over
+// num_shards shards, plus a per-type table guarded by a mutex.
+class pool_t {
+  shard_t shard[num_shards];
+
+  mutable std::mutex lock;  // only used for types list
+  std::unordered_map<const char *, type_t> type_map;
+
+public:
+  //
+  // How much this pool consumes. O(<num_shards>)
+  //
+  size_t allocated_bytes() const;
+  size_t allocated_items() const;
+
+  void adjust_count(ssize_t items, ssize_t bytes);
+
+  // Derive a shard index in [0, num_shards) from the calling thread's id.
+  static size_t pick_a_shard_int() {
+    // Dirt cheap, see:
+    //   https://fossies.org/dox/glibc-2.32/pthread__self_8c_source.html
+    const size_t self = (size_t)pthread_self();
+    const size_t mask = (1 << num_shard_bits) - 1;
+    return (self >> 12) & mask;
+  }
+
+  shard_t* pick_a_shard() {
+    return &shard[pick_a_shard_int()];
+  }
+
+  // Look up the record keyed by ti.name(), creating it (with the given
+  // item size) on first use.  The returned pointer refers to the entry
+  // stored in type_map.
+  type_t *get_type(const std::type_info& ti, size_t size) {
+    std::lock_guard<std::mutex> guard(lock);
+    const char *key = ti.name();
+    auto found = type_map.find(key);
+    if (found == type_map.end()) {
+      type_t &fresh = type_map[key];
+      fresh.type_name = key;
+      fresh.item_size = size;
+      return &fresh;
+    }
+    return &found->second;
+  }
+
+  // get pool stats.  by_type is not populated if !debug
+  void get_stats(stats_t *total,
+                 std::map<std::string, stats_t> *by_type) const;
+
+  void dump(ceph::Formatter *f, stats_t *ptotal=0) const;
+};
+
+void dump(ceph::Formatter *f);
+
+
+// STL allocator for use with containers.  An allocator instance holds only
+// a pointer to its pool (looked up from pool_ix in the constructor) and,
+// optionally, to a per-type record, so containers never need to be handed
+// an allocator explicitly.
+//
+// Memory itself comes from new char[] / posix_memalign; this class keeps
+// the pool's counters in step.  Counters are updated only after an
+// allocation has succeeded, so a failed allocation leaves them untouched.
+
+template<pool_index_t pool_ix, typename T>
+class pool_allocator {
+  pool_t *pool;
+  type_t *type = nullptr;
+
+  // n * sizeof(T); throws std::bad_alloc if the product would not fit in
+  // size_t (an overflowed size would yield an undersized buffer).
+  static size_t byte_count(size_t n) {
+    if (n > static_cast<size_t>(-1) / sizeof(T))
+      throw std::bad_alloc();
+    return sizeof(T) * n;
+  }
+
+  // Charge n items / total bytes to the calling thread's shard (and to the
+  // per-type record, if one is registered).
+  void note_allocated(size_t n, size_t total) {
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes += total;
+    shard->items += n;
+    if (type) {
+      type->items += n;
+    }
+  }
+
+  // Inverse of note_allocated().  The shard is again chosen from the
+  // calling thread, so it need not be the shard that was charged.
+  void note_released(size_t n, size_t total) {
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes -= total;
+    shard->items -= n;
+    if (type) {
+      type->items -= n;
+    }
+  }
+
+public:
+  typedef pool_allocator<pool_ix, T> allocator_type;
+  typedef T value_type;
+  typedef value_type *pointer;
+  typedef const value_type * const_pointer;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+
+  template<typename U> struct rebind {
+    typedef pool_allocator<pool_ix,U> other;
+  };
+
+  // Bind to the pool; register the per-type record only in debug mode or
+  // when explicitly requested.
+  void init(bool force_register) {
+    pool = &get_pool(pool_ix);
+    if (debug_mode || force_register) {
+      type = pool->get_type(typeid(T), sizeof(T));
+    }
+  }
+
+  pool_allocator(bool force_register=false) {
+    init(force_register);
+  }
+  template<typename U>
+  pool_allocator(const pool_allocator<pool_ix,U>&) {
+    init(false);
+  }
+
+  // Allocate storage for n objects of type T.  The hint p is ignored.
+  // Throws std::bad_alloc on failure.
+  T* allocate(size_t n, void *p = nullptr) {
+    const size_t total = byte_count(n);
+    T* r = reinterpret_cast<T*>(new char[total]);  // may throw
+    note_allocated(n, total);
+    return r;
+  }
+
+  // Release storage obtained from allocate(n).
+  void deallocate(T* p, size_t n) {
+    note_released(n, sizeof(T) * n);
+    delete[] reinterpret_cast<char*>(p);
+  }
+
+  // Like allocate(), but with the given alignment (posix_memalign).
+  // Throws std::bad_alloc on failure.  Pair with deallocate_aligned().
+  T* allocate_aligned(size_t n, size_t align, void *p = nullptr) {
+    const size_t total = byte_count(n);
+    void *ptr = nullptr;
+    int rc = ::posix_memalign(&ptr, align, total);
+    if (rc)
+      throw std::bad_alloc();
+    note_allocated(n, total);
+    return reinterpret_cast<T*>(ptr);
+  }
+
+  // Release storage obtained from allocate_aligned(n, ...).
+  void deallocate_aligned(T* p, size_t n) {
+    note_released(n, sizeof(T) * n);
+    ::free(p);
+  }
+
+  void destroy(T* p) {
+    p->~T();
+  }
+
+  template<class U>
+  void destroy(U *p) {
+    p->~U();
+  }
+
+  void construct(T* p, const T& val) {
+    ::new ((void *)p) T(val);
+  }
+
+  template<class U, class... Args> void construct(U* p,Args&&... args) {
+    ::new((void *)p) U(std::forward<Args>(args)...);
+  }
+
+  // all instances compare equal
+  bool operator==(const pool_allocator&) const { return true; }
+  bool operator!=(const pool_allocator&) const { return false; }
+};
+
+
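+// Per-pool namespaces: for every pool x, define namespace mempool::x with
+// the pool's index ("id"), aliases of the common containers bound to that
+// pool's allocator, and allocated_bytes()/allocated_items() accessors.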
+
+#define P(x) \
+ namespace x { \
+ static const mempool::pool_index_t id = mempool::mempool_##x; \
+ template<typename v> \
+ using pool_allocator = mempool::pool_allocator<id,v>; \
+ \
+ using string = std::basic_string<char,std::char_traits<char>, \
+ pool_allocator<char>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using map = std::map<k, v, cmp, \
+ pool_allocator<std::pair<const k,v>>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using compact_map = compact_map<k, v, cmp, \
+ pool_allocator<std::pair<const k,v>>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using compact_multimap = compact_multimap<k, v, cmp, \
+ pool_allocator<std::pair<const k,v>>>; \
+ \
+ template<typename k, typename cmp = std::less<k> > \
+ using compact_set = compact_set<k, cmp, pool_allocator<k>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using multimap = std::multimap<k,v,cmp, \
+ pool_allocator<std::pair<const k, \
+ v>>>; \
+ \
+ template<typename k, typename cmp = std::less<k> > \
+ using set = std::set<k,cmp,pool_allocator<k>>; \
+ \
+ template<typename k, typename cmp = std::less<k> > \
+ using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \
+ \
+ template<typename k, typename v, typename cmp = std::less<k> > \
+ using flat_map = boost::container::flat_map<k,v,cmp, \
+ pool_allocator<std::pair<k,v>>>; \
+ \
+ template<typename v> \
+ using list = std::list<v,pool_allocator<v>>; \
+ \
+ template<typename v> \
+ using vector = std::vector<v,pool_allocator<v>>; \
+ \
+ template<typename k, typename v, \
+ typename h=std::hash<k>, \
+ typename eq = std::equal_to<k>> \
+ using unordered_map = \
+ std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\
+ \
+ inline size_t allocated_bytes() { \
+ return mempool::get_pool(id).allocated_bytes(); \
+ } \
+ inline size_t allocated_items() { \
+ return mempool::get_pool(id).allocated_items(); \
+ } \
+ };
+
+DEFINE_MEMORY_POOLS_HELPER(P)
+
+#undef P
+
+};
+
+// the elements allocated by mempool are in the same memory space as the
+// ones allocated by the default allocator, so compare them in an efficient
+// way: libstdc++'s std::equal is specialized to use memcmp if T is an
+// integer or pointer type. this is good enough for our use case. use
+// std::is_trivially_copyable<T> to extend the support to more types if
+// necessary.
+// std::vector (default allocator) == mempool vector: equal when the sizes
+// match and the elements compare equal pairwise.
+template<typename T, mempool::pool_index_t pool_index>
+bool operator==(const std::vector<T, std::allocator<T>>& lhs,
+		const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
+{
+  if (lhs.size() != rhs.size())
+    return false;
+  return std::equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+// negation of the operator== above
+template<typename T, mempool::pool_index_t pool_index>
+bool operator!=(const std::vector<T, std::allocator<T>>& lhs,
+		const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
+{
+  const bool same = (lhs == rhs);
+  return !same;
+}
+
+// mempool vector == std::vector: forwards to the first operator== with
+// the operands swapped.
+template<typename T, mempool::pool_index_t pool_index>
+bool operator==(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
+		const std::vector<T, std::allocator<T>>& rhs)
+{
+  const bool same = (rhs == lhs);
+  return same;
+}
+
+// negation of the operator== directly above
+template<typename T, mempool::pool_index_t pool_index>
+bool operator!=(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
+		const std::vector<T, std::allocator<T>>& rhs)
+{
+  const bool same = (lhs == rhs);
+  return !same;
+}
+
+// Use this for any type that is contained by a container (unless it
+// is a class you defined; see below).
+#define MEMPOOL_DECLARE_FACTORY(obj, factoryname, pool) \
+ namespace mempool { \
+ namespace pool { \
+ extern pool_allocator<obj> alloc_##factoryname; \
+ } \
+ }
+
+#define MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \
+ namespace mempool { \
+ namespace pool { \
+ pool_allocator<obj> alloc_##factoryname = {true}; \
+ } \
+ }
+
+// Use this for each class that belongs to a mempool. For example,
+//
+// class T {
+// MEMPOOL_CLASS_HELPERS();
+// ...
+// };
+//
+#define MEMPOOL_CLASS_HELPERS() \
+ void *operator new(size_t size); \
+ void *operator new[](size_t size) noexcept { \
+ ceph_abort_msg("no array new"); \
+ return nullptr; } \
+ void operator delete(void *); \
+ void operator delete[](void *) { ceph_abort_msg("no array delete"); }
+
+
+// Use this in some particular .cc file to match each class with a
+// MEMPOOL_CLASS_HELPERS().
+//
+// operator new always allocates exactly one `obj` from the pool, whatever
+// size was requested.  A class derived from `obj` that inherits this
+// operator new and is larger than `obj` would be handed a block that is
+// too small, so the requested size is asserted to equal sizeof(obj).
+#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool)		\
+  MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool)			\
+  void *obj::operator new(size_t size) {				\
+    ceph_assert(size == sizeof(obj));					\
+    return mempool::pool::alloc_##factoryname.allocate(1);		\
+  }									\
+  void obj::operator delete(void *p)  {					\
+    return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1);	\
+  }
+
+#endif
diff --git a/src/include/msgr.h b/src/include/msgr.h
new file mode 100644
index 00000000..f7b2a078
--- /dev/null
+++ b/src/include/msgr.h
@@ -0,0 +1,254 @@
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+#ifndef __KERNEL__
+#include <sys/socket.h> // for struct sockaddr_storage
+#endif
+
+#include "include/int_types.h"
+
+/* See comment in ceph_fs.h. */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT_LEGACY 6789 /* legacy default monitor port */
+#define CEPH_MON_PORT_IANA 3300 /* IANA monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST 6789
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+
+
+/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2_PREFIX "ceph v2\n"
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+
+/*
+ * Define two constants for msgr2 feature `name`:
+ *   CEPH_MSGR2_FEATURE_<name>     - the single feature bit (1 << bit)
+ *   CEPH_MSGR2_FEATUREMASK_<name> - that bit OR'ed with
+ *                                   CEPH_MSGR2_INCARNATION_<incarnation>
+ * `bit` is parenthesized so that an expression passed for it is shifted
+ * as a whole.
+ */
+#define DEFINE_MSGR2_FEATURE(bit, incarnation, name)               \
+  const static uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << (bit)); \
+  const static uint64_t CEPH_MSGR2_FEATUREMASK_##name =            \
+    ((1ULL << (bit)) | CEPH_MSGR2_INCARNATION_##incarnation);
+
+#define HAVE_MSGR2_FEATURE(x, name) \
+ (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name))
+
+DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1
+
+#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+#define CEPH_MSGR2_REQUIRED_FEATURES (0ull)
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns a negative value, zero, or a positive value (the
+ * signed difference a - b), not just -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+/*
+ * Compare two 32-bit sequence numbers that may have wrapped around.
+ * Returns the difference a - b as a signed 32-bit value: negative, zero
+ * or positive.
+ *
+ * The subtraction is done on the unsigned operands, where wrap-around is
+ * well defined, and only the result is converted to signed.  Subtracting
+ * the two values after converting each to signed can overflow, which is
+ * undefined behaviour.
+ */
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+  return (__s32)(a - b);
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.  Stored as a type plus a number.
+ */
+struct ceph_entity_name {
+  /* CEPH_ENTITY_TYPE_* */
+  __u8 type;
+  __le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON 0x01
+#define CEPH_ENTITY_TYPE_MDS 0x02
+#define CEPH_ENTITY_TYPE_OSD 0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_MGR 0x10
+#define CEPH_ENTITY_TYPE_AUTH 0x20
+
+#define CEPH_ENTITY_TYPE_ANY 0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+  __le32 type;
+  /* unique id for process (e.g. pid) */
+  __le32 nonce;
+  struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+/* An entity name together with the network address it is reached at. */
+struct ceph_entity_inst {
+  struct ceph_entity_name name;
+  struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 7 /* message */
+#define CEPH_MSGR_TAG_ACK 8 /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
+#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2 14
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive reply */
+#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* ceph v2 doing server challenge */
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+  /* supported feature bits */
+  __le64 features;
+  /* CEPH_ENTITY_TYPE_* */
+  __le32 host_type;
+  /* count connections initiated by this host */
+  __le32 global_seq;
+  /* count connections initiated in this session */
+  __le32 connect_seq;
+  __le32 protocol_version;
+  __le32 authorizer_protocol;
+  __le32 authorizer_len;
+  /* CEPH_MSG_CONNECT_* */
+  __u8 flags;
+} __attribute__ ((packed));
+
+/* Reply to a struct ceph_msg_connect. */
+struct ceph_msg_connect_reply {
+  __u8 tag;
+  /* feature bits for this session */
+  __le64 features;
+  __le32 global_seq;
+  __le32 connect_seq;
+  __le32 protocol_version;
+  __le32 authorizer_len;
+  __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old {
+  /* message seq# for this session */
+  __le64 seq;
+  /* transaction id */
+  __le64 tid;
+  /* message type */
+  __le16 type;
+  /* priority.  higher value == higher priority */
+  __le16 priority;
+  /* version of message encoding */
+  __le16 version;
+
+  /* bytes in main payload */
+  __le32 front_len;
+  /* bytes in middle payload */
+  __le32 middle_len;
+  /* bytes of data payload */
+  __le32 data_len;
+  /* sender: include full offset;
+     receiver: mask against ~PAGE_MASK */
+  __le16 data_off;
+
+  struct ceph_entity_inst src;
+  struct ceph_entity_inst orig_src;
+  __le32 reserved;
+  /* header crc32c */
+  __le32 crc;
+} __attribute__ ((packed));
+
+/* Message header carrying an entity name as source and a compat version. */
+struct ceph_msg_header {
+  /* message seq# for this session */
+  __le64 seq;
+  /* transaction id */
+  __le64 tid;
+  /* message type */
+  __le16 type;
+  /* priority.  higher value == higher priority */
+  __le16 priority;
+  /* version of message encoding */
+  __le16 version;
+
+  /* bytes in main payload */
+  __le32 front_len;
+  /* bytes in middle payload */
+  __le32 middle_len;
+  /* bytes of data payload */
+  __le32 data_len;
+  /* sender: include full offset;
+     receiver: mask against ~PAGE_MASK */
+  __le16 data_off;
+
+  struct ceph_entity_name src;
+
+  /* oldest code we think can decode this.  unknown if zero. */
+  __le16 compat_version;
+  __le16 reserved;
+  /* header crc32c */
+  __le32 crc;
+} __attribute__ ((packed));
+
+/* Message header variant without payload lengths, source or crc fields. */
+struct ceph_msg_header2 {
+  /* message seq# for this session */
+  __le64 seq;
+  /* transaction id */
+  __le64 tid;
+  /* message type */
+  __le16 type;
+  /* priority.  higher value == higher priority */
+  __le16 priority;
+  /* version of message encoding */
+  __le16 version;
+
+  __le32 data_pre_padding_len;
+  /* sender: include full offset;
+     receiver: mask against ~PAGE_MASK */
+  __le16 data_off;
+
+  __le64 ack_seq;
+  __u8 flags;
+  /* oldest code we think can decode this.  unknown if zero. */
+  __le16 compat_version;
+  __le16 reserved;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ * ceph_msg_footer_old does not support digital signatures on messages PLR
+ */
+
+struct ceph_msg_footer_old {
+  __le32 front_crc;
+  __le32 middle_crc;
+  __le32 data_crc;
+  __u8 flags;
+} __attribute__ ((packed));
+
+/* Footer with an additional 64-bit signature field. */
+struct ceph_msg_footer {
+  __le32 front_crc;
+  __le32 middle_crc;
+  __le32 data_crc;
+  // sig holds the 64 bits of the digital signature for the message PLR
+  __le64 sig;
+  __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/object.h b/src/include/object.h
new file mode 100644
index 00000000..99ca58f9
--- /dev/null
+++ b/src/include/object.h
@@ -0,0 +1,214 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECT_H
+#define CEPH_OBJECT_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <iosfwd>
+#include <iomanip>
+
+#include "include/rados.h"
+#include "include/unordered_map.h"
+
+#include "hash.h"
+#include "encoding.h"
+#include "ceph_hash.h"
+#include "cmp.h"
+
+using namespace std;
+
+/*
+ * object_t: an object name, held as a single string.
+ *
+ * Both converting constructors are left implicit (the cppcheck suppressions
+ * silence the corresponding warning), so a C string or a string can be
+ * passed wherever an object_t is expected. The const char* form hands the
+ * pointer straight to the string constructor, so it must not be null.
+ */
+struct object_t {
+ string name;
+
+ object_t() {}
+ // cppcheck-suppress noExplicitConstructor
+ object_t(const char *s) : name(s) {}
+ // cppcheck-suppress noExplicitConstructor
+ object_t(const string& s) : name(s) {}
+
+ // exchange names with o
+ void swap(object_t& o) {
+ name.swap(o.name);
+ }
+ // reset to the empty name
+ void clear() {
+ name.clear();
+ }
+
+ // the encoded form consists of the name string only
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(name, bl);
+ }
+ // counterpart of encode(): reads the name back from bl
+ void decode(bufferlist::const_iterator &bl) {
+ using ceph::decode;
+ decode(name, bl);
+ }
+};
+WRITE_CLASS_ENCODER(object_t)
+
+// Comparisons and streaming for object_t: an object_t compares, orders and
+// prints exactly like its name string.
+inline bool operator==(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) == 0;
+}
+inline bool operator!=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) != 0;
+}
+inline bool operator>(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) > 0;
+}
+inline bool operator<(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) < 0;
+}
+inline bool operator>=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) >= 0;
+}
+inline bool operator<=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) <= 0;
+}
+inline ostream& operator<<(ostream& out, const object_t& o) {
+  out << o.name;
+  return out;
+}
+
+namespace std {
+  // Hash an object_t by feeding the bytes of its name to ceph_str_hash_linux.
+  template<> struct hash<object_t> {
+    size_t operator()(const object_t& r) const {
+      //static hash<string> H;
+      //return H(r.name);
+      const string& key = r.name;
+      return ceph_str_hash_linux(key.c_str(), key.length());
+    }
+  };
+} // namespace std
+
+
+/*
+ * file_object_t: an (ino, bno) pair that renders itself as the string
+ * "<ino in hex>.<bno in hex, zero-padded to at least 8 digits>".
+ *
+ * The string is built lazily by c_str() and cached in buf; buf[0] == 0
+ * means "not rendered yet". The cache is never invalidated, so changing
+ * ino or bno after the first c_str() call is not reflected in the name.
+ * buf holds the worst case exactly: 16 hex digits, '.', 16 hex digits, NUL.
+ */
+struct file_object_t {
+  uint64_t ino, bno;
+  mutable char buf[34];
+
+  file_object_t(uint64_t i=0, uint64_t b=0) : ino(i), bno(b) {
+    buf[0] = 0;  // cache starts out empty
+  }
+
+  // Return the cached name, formatting it on first use. The returned
+  // pointer refers to storage inside this object.
+  const char *c_str() const {
+    if (!buf[0])
+      snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)bno);
+    return buf;
+  }
+
+  // Convert to an object_t named by c_str(). Const-qualified so that const
+  // file_object_t values can convert as well; c_str() is already const.
+  operator object_t() const {
+    return object_t(c_str());
+  }
+};
+
+
+// ---------------------------
+// snaps
+
+/*
+ * snapid_t: thin wrapper around a 64-bit id value. It converts implicitly
+ * from and to uint64_t. Note that += and prefix ++ return a copy of the
+ * updated object, not a reference to it.
+ */
+struct snapid_t {
+  uint64_t val;
+  // cppcheck-suppress noExplicitConstructor
+  snapid_t(uint64_t v=0) : val(v) {}
+  snapid_t operator+=(snapid_t o) {
+    val = val + o.val;
+    return *this;
+  }
+  snapid_t operator++() {
+    val += 1;
+    return *this;
+  }
+  operator uint64_t() const {
+    return val;
+  }
+};
+
+// A snapid_t is encoded and decoded as its raw val member; both helpers
+// simply forward to the encode/decode overloads found for that integer.
+inline void encode(snapid_t i, bufferlist &bl) { encode(i.val, bl); }
+inline void decode(snapid_t &i, bufferlist::const_iterator &p) { decode(i.val, p); }
+
+// denc_traits specialization for snapid_t. It reports supported, bounded and
+// need_contiguous as true and featured as false, and all three operations
+// forward to denc() on the wrapped val.
+template<>
+struct denc_traits<snapid_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = true;
+ // size accounting: whatever denc() adds to p for val
+ static void bound_encode(const snapid_t& o, size_t& p) {
+ denc(o.val, p);
+ }
+ // append val through the contiguous appender
+ static void encode(const snapid_t &o, buffer::list::contiguous_appender& p) {
+ denc(o.val, p);
+ }
+ // read val back from the buffer iterator
+ static void decode(snapid_t& o, buffer::ptr::const_iterator &p) {
+ denc(o.val, p);
+ }
+};
+
+// Print a snapid_t: the two reserved ids get symbolic names ("head" for
+// CEPH_NOSNAP, "snapdir" for CEPH_SNAPDIR); anything else is written in
+// hex, after which the stream is switched back to decimal.
+inline ostream& operator<<(ostream& out, const snapid_t& s) {
+  if (s == CEPH_NOSNAP) {
+    out << "head";
+    return out;
+  }
+  if (s == CEPH_SNAPDIR) {
+    out << "snapdir";
+    return out;
+  }
+  out << hex << s.val << dec;
+  return out;
+}
+
+
+/*
+ * sobject_t: an object name (oid) paired with a snapid_t (snap).
+ * The default constructor yields an empty name with snap 0.
+ */
+struct sobject_t {
+ object_t oid;
+ snapid_t snap;
+
+ sobject_t() : snap(0) {}
+ sobject_t(object_t o, snapid_t s) : oid(o), snap(s) {}
+
+ // exchange both members with o
+ void swap(sobject_t& o) {
+ oid.swap(o.oid);
+ snapid_t t = snap;
+ snap = o.snap;
+ o.snap = t;
+ }
+
+ // the encoded form is oid followed by snap
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(oid, bl);
+ encode(snap, bl);
+ }
+ // reads the members back in the order encode() wrote them
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(oid, bl);
+ decode(snap, bl);
+ }
+};
+WRITE_CLASS_ENCODER(sobject_t)
+
+// Comparisons and streaming for sobject_t. Ordering is lexicographic:
+// oid is compared first and snap only breaks ties between equal oids.
+inline bool operator==(const sobject_t &l, const sobject_t &r) {
+  if (!(l.oid == r.oid))
+    return false;
+  return l.snap == r.snap;
+}
+inline bool operator!=(const sobject_t &l, const sobject_t &r) {
+  if (l.oid != r.oid)
+    return true;
+  return l.snap != r.snap;
+}
+inline bool operator>(const sobject_t &l, const sobject_t &r) {
+  if (l.oid > r.oid)
+    return true;
+  if (!(l.oid == r.oid))
+    return false;
+  return l.snap > r.snap;
+}
+inline bool operator<(const sobject_t &l, const sobject_t &r) {
+  if (l.oid < r.oid)
+    return true;
+  if (!(l.oid == r.oid))
+    return false;
+  return l.snap < r.snap;
+}
+inline bool operator>=(const sobject_t &l, const sobject_t &r) {
+  if (l.oid > r.oid)
+    return true;
+  if (!(l.oid == r.oid))
+    return false;
+  return l.snap >= r.snap;
+}
+inline bool operator<=(const sobject_t &l, const sobject_t &r) {
+  if (l.oid < r.oid)
+    return true;
+  if (!(l.oid == r.oid))
+    return false;
+  return l.snap <= r.snap;
+}
+// Printed as "<oid>/<snap>".
+inline ostream& operator<<(ostream& out, const sobject_t &o) {
+  out << o.oid;
+  out << "/";
+  out << o.snap;
+  return out;
+}
+namespace std {
+  // Hash an sobject_t by XOR-ing the hash of its oid with the rjhash of
+  // its snap value.
+  template<> struct hash<sobject_t> {
+    size_t operator()(const sobject_t &r) const {
+      static hash<object_t> oid_hasher;
+      static rjhash<uint64_t> snap_hasher;
+      return oid_hasher(r.oid) ^ snap_hasher(r.snap);
+    }
+  };
+} // namespace std
+
+#endif
diff --git a/src/include/on_exit.h b/src/include/on_exit.h
new file mode 100644
index 00000000..c412ab33
--- /dev/null
+++ b/src/include/on_exit.h
@@ -0,0 +1,49 @@
+#ifndef CEPH_ON_EXIT_H
+#define CEPH_ON_EXIT_H
+
+#include <pthread.h>
+#include <vector>
+
+#include "include/ceph_assert.h"
+/*
+ * Create a static instance at the file level to get callbacks called when the
+ * process exits via main() or exit().
+ */
+
+/*
+ * OnExitManager: collects (function, argument) pairs and invokes them, in
+ * registration order, from its destructor.
+ *
+ * The callbacks run while the internal mutex is held, and that mutex is
+ * created with default attributes, so a callback must not call
+ * add_callback() on the same manager.
+ */
+class OnExitManager {
+ public:
+  typedef void (*callback_t)(void *arg);
+
+  OnExitManager() {
+    int ret = pthread_mutex_init(&lock_, NULL);
+    ceph_assert(ret == 0);
+  }
+
+  // The manager owns a raw pthread_mutex_t; a copy of an initialized mutex
+  // is not a usable mutex, so copying is disabled.
+  OnExitManager(const OnExitManager&) = delete;
+  OnExitManager& operator=(const OnExitManager&) = delete;
+
+  // Runs every registered callback, then releases the mutex's resources.
+  ~OnExitManager() {
+    pthread_mutex_lock(&lock_);
+    std::vector<struct cb>::iterator it;
+    for (it = funcs_.begin(); it != funcs_.end(); ++it) {
+      it->func(it->arg);
+    }
+    funcs_.clear();
+    pthread_mutex_unlock(&lock_);
+    // pair the pthread_mutex_init() done in the constructor
+    pthread_mutex_destroy(&lock_);
+  }
+
+  // Register func to be called with arg when this manager is destroyed.
+  void add_callback(callback_t func, void *arg) {
+    pthread_mutex_lock(&lock_);
+    struct cb callback = { func, arg };
+    funcs_.push_back(callback);
+    pthread_mutex_unlock(&lock_);
+  }
+
+ private:
+  // one registered callback and the argument to pass to it
+  struct cb {
+    callback_t func;
+    void *arg;
+  };
+
+  std::vector<struct cb> funcs_;
+  pthread_mutex_t lock_;  // guards funcs_
+};
+
+#endif
diff --git a/src/include/page.h b/src/include/page.h
new file mode 100644
index 00000000..db6e2058
--- /dev/null
+++ b/src/include/page.h
@@ -0,0 +1,18 @@
+#ifndef CEPH_PAGE_H
+#define CEPH_PAGE_H
+
+// Declarations only: the three globals are defined elsewhere (see the note
+// inside the namespace).
+namespace ceph {
+ // these are in common/page.cc
+ extern unsigned _page_size;
+ extern unsigned long _page_mask;
+ extern unsigned _page_shift;
+}
+
+#endif
+
+
+// Macro spellings for the three globals above. These sit after the include
+// guard's #endif, so they are processed on every inclusion of this header,
+// not only the first one.
+#define CEPH_PAGE_SIZE ceph::_page_size
+#define CEPH_PAGE_MASK ceph::_page_mask
+#define CEPH_PAGE_SHIFT ceph::_page_shift
+
+
diff --git a/src/include/rados.h b/src/include/rados.h
new file mode 100644
index 00000000..bbcf0867
--- /dev/null
+++ b/src/include/rados.h
@@ -0,0 +1,681 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include <string.h>
+#include <stdbool.h>
+#include "msgr.h"
+
+/* See comment in ceph_fs.h. */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+/*
+ * Byte-wise comparison of two fsids with memcmp() semantics: the result is
+ * negative, zero or positive, and zero means the 16 bytes are identical.
+ */
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{
+	return memcmp(a->fsid, b->fsid, sizeof(a->fsid));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg pool types
+ *
+ * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values. They are
+ * duplicated here only for CrushCompiler's benefit.
+ */
+#define CEPH_PG_TYPE_REPLICATED 1
+/* #define CEPH_PG_TYPE_RAID4 2 never implemented */
+#define CEPH_PG_TYPE_ERASURE 3
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	const int masked = x & bmask;
+
+	/* a result at or beyond b is out of range: fall back to the next smaller mask */
+	return masked < b ? masked : (x & (bmask >> 1));
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS (1<<0)
+#define CEPH_OSD_UP (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
+#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
+#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */
+#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */
+#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */
+#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */
+#define CEPH_OSD_NOUP (1<<8) /* osd can not be marked up */
+#define CEPH_OSD_NODOWN (1<<9) /* osd can not be marked down */
+#define CEPH_OSD_NOIN (1<<10) /* osd can not be marked in */
+#define CEPH_OSD_NOOUT (1<<11) /* osd can not be marked out */
+
+extern const char *ceph_osd_state_name(int s);
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
+#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */
+#define CEPH_OSDMAP_NOSNAPTRIM (1<<21) /* disable snap trimming */
+#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */
+
+/* these are hidden in 'ceph status' view */
+#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS | \
+ CEPH_OSDMAP_RECOVERY_DELETES | \
+ CEPH_OSDMAP_SORTBITWISE | \
+ CEPH_OSDMAP_PURGED_SNAPDIRS | \
+ CEPH_OSDMAP_PGLOG_HARDLIMIT)
+#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS)
+
+/*
+ * major ceph release numbers
+ */
+#define CEPH_RELEASE_ARGONAUT 1
+#define CEPH_RELEASE_BOBTAIL 2
+#define CEPH_RELEASE_CUTTLEFISH 3
+#define CEPH_RELEASE_DUMPLING 4
+#define CEPH_RELEASE_EMPEROR 5
+#define CEPH_RELEASE_FIREFLY 6
+#define CEPH_RELEASE_GIANT 7
+#define CEPH_RELEASE_HAMMER 8
+#define CEPH_RELEASE_INFERNALIS 9
+#define CEPH_RELEASE_JEWEL 10
+#define CEPH_RELEASE_KRAKEN 11
+#define CEPH_RELEASE_LUMINOUS 12
+#define CEPH_RELEASE_MIMIC 13
+#define CEPH_RELEASE_NAUTILUS 14
+#define CEPH_RELEASE_MAX 15 /* highest + 1 */
+
+extern const char *ceph_release_name(int r);
+extern int ceph_release_from_name(const char *s);
+extern uint64_t ceph_release_features(int r);
+extern int ceph_release_from_features(uint64_t features);
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
+
+/*
+ * osd ops
+ *
+ * WARNING: do not use these op codes directly. Use the helpers
+ * defined below instead. In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+// LEAVE UNUSED 0x0600 used to be multiobject ops
+
+#define __CEPH_OSD_OP1(mode, nr) \
+ (CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+ (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f) \
+ /** data **/ \
+ /* read */ \
+ f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \
+ f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \
+ f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \
+ f(CHECKSUM, __CEPH_OSD_OP(RD, DATA, 31), "checksum") \
+ \
+ /* fancy read */ \
+ f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \
+ f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \
+ \
+ f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \
+ f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \
+ \
+ /* versioning */ \
+ f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \
+ \
+ f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \
+ \
+ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \
+ \
+ /* sync */ \
+ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \
+ \
+ /* write */ \
+ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \
+ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \
+ f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \
+ f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \
+ f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \
+ \
+ /* fancy write */ \
+ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
+ f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \
+ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
+ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
+ \
+ f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \
+ f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \
+ f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \
+ \
+ f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \
+ f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \
+ \
+ f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \
+ \
+ /* omap */ \
+ f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \
+ f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \
+ f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \
+ f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+ f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \
+ f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \
+ f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \
+ f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \
+ f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \
+ \
+ /* tiering */ \
+ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
+ /* was copy-get-classic */ \
+ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
+ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
+ f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \
+ f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \
+ f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \
+ f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+ \
+ /* convert tmap to omap */ \
+ f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \
+ \
+ /* hints */ \
+ f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \
+ \
+ /* cache pin/unpin */ \
+ f(CACHE_PIN, __CEPH_OSD_OP(WR, DATA, 36), "cache-pin") \
+ f(CACHE_UNPIN, __CEPH_OSD_OP(WR, DATA, 37), "cache-unpin") \
+ \
+ /* ESX/SCSI */ \
+ f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \
+ f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \
+ \
+ /* Extensible */ \
+ f(SET_REDIRECT, __CEPH_OSD_OP(WR, DATA, 39), "set-redirect") \
+ f(SET_CHUNK, __CEPH_OSD_OP(WR, DATA, 40), "set-chunk") \
+ f(TIER_PROMOTE, __CEPH_OSD_OP(WR, DATA, 41), "tier-promote") \
+ f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42), "unset-manifest") \
+ \
+ /** attrs **/ \
+ /* read */ \
+ f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \
+ f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \
+ f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \
+ \
+ /* write */ \
+ f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \
+ f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \
+ f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \
+ f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \
+ \
+ /** subop **/ \
+ f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \
+ f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \
+ f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \
+ f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \
+ f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \
+ f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \
+ f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \
+ /* 8 used to be scrub-stop */ \
+ f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \
+ \
+ /** exec **/ \
+ /* note: the RD bit here is wrong; see special-case below in helper */ \
+ f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \
+ \
+ /** pg **/ \
+ f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \
+ f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \
+ f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \
+ f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") \
+ f(PGNLS, __CEPH_OSD_OP(RD, PG, 5), "pgnls") \
+ f(PGNLS_FILTER, __CEPH_OSD_OP(RD, PG, 6), "pgnls-filter") \
+ f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls")
+
+enum {
+#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY
+};
+
+/*
+ * Predicates on an op code. The *_type_* helpers compare the bits selected
+ * by CEPH_OSD_OP_TYPE against one type value; the *_mode_* helpers look at
+ * the mode bits.
+ */
+static inline int ceph_osd_op_type_data(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	const int mode_bits = op & CEPH_OSD_OP_MODE;
+
+	return mode_bits == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+	/* CALL is defined with the RD bit but is deliberately not reported as a read */
+	if (op == CEPH_OSD_OP_CALL)
+		return 0;
+	return (op & CEPH_OSD_OP_MODE_RD) != 0;
+}
+/* returns the raw WR bit (zero or CEPH_OSD_OP_MODE_WR), not a normalized 0/1 */
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	const int wr_bit = op & CEPH_OSD_OP_MODE_WR;
+
+	return wr_bit;
+}
+/* returns the raw CACHE bit (zero or CEPH_OSD_OP_MODE_CACHE), not a normalized 0/1 */
+static inline int ceph_osd_op_mode_cache(int op)
+{
+	const int cache_bit = op & CEPH_OSD_OP_MODE_CACHE;
+
+	return cache_bit;
+}
+/* True for exactly the op codes listed below; false for every other value. */
+static inline bool ceph_osd_op_uses_extent(int op)
+{
+	bool uses_extent = false;
+
+	switch (op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_MAPEXT:
+	case CEPH_OSD_OP_MASKTRUNC:
+	case CEPH_OSD_OP_SPARSE_READ:
+	case CEPH_OSD_OP_SYNC_READ:
+	case CEPH_OSD_OP_WRITE:
+	case CEPH_OSD_OP_WRITEFULL:
+	case CEPH_OSD_OP_TRUNCATE:
+	case CEPH_OSD_OP_ZERO:
+	case CEPH_OSD_OP_APPEND:
+	case CEPH_OSD_OP_TRIMTRUNC:
+	case CEPH_OSD_OP_CMPEXT:
+		uses_extent = true;
+		break;
+	default:
+		break;
+	}
+	return uses_extent;
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * and objclass.h. Any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
+#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
+
+extern const char *ceph_osd_op_name(int op);
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
+ CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
+ CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
+ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
+ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
+ CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
+ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
+ CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000, /* ignore pool overlay */
+ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000, /* map snap direct to clone id
+ */
+ CEPH_OSD_FLAG_ENFORCE_SNAPC =0x100000, /* use snapc provided even if
+ pool uses pool snaps */
+ CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
+ CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
+ CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
+ CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
+ CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */
+};
+
+/* per-op flag bits; ceph_osd_op::flags is annotated as holding CEPH_OSD_OP_FLAG_* */
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object create */
+ CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */
+ CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */
+ CEPH_OSD_OP_FLAG_WITH_REFERENCE = 0x80, /* need reference counting */
+ CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */
+};
+
+#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED 108 /* blacklisted */
+
+/* xattr comparison */
+enum {
+ CEPH_OSD_CMPXATTR_OP_EQ = 1,
+ CEPH_OSD_CMPXATTR_OP_NE = 2,
+ CEPH_OSD_CMPXATTR_OP_GT = 3,
+ CEPH_OSD_CMPXATTR_OP_GTE = 4,
+ CEPH_OSD_CMPXATTR_OP_LT = 5,
+ CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+ CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+ CEPH_OSD_CMPXATTR_MODE_U64 = 2
+};
+
+enum {
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+ * cloneid */
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+};
+
+enum {
+ CEPH_OSD_TMAP2OMAP_NULLOK = 1,
+};
+
+enum {
+ CEPH_OSD_WATCH_OP_UNWATCH = 0,
+ CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+ /* note: use only ODD ids to prevent pre-giant code from
+ interpreting the op as UNWATCH */
+ CEPH_OSD_WATCH_OP_WATCH = 3,
+ CEPH_OSD_WATCH_OP_RECONNECT = 5,
+ CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+enum {
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0,
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1,
+ CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C = 2
+};
+
+const char *ceph_osd_watch_op_name(int o);
+
+enum {
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+const char *ceph_osd_alloc_hint_flag_name(int f);
+
+enum {
+ CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+ CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
+const char *ceph_osd_backoff_op_name(int op);
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_OP_FLAG_* */
+ /*
+ * Per-op arguments. All members share the same storage; the size check
+ * that follows this struct is written in terms of the extent member.
+ */
+ union {
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 count;
+ __le32 start_epoch; /* for the pgls sequence */
+ } __attribute__ ((packed)) pgls;
+ struct {
+ __le64 snapid;
+ } __attribute__ ((packed)) snap;
+ struct {
+ __le64 cookie;
+ __le64 ver; /* no longer used */
+ __u8 op; /* CEPH_OSD_WATCH_OP_* */
+ /*
+ * NOTE(review): gen and timeout are plain __u32, unlike the __le*
+ * multi-byte fields used everywhere else in this struct -- confirm
+ * this is intended.
+ */
+ __u32 gen; /* registration generation */
+ __u32 timeout; /* connection timeout */
+ } __attribute__ ((packed)) watch;
+ struct {
+ __le64 cookie;
+ } __attribute__ ((packed)) notify;
+ struct {
+ __le64 unused;
+ __le64 ver;
+ } __attribute__ ((packed)) assert_ver;
+ struct {
+ __le64 offset, length;
+ __le64 src_offset;
+ } __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 max; /* max data in reply */
+ } __attribute__ ((packed)) copy_get;
+ struct {
+ __le64 snapid;
+ __le64 src_version;
+ __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+ /*
+ * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+ * for src object, flags for dest object are in
+ * ceph_osd_op::flags.
+ */
+ __le32 src_fadvise_flags;
+ } __attribute__ ((packed)) copy_from;
+ struct {
+ struct ceph_timespec stamp;
+ } __attribute__ ((packed)) hit_set_get;
+ struct {
+ __u8 flags;
+ } __attribute__ ((packed)) tmap2omap;
+ struct {
+ __le64 expected_object_size;
+ __le64 expected_write_size;
+ __le32 flags; /* CEPH_OSD_ALLOC_HINT_FLAG_* */
+ } __attribute__ ((packed)) alloc_hint;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le64 data_length;
+ } __attribute__ ((packed)) writesame;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le32 chunk_size;
+ __u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */
+ } __attribute__ ((packed)) checksum;
+ };
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * Check the compatibility of struct ceph_osd_op
+ * (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) +
+ * sizeof(ceph_osd_op::flags) +
+ * sizeof(ceph_osd_op::extent) +
+ * sizeof(ceph_osd_op::payload_len))
+ */
+#ifdef __cplusplus
+static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4),
+ "sizeof(ceph_osd_op) breaks the compatibility");
+#endif
+
+struct ceph_osd_reply_head {
+ __le32 client_inc; /* client incarnation */
+ __le32 flags;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+ struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+ __le32 result; /* result code */
+
+ __le32 object_len; /* length of object name */
+ __le32 num_ops;
+ struct ceph_osd_op ops[0]; /* ops[], object */
+} __attribute__ ((packed));
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
new file mode 120000
index 00000000..51fc03be
--- /dev/null
+++ b/src/include/rados/buffer.h
@@ -0,0 +1 @@
+../buffer.h \ No newline at end of file
diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h
new file mode 120000
index 00000000..bd1f6f1b
--- /dev/null
+++ b/src/include/rados/buffer_fwd.h
@@ -0,0 +1 @@
+../buffer_fwd.h \ No newline at end of file
diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h
new file mode 120000
index 00000000..19ef4317
--- /dev/null
+++ b/src/include/rados/crc32c.h
@@ -0,0 +1 @@
+../crc32c.h \ No newline at end of file
diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h
new file mode 120000
index 00000000..48f0d443
--- /dev/null
+++ b/src/include/rados/inline_memory.h
@@ -0,0 +1 @@
+../inline_memory.h \ No newline at end of file
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
new file mode 100644
index 00000000..58a65afa
--- /dev/null
+++ b/src/include/rados/librados.h
@@ -0,0 +1,4015 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRADOS_H
+#define CEPH_LIBRADOS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <unistd.h>
+#include <string.h>
+#include "rados_types.h"
+
+#include <sys/time.h>
+
+#ifndef CEPH_OSD_TMAP_SET
+/* These are also defined in rados.h and objclass.h. Keep them in sync! */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c'
+#define CEPH_OSD_TMAP_RM 'r'
+#endif
+
+#define LIBRADOS_VER_MAJOR 3
+#define LIBRADOS_VER_MINOR 0
+#define LIBRADOS_VER_EXTRA 0
+/* Pack major/minor/extra into one integer; each argument is parenthesized so arbitrary expressions expand safely. */
+#define LIBRADOS_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+
+#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA)
+
+#define LIBRADOS_SUPPORTS_WATCH 1
+#define LIBRADOS_SUPPORTS_SERVICES 1
+#define LIBRADOS_SUPPORTS_GETADDRS 1
+#define LIBRADOS_SUPPORTS_APP_METADATA 1
+
+/* RADOS lock flags
+ * They are also defined in cls_lock_types.h. Keep them in sync!
+ */
+#define LIBRADOS_LOCK_FLAG_RENEW 0x1
+
+/*
+ * Constants for rados_write_op_create().
+ */
+#define LIBRADOS_CREATE_EXCLUSIVE 1
+#define LIBRADOS_CREATE_IDEMPOTENT 0
+
+/*
+ * Flags that can be set on a per-op basis via
+ * rados_read_op_set_flags() and rados_write_op_set_flags().
+ */
+enum {
+ // fail a create operation if the object already exists
+ LIBRADOS_OP_FLAG_EXCL = 0x1,
+ // allow the transaction to succeed even if the flagged op fails
+ LIBRADOS_OP_FLAG_FAILOK = 0x2,
+ // indicate that the read/write op is random
+ LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4,
+ // indicate that the read/write op is sequential
+ LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8,
+ // indicate read/write data will be accessed in the near future (by someone)
+ LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10,
+ // indicate read/write data will not be accessed in the near future (by anyone)
+ LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20,
+ // indicate read/write data will not be accessed again (by *this* client)
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40,
+ // optionally support FUA (force unit access) on write requests
+ LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80,
+};
+
+#define CEPH_RADOS_API
+
+/**
+ * @name xattr comparison operations
+ * Operators for comparing xattrs on objects, and aborting the
+ * rados_read_op or rados_write_op transaction if the comparison
+ * fails.
+ *
+ * @{
+ */
+enum {
+ LIBRADOS_CMPXATTR_OP_EQ = 1,
+ LIBRADOS_CMPXATTR_OP_NE = 2,
+ LIBRADOS_CMPXATTR_OP_GT = 3,
+ LIBRADOS_CMPXATTR_OP_GTE = 4,
+ LIBRADOS_CMPXATTR_OP_LT = 5,
+ LIBRADOS_CMPXATTR_OP_LTE = 6
+};
+/** @} */
+
+/**
+ * @name Operation Flags
+ * Flags for rados_read_op_operate(), rados_write_op_operate(),
+ * rados_aio_read_op_operate(), and rados_aio_write_op_operate().
+ * See librados.hpp for details.
+ * @{
+ */
+enum {
+ LIBRADOS_OPERATION_NOFLAG = 0,
+ LIBRADOS_OPERATION_BALANCE_READS = 1,
+ LIBRADOS_OPERATION_LOCALIZE_READS = 2,
+ LIBRADOS_OPERATION_ORDER_READS_WRITES = 4,
+ LIBRADOS_OPERATION_IGNORE_CACHE = 8,
+ LIBRADOS_OPERATION_SKIPRWLOCKS = 16,
+ LIBRADOS_OPERATION_IGNORE_OVERLAY = 32,
+ /* send requests to cluster despite the cluster or pool being marked
+ full; ops will either succeed (e.g., delete) or return EDQUOT or
+ ENOSPC. */
+ LIBRADOS_OPERATION_FULL_TRY = 64,
+ /*
+ * Mainly for delete op
+ */
+ LIBRADOS_OPERATION_FULL_FORCE = 128,
+ LIBRADOS_OPERATION_IGNORE_REDIRECT = 256,
+ LIBRADOS_OPERATION_ORDERSNAP = 512,
+};
+/** @} */
+
+/**
+ * @name Alloc hint flags
+ * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2()
+ * indicating future IO patterns.
+ * @{
+ */
+enum {
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+/** @} */
+
+typedef enum {
+ LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0,
+ LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1,
+ LIBRADOS_CHECKSUM_TYPE_CRC32C = 2
+} rados_checksum_type_t;
+
+/*
+ * snap id constants
+ */
+#define LIBRADOS_SNAP_HEAD ((uint64_t)(-2))
+#define LIBRADOS_SNAP_DIR ((uint64_t)(-1))
+
+/**
+ * @typedef rados_t
+ *
+ * A handle for interacting with a RADOS cluster. It encapsulates all
+ * RADOS client configuration, including username, key for
+ * authentication, logging, and debugging. Talking to different clusters
+ * -- or to the same cluster with different users -- requires
+ * different cluster handles.
+ */
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif //VOIDPTR_RADOS_T
+
+/**
+ * @typedef rados_config_t
+ *
+ * A handle for the ceph configuration context for the rados_t cluster
+ * instance. This can be used to share configuration context/state
+ * (e.g., logging configuration) between librados instances.
+ *
+ * @warning The config context does not have independent reference
+ * counting. As such, a rados_config_t handle retrieved from a given
+ * rados_t is only valid as long as that rados_t.
+ */
+typedef void *rados_config_t;
+
+/**
+ * @typedef rados_ioctx_t
+ *
+ * An io context encapsulates a few settings for all I/O operations
+ * done on it:
+ * - pool - set when the io context is created (see rados_ioctx_create())
+ * - snapshot context for writes (see
+ * rados_ioctx_selfmanaged_snap_set_write_ctx())
+ * - snapshot id to read from (see rados_ioctx_snap_set_read())
+ * - object locator for all single-object operations (see
+ * rados_ioctx_locator_set_key())
+ * - namespace for all single-object operations (see
+ * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES
+ * before rados_nobjects_list_open() will list all objects in all
+ * namespaces.
+ *
+ * @warning Changing any of these settings is not thread-safe -
+ * librados users must synchronize any of these changes on their own,
+ * or use separate io contexts for each thread
+ */
+typedef void *rados_ioctx_t;
+
+/**
+ * @typedef rados_list_ctx_t
+ *
+ * An iterator for listing the objects in a pool.
+ * Used with rados_nobjects_list_open(),
+ * rados_nobjects_list_next(), rados_nobjects_list_next2(), and
+ * rados_nobjects_list_close().
+ */
+typedef void *rados_list_ctx_t;
+
+/**
+ * @typedef rados_object_list_cursor
+ *
+ * The cursor used with rados_enumerate_objects
+ * and accompanying methods.
+ */
+typedef void * rados_object_list_cursor;
+
+/**
+ * @struct rados_object_list_item
+ *
+ * The item populated by rados_object_list in
+ * the results array.
+ */
+typedef struct rados_object_list_item {
+
+ /// oid length
+ size_t oid_length;
+ /// name of the object
+ char *oid;
+ /// namespace length
+ size_t nspace_length;
+ /// the object namespace
+ char *nspace;
+ /// locator length
+ size_t locator_length;
+ /// object locator
+ char *locator;
+} rados_object_list_item;
+
+/**
+ * @typedef rados_snap_t
+ * The id of a snapshot.
+ */
+typedef uint64_t rados_snap_t;
+
+/**
+ * @typedef rados_xattrs_iter_t
+ * An iterator for listing extended attributes on an object.
+ * Used with rados_getxattrs(), rados_getxattrs_next(), and
+ * rados_getxattrs_end().
+ */
+typedef void *rados_xattrs_iter_t;
+
+/**
+ * @typedef rados_omap_iter_t
+ * An iterator for listing omap key/value pairs on an object.
+ * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and
+ * rados_omap_get_end().
+ */
+typedef void *rados_omap_iter_t;
+
+/**
+ * @struct rados_pool_stat_t
+ * Usage information for a pool.
+ */
+struct rados_pool_stat_t {
+ /// space used in bytes
+ uint64_t num_bytes;
+ /// space used in KB
+ uint64_t num_kb;
+ /// number of objects in the pool
+ uint64_t num_objects;
+ /// number of clones of objects
+ uint64_t num_object_clones;
+ /// num_objects * num_replicas
+ uint64_t num_object_copies;
+ /// number of objects missing on primary
+ uint64_t num_objects_missing_on_primary;
+ /// number of objects found on no OSDs
+ uint64_t num_objects_unfound;
+ /// number of objects replicated fewer times than they should be
+ /// (but found on at least one OSD)
+ uint64_t num_objects_degraded;
+ /// number of objects read
+ uint64_t num_rd;
+ /// objects read in KB
+ uint64_t num_rd_kb;
+ /// number of objects written
+ uint64_t num_wr;
+ /// objects written in KB
+ uint64_t num_wr_kb;
+ /// bytes originally provided by user
+ uint64_t num_user_bytes;
+ /// bytes passed compression
+ uint64_t compressed_bytes_orig;
+ /// bytes resulted after compression
+ uint64_t compressed_bytes;
+ /// bytes allocated at storage
+ uint64_t compressed_bytes_alloc;
+};
+
+/**
+ * @struct rados_cluster_stat_t
+ * Cluster-wide usage information
+ */
+struct rados_cluster_stat_t {
+ /// total device size
+ uint64_t kb;
+ /// total used
+ uint64_t kb_used;
+ /// total available/free
+ uint64_t kb_avail;
+ /// number of objects
+ uint64_t num_objects;
+};
+
+/**
+ * @typedef rados_write_op_t
+ *
+ * An object write operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_write_op() rados_release_write_op()
+ * - Extended attribute manipulation: rados_write_op_cmpxattr(),
+ * rados_write_op_setxattr(),
+ * rados_write_op_rmxattr()
+ * - Object map key/value pairs: rados_write_op_omap_set(),
+ * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(),
+ * rados_write_op_omap_cmp()
+ * - Object properties: rados_write_op_assert_exists(),
+ * rados_write_op_assert_version()
+ * - Creating objects: rados_write_op_create()
+ * - IO on objects: rados_write_op_append(), rados_write_op_write(),
+ * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove(),
+ * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext()
+ * - Hints: rados_write_op_set_alloc_hint()
+ * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate()
+ */
+typedef void *rados_write_op_t;
+
+/**
+ * @typedef rados_read_op_t
+ *
+ * An object read operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_read_op() rados_release_read_op()
+ * - Extended attribute manipulation: rados_read_op_cmpxattr(),
+ * rados_read_op_getxattr(), rados_read_op_getxattrs()
+ * - Object map key/value pairs: rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(),
+ * rados_read_op_omap_cmp()
+ * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(),
+ * rados_read_op_assert_version()
+ * - IO on objects: rados_read_op_read(), rados_read_op_checksum(),
+ * rados_read_op_cmpext()
+ * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf()
+ * - Request properties: rados_read_op_set_flags()
+ * - Performing the operation: rados_read_op_operate(),
+ * rados_aio_read_op_operate()
+ */
+typedef void *rados_read_op_t;
+
+/**
+ * @typedef rados_completion_t
+ * Represents the state of an asynchronous operation - it contains the
+ * return value once the operation completes, and can be used to block
+ * until the operation is complete or safe.
+ */
+typedef void *rados_completion_t;
+
+/**
+ * @struct blkin_trace_info
+ * blkin trace information for Zipkin tracing
+ */
+struct blkin_trace_info;
+
+/**
+ * Get the version of librados.
+ *
+ * The version number is major.minor.extra. Note that this is
+ * unrelated to the Ceph version number.
+ *
+ * TODO: define version semantics, i.e.:
+ * - incrementing major is for backwards-incompatible changes
+ * - incrementing minor is for backwards-compatible changes
+ * - incrementing extra is for bug fixes
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param extra where to store the extra version number
+ */
+CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra);
+
+/**
+ * @name Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using librados.
+ *
+ * @{
+ */
+
+/**
+ * Create a handle for communicating with a RADOS cluster.
+ *
+ * Ceph environment variables are read when this is called, so if
+ * $CEPH_ARGS specifies everything you need to connect, no further
+ * configuration is necessary.
+ *
+ * @param cluster where to store the handle
+ * @param id the user to connect as (i.e. admin, not client.admin)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id);
+
+/**
+ * Extended version of rados_create.
+ *
+ * Like rados_create, but
+ * 1) don't assume 'client\.'+id; allow full specification of name
+ * 2) allow specification of cluster name
+ * 3) flags for future expansion
+ */
+CEPH_RADOS_API int rados_create2(rados_t *pcluster,
+ const char *const clustername,
+ const char * const name, uint64_t flags);
+
+/**
+ * Initialize a cluster handle from an existing configuration.
+ *
+ * Share configuration state with another rados_t instance.
+ *
+ * @param cluster where to store the handle
+ * @param cct the existing configuration to use
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create_with_context(rados_t *cluster,
+ rados_config_t cct);
+
+/**
+ * Ping the monitor with ID mon_id, storing the resulting reply in
+ * outstr (if specified) and its length in outstrlen (if specified).
+ *
+ * The result buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param[in] mon_id ID of the monitor to ping
+ * @param[out] outstr double pointer with the resulting reply
+ * @param[out] outstrlen pointer with the size of the reply in outstr
+ */
+CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id,
+ char **outstr, size_t *outstrlen);
+
+/**
+ * Connect to the cluster.
+ *
+ * @note BUG: Before calling this, calling a function that communicates with the
+ * cluster will crash.
+ *
+ * @pre The cluster handle is configured with at least a monitor
+ * address. If cephx is enabled, a client name and secret must also be
+ * set.
+ *
+ * @post If this succeeds, any function in librados may be used
+ *
+ * @param cluster The cluster to connect to.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_connect(rados_t cluster);
+
+/**
+ * Disconnects from the cluster.
+ *
+ * For clean up, this is only necessary after rados_connect() has
+ * succeeded.
+ *
+ * @warning This does not guarantee any asynchronous writes have
+ * completed. To do that, you must call rados_aio_flush() on all open
+ * io contexts.
+ *
+ * @warning We implicitly call rados_watch_flush() on shutdown. If
+ * there are watches being used, this should be done explicitly before
+ * destroying the relevant IoCtx. We do it here as a safety measure.
+ *
+ * @post the cluster handle cannot be used again
+ *
+ * @param cluster the cluster to shutdown
+ */
+CEPH_RADOS_API void rados_shutdown(rados_t cluster);
+
+/** @} init */
+
+/**
+ * @name Configuration
+ * These functions read and update Ceph configuration for a cluster
+ * handle. Any configuration changes must be done before connecting to
+ * the cluster.
+ *
+ * Options that librados users might want to set include:
+ * - mon_host
+ * - auth_supported
+ * - key, keyfile, or keyring when using cephx
+ * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog
+ * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms
+ *
+ * See docs.ceph.com for information about available configuration options.
+ *
+ * @{
+ */
+
+/**
+ * Configure the cluster handle using a Ceph config file
+ *
+ * If path is NULL, the default locations are searched, and the first
+ * found is used. The locations are:
+ * - $CEPH_CONF (environment variable)
+ * - /etc/ceph/ceph.conf
+ * - ~/.ceph/config
+ * - ceph.conf (in the current working directory)
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param path path to a Ceph configuration file
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path);
+
+/**
+ * Configure the cluster handle with command line arguments
+ *
+ * argv can contain any common Ceph command line option, including any
+ * configuration parameter prefixed by '--' and replacing spaces with
+ * dashes or underscores. For example, the following options are equivalent:
+ * - --mon-host 10.0.0.1:6789
+ * - --mon_host 10.0.0.1:6789
+ * - -m 10.0.0.1:6789
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc,
+ const char **argv);
+
+
+/**
+ * Configure the cluster handle with command line arguments, returning
+ * any remainders. Same as rados_conf_parse_argv(), except for an extra
+ * remargv argument that holds the returned unrecognized arguments.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @param remargv char* array for returned unrecognized arguments
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc,
+ const char **argv,
+ const char **remargv);
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cluster cluster handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var);
+
+/**
+ * Set a configuration option
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param option option to set
+ * @param value value of the option
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when the option is not a Ceph configuration option
+ */
+CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option,
+ const char *value);
+
+/**
+ * Get the value of a configuration option
+ *
+ * @param cluster configuration to read
+ * @param option which option to read
+ * @param buf where to write the configuration value
+ * @param len the size of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENAMETOOLONG if the buffer is too short to contain the
+ * requested value
+ */
+CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option,
+ char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * Read usage info about the cluster
+ *
+ * This tells you total space, space used, space available, and number
+ * of objects. These are not updated immediately when data is written,
+ * they are eventually consistent.
+ *
+ * @param cluster cluster to query
+ * @param result where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cluster_stat(rados_t cluster,
+ struct rados_cluster_stat_t *result);
+
+/**
+ * Get the fsid of the cluster as a hexadecimal string.
+ *
+ * The fsid is a unique id of an entire Ceph cluster.
+ *
+ * @param cluster where to get the fsid
+ * @param buf where to write the fsid
+ * @param len the size of buf in bytes (should be 37)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short to contain the
+ * fsid
+ */
+CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len);
+
+/**
+ * Get/wait for the most recent osdmap
+ *
+ * @param cluster the cluster to query
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
+
+/**
+ * @name Pools
+ *
+ * RADOS pools are separate namespaces for objects. Pools may have
+ * different crush rules associated with them, so they could have
+ * differing replication levels or placement strategies. RADOS
+ * permissions are also tied to pools - users can have different read,
+ * write, and execute permissions on a per-pool basis.
+ *
+ * @{
+ */
+
+/**
+ * List pools
+ *
+ * Gets a list of pool names as NULL-terminated strings. The pool
+ * names will be placed in the supplied buffer one after another.
+ * After the last pool name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the pool name entries we need, we will fill
+ * as much as we can.
+ *
+ * Buf may be null to determine the buffer size needed to list all pools.
+ *
+ * @param cluster cluster handle
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all pools
+ */
+CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len);
+
+/**
+ * List inconsistent placement groups of the given pool
+ *
+ * Gets a list of inconsistent placement groups as NULL-terminated strings.
+ * The placement group names will be placed in the supplied buffer one after
+ * another. After the last name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the placement group entries we need, we will
+ * fill as much as we can.
+ *
+ * @param cluster cluster handle
+ * @param pool pool ID
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all inconsistent placement groups
+ */
+CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool,
+ char *buf, size_t len);
+
+/**
+ * Get a configuration handle for a rados cluster handle
+ *
+ * This handle is valid only as long as the cluster handle is valid.
+ *
+ * @param cluster cluster handle
+ * @returns config handle for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster);
+
+/**
+ * Get a global id for current instance
+ *
+ * This id is a unique representation of current connection to the cluster
+ *
+ * @param cluster cluster handle
+ * @returns instance global id
+ */
+CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster);
+
+/**
+ * Gets the minimum compatible OSD version
+ *
+ * @param cluster cluster handle
+ * @param[out] require_osd_release minimum compatible OSD version
+ * based upon the current features
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster,
+ int8_t* require_osd_release);
+
+/**
+ * Gets the minimum compatible client version
+ *
+ * @param cluster cluster handle
+ * @param[out] min_compat_client minimum compatible client version
+ * based upon the current features
+ * @param[out] require_min_compat_client required minimum client version
+ * based upon explicit setting
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster,
+ int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+/**
+ * Create an io context
+ *
+ * The io context allows you to perform operations within a particular
+ * pool. For more details see rados_ioctx_t.
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name name of the pool
+ * @param ioctx where to store the io context
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name,
+ rados_ioctx_t *ioctx);
+CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id,
+ rados_ioctx_t *ioctx);
+
+/**
+ * The opposite of rados_ioctx_create
+ *
+ * This just tells librados that you no longer need to use the io context.
+ * It may not be freed immediately if there are pending asynchronous
+ * requests on it, but you should not use an io context again after
+ * calling this function on it.
+ *
+ * @warning This does not guarantee any asynchronous
+ * writes have completed. You must call rados_aio_flush()
+ * on the io context before destroying it to do that.
+ *
+ * @warning If this ioctx is used by rados_watch, the caller needs to
+ * be sure that all registered watches are disconnected via
+ * rados_unwatch() and that rados_watch_flush() is called. This
+ * ensures that a racing watch callback does not make use of a
+ * destroyed ioctx.
+ *
+ * @param io the io context to dispose of
+ */
+CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io);
+
+/**
+ * Get configuration handle for a pool handle
+ *
+ * @param io pool handle
+ * @returns rados_config_t for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io);
+
+/**
+ * Get the cluster handle used by this rados_ioctx_t
+ * Note that this is a weak reference, and should not
+ * be destroyed via rados_shutdown().
+ *
+ * @param io the io context
+ * @returns the cluster handle for this io context
+ */
+CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io);
+
+/**
+ * Get pool usage statistics
+ *
+ * Fills in a rados_pool_stat_t after querying the cluster.
+ *
+ * @param io determines which pool to query
+ * @param stats where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io,
+ struct rados_pool_stat_t *stats);
+
+/**
+ * Get the id of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name which pool to look up
+ * @returns id of the pool
+ * @returns -ENOENT if the pool is not found
+ */
+CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster,
+ const char *pool_name);
+
+/**
+ * Get the name of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param id the id of the pool
+ * @param buf where to store the pool name
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id,
+ char *buf, size_t maxlen);
+
+/**
+ * Create a pool with default settings
+ *
+ * The default crush rule is rule 0.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name);
+
+/**
+ * Create a pool owned by a specific auid.
+ *
+ * DEPRECATED: auid support has been removed, and this call will be removed in a future
+ * release.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid)
+ __attribute__((deprecated));
+
+/**
+ * Create a pool with a specific CRUSH rule
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster,
+ const char *pool_name,
+ uint8_t crush_rule_num);
+
+/**
+ * Create a pool with a specific CRUSH rule and auid
+ *
+ * DEPRECATED: auid support has been removed and this call will be removed
+ * in a future release.
+ *
+ * This is a combination of rados_pool_create_with_crush_rule() and
+ * rados_pool_create_with_auid().
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid,
+ uint8_t crush_rule_num)
+ __attribute__((deprecated));
+
+/**
+ * Returns the pool that is the base tier for this pool.
+ *
+ * The return value is the ID of the pool that should be used to read from/write to.
+ * If tiering is not set up for the pool, returns \c pool.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool ID of the pool to query
+ * @param[out] base_tier base tier, or \c pool if tiering is not configured
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool,
+ int64_t* base_tier);
+
+/**
+ * Delete a pool and all data inside it
+ *
+ * The pool is removed from the cluster immediately,
+ * but the actual data is deleted in the background.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool_name which pool to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name);
+
+/**
+ * Attempt to change an io context's associated auid "owner"
+ *
+ * DEPRECATED: auid support has been removed and this call has no effect.
+ *
+ * Requires that you have write permission on both the current and new
+ * auid.
+ *
+ * @param io reference to the pool to change.
+ * @param auid the auid you wish the io to have.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid)
+ __attribute__((deprecated));
+
+
+/**
+ * Get the auid of a pool
+ *
+ * DEPRECATED: auid support has been removed and this call always reports
+ * CEPH_AUTH_UID_DEFAULT (-1).
+ *
+ * @param io pool to query
+ * @param auid where to store the auid
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid)
+ __attribute__((deprecated));
+
+/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Test whether the specified pool requires alignment or not.
+ *
+ * @param io pool to query
+ * @param[out] req set to 1 if alignment is required, 0 if not.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io,
+ int *req);
+
+/* deprecated, use rados_ioctx_pool_required_alignment2 instead */
+CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Get the alignment flavor of a pool
+ *
+ * @param io pool to query
+ * @param alignment where to store the alignment flavor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io,
+ uint64_t *alignment);
+
+/**
+ * Get the pool id of the io context
+ *
+ * @param io the io context to query
+ * @returns the id of the pool the io context uses
+ */
+CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io);
+
+/**
+ * Get the pool name of the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} pools */
+
+/**
+ * @name Object Locators
+ *
+ * @{
+ */
+
+/**
+ * Set the key for mapping objects to pgs within an io context.
+ *
+ * The key is used instead of the object name to determine which
+ * placement groups an object is put in. This affects all subsequent
+ * operations of the io context - until a different locator key is
+ * set, all objects in this io context will be placed in the same pg.
+ *
+ * @param io the io context to change
+ * @param key the key to use as the object locator, or NULL to discard
+ * any previously set key
+ */
+CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io,
+ const char *key);
+
+/**
+ * Set the namespace for objects within an io context
+ *
+ * The namespace specification further refines a pool into different
+ * domains. The mapping of objects to pgs is also based on this
+ * value.
+ *
+ * @param io the io context to change
+ * @param nspace the name to use as the namespace, or NULL use the
+ * default namespace
+ */
+CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io,
+ const char *nspace);
+
+/**
+ * Get the namespace for objects within the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} obj_loc */
+
+/**
+ * @name Listing Objects
+ * @{
+ */
+/**
+ * Start listing objects in a pool
+ *
+ * @param io the pool to list from
+ * @param ctx the handle to store list context in
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io,
+ rados_list_ctx_t *ctx);
+
+/**
+ * Return hash position of iterator, rounded to the current PG
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @returns current hash position, rounded to the current pg
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx);
+
+/**
+ * Reposition object iterator to a different hash position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param pos hash position to move to
+ * @returns actual (rounded) position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek(rados_list_ctx_t ctx,
+ uint32_t pos);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor position to move to
+ * @returns rounded position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor cursor);
+
+/**
+ * Get a cursor for the current position of the object iterator
+ *
+ * The returned handle must be released with rados_object_list_cursor_free().
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor where to store cursor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor *cursor);
+
+/**
+ * Get the next object name and locator in the pool
+ *
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace);
+
+/**
+ * Get the next object name, locator and their sizes in the pool
+ *
+ * The sizes make it possible to list objects whose names contain \0
+ * (the NUL character), e.g. in *entry. It is unusual to see such object
+ * names, but a bug in a client has raised the need to handle them as well.
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @param entry_size where to store the size of name of the entry
+ * @param key_size where to store the size of object locator (set to NULL to ignore)
+ * @param nspace_size where to store the size of object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace,
+ size_t *entry_size,
+ size_t *key_size,
+ size_t *nspace_size);
+
+/**
+ * Close the object listing handle.
+ *
+ * This should be called when the handle is no longer needed.
+ * The handle should not be used after it has been closed.
+ *
+ * @param ctx the handle to close
+ */
+CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx);
+
+/**
+ * Get cursor handle pointing to the *beginning* of a pool.
+ *
+ * This is an opaque handle pointing to the start of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin(
+ rados_ioctx_t io);
+
+/**
+ * Get cursor handle pointing to the *end* of a pool.
+ *
+ * This is an opaque handle pointing to the end of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io);
+
+/**
+ * Check if a cursor has reached the end of a pool
+ *
+ * @param io ioctx
+ * @param cur cursor
+ * @returns 1 if the cursor has reached the end of the pool, 0 otherwise
+ */
+CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Release a cursor
+ *
+ * Release a cursor. The handle may not be used after this point.
+ *
+ * @param io ioctx
+ * @param cur cursor
+ */
+CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Compare two cursor positions
+ *
+ * Compare two cursors, and indicate whether the first cursor precedes,
+ * matches, or follows the second.
+ *
+ * @param io ioctx
+ * @param lhs first cursor
+ * @param rhs second cursor
+ * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs
+ */
+CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io,
+ rados_object_list_cursor lhs, rados_object_list_cursor rhs);
+
+/**
+ * List objects between two cursor positions
+ *
+ * NOTE(review): the parameter descriptions below are inferred from the
+ * parameter names and from the neighbouring cursor functions; confirm
+ * them against the implementation.
+ *
+ * @param io ioctx
+ * @param start cursor to start listing from
+ * @param finish cursor to stop listing at
+ * @param result_size presumably the capacity of the results array
+ * @param filter_buf presumably an optional filter to apply to the listing
+ * @param filter_buf_len presumably the length of filter_buf in bytes
+ * @param results array in which the listed items are stored
+ * @param next where to store a cursor (presumably the position at which
+ * a subsequent listing should resume)
+ * @return the number of items set in the results array
+ */
+CEPH_RADOS_API int rados_object_list(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t result_size,
+ const char *filter_buf,
+ const size_t filter_buf_len,
+ rados_object_list_item *results,
+ rados_object_list_cursor *next);
+
+/**
+ * Free the items of a results array
+ *
+ * NOTE(review): presumably releases the items that rados_object_list()
+ * stored in results, with result_size being the number of items to
+ * release; confirm against the implementation.
+ *
+ * @param result_size size of the results array
+ * @param results the array of items to free
+ */
+CEPH_RADOS_API void rados_object_list_free(
+ const size_t result_size,
+ rados_object_list_item *results);
+
+/**
+ * Obtain cursors delineating a subset of a range. Use this
+ * when you want to split up the work of iterating over the
+ * global namespace. Expected use case is when you are iterating
+ * in parallel, with `m` workers, and each worker taking an id `n`.
+ *
+ * @param io ioctx
+ * @param start start of the range to be sliced up (inclusive)
+ * @param finish end of the range to be sliced up (exclusive)
+ * @param n which of the m chunks you would like to get cursors for
+ * @param m how many chunks to divide start-finish into
+ * @param split_start cursor populated with start of the subrange (inclusive)
+ * @param split_finish cursor populated with end of the subrange (exclusive)
+ */
+CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t n,
+ const size_t m,
+ rados_object_list_cursor *split_start,
+ rados_object_list_cursor *split_finish);
+
+
+/** @} Listing Objects */
+
+/**
+ * @name Snapshots
+ *
+ * RADOS snapshots are based upon sequence numbers that form a
+ * snapshot context. They are pool-specific. The snapshot context
+ * consists of the current snapshot sequence number for a pool, and an
+ * array of sequence numbers at which snapshots were taken, in
+ * descending order. Whenever a snapshot is created or deleted, the
+ * snapshot sequence number for the pool is increased. To add a new
+ * snapshot, the new snapshot sequence number must be increased and
+ * added to the snapshot context.
+ *
+ * There are two ways to manage these snapshot contexts:
+ * -# within the RADOS cluster
+ * These are called pool snapshots, and store the snapshot context
+ * in the OSDMap. These represent a snapshot of all the objects in
+ * a pool.
+ * -# within the RADOS clients
+ * These are called self-managed snapshots, and push the
+ * responsibility for keeping track of the snapshot context to the
+ * clients. For every write, the client must send the snapshot
+ * context. In librados, this is accomplished with
+ * rados_selfmanaged_snap_set_write_ctx(). These are more
+ * difficult to manage, but are restricted to specific objects
+ * instead of applying to an entire pool.
+ *
+ * @{
+ */
+
+/**
+ * Create a pool-wide snapshot
+ *
+ * @param io the pool to snapshot
+ * @param snapname the name of the snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Delete a pool snapshot
+ *
+ * @param io the pool to delete the snapshot from
+ * @param snapname which snapshot to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Rollback an object to a pool snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapname which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname);
+
+/**
+ * @warning Deprecated: Use rados_ioctx_snap_rollback() instead
+ */
+CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname)
+ __attribute__((deprecated));
+
+/**
+ * Set the snapshot from which reads are performed.
+ *
+ * Subsequent reads will return data as it was at the time of that
+ * snapshot.
+ *
+ * @param io the io context to change
+ * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no
+ * snapshot (i.e. normal operation)
+ */
+CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io,
+ rados_snap_t snap);
+
+/**
+ * Allocate an ID for a self-managed snapshot
+ *
+ * Get a unique ID to put in the snapshot context to create a
+ * snapshot. A clone of an object is not created until a write with
+ * the new snapshot context is completed.
+ *
+ * @param io the pool in which the snapshot will exist
+ * @param snapid where to store the newly allocated snapshot ID
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid,
+ rados_completion_t completion);
+
+/**
+ * Remove a self-managed snapshot
+ *
+ * This increases the snapshot sequence number, which will cause
+ * snapshots to be removed lazily.
+ *
+ * @param io the pool in which the snapshot exists
+ * @param snapid the ID of the snapshot to remove
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid,
+ rados_completion_t completion);
+
+/**
+ * Rollback an object to a self-managed snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapid which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io,
+ const char *oid,
+ rados_snap_t snapid);
+
+/**
+ * Set the snapshot context for use when writing to objects
+ *
+ * This is stored in the io context, and applies to all future writes.
+ *
+ * @param io the io context to change
+ * @param seq the newest snapshot sequence number for the pool
+ * @param snaps array of snapshots sorted by descending id
+ * @param num_snaps how many snapshots are in the snaps array
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snaps are not in descending order
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+ rados_snap_t seq,
+ rados_snap_t *snaps,
+ int num_snaps);
+
+/**
+ * List all the ids of pool snapshots
+ *
+ * If the output array does not have enough space to fit all the
+ * snapshots, -ERANGE is returned and the caller should retry with a
+ * larger array.
+ *
+ * @param io the pool to read from
+ * @param snaps where to store the results
+ * @param maxlen the number of rados_snap_t that fit in the snaps array
+ * @returns number of snapshots on success, negative error code on failure
+ * @returns -ERANGE is returned if the snaps array is too short
+ */
+CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps,
+ int maxlen);
+
+/**
+ * Get the id of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param name the snapshot to find
+ * @param id where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name,
+ rados_snap_t *id);
+
+/**
+ * Get the name of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param id the snapshot to find
+ * @param name where to store the result
+ * @param maxlen the size of the name array
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the name array is too small
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id,
+ char *name, int maxlen);
+
+/**
+ * Find when a pool snapshot occurred
+ *
+ * @param io the pool the snapshot was taken in
+ * @param id the snapshot to lookup
+ * @param t where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id,
+ time_t *t);
+
+/** @} Snapshots */
+
+/**
+ * @name Synchronous I/O
+ * Writes are replicated to a number of OSDs based on the
+ * configuration of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_aio_wait_for_complete(). For greater data safety, use the
+ * asynchronous functions and rados_aio_wait_for_safe().
+ *
+ * @{
+ */
+
+/**
+ * Return the version of the last object read or written to.
+ *
+ * This exposes the internal version number of the last object read or
+ * written via this io context
+ *
+ * @param io the io context to check
+ * @returns last read or written object version
+ */
+CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object, starting at
+ * offset *off*. The value of *len* must be <= UINT_MAX/2.
+ *
+ * @note This will never return a positive value not equal to len.
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Write the same *data_len* bytes from *buf* multiple times into the
+ * *oid* object. *write_len* bytes are written in total, which must be
+ * a multiple of *data_len*. The value of *write_len* and *data_len*
+ * must be <= UINT_MAX/2.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Append *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf,
+ size_t len, uint64_t off);
+
+/**
+ * Compute checksum from object data
+ *
+ * The io context determines the snapshot to checksum, if any was set
+ * by rados_ioctx_snap_set_read(). The length of the init_value and
+ * resulting checksum are dependent upon the checksum type:
+ *
+ * XXHASH64: le64
+ * XXHASH32: le32
+ * CRC32C: le32
+ *
+ * The checksum result is encoded in the following manner:
+ *
+ * le32 num_checksum_chunks
+ * {
+ * leXX checksum for chunk (where XX = appropriate size for the checksum type)
+ * } * num_checksum_chunks
+ *
+ * @param io the context in which to perform the checksum
+ * @param oid the name of the object to checksum
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param len the number of bytes to checksum
+ * @param off the offset to start checksumming in the object
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result
+ * @param checksum_len the number of bytes available for the result
+ * @return negative error code on failure
+ */
+CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid,
+ rados_checksum_type_t type,
+ const char *init_value, size_t init_value_len,
+ size_t len, uint64_t off, size_t chunk_size,
+ char *pchecksum, size_t checksum_len);
+
+/**
+ * Delete an object
+ *
+ * @note This does not delete any snapshots of the object.
+ *
+ * @param io the pool to delete the object from
+ * @param oid the name of the object to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @param io the context in which to truncate
+ * @param oid the name of the object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid,
+ uint64_t size);
+
+/**
+ * Compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o name of the object
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o,
+ const char *cmp_buf, size_t cmp_len,
+ uint64_t off);
+
+/**
+ * @name Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name, const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Get the next omap key/value pair on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key is
+ * null-terminated, and val has length len. If the end of the list has
+ * been reached, key and val are NULL, and len is 0. key and val will
+ * not be accessible after rados_omap_get_end() is called on iter, so
+ * if they are needed after that they should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *len);
+
+/**
+ * Get the next omap key/value pair on the object. Note that it's
+ * perfectly safe to mix calls to rados_omap_get_next and
+ * rados_omap_get_next2.
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key has length
+ * key_len and val has length val_len. If the end of the list has
+ * been reached, key and val are NULL, and key_len and val_len are 0.
+ * key and val will not be accessible after rados_omap_get_end()
+ * is called on iter, so if they are needed after that they
+ * should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param key_len where to store the number of bytes in key
+ * @param val_len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *key_len,
+ size_t *val_len);
+
+/**
+ * Return number of elements in the iterator
+ *
+ * @param iter the iterator of which to return the size
+ * @returns the number of elements in the iterator
+ */
+CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter);
+
+/**
+ * Close the omap iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter);
+
+/**
+ * Get object stats (size/mtime)
+ *
+ * TODO: when are these set, and by whom? can they be out of date?
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize,
+ time_t *pmtime);
+/**
+ * Execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param oid the object to call the method on
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns the length of the output, or -ERANGE if buf does not have
+ * enough space to store it (for methods that return data). For
+ * methods that don't return data, the return value is
+ * method-specific.
+ */
+CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len, char *buf,
+ size_t out_len);
+
+
+/** @} Synchronous I/O */
+
+/**
+ * @name Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_callback_t
+ * Callbacks for asynchronous operations take two parameters:
+ * - cb the completion that has finished
+ * - arg application defined data made available to the callback function
+ */
+typedef void (*rados_callback_t)(rados_completion_t cb, void *arg);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * TODO: more complete documentation of this elsewhere (in the RADOS docs?)
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_completion_t *pc);
+
+/**
+ * Block until an operation completes
+ *
+ * This means it is in memory on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c);
+
+/**
+ * Block until an operation is safe
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c);
+
+/**
+ * Has an asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c);
+
+/**
+ * Block until an operation completes and callback completes
+ *
+ * This means it is in memory on all replicas and can be read.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c);
+
+/**
+ * Block until an operation is safe and callback has completed
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c);
+
+/**
+ * Has an asynchronous operation and callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe and has the callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c);
+
+/**
+ * Get the return value of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns return value of the operation
+ */
+CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c);
+
+/**
+ * Get the internal object version of the target of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns version number of the asynchronous operation's target
+ */
+CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c);
+
+/**
+ * Release a completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c completion to release
+ */
+CEPH_RADOS_API void rados_aio_release(rados_completion_t c);
+
+/**
+ * Write data to an object asynchronously
+ *
+ * Queues the write and returns. The return value of the completion
+ * will be 0 on success, negative error code on failure.
+ *
+ * @param io the context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Asynchronously append data to an object
+ *
+ * Queues the append and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the append is safe and complete
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write an entire object
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ * Queues the write_full and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write_full is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write the same buffer multiple times
+ *
+ * Queues the writesame and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the writesame is safe and complete
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @note only the 'complete' callback of the completion will be called.
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param completion what to do when the read is complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ char *buf, size_t len, uint64_t off);
+
+/**
+ * Block until all pending writes in an io context are safe
+ *
+ * This is not equivalent to calling rados_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @note BUG: always returns 0, should be void or accept a timeout
+ *
+ * @param io the context to flush
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io);
+
+
+/**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * rados_aio_flush().
+ *
+ * @param io the context to flush
+ * @param completion what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io,
+ rados_completion_t completion);
+
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param completion what to do when the stat is complete
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ uint64_t *psize, time_t *pmtime);
+
+/**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o the name of the object to compare with
+ * @param completion what to do when the comparison is complete
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off);
+
+/**
+ * Cancel async operation
+ *
+ * @param io ioctx
+ * @param completion completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param o name of the object
+ * @param completion what to do when the exec completes
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len,
+ char *buf, size_t out_len);
+
+/** @} Asynchronous I/O */
+
+/**
+ * @name Asynchronous Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Asynchronously get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param completion what to do when the getxattr completes
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Asynchronously set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param completion what to do when the setxattr completes
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param completion what to do when the rmxattr completes
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name);
+
+/**
+ * Asynchronously start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param completion what to do when the getxattrs completes
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ rados_xattrs_iter_t *iter);
+
+/** @} Asynchronous Xattrs */
+
+/**
+ * @name Watch/Notify
+ *
+ * Watch/notify is a protocol to help communicate among clients. It
+ * can be used to synchronize client state. All that's needed is a
+ * well-known object name (for example, rbd uses the header object of
+ * an image).
+ *
+ * Watchers register an interest in an object, and receive all
+ * notifies on that object. A notify attempts to communicate with all
+ * clients watching an object, and blocks on the notifier until each
+ * client responds or a timeout is reached.
+ *
+ * See rados_watch() and rados_notify() for more details.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_watchcb_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param opcode undefined
+ * @param ver version of the watched object
+ * @param arg application-specific data
+ *
+ * @note BUG: opcode is an internal detail that shouldn't be exposed
+ * @note BUG: ver is unused
+ */
+typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg);
+
+/**
+ * @typedef rados_watchcb2_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param arg opaque user-defined value provided to rados_watch2()
+ * @param notify_id an id for this notify event
+ * @param handle the watcher handle we are notifying
+ * @param notifier_id the unique client id for the notifier
+ * @param data payload from the notifier
+ * @param data_len length of payload buffer
+ */
+typedef void (*rados_watchcb2_t)(void *arg,
+ uint64_t notify_id,
+ uint64_t handle,
+ uint64_t notifier_id,
+ void *data,
+ size_t data_len);
+
+/**
+ * @typedef rados_watcherrcb_t
+ *
+ * Callback activated when we encounter an error with the watch session.
+ * This can happen when the location of the objects moves within the
+ * cluster and we fail to register our watch with the new object location,
+ * or when our connection with the object OSD is otherwise interrupted and
+ * we may have missed notify events.
+ *
+ * @param pre opaque user-defined value provided to rados_watch2()
+ * @param cookie presumably the internal id of the affected watch, as
+ * stored by rados_watch2() -- TODO confirm against the implementation
+ * @param err error code
+ */
+ typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @note BUG: librados should provide a way for watchers to notice connection resets
+ * @note BUG: the ver parameter does not work, and -ERANGE will never be returned
+ * (See URL tracker.ceph.com/issues/2592)
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param ver expected version of the object
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param arg application defined data to pass when watchcb is called
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the version of the object is greater than ver
+ */
+CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver,
+ uint64_t *cookie,
+ rados_watchcb_t watchcb, void *arg)
+ __attribute__((deprecated));
+
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to the
+ * primary OSD for a watched object, the watch will be removed after
+ * a timeout configured with osd_client_watch_timeout.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the connection will keep after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after the number of seconds configured in the timeout parameter.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the connection will keep after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Check on the status of a watch
+ *
+ * Return the number of milliseconds since the watch was last confirmed.
+ * Or, if there has been an error, return that.
+ *
+ * If there is an error, the watch is no longer valid, and should be
+ * destroyed with rados_unwatch2(). If the user is still interested
+ * in the object, a new watch should be created with rados_watch2().
+ *
+ * @param io the pool the object is in
+ * @param cookie the watch handle
+ * @returns ms since last confirmed on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the watched object (ignored)
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie)
+ __attribute__((deprecated));
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Asynchronously unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie,
+ rados_completion_t completion);
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * @note BUG: the timeout is not changeable via the C API
+ * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param ver obsolete - just pass zero
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
+ const char *buf, int buf_len)
+ __attribute__((deprecated));
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * The reply buffer is optional. If specified, the client will get
+ * back an encoded buffer that includes the ids of the clients that
+ * acknowledged the notify as well as their notify ack payloads (if
+ * any). Clients that timed out are not included. Even clients that
+ * do not include a notify ack payload are included in the list but
+ * have a 0-length payload associated with them. The format:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ * Note: There may be multiple instances of the same gid if there are
+ * multiple watchers registered via the same client.
+ *
+ * Note: The buffer must be released with rados_buffer_free() when the
+ * user is done with it.
+ *
+ * Note: Since the result buffer includes clients that time out, it
+ * will be set even when rados_notify() returns an error code (like
+ * -ETIMEDOUT).
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param completion what to do when operation has been attempted
+ * (rados_aio_notify() only; rados_notify2() takes no completion)
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @param timeout_ms notify timeout (in ms)
+ * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free)
+ * @param reply_buffer_len pointer to size of reply buffer
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms, char **reply_buffer,
+ size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms,
+ char **reply_buffer, size_t *reply_buffer_len);
+
+/**
+ * Acknowledge receipt of a notify
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param notify_id the notify_id we got on the watchcb2_t callback
+ * @param cookie the watcher handle
+ * @param buf payload to return to notifier (optional)
+ * @param buf_len payload length
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o,
+ uint64_t notify_id, uint64_t cookie,
+ const char *buf, int buf_len);
+
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call will block until all pending watch/notify callbacks have
+ * been executed and the queue is empty. It should usually be called
+ * after shutting down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ */
+CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call is non-blocking; the completion will be called once
+ * all pending watch/notify callbacks have been executed and
+ * the queue is empty. It should usually be called after shutting
+ * down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ * @param completion what to do when operation has been attempted
+ */
+CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion);
+
+/** @} Watch/Notify */
+
+/**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o);
+
+/**
+ * Unpin an object in the cache tier
+ *
+ * After an object is unpinned in the cache tier, it can be flushed out
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o);
+
+/**
+ * @name Hints
+ *
+ * @{
+ */
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/** @} Hints */
+
+/**
+ * @name Object Operations
+ *
+ * A single rados operation can do multiple operations on one object
+ * atomically. The whole operation will succeed or fail, and no partial
+ * results will be visible.
+ *
+ * Operations may be either reads, which can return data, or writes,
+ * which cannot. The effects of writes are applied and visible all at
+ * once, so an operation that sets an xattr and then checks its value
+ * will not see the updated value.
+ *
+ * @{
+ */
+
+/**
+ * Create a new rados_write_op_t write operation. This will store all actions
+ * to be performed atomically. You must call rados_release_write_op when you are
+ * finished with it.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_write_op_t rados_create_write_op(void);
+
+/**
+ * Free a rados_write_op_t, must be called when you're done with it.
+ * @param write_op operation to deallocate, created with rados_create_write_op
+ */
+CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op);
+
+/**
+ * Set flags for the last operation added to this write_op.
+ * At least one op must have been added to the write_op.
+ * @param write_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op,
+ int flags);
+
+/**
+ * Ensure that the object exists before writing
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before writing. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_write_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_write_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param write_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param write_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that given xattr satisfies comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare (key_len bytes long)
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Set an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr
+ * @param value buffer to set xattr to
+ * @param value_len length of buffer to set xattr to
+ */
+CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op,
+ const char *name,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Remove an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to remove
+ */
+CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op,
+ const char *name);
+
+/**
+ * Create the object
+ * @param write_op operation to add this action to
+ * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE or
+ * LIBRADOS_CREATE_IDEMPOTENT; an exclusive create
+ * will error if the object already exists.
+ * @param category category string (DEPRECATED, HAS NO EFFECT)
+ */
+CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op,
+ int exclusive,
+ const char* category);
+
+/**
+ * Write to offset
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len,
+ uint64_t offset);
+
+/**
+ * Write whole object, atomically replacing it.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+
+/**
+ * Write the same buffer multiple times
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param data_len length of buffer
+ * @param write_len total number of bytes to write, as a multiple of @c data_len
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op,
+ const char *buffer,
+ size_t data_len,
+ size_t write_len,
+ uint64_t offset);
+
+/**
+ * Append to end of object.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+/**
+ * Remove object
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op);
+
+/**
+ * Truncate an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to truncate to
+ */
+CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op,
+ uint64_t offset);
+
+/**
+ * Zero part of an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to zero
+ * @param len length to zero
+ */
+CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op,
+ uint64_t offset,
+ uint64_t len);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * @param write_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ int *prval);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *lens,
+ size_t num);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param key_lens array of lengths corresponding to each key
+ * @param val_lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *key_lens,
+ const size_t *val_lens,
+ size_t num);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to remove
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op,
+ char const* const* keys,
+ size_t keys_len);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of char arrays representing keys to remove
+ * @param key_lens array of size_t values representing length of each key
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op,
+ char const* const* keys,
+ const size_t* key_lens,
+ size_t keys_len);
+
+/**
+ * Remove all key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to (as a struct timespec), NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+
+CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ struct timespec *mtime,
+ int flags);
+
+/**
+ * Perform a write operation asynchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+
+/**
+ * Create a new rados_read_op_t read operation. This will store all
+ * actions to be performed atomically. You must call
+ * rados_release_read_op when you are finished with it (after it
+ * completes, or you decide not to send it in the first place).
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_read_op_t rados_create_read_op(void);
+
+/**
+ * Free a rados_read_op_t, must be called when you're done with it.
+ * @param read_op operation to deallocate, created with rados_create_read_op
+ */
+CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op);
+
+/**
+ * Set flags for the last operation added to this read_op.
+ * At least one op must have been added to the read_op.
+ * @param read_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags);
+
+/**
+ * Ensure that the object exists before reading
+ * @param read_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before reading. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_read_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_read_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param read_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that an xattr satisfies a comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param read_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @param read_op operation to add this action to
+ * @param iter where to store the iterator
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op,
+ rados_xattrs_iter_t *iter,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare (key_len bytes long)
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Get object size and mtime
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
+ uint64_t *psize,
+ time_t *pmtime,
+ int *prval);
+
+/**
+ * Read bytes from offset into buffer.
+ *
+ * bytes_read will be filled with the number of bytes read if successful.
+ * A short read can only occur if the read reaches the end of the
+ * object.
+ *
+ * @param read_op operation to add this action to
+ * @param offset offset to read from
+ * @param len length of buffer
+ * @param buffer where to put the data
+ * @param bytes_read where to store the number of bytes read by this action
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op,
+ uint64_t offset,
+ size_t len,
+ char *buffer,
+ size_t *bytes_read,
+ int *prval);
+
+/**
+ * Compute checksum from object data
+ *
+ * @param read_op operation to add this action to
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param offset the offset to start checksumming in the object
+ * @param len the number of bytes to checksum
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result for this action
+ * @param checksum_len the number of bytes available for the result
+ * @param prval where to store the return value for this action
+ */
+CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op,
+ rados_checksum_type_t type,
+ const char *init_value,
+ size_t init_value_len,
+ uint64_t offset, size_t len,
+ size_t chunk_size, char *pchecksum,
+ size_t checksum_len, int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * The output buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf where to put librados-allocated output buffer
+ * @param out_len where to store the length of out_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char **out_buf,
+ size_t *out_len,
+ int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * If the output buffer is too small, prval will
+ * be set to -ERANGE and used_len will be 0.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf user-provided buffer to read into
+ * @param out_len length of out_buf in bytes
+ * @param used_len where to store the number of bytes read into out_buf
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char *out_buf,
+ size_t out_len,
+ size_t *used_len,
+ int *prval);
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to null-terminated keys to get
+ * @param keys_len the number of strings in keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t keys_len,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to keys to get
+ * @param num_keys the number of strings in keys
+ * @param key_lens array of size_t's describing each key len (in bytes)
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t num_keys,
+ const size_t* key_lens,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Perform a read operation synchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ const char *oid,
+ int flags);
+
+/**
+ * Perform a read operation asynchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ int flags);
+
+/** @} Object Operations */
+
+/**
+ * Take an exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid,
+ const char * name, const char * cookie,
+ const char * desc,
+ struct timeval * duration,
+ uint8_t flags);
+
+/**
+ * Take a shared lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag The tag of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o,
+ const char * name, const char * cookie,
+ const char * tag, const char * desc,
+ struct timeval * duration, uint8_t flags);
+
+/**
+ * Release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie);
+
+/**
+ * Asynchronously release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie,
+ rados_completion_t completion);
+
+/**
+ * List clients that have locked the named object lock and information about
+ * the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the object lock
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o,
+ const char *name, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len);
+
+/**
+ * Releases a shared or exclusive lock on an object, which was taken by the
+ * specified client.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param client the client currently holding the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ * @returns -EINVAL if the client cannot be parsed
+ */
+CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o,
+ const char *name, const char *client,
+ const char *cookie);
+
+/**
+ * Blacklists the specified client from the OSDs
+ *
+ * @param cluster cluster handle
+ * @param client_address client address
+ * @param expire_seconds number of seconds to blacklist (0 for default)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_blacklist_add(rados_t cluster,
+ char *client_address,
+ uint32_t expire_seconds);
+
+/**
+ * Gets addresses of the RADOS session, suitable for blacklisting.
+ *
+ * @param cluster cluster handle
+ * @param addrs the output string.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs);
+
+CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io);
+
+CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io);
+
+/**
+ * Enable an application on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param force 0 if only single application per pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io,
+ const char *app_name, int force);
+
+/**
+ * List all enabled applications
+ *
+ * If the provided buffer is too short, the required length is filled in and
+ * -ERANGE is returned. Otherwise, the buffer is filled with the application
+ * names, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param values buffer in which to store application names
+ * @param values_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short
+ */
+CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values,
+ size_t *values_len);
+
+/**
+ * Get application metadata value from pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value result buffer
+ * @param value_len maximum len of value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io,
+ const char *app_name,
+ const char *key, char *value,
+ size_t *value_len);
+
+/**
+ * Set application metadata on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value metadata value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io,
+ const char *app_name,
+ const char *key,
+ const char *value);
+
+/**
+ * Remove application metadata from a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io,
+ const char *app_name,
+ const char *key);
+
+/**
+ * List all metadata key/value pairs associated with an application.
+ *
+ * This iterates over all metadata; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are filled
+ * in and -ERANGE is returned. Otherwise, the buffers are filled with
+ * the keys and values of the metadata, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param keys buffer in which to store key names
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store values
+ * @param vals_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io,
+ const char *app_name,
+ char *keys, size_t *key_len,
+ char *values,
+ size_t *vals_len);
+
+/**
+ * @name Mon/OSD/PG Commands
+ *
+ * These interfaces send commands relating to the monitor, OSD, or PGs.
+ *
+ * @{
+ */
+
+/**
+ * Send monitor command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send ceph-mgr command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send monitor command to a specific monitor.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name target monitor's name
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * free a rados-allocated buffer
+ *
+ * Release memory allocated by librados calls like rados_mon_command().
+ *
+ * @param buf buffer pointer
+ */
+CEPH_RADOS_API void rados_buffer_free(char *buf);
+
+CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback_t)(void *arg,
+ const char *line,
+ const char *who,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback2_t)(void *arg,
+ const char *line,
+ const char *channel,
+ const char *who,
+ const char *name,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level,
+ rados_log_callback_t cb, void *arg);
+CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level,
+ rados_log_callback2_t cb, void *arg);
+
+
+/**
+ * register daemon instance for a service
+ *
+ * Register us as a daemon providing a particular service. We identify
+ * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname').
+ * The metadata is a map of keys and values with arbitrary static metadata
+ * for this instance. The encoding is a series of NUL-terminated strings,
+ * alternating key names and values, terminating with an empty key name.
+ * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}.
+ *
+ * For the lifetime of the librados instance, regular beacons will be sent
+ * to the cluster to maintain our registration in the service map.
+ *
+ * @param cluster handle
+ * @param service service name
+ * @param daemon daemon instance name
+ * @param metadata_dict static daemon metadata dict
+ */
+CEPH_RADOS_API int rados_service_register(
+ rados_t cluster,
+ const char *service,
+ const char *daemon,
+ const char *metadata_dict);
+
+/**
+ * update daemon status
+ *
+ * Update our mutable status information in the service map.
+ *
+ * The status dict is encoded the same way the daemon metadata is encoded
+ * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is
+ * {foo=bar,this=that}.
+ *
+ * @param cluster rados cluster handle
+ * @param status_dict status dict
+ */
+CEPH_RADOS_API int rados_service_update_status(
+ rados_t cluster,
+ const char *status_dict);
+
+/** @} Mon/OSD/PG commands */
+
+/*
+ * These methods are no longer supported and return -ENOTSUP where possible.
+ */
+CEPH_RADOS_API int rados_objects_list_open(
+ rados_ioctx_t io,
+ rados_list_ctx_t *ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_seek(
+ rados_list_ctx_t ctx,
+ uint32_t pos) __attribute__((deprecated));
+CEPH_RADOS_API int rados_objects_list_next(
+ rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key) __attribute__((deprecated));
+CEPH_RADOS_API void rados_objects_list_close(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
new file mode 100644
index 00000000..0c047c43
--- /dev/null
+++ b/src/include/rados/librados.hpp
@@ -0,0 +1,1468 @@
+#ifndef __LIBRADOS_HPP
+#define __LIBRADOS_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <utility>
+#include "buffer.h"
+
+#include "librados.h"
+#include "librados_fwd.hpp"
+#include "rados_types.hpp"
+
+namespace libradosstriper
+{
+ class RadosStriper;
+}
+
+namespace librados {
+
+using ceph::bufferlist;
+
+struct AioCompletionImpl;
+struct IoCtxImpl;
+struct ListObjectImpl;
+class NObjectIteratorImpl;
+struct ObjListCtx;
+class ObjectOperationImpl;
+struct PlacementGroupImpl;
+struct PoolAsyncCompletionImpl;
+
+typedef struct rados_cluster_stat_t cluster_stat_t;
+typedef struct rados_pool_stat_t pool_stat_t;
+
+typedef void *list_ctx_t;
+typedef uint64_t auid_t;
+typedef void *config_t;
+
+typedef struct {
+ std::string client;
+ std::string cookie;
+ std::string address;
+} locker_t;
+
+typedef std::map<std::string, pool_stat_t> stats_map;
+
+typedef void *completion_t;
+typedef void (*callback_t)(completion_t cb, void *arg);
+
+inline namespace v14_2_0 {
+
+ class IoCtx;
+ class RadosClient;
+
+ class CEPH_RADOS_API ListObject // value type that NObjectIterator dereferences to
+ {
+ public:
+ const std::string& get_nspace() const; ///< namespace string of this entry
+ const std::string& get_oid() const; ///< oid string of this entry
+ const std::string& get_locator() const; ///< locator string of this entry
+
+ ListObject();
+ ~ListObject();
+ ListObject( const ListObject&);
+ ListObject& operator=(const ListObject& rhs);
+ private:
+ ListObject(ListObjectImpl *impl); // private: only this class and its friends can build one from an impl
+
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& out, const ListObject& lop);
+
+ ListObjectImpl *impl; // opaque state; ListObjectImpl is only forward-declared in this header
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop);
+
+ class CEPH_RADOS_API NObjectIterator;
+
+ class CEPH_RADOS_API ObjectCursor // C++ wrapper around a rados_object_list_cursor handle
+ {
+ public:
+ ObjectCursor();
+ ObjectCursor(const ObjectCursor &rhs);
+ explicit ObjectCursor(rados_object_list_cursor c); // explicit: no implicit conversion from the C handle
+ ~ObjectCursor();
+ ObjectCursor& operator=(const ObjectCursor& rhs);
+ bool operator<(const ObjectCursor &rhs) const;
+ bool operator==(const ObjectCursor &rhs) const;
+ void set(rados_object_list_cursor c);
+
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ std::string to_str() const; // string form of the cursor
+ bool from_str(const std::string& s); // NOTE(review): presumably returns false when s cannot be parsed - confirm
+
+ protected:
+ rados_object_list_cursor c_cursor; // the wrapped C-API cursor handle
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ class CEPH_RADOS_API NObjectIterator : public std::iterator <std::forward_iterator_tag, ListObject> {
+ public:
+ static const NObjectIterator __EndObjectIterator; // NOTE(review): presumably the end-of-listing sentinel - confirm
+ NObjectIterator(): impl(NULL) {} // a default-constructed iterator holds no impl
+ ~NObjectIterator();
+ NObjectIterator(const NObjectIterator &rhs);
+ NObjectIterator& operator=(const NObjectIterator& rhs);
+
+ bool operator==(const NObjectIterator& rhs) const;
+ bool operator!=(const NObjectIterator& rhs) const;
+ const ListObject& operator*() const;
+ const ListObject* operator->() const;
+ NObjectIterator &operator++(); ///< Preincrement; errors are thrown as exceptions
+ NObjectIterator operator++(int); ///< Postincrement; errors are thrown as exceptions
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+
+ /// get current hash position of the iterator, rounded to the current pg
+ uint32_t get_pg_hash_position() const;
+
+ /// move the iterator to a given hash position. this may (will!) be rounded
+ /// to the nearest pg. errors are thrown as exceptions
+ uint32_t seek(uint32_t pos);
+
+ /// move the iterator to a given cursor position. errors are thrown as exceptions
+ uint32_t seek(const ObjectCursor& cursor);
+
+ /// get current cursor position
+ ObjectCursor get_cursor();
+
+ /**
+ * Configure PGLS filter to be applied OSD-side (requires caller
+ * to know/understand the format expected by the OSD)
+ */
+ void set_filter(const bufferlist &bl);
+
+ private:
+ NObjectIterator(ObjListCtx *ctx_); // private: only this class and its friends can construct from an ObjListCtx
+ void get_next();
+ NObjectIteratorImpl *impl; // opaque state; NObjectIteratorImpl is only forward-declared in this header
+ };
+
+ class CEPH_RADOS_API ObjectItem // plain data holder: three public strings, no member functions
+ {
+ public:
+ std::string oid;
+ std::string nspace;
+ std::string locator;
+ };
+
+ /// DEPRECATED; do not use
+ class CEPH_RADOS_API WatchCtx {
+ public:
+ virtual ~WatchCtx();
+ virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0; // pure virtual: subclasses must implement it
+ };
+
+ class CEPH_RADOS_API WatchCtx2 { // watch callback interface; both handlers below are pure virtual
+ public:
+ virtual ~WatchCtx2();
+ /**
+ * Callback activated when we receive a notify event.
+ *
+ * @param notify_id unique id for this notify event
+ * @param cookie the watcher we are notifying
+ * @param notifier_id the unique client id of the notifier
+ * @param bl opaque notify payload (from the notifier)
+ */
+ virtual void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) = 0;
+
+ /**
+ * Callback activated when we encounter an error with the watch.
+ *
+ * Errors we may see:
+ * -ENOTCONN : our watch was disconnected
+ * -ETIMEDOUT : our watch is still valid, but we may have missed
+ * a notify event.
+ *
+ * @param cookie the watcher with the problem
+ * @param err error
+ */
+ virtual void handle_error(uint64_t cookie, int err) = 0;
+ };
+
+ struct CEPH_RADOS_API AioCompletion {
+ AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {} // only stores pc_; nothing else is done here
+ int set_complete_callback(void *cb_arg, callback_t cb); // cb has the callback_t shape: void(completion_t, void *arg)
+ int set_safe_callback(void *cb_arg, callback_t cb);
+ int wait_for_complete();
+ int wait_for_safe();
+ int wait_for_complete_and_cb();
+ int wait_for_safe_and_cb();
+ bool is_complete();
+ bool is_safe();
+ bool is_complete_and_cb();
+ bool is_safe_and_cb();
+ int get_return_value();
+ int get_version() __attribute__ ((deprecated)); // deprecated int-returning accessor
+ uint64_t get_version64(); // uint64_t-returning counterpart of get_version()
+ void release();
+ AioCompletionImpl *pc; // public member; AioCompletionImpl is only forward-declared in this header
+ };
+
+ struct CEPH_RADOS_API PoolAsyncCompletion {
+ PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {} // only stores pc_; nothing else is done here
+ int set_callback(void *cb_arg, callback_t cb); // cb has the callback_t shape: void(completion_t, void *arg)
+ int wait();
+ bool is_complete();
+ int get_return_value();
+ void release();
+ PoolAsyncCompletionImpl *pc; // public member; PoolAsyncCompletionImpl is only forward-declared in this header
+ };
+
+ /**
+ * These are per-op flags which may be different among
+ * ops added to an ObjectOperation.
+ */
+ enum ObjectOperationFlags { // each value mirrors the LIBRADOS_OP_FLAG_* constant of the same name
+ OP_EXCL = LIBRADOS_OP_FLAG_EXCL,
+ OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK,
+ OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM,
+ OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL,
+ OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED,
+ OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+ OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE,
+ };
+
+ class CEPH_RADOS_API ObjectOperationCompletion { // callback object taken by the ObjectOperation::exec() overload with a completion argument
+ public:
+ virtual ~ObjectOperationCompletion() {}
+ virtual void handle_completion(int r, bufferlist& outbl) = 0; // NOTE(review): presumably r is the result code and outbl the output data - confirm
+ };
+
+ /**
+ * These flags apply to the ObjectOperation as a whole.
+ *
+ * BALANCE_READS and LOCALIZE_READS should only be used
+ * when reading from data you're certain won't change,
+ * like a snapshot, or where eventual consistency is ok.
+ *
+ * ORDER_READS_WRITES will order reads the same way writes are
+ * ordered (e.g., waiting for degraded objects). In particular, it
+ * preserves the ordering of a write followed by a read.
+ *
+ * IGNORE_CACHE will skip the caching logic on the OSD that normally
+ * handles promotion of objects between tiers. This allows an operation
+ * to operate on (or read) the cached (or uncached) object, even if it is
+ * not coherent.
+ *
+ * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and
+ * process the op directly on the destination pool. This is useful
+ * for CACHE_FLUSH and CACHE_EVICT operations.
+ */
+ enum ObjectOperationGlobalFlags {
+ OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG,
+ OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS,
+ OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS,
+ OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES,
+ OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE,
+ OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS, // ObjectReadOperation::cache_try_flush() is documented as requiring this flag
+ OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY,
+ // send requests to cluster despite the cluster or pool being
+ // marked full; ops will either succeed (e.g., delete) or return
+ // EDQUOT or ENOSPC
+ OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY,
+ // mainly for delete
+ OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE,
+ OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT,
+ OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP,
+ };
+
+ /*
+ * Alloc hint flags for the alloc_hint operation (see set_alloc_hint2()).
+ */
+ enum AllocHintFlags { // every value is a distinct power of two
+ ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ ALLOC_HINT_FLAG_LONGLIVED = 128,
+ ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+ };
+
+ /*
+ * ObjectOperation : compound object operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectOperation
+ {
+ public:
+ ObjectOperation();
+ virtual ~ObjectOperation();
+
+ size_t size();
+ void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated));
+ // flags takes ObjectOperationFlags values, passed as a plain int
+ void set_op_flags2(int flags);
+
+ void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval);
+ void cmpxattr(const char *name, uint8_t op, const bufferlist& val);
+ void cmpxattr(const char *name, uint8_t op, uint64_t v);
+ void exec(const char *cls, const char *method, bufferlist& inbl);
+ void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval);
+ void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion);
+ /**
+ * Guard operation with a check that object version == ver
+ *
+ * @param ver [in] version to check
+ */
+ void assert_version(uint64_t ver);
+
+ /**
+ * Guard operation with a check that the object already exists
+ */
+ void assert_exists();
+
+ /**
+ * assert comparisons against the omap values of specified keys
+ *
+ * @param assertions [in] comparison assertions
+ * @param prval [out] place error code in prval upon completion
+ *
+ * assertions has the form of mappings from keys to (comparison rval, assertion)
+ * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ].
+ *
+ * That is, to assert that the value at key 'foo' is greater than 'bar':
+ *
+ * ObjectReadOperation op;
+ * int r;
+ * map<string, pair<bufferlist, int> > assertions;
+ * bufferlist bar(string('bar'));
+ * assertions['foo'] = make_pair(bar, CEPH_OSD_CMPXATTR_OP_GT);
+ * op.omap_cmp(assertions, &r);
+ */
+ void omap_cmp(
+ const std::map<std::string, std::pair<bufferlist, int> > &assertions,
+ int *prval);
+
+ protected:
+ ObjectOperationImpl *impl; // opaque state; ObjectOperationImpl is only forward-declared in this header
+ ObjectOperation(const ObjectOperation& rhs); // copy ctor and copy assignment are protected, not public
+ ObjectOperation& operator=(const ObjectOperation& rhs);
+ friend class IoCtx;
+ friend class Rados;
+ };
+
+ /*
+ * ObjectWriteOperation : compound object write operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation
+ {
+ protected:
+ time_t *unused; // set to NULL by the constructor; not referenced by any other declaration in this class
+ public:
+ ObjectWriteOperation() : unused(NULL) {}
+ ~ObjectWriteOperation() override {}
+
+ void mtime(time_t *pt);
+ void mtime2(struct timespec *pts);
+
+ void create(bool exclusive);
+ void create(bool exclusive,
+ const std::string& category); ///< NOTE: category is unused
+
+ void write(uint64_t off, const bufferlist& bl);
+ void write_full(const bufferlist& bl);
+ void writesame(uint64_t off, uint64_t write_len,
+ const bufferlist& bl);
+ void append(const bufferlist& bl);
+ void remove();
+ void truncate(uint64_t off);
+ void zero(uint64_t off, uint64_t len);
+ void rmxattr(const char *name);
+ void setxattr(const char *name, const bufferlist& bl);
+ void setxattr(const char *name, const bufferlist&& bl); // NOTE(review): a const rvalue reference cannot be moved from - confirm this overload is intentional
+ void tmap_update(const bufferlist& cmdbl);
+ void tmap_put(const bufferlist& bl);
+ void selfmanaged_snap_rollback(uint64_t snapid);
+
+ /**
+ * Rollback an object to the specified snapshot id
+ *
+ * Used with pool snapshots
+ *
+ * @param snapid [in] snapshot id specified
+ */
+ void snap_rollback(uint64_t snapid);
+
+ /**
+ * set keys and values according to map
+ *
+ * @param map [in] keys and values to set
+ */
+ void omap_set(const std::map<std::string, bufferlist> &map);
+
+ /**
+ * set header
+ *
+ * @param bl [in] header to set
+ */
+ void omap_set_header(const bufferlist &bl);
+
+ /**
+ * Clears omap contents
+ */
+ void omap_clear();
+
+ /**
+ * Clears keys in to_rm
+ *
+ * @param to_rm [in] keys to remove
+ */
+ void omap_rm_keys(const std::set<std::string> &to_rm);
+
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress).
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ * @param src_fadvise_flags the fadvise flags for source object
+ */
+ void copy_from(const std::string& src, const IoCtx& src_ioctx,
+ uint64_t src_version, uint32_t src_fadvise_flags);
+
+ /**
+ * undirty an object
+ *
+ * Clear an object's dirty flag
+ */
+ void undirty();
+
+ /**
+ * Set allocation hint for an object
+ *
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hint flags, see AllocHintFlags (set_alloc_hint2 only)
+ */
+ void set_alloc_hint(uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ void set_alloc_hint2(uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * Both calls are declared void, so no status is returned by them directly
+ */
+ void cache_pin();
+ void cache_unpin();
+
+ /**
+ * Extensible tier
+ *
+ * Set redirect target
+ */
+ void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx,
+ uint64_t tgt_version, int flag = 0);
+ void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx,
+ std::string tgt_oid, uint64_t tgt_offset, int flag = 0);
+ void tier_promote();
+ void unset_manifest();
+
+
+ friend class IoCtx;
+ };
+
+ /*
+ * ObjectReadOperation : compound object operation that returns values
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation
+ {
+ public:
+ ObjectReadOperation() {}
+ ~ObjectReadOperation() override {}
+
+ void stat(uint64_t *psize, time_t *pmtime, int *prval);
+ void stat2(uint64_t *psize, struct timespec *pts, int *prval);
+ void getxattr(const char *name, bufferlist *pbl, int *prval);
+ void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval);
+ void read(size_t off, uint64_t len, bufferlist *pbl, int *prval); // NOTE(review): off is size_t here while checksum()/sparse_read() take uint64_t offsets - confirm intended
+ void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl,
+ uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl,
+ int *prval);
+
+ /**
+ * see aio_sparse_read()
+ */
+ void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m,
+ bufferlist *data_bl, int *prval);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore, ///< [out] pointer to bool indicating whether there are more keys
+ int *prval);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore,
+ int *prval);
+
+
+ /**
+ * omap_get_keys: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_keys2: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys2(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore,
+ int *prval);
+
+ /**
+ * omap_get_header: get header from object omap
+ *
+ * @param header [out] place header here upon completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_header(bufferlist *header, int *prval);
+
+ /**
+ * get key/value pairs for specified keys
+ *
+ * @param keys [in] keys to get
+ * @param map [out] place key/value pairs found here on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals_by_keys(const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *map,
+ int *prval);
+
+ /**
+ * list_watchers: Get list watchers of object
+ *
+ * @param out_watchers [out] place returned values in out_watchers on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval);
+
+ /**
+ * list snapshot clones associated with a logical object
+ *
+ * This will include a record for each version of the object,
+ * including the "HEAD" (which will have a cloneid of SNAP_HEAD).
+ * Each clone includes a vector of snap ids for which it is
+ * defined to exist.
+ *
+ * NOTE: this operation must be submitted from an IoCtx with a
+ * read snapid of SNAP_DIR for reliable results.
+ *
+ * @param out_snaps [out] pointer to resulting snap_set_t
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_snaps(snap_set_t *out_snaps, int *prval);
+
+ /**
+ * query dirty state of an object
+ *
+ * @param isdirty [out] pointer to resulting bool
+ * @param prval [out] place error code in prval upon completion
+ */
+ void is_dirty(bool *isdirty, int *prval);
+
+ /**
+ * flush a cache tier object to backing tier; will block racing
+ * updates.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_flush();
+
+ /**
+ * Flush a cache tier object to backing tier; will EAGAIN if we race
+ * with an update. Must be used with the SKIPRWLOCKS flag.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_try_flush();
+
+ /**
+ * evict a clean cache tier object
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promote on the OSD (that is then evicted).
+ */
+ void cache_evict();
+ };
+
+ /* IoCtx : This is a context in which we can perform I/O.
+ * It is associated with a pool.
+ *
+ * Typical use (error checking omitted):
+ *
+ * IoCtx p;
+ * rados.ioctx_create("my_pool", p);
+ * p->stat(&stats);
+ * ... etc ...
+ *
+ * NOTE: be sure to call watch_flush() prior to destroying any IoCtx
+ * that is used for watch events to ensure that racing callbacks
+ * have completed.
+ */
+ class CEPH_RADOS_API IoCtx
+ {
+ public:
+ IoCtx();
+ static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool);
+ IoCtx(const IoCtx& rhs);
+ IoCtx& operator=(const IoCtx& rhs);
+ IoCtx(IoCtx&& rhs) noexcept;
+ IoCtx& operator=(IoCtx&& rhs) noexcept;
+
+ ~IoCtx();
+
+ bool is_valid() const;
+
+ // Close our pool handle
+ void close();
+
+ // deep copy
+ void dup(const IoCtx& rhs);
+
+ // set pool auid
+ int set_auid(uint64_t auid_)
+ __attribute__ ((deprecated));
+
+ // set pool auid
+ int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+
+ // get pool auid
+ int get_auid(uint64_t *auid_)
+ __attribute__ ((deprecated));
+
+ uint64_t get_instance_id() const;
+
+ std::string get_pool_name();
+
+ bool pool_requires_alignment();
+ int pool_requires_alignment2(bool * req);
+ uint64_t pool_required_alignment();
+ int pool_required_alignment2(uint64_t * alignment);
+
+ // create an object
+ int create(const std::string& oid, bool exclusive);
+ int create(const std::string& oid, bool exclusive,
+ const std::string& category); ///< category is unused
+
+ /**
+ * write bytes to an object at a specified offset
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ /**
+ * append bytes to an object
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int append(const std::string& oid, bufferlist& bl, size_t len);
+ /**
+ * replace object contents with provided data
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write_full(const std::string& oid, bufferlist& bl);
+ int writesame(const std::string& oid, bufferlist& bl,
+ size_t write_len, uint64_t off);
+ int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ int checksum(const std::string& o, rados_checksum_type_t type,
+ const bufferlist &init_value_bl, size_t len, uint64_t off,
+ size_t chunk_size, bufferlist *pbl);
+ int remove(const std::string& oid);
+ int remove(const std::string& oid, int flags);
+ int trunc(const std::string& oid, uint64_t size);
+ int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m);
+ int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl);
+ int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off);
+ int getxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset);
+ int setxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int rmxattr(const std::string& oid, const char *name);
+ int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts);
+ int exec(const std::string& oid, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist& outbl);
+ /**
+ * modify object tmap based on encoded update sequence
+ *
+ * NOTE: this call steals the contents of @param cmdbl
+ */
+ int tmap_update(const std::string& oid, bufferlist& cmdbl);
+
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_keys(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys);
+ int omap_get_keys2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore);
+ int omap_get_header(const std::string& oid,
+ bufferlist *bl);
+ int omap_get_vals_by_keys(const std::string& oid,
+ const std::set<std::string>& keys,
+ std::map<std::string, bufferlist> *vals);
+ int omap_set(const std::string& oid,
+ const std::map<std::string, bufferlist>& map);
+ int omap_set_header(const std::string& oid,
+ const bufferlist& bl);
+ int omap_clear(const std::string& oid);
+ int omap_rm_keys(const std::string& oid,
+ const std::set<std::string>& keys);
+
+ void snap_set_read(snap_t seq);
+ int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps);
+
+ // Create a snapshot with a given name
+ int snap_create(const char *snapname);
+
+ // Look up a snapshot by name.
+ // Returns 0 on success; error code otherwise
+ int snap_lookup(const char *snapname, snap_t *snap);
+
+ // Gets a timestamp for a snap
+ int snap_get_stamp(snap_t snapid, time_t *t);
+
+ // Gets the name of a snap
+ int snap_get_name(snap_t snapid, std::string *s);
+
+ // Remove a snapshot from this pool
+ int snap_remove(const char *snapname);
+
+ int snap_list(std::vector<snap_t> *snaps);
+
+ int snap_rollback(const std::string& oid, const char *snapname);
+
+ // Deprecated name kept for backward compatibility - same as snap_rollback()
+ int rollback(const std::string& oid, const char *snapname)
+ __attribute__ ((deprecated));
+
+ int selfmanaged_snap_create(uint64_t *snapid);
+ void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c);
+
+ int selfmanaged_snap_remove(uint64_t snapid);
+ void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c);
+
+ int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid);
+
+ // Advisory locking on rados objects.
+ int lock_exclusive(const std::string &oid, const std::string &name,
+ const std::string &cookie,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int lock_shared(const std::string &oid, const std::string &name,
+ const std::string &cookie, const std::string &tag,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie);
+
+ int break_lock(const std::string &oid, const std::string &name,
+ const std::string &client, const std::string &cookie);
+
+ int list_lockers(const std::string &oid, const std::string &name,
+ int *exclusive,
+ std::string *tag,
+ std::list<librados::locker_t> *lockers);
+
+
+ /// Start enumerating objects for a pool. Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from a hash position.
+ /// Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(uint32_t start_hash_position,
+ const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from cursor. Errors are
+ /// thrown as exceptions.
+ NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
+ const bufferlist &filter=bufferlist());
+ /// Iterator indicating the end of a pool
+ const NObjectIterator& nobjects_end() const;
+
+ /// Get cursor for pool beginning
+ ObjectCursor object_list_begin();
+
+ /// Get cursor for pool end
+ ObjectCursor object_list_end();
+
+ /// Check whether a cursor is at the end of a pool
+ bool object_list_is_end(const ObjectCursor &oc);
+
+ /// List some objects between two cursors
+ int object_list(const ObjectCursor &start, const ObjectCursor &finish,
+ const size_t result_count,
+ const bufferlist &filter,
+ std::vector<ObjectItem> *result,
+ ObjectCursor *next);
+
+ /// Generate cursors that include the N out of Mth slice of the pool
+ void object_list_slice(
+ const ObjectCursor start,
+ const ObjectCursor finish,
+ const size_t n,
+ const size_t m,
+ ObjectCursor *split_start,
+ ObjectCursor *split_finish);
+
+ /**
+ * List available hit set objects
+ *
+ * @param hash [in] hash position to query
+ * @param c [in] completion
+ * @param pls [out] list of available intervals
+ */
+ int hit_set_list(uint32_t hash, AioCompletion *c,
+ std::list< std::pair<time_t, time_t> > *pls);
+
+ /**
+ * Retrieve hit set for a given hash, and time
+ *
+ * @param hash [in] hash position
+ * @param c [in] completion
+ * @param stamp [in] time interval that falls within the hit set's interval
+ * @param pbl [out] buffer to store the result in
+ */
+ int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp,
+ bufferlist *pbl);
+
+ uint64_t get_last_version();
+
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off);
+ /**
+ * Asynchronously read from an object at a particular snapshot
+ *
+ * This is the same as normal aio_read, except that it chooses
+ * the snapshot to read from from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param pbl where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid);
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off);
+ /**
+ * Asynchronously read existing extents from an object at a
+ * particular snapshot
+ *
+ * This is the same as normal aio_sparse_read, except that it chooses
+ * the snapshot to read from from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * m will be filled in with a map of extents in the object,
+ * mapping offsets to lengths (in bytes) within the range
+ * requested. The data for all of the extents are stored
+ * back-to-back in offset order in data_bl.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param m where to store the map of extents
+ * @param data_bl where to store the data
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off, uint64_t snapid);
+ /**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param off object byte offset at which to start the comparison
+ * @param cmp_bl buffer containing bytes to be compared with object contents
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+ int aio_cmpext(const std::string& oid,
+ librados::AioCompletion *c,
+ uint64_t off,
+ bufferlist& cmp_bl);
+ int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len, uint64_t off);
+ int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len);
+ int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl);
+ int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t write_len, uint64_t off);
+
+ /**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param oid the name of the object
+ * @param c what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than SNAP_HEAD
+ */
+ int aio_remove(const std::string& oid, AioCompletion *c);
+ int aio_remove(const std::string& oid, AioCompletion *c, int flags);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * aio_flush().
+ *
+ * @param c what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush_async(AioCompletion *c);
+ int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset);
+ int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name);
+ int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * Cancel aio operation
+ *
+ * @param c completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_cancel(AioCompletion *c);
+
+ int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist *outbl);
+
+ /*
+ * asynchronous version of unlock
+ */
+ int aio_unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie, AioCompletion *c);
+
+ // compound object operations
+ int operate(const std::string& oid, ObjectWriteOperation *op);
+ int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags);
+ /**
+ * Schedule an async write operation with explicit snapshot parameters
+ *
+ * This is the same as the first aio_operate(), except that it
+ * gets the snapshot context from its arguments instead of the
+ * IoCtx internal state.
+ *
+ * @param oid the object to operate on
+ * @param c what to do when the operation is complete and safe
+ * @param op which operations to perform
+ * @param seq latest selfmanaged snapshot sequence number for this object
+ * @param snaps currently existing selfmanaged snapshot ids for this object
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps, int flags,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, bufferlist *pbl);
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, snap_t snapid, int flags,
+ bufferlist *pbl)
+ __attribute__ ((deprecated));
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl, const blkin_trace_info *trace_info);
+
+ // watch/notify
+ int watch2(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int watch3(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int unwatch2(uint64_t handle);
+ int aio_unwatch(uint64_t handle, AioCompletion *c);
+ /**
+ * Send a notify event to watchers
+ *
+ * Upon completion the pbl bufferlist reply payload will be
+ * encoded like so:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ *
+ */
+ int notify2(const std::string& o, ///< object
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+ int aio_notify(const std::string& o, ///< object
+ AioCompletion *c, ///< completion when notify completes
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+
+ int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers);
+ int list_snaps(const std::string& o, snap_set_t *out_snaps);
+ void set_notify_timeout(uint32_t timeout);
+
+ /// acknowledge a notify we received.
+ void notify_ack(const std::string& o, ///< watched object
+ uint64_t notify_id, ///< notify id
+ uint64_t cookie, ///< our watch handle
+ bufferlist& bl); ///< optional reply payload
+
+ /***
+ * check on watch validity
+ *
+ * Check if a watch is valid. If so, return the number of
+ * milliseconds since we last confirmed its liveness. If there is
+ * a known error, return it.
+ *
+ * If there is an error, the watch is no longer valid, and should
+ * be destroyed with unwatch2(). If the user is still interested in
+ * the object, a new watch should be created with watch2().
+ *
+ * @param cookie watch handle
+ * @returns ms since last confirmed valid, or error
+ */
+ int watch_check(uint64_t cookie);
+
+ // old, deprecated versions
+ int watch(const std::string& o, uint64_t ver, uint64_t *cookie,
+ librados::WatchCtx *ctx) __attribute__ ((deprecated));
+ int notify(const std::string& o, uint64_t ver, bufferlist& bl)
+ __attribute__ ((deprecated));
+ int unwatch(const std::string& o, uint64_t cookie)
+ __attribute__ ((deprecated));
+
+ /**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it
+ * was submitted with a OP_FAILOK flag set) and is not guaranteed
+ * to do anything on the backend.
+ *
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+ int set_alloc_hint(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ int set_alloc_hint2(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ // assert version for next sync operations
+ void set_assert_version(uint64_t ver);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * @param o the name of the object
+ * @returns 0 on success, negative error code on failure
+ */
+ int cache_pin(const std::string& o);
+ int cache_unpin(const std::string& o);
+
+ std::string get_pool_name() const;
+
+ void locator_set_key(const std::string& key);
+ void set_namespace(const std::string& nspace);
+ std::string get_namespace() const;
+
+ int64_t get_id();
+
+ // deprecated versions
+ uint32_t get_object_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+ uint32_t get_object_pg_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+
+ int get_object_hash_position2(const std::string& oid, uint32_t *hash_position);
+ int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position);
+
+ config_t cct();
+
+ void set_osdmap_full_try();
+ void unset_osdmap_full_try();
+
+ int application_enable(const std::string& app_name, bool force);
+ int application_enable_async(const std::string& app_name,
+ bool force, PoolAsyncCompletion *c);
+ int application_list(std::set<std::string> *app_names);
+ int application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string *value);
+ int application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value);
+ int application_metadata_remove(const std::string& app_name,
+ const std::string &key);
+ int application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values);
+
+ private:
+ /* You can only get IoCtx instances from Rados */
+ IoCtx(IoCtxImpl *io_ctx_impl_);
+
+ friend class Rados; // Only Rados can use our private constructor to create IoCtxes.
+ friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl
+ friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl
+
+ IoCtxImpl *io_ctx_impl;
+ };
+
+ struct CEPH_RADOS_API PlacementGroup {
+ PlacementGroup();
+ PlacementGroup(const PlacementGroup&);
+ ~PlacementGroup();
+ bool parse(const char*);
+ std::unique_ptr<PlacementGroupImpl> impl;
+ };
+
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&);
+
+ class CEPH_RADOS_API Rados
+ {
+ public:
+ static void version(int *major, int *minor, int *extra);
+
+ Rados();
+ explicit Rados(IoCtx& ioctx);
+ ~Rados();
+ static void from_rados_t(rados_t cluster, Rados &rados);
+
+ int init(const char * const id);
+ int init2(const char * const name, const char * const clustername,
+ uint64_t flags);
+ int init_with_context(config_t cct_);
+ config_t cct();
+ int connect();
+ void shutdown();
+ int watch_flush();
+ int aio_watch_flush(AioCompletion*);
+ int conf_read_file(const char * const path) const;
+ int conf_parse_argv(int argc, const char ** argv) const;
+ int conf_parse_argv_remainder(int argc, const char ** argv,
+ const char ** remargv) const;
+ int conf_parse_env(const char *env) const;
+ int conf_set(const char *option, const char *value);
+ int conf_get(const char *option, std::string &val);
+
+ int service_daemon_register(
+ const std::string& service, ///< service name (e.g., 'rgw')
+ const std::string& name, ///< daemon name (e.g., 'gwfoo')
+ const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
+ int service_daemon_update_status(
+ std::map<std::string,std::string>&& status);
+
+ int pool_create(const char *name);
+ int pool_create(const char *name, uint64_t auid)
+ __attribute__ ((deprecated));
+ int pool_create(const char *name, uint64_t auid, uint8_t crush_rule)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule(const char *name, uint8_t crush_rule);
+ int pool_create_async(const char *name, PoolAsyncCompletion *c);
+ int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c);
+ int pool_get_base_tier(int64_t pool, int64_t* base_tier);
+ int pool_delete(const char *name);
+ int pool_delete_async(const char *name, PoolAsyncCompletion *c);
+ int64_t pool_lookup(const char *name);
+ int pool_reverse_lookup(int64_t id, std::string *name);
+
+ uint64_t get_instance_id();
+
+ int get_min_compatible_osd(int8_t* require_osd_release);
+ int get_min_compatible_client(int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+ int mon_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int mgr_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int osd_command(int osdid, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+
+ int ioctx_create(const char *name, IoCtx &pioctx);
+ int ioctx_create2(int64_t pool_id, IoCtx &pioctx);
+
+ // Features useful for test cases
+ void test_blacklist_self(bool set);
+
+ /* pool info */
+ int pool_list(std::list<std::string>& v);
+ int pool_list2(std::list<std::pair<int64_t, std::string> >& v);
+ int get_pool_stats(std::list<std::string>& v,
+ stats_map& result);
+ /// deprecated; use simpler form. categories no longer supported.
+ int get_pool_stats(std::list<std::string>& v,
+ std::map<std::string, stats_map>& stats);
+ /// deprecated; categories no longer supported
+ int get_pool_stats(std::list<std::string>& v,
+ std::string& category,
+ std::map<std::string, stats_map>& stats);
+ /// check if pool has selfmanaged snaps
+ bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname);
+
+ int cluster_stat(cluster_stat_t& result);
+ int cluster_fsid(std::string *fsid);
+
+ /**
+ * List inconsistent placement groups in the given pool
+ *
+ * @param pool_id the pool id
+ * @param pgs [out] the inconsistent PGs
+ */
+ int get_inconsistent_pgs(int64_t pool_id,
+ std::vector<PlacementGroup>* pgs);
+ /**
+ * List the inconsistent objects found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c objects
+ * @param max_return the max number of the returned @c objects
+ * @param c what to do when the operation is complete and safe
+ * @param objects [out] the objects where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_objects(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_obj_t>* objects,
+ uint32_t* interval);
+ /**
+ * List the inconsistent snapsets found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c snapset
+ * @param max_return the max number of the returned @c snapset
+ * @param c what to do when the operation is complete and safe
+ * @param snapset [out] the snapsets where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_snapsets(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_snapset_t>* snapset,
+ uint32_t* interval);
+
+ /// get/wait for the most recent osdmap
+ int wait_for_latest_osdmap();
+
+ int blacklist_add(const std::string& client_address,
+ uint32_t expire_seconds);
+
+ /*
+ * pool aio
+ *
+ * It is up to the caller to release the completion handler, even if the pool_create_async()
+ * and/or pool_delete_async() fails and does not send the async request
+ */
+ static PoolAsyncCompletion *pool_async_create_completion();
+
+ // -- aio --
+ static AioCompletion *aio_create_completion();
+ static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete,
+ callback_t cb_safe);
+
+ friend std::ostream& operator<<(std::ostream &oss, const Rados& r);
+ private:
+ // We don't allow assignment or copying
+ Rados(const Rados& rhs);
+ const Rados& operator=(const Rados& rhs);
+ RadosClient *client;
+ };
+
+} // namespace v14_2_0
+} // namespace librados
+
+#endif
+
diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp
new file mode 100644
index 00000000..8926d097
--- /dev/null
+++ b/src/include/rados/librados_fwd.hpp
@@ -0,0 +1,32 @@
+#ifndef __LIBRADOS_FWD_HPP
+#define __LIBRADOS_FWD_HPP
+
+namespace libradosstriper {
+
+class RadosStriper;
+
+} // namespace libradosstriper
+
+namespace librados {
+inline namespace v14_2_0 {
+
+class AioCompletion;
+class IoCtx;
+class ListObject;
+class NObjectIterator;
+class ObjectCursor;
+class ObjectItem;
+class ObjectOperation;
+class ObjectOperationCompletion;
+class ObjectReadOperation;
+class ObjectWriteOperation;
+class PlacementGroup;
+class PoolAsyncCompletion;
+class Rados;
+class WatchCtx;
+class WatchCtx2;
+
+} // inline namespace v14_2_0
+} // namespace librados
+
+#endif // __LIBRADOS_FWD_HPP
diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h
new file mode 100644
index 00000000..c20e96be
--- /dev/null
+++ b/src/include/rados/librgw.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_LIBRGW_H
+#define CEPH_LIBRGW_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_VER_MAJOR 1
+#define LIBRGW_VER_MINOR 1
+#define LIBRGW_VER_EXTRA 0
+/* Pack a version triple into one int: (maj << 16) + (min << 8) + extra. Arguments are parenthesized so expression arguments expand safely. */
+#define LIBRGW_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA)
+
+typedef void* librgw_t;
+int librgw_create(librgw_t *rgw, int argc, char **argv);
+void librgw_shutdown(librgw_t rgw);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LIBRGW_H */
diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h
new file mode 100644
index 00000000..80ae69d2
--- /dev/null
+++ b/src/include/rados/objclass.h
@@ -0,0 +1,177 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+
+#ifdef __cplusplus
+
+#include "buffer.h"
+
+extern "C" {
+#endif
+
+#define CEPH_CLS_API [[gnu::visibility("default")]]
+
+#define CLS_VER(maj,min) \
+int __cls_ver__## maj ## _ ##min = 0; \
+int __cls_ver_maj = maj; \
+int __cls_ver_min = min;
+
+#define CLS_NAME(name) \
+int __cls_name__## name = 0; \
+const char *__cls_name = #name;
+
+#define CLS_INIT(name) \
+CEPH_CLS_API void __cls_init()
+
+#define CLS_METHOD_RD 0x1 /// method executes read operations
+#define CLS_METHOD_WR 0x2 /// method executes write operations
+#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier
+
+#define CLS_LOG(level, fmt, ...) \
+ cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__)
+
+/**
+ * Initialize a class.
+ */
+void __cls_init();
+
+/**
+ * @typedef cls_handle_t
+ *
+ * A handle for interacting with the object class.
+ */
+typedef void *cls_handle_t;
+
+/**
+ * @typedef cls_method_handle_t
+ *
+ * A handle for interacting with the method of the object class.
+ */
+typedef void *cls_method_handle_t;
+
+/**
+ * @typedef cls_method_context_t
+ *
+ * A context for the method of the object class.
+ */
+typedef void* cls_method_context_t;
+
+/*class utils*/
+extern int cls_log(int level, const char *format, ...)
+ __attribute__((__format__(printf, 2, 3)));
+
+/* class registration api */
+extern int cls_register(const char *name, cls_handle_t *handle);
+
+#ifdef __cplusplus
+}
+
+/**
+ * @typedef cls_method_cxx_call_t
+ *
+ */
+typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
+ class ceph::buffer::list *inbl, class ceph::buffer::list *outbl);
+
+/**
+ * Register a method.
+ *
+ * @param hclass
+ * @param method
+ * @param flags
+ * @param class_call
+ * @param handle
+ */
+extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
+ cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
+
+/**
+ * Create an object.
+ *
+ * @param hctx
+ * @param exclusive
+ */
+extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
+
+/**
+ * Remove an object.
+ *
+ * @param hctx
+ */
+extern int cls_cxx_remove(cls_method_context_t hctx);
+
+/**
+ * Check on the status of an object.
+ *
+ * @param hctx
+ * @param size
+ * @param mtime
+ */
+extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
+
+/**
+ * Read contents of an object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Write to the object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Get xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param outbl
+ */
+extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *outbl);
+
+/**
+ * Set xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param inbl
+ */
+extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *inbl);
+
+/**
+ * Get value corresponding to a key from the map.
+ *
+ * @param hctx
+ * @param key
+ * @param outbl
+ */
+extern int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *outbl);
+
+/**
+ * Set value corresponding to a key in the map.
+ *
+ * @param hctx
+ * @param key
+ * @param inbl
+ */
+extern int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *inbl);
+
+#endif
+
+#endif
diff --git a/src/include/rados/page.h b/src/include/rados/page.h
new file mode 120000
index 00000000..cf983e83
--- /dev/null
+++ b/src/include/rados/page.h
@@ -0,0 +1 @@
+../page.h \ No newline at end of file
diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h
new file mode 100644
index 00000000..0712f489
--- /dev/null
+++ b/src/include/rados/rados_types.h
@@ -0,0 +1,29 @@
+#ifndef CEPH_RADOS_TYPES_H
+#define CEPH_RADOS_TYPES_H
+
+#include <stdint.h>
+
+/**
+ * @struct obj_watch_t
+ * One item from list_watchers
+ */
+struct obj_watch_t {
+ /// Address of the Watcher
+ char addr[256];
+ /// Watcher ID
+ int64_t watcher_id;
+ /// Cookie
+ uint64_t cookie;
+ /// Timeout in Seconds
+ uint32_t timeout_seconds;
+};
+
+/**
+ *
+ * Pass as nspace argument to rados_ioctx_set_namespace()
+ * before calling rados_nobjects_list_open() to return
+ * all objects in all namespaces.
+ */
+#define LIBRADOS_ALL_NSPACES "\001"
+
+#endif
diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp
new file mode 100644
index 00000000..8c02dd83
--- /dev/null
+++ b/src/include/rados/rados_types.hpp
@@ -0,0 +1,331 @@
+#ifndef CEPH_RADOS_TYPES_HPP
+#define CEPH_RADOS_TYPES_HPP
+
+#include <map>
+#include <utility>
+#include <vector>
+#include <stdint.h>
+#include <string>
+
+#include "buffer.h"
+#include "rados_types.h"
+
+namespace librados {
+
+typedef uint64_t snap_t;
+
+enum {
+ SNAP_HEAD = (uint64_t)(-2),
+ SNAP_DIR = (uint64_t)(-1)
+};
+
+struct clone_info_t {
+ snap_t cloneid;
+ std::vector<snap_t> snaps; // ascending
+ std::vector< std::pair<uint64_t,uint64_t> > overlap; // with next newest
+ uint64_t size;
+ clone_info_t() : cloneid(0), size(0) {}
+};
+
+struct snap_set_t {
+ std::vector<clone_info_t> clones; // ascending
+ snap_t seq; // newest snapid seen by the object
+ snap_set_t() : seq(0) {}
+};
+
+struct object_id_t {
+ std::string name;
+ std::string nspace;
+ std::string locator;
+ snap_t snap = 0;
+ object_id_t() = default;
+ object_id_t(const std::string& name,
+ const std::string& nspace,
+ const std::string& locator,
+ snap_t snap)
+ : name(name),
+ nspace(nspace),
+ locator(locator),
+ snap(snap)
+ {}
+};
+
+struct err_t {
+ enum : uint64_t {
+ SHARD_MISSING = 1 << 1,
+ SHARD_STAT_ERR = 1 << 2,
+ SHARD_READ_ERR = 1 << 3,
+ DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old
+ DATA_DIGEST_MISMATCH_INFO = 1 << 9,
+ OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old
+ OMAP_DIGEST_MISMATCH_INFO = 1 << 10,
+ SIZE_MISMATCH_OI = 1 << 11, // Old
+ SIZE_MISMATCH_INFO = 1 << 11,
+ SHARD_EC_HASH_MISMATCH = 1 << 12,
+ SHARD_EC_SIZE_MISMATCH = 1 << 13,
+ OI_ATTR_MISSING = 1 << 14, // Old
+ INFO_MISSING = 1 << 14,
+ OI_ATTR_CORRUPTED = 1 << 15, // Old
+ INFO_CORRUPTED = 1 << 15,
+ SS_ATTR_MISSING = 1 << 16, // Old
+ SNAPSET_MISSING = 1 << 16,
+ SS_ATTR_CORRUPTED = 1 << 17, // Old
+ SNAPSET_CORRUPTED = 1 << 17,
+ OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old
+ OBJ_SIZE_INFO_MISMATCH = 1 << 18,
+ HINFO_MISSING = 1 << 19,
+ HINFO_CORRUPTED = 1 << 20
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED;
+ static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
+ bool has_shard_missing() const {
+ return errors & SHARD_MISSING;
+ }
+ bool has_stat_error() const {
+ return errors & SHARD_STAT_ERR;
+ }
+ bool has_read_error() const {
+ return errors & SHARD_READ_ERR;
+ }
+ bool has_data_digest_mismatch_oi() const { // Compatibility
+ return errors & DATA_DIGEST_MISMATCH_OI;
+ }
+ bool has_data_digest_mismatch_info() const {
+ return errors & DATA_DIGEST_MISMATCH_INFO;
+ }
+ bool has_omap_digest_mismatch_oi() const { // Compatibility
+ return errors & OMAP_DIGEST_MISMATCH_OI;
+ }
+ bool has_omap_digest_mismatch_info() const {
+ return errors & OMAP_DIGEST_MISMATCH_INFO;
+ }
+ bool has_size_mismatch_oi() const { // Compatibility
+ return errors & SIZE_MISMATCH_OI;
+ }
+ bool has_size_mismatch_info() const {
+ return errors & SIZE_MISMATCH_INFO;
+ }
+ bool has_ec_hash_error() const {
+ return errors & SHARD_EC_HASH_MISMATCH;
+ }
+ bool has_ec_size_error() const {
+ return errors & SHARD_EC_SIZE_MISMATCH;
+ }
+ bool has_oi_attr_missing() const { // Compatibility
+ return errors & OI_ATTR_MISSING;
+ }
+ bool has_info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool has_oi_attr_corrupted() const { // Compatibility
+ return errors & OI_ATTR_CORRUPTED;
+ }
+ bool has_info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool has_ss_attr_missing() const { // Compatibility
+ return errors & SS_ATTR_MISSING;
+ }
+ bool has_snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool has_ss_attr_corrupted() const { // Compatibility
+ return errors & SS_ATTR_CORRUPTED;
+ }
+ bool has_snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_obj_size_oi_mismatch() const { // Compatibility
+ return errors & OBJ_SIZE_OI_MISMATCH;
+ }
+ bool has_obj_size_info_mismatch() const {
+ return errors & OBJ_SIZE_INFO_MISMATCH;
+ }
+ bool has_hinfo_missing() const {
+ return errors & HINFO_MISSING;
+ }
+ bool has_hinfo_corrupted() const {
+ return errors & HINFO_CORRUPTED;
+ }
+};
+
+struct shard_info_t : err_t {
+ std::map<std::string, ceph::bufferlist> attrs;
+ uint64_t size = -1;
+ bool omap_digest_present = false;
+ uint32_t omap_digest = 0;
+ bool data_digest_present = false;
+ uint32_t data_digest = 0;
+ bool selected_oi = false;
+ bool primary = false;
+};
+
+struct osd_shard_t {
+  int32_t osd;    // primary sort key used by operator< below
+  int8_t shard;   // secondary sort key, consulted only when osd is equal
+};
+
+// Lexicographic ordering on (osd, shard).
+inline bool operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) {
+  if (lhs.osd != rhs.osd) {
+    // The osd fields differ, so they alone decide the order.
+    return lhs.osd < rhs.osd;
+  }
+  return lhs.shard < rhs.shard;
+}
+
+struct obj_err_t {
+ enum : uint64_t {
+ OBJECT_INFO_INCONSISTENCY = 1 << 1,
+ // XXX: Can an older rados binary work if these bits stay the same?
+ DATA_DIGEST_MISMATCH = 1 << 4,
+ OMAP_DIGEST_MISMATCH = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ ATTR_VALUE_MISMATCH = 1 << 7,
+ ATTR_NAME_MISMATCH = 1 << 8,
+ SNAPSET_INCONSISTENCY = 1 << 9,
+ HINFO_INCONSISTENCY = 1 << 10,
+ SIZE_TOO_LARGE = 1 << 11,
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH
+ |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE;
+ static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH;
+ bool has_object_info_inconsistency() const {
+ return errors & OBJECT_INFO_INCONSISTENCY;
+ }
+ bool has_data_digest_mismatch() const {
+ return errors & DATA_DIGEST_MISMATCH;
+ }
+ bool has_omap_digest_mismatch() const {
+ return errors & OMAP_DIGEST_MISMATCH;
+ }
+ bool has_size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool has_attr_value_mismatch() const {
+ return errors & ATTR_VALUE_MISMATCH;
+ }
+ bool has_attr_name_mismatch() const {
+ return errors & ATTR_NAME_MISMATCH;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_snapset_inconsistency() const {
+ return errors & SNAPSET_INCONSISTENCY;
+ }
+ bool has_hinfo_inconsistency() const {
+ return errors & HINFO_INCONSISTENCY;
+ }
+ bool has_size_too_large() const {
+ return errors & SIZE_TOO_LARGE;
+ }
+};
+
+struct inconsistent_obj_t : obj_err_t {
+  inconsistent_obj_t() = default;  // every member falls back to its in-class default
+  inconsistent_obj_t(const object_id_t& object)  // record for the given object id
+    : object{object}
+  {}
+  object_id_t object;  // id copied from the constructor argument (empty by default)
+  uint64_t version = 0; // XXX: Redundant with object info attr; zeroed here so the defaulted ctor never leaves it indeterminate
+  std::map<osd_shard_t, shard_info_t> shards;  // per-shard info keyed by (osd, shard)
+  err_t union_shards;  // NOTE(review): presumably the OR of all shards' error bits -- confirm against the producer
+};
+
+struct inconsistent_snapset_t {
+ inconsistent_snapset_t() = default;
+ inconsistent_snapset_t(const object_id_t& head)
+ : object{head}
+ {}
+ enum {
+ SNAPSET_MISSING = 1 << 0,
+ SNAPSET_CORRUPTED = 1 << 1,
+ CLONE_MISSING = 1 << 2,
+ SNAP_ERROR = 1 << 3,
+ HEAD_MISMATCH = 1 << 4, // Unused
+ HEADLESS_CLONE = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ OI_MISSING = 1 << 7, // Old
+ INFO_MISSING = 1 << 7,
+ OI_CORRUPTED = 1 << 8, // Old
+ INFO_CORRUPTED = 1 << 8,
+ EXTRA_CLONES = 1 << 9,
+ };
+ uint64_t errors = 0;
+ object_id_t object;
+ // Extra clones
+ std::vector<snap_t> clones;
+ std::vector<snap_t> missing;
+ ceph::bufferlist ss_bl;
+
+ bool ss_attr_missing() const { // Compatibility
+ return errors & SNAPSET_MISSING;
+ }
+ bool snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool ss_attr_corrupted() const { // Compatibility
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool clone_missing() const {
+ return errors & CLONE_MISSING;
+ }
+ bool snapset_mismatch() const { // Compatibility
+ return errors & SNAP_ERROR;
+ }
+ bool snapset_error() const {
+ return errors & SNAP_ERROR;
+ }
+ bool head_mismatch() const { // Compatibility
+ return false;
+ }
+ bool headless() const {
+ return errors & HEADLESS_CLONE;
+ }
+ bool size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool oi_attr_missing() const { // Compatibility
+ return errors & OI_MISSING;
+ }
+ bool info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool oi_attr_corrupted() const { // Compatibility
+ return errors & OI_CORRUPTED;
+ }
+ bool info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool extra_clones() const {
+ return errors & EXTRA_CLONES;
+ }
+};
+
+/**
+ * @var all_nspaces
+ * Pass as nspace argument to IoCtx::set_namespace()
+ * before calling nobjects_begin() to iterate
+ * through all objects in all namespaces.
+ */
+const std::string all_nspaces(LIBRADOS_ALL_NSPACES);
+
+}
+#endif
diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h
new file mode 100644
index 00000000..66cf627a
--- /dev/null
+++ b/src/include/rados/rgw_file.h
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * convert RGW commands to file commands
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef RADOS_RGW_FILE_H
+#define RADOS_RGW_FILE_H
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "librgw.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_FILE_VER_MAJOR 1
+#define LIBRGW_FILE_VER_MINOR 1
+#define LIBRGW_FILE_VER_EXTRA 7
+/* Pack (maj, min, extra) into one integer: maj << 16, min << 8, extra in the low bits; every argument is parenthesized so arbitrary expressions expand safely. */
+#define LIBRGW_FILE_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
+
+/*
+ * object types
+ */
+enum rgw_fh_type {
+ RGW_FS_TYPE_NIL = 0,
+ RGW_FS_TYPE_FILE,
+ RGW_FS_TYPE_DIRECTORY,
+ RGW_FS_TYPE_SYMBOLIC_LINK,
+};
+
+/*
+ * dynamically allocated handle to support NFS handles
+ */
+
+/* content-addressable hash */
+struct rgw_fh_hk {
+ uint64_t bucket;
+ uint64_t object;
+};
+
+struct rgw_file_handle
+{
+ /* content-addressable hash */
+ struct rgw_fh_hk fh_hk;
+ void *fh_private; /* librgw private data */
+ /* object type */
+ enum rgw_fh_type fh_type;
+};
+
+struct rgw_fs
+{
+ librgw_t rgw;
+ void *fs_private;
+ struct rgw_file_handle* root_fh;
+};
+
+
+/* XXX mount info hypothetical--emulate Unix, support at least
+ * UUID-length fsid */
+struct rgw_statvfs {
+ uint64_t f_bsize; /* file system block size */
+ uint64_t f_frsize; /* fragment size */
+ uint64_t f_blocks; /* size of fs in f_frsize units */
+ uint64_t f_bfree; /* # free blocks */
+ uint64_t f_bavail; /* # free blocks for unprivileged users */
+ uint64_t f_files; /* # inodes */
+ uint64_t f_ffree; /* # free inodes */
+ uint64_t f_favail; /* # free inodes for unprivileged users */
+ uint64_t f_fsid[2]; /* file system ID */
+ uint64_t f_flag; /* mount flags */
+ uint64_t f_namemax; /* maximum filename length */
+};
+
+
+void rgwfile_version(int *major, int *minor, int *extra);
+
+/*
+ lookup object by name (POSIX style)
+*/
+#define RGW_LOOKUP_FLAG_NONE 0x0000
+#define RGW_LOOKUP_FLAG_CREATE 0x0001
+#define RGW_LOOKUP_FLAG_RCB 0x0002 /* readdir callback hint */
+#define RGW_LOOKUP_FLAG_DIR 0x0004
+#define RGW_LOOKUP_FLAG_FILE 0x0008
+
+#define RGW_LOOKUP_TYPE_FLAGS \
+ (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE)
+
+int rgw_lookup(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *path,
+ struct rgw_file_handle **fh,
+ struct stat *st, uint32_t mask, uint32_t flags);
+
+/*
+ lookup object by handle (NFS style)
+*/
+int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ * release file handle
+ */
+#define RGW_FH_RELE_FLAG_NONE 0x0000
+
+int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ attach rgw namespace
+*/
+#define RGW_MOUNT_FLAG_NONE 0x0000
+
+int rgw_mount(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, const char *root, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+/*
+ register invalidate callbacks
+*/
+#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000
+
+typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk);
+
+int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb,
+ void *arg, uint32_t flags);
+
+/*
+ detach rgw namespace
+*/
+#define RGW_UMOUNT_FLAG_NONE 0x0000
+
+int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags);
+
+
+/*
+ get filesystem attributes
+*/
+#define RGW_STATFS_FLAG_NONE 0x0000
+
+int rgw_statfs(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ struct rgw_statvfs *vfs_st,
+ uint32_t flags);
+
+
+/* XXX (get|set)attr mask bits */
+#define RGW_SETATTR_MODE 1
+#define RGW_SETATTR_UID 2
+#define RGW_SETATTR_GID 4
+#define RGW_SETATTR_MTIME 8
+#define RGW_SETATTR_ATIME 16
+#define RGW_SETATTR_SIZE 32
+#define RGW_SETATTR_CTIME 64
+
+/*
+ create file
+*/
+#define RGW_CREATE_FLAG_NONE 0x0000
+
+int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a symbolic link
+ */
+#define RGW_CREATELINK_FLAG_NONE 0x0000
+int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, const char *link_path, struct stat *st,
+ uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a new directory
+*/
+#define RGW_MKDIR_FLAG_NONE 0x0000
+
+int rgw_mkdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ rename object
+*/
+#define RGW_RENAME_FLAG_NONE 0x0000
+
+int rgw_rename(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *olddir, const char* old_name,
+ struct rgw_file_handle *newdir, const char* new_name,
+ uint32_t flags);
+
+/*
+ remove file or directory
+*/
+#define RGW_UNLINK_FLAG_NONE 0x0000
+
+int rgw_unlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char* path,
+ uint32_t flags);
+
+/*
+ read directory content
+*/
+typedef bool (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset,
+ struct stat *st, uint32_t mask,
+ uint32_t flags);
+
+#define RGW_READDIR_FLAG_NONE 0x0000
+#define RGW_READDIR_FLAG_DOTDOT 0x0001 /* send dot names */
+
+int rgw_readdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, uint64_t *offset,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *name,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* project offset of dirent name */
+#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
+
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, int64_t *offset,
+ uint32_t flags);
+
+/*
+ get unix attributes for object
+*/
+#define RGW_GETATTR_FLAG_NONE 0x0000
+
+int rgw_getattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t flags);
+
+/*
+ set unix attributes for object
+*/
+#define RGW_SETATTR_FLAG_NONE 0x0000
+
+int rgw_setattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t mask, uint32_t flags);
+
+/*
+ truncate file
+*/
+#define RGW_TRUNCATE_FLAG_NONE 0x0000
+
+int rgw_truncate(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t size,
+ uint32_t flags);
+
+/*
+ open file
+*/
+#define RGW_OPEN_FLAG_NONE 0x0000
+#define RGW_OPEN_FLAG_CREATE 0x0001
+#define RGW_OPEN_FLAG_V3 0x0002 /* ops have v3 semantics */
+#define RGW_OPEN_FLAG_STATELESS 0x0002 /* alias it */
+
+int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ uint32_t posix_flags, uint32_t flags);
+
+/*
+ close file
+*/
+
+#define RGW_CLOSE_FLAG_NONE 0x0000
+#define RGW_CLOSE_FLAG_RELE 0x0001
+
+int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ read data from file
+*/
+#define RGW_READ_FLAG_NONE 0x0000
+
+int rgw_read(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ read symbolic link
+*/
+#define RGW_READLINK_FLAG_NONE 0x0000
+
+int rgw_readlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ write data to file
+*/
+#define RGW_WRITE_FLAG_NONE 0x0000
+
+int rgw_write(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_written, void *buffer,
+ uint32_t flags);
+
+#define RGW_UIO_NONE 0x0000
+#define RGW_UIO_GIFT 0x0001
+#define RGW_UIO_FREE 0x0002
+#define RGW_UIO_BUFQ 0x0004
+
+struct rgw_uio;
+typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t);
+
+/* buffer vector descriptors */
+struct rgw_vio {
+ void *vio_p1;
+ void *vio_u1;
+ void *vio_base;
+ int32_t vio_len;
+};
+
+struct rgw_uio {
+ rgw_uio_release uio_rele;
+ void *uio_p1;
+ void *uio_u1;
+ uint64_t uio_offset;
+ uint64_t uio_resid;
+ uint32_t uio_cnt;
+ uint32_t uio_flags;
+ struct rgw_vio *uio_vio; /* appended vectors */
+};
+
+typedef struct rgw_uio rgw_uio;
+
+int rgw_readv(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+int rgw_writev(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+/*
+ sync written data
+*/
+#define RGW_FSYNC_FLAG_NONE 0x0000
+
+int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ NFS commit operation
+*/
+
+#define RGW_COMMIT_FLAG_NONE 0x0000
+
+int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint64_t offset, uint64_t length, uint32_t flags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RADOS_RGW_FILE_H */
diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h
new file mode 100644
index 00000000..7eb33596
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.h
@@ -0,0 +1,610 @@
+#ifndef CEPH_LIBRADOSSTRIPER_H
+#define CEPH_LIBRADOSSTRIPER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include "../rados/librados.h"
+
+#define LIBRADOSSTRIPER_VER_MAJOR 0
+#define LIBRADOSSTRIPER_VER_MINOR 0
+#define LIBRADOSSTRIPER_VER_EXTRA 0
+/* Pack (maj, min, extra) into one integer: maj << 16, min << 8, extra in the low bits; every argument is parenthesized so arbitrary expressions expand safely. */
+#define LIBRADOSSTRIPER_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+/* Version code of this header, built from the three components above. */
+#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA)
+
+/**
+ * @typedef rados_striper_t
+ *
+ * A handle for interacting with striped objects in a RADOS cluster.
+ */
+typedef void *rados_striper_t;
+
+/**
+ * @defgroup libradosstriper_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libradosstriper.
+ *
+ * @{
+ */
+
+/**
+ * Creates a rados striper using the given io context
+ * Striper has initially default object layout.
+ * See rados_striper_set_object_layout_*() to change this
+ *
+ * @param ioctx the rados context to use
+ * @param striper where to store the rados striper
+ * @returns 0 on success, negative error code on failure
+ */
+ int rados_striper_create(rados_ioctx_t ioctx,
+ rados_striper_t *striper);
+
+/**
+ * Destroys a rados striper
+ *
+ * @param striper the striper to destroy
+ */
+void rados_striper_destroy(rados_striper_t striper);
+
+/**
+ * Sets the object layout's stripe unit of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_unit the stripe_unit value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper,
+ unsigned int stripe_unit);
+
+/**
+ * Sets the object layout's stripe count of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_count the stripe_count value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_count(rados_striper_t striper,
+ unsigned int stripe_count);
+
+/**
+ * Sets the object layout's object_size of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param object_size the object_size value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_object_size(rados_striper_t striper,
+ unsigned int object_size);
+
+/** @} init */
+
+/**
+ * @defgroup libradosstriper_h_synch_io Synchronous I/O
+ * Writes are striped to several rados objects which are then
+ * replicated to a number of OSDs based on the configuration
+ * of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_striper_ioctx_wait_for_complete().
+ *
+ * @{
+ */
+
+/**
+ * Synchronously write data to a striped object at the specified offset
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_write(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously write an entire striped object
+ *
+ * The striped object is filled with the provided data. If the striped object exists,
+ * it is truncated and then written.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_write_full(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Append data to an object
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_append(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Synchronously read data from a striped object at the specified offset
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+int rados_striper_read(rados_striper_t striper,
+ const char *soid,
+ char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_remove(rados_striper_t striper,
+ const char* soid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @note the truncation is not fully atomic. The metadata part is,
+ * so the behavior will be atomic from user point of view when
+ * the object size is reduced. However, in case of failure, old data
+ * may stay around, hidden. They may reappear if the object size is
+ * later grown, instead of the expected 0s. When growing the
+ * object and in case of failure, the new 0 data may not be
+ * fully created. This can lead to ENOENT errors when
+ * writing/reading the missing parts.
+ * @note the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ * @param striper the striper in which the truncation will occur
+ * @param soid the name of the striped object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_trunc(rados_striper_t striper, const char *soid, uint64_t size);
+
+/** @} Synchronous I/O */
+
+/**
+ * @defgroup libradosstriper_h_xattrs Xattrs
+ * Extended attributes are stored as extended attributes on the
+ * first rados regular object of the striped object.
+ * Thus, they have the same limitations as the underlying
+ * rados extended attributes.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the getxattr will occur
+ * @param oid name of the striped object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+int rados_striper_getxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ char *buf,
+ size_t len);
+
+/**
+ * Set an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the setxattr will occur
+ * @param oid name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_setxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from a striped object.
+ *
+ * @param striper the striper in which the rmxattr will occur
+ * @param oid name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_rmxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on a striped object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param striper the striper in which the getxattrs will occur
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs(rados_striper_t striper,
+ const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the striped object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name,
+ const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+void rados_striper_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Synchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_stat(rados_striper_t striper,
+ const char* soid,
+ uint64_t *psize,
+ time_t *pmtime);
+
+/**
+ * @defgroup libradosstriper_h_asynch_io Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_striper_multi_completion_t
+ * Represents the state of a set of asynchronous operations
+ * it contains the aggregated return value once the operations complete
+ * and can be used to block until all operations are complete and/or safe.
+ */
+typedef void *rados_striper_multi_completion_t;
+
+/**
+ * Constructs a multi completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+int rados_striper_multi_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_striper_multi_completion_t *pc);
+
+/**
+ * Block until all operations complete
+ *
+ * This means data is in memory on all replicas.
+ *
+ * @param c operations to wait for
+ * @note this function is declared void and returns no value
+ */
+void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * @note does not return a value (the function is declared void)
+ */
+void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations complete and callback completes
+ *
+ * This means data is in memory on all replicas and can be read.
+ *
+ * @param c operations to wait for
+ * @note does not return a value (the function is declared void)
+ */
+void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe and callback has completed
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * @note does not return a value (the function is declared void)
+ */
+void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation and callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe and has the callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Get the return value of a multi asynchronous operation
+ *
+ * The return value is set when all operations are complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operations to inspect
+ * @returns aggregated return value of the operations
+ */
+int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c);
+
+/**
+ * Release a multi asynchronous IO completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c multi completion to release
+ */
+void rados_striper_multi_aio_release(rados_striper_multi_completion_t c);
+
+/**
+ * Asynchronously write data to a striped object at the specified offset
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously appends data to a striped object
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_append(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously fills an object with the provided data.
+ * If the object exists, it is truncated and then written.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write_full(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously read data from a striped object at the specified offset
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the read is safe and complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_read(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ char *buf,
+ const size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, negative error code on failure
+ */
+
+int rados_striper_aio_remove(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion);
+
+/**
+ * Block until all pending writes in a striper are safe
+ *
+ * This is not equivalent to calling rados_striper_multi_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @param striper the striper in which the flush will occur
+ * @note does not return a value (the function is declared void)
+*/
+void rados_striper_aio_flush(rados_striper_t striper);
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param completion what to do when the stats is complete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_stat(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion,
+ uint64_t *psize,
+ time_t *pmtime);
+
+/** @} Asynchronous I/O */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp
new file mode 100644
index 00000000..674a56b7
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.hpp
@@ -0,0 +1,241 @@
+#ifndef __LIBRADOSSTRIPER_HPP
+#define __LIBRADOSSTRIPER_HPP
+
+#include <string.h>
+#include <string>
+#include <map>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+
+#include "libradosstriper.h"
+
+namespace libradosstriper
+{
+ struct RadosStriperImpl;
+ struct MultiAioCompletionImpl;
+
+ /*
+ * Completion object for multiple asynchronous IO
+ * It allows to internally handle several "requests"
+ *
+ * This is a thin handle around a MultiAioCompletionImpl pointer: apart
+ * from the inline constructor, every member function is only declared
+ * here and is defined elsewhere.
+ * NOTE(review): the member names parallel the C functions
+ * rados_striper_multi_aio_* from libradosstriper.h; presumably they have
+ * the same semantics -- confirm against the implementation.
+ */
+ struct MultiAioCompletion {
+ // Stores pc_ in pc. Not marked explicit, so a MultiAioCompletionImpl*
+ // converts implicitly to a MultiAioCompletion.
+ MultiAioCompletion(MultiAioCompletionImpl *pc_) : pc(pc_) {}
+ ~MultiAioCompletion();
+ // Register callbacks; cb_arg is the application value handed to cb.
+ int set_complete_callback(void *cb_arg, librados::callback_t cb);
+ int set_safe_callback(void *cb_arg, librados::callback_t cb);
+ // Blocking waits (no return value).
+ void wait_for_complete();
+ void wait_for_safe();
+ void wait_for_complete_and_cb();
+ void wait_for_safe_and_cb();
+ // Non-blocking state queries.
+ bool is_complete();
+ bool is_safe();
+ bool is_complete_and_cb();
+ bool is_safe_and_cb();
+ int get_return_value();
+ void release();
+ // Implementation object this handle forwards to (public data member).
+ MultiAioCompletionImpl *pc;
+ };
+
+ /* RadosStriper : This class allows to perform read/writes on striped objects
+ *
+ * Typical use (error checking omitted):
+ *
+ * RadosStriper rs;
+ * RadosStriper::striper_create(ioctx, &rs); // ioctx is a librados::IoCtx
+ * bufferlist bl;
+ * ... put data in bl ...
+ * rs.write(object_name, bl, len, offset);
+ * bufferlist bl2;
+ * rs.read(object_name, &bl2, len, offset);
+ * ...
+ */
+ class RadosStriper
+ {
+ public:
+
+ /*
+ * constructor
+ */
+ RadosStriper();
+
+ /*
+ * builds the C counterpart of a RadosStriper
+ */
+ static void to_rados_striper_t(RadosStriper &striper,
+ rados_striper_t *s);
+
+ /*
+ * copy constructor
+ */
+ RadosStriper(const RadosStriper& rs);
+
+ /*
+ * operator=
+ */
+ RadosStriper& operator=(const RadosStriper& rs);
+
+ /*
+ * destructor
+ * Internally calling close() if an object is currently opened
+ */
+ ~RadosStriper();
+
+ /*
+ * create method
+ */
+ static int striper_create(librados::IoCtx& ioctx,
+ RadosStriper *striper);
+
+ /*
+ * set object layout's stripe unit
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_unit(unsigned int stripe_unit);
+
+ /*
+ * set object layout's stripe count
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_count(unsigned int stripe_count);
+
+ /*
+ * set object layout's object size
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_object_size(unsigned int object_size);
+
+ /**
+ * Get the value of an extended attribute on a striped object
+ */
+ int getxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Set the value of an extended attribute on a striped object
+ */
+ int setxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Delete an extended attribute from a striped object
+ */
+ int rmxattr(const std::string& oid, const char *name);
+
+ /**
+ * Retrieve the extended attributes of a striped object into @p attrset.
+ */
+ int getxattrs(const std::string& oid,
+ std::map<std::string, ceph::bufferlist>& attrset);
+
+ /**
+ * synchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write(const std::string& soid, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * synchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write_full(const std::string& soid, const ceph::bufferlist& bl);
+
+ /**
+ * synchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int append(const std::string& soid, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * asynchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl);
+
+ /**
+ * asynchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * synchronously read from the striped object at the specified offset.
+ */
+ int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously read from the striped object at the specified offset.
+ */
+ int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off);
+
+ /**
+ * synchronously get striped object stats (size/mtime)
+ */
+ int stat(const std::string& soid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * asynchronously get striped object stats (size/mtime)
+ */
+ int aio_stat(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, struct timespec *pts);
+
+ /**
+ * deletes a striped object.
+ * There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ */
+ int remove(const std::string& soid);
+ int remove(const std::string& soid, int flags);
+
+ /**
+ * asynchronous remove of striped objects
+ * See synchronous version for comments on (lack of) atomicity
+ */
+ int aio_remove(const std::string& soid, librados::AioCompletion *c);
+ int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags);
+
+ /**
+ * Resizes a striped object
+ * the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ */
+ int trunc(const std::string& oid, uint64_t size);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * creation of multi aio completion objects
+ */
+ static MultiAioCompletion *multi_aio_create_completion();
+ static MultiAioCompletion *multi_aio_create_completion(void *cb_arg,
+ librados::callback_t cb_complete,
+ librados::callback_t cb_safe);
+
+ private:
+ RadosStriperImpl *rados_striper_impl;
+
+ };
+
+}
+
+#endif
diff --git a/src/include/random.h b/src/include/random.h
new file mode 100644
index 00000000..b3cb80c3
--- /dev/null
+++ b/src/include/random.h
@@ -0,0 +1,289 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+*/
+
+#ifndef CEPH_RANDOM_H
+#define CEPH_RANDOM_H 1
+
+#include <mutex>
+#include <random>
+#include <type_traits>
+#include <boost/optional.hpp>
+
+// Basic random number facility, adapted from N3551:
+namespace ceph::util {
+
+inline namespace version_1_0_2 {
+
+namespace detail {
+
+// Yields T0 unless T1 is strictly wider (equal sizes resolve to T0):
+template <typename T0, typename T1>
+using larger_of = std::conditional_t<(sizeof(T0) >= sizeof(T1)), T0, T1>;
+
+// Avoid mixing floating point and integers: ::value is true only when
+// both types are floating point, or both types are integral.
+template <typename NumberT0, typename NumberT1>
+using has_compatible_numeric_types =
+  std::bool_constant<
+    (std::is_floating_point_v<NumberT0> && std::is_floating_point_v<NumberT1>) ||
+    (std::is_integral_v<NumberT0> && std::is_integral_v<NumberT1>)
+  >;
+
+
+// Resolves to the larger of two compatible numeric types. When the types
+// are not compatible the alias has no type, which removes the enclosing
+// template from overload resolution:
+template <typename NumberT0, typename NumberT1>
+using select_number_t =
+  std::enable_if_t<has_compatible_numeric_types<NumberT0, NumberT1>::value,
+                   larger_of<NumberT0, NumberT1>>;
+
+} // namespace detail
+
+namespace detail {
+
+// Choose default distribution for appropriate types. The primary template
+// handles the non-integral case; integral types use the specialization:
+template <typename NumberT,
+          bool IsIntegral>
+struct select_distribution
+{
+  using type = std::uniform_real_distribution<NumberT>;
+};
+
+template <typename NumberT>
+struct select_distribution<NumberT, true>
+{
+  using type = std::uniform_int_distribution<NumberT>;
+};
+
+// uniform_int_distribution for integral NumberT, uniform_real_distribution
+// otherwise:
+template <typename NumberT>
+using default_distribution =
+  typename select_distribution<NumberT, std::is_integral_v<NumberT>>::type;
+
+} // namespace detail
+
+namespace detail {
+
+template <typename EngineT>
+EngineT& engine();
+
+// Seed engine e with the caller-supplied value while holding m:
+template <typename MutexT, typename EngineT,
+          typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT seed, MutexT& m, EngineT& e)
+{
+  const std::lock_guard<MutexT> guard(m);
+  e.seed(seed);
+}
+
+// Seed engine e from a fresh std::random_device while holding m:
+template <typename MutexT, typename EngineT>
+void randomize_rng(MutexT& m, EngineT& e)
+{
+  std::random_device entropy;   // constructed before the lock is taken
+
+  const std::lock_guard<MutexT> guard(m);
+  e.seed(entropy());
+}
+
+// Seed the engine returned by detail::engine<EngineT>() with n (no locking):
+template <typename EngineT = std::default_random_engine,
+          typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT n)
+{
+  EngineT& target = detail::engine<EngineT>();
+  target.seed(n);
+}
+
+// Seed the engine returned by detail::engine<EngineT>() from a fresh
+// std::random_device (no locking):
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+  std::random_device entropy;
+  EngineT& target = detail::engine<EngineT>();
+  target.seed(entropy());
+}
+
+// Returns the calling thread's EngineT instance, creating and seeding it on
+// first use.
+template <typename EngineT>
+EngineT& engine()
+{
+  thread_local boost::optional<EngineT> tls_engine;
+
+  if (tls_engine)
+    return *tls_engine;
+
+  tls_engine.emplace(EngineT());
+  // randomize_rng() calls engine<EngineT>() again; the optional is engaged
+  // by now, so that nested call returns through the guard above.
+  randomize_rng<EngineT>();
+
+  return *tls_engine;
+}
+
+} // namespace detail
+
+namespace detail {
+
+// Draw one value from DistributionT parameterised with {min, max}, using the
+// caller's engine e. No locking is performed.
+template <typename NumberT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+                               EngineT& e)
+{
+  using param_type = typename DistributionT::param_type;
+
+  DistributionT dist { min, max };
+  const param_type bounds { min, max };
+
+  return dist(e, bounds);
+}
+
+// As the (min, max, e) overload, but the draw from e happens while m is
+// held. The distribution object itself is built before the lock is taken.
+template <typename NumberT,
+          typename MutexT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+                               MutexT& m, EngineT& e)
+{
+  using param_type = typename DistributionT::param_type;
+
+  DistributionT dist { min, max };
+
+  const std::lock_guard<MutexT> guard(m);
+  return dist(e, param_type { min, max });
+}
+
+// Draw from {min, max} using the engine returned by detail::engine<EngineT>().
+// EngineT has no default and cannot be deduced, so callers name it explicitly.
+template <typename NumberT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max)
+{
+  EngineT& source = detail::engine<EngineT>();
+
+  return detail::generate_random_number<NumberT, DistributionT, EngineT>
+    (min, max, source);
+}
+
+// Draw from {0, numeric_limits<NumberT>::max()} using e under lock m.
+// NumberT defaults to int when the caller does not name it.
+template <typename MutexT,
+          typename EngineT,
+          typename NumberT = int,
+          typename DistributionT = detail::default_distribution<NumberT>>
+NumberT generate_random_number(MutexT& m, EngineT& e)
+{
+  const NumberT lower = 0;
+  const NumberT upper = std::numeric_limits<NumberT>::max();
+
+  return detail::generate_random_number<NumberT, MutexT, DistributionT, EngineT>
+    (lower, upper, m, e);
+}
+
+// Draw from {0, max} using e under lock m:
+template <typename NumberT, typename MutexT, typename EngineT>
+NumberT generate_random_number(const NumberT max, MutexT& m, EngineT& e)
+{
+  const NumberT lower = 0;
+  return generate_random_number<NumberT>(lower, max, m, e);
+}
+
+} // namespace detail
+
+// Public entry point: forwards to detail::randomize_rng<EngineT>(), which
+// seeds the engine from std::random_device.
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+  return detail::randomize_rng<EngineT>();
+}
+
+// Draw from {0, numeric_limits<NumberT>::max()} with the default engine
+// for EngineT:
+template <typename NumberT = int,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT = std::default_random_engine>
+NumberT generate_random_number()
+{
+  const NumberT lower = 0;
+  const NumberT upper = std::numeric_limits<NumberT>::max();
+
+  return detail::generate_random_number<NumberT, DistributionT, EngineT>
+    (lower, upper);
+}
+
+// Draw from {min, max}. The bounds may have different (but compatible)
+// numeric types; both are converted to the larger of the two, which is also
+// the return type.
+template <typename NumberT0, typename NumberT1,
+          typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+         >
+NumberT generate_random_number(const NumberT0 min, const NumberT1 max)
+{
+  const auto lower = static_cast<NumberT>(min);
+  const auto upper = static_cast<NumberT>(max);
+
+  return detail::generate_random_number<NumberT,
+                                        detail::default_distribution<NumberT>,
+                                        std::default_random_engine>
+    (lower, upper);
+}
+
+// Draw from {min, max} with a caller-chosen distribution type and engine.
+// DistributionT cannot be deduced, so NumberT0, NumberT1 and DistributionT
+// have to be named explicitly at the call site.
+// NOTE(review): the function parameters are typed NumberT rather than
+// NumberT0/NumberT1, so the casts below are no-ops and NumberT0/NumberT1 only
+// serve to compute NumberT -- confirm whether that is intended.
+template <typename NumberT0, typename NumberT1,
+          typename DistributionT,
+          typename EngineT,
+          typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+         >
+NumberT generate_random_number(const NumberT min, const NumberT max,
+                               EngineT& e)
+{
+  const auto lower = static_cast<NumberT>(min);
+  const auto upper = static_cast<NumberT>(max);
+
+  return detail::generate_random_number<NumberT, DistributionT, EngineT>
+    (lower, upper, e);
+}
+
+// Draw from {0, max}:
+template <typename NumberT>
+NumberT generate_random_number(const NumberT max)
+{
+  const NumberT lower = 0;
+  return generate_random_number<NumberT>(lower, max);
+}
+
+// Function object: owns a mutex, a random_device and a default_random_engine,
+// and draws NumberT values from the engine under the mutex. Movable, not
+// copyable.
+template <typename NumberT>
+class random_number_generator final
+{
+  std::mutex l;
+  std::random_device rd;
+  std::default_random_engine e;
+
+  using seed_type = std::default_random_engine::result_type;
+
+ public:
+  using number_type = NumberT;
+  using random_engine_type = std::default_random_engine;
+  using random_device_type = std::random_device;
+
+  // Seed the engine from std::random_device.
+  random_number_generator() {
+    detail::randomize_rng(l, e);
+  }
+
+  // Seed the engine with a fixed value.
+  explicit random_number_generator(const seed_type seed) {
+    detail::randomize_rng(seed, l, e);
+  }
+
+  // Only the engine state is taken from rhs; the mutex and random_device
+  // are default-constructed.
+  random_number_generator(random_number_generator&& rhs)
+    : e(std::move(rhs.e))
+  {}
+
+  random_number_generator(const random_number_generator&) = delete;
+  random_number_generator& operator=(const random_number_generator&) = delete;
+
+  // Direct access to the owned device/engine (not guarded by the mutex).
+  random_device_type& random_device() noexcept { return rd; }
+  random_engine_type& random_engine() noexcept { return e; }
+
+  // NOTE(review): no NumberT is passed here, so the detail overload falls
+  // back to its default (int) and the int result is converted to NumberT.
+  // Confirm this is intended for non-int NumberT.
+  NumberT operator()() {
+    return detail::generate_random_number(l, e);
+  }
+
+  // Draw from {0, max}.
+  NumberT operator()(const NumberT max) {
+    return detail::generate_random_number<NumberT>(max, l, e);
+  }
+
+  // Draw from {min, max}.
+  NumberT operator()(const NumberT min, const NumberT max) {
+    return detail::generate_random_number<NumberT>(min, max, l, e);
+  }
+
+  // Re-seed the engine under the mutex.
+  void seed(const seed_type n) {
+    detail::randomize_rng(n, l, e);
+  }
+};
+
+} // inline namespace version_*
+
+} // namespace ceph::util
+
+#endif
diff --git a/src/include/rangeset.h b/src/include/rangeset.h
new file mode 100644
index 00000000..e7e3d047
--- /dev/null
+++ b/src/include/rangeset.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_RANGESET_H
+#define CEPH_RANGESET_H
+
+/*
+ *
+ * my first container with iterator! it's pretty ugly.
+ *
+ */
+
+#include <map>
+
+//typedef int T;
+
+// Storage for a set of disjoint inclusive ranges, keyed by range start.
+template <class T>
+struct _rangeset_base {
+  std::map<T,T> ranges;  // pair(first,last) (inclusive, e.g. [first,last])
+
+  typedef typename std::map<T,T>::iterator mapit;
+
+  // get iterator for range including val. or ranges.end().
+  //
+  // The previous implementation had a stray "return" in front of "it--",
+  // so a value lying strictly inside a range was reported as absent
+  // whenever some later range existed.
+  mapit get_range_for(T val) {
+    // upper_bound gives the first range that starts strictly after val;
+    // the only range that can contain val is therefore its predecessor.
+    mapit it = ranges.upper_bound(val);
+    if (it == ranges.begin())
+      return ranges.end();   // map is empty, or every range starts after val
+    --it;                    // now it->first <= val
+    if (it->second >= val)
+      return it;
+    return ranges.end();
+  }
+
+};
+
+
+// Input iterator that walks every individual value covered by a map of
+// inclusive [first,last] ranges.
+//
+// The iterator refers to the map it was created from (it does not copy it),
+// so it is only usable while that map is alive and unmodified. The earlier
+// version copied the map but kept an iterator into the original, and then
+// compared that iterator with the copy's end().
+template <class T>
+class rangeset_iterator :
+  public std::iterator<std::input_iterator_tag, T>
+{
+  typedef std::map<T,T> rangemap;
+
+  rangemap *ranges = nullptr;          // map being walked; null if default-constructed
+  typename rangemap::iterator it{};    // range currently being walked
+  T current{};                         // value inside *it; meaningful only when not at end
+
+  // true when there is no current value to dereference
+  bool at_end() const {
+    return !ranges || it == ranges->end();
+  }
+
+public:
+  // cons
+  rangeset_iterator() {}
+
+  // pos: a position in owner (may be owner.end()); iteration starts at the
+  // first value of that range.
+  rangeset_iterator(typename std::map<T,T>::iterator& pos, std::map<T,T>& owner)
+    : ranges(&owner), it(pos) {
+    if (!at_end())
+      current = it->first;
+  }
+
+  // Equal when positioned on the same range and the same value. Two end
+  // iterators are equal regardless of their (unused) current value.
+  bool operator==(const rangeset_iterator<T>& rit) const {
+    if (it != rit.it)
+      return false;
+    return at_end() || current == rit.current;
+  }
+  bool operator!=(const rangeset_iterator<T>& rit) const {
+    return !(*this == rit);
+  }
+
+  T& operator*() {
+    return current;
+  }
+
+  // Advance to the next value, moving on to the next range once the current
+  // one is exhausted. Incrementing an end iterator is a no-op.
+  rangeset_iterator<T>& operator++() {
+    if (at_end())
+      return *this;
+    if (current < it->second) {
+      ++current;
+    } else {
+      ++it;
+      if (!at_end())
+        current = it->first;
+    }
+    return *this;
+  }
+
+  // Postfix form: advances and returns the iterator's previous state.
+  rangeset_iterator<T> operator++(int) {
+    rangeset_iterator<T> prev(*this);
+    ++*this;
+    return prev;
+  }
+};
+
+
+// A set of T values stored compactly as disjoint inclusive ranges.
+// _size tracks the number of individual values (not the number of ranges).
+template <class T>
+class rangeset
+{
+  typedef typename std::map<T,T>::iterator map_iterator;
+
+  _rangeset_base<T> theset;
+  inodeno_t _size;
+
+public:
+  rangeset() { _size = 0; }
+  typedef rangeset_iterator<T> iterator;
+
+  // iteration over individual values
+  iterator begin() {
+    map_iterator it = theset.ranges.begin();
+    return iterator(it, theset.ranges);
+  }
+
+  iterator end() {
+    map_iterator it = theset.ranges.end();
+    return iterator(it, theset.ranges);
+  }
+
+  // direct access to the underlying first->last map
+  map_iterator map_begin() {
+    return theset.ranges.begin();
+  }
+  map_iterator map_end() {
+    return theset.ranges.end();
+  }
+  int map_size() {
+    return theset.ranges.size();
+  }
+
+  // Add the inclusive range [v1,v2] as-is; no overlap check or merging
+  // with neighbouring ranges is performed here.
+  void map_insert(T v1, T v2) {
+    theset.ranges.insert(std::pair<T,T>(v1,v2));
+    _size += v2 - v1+1;
+  }
+
+
+  // true if val lies inside one of the stored ranges
+  bool contains(T val) {
+    if (theset.get_range_for(val) == theset.ranges.end()) return false;
+    ceph_assert(!empty());
+    return true;
+  }
+
+  // Add a single value that must not already be present, extending or
+  // joining adjacent ranges where possible.
+  void insert(T val) {
+    ceph_assert(!contains(val));
+
+    map_iterator left = theset.get_range_for(val-1);
+    map_iterator right = theset.get_range_for(val+1);
+
+    if (left != theset.ranges.end() &&
+        right != theset.ranges.end()) {
+      // join!
+      left->second = right->second;
+      theset.ranges.erase(right);
+      _size++;
+      return;
+    }
+
+    if (left != theset.ranges.end()) {
+      // add to left range
+      left->second = val;
+      _size++;
+      return;
+    }
+
+    if (right != theset.ranges.end()) {
+      // add to right range: re-key it so that it starts at val
+      theset.ranges.insert(std::pair<T,T>(val, right->second));
+      theset.ranges.erase(val+1);
+      _size++;
+      return;
+    }
+
+    // new range
+    theset.ranges.insert(std::pair<T,T>(val,val));
+    _size++;
+    return;
+  }
+
+  // Number of individual values held. (Previously this called itself and
+  // never returned.)
+  unsigned size() {
+    return _size;
+  }
+
+  bool empty() {
+    if (theset.ranges.empty()) {
+      ceph_assert(_size == 0);
+      return true;
+    }
+    ceph_assert(_size>0);
+    return false;
+  }
+
+
+  // smallest range start; the set must not be empty
+  T first() {
+    ceph_assert(!empty());
+    map_iterator it = theset.ranges.begin();
+    return it->first;
+  }
+
+  // Remove a single value that must be present, shrinking or splitting the
+  // range that holds it.
+  void erase(T val) {
+    ceph_assert(contains(val));
+    map_iterator it = theset.get_range_for(val);
+    ceph_assert(it != theset.ranges.end());
+
+    // entire range
+    if (val == it->first && val == it->second) {
+      theset.ranges.erase(it);
+      _size--;
+      return;
+    }
+
+    // beginning
+    if (val == it->first) {
+      theset.ranges.insert(std::pair<T,T>(val+1, it->second));
+      theset.ranges.erase(it);
+      _size--;
+      return;
+    }
+
+    // end
+    if (val == it->second) {
+      it->second = val-1;
+      _size--;
+      return;
+    }
+
+    // middle split: keep the existing entry as the left half [first,val-1]
+    // and add the right half [val+1,last]. Inserting the left half under the
+    // already-present key would be a no-op, and erasing the entry afterwards
+    // would then drop the left half entirely.
+    const T last = it->second;
+    it->second = val-1;
+    theset.ranges.insert(std::pair<T,T>(val+1, last));
+    _size--;
+    return;
+  }
+
+  // print every range as " first-last", one per line
+  void dump() {
+    for (typename std::map<T,T>::iterator it = theset.ranges.begin();
+         it != theset.ranges.end();
+         it++) {
+      std::cout << " " << it->first << "-" << it->second << std::endl;
+    }
+  }
+
+};
+
+
+#endif
diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h
new file mode 100644
index 00000000..89c54a36
--- /dev/null
+++ b/src/include/rbd/features.h
@@ -0,0 +1,102 @@
+#ifndef CEPH_RBD_FEATURES_H
+#define CEPH_RBD_FEATURES_H
+
+#define RBD_FEATURE_LAYERING (1ULL<<0)
+#define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
+#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP (1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF (1ULL<<4)
+#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5)
+#define RBD_FEATURE_JOURNALING (1ULL<<6)
+#define RBD_FEATURE_DATA_POOL (1ULL<<7)
+#define RBD_FEATURE_OPERATIONS (1ULL<<8)
+#define RBD_FEATURE_MIGRATING (1ULL<<9)
+
+#define RBD_FEATURES_DEFAULT (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN)
+
+#define RBD_FEATURE_NAME_LAYERING "layering"
+#define RBD_FEATURE_NAME_STRIPINGV2 "striping"
+#define RBD_FEATURE_NAME_EXCLUSIVE_LOCK "exclusive-lock"
+#define RBD_FEATURE_NAME_OBJECT_MAP "object-map"
+#define RBD_FEATURE_NAME_FAST_DIFF "fast-diff"
+#define RBD_FEATURE_NAME_DEEP_FLATTEN "deep-flatten"
+#define RBD_FEATURE_NAME_JOURNALING "journaling"
+#define RBD_FEATURE_NAME_DATA_POOL "data-pool"
+#define RBD_FEATURE_NAME_OPERATIONS "operations"
+#define RBD_FEATURE_NAME_MIGRATING "migrating"
+
+/// features that make an image inaccessible for read or write by
+/// clients that don't understand them
+#define RBD_FEATURES_INCOMPATIBLE (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_DATA_POOL)
+
+/// features that make an image unwritable by clients that don't understand them
+#define RBD_FEATURES_RW_INCOMPATIBLE (RBD_FEATURES_INCOMPATIBLE | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING)
+
+#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING)
+
+/// features that may be dynamically enabled or disabled
+#define RBD_FEATURES_MUTABLE (RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_JOURNALING)
+
+/// features that may be dynamically disabled
+#define RBD_FEATURES_DISABLE_ONLY (RBD_FEATURE_DEEP_FLATTEN)
+
+/// features that only work when used with a single client
+/// using the image for writes
+#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_JOURNALING)
+
+/// features that will be implicitly enabled
+#define RBD_FEATURES_IMPLICIT_ENABLE (RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING)
+
+/// features that cannot be controlled by the user
+#define RBD_FEATURES_INTERNAL (RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING)
+
+#define RBD_OPERATION_FEATURE_CLONE_PARENT (1ULL<<0)
+#define RBD_OPERATION_FEATURE_CLONE_CHILD (1ULL<<1)
+#define RBD_OPERATION_FEATURE_GROUP (1ULL<<2)
+#define RBD_OPERATION_FEATURE_SNAP_TRASH (1ULL<<3)
+
+#define RBD_OPERATION_FEATURE_NAME_CLONE_PARENT "clone-parent"
+#define RBD_OPERATION_FEATURE_NAME_CLONE_CHILD "clone-child"
+#define RBD_OPERATION_FEATURE_NAME_GROUP "group"
+#define RBD_OPERATION_FEATURE_NAME_SNAP_TRASH "snap-trash"
+
+/// all valid operation features
+#define RBD_OPERATION_FEATURES_ALL (RBD_OPERATION_FEATURE_CLONE_PARENT | \
+ RBD_OPERATION_FEATURE_CLONE_CHILD | \
+ RBD_OPERATION_FEATURE_GROUP | \
+ RBD_OPERATION_FEATURE_SNAP_TRASH)
+
+#endif
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
new file mode 100644
index 00000000..522a6fb6
--- /dev/null
+++ b/src/include/rbd/librbd.h
@@ -0,0 +1,1243 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRBD_H
+#define CEPH_LIBRBD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <stdbool.h>
+#include <string.h>
+#include <sys/uio.h>
+#include "../rados/librados.h"
+#include "features.h"
+
+#define LIBRBD_VER_MAJOR 1
+#define LIBRBD_VER_MINOR 12
+#define LIBRBD_VER_EXTRA 0
+
+#define LIBRBD_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* packs maj/min/extra into one integer; each argument is parenthesized so expression arguments expand correctly */
+
+#define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
+
+#define LIBRBD_SUPPORTS_AIO_FLUSH 1
+#define LIBRBD_SUPPORTS_AIO_OPEN 1
+#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE 1
+#define LIBRBD_SUPPORTS_LOCKING 1
+#define LIBRBD_SUPPORTS_INVALIDATE 1
+#define LIBRBD_SUPPORTS_IOVEC 1
+#define LIBRBD_SUPPORTS_WATCH 0
+#define LIBRBD_SUPPORTS_WRITESAME 1
+#define LIBRBD_SUPPORTS_WRITE_ZEROES 1
+
+#if __GNUC__ >= 4
+ #define CEPH_RBD_API __attribute__ ((visibility ("default")))
+#else
+ #define CEPH_RBD_API
+#endif
+
+#define RBD_FLAG_OBJECT_MAP_INVALID (1<<0)
+#define RBD_FLAG_FAST_DIFF_INVALID (1<<1)
+
+typedef void *rbd_image_t;
+typedef void *rbd_image_options_t;
+typedef void *rbd_pool_stats_t;
+
+typedef void *rbd_completion_t;
+typedef void (*rbd_callback_t)(rbd_completion_t cb, void *arg);
+
+typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr);
+
+typedef void (*rbd_update_callback_t)(void *arg);
+
+typedef enum {
+ RBD_SNAP_NAMESPACE_TYPE_USER = 0,
+ RBD_SNAP_NAMESPACE_TYPE_GROUP = 1,
+ RBD_SNAP_NAMESPACE_TYPE_TRASH = 2
+} rbd_snap_namespace_type_t;
+
+typedef struct {
+ char *id;
+ char *name;
+} rbd_image_spec_t;
+
+typedef struct {
+ int64_t pool_id;
+ char *pool_name;
+ char *pool_namespace;
+ char *image_id;
+ char *image_name;
+ bool trash;
+} rbd_linked_image_spec_t;
+
+typedef struct {
+ uint64_t id;
+ rbd_snap_namespace_type_t namespace_type;
+ char *name;
+} rbd_snap_spec_t;
+
+typedef struct {
+ uint64_t id;
+ uint64_t size;
+ const char *name;
+} rbd_snap_info_t;
+
+typedef struct {
+ const char *pool_name;
+ const char *image_name;
+ const char *image_id;
+ bool trash;
+} rbd_child_info_t;
+
+#define RBD_MAX_IMAGE_NAME_SIZE 96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+#define RBD_SNAP_REMOVE_UNPROTECT (1 << 0) /* bit 0 */
+#define RBD_SNAP_REMOVE_FLATTEN (1 << 1) /* bit 1 */
+#define RBD_SNAP_REMOVE_FORCE (RBD_SNAP_REMOVE_UNPROTECT | RBD_SNAP_REMOVE_FLATTEN) /* both bits; the single-bit values are parenthesized so they expand safely inside larger expressions */
+
+/**
+ * These types are used in rbd_set_image_notification() to indicate the type of event
+ * socket passed in.
+ */
+enum {
+ EVENT_TYPE_PIPE = 1,
+ EVENT_TYPE_EVENTFD = 2
+};
+
+typedef struct {
+ uint64_t size;
+ uint64_t obj_size;
+ uint64_t num_objs;
+ int order;
+ char block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE]; /* deprecated */
+ int64_t parent_pool; /* deprecated */
+ char parent_name[RBD_MAX_IMAGE_NAME_SIZE]; /* deprecated */
+} rbd_image_info_t;
+
+typedef enum {
+ RBD_MIRROR_MODE_DISABLED, /* mirroring is disabled */
+ RBD_MIRROR_MODE_IMAGE, /* mirroring enabled on a per-image basis */
+ RBD_MIRROR_MODE_POOL /* mirroring enabled on all journaled images */
+} rbd_mirror_mode_t;
+
+typedef enum {
+ RBD_MIRROR_PEER_DIRECTION_RX = 0,
+ RBD_MIRROR_PEER_DIRECTION_TX = 1,
+ RBD_MIRROR_PEER_DIRECTION_RX_TX = 2
+} rbd_mirror_peer_direction_t;
+
+typedef struct {
+ char *uuid;
+ char *cluster_name;
+ char *client_name;
+} rbd_mirror_peer_t;
+
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_MON_HOST "mon_host"
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_KEY "key"
+
+typedef enum {
+ RBD_MIRROR_IMAGE_DISABLING = 0,
+ RBD_MIRROR_IMAGE_ENABLED = 1,
+ RBD_MIRROR_IMAGE_DISABLED = 2
+} rbd_mirror_image_state_t;
+
+typedef struct {
+ char *global_id;
+ rbd_mirror_image_state_t state;
+ bool primary;
+} rbd_mirror_image_info_t;
+
+typedef enum {
+ MIRROR_IMAGE_STATUS_STATE_UNKNOWN = 0,
+ MIRROR_IMAGE_STATUS_STATE_ERROR = 1,
+ MIRROR_IMAGE_STATUS_STATE_SYNCING = 2,
+ MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY = 3,
+ MIRROR_IMAGE_STATUS_STATE_REPLAYING = 4,
+ MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY = 5,
+ MIRROR_IMAGE_STATUS_STATE_STOPPED = 6,
+} rbd_mirror_image_status_state_t;
+
+typedef struct {
+ char *name;
+ rbd_mirror_image_info_t info;
+ rbd_mirror_image_status_state_t state;
+ char *description;
+ time_t last_update;
+ bool up;
+} rbd_mirror_image_status_t;
+
+typedef enum {
+ RBD_GROUP_IMAGE_STATE_ATTACHED,
+ RBD_GROUP_IMAGE_STATE_INCOMPLETE
+} rbd_group_image_state_t;
+
+typedef struct {
+ char *name;
+ int64_t pool;
+ rbd_group_image_state_t state;
+} rbd_group_image_info_t;
+
+typedef struct {
+ char *name;
+ int64_t pool;
+} rbd_group_info_t;
+
+typedef enum {
+ RBD_GROUP_SNAP_STATE_INCOMPLETE,
+ RBD_GROUP_SNAP_STATE_COMPLETE
+} rbd_group_snap_state_t;
+
+typedef struct {
+ char *name;
+ rbd_group_snap_state_t state;
+} rbd_group_snap_info_t;
+
+typedef struct {
+ int64_t group_pool;
+ char *group_name;
+ char *group_snap_name;
+} rbd_snap_group_namespace_t;
+
+typedef enum {
+ RBD_LOCK_MODE_EXCLUSIVE = 0,
+ RBD_LOCK_MODE_SHARED = 1,
+} rbd_lock_mode_t;
+
+CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra);
+
+/* image options */
+enum {
+ RBD_IMAGE_OPTION_FORMAT = 0,
+ RBD_IMAGE_OPTION_FEATURES = 1,
+ RBD_IMAGE_OPTION_ORDER = 2,
+ RBD_IMAGE_OPTION_STRIPE_UNIT = 3,
+ RBD_IMAGE_OPTION_STRIPE_COUNT = 4,
+ RBD_IMAGE_OPTION_JOURNAL_ORDER = 5,
+ RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6,
+ RBD_IMAGE_OPTION_JOURNAL_POOL = 7,
+ RBD_IMAGE_OPTION_FEATURES_SET = 8,
+ RBD_IMAGE_OPTION_FEATURES_CLEAR = 9,
+ RBD_IMAGE_OPTION_DATA_POOL = 10,
+ RBD_IMAGE_OPTION_FLATTEN = 11,
+ RBD_IMAGE_OPTION_CLONE_FORMAT = 12,
+};
+
+typedef enum {
+ RBD_TRASH_IMAGE_SOURCE_USER = 0,
+ RBD_TRASH_IMAGE_SOURCE_MIRRORING = 1,
+ RBD_TRASH_IMAGE_SOURCE_MIGRATION = 2,
+ RBD_TRASH_IMAGE_SOURCE_REMOVING = 3
+} rbd_trash_image_source_t;
+
+typedef struct {
+ char *id;
+ char *name;
+ rbd_trash_image_source_t source;
+ time_t deletion_time;
+ time_t deferment_end_time;
+} rbd_trash_image_info_t;
+
+typedef struct {
+ char *addr;
+ int64_t id;
+ uint64_t cookie;
+} rbd_image_watcher_t;
+
+typedef enum {
+ RBD_IMAGE_MIGRATION_STATE_UNKNOWN = -1,
+ RBD_IMAGE_MIGRATION_STATE_ERROR = 0,
+ RBD_IMAGE_MIGRATION_STATE_PREPARING = 1,
+ RBD_IMAGE_MIGRATION_STATE_PREPARED = 2,
+ RBD_IMAGE_MIGRATION_STATE_EXECUTING = 3,
+ RBD_IMAGE_MIGRATION_STATE_EXECUTED = 4,
+ RBD_IMAGE_MIGRATION_STATE_ABORTING = 5,
+} rbd_image_migration_state_t;
+
+typedef struct {
+ int64_t source_pool_id;
+ char *source_pool_namespace;
+ char *source_image_name;
+ char *source_image_id;
+ int64_t dest_pool_id;
+ char *dest_pool_namespace;
+ char *dest_image_name;
+ char *dest_image_id;
+ rbd_image_migration_state_t state;
+ char *state_description;
+} rbd_image_migration_status_t;
+
+typedef enum {
+ RBD_CONFIG_SOURCE_CONFIG = 0,
+ RBD_CONFIG_SOURCE_POOL = 1,
+ RBD_CONFIG_SOURCE_IMAGE = 2,
+} rbd_config_source_t;
+
+typedef struct {
+ char *name;
+ char *value;
+ rbd_config_source_t source;
+} rbd_config_option_t;
+
+typedef enum {
+ RBD_POOL_STAT_OPTION_IMAGES,
+ RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS,
+ RBD_POOL_STAT_OPTION_TRASH_IMAGES,
+ RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS
+} rbd_pool_stat_option_t;
+
+CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts);
+CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts,
+ int optname, const char* optval);
+CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts,
+ int optname, uint64_t optval);
+CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts,
+ int optname, char* optval,
+ size_t maxlen);
+CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts,
+ int optname, uint64_t* optval);
+CEPH_RBD_API int rbd_image_options_is_set(rbd_image_options_t opts,
+ int optname, bool* is_set);
+CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname);
+CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts);
+
+/* helpers */
+CEPH_RBD_API void rbd_image_spec_cleanup(rbd_image_spec_t *image);
+CEPH_RBD_API void rbd_image_spec_list_cleanup(rbd_image_spec_t *images,
+ size_t num_images);
+CEPH_RBD_API void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image);
+CEPH_RBD_API void rbd_linked_image_spec_list_cleanup(
+ rbd_linked_image_spec_t *images, size_t num_images);
+CEPH_RBD_API void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap);
+
+/* images */
+CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size)
+ __attribute__((deprecated));
+CEPH_RBD_API int rbd_list2(rados_ioctx_t io, rbd_image_spec_t* images,
+ size_t *max_images);
+
+CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size,
+ int *order);
+CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size,
+ uint64_t features, int *order);
+/**
+ * create new rbd image
+ *
+ * The stripe_unit must be a factor of the object size (1 << order).
+ * The stripe_count can be one (no intra-object striping) or greater
+ * than one. The RBD_FEATURE_STRIPINGV2 must be specified if the
+ * stripe_unit != the object size and the stripe_count is != 1.
+ *
+ * @param io ioctx
+ * @param name image name
+ * @param size image size in bytes
+ * @param features initial feature bits
+ * @param order object/block size, as a power of two (object size == 1 << order)
+ * @param stripe_unit stripe unit size, in bytes.
+ * @param stripe_count number of objects to stripe over before looping
+ * @return 0 on success, or negative error code
+ */
+CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size,
+ uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count);
+CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size,
+ rbd_image_options_t opts);
+CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order);
+CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order,
+ uint64_t stripe_unit, int stripe_count);
+CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, rbd_image_options_t c_opts);
+CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name);
+CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
+ const char *destname);
+
+CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name,
+ uint64_t delay);
+CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id,
+ rbd_trash_image_info_t *info);
+CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info);
+CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io,
+ rbd_trash_image_info_t *trash_entries,
+ size_t *num_entries);
+CEPH_RBD_API void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries,
+ size_t num_entries);
+CEPH_RBD_API int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, float threshold);
+CEPH_RBD_API int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts,
+ float threshold, librbd_progress_fn_t cb,
+ void* cbdata);
+CEPH_RBD_API int rbd_trash_remove(rados_ioctx_t io, const char *id, bool force);
+CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io,
+ const char *id,
+ bool force,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id,
+ const char *name);
+
+/* migration */
+CEPH_RBD_API int rbd_migration_prepare(rados_ioctx_t ioctx,
+ const char *image_name,
+ rados_ioctx_t dest_ioctx,
+ const char *dest_image_name,
+ rbd_image_options_t opts);
+CEPH_RBD_API int rbd_migration_execute(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_execute_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_abort(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_abort_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_commit(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_commit_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_status(rados_ioctx_t ioctx,
+ const char *image_name,
+ rbd_image_migration_status_t *status,
+ size_t status_size);
+CEPH_RBD_API void rbd_migration_status_cleanup(
+ rbd_image_migration_status_t *status);
+
+/* pool mirroring */
+CEPH_RBD_API int rbd_mirror_site_name_get(rados_t cluster,
+ char *name, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_site_name_set(rados_t cluster,
+ const char *name);
+
+CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx,
+ rbd_mirror_mode_t *mirror_mode);
+CEPH_RBD_API int rbd_mirror_mode_set(rados_ioctx_t io_ctx,
+ rbd_mirror_mode_t mirror_mode);
+
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_create(rados_ioctx_t io_ctx,
+ char *token, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_import(
+ rados_ioctx_t io_ctx, rbd_mirror_peer_direction_t direction,
+ const char *token);
+
+CEPH_RBD_API int rbd_mirror_peer_add(rados_ioctx_t io_ctx,
+ char *uuid, size_t uuid_max_length,
+ const char *cluster_name,
+ const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_remove(rados_ioctx_t io_ctx,
+ const char *uuid);
+CEPH_RBD_API int rbd_mirror_peer_list(rados_ioctx_t io_ctx,
+ rbd_mirror_peer_t *peers, int *max_peers);
+CEPH_RBD_API void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers,
+ int max_peers);
+CEPH_RBD_API int rbd_mirror_peer_set_client(rados_ioctx_t io_ctx,
+ const char *uuid,
+ const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_set_cluster(rados_ioctx_t io_ctx,
+ const char *uuid,
+ const char *cluster_name);
+CEPH_RBD_API int rbd_mirror_peer_get_attributes(
+ rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+ char *values, size_t *max_value_len, size_t *key_value_count);
+CEPH_RBD_API int rbd_mirror_peer_set_attributes(
+ rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+ size_t key_value_count);
+
+CEPH_RBD_API int rbd_mirror_image_status_list(rados_ioctx_t io_ctx,
+ const char *start_id, size_t max,
+ char **image_ids,
+ rbd_mirror_image_status_t *images,
+ size_t *len);
+CEPH_RBD_API void rbd_mirror_image_status_list_cleanup(char **image_ids,
+ rbd_mirror_image_status_t *images, size_t len);
+CEPH_RBD_API int rbd_mirror_image_status_summary(rados_ioctx_t io_ctx,
+ rbd_mirror_image_status_state_t *states, int *counts, size_t *maxlen);
+
+CEPH_RBD_API int rbd_mirror_image_instance_id_list(rados_ioctx_t io_ctx,
+ const char *start_id,
+ size_t max, char **image_ids,
+ char **instance_ids,
+ size_t *len);
+CEPH_RBD_API void rbd_mirror_image_instance_id_list_cleanup(char **image_ids,
+ char **instance_ids,
+ size_t len);
+
+/* pool metadata */
+CEPH_RBD_API int rbd_pool_metadata_get(rados_ioctx_t io_ctx, const char *key,
+ char *value, size_t *val_len);
+CEPH_RBD_API int rbd_pool_metadata_set(rados_ioctx_t io_ctx, const char *key,
+ const char *value);
+CEPH_RBD_API int rbd_pool_metadata_remove(rados_ioctx_t io_ctx,
+ const char *key);
+CEPH_RBD_API int rbd_pool_metadata_list(rados_ioctx_t io_ctx, const char *start,
+ uint64_t max, char *keys,
+ size_t *key_len, char *values,
+ size_t *vals_len);
+
+CEPH_RBD_API int rbd_config_pool_list(rados_ioctx_t io_ctx,
+ rbd_config_option_t *options,
+ int *max_options);
+CEPH_RBD_API void rbd_config_pool_list_cleanup(rbd_config_option_t *options,
+ int max_options);
+
+CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name);
+
+CEPH_RBD_API int rbd_aio_open(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+
+/**
+ * Open an image in read-only mode.
+ *
+ * This is intended for use by clients that cannot write to a block
+ * device due to cephx restrictions. There will be no watch
+ * established on the header object, since a watch is a write. This
+ * means the metadata reported about this image (parents, snapshots,
+ * size, etc.) may become stale. This should not be used for
+ * long-running operations, unless you can be sure that one of these
+ * properties changing is safe.
+ *
+ * Attempting to write to a read-only image will return -EROFS.
+ *
+ * @param io ioctx to determine the pool the image is in
+ * @param name image name
+ * @param image where to store newly opened image handle
+ * @param snap_name name of snapshot to open at, or NULL for no snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_open_read_only(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id_read_only(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_aio_open_read_only(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id_read_only(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_close(rbd_image_t image);
+CEPH_RBD_API int rbd_aio_close(rbd_image_t image, rbd_completion_t c);
+CEPH_RBD_API int rbd_resize(rbd_image_t image, uint64_t size);
+CEPH_RBD_API int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_resize_with_progress(rbd_image_t image, uint64_t size,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info,
+ size_t infosize);
+CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old);
+CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size);
+CEPH_RBD_API int rbd_get_features(rbd_image_t image, uint64_t *features);
+CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features,
+ uint8_t enabled);
+CEPH_RBD_API int rbd_get_op_features(rbd_image_t image, uint64_t *op_features);
+CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit);
+CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image,
+ uint64_t *stripe_count);
+
+CEPH_RBD_API int rbd_get_create_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_access_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_modify_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_get_overlap(rbd_image_t image, uint64_t *overlap);
+CEPH_RBD_API int rbd_get_name(rbd_image_t image, char *name, size_t *name_len);
+CEPH_RBD_API int rbd_get_id(rbd_image_t image, char *id, size_t id_len);
+CEPH_RBD_API int rbd_get_block_name_prefix(rbd_image_t image,
+ char *prefix, size_t prefix_len);
+CEPH_RBD_API int64_t rbd_get_data_pool_id(rbd_image_t image);
+
+CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image,
+ char *parent_poolname, size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_snapname,
+ size_t psnapnamelen)
+ __attribute__((deprecated));
+CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image,
+ char *parent_poolname,
+ size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_id, size_t pidlen,
+ char *parent_snapname,
+ size_t psnapnamelen)
+ __attribute__((deprecated));
+CEPH_RBD_API int rbd_get_parent(rbd_image_t image,
+ rbd_linked_image_spec_t *parent_image,
+ rbd_snap_spec_t *parent_snap);
+
+CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags);
+CEPH_RBD_API int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info,
+ size_t group_info_size);
+CEPH_RBD_API int rbd_set_image_notification(rbd_image_t image, int fd, int type);
+
+/* exclusive lock feature */
+CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner);
+CEPH_RBD_API int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode);
+CEPH_RBD_API int rbd_lock_release(rbd_image_t image);
+CEPH_RBD_API int rbd_lock_get_owners(rbd_image_t image,
+ rbd_lock_mode_t *lock_mode,
+ char **lock_owners,
+ size_t *max_lock_owners);
+CEPH_RBD_API void rbd_lock_get_owners_cleanup(char **lock_owners,
+ size_t lock_owner_count);
+CEPH_RBD_API int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode,
+ const char *lock_owner);
+
+/* object map feature */
+CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image,
+ librbd_progress_fn_t cb, void *cbdata);
+
+CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx,
+ const char *destname);
+CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest);
+CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname, rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_copy4(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname, rbd_image_options_t dest_opts,
+ size_t sparse_size);
+CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image,
+ rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress4(rbd_image_t image,
+ rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb, void *cbdata,
+ size_t sparse_size);
+
+/* deep copy */
+CEPH_RBD_API int rbd_deep_copy(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname,
+ rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_deep_copy_with_progress(rbd_image_t image,
+ rados_ioctx_t dest_io_ctx,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+/* snapshots */
+CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+ int *max_snaps);
+CEPH_RBD_API void rbd_snap_list_end(rbd_snap_info_t *snaps);
+CEPH_RBD_API int rbd_snap_create(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_remove(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_remove2(rbd_image_t image, const char *snap_name,
+ uint32_t flags, librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id);
+CEPH_RBD_API int rbd_snap_rollback(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image,
+ const char *snapname,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname,
+ const char* dstsnapsname);
+/**
+ * Prevent a snapshot from being deleted until it is unprotected.
+ *
+ * @param snap_name which snapshot to protect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if snap is already protected
+ */
+CEPH_RBD_API int rbd_snap_protect(rbd_image_t image, const char *snap_name);
+/**
+ * Allow a snapshot to be deleted.
+ *
+ * @param snap_name which snapshot to unprotect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snap is not protected
+ */
+CEPH_RBD_API int rbd_snap_unprotect(rbd_image_t image, const char *snap_name);
+/**
+ * Determine whether a snapshot is protected.
+ *
+ * @param snap_name which snapshot to query
+ * @param is_protected where to store the result (0 or 1)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+ int *is_protected);
+/**
+ * Get the current snapshot limit for an image. If no limit is set,
+ * UINT64_MAX is returned.
+ *
+ * @param limit pointer where the limit will be stored on success
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit);
+
+/**
+ * Set a limit for the number of snapshots that may be taken of an image.
+ *
+ * @param limit the maximum number of snapshots allowed in the future.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_set_limit(rbd_image_t image, uint64_t limit);
+
+/**
+ * Get the timestamp of a snapshot for an image.
+ *
+ * @param snap_id the snap id of a snapshot of input image.
+ * @param timestamp the timestamp of input snapshot.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_snap_set(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id);
+
+CEPH_RBD_API int rbd_snap_get_namespace_type(rbd_image_t image,
+ uint64_t snap_id,
+ rbd_snap_namespace_type_t *namespace_type);
+CEPH_RBD_API int rbd_snap_get_group_namespace(rbd_image_t image,
+ uint64_t snap_id,
+ rbd_snap_group_namespace_t *group_snap,
+ size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap,
+ size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_get_trash_namespace(rbd_image_t image,
+ uint64_t snap_id,
+ char* original_name,
+ size_t max_length);
+
+CEPH_RBD_API int rbd_flatten(rbd_image_t image);
+
+CEPH_RBD_API int rbd_flatten_with_progress(rbd_image_t image,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+CEPH_RBD_API int rbd_sparsify(rbd_image_t image, size_t sparse_size);
+
+CEPH_RBD_API int rbd_sparsify_with_progress(rbd_image_t image,
+ size_t sparse_size,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+/**
+ * List all images that are cloned from the image at the
+ * snapshot that is set via rbd_snap_set().
+ *
+ * This iterates over all pools, so it should be run by a user with
+ * read access to all of them. pools_len and images_len are filled in
+ * with the number of bytes put into the pools and images buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the pool and image names
+ * of the children, with a '\0' after each.
+ *
+ * @param image which image (and implicitly snapshot) to list clones of
+ * @param pools buffer in which to store pool names
+ * @param pools_len number of bytes in pools buffer
+ * @param images buffer in which to store image names
+ * @param images_len number of bytes in images buffer
+ * @returns number of children on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API ssize_t rbd_list_children(rbd_image_t image, char *pools,
+ size_t *pools_len, char *images,
+ size_t *images_len)
+ __attribute__((deprecated));
+CEPH_RBD_API int rbd_list_children2(rbd_image_t image,
+ rbd_child_info_t *children,
+ int *max_children)
+ __attribute__((deprecated));
+CEPH_RBD_API void rbd_list_child_cleanup(rbd_child_info_t *child)
+ __attribute__((deprecated));
+CEPH_RBD_API void rbd_list_children_cleanup(rbd_child_info_t *children,
+ size_t num_children)
+ __attribute__((deprecated));
+
+CEPH_RBD_API int rbd_list_children3(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images);
+
+CEPH_RBD_API int rbd_list_descendants(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images);
+
+/**
+ * @defgroup librbd_h_locking Advisory Locking
+ *
+ * An rbd image may be locked exclusively, or shared, to facilitate
+ * e.g. live migration where the image may be open in two places at once.
+ * These locks are intended to guard against more than one client
+ * writing to an image without coordination. They don't need to
+ * be used for snapshots, since snapshots are read-only.
+ *
+ * Currently locks only guard against locks being acquired.
+ * They do not prevent anything else.
+ *
+ * A locker is identified by the internal rados client id of the
+ * holder and a user-defined cookie. This (client id, cookie) pair
+ * must be unique for each locker.
+ *
+ * A shared lock also has a user-defined tag associated with it. Each
+ * additional shared lock must specify the same tag or lock
+ * acquisition will fail. This can be used by e.g. groups of hosts
+ * using a clustered filesystem on top of an rbd image to make sure
+ * they're accessing the correct image.
+ *
+ * @{
+ */
+/**
+ * List clients that have locked the image and information about the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the image
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RBD_API ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len);
+
+/**
+ * Take an exclusive lock on the image.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_exclusive(rbd_image_t image, const char *cookie);
+
+/**
+ * Take a shared lock on the image.
+ *
+ * Other clients may also take a shared lock, as long as they use the
+ * same tag.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag user-defined identifier for this shared use of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_shared(rbd_image_t image, const char *cookie,
+ const char *tag);
+
+/**
+ * Release a shared or exclusive lock on the image.
+ *
+ * @param image the image to unlock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_unlock(rbd_image_t image, const char *cookie);
+
+/**
+ * Release a shared or exclusive lock that was taken by the specified client.
+ *
+ * @param image the image to unlock
+ * @param client the entity holding the lock (as given by rbd_list_lockers())
+ * @param cookie user-defined identifier for the instance of the lock to break
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_break_lock(rbd_image_t image, const char *client,
+ const char *cookie);
+
+/** @} locking */
+
+/* I/O */
+CEPH_RBD_API ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf, int op_flags);
+/* DEPRECATED; use rbd_read_iterate2 */
+CEPH_RBD_API int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg);
+
+/**
+ * iterate read over an image
+ *
+ * Reads each region of the image and calls the callback. If the
+ * buffer pointer passed to the callback is NULL, the given extent is
+ * defined to be zeros (a hole). Normally the granularity for the
+ * callback is the image stripe size.
+ *
+ * @param image image to read
+ * @param ofs offset to start from
+ * @param len bytes of source image to cover
+ * @param cb callback for each region
+ * @returns 0 on success, error otherwise
+ */
+CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg);
+/**
+ * get difference between two versions of an image
+ *
+ * This will return the differences between two versions of an image
+ * via a callback, which gets the offset and length and a flag
+ * indicating whether the extent exists (1), or is known/defined to
+ * be zeros (a hole, 0). If the source snapshot name is NULL, we
+ * interpret that as the beginning of time and return all allocated
+ * regions of the image. The end version is whatever is currently
+ * selected for the image handle (either a snapshot or the writeable
+ * head).
+ *
+ * @param fromsnapname start snapshot name, or NULL
+ * @param ofs start offset
+ * @param len len in bytes of region to report on
+ * @param include_parent 1 if full history diff should include parent
+ * @param whole_object 1 if diff extents should cover whole object
+ * @param cb callback to call for each allocated region
+ * @param arg argument to pass to the callback
+ * @returns 0 on success, or negative error code on error
+ */
+CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image,
+ const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg);
+CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image,
+ const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ uint8_t include_parent, uint8_t whole_object,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg);
+CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, int op_flags);
+CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
+CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, size_t data_len,
+ int op_flags);
+CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs,
+ size_t len, int zero_flags,
+ int op_flags);
+CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
+ size_t len, const char *cmp_buf,
+ const char *buf,
+ uint64_t *mismatch_off,
+ int op_flags);
+
+CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c);
+
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c,
+ int op_flags);
+CEPH_RBD_API int rbd_aio_writev(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_readv(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, size_t data_len,
+ rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off,
+ size_t len, rbd_completion_t c,
+ int zero_flags, int op_flags);
+CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
+ uint64_t off, size_t len,
+ const char *cmp_buf,
+ const char *buf,
+ rbd_completion_t c,
+ uint64_t *mismatch_off,
+ int op_flags);
+
+CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
+ rbd_callback_t complete_cb,
+ rbd_completion_t *c);
+CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c);
+CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c);
+CEPH_RBD_API void *rbd_aio_get_arg(rbd_completion_t c);
+CEPH_RBD_API void rbd_aio_release(rbd_completion_t c);
+CEPH_RBD_API int rbd_flush(rbd_image_t image);
+/**
+ * Start a flush if caching is enabled. Get a callback when
+ * the currently pending writes are on disk.
+ *
+ * @param image the image to flush writes to
+ * @param c what to call when flushing is complete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
+
+/**
+ * Drop any cached data for an image
+ *
+ * @param image the image to invalidate cached data for
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image);
+
+CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp);
+
+CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len);
+CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value);
+CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key);
+/**
+ * List all metadatas associated with this image.
+ *
+ * This iterates over all metadata entries; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the keys and values
+ * of the image, with a '\0' after each.
+ *
+ * @param image the image whose metadata is to be listed
+ * @param start which key to begin listing after
+ * (use the empty string to start at the beginning)
+ * @param max the maximum number of keys to list (0 means no limit)
+ * @param keys buffer in which to store the metadata keys
+ * @param key_len number of bytes in the keys buffer
+ * @param values buffer in which to store the metadata values
+ * @param vals_len number of bytes in the values buffer
+ * @returns a non-negative value on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+ char *keys, size_t *key_len, char *values, size_t *vals_len);
+
+// RBD image mirroring support functions
+CEPH_RBD_API int rbd_mirror_image_enable(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_disable(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_promote(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_demote(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_resync(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *mirror_image_info,
+ size_t info_size);
+CEPH_RBD_API int rbd_mirror_image_get_status(rbd_image_t image,
+ rbd_mirror_image_status_t *mirror_image_status,
+ size_t status_size);
+CEPH_RBD_API int rbd_mirror_image_get_instance_id(rbd_image_t image,
+ char *instance_id,
+ size_t *id_max_length);
+CEPH_RBD_API int rbd_aio_mirror_image_promote(rbd_image_t image, bool force,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_demote(rbd_image_t image,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *mirror_image_info,
+ size_t info_size,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_status(rbd_image_t image,
+ rbd_mirror_image_status_t *mirror_image_status,
+ size_t status_size,
+ rbd_completion_t c);
+
+// RBD groups support functions
+CEPH_RBD_API int rbd_group_create(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_remove(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_list(rados_ioctx_t p, char *names, size_t *size);
+CEPH_RBD_API int rbd_group_rename(rados_ioctx_t p, const char *src_name,
+ const char *dest_name);
+CEPH_RBD_API int rbd_group_info_cleanup(rbd_group_info_t *group_info,
+ size_t group_info_size);
+
+/**
+ * Register an image metadata change watcher.
+ *
+ * @param image the image to watch
+ * @param handle where to store the internal id assigned to this watch
+ * @param watch_cb what to do when a notify is received on this image
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_watch(rbd_image_t image, uint64_t *handle,
+ rbd_update_callback_t watch_cb, void *arg);
+
+/**
+ * Unregister an image watcher.
+ *
+ * @param image the image to unwatch
+ * @param handle which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_unwatch(rbd_image_t image, uint64_t handle);
+
+/**
+ * List any watchers of an image.
+ *
+ * Watchers will be allocated and stored in the passed watchers array. If there
+ * are more watchers than max_watchers, -ERANGE will be returned and the number
+ * of watchers will be stored in max_watchers.
+ *
+ * The caller should call rbd_watchers_list_cleanup when finished with the list
+ * of watchers.
+ *
+ * @param image the image to list watchers for.
+ * @param watchers an array to store watchers in.
+ * @param max_watchers capacity of the watchers array.
+ * @returns 0 on success, negative error code on failure.
+ * @returns -ERANGE if there are too many watchers for the passed array.
+ * @returns the number of watchers in max_watchers.
+ */
+CEPH_RBD_API int rbd_watchers_list(rbd_image_t image,
+ rbd_image_watcher_t *watchers,
+ size_t *max_watchers);
+
+CEPH_RBD_API void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers,
+ size_t num_watchers);
+
+CEPH_RBD_API int rbd_config_image_list(rbd_image_t image,
+ rbd_config_option_t *options,
+ int *max_options);
+CEPH_RBD_API void rbd_config_image_list_cleanup(rbd_config_option_t *options,
+ int max_options);
+
+CEPH_RBD_API int rbd_group_image_add(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove_by_id(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_id);
+CEPH_RBD_API int rbd_group_image_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t *num_entries);
+CEPH_RBD_API int rbd_group_image_list_cleanup(rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t num_entries);
+
+CEPH_RBD_API int rbd_group_snap_create(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rename(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *old_snap_name,
+ const char *new_snap_name);
+CEPH_RBD_API int rbd_group_snap_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t *num_entries);
+CEPH_RBD_API int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t num_entries);
+CEPH_RBD_API int rbd_group_snap_rollback(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+CEPH_RBD_API int rbd_namespace_create(rados_ioctx_t io,
+ const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_remove(rados_ioctx_t io,
+ const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_list(rados_ioctx_t io, char *namespace_names,
+ size_t *size);
+CEPH_RBD_API int rbd_namespace_exists(rados_ioctx_t io,
+ const char *namespace_name,
+ bool *exists);
+
+CEPH_RBD_API int rbd_pool_init(rados_ioctx_t io, bool force);
+
+CEPH_RBD_API void rbd_pool_stats_create(rbd_pool_stats_t *stats);
+CEPH_RBD_API void rbd_pool_stats_destroy(rbd_pool_stats_t stats);
+CEPH_RBD_API int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats,
+ int stat_option,
+ uint64_t* stat_val);
+CEPH_RBD_API int rbd_pool_stats_get(rados_ioctx_t io, rbd_pool_stats_t stats);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
new file mode 100644
index 00000000..646c6bb3
--- /dev/null
+++ b/src/include/rbd/librbd.hpp
@@ -0,0 +1,686 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __LIBRBD_HPP
+#define __LIBRBD_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <vector>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+#include "librbd.h"
+
+namespace librbd {
+
+ using librados::IoCtx;
+
+ class Image;
+ class ImageOptions;
+ class PoolStats;
+ typedef void *image_ctx_t;
+ typedef void *completion_t;
+ typedef void (*callback_t)(completion_t cb, void *arg);
+
+ typedef struct {
+ std::string id;
+ std::string name;
+ } image_spec_t;
+
+ typedef struct {
+ int64_t pool_id;
+ std::string pool_name;
+ std::string pool_namespace;
+ std::string image_id;
+ std::string image_name;
+ bool trash;
+ } linked_image_spec_t;
+
+ typedef rbd_snap_namespace_type_t snap_namespace_type_t;
+
+ typedef struct {
+ uint64_t id;
+ snap_namespace_type_t namespace_type;
+ std::string name;
+ } snap_spec_t;
+
+ typedef struct {
+ uint64_t id;
+ uint64_t size;
+ std::string name;
+ } snap_info_t;
+
+ typedef struct {
+ int64_t group_pool;
+ std::string group_name;
+ std::string group_snap_name;
+ } snap_group_namespace_t;
+
+ typedef struct {
+ std::string client;
+ std::string cookie;
+ std::string address;
+ } locker_t;
+
+ typedef rbd_mirror_peer_direction_t mirror_peer_direction_t;
+
+ typedef struct {
+ std::string uuid;
+ std::string cluster_name;
+ std::string client_name;
+ } mirror_peer_t;
+
+ typedef rbd_mirror_image_state_t mirror_image_state_t;
+
+ typedef struct {
+ std::string global_id;
+ mirror_image_state_t state;
+ bool primary;
+ } mirror_image_info_t;
+
+ typedef rbd_mirror_image_status_state_t mirror_image_status_state_t;
+
+ typedef struct {
+ std::string name;
+ mirror_image_info_t info;
+ mirror_image_status_state_t state;
+ std::string description;
+ time_t last_update;
+ bool up;
+ } mirror_image_status_t;
+
+ typedef rbd_group_image_state_t group_image_state_t;
+
+ typedef struct {
+ std::string name;
+ int64_t pool;
+ group_image_state_t state;
+ } group_image_info_t;
+
+ typedef struct {
+ std::string name;
+ int64_t pool;
+ } group_info_t;
+
+ typedef rbd_group_snap_state_t group_snap_state_t;
+
+ typedef struct {
+ std::string name;
+ group_snap_state_t state;
+ } group_snap_info_t;
+
+ typedef rbd_image_info_t image_info_t;
+
+ class CEPH_RBD_API ProgressContext
+ {
+ public:
+ virtual ~ProgressContext();
+ virtual int update_progress(uint64_t offset, uint64_t total) = 0;
+ };
+
+ typedef struct {
+ std::string id;
+ std::string name;
+ rbd_trash_image_source_t source;
+ time_t deletion_time;
+ time_t deferment_end_time;
+ } trash_image_info_t;
+
+ typedef struct {
+ std::string pool_name;
+ std::string image_name;
+ std::string image_id;
+ bool trash;
+ } child_info_t;
+
+ typedef struct {
+ std::string addr;
+ int64_t id;
+ uint64_t cookie;
+ } image_watcher_t;
+
+ typedef rbd_image_migration_state_t image_migration_state_t;
+
+ typedef struct {
+ int64_t source_pool_id;
+ std::string source_pool_namespace;
+ std::string source_image_name;
+ std::string source_image_id;
+ int64_t dest_pool_id;
+ std::string dest_pool_namespace;
+ std::string dest_image_name;
+ std::string dest_image_id;
+ image_migration_state_t state;
+ std::string state_description;
+ } image_migration_status_t;
+
+ typedef rbd_config_source_t config_source_t;
+
+ typedef struct {
+ std::string name;
+ std::string value;
+ config_source_t source;
+ } config_option_t;
+
+class CEPH_RBD_API RBD
+{
+public:
+ RBD();
+ ~RBD();
+
+ // This must be dynamically allocated with new, and
+ // must be released with release().
+ // Do not use delete.
+ struct AioCompletion {
+ void *pc;
+ AioCompletion(void *cb_arg, callback_t complete_cb);
+ bool is_complete();
+ int wait_for_complete();
+ ssize_t get_return_value();
+ void *get_arg();
+ void release();
+ };
+
+ void version(int *major, int *minor, int *extra);
+
+ int open(IoCtx& io_ctx, Image& image, const char *name);
+ int open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname);
+ int open_by_id(IoCtx& io_ctx, Image& image, const char *id);
+ int open_by_id(IoCtx& io_ctx, Image& image, const char *id, const char *snapname);
+ int aio_open(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname, RBD::AioCompletion *c);
+ int aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname, RBD::AioCompletion *c);
+ // see librbd.h
+ int open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname);
+ int open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname);
+ int aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname, RBD::AioCompletion *c);
+ int aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname, RBD::AioCompletion *c);
+
+ int list(IoCtx& io_ctx, std::vector<std::string>& names)
+ __attribute__((deprecated));
+ int list2(IoCtx& io_ctx, std::vector<image_spec_t>* images);
+
+ int create(IoCtx& io_ctx, const char *name, uint64_t size, int *order);
+ int create2(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order);
+ int create3(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count);
+ int create4(IoCtx& io_ctx, const char *name, uint64_t size,
+ ImageOptions& opts);
+ int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order);
+ int clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order, uint64_t stripe_unit, int stripe_count);
+ int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, ImageOptions& opts);
+ int remove(IoCtx& io_ctx, const char *name);
+ int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx);
+ int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
+
+ int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay);
+ int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info);
+ int trash_list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries);
+ int trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold);
+ int trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, float threshold,
+ ProgressContext &pctx);
+ int trash_remove(IoCtx &io_ctx, const char *image_id, bool force);
+ int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id,
+ bool force, ProgressContext &pctx);
+ int trash_restore(IoCtx &io_ctx, const char *id, const char *name);
+
+ // Migration
+ int migration_prepare(IoCtx& io_ctx, const char *image_name,
+ IoCtx& dest_io_ctx, const char *dest_image_name,
+ ImageOptions& opts);
+ int migration_execute(IoCtx& io_ctx, const char *image_name);
+ int migration_execute_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_abort(IoCtx& io_ctx, const char *image_name);
+ int migration_abort_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_commit(IoCtx& io_ctx, const char *image_name);
+ int migration_commit_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_status(IoCtx& io_ctx, const char *image_name,
+ image_migration_status_t *status, size_t status_size);
+
+ // RBD pool mirroring support functions
+ int mirror_site_name_get(librados::Rados& rados, std::string* site_name);
+ int mirror_site_name_set(librados::Rados& rados,
+ const std::string& site_name);
+
+ int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode);
+ int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode);
+
+ int mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token);
+ int mirror_peer_bootstrap_import(IoCtx& io_ctx,
+ mirror_peer_direction_t direction,
+ const std::string &token);
+
+ int mirror_peer_add(IoCtx& io_ctx, std::string *uuid,
+ const std::string &cluster_name,
+ const std::string &client_name);
+ int mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid);
+ int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers);
+ int mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &client_name);
+ int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &cluster_name);
+ int mirror_peer_get_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ std::map<std::string, std::string> *key_vals);
+ int mirror_peer_set_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ const std::map<std::string, std::string>& key_vals);
+
+ int mirror_image_status_list(IoCtx& io_ctx, const std::string &start_id,
+ size_t max, std::map<std::string, mirror_image_status_t> *images);
+ int mirror_image_status_summary(IoCtx& io_ctx,
+ std::map<mirror_image_status_state_t, int> *states);
+ int mirror_image_instance_id_list(IoCtx& io_ctx, const std::string &start_id,
+ size_t max, std::map<std::string, std::string> *sevice_ids);
+
+ // RBD groups support functions
+ int group_create(IoCtx& io_ctx, const char *group_name);
+ int group_remove(IoCtx& io_ctx, const char *group_name);
+ int group_list(IoCtx& io_ctx, std::vector<std::string> *names);
+ int group_rename(IoCtx& io_ctx, const char *src_group_name,
+ const char *dest_group_name);
+
+ int group_image_add(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_name);
+ int group_image_remove(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_name);
+ int group_image_remove_by_id(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_id);
+ int group_image_list(IoCtx& io_ctx, const char *group_name,
+ std::vector<group_image_info_t> *images,
+ size_t group_image_info_size);
+
+ int group_snap_create(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_remove(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_rename(IoCtx& group_ioctx, const char *group_name,
+ const char *old_snap_name, const char *new_snap_name);
+ int group_snap_list(IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_snap_info_t> *snaps,
+ size_t group_snap_info_size);
+ int group_snap_rollback(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_rollback_with_progress(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name,
+ ProgressContext& pctx);
+
+ int namespace_create(IoCtx& ioctx, const char *namespace_name);
+ int namespace_remove(IoCtx& ioctx, const char *namespace_name);
+ int namespace_list(IoCtx& io_ctx, std::vector<std::string>* namespace_names);
+ int namespace_exists(IoCtx& io_ctx, const char *namespace_name, bool *exists);
+
+ int pool_init(IoCtx& io_ctx, bool force);
+ int pool_stats_get(IoCtx& io_ctx, PoolStats *pool_stats);
+
+ int pool_metadata_get(IoCtx &io_ctx, const std::string &key,
+ std::string *value);
+ int pool_metadata_set(IoCtx &io_ctx, const std::string &key,
+ const std::string &value);
+ int pool_metadata_remove(IoCtx &io_ctx, const std::string &key);
+ int pool_metadata_list(IoCtx &io_ctx, const std::string &start, uint64_t max,
+ std::map<std::string, ceph::bufferlist> *pairs);
+
+ int config_list(IoCtx& io_ctx, std::vector<config_option_t> *options);
+
+private:
+ /* We don't allow assignment or copying */
+ RBD(const RBD& rhs);
+ const RBD& operator=(const RBD& rhs);
+};
+
+class CEPH_RBD_API ImageOptions {
+public:
+ ImageOptions();
+ ImageOptions(rbd_image_options_t opts);
+ ImageOptions(const ImageOptions &imgopts);
+ ~ImageOptions();
+
+ int set(int optname, const std::string& optval);
+ int set(int optname, uint64_t optval);
+ int get(int optname, std::string* optval) const;
+ int get(int optname, uint64_t* optval) const;
+ int is_set(int optname, bool* is_set);
+ int unset(int optname);
+ void clear();
+ bool empty() const;
+
+private:
+ friend class RBD;
+ friend class Image;
+
+ rbd_image_options_t opts;
+};
+
+class CEPH_RBD_API PoolStats {
+public:
+ PoolStats();
+ ~PoolStats();
+
+ PoolStats(const PoolStats&) = delete;
+ PoolStats& operator=(const PoolStats&) = delete;
+
+ int add(rbd_pool_stat_option_t option, uint64_t* opt_val);
+
+private:
+ friend class RBD;
+
+ rbd_pool_stats_t pool_stats;
+};
+
+class CEPH_RBD_API UpdateWatchCtx {
+public:
+ virtual ~UpdateWatchCtx() {}
+ /**
+ * Callback activated when we receive a notify event.
+ */
+ virtual void handle_notify() = 0;
+};
+
+class CEPH_RBD_API Image
+{
+public:
+ Image();
+ ~Image();
+
+ int close();
+ int aio_close(RBD::AioCompletion *c);
+
+ int resize(uint64_t size);
+ int resize2(uint64_t size, bool allow_shrink, ProgressContext& pctx);
+ int resize_with_progress(uint64_t size, ProgressContext& pctx);
+ int stat(image_info_t &info, size_t infosize);
+ int get_name(std::string *name);
+ int get_id(std::string *id);
+ std::string get_block_name_prefix();
+ int64_t get_data_pool_id();
+ int parent_info(std::string *parent_poolname, std::string *parent_name,
+ std::string *parent_snapname)
+ __attribute__((deprecated));
+ int parent_info2(std::string *parent_poolname, std::string *parent_name,
+ std::string *parent_id, std::string *parent_snapname)
+ __attribute__((deprecated));
+ int get_parent(linked_image_spec_t *parent_image, snap_spec_t *parent_snap);
+
+ int old_format(uint8_t *old);
+ int size(uint64_t *size);
+ int get_group(group_info_t *group_info, size_t group_info_size);
+ int features(uint64_t *features);
+ int update_features(uint64_t features, bool enabled);
+ int get_op_features(uint64_t *op_features);
+ int overlap(uint64_t *overlap);
+ int get_flags(uint64_t *flags);
+ int set_image_notification(int fd, int type);
+
+ /* exclusive lock feature */
+ int is_exclusive_lock_owner(bool *is_owner);
+ int lock_acquire(rbd_lock_mode_t lock_mode);
+ int lock_release();
+ int lock_get_owners(rbd_lock_mode_t *lock_mode,
+ std::list<std::string> *lock_owners);
+ int lock_break(rbd_lock_mode_t lock_mode, const std::string &lock_owner);
+
+ /* object map feature */
+ int rebuild_object_map(ProgressContext &prog_ctx);
+
+ int check_object_map(ProgressContext &prog_ctx);
+
+ int copy(IoCtx& dest_io_ctx, const char *destname);
+ int copy2(Image& dest);
+ int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+ int copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts,
+ size_t sparse_size);
+ int copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ ProgressContext &prog_ctx);
+ int copy_with_progress2(Image& dest, ProgressContext &prog_ctx);
+ int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx);
+ int copy_with_progress4(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx,
+ size_t sparse_size);
+
+ /* deep copy */
+ int deep_copy(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+ int deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx);
+
+ /* striping */
+ uint64_t get_stripe_unit() const;
+ uint64_t get_stripe_count() const;
+
+ int get_create_timestamp(struct timespec *timestamp);
+ int get_access_timestamp(struct timespec *timestamp);
+ int get_modify_timestamp(struct timespec *timestamp);
+
+ int flatten();
+ int flatten_with_progress(ProgressContext &prog_ctx);
+
+ int sparsify(size_t sparse_size);
+ int sparsify_with_progress(size_t sparse_size, ProgressContext &prog_ctx);
+ /**
+ * Returns a pair of poolname, imagename for each clone
+ * of this image at the currently set snapshot.
+ */
+ int list_children(std::set<std::pair<std::string, std::string> > *children)
+ __attribute__((deprecated));
+ /**
+ * Returns a structure of poolname, imagename, imageid and trash flag
+ * for each clone of this image at the currently set snapshot.
+ */
+ int list_children2(std::vector<librbd::child_info_t> *children)
+ __attribute__((deprecated));
+ int list_children3(std::vector<linked_image_spec_t> *images);
+ int list_descendants(std::vector<linked_image_spec_t> *images);
+
+ /* advisory locking (see librbd.h for details) */
+ int list_lockers(std::list<locker_t> *lockers,
+ bool *exclusive, std::string *tag);
+ int lock_exclusive(const std::string& cookie);
+ int lock_shared(const std::string& cookie, const std::string& tag);
+ int unlock(const std::string& cookie);
+ int break_lock(const std::string& client, const std::string& cookie);
+
+ /* snapshots */
+ int snap_list(std::vector<snap_info_t>& snaps);
+ /* DEPRECATED; use snap_exists2 */
+ bool snap_exists(const char *snapname) __attribute__ ((deprecated));
+ int snap_exists2(const char *snapname, bool *exists);
+ int snap_create(const char *snapname);
+ int snap_remove(const char *snapname);
+ int snap_remove2(const char *snapname, uint32_t flags, ProgressContext& pctx);
+ int snap_remove_by_id(uint64_t snap_id);
+ int snap_rollback(const char *snap_name);
+ int snap_rollback_with_progress(const char *snap_name, ProgressContext& pctx);
+ int snap_protect(const char *snap_name);
+ int snap_unprotect(const char *snap_name);
+ int snap_is_protected(const char *snap_name, bool *is_protected);
+ int snap_set(const char *snap_name);
+ int snap_set_by_id(uint64_t snap_id);
+ int snap_rename(const char *srcname, const char *dstname);
+ int snap_get_limit(uint64_t *limit);
+ int snap_set_limit(uint64_t limit);
+ int snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp);
+ int snap_get_namespace_type(uint64_t snap_id,
+ snap_namespace_type_t *namespace_type);
+ int snap_get_group_namespace(uint64_t snap_id,
+ snap_group_namespace_t *group_namespace,
+ size_t snap_group_namespace_size);
+ int snap_get_trash_namespace(uint64_t snap_id, std::string* original_name);
+
+ /* I/O */
+ ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ ssize_t read2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+ int64_t read_iterate(uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+ int read_iterate2(uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+ /**
+ * get difference between two versions of an image
+ *
+ * This will return the differences between two versions of an image
+ * via a callback, which gets the offset and length and a flag
+ * indicating whether the extent exists (1), or is known/defined to
+ * be zeros (a hole, 0). If the source snapshot name is NULL, we
+ * interpret that as the beginning of time and return all allocated
+ * regions of the image. The end version is whatever is currently
+ * selected for the image handle (either a snapshot or the writeable
+ * head).
+ *
+ * @param fromsnapname start snapshot name, or NULL
+ * @param ofs start offset
+ * @param len len in bytes of region to report on
+ * @param include_parent true if full history diff should include parent
+ * @param whole_object 1 if diff extents should cover whole object
+ * @param cb callback to call for each allocated region
+ * @param arg argument to pass to the callback
+ * @returns 0 on success, or negative error code on error
+ */
+ int diff_iterate(const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *), void *arg);
+ int diff_iterate2(const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ bool include_parent, bool whole_object,
+ int (*cb)(uint64_t, size_t, int, void *), void *arg);
+
+ ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+
+ int discard(uint64_t ofs, uint64_t len);
+ ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
+ ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags);
+
+ ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
+ ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
+
+ int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+
+ int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
+ int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+ int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
+ int zero_flags, int op_flags);
+
+ int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
+ ceph::bufferlist& bl, RBD::AioCompletion *c,
+ uint64_t *mismatch_off, int op_flags);
+
+ /**
+ * read async from image
+ *
+ * The target bufferlist is populated with references to buffers
+ * that contain the data for the given extent of the image.
+ *
+ * NOTE: If caching is enabled, the bufferlist will directly
+ * reference buffers in the cache to avoid an unnecessary data copy.
+ * As a result, if the user intends to modify the buffer contents
+ * directly, they should make a copy first (unconditionally, or when
+ * the reference count on the underlying buffer is more than 1).
+ *
+ * @param off offset in image
+ * @param len length of read
+ * @param bl bufferlist to read into
+ * @param c aio completion to notify when read is complete
+ */
+ int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+
+ int flush();
+ /**
+ * Start a flush if caching is enabled. Get a callback when
+ * the currently pending writes are on disk.
+ *
+ * The flush applies to the image this Image object refers to (no image parameter is taken).
+ * @param c what to call when flushing is complete
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush(RBD::AioCompletion *c);
+
+ /**
+ * Drop any cached data for this image
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int invalidate_cache();
+
+ int poll_io_events(RBD::AioCompletion **comps, int numcomp);
+
+ int metadata_get(const std::string &key, std::string *value);
+ int metadata_set(const std::string &key, const std::string &value);
+ int metadata_remove(const std::string &key);
+ /**
+ * Returns a pair of key/value for this image
+ */
+ int metadata_list(const std::string &start, uint64_t max, std::map<std::string, ceph::bufferlist> *pairs);
+
+ // RBD image mirroring support functions
+ int mirror_image_enable();
+ int mirror_image_disable(bool force);
+ int mirror_image_promote(bool force);
+ int mirror_image_demote();
+ int mirror_image_resync();
+ int mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size);
+ int mirror_image_get_status(mirror_image_status_t *mirror_image_status,
+ size_t status_size);
+ int mirror_image_get_instance_id(std::string *instance_id);
+ int aio_mirror_image_promote(bool force, RBD::AioCompletion *c);
+ int aio_mirror_image_demote(RBD::AioCompletion *c);
+ int aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size, RBD::AioCompletion *c);
+ int aio_mirror_image_get_status(mirror_image_status_t *mirror_image_status,
+ size_t status_size, RBD::AioCompletion *c);
+
+ int update_watch(UpdateWatchCtx *ctx, uint64_t *handle);
+ int update_unwatch(uint64_t handle);
+
+ int list_watchers(std::list<image_watcher_t> &watchers);
+
+ int config_list(std::vector<config_option_t> *options);
+
+private:
+ friend class RBD;
+
+ Image(const Image& rhs);
+ const Image& operator=(const Image& rhs);
+
+ image_ctx_t ctx;
+};
+
+}
+
+#endif
diff --git a/src/include/rbd/object_map_types.h b/src/include/rbd/object_map_types.h
new file mode 100644
index 00000000..54852caa
--- /dev/null
+++ b/src/include/rbd/object_map_types.h
@@ -0,0 +1,13 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_RBD_OBJECT_MAP_TYPES_H
+#define CEPH_RBD_OBJECT_MAP_TYPES_H
+
+#include "include/int_types.h"
+
+static const uint8_t OBJECT_NONEXISTENT = 0; // state codes 0..3; presumably the per-object values kept in an RBD object map -- confirm
+static const uint8_t OBJECT_EXISTS = 1;
+static const uint8_t OBJECT_PENDING = 2;
+static const uint8_t OBJECT_EXISTS_CLEAN = 3; // highest state value defined in this header
+
+#endif // CEPH_RBD_OBJECT_MAP_TYPES_H
diff --git a/src/include/rbd_types.h b/src/include/rbd_types.h
new file mode 100644
index 00000000..35a1a8bc
--- /dev/null
+++ b/src/include/rbd_types.h
@@ -0,0 +1,159 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include "include/types.h"
+#include "rbd/features.h"
+
+/* New-style rbd image 'foo' consists of objects
+ * rbd_id.foo - id of image
+ * rbd_header.<id> - image metadata
+ * rbd_object_map.<id> - optional image object map
+ * rbd_data.<id>.00000000
+ * rbd_data.<id>.00000001
+ * ... - data
+ */
+
+#define RBD_HEADER_PREFIX "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX "rbd_object_map."
+#define RBD_DATA_PREFIX "rbd_data."
+#define RBD_ID_PREFIX "rbd_id."
+
+/*
+ * old-style rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * rb.<idhi>.<idlo>.00000000
+ * rb.<idhi>.<idlo>.00000001
+ * ... - data
+ */
+
+#define RBD_SUFFIX ".rbd"
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+#define RBD_NAMESPACE "rbd_namespace"
+#define RBD_TASK "rbd_task"
+
+/*
+ * rbd_children object in each pool contains omap entries
+ * that map parent (poolid, imageid, snapid) to a list of children
+ * (imageids; snapids aren't required because we get all the snapshot
+ * info from a read of the child's header object anyway).
+ *
+ * The clone operation writes a new item to this child list, and rm or
+ * flatten removes an item, and may remove the whole entry if no children
+ * exist after the rm/flatten.
+ *
+ * When attempting to remove a parent, all pools are searched for
+ * rbd_children objects with entries referring to that parent; if any
+ * exist (and those children exist), the parent removal is prevented.
+ */
+#define RBD_CHILDREN "rbd_children"
+#define RBD_LOCK_NAME "rbd_lock"
+
+/**
+ * rbd_mirroring object in each pool contains pool-specific settings
+ * for configuring mirroring.
+ */
+#define RBD_MIRRORING "rbd_mirroring"
+
+/**
+ * rbd_mirror_leader and rbd_mirror_instance.<instance id> objects are used
+ * for pool-level coordination between rbd-mirror daemons.
+ */
+#define RBD_MIRROR_LEADER "rbd_mirror_leader"
+#define RBD_MIRROR_INSTANCE_PREFIX "rbd_mirror_instance."
+
+#define RBD_MAX_OBJ_NAME_SIZE 96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+/**
+ * Maximum string length of the RBD v2 image id (not including
+ * null termination). This limit was derived from the existing
+ * RBD_MAX_BLOCK_NAME_SIZE limit which needs to hold the "rbd_data."
+ * prefix and null termination.
+ */
+#define RBD_MAX_IMAGE_ID_LENGTH 14
+
+/**
+ * Maximum string length of the RBD block object name prefix (not including
+ * null termination).
+ *
+ * v1 format: rb.<max 8-byte high id>.<max 8-byte low id>.<max 8-byte extra>
+ * v2 format: rbd_data.[<max 19-byte pool id>.]<max 14-byte image id>
+ *
+ * Note: new features might require increasing this maximum prefix length.
+ */
+#define RBD_MAX_BLOCK_NAME_PREFIX_LENGTH 43
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_MIGRATE_HEADER_TEXT "<<< Migrating RBD Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+#define RBD_GROUP_INVALID_POOL (-1)
+
+#define RBD_GROUP_HEADER_PREFIX "rbd_group_header."
+
+#define RBD_GROUP_DIRECTORY "rbd_group_directory"
+
+#define RBD_TRASH "rbd_trash"
+
+/**
+ * MON config-key prefix for storing optional remote cluster connectivity
+ * parameters
+ */
+#define RBD_MIRROR_CONFIG_KEY_PREFIX "rbd/mirror/"
+#define RBD_MIRROR_SITE_NAME_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "site_name"
+#define RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "peer_client_id"
+#define RBD_MIRROR_PEER_CONFIG_KEY_PREFIX RBD_MIRROR_CONFIG_KEY_PREFIX "peer/"
+
+struct rbd_info {
+ ceph_le64 max_id; /* 64-bit id counter; ceph_le64 is presumably a little-endian wrapper -- confirm in include/types.h */
+} __attribute__ ((packed)); /* packed: no compiler-inserted padding */
+
+struct rbd_obj_snap_ondisk { /* element type of rbd_obj_header_ondisk::snaps */
+ ceph_le64 id; /* snapshot id */
+ ceph_le64 image_size; /* presumably the image size when the snapshot was taken -- confirm */
+} __attribute__((packed)); /* packed: no compiler-inserted padding */
+
+struct rbd_obj_header_ondisk { /* fixed-layout header; every member is packed, so field order and sizes must not change */
+ char text[40]; /* presumably holds RBD_HEADER_TEXT (33 chars + NUL fits in 40) -- confirm */
+ char block_name[RBD_MAX_BLOCK_NAME_SIZE]; /* RBD_MAX_BLOCK_NAME_SIZE is 24 in this header */
+ char signature[4]; /* sized for RBD_HEADER_SIGNATURE "RBD" plus NUL */
+ char version[8]; /* sized for RBD_HEADER_VERSION "001.005" plus NUL */
+ struct {
+ __u8 order; /* presumably log2 of the object size -- confirm */
+ __u8 crypt_type; /* see RBD_CRYPT_NONE */
+ __u8 comp_type; /* see RBD_COMP_NONE */
+ __u8 unused;
+ } __attribute__((packed)) options;
+ ceph_le64 image_size;
+ ceph_le64 snap_seq;
+ ceph_le32 snap_count; /* presumably the number of entries in snaps[] -- confirm */
+ ceph_le32 reserved;
+ ceph_le64 snap_names_len;
+ struct rbd_obj_snap_ondisk snaps[0]; /* zero-length trailing array (GNU extension): entries follow the fixed part */
+} __attribute__((packed));
+
+enum { /* snapshot protection states; anonymous enum, so the names are plain int constants */
+ RBD_PROTECTION_STATUS_UNPROTECTED = 0,
+ RBD_PROTECTION_STATUS_UNPROTECTING = 1,
+ RBD_PROTECTION_STATUS_PROTECTED = 2,
+ RBD_PROTECTION_STATUS_LAST = 3 /* one past the highest state above; presumably a bound for validation -- confirm */
+};
+
+#endif
diff --git a/src/include/rgw/librgw_admin_user.h b/src/include/rgw/librgw_admin_user.h
new file mode 100644
index 00000000..e1dd5a29
--- /dev/null
+++ b/src/include/rgw/librgw_admin_user.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * create rgw admin user
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef LIB_RGW_ADMIN_USER_H
+#define LIB_RGW_ADMIN_USER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_ADMIN_USER_VER_MAJOR 1
+#define LIBRGW_ADMIN_USER_VER_MINOR 0
+#define LIBRGW_ADMIN_USER_VER_EXTRA 0
+
+#define LIBRGW_ADMIN_USER_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* packs maj/min/extra into one int; each argument is parenthesized so expression arguments keep their own precedence */
+#define LIBRGW_ADMIN_USER_VERSION_CODE LIBRGW_ADMIN_USER_VERSION(LIBRGW_ADMIN_USER_VER_MAJOR, LIBRGW_ADMIN_USER_VER_MINOR, LIBRGW_ADMIN_USER_VER_EXTRA)
+
+typedef void* librgw_admin_user_t;
+int librgw_admin_user_create(librgw_admin_user_t *rgw_admin_user, int argc, char **argv);
+void librgw_admin_user_shutdown(librgw_admin_user_t rgw_admin_user);
+
+struct rgw_user_info /* plain C struct; the pointed-to strings are not owned by any visible code here -- lifetime is defined by the implementation, confirm */
+{
+ const char *uid;
+ const char *display_name;
+ const char *access_key;
+ const char* secret_key;
+ const char* email;
+ const char *caps;
+ const char *access;
+ bool admin; /* same flag names as the trailing arguments of rgw_admin_create_user() */
+ bool system;
+};
+
+ /*
+ * create a new rgw user
+ */
+int rgw_admin_create_user(librgw_admin_user_t rgw_admin_user, const char *uid,
+ const char *display_name, const char *access_key, const char* secret_key,
+ const char *email, const char *caps,
+ const char *access, bool admin, bool system);
+
+/*
+ * get rgw user info
+ *
+ * The result type is spelled `struct rgw_user_info` because this prototype
+ * sits inside extern "C": C has no implicit typedef for struct tags, while
+ * C++ accepts the elaborated form unchanged.
+ */
+int rgw_admin_user_info(librgw_admin_user_t rgw_admin_user, const char *uid,
+                        struct rgw_user_info *user_info);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIB_RGW_ADMIN_USER_H */
diff --git a/src/include/scope_guard.h b/src/include/scope_guard.h
new file mode 100644
index 00000000..878d8c16
--- /dev/null
+++ b/src/include/scope_guard.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef SCOPE_GUARD
+#define SCOPE_GUARD
+
+#include <type_traits>
+#include <utility>
+
+// Runs a stored callable when the guard object is destroyed.
+// NOTE: the move operations are defaulted and the destructor is
+// unconditional, so a moved-from guard still invokes its (moved-from)
+// callable when it is destroyed.
+template <class F>
+struct scope_guard {
+  F f;  // the callable invoked by the destructor
+
+  // Build the guard from a callable: by copy, by move, or in place from
+  // the callable's own constructor arguments.
+  scope_guard(const F& fn) : f(fn) {}
+  scope_guard(F&& fn) : f(std::move(fn)) {}
+  template <class... Args>
+  scope_guard(std::in_place_t, Args&&... args) : f(std::forward<Args>(args)...) {}
+
+  // Movable but neither copyable nor default-constructible.
+  scope_guard() = delete;
+  scope_guard(const scope_guard&) = delete;
+  scope_guard& operator=(const scope_guard&) = delete;
+  scope_guard(scope_guard&&) = default;
+  scope_guard& operator=(scope_guard&&) = default;
+
+  ~scope_guard() {
+    // Call through an rvalue so at-most-once (&&-qualified) functions work.
+    std::move(f)();
+  }
+};
+
+/**
+ * Create a scope_guard that stores its own copy of (or moves from) **f**.
+ *
+ * The guard type is scope_guard<std::decay_t<F>>. When **f** is an lvalue,
+ * F deduces to a reference type; scope_guard<T&> would declare
+ * scope_guard(const F&) and scope_guard(F&&) with the same collapsed
+ * signature and fail to instantiate. Decaying F lets lvalue callables
+ * (and plain functions, which decay to pointers) be used too.
+ *
+ * @param f callable to run when the returned guard is destroyed
+ * @return guard owning the callable
+ */
+template <typename F>
+scope_guard<std::decay_t<F>> make_scope_guard(F &&f) {
+  return scope_guard<std::decay_t<F>>(std::forward<F>(f));
+}
+
+// Create a scope_guard<F> whose callable is constructed in place from
+// **args**; the in_place_type tag only selects F.
+template <class F, class... Args>
+scope_guard<F> make_scope_guard(std::in_place_type_t<F>, Args&&... args) {
+  return scope_guard<F>{std::in_place, std::forward<Args>(args)...};
+}
+
+#endif
diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h
new file mode 100644
index 00000000..14b5efa1
--- /dev/null
+++ b/src/include/sock_compat.h
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_SOCK_COMPAT_H
+#define CEPH_SOCK_COMPAT_H
+
+#include "include/compat.h"
+#include <sys/socket.h>
+
+/*
+ * This optimization may not be available on all platforms (e.g. OSX).
+ * Apparently a similar approach based on TCP_CORK can be used.
+ */
+#ifndef MSG_MORE
+# define MSG_MORE 0
+#endif
+
+/*
+ * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
+ */
+#ifndef MSG_NOSIGNAL
+# define MSG_NOSIGNAL 0
+# ifdef SO_NOSIGPIPE
+# define CEPH_USE_SO_NOSIGPIPE
+# else
+# define CEPH_USE_SIGPIPE_BLOCKER
+# warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!"
+# endif
+#endif
+
+int socket_cloexec(int domain, int type, int protocol);
+int socketpair_cloexec(int domain, int type, int protocol, int sv[2]);
+int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen);
+
+#endif
diff --git a/src/include/spinlock.h b/src/include/spinlock.h
new file mode 100644
index 00000000..3f12bdc0
--- /dev/null
+++ b/src/include/spinlock.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ * @author Jesse Williamson <jwilliamson@suse.de>
+ *
+*/
+
+#ifndef CEPH_SPINLOCK_HPP
+#define CEPH_SPINLOCK_HPP
+
+#include <atomic>
+
+namespace ceph {
+inline namespace version_1_0 {
+
+class spinlock;
+
+inline void spin_lock(std::atomic_flag& lock);
+inline void spin_unlock(std::atomic_flag& lock);
+inline void spin_lock(ceph::spinlock& lock);
+inline void spin_unlock(ceph::spinlock& lock);
+
+/* Spinlock wrapper around std::atomic_flag exposing lock()/unlock(),
+ * i.e. the BasicLockable interface. */
+class spinlock final
+{
+  std::atomic_flag af = ATOMIC_FLAG_INIT;
+
+public:
+  // Busy-wait until the flag was previously clear (acquire ordering).
+  void lock() {
+    while (af.test_and_set(std::memory_order_acquire)) {
+    }
+  }
+
+  // Clear the flag (release ordering).
+  void unlock() noexcept {
+    af.clear(std::memory_order_release);
+  }
+};
+
+// Free functions, overloaded for std::atomic_flag and ceph::spinlock,
+// each taking either a reference or a pointer.
+
+// Spin until the flag is acquired.
+inline void spin_lock(std::atomic_flag& lock)
+{
+  for (;;) {
+    if (!lock.test_and_set(std::memory_order_acquire))
+      return;
+  }
+}
+
+// Release the flag.
+inline void spin_unlock(std::atomic_flag& lock)
+{
+  lock.clear(std::memory_order_release);
+}
+
+// Pointer forms: forward to the reference overloads.
+inline void spin_lock(std::atomic_flag *lock)
+{
+  std::atomic_flag& flag = *lock;
+  spin_lock(flag);
+}
+
+inline void spin_unlock(std::atomic_flag *lock)
+{
+  std::atomic_flag& flag = *lock;
+  spin_unlock(flag);
+}
+
+// ceph::spinlock forms: delegate to the member functions.
+inline void spin_lock(ceph::spinlock& lock)
+{
+  lock.lock();
+}
+
+inline void spin_unlock(ceph::spinlock& lock)
+{
+  lock.unlock();
+}
+
+inline void spin_lock(ceph::spinlock *lock)
+{
+  ceph::spinlock& target = *lock;
+  spin_lock(target);
+}
+
+inline void spin_unlock(ceph::spinlock *lock)
+{
+  ceph::spinlock& target = *lock;
+  spin_unlock(target);
+}
+
+} // inline namespace (version)
+} // namespace ceph
+
+#endif
diff --git a/src/include/stat.h b/src/include/stat.h
new file mode 100644
index 00000000..19398758
--- /dev/null
+++ b/src/include/stat.h
@@ -0,0 +1,145 @@
+#ifndef CEPH_STAT_H
+#define CEPH_STAT_H
+
+#include <acconfig.h>
+
+#include <sys/stat.h>
+
+/*
+ * Access time-related `struct stat` members.
+ *
+ * Note that for each of the stat member get/set functions below, setting a
+ * high-res value (stat_set_*_nsec) on a platform without high-res support is
+ * a no-op.
+ */
+
+#ifdef HAVE_STAT_ST_MTIM_TV_NSEC
+
+/*
+ * Variant for platforms whose struct stat has st_mtim/st_atim/st_ctim.
+ * Getters only read, so they take a const pointer (callers holding a
+ * non-const pointer are unaffected). The tv_nsec value is converted to
+ * uint32_t explicitly.
+ */
+static inline uint32_t stat_get_mtime_nsec(const struct stat *st)
+{
+  return (uint32_t)st->st_mtim.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_mtim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(const struct stat *st)
+{
+  return (uint32_t)st->st_atim.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_atim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(const struct stat *st)
+{
+  return (uint32_t)st->st_ctim.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_ctim.tv_nsec = nsec;
+}
+
+#elif defined(HAVE_STAT_ST_MTIMESPEC_TV_NSEC)
+
+/*
+ * Variant for platforms whose struct stat has
+ * st_mtimespec/st_atimespec/st_ctimespec. Getters only read, so they take
+ * a const pointer (callers holding a non-const pointer are unaffected).
+ * The tv_nsec value is converted to uint32_t explicitly.
+ */
+static inline uint32_t stat_get_mtime_nsec(const struct stat *st)
+{
+  return (uint32_t)st->st_mtimespec.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_mtimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(const struct stat *st)
+{
+  return (uint32_t)st->st_atimespec.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_atimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(const struct stat *st)
+{
+  return (uint32_t)st->st_ctimespec.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_ctimespec.tv_nsec = nsec;
+}
+
+#else
+
+/*
+ * Fallback when neither high-resolution member set is available: getters
+ * return 0 and setters do nothing. Parameters are explicitly marked unused
+ * so including this header does not raise unused-parameter warnings.
+ */
+static inline uint32_t stat_get_mtime_nsec(const struct stat *st)
+{
+  (void)st;
+  return 0;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+  (void)st;
+  (void)nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(const struct stat *st)
+{
+  (void)st;
+  return 0;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+  (void)st;
+  (void)nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(const struct stat *st)
+{
+  (void)st;
+  return 0;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+  (void)st;
+  (void)nsec;
+}
+
+#endif
+
+/*
+ * Access second-resolution `struct stat` members.
+ */
+
+/*
+ * Whole-second accessors. Getters only read, so they take a const pointer
+ * (callers holding a non-const pointer are unaffected).
+ *
+ * NOTE: the interface is 32-bit. Where the st_*time members are wider than
+ * 32 bits, the getters keep only the low 32 bits; the cast makes that
+ * conversion explicit rather than changing it.
+ */
+static inline uint32_t stat_get_mtime_sec(const struct stat *st)
+{
+  return (uint32_t)st->st_mtime;
+}
+
+static inline void stat_set_mtime_sec(struct stat *st, uint32_t sec)
+{
+  st->st_mtime = sec;
+}
+
+static inline uint32_t stat_get_atime_sec(const struct stat *st)
+{
+  return (uint32_t)st->st_atime;
+}
+
+static inline void stat_set_atime_sec(struct stat *st, uint32_t sec)
+{
+  st->st_atime = sec;
+}
+
+static inline uint32_t stat_get_ctime_sec(const struct stat *st)
+{
+  return (uint32_t)st->st_ctime;
+}
+
+static inline void stat_set_ctime_sec(struct stat *st, uint32_t sec)
+{
+  st->st_ctime = sec;
+}
+
+#endif
diff --git a/src/include/statlite.h b/src/include/statlite.h
new file mode 100644
index 00000000..2ab3a940
--- /dev/null
+++ b/src/include/statlite.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_STATLITE_H
+#define CEPH_STATLITE_H
+
+extern "C" {
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+
+struct statlite { /* stat-like record; st_litemask uses the S_STATLITE_* bits to flag the optional fields */
+ dev_t st_dev; /* device */
+ ino_t st_ino; /* inode */
+ mode_t st_mode; /* protection */
+ nlink_t st_nlink; /* number of hard links */
+ uid_t st_uid; /* user ID of owner */
+ gid_t st_gid; /* group ID of owner */
+ dev_t st_rdev; /* device type (if inode device)*/
+ unsigned long st_litemask; /* bit mask for optional fields */
+ /***************************************************************/
+ /**** Remaining fields are optional according to st_litemask ***/
+ off_t st_size; /* total size, in bytes */
+ blksize_t st_blksize; /* blocksize for filesystem I/O */
+ blkcnt_t st_blocks; /* number of blocks allocated */
+ struct timespec st_atim; /* Time of last access. */
+ struct timespec st_mtim; /* Time of last modification. */
+ struct timespec st_ctim; /* Time of last status change. */
+ //time_t st_atime; /* time of last access */
+ //time_t st_mtime; /* time of last modification */
+ //time_t st_ctime; /* time of last change */
+}; /* the three commented-out time_t members are disabled; the timespec members above carry the times */
+
+#define S_STATLITE_SIZE 1
+#define S_STATLITE_BLKSIZE 2
+#define S_STATLITE_BLOCKS 4
+#define S_STATLITE_ATIME 8
+#define S_STATLITE_MTIME 16
+#define S_STATLITE_CTIME 32
+
+#define S_REQUIRESIZE(m) ((m) | S_STATLITE_SIZE) /* S_REQUIRE*: mask m with the given field bit set */
+#define S_REQUIREBLKSIZE(m) ((m) | S_STATLITE_BLKSIZE)
+#define S_REQUIREBLOCKS(m) ((m) | S_STATLITE_BLOCKS)
+#define S_REQUIREATIME(m) ((m) | S_STATLITE_ATIME)
+#define S_REQUIREMTIME(m) ((m) | S_STATLITE_MTIME)
+#define S_REQUIRECTIME(m) ((m) | S_STATLITE_CTIME)
+
+#define S_ISVALIDSIZE(m) ((m) & S_STATLITE_SIZE) /* S_ISVALID*: non-zero when the given field bit is set in m */
+#define S_ISVALIDBLKSIZE(m) ((m) & S_STATLITE_BLKSIZE)
+#define S_ISVALIDBLOCKS(m) ((m) & S_STATLITE_BLOCKS)
+#define S_ISVALIDATIME(m) ((m) & S_STATLITE_ATIME)
+#define S_ISVALIDMTIME(m) ((m) & S_STATLITE_MTIME)
+#define S_ISVALIDCTIME(m) ((m) & S_STATLITE_CTIME)
+
+
+// readdirplus etc.
+
+struct dirent_plus { /* directory entry bundled with a full struct stat */
+ struct dirent d_dirent; /* dirent struct for this entry */
+ struct stat d_stat; /* attributes for this entry */
+ int d_stat_err;/* errno for d_stat, or 0 */
+};
+struct dirent_lite { /* directory entry bundled with a struct statlite */
+ struct dirent d_dirent; /* dirent struct for this entry */
+ struct statlite d_stat; /* attributes for this entry */
+ int d_stat_err;/* errno for d_stat, or 0 */
+};
+
+}
+#endif
diff --git a/src/include/str_list.h b/src/include/str_list.h
new file mode 100644
index 00000000..518db1ca
--- /dev/null
+++ b/src/include/str_list.h
@@ -0,0 +1,129 @@
+#ifndef CEPH_STRLIST_H
+#define CEPH_STRLIST_H
+
+#include <list>
+#include <set>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace ceph {
+
+/// Split a string using the given delimiters, passing each piece as a
+/// (non-null-terminated) std::string_view to the callback.
+template <typename Func> // where Func(std::string_view) is a valid call
+void for_each_substr(std::string_view s, const char *delims, Func&& f)
+{
+  for (;;) {
+    // skip the run of delimiters in front of the next token
+    const auto first = s.find_first_not_of(delims);
+    if (first == std::string_view::npos)
+      return; // nothing but delimiters (or nothing at all) is left
+    s.remove_prefix(first);
+
+    // the token runs up to the next delimiter, or to the end of the input
+    const auto stop = s.find_first_of(delims);
+    f(s.substr(0, stop));
+    if (stop == std::string_view::npos)
+      return; // the token consumed the rest of the input
+    s.remove_prefix(stop);
+  }
+}
+
+} // namespace ceph
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ const char *delims,
+ std::list<std::string>& str_list);
+
+std::list<std::string> get_str_list(const std::string& str,
+ const char *delims = ";,= \t");
+
+/**
+ * Split **str** into a vector of strings, using the ";,= \t" delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+ std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a vector of strings, using the **delims** delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+ const char *delims,
+ std::vector<std::string>& str_vec);
+
+std::vector<std::string> get_str_vec(const std::string& str,
+ const char *delims = ";,= \t");
+/**
+ * Split **str** into a set of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as Set
+ * @param [out] str_list Set modified containing str after it has been split
+**/
+extern void get_str_set(const std::string& str,
+ std::set<std::string>& str_list);
+
+/**
+ * Split **str** into a set of strings, using the **delims** delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as Set
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list Set modified containing str after it has been split
+**/
+// Clears **str_list**, then inserts every token of **str** (split on any
+// character in **delims**) as a std::string.
+// for_each_substr is qualified with ceph:: because this template lives in
+// the global namespace and none of the call's arguments makes namespace
+// ceph an associated namespace, so an unqualified call would depend on a
+// using-directive appearing elsewhere.
+template<class Compare = std::less<std::string> >
+void get_str_set(const std::string& str,
+                 const char *delims,
+                 std::set<std::string, Compare>& str_list)
+{
+  str_list.clear();
+  ceph::for_each_substr(str, delims, [&str_list] (auto token) {
+      str_list.emplace(token.begin(), token.end());
+    });
+}
+
+std::set<std::string> get_str_set(const std::string& str,
+ const char *delims = ";,= \t");
+
+
+
+/**
+ * Return a String containing the vector **v** joined with **sep**
+ *
+ * If **v** is empty, the function returns an empty string
+ * For each element in **v**,
+ * it will concatenate this element and **sep** with result
+ *
+ * @param [in] v Vector to join as a String
+ * @param [in] sep String used to join each element from **v**
+ * @return empty string if **v** is empty or concatenated string
+**/
+inline std::string str_join(const std::vector<std::string>& v, const std::string& sep)
+{
+  std::string joined;
+  bool first = true;
+  for (const auto& piece : v) {
+    // the separator goes between elements only, never before the first one
+    if (!first)
+      joined += sep;
+    joined += piece;
+    first = false;
+  }
+  return joined; // stays empty when v is empty
+}
+
+#endif
diff --git a/src/include/str_map.h b/src/include/str_map.h
new file mode 100644
index 00000000..6a0370d1
--- /dev/null
+++ b/src/include/str_map.h
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_STRMAP_H
+#define CEPH_STRMAP_H
+
+#define CONST_DELIMS ",;\t\n "
+
+#include <map>
+#include <string>
+#include <sstream>
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read
+ * from it. The format of **str** is either a well formed JSON object
+ * or a custom key[=value] plain text format.
+ *
+ * JSON is tried first. If successfully parsed into a JSON object, it
+ * is copied into **str_map** verbatim. If it is not a JSON object ( a
+ * string, integer etc. ), -EINVAL is returned and **ss** is set to
+ * a human readable error message.
+ *
+ * If **str** is not valid JSON and if **fallback_to_plain** is set to true
+ * (default: true) it is assumed to be a string containing white space
+ * separated key=value pairs. A white space is either space, tab or newline.
+ * Function **get_str_map** will be leveraged to parse the plain-text
+ * key/value pairs.
+ *
+ * @param [in] str JSON or plain text key/value pairs
+ * @param [out] ss human readable message on error
+ * @param [out] str_map key/value pairs read from str
+ * @param [in] fallback_to_plain attempt parsing as plain-text if json fails
+ * @return **0** on success or a -EINVAL on error.
+ */
+extern int get_json_str_map(
+ const std::string &str,
+ std::ostream &ss,
+ std::map<std::string,std::string> *str_map,
+ bool fallback_to_plain = true);
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read from
+ * it. The format of **str** is a number of custom key[=value] pairs in
+ * plain text format.
+ *
+ * The string will be parsed taking **delims** as field delimiters for
+ * key/values. The value is optional resulting in an empty string when
+ * not provided. For example, using white space as delimiters:
+ *
+ * insert your own=political/ideological statement=here
+ *
+ * will be parsed into:
+ *
+ * { "insert": "",
+ * "your": "",
+ * "own": "political/ideological",
+ * "statement": "here" }
+ *
+ * Alternative delimiters may be provided. For instance, specifying
+ * "white space and slash", for the above statement, would be parsed
+ * into:
+ *
+ * { "insert": "",
+ * "your": "",
+ * "own": "political",
+ * "ideological": "",
+ * "statement": "here" }
+ *
+ * See how adding '/' to the delimiters field will spawn a new key without
+ * a set value.
+ *
+ * Always returns 0, as there is no condition for failure.
+ *
+ * @param [in] str plain text key/value pairs
+ * @param [in] delims field delimiters to be used for parsing str
+ * @param [out] str_map key/value pairs parsed from str
+ * @return **0**
+ */
+extern int get_str_map(
+ const std::string &str,
+ std::map<std::string,std::string> *str_map,
+ const char *delims = CONST_DELIMS);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is not available in **str_map**, and if **def_val** is
+ * not-NULL then returns **def_val**. Otherwise checks if the value of
+ * **key** is an empty string and if so will return **key**.
+ * If the map contains **key**, the function returns the value of **key**.
+ *
+ * @param[in] str_map Map to obtain **key** from
+ * @param[in] key The key to search for in the map
+ * @param[in] def_val The value to return in case **key** is not present
+ */
+extern std::string get_str_map_value(
+ const std::map<std::string,std::string> &str_map,
+ const std::string &key,
+ const std::string *def_val = NULL);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is available in **str_map** returns the value of **key**.
+ *
+ * If **key** is not available in **str_map**, and if **fallback_key**
+ * is non-NULL and available in **str_map**, then returns the value
+ * of **fallback_key**.
+ *
+ * Otherwise returns an empty string.
+ *
+ * @param[in] str_map Map to obtain **key** or **fallback_key** from
+ * @param[in] key Key to obtain the value of from **str_map**
+ * @param[in] fallback_key Key to fall back to if **key** is not present
+ * in **str_map**
+ */
+extern std::string get_str_map_key(
+ const std::map<std::string,std::string> &str_map,
+ const std::string &key,
+ const std::string *fallback_key = NULL);
+
+
+// This function's only purpose is to check whether a given map has only
+// ONE key with an empty value (which would mean that 'get_str_map()' read
+// a map in the form of 'VALUE', without any KEY/VALUE pairs) and, in such
+// event, to assign said 'VALUE' to a given 'def_key', such that we end up
+// with a map of the form "m = { 'def_key' : 'VALUE' }" instead of the
+// original "m = { 'VALUE' : '' }".
+int get_conf_str_map_helper(
+ const std::string &str,
+ std::ostringstream &oss,
+ std::map<std::string,std::string> *m,
+ const std::string &def_key);
+
+#endif
diff --git a/src/include/stringify.h b/src/include/stringify.h
new file mode 100644
index 00000000..1b2a130c
--- /dev/null
+++ b/src/include/stringify.h
@@ -0,0 +1,33 @@
+#ifndef __CEPH_STRINGIFY_H
+#define __CEPH_STRINGIFY_H
+
+#include <string>
+#include <sstream>
+
+#include "include/types.h"
+
+template<typename T>
+inline std::string stringify(const T& a) { // renders `a` through its operator<< and returns the resulting text
+#if defined(__GNUC__) && !(defined(__clang__) || defined(__INTEL_COMPILER))
+ static __thread std::ostringstream ss; // GCC (not clang/Intel): one stream per thread, reused across calls
+ ss.str(""); // drop the text left over from the previous call
+#else
+ std::ostringstream ss; // other compilers: a fresh stream on every call
+#endif
+ ss << a;
+ return ss.str();
+}
+
+// Join the elements of [begin, end) into one T, placing separator t between consecutive elements.
+template <class T, class A>
+T joinify(const A &begin, const A &end, const T &t) {
+  T result;
+  bool first = true;  // track position: testing result.empty() dropped separators after empty leading elements
+  for (A it = begin; it != end; ++it, first = false) {
+    if (!first) result.append(t);
+    result.append(*it);
+  }
+  return result;
+}
+
+#endif
diff --git a/src/include/timegm.h b/src/include/timegm.h
new file mode 100644
index 00000000..fb970432
--- /dev/null
+++ b/src/include/timegm.h
@@ -0,0 +1,79 @@
+// (C) Copyright Howard Hinnant
+// (C) Copyright 2010-2011 Vicente J. Botet Escriba
+// Use, modification and distribution are subject to the Boost Software License,
+// Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt).
+
+//===-------------------------- locale ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// This code was adapted by Vicente from Howard Hinnant's experimental work
+// on chrono i/o to Boost and some functions from libc++/locale to emulate the missing time_get::get()
+
+#ifndef BOOST_CHRONO_IO_TIME_POINT_IO_H
+#define BOOST_CHRONO_IO_TIME_POINT_IO_H
+
+#include <time.h>
+
+// Gregorian leap-year rule: yields 1 for a leap year and 0 otherwise,
+// so the result can be used directly as a row index.
+static int32_t is_leap(int32_t year) {
+  const bool by_400 = (year % 400 == 0);
+  const bool by_100 = (year % 100 == 0);
+  const bool by_4 = (year % 4 == 0);
+  // century years are leap only when divisible by 400
+  return (by_400 || (by_4 && !by_100)) ? 1 : 0;
+}
+
+static int32_t days_from_0(int32_t year) {
+  const int32_t y = year - 1;  // count only the years fully elapsed before `year`
+  return 365 * y + (y / 400) - (y / 100) + (y / 4);  // 365 days per year plus Gregorian leap days
+}
+
+static int32_t days_from_1970(int32_t year) {
+  static const int epoch_offset = days_from_0(1970);  // initialized once, on the first call
+  return days_from_0(year) - epoch_offset;
+}
+
+static int32_t days_from_1jan(int32_t year,int32_t month,int32_t day) {
+  // cumulative days before each month; row 0 = common year, row 1 = leap year
+  static const int32_t days_before_month[2][12] = {
+    { 0,31,59,90,120,151,181,212,243,273,304,334},
+    { 0,31,60,91,121,152,182,213,244,274,305,335}
+  };
+  const int32_t leap_row = is_leap(year);
+  return days_before_month[leap_row][month - 1] + day - 1;  // month and day are both 1-based
+}
+
+// Converts the broken-down time *t, taken as UTC, to seconds relative to 1970-01-01 00:00:00.
+static time_t internal_timegm(tm const *t) {
+  int year = t->tm_year + 1900;
+  int month_index = t->tm_mon;  // 0-based and possibly outside [0, 11]
+  // fold an out-of-range month into the year so that month_index lands in [0, 11]
+  if (month_index > 11) {
+    year += month_index / 12;
+    month_index %= 12;
+  } else if (month_index < 0) {
+    const int borrowed_years = (-month_index + 11) / 12;
+    year -= borrowed_years;
+    month_index += 12 * borrowed_years;
+  }
+
+  // days_from_1jan() takes a 1-based month; tm_mday is passed through unchanged
+  const int day_of_year = days_from_1jan(year, month_index + 1, t->tm_mday);
+  const int days_since_epoch = days_from_1970(year) + day_of_year;
+
+  // only tm_year, tm_mon, tm_mday, tm_hour, tm_min and tm_sec are read
+  const time_t seconds_in_day = 3600 * 24;
+  const time_t seconds = seconds_in_day * days_since_epoch
+    + 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec;
+  return seconds;
+}
+
+#endif
diff --git a/src/include/types.h b/src/include/types.h
new file mode 100644
index 00000000..1ae15277
--- /dev/null
+++ b/src/include/types.h
@@ -0,0 +1,604 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_TYPES_H
+#define CEPH_TYPES_H
+
+// this is needed for ceph_fs to compile in userland
+#include "int_types.h"
+#include "byteorder.h"
+
+#include "uuid.h"
+
+#include <netinet/in.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "rbd_types.h"
+
+#ifdef __cplusplus
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_*
+#endif
+#endif
+
+extern "C" {
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "statlite.h"
+}
+
+#include <string>
+#include <list>
+#include <set>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+#include <map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+
+
+#include "include/unordered_map.h"
+
+#include "object.h"
+#include "intarith.h"
+
+#include "acconfig.h"
+
+#include "assert.h"
+
+// DARWIN compatibility
+#ifdef __APPLE__
+typedef long long loff_t;
+typedef long long off64_t;
+#define O_DIRECT 00040000
+#endif
+
+// FreeBSD compatibility
+#ifdef __FreeBSD__
+typedef off_t loff_t;
+typedef off_t off64_t;
+#endif
+
+#if defined(__sun) || defined(_AIX)
+typedef off_t loff_t;
+#endif
+
+
+// -- io helpers --
+
+// Forward declare all the I/O helpers so strict ADL can find them in
+// the case of containers of containers. I'm tempted to abstract this
+// stuff using template templates like I did for denc.
+
+namespace std {
+template<class A, class B>
+inline std::ostream& operator<<(std::ostream&out, const std::pair<A,B>& v);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v);
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v);
+template<class A, class Alloc>  // must match the definition further down: std::deque has no comparator, and an unused Comp can never be deduced
+inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v);
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m);
+}
+
+namespace boost {
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const boost::tuple<Ts...> &t);
+
+namespace container {
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& iset);
+}
+}
+
+namespace std {
+template<class A, class B>
+inline std::ostream& operator<<(std::ostream& out, const std::pair<A,B>& v) {
+ return out << v.first << "," << v.second; // written as "first,second" with no surrounding brackets
+}
+
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v) {
+  // bracketed, comma-separated: "[a,b,c]"; an empty vector prints "[]"
+  out << "[";
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (it != v.begin())
+      out << ",";
+    out << *it;
+  }
+  out << "]";
+  return out;
+}
+
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v) {
+  // bracketed, comma-separated: "[a,b,c]"; an empty container prints "[]"
+  out << "[";
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (it != v.begin())
+      out << ",";
+    out << *it;
+  }
+  out << "]";
+  return out;
+}
+
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v) {
+  out << "<";  // angle-bracketed, comma-separated: "<a,b,c>"
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    if (i > 0) out << ",";
+    out << v[i];
+  }
+  out << ">";
+  return out;
+}
+
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t) {
+  // comma-separated elements, no brackets; `seen` counts the elements already written
+  auto emit = [&out, seen = 0U, total = sizeof...(Ts)](const auto& elem) mutable {
+    out << elem;
+    if (++seen != total) out << ",";
+  };
+  ceph::for_each(t, emit);
+  return out;
+}
+
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist) {
+  bool first = true;  // comma-separated elements with no surrounding brackets
+  for (const auto& item : ilist) {
+    if (!first) out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset) {
+  bool first = true;  // comma-separated elements in iteration order, no brackets
+  for (const auto& item : iset) {
+    if (!first) out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset) {
+  bool first = true;  // comma-separated elements in iteration order, duplicates included
+  for (const auto& item : iset) {
+    if (!first) out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m)
+{
+  bool first = true;  // output shape: "{k1=v1,k2=v2}"
+  out << "{";
+  for (const auto& kv : m) {
+    if (!first) out << ",";
+    out << kv.first << "=" << kv.second;
+    first = false;
+  }
+  out << "}";
+  return out;
+}
+
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m)
+{
+  bool first = true;  // output shape: "{{k1=v1,k1=v2}}" (double braces)
+  out << "{{";
+  for (const auto& kv : m) {
+    if (!first) out << ",";
+    out << kv.first << "=" << kv.second;
+    first = false;
+  }
+  out << "}}";
+  return out;
+}
+
+} // namespace std
+
+namespace boost {
+namespace tuples {
+template<typename A, typename B, typename C>
+inline std::ostream& operator<<(std::ostream& out, const boost::tuples::tuple<A, B, C> &t) {
+  out << boost::get<0>(t) << ",";  // three elements, comma-separated, no brackets
+  out << boost::get<1>(t) << ",";
+  return out << boost::get<2>(t);
+}
+}
+namespace container {
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset) {
+  bool first = true;  // comma-separated elements in iteration order, no brackets
+  for (const auto& item : iset) {
+    if (!first) out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& m) {
+  bool first = true;  // comma-separated key=value pairs, no surrounding braces
+  for (const auto& kv : m) {
+    if (!first) out << ",";
+    out << kv.first << "=" << kv.second;
+    first = false;
+  }
+  return out;
+}
+}
+} // namespace boost
+
+
+
+/*
+ * comparators for stl containers
+ */
+// for ceph::unordered_map:
+// ceph::unordered_map<const char*, long, hash<const char*>, eqstr> vals;
+struct eqstr
+{
+  // equality predicate for C strings: compares the characters, not the pointer values
+  bool operator()(const char* s1, const char* s2) const {
+    return 0 == strcmp(s1, s2);
+  }
+};
+
+// for set, map
+struct ltstr
+{
+  // strict-weak ordering for C strings based on strcmp, i.e. on the characters
+  bool operator()(const char* s1, const char* s2) const {
+    return 0 > strcmp(s1, s2);
+  }
+};
+
+
+namespace ceph {
+ class Formatter;
+}
+
+#include "encoding.h"
+
+WRITE_RAW_ENCODER(ceph_fsid)
+WRITE_RAW_ENCODER(ceph_file_layout)
+WRITE_RAW_ENCODER(ceph_dir_layout)
+WRITE_RAW_ENCODER(ceph_mds_session_head)
+WRITE_RAW_ENCODER(ceph_mds_request_head_legacy)
+WRITE_RAW_ENCODER(ceph_mds_request_head)
+WRITE_RAW_ENCODER(ceph_mds_request_release)
+WRITE_RAW_ENCODER(ceph_filelock)
+WRITE_RAW_ENCODER(ceph_mds_caps_head)
+WRITE_RAW_ENCODER(ceph_mds_caps_body_legacy)
+WRITE_RAW_ENCODER(ceph_mds_cap_peer)
+WRITE_RAW_ENCODER(ceph_mds_cap_release)
+WRITE_RAW_ENCODER(ceph_mds_cap_item)
+WRITE_RAW_ENCODER(ceph_mds_lease)
+WRITE_RAW_ENCODER(ceph_mds_snap_head)
+WRITE_RAW_ENCODER(ceph_mds_snap_realm)
+WRITE_RAW_ENCODER(ceph_mds_reply_head)
+WRITE_RAW_ENCODER(ceph_mds_reply_cap)
+WRITE_RAW_ENCODER(ceph_mds_cap_reconnect)
+WRITE_RAW_ENCODER(ceph_mds_snaprealm_reconnect)
+WRITE_RAW_ENCODER(ceph_frag_tree_split)
+WRITE_RAW_ENCODER(ceph_osd_reply_head)
+WRITE_RAW_ENCODER(ceph_osd_op)
+WRITE_RAW_ENCODER(ceph_msg_header)
+WRITE_RAW_ENCODER(ceph_msg_footer)
+WRITE_RAW_ENCODER(ceph_msg_footer_old)
+WRITE_RAW_ENCODER(ceph_mon_subscribe_item)
+
+WRITE_RAW_ENCODER(ceph_mon_statfs)
+WRITE_RAW_ENCODER(ceph_mon_statfs_reply)
+
+// ----------------------
+// some basic types
+
+// NOTE: these must match ceph_fs.h typedefs
+typedef uint64_t ceph_tid_t; // transaction id
+typedef uint64_t version_t;
+typedef __u32 epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years)
+
+// --------------------------------------
+// identify individual mount clients by 64bit value
+
+struct client_t {
+ int64_t v; // the raw 64-bit client identifier
+
+ // cppcheck-suppress noExplicitConstructor
+ client_t(int64_t _v = -2) : v(_v) {} // implicit from int64_t on purpose; a default-constructed id holds -2
+
+ void encode(bufferlist& bl) const { // appends v to bl through ceph::encode
+ using ceph::encode;
+ encode(v, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) { // reads v back through ceph::decode
+ using ceph::decode;
+ decode(v, bl);
+ }
+};
+WRITE_CLASS_ENCODER(client_t)
+
+static inline bool operator==(const client_t& l, const client_t& r) { return l.v == r.v; }
+static inline bool operator!=(const client_t& l, const client_t& r) { return l.v != r.v; }
+static inline bool operator<(const client_t& l, const client_t& r) { return l.v < r.v; }
+static inline bool operator<=(const client_t& l, const client_t& r) { return l.v <= r.v; }
+static inline bool operator>(const client_t& l, const client_t& r) { return l.v > r.v; }
+static inline bool operator>=(const client_t& l, const client_t& r) { return l.v >= r.v; }
+
+static inline bool operator>=(const client_t& l, int64_t o) { return l.v >= o; }
+static inline bool operator<(const client_t& l, int64_t o) { return l.v < o; }
+
+inline ostream& operator<<(ostream& out, const client_t& c) {
+ return out << c.v;
+}
+
+
+
+// --
+
+namespace {
+  // Writes n followed by the unit suffix u when no fraction is needed (index == 0, or v
+  // is an exact multiple of mult); otherwise writes v / mult with a short decimal fraction.
+  inline ostream& format_u(ostream& out, const uint64_t v, const uint64_t n,
+      const int index, const uint64_t mult, const char* u)
+  {
+    char buffer[32];
+
+    if (index == 0 || (v % mult) == 0) {
+      // Unscaled values and exact multiples of the base get no decimal fraction.
+      // n is unsigned, so print it with PRIu64; PRId64 showed values above INT64_MAX as negative.
+      (void) snprintf(buffer, sizeof(buffer), "%" PRIu64 "%s", n, u);
+    } else {
+      // We want to choose a precision that reflects the best choice
+      // for fitting in 5 characters. This can get rather tricky when
+      // we have numbers that are very close to an order of magnitude.
+      // For example, when displaying 10239 (which is really 9.999K),
+      // we want only a single place of precision for 10.0K. We could
+      // develop some complex heuristics for this, but it's much
+      // easier just to try each combination in turn.
+      int i;
+      for (i = 2; i >= 0; i--) {
+        if (snprintf(buffer, sizeof(buffer), "%.*f%s", i,
+            static_cast<double>(v) / mult, u) <= 7)
+          break;
+      }
+    }
+
+    return out << buffer;
+  }
+}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * decimal unit prefix (the classic SI units). No actual unit will be added.
+ */
+struct si_u_t {
+ uint64_t v;
+ explicit si_u_t(uint64_t _v) : v(_v) {};
+};
+
+inline ostream& operator<<(ostream& out, const si_u_t& b)
+{
+  // decimal prefixes; the empty first entry means the value is printed unscaled
+  const char* u[] = {"", "k", "M", "G", "T", "P", "E"};
+  const int last = static_cast<int>(sizeof(u) / sizeof(u[0])) - 1;
+  uint64_t n = b.v;
+  uint64_t mult = 1;
+  int index = 0;
+  // stop at the last prefix: the old guard (index < 7) allowed index == 7, one past the end of u[]
+  for (; n >= 1000 && index < last; ++index) {
+    n /= 1000;
+    mult *= 1000;
+  }
+  return format_u(out, b.v, n, index, mult, u[index]);
+}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * binary unit prefix (IEC units). Since binary unit prefixes are to be used for
+ * "multiples of units in data processing, data transmission, and digital
+ * information" (so bits and bytes) and so far bits are not printed, the unit
+ * "B" for "byte" is added besides the multiplier.
+ */
+struct byte_u_t {
+ uint64_t v;
+ explicit byte_u_t(uint64_t _v) : v(_v) {};
+};
+
+inline ostream& operator<<(ostream& out, const byte_u_t& b)
+{
+  // binary (IEC) prefixes, each with the byte unit appended
+  const char* u[] = {" B", " KiB", " MiB", " GiB", " TiB", " PiB", " EiB"};
+  const int last = static_cast<int>(sizeof(u) / sizeof(u[0])) - 1;
+  uint64_t n = b.v;
+  int index = 0;
+  // stop at the last prefix: the old guard (index < 7) allowed u[7] and a 70-bit shift below
+  for (; n >= 1024 && index < last; ++index)
+    n /= 1024;
+
+  return format_u(out, b.v, n, index, 1ULL << (10 * index), u[index]);
+}
+
+inline ostream& operator<<(ostream& out, const ceph_mon_subscribe_item& i)
+{
+ return out << i.start // the start field comes first
+ << ((i.flags & CEPH_SUBSCRIBE_ONETIME) ? "" : "+"); // "+" is appended unless the ONETIME flag bit is set
+}
+
+struct weightf_t {
+ float v;
+ // cppcheck-suppress noExplicitConstructor
+ weightf_t(float _v) : v(_v) {}
+};
+
+inline ostream& operator<<(ostream& out, const weightf_t& w)
+{
+  // weights below -0.01 collapse to "-", the remaining ones below 0.000001 to "0"
+  if (w.v < -0.01F)
+    return out << "-";
+  if (w.v < 0.000001F)
+    return out << "0";
+  // NOTE(review): the precision is restored afterwards, but std::fixed stays set on `out`
+  const std::streamsize saved_precision = out.precision();
+  return out << std::fixed << std::setprecision(5) << w.v << std::setprecision(saved_precision);
+}
+
+struct shard_id_t {
+ int8_t id; // the raw shard number
+
+ shard_id_t() : id(0) {} // default-constructed value is shard 0
+ explicit shard_id_t(int8_t _id) : id(_id) {}
+
+ operator int8_t() const { return id; } // implicit conversion back to the raw number
+
+ const static shard_id_t NO_SHARD; // declared here only; its value is defined outside this header
+
+ void encode(bufferlist &bl) const { // appends id to bl through ceph::encode
+ using ceph::encode;
+ encode(id, bl);
+ }
+ void decode(bufferlist::const_iterator &bl) { // reads id back through ceph::decode
+ using ceph::decode;
+ decode(id, bl);
+ }
+};
+WRITE_CLASS_ENCODER(shard_id_t)
+WRITE_EQ_OPERATORS_1(shard_id_t, id)
+WRITE_CMP_OPERATORS_1(shard_id_t, id)
+ostream &operator<<(ostream &lhs, const shard_id_t &rhs);
+
+#if defined(__sun) || defined(_AIX) || defined(__APPLE__) || defined(__FreeBSD__)
+__s32 ceph_to_hostos_errno(__s32 e);
+__s32 hostos_to_ceph_errno(__s32 e);
+#else
+#define ceph_to_hostos_errno(e) (e)
+#define hostos_to_ceph_errno(e) (e)
+#endif
+
+struct errorcode32_t {
+ int32_t code; // error number as used on the local host
+
+ errorcode32_t() : code(0) {}
+ // cppcheck-suppress noExplicitConstructor
+ errorcode32_t(int32_t i) : code(i) {} // implicit from int32_t on purpose
+
+ operator int() const { return code; }
+ int* operator&() { return &code; } // address-of is overloaded: yields a pointer to the raw code, not to the wrapper
+ int operator==(int i) { return code == i; } // NOTE(review): these comparison members are non-const
+ int operator>(int i) { return code > i; }
+ int operator>=(int i) { return code >= i; }
+ int operator<(int i) { return code < i; }
+ int operator<=(int i) { return code <= i; }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ __s32 newcode = hostos_to_ceph_errno(code); // passed through the host-to-ceph mapping before encoding
+ encode(newcode, bl);
+ }
+ void decode(bufferlist::const_iterator &bl) {
+ using ceph::decode;
+ decode(code, bl);
+ code = ceph_to_hostos_errno(code); // passed through the ceph-to-host mapping after decoding
+ }
+};
+WRITE_CLASS_ENCODER(errorcode32_t)
+WRITE_EQ_OPERATORS_1(errorcode32_t, code)
+WRITE_CMP_OPERATORS_1(errorcode32_t, code)
+
+template <uint8_t S>
+struct sha_digest_t {
+  constexpr static uint32_t SIZE = S;
+  // TODO: we might consider std::array in the future. Avoiding it for now
+  // as sha_digest_t is a part of our public API.
+  unsigned char v[S] = {0};
+
+  // lowercase hexadecimal rendering of v, two characters per byte
+  string to_str() const {
+    static const char hex_digits[] = "0123456789abcdef";
+    char str[S * 2 + 1] = {0};
+    for (size_t i = 0; i < S; i++) {
+      str[i * 2] = hex_digits[v[i] >> 4];
+      str[i * 2 + 1] = hex_digits[v[i] & 0x0f];
+    }
+    return string(str);
+  }
+  // copies exactly SIZE bytes from _v, so the caller must supply at least that many
+  sha_digest_t(const unsigned char *_v) { memcpy(v, _v, SIZE); };
+  sha_digest_t() {}
+
+  // digests compare bytewise over all SIZE bytes
+  bool operator==(const sha_digest_t& r) const { return ::memcmp(v, r.v, SIZE) == 0; }
+  bool operator!=(const sha_digest_t& r) const { return !(*this == r); }
+
+  void encode(bufferlist &bl) const {
+    // copy to avoid reinterpret_cast, is_pod and other nasty things
+    using ceph::encode;
+    std::array<unsigned char, SIZE> raw;
+    memcpy(raw.data(), v, SIZE);
+    encode(raw, bl);
+  }
+  void decode(bufferlist::const_iterator &bl) {
+    using ceph::decode;
+    std::array<unsigned char, SIZE> raw;
+    decode(raw, bl);
+    memcpy(v, raw.data(), SIZE);
+  }
+};
+
+template <uint8_t S>
+inline ostream &operator<<(ostream &out, const sha_digest_t<S> &b) {
+ string str = b.to_str();
+ return out << str;
+}
+
+using sha1_digest_t = sha_digest_t<20>;
+WRITE_CLASS_ENCODER(sha1_digest_t)
+
+using sha256_digest_t = sha_digest_t<32>;
+WRITE_CLASS_ENCODER(sha256_digest_t)
+
+
+#endif
diff --git a/src/include/unordered_map.h b/src/include/unordered_map.h
new file mode 100644
index 00000000..aee5f5a7
--- /dev/null
+++ b/src/include/unordered_map.h
@@ -0,0 +1,11 @@
+#ifndef CEPH_UNORDERED_MAP_H
+#define CEPH_UNORDERED_MAP_H
+
+#include <unordered_map>
+
+namespace ceph {
+ using std::unordered_map;
+ using std::unordered_multimap;
+}
+
+#endif
diff --git a/src/include/unordered_set.h b/src/include/unordered_set.h
new file mode 100644
index 00000000..e30e1799
--- /dev/null
+++ b/src/include/unordered_set.h
@@ -0,0 +1,10 @@
+#ifndef CEPH_UNORDERED_SET_H
+#define CEPH_UNORDERED_SET_H
+
+#include <unordered_set>
+
+namespace ceph {
+ using std::unordered_set;
+}
+
+#endif
diff --git a/src/include/util.h b/src/include/util.h
new file mode 100644
index 00000000..18aa51ad
--- /dev/null
+++ b/src/include/util.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#ifndef CEPH_UTIL_H
+#define CEPH_UTIL_H
+
+#include "common/Formatter.h"
+#include "include/types.h"
+
+std::string bytes2str(uint64_t count);
+
+struct ceph_data_stats
+{
+ uint64_t byte_total;
+ uint64_t byte_used;
+ uint64_t byte_avail;
+ int avail_percent;
+
+ ceph_data_stats() : // every counter starts at zero
+ byte_total(0),
+ byte_used(0),
+ byte_avail(0),
+ avail_percent(0)
+ { }
+
+ void dump(Formatter *f) const { // writes the four fields to f under fixed key names
+ ceph_assert(f != NULL);
+ f->dump_int("total", byte_total); // NOTE(review): uint64_t fields go through dump_int; confirm huge values are handled
+ f->dump_int("used", byte_used);
+ f->dump_int("avail", byte_avail);
+ f->dump_int("avail_percent", avail_percent);
+ }
+
+ void encode(bufferlist &bl) const { // fields are encoded in declaration order
+ ENCODE_START(1, 1, bl);
+ encode(byte_total, bl);
+ encode(byte_used, bl);
+ encode(byte_avail, bl);
+ encode(avail_percent, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &p) { // reads the fields in the same order encode() wrote them
+ DECODE_START(1, p);
+ decode(byte_total, p);
+ decode(byte_used, p);
+ decode(byte_avail, p);
+ decode(avail_percent, p);
+ DECODE_FINISH(p);
+ }
+
+ static void generate_test_instances(list<ceph_data_stats*>& ls) { // appends two heap-allocated instances to ls
+ ls.push_back(new ceph_data_stats); // first instance: all zeros
+ ls.push_back(new ceph_data_stats); // second instance: filled in below
+ ls.back()->byte_total = 1024*1024;
+ ls.back()->byte_used = 512*1024;
+ ls.back()->byte_avail = 512*1024;
+ ls.back()->avail_percent = 50;
+ }
+};
+typedef struct ceph_data_stats ceph_data_stats_t;
+WRITE_CLASS_ENCODER(ceph_data_stats)
+
+int get_fs_stats(ceph_data_stats_t &stats, const char *path);
+
+/// get memory limit for the current cgroup
+int get_cgroup_memory_limit(uint64_t *limit);
+
+/// collect info from @p uname(2), @p /proc/meminfo and @p /proc/cpuinfo
+void collect_sys_info(map<string, string> *m, CephContext *cct);
+
+/// dump service ids grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service id hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(Formatter* f, const map<string, list<int> >& services, const char* type);
+/// dump service names grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service name hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(Formatter* f, const map<string, list<string> >& services, const char* type);
+
+string cleanbin(bufferlist &bl, bool &b64, bool show = false);
+string cleanbin(string &str);
+
+namespace ceph::util {
+
+// Returns true if s matches any parameters:
+template <typename ...XS>
+bool match_str(const std::string& s, const XS& ...xs)
+{
+  return (... || (s == xs));  // left fold over the candidates; an empty pack yields false
+}
+
+} // namespace ceph::util
+#endif /* CEPH_UTIL_H */
diff --git a/src/include/utime.h b/src/include/utime.h
new file mode 100644
index 00000000..42f9b087
--- /dev/null
+++ b/src/include/utime.h
@@ -0,0 +1,579 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_UTIME_H
+#define CEPH_UTIME_H
+
+#include <math.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+
+#include "include/types.h"
+#include "include/timegm.h"
+#include "common/strtol.h"
+#include "common/ceph_time.h"
+#include "common/safe_io.h"
+#include "common/SubProcess.h"
+#include "include/denc.h"
+
+
+// --------
+// utime_t
+
+// Saturating narrow: yields @p t unchanged when it fits in 32 bits and
+// the largest 32-bit value otherwise.
+inline __u32 cap_to_u32_max(__u64 t) {
+  const __u64 ceiling = std::numeric_limits<uint32_t>::max();
+  return (t < ceiling) ? t : ceiling;
+}
+/* WARNING: If you add a member to utime_t, make sure the encode/decode functions
+ * still work. On little-endian machines they copy the object's raw bytes, so
+ * there must be no padding on either 32-bit or 64-bit machines.
+ * You should also update the padding_check function.
+ */
+class utime_t {
+public:
+ struct {
+ __u32 tv_sec, tv_nsec;
+ } tv;
+
+ public:
+  // True only when both the seconds and the nanoseconds fields are zero.
+  bool is_zero() const {
+    if (tv.tv_sec != 0)
+      return false;
+    return tv.tv_nsec == 0;
+  }
+
+  // Fold whole seconds held in tv_nsec into tv_sec so that tv_nsec ends up
+  // below 1000000000. tv_sec saturates at the 32-bit maximum (see
+  // cap_to_u32_max) instead of wrapping around.
+  void normalize() {
+    const __u64 nsec_per_sec = 1000000000ull;
+    // ">=" rather than ">": exactly one second's worth of nanoseconds has
+    // to be carried as well, otherwise tv_nsec == 1000000000 survives.
+    if (tv.tv_nsec >= nsec_per_sec) {
+      // Do the sum in 64 bits so the saturation check sees the true value
+      // even on platforms where unsigned long is only 32 bits wide.
+      tv.tv_sec = cap_to_u32_max((__u64)tv.tv_sec + tv.tv_nsec / nsec_per_sec);
+      tv.tv_nsec %= nsec_per_sec;
+    }
+  }
+
+  // constructors
+
+  // Zero time.
+  utime_t() : tv{0, 0} {}
+
+  // From seconds plus nanoseconds; the pair is passed through normalize().
+  utime_t(time_t s, int n) {
+    tv.tv_sec = s;
+    tv.tv_nsec = n;
+    normalize();
+  }
+
+  // From a ceph_timespec; both fields are copied as-is.
+  utime_t(const struct ceph_timespec &v) {
+    tv.tv_sec = v.tv_sec;
+    tv.tv_nsec = v.tv_nsec;
+  }
+
+  // From a POSIX timespec; both fields are copied without normalizing.
+  // NOTE: this is used by ceph_clock_now() so should be kept
+  // as thin as possible.
+  utime_t(const struct timespec v)
+  {
+    tv.tv_sec = v.tv_sec;
+    tv.tv_nsec = v.tv_nsec;
+  }
+
+  // conversion from ceph::real_time/coarse_real_time
+  template <typename Clock, typename std::enable_if_t<
+            ceph::converts_to_timespec_v<Clock>>* = nullptr>
+  explicit utime_t(const std::chrono::time_point<Clock>& t)
+    : utime_t(Clock::to_timespec(t)) {} // forward to timespec ctor
+
+  // From a timeval (by reference or by pointer); microseconds are scaled
+  // to nanoseconds.
+  utime_t(const struct timeval &v) {
+    tv.tv_sec = v.tv_sec;
+    tv.tv_nsec = v.tv_usec*1000;
+  }
+  utime_t(const struct timeval *v) {
+    tv.tv_sec = v->tv_sec;
+    tv.tv_nsec = v->tv_usec*1000;
+  }
+  // Copy both fields into the POSIX timespec that @p ts points to.
+  void to_timespec(struct timespec *ts) const {
+    struct timespec &dst = *ts;
+    dst.tv_sec = tv.tv_sec;
+    dst.tv_nsec = tv.tv_nsec;
+  }
+
+  // Set from a floating-point number of seconds: the truncated integer
+  // part becomes tv_sec and the remainder, scaled by 1e9, becomes tv_nsec.
+  void set_from_double(double d) {
+    const __u32 whole = (__u32)trunc(d);
+    tv.tv_sec = whole;
+    tv.tv_nsec = (__u32)((d - (double)whole) * 1000000000.0);
+  }
+
+  // Convert to ceph::real_clock's time point, going through the
+  // ceph_timespec representation.
+  real_time to_real_time() const {
+    ceph_timespec as_ceph_ts;
+    encode_timeval(&as_ceph_ts);
+    return ceph::real_clock::from_ceph_timespec(as_ceph_ts);
+  }
+
+  // accessors (by value)
+  time_t sec() const {
+    return tv.tv_sec;
+  }
+  // sub-second part in microseconds (nanoseconds / 1000, truncated)
+  long usec() const {
+    return tv.tv_nsec/1000;
+  }
+  int nsec() const {
+    return tv.tv_nsec;
+  }
+
+  // ref accessors/modifiers: direct, unnormalized access to the fields
+  __u32& sec_ref() {
+    return tv.tv_sec;
+  }
+  __u32& nsec_ref() {
+    return tv.tv_nsec;
+  }
+
+  // Whole value expressed in nanoseconds.
+  uint64_t to_nsec() const {
+    const uint64_t from_sec = (uint64_t)tv.tv_sec * 1000000000ull;
+    return from_sec + (uint64_t)tv.tv_nsec;
+  }
+  // Whole value expressed in milliseconds (sub-millisecond part dropped).
+  uint64_t to_msec() const {
+    const uint64_t from_sec = (uint64_t)tv.tv_sec * 1000ull;
+    return from_sec + (uint64_t)tv.tv_nsec / 1000000ull;
+  }
+
+  // Write this time into @p v, reducing nanoseconds to microseconds.
+  void copy_to_timeval(struct timeval *v) const {
+    struct timeval &dst = *v;
+    dst.tv_sec = tv.tv_sec;
+    dst.tv_usec = tv.tv_nsec/1000;
+  }
+  // Load this time from @p v, scaling microseconds up to nanoseconds.
+  // No normalization is applied.
+  void set_from_timeval(const struct timeval *v) {
+    const struct timeval &src = *v;
+    tv.tv_sec = src.tv_sec;
+    tv.tv_nsec = src.tv_usec*1000;
+  }
+  // Compile-time guard for the raw-byte encode()/decode() paths below:
+  // the object must be exactly as large as its two tv fields together.
+  void padding_check() {
+    static_assert(
+      sizeof(tv.tv_sec) + sizeof(tv.tv_nsec) == sizeof(utime_t),
+      "utime_t have padding");
+  }
+
+  // Append tv_sec followed by tv_nsec to @p bl.
+  void encode(bufferlist &bl) const {
+#if defined(CEPH_LITTLE_ENDIAN)
+    // little-endian build: the leading two __u32 of the object are
+    // appended byte for byte
+    const size_t raw_len = sizeof(__u32) + sizeof(__u32);
+    bl.append((char *)(this), raw_len);
+#else
+    using ceph::encode;
+    encode(tv.tv_sec, bl);
+    encode(tv.tv_nsec, bl);
+#endif
+  }
+
+  // Read tv_sec followed by tv_nsec from @p p; mirror image of encode().
+  void decode(bufferlist::const_iterator &p) {
+#if defined(CEPH_LITTLE_ENDIAN)
+    // little-endian build: copy straight over the leading two __u32 of
+    // the object
+    const size_t raw_len = sizeof(__u32) + sizeof(__u32);
+    p.copy(raw_len, (char *)(this));
+#else
+    using ceph::decode;
+    decode(tv.tv_sec, p);
+    decode(tv.tv_nsec, p);
+#endif
+  }
+
+  // denc description: the two fields in the same order as encode()/decode()
+  DENC(utime_t, v, p) {
+    denc(v.tv.tv_sec, p);
+    denc(v.tv.tv_nsec, p);
+  }
+
+
+  // Store both fields into the ceph_timespec that @p t points to.
+  void encode_timeval(struct ceph_timespec *t) const {
+    struct ceph_timespec &dst = *t;
+    dst.tv_sec = tv.tv_sec;
+    dst.tv_nsec = tv.tv_nsec;
+  }
+  // Load both fields from the ceph_timespec that @p t points to.
+  void decode_timeval(const struct ceph_timespec *t) {
+    const struct ceph_timespec &src = *t;
+    tv.tv_sec = src.tv_sec;
+    tv.tv_nsec = src.tv_nsec;
+  }
+
+private:
+  // Shared worker for round_to_minute/hour/day: break the seconds field
+  // down with localtime_r(), zero tm_sec plus whichever coarser fields are
+  // requested, and rebuild the time with mktime(). The nanoseconds of the
+  // result are always zero.
+  utime_t truncate_local(bool zero_min, bool zero_hour) const {
+    struct tm bdt;
+    time_t tt = sec();
+    localtime_r(&tt, &bdt);
+    bdt.tm_sec = 0;
+    if (zero_min)
+      bdt.tm_min = 0;
+    if (zero_hour)
+      bdt.tm_hour = 0;
+    tt = mktime(&bdt);
+    return utime_t(tt, 0);
+  }
+
+public:
+  // Despite their names these three truncate (they never round up): the
+  // result is the start of the local-time minute, hour or day containing
+  // this time. They do not modify the object, so they are const.
+  utime_t round_to_minute() const {
+    return truncate_local(false, false);
+  }
+
+  utime_t round_to_hour() const {
+    return truncate_local(true, false);
+  }
+
+  utime_t round_to_day() const {
+    return truncate_local(true, true);
+  }
+
+  // cast to double: seconds, with the nanoseconds as the fractional part
+  operator double() const {
+    const double whole = (double)sec();
+    return whole + ((double)nsec() / 1000000000.0L);
+  }
+  // cast to ceph_timespec: both fields copied through the accessors
+  operator ceph_timespec() const {
+    ceph_timespec converted;
+    converted.tv_sec = sec();
+    converted.tv_nsec = nsec();
+    return converted;
+  }
+
+  // Hand this value to nanosleep() as the requested interval; the
+  // remaining-time output and the return value are not looked at.
+  void sleep() const {
+    struct timespec interval;
+    to_timespec(&interval);
+    nanosleep(&interval, NULL);
+  }
+
+  // output
+  // Write this time to @p out using UTC. Values below ten 365-day years'
+  // worth of seconds are taken to be relative and are printed as
+  // "<sec>.<usec>"; anything else is printed as
+  // "YYYY-MM-DD HH:MM:SS.uuuuuuZ". The stream's fill character is put
+  // back before returning.
+  ostream& gmtime(ostream& out) const {
+    out.setf(std::ios::right);
+    const char saved_fill = out.fill('0');
+    const time_t secs = sec();
+    if (secs >= ((time_t)(60*60*24*365*10))) {
+      // this looks like an absolute time.
+      // aim for http://en.wikipedia.org/wiki/ISO_8601
+      struct tm utc;
+      gmtime_r(&secs, &utc);
+      out << std::setw(4) << (utc.tm_year+1900);
+      out << '-' << std::setw(2) << (utc.tm_mon+1);
+      out << '-' << std::setw(2) << utc.tm_mday;
+      out << ' ';
+      out << std::setw(2) << utc.tm_hour;
+      out << ':' << std::setw(2) << utc.tm_min;
+      out << ':' << std::setw(2) << utc.tm_sec;
+      out << "." << std::setw(6) << usec();
+      out << "Z";
+    } else {
+      // raw seconds.  this looks like a relative time.
+      out << (long)secs << "." << std::setw(6) << usec();
+    }
+    out.fill(saved_fill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  // output
+  // Same as gmtime(), except that an absolute time carries nine
+  // fractional digits (nanoseconds): "YYYY-MM-DD HH:MM:SS.nnnnnnnnnZ".
+  // A relative time is still printed as "<sec>.<usec>".
+  ostream& gmtime_nsec(ostream& out) const {
+    out.setf(std::ios::right);
+    const char saved_fill = out.fill('0');
+    const time_t secs = sec();
+    if (secs >= ((time_t)(60*60*24*365*10))) {
+      // this looks like an absolute time.
+      // aim for http://en.wikipedia.org/wiki/ISO_8601
+      struct tm utc;
+      gmtime_r(&secs, &utc);
+      out << std::setw(4) << (utc.tm_year+1900);
+      out << '-' << std::setw(2) << (utc.tm_mon+1);
+      out << '-' << std::setw(2) << utc.tm_mday;
+      out << ' ';
+      out << std::setw(2) << utc.tm_hour;
+      out << ':' << std::setw(2) << utc.tm_min;
+      out << ':' << std::setw(2) << utc.tm_sec;
+      out << "." << std::setw(9) << nsec();
+      out << "Z";
+    } else {
+      // raw seconds.  this looks like a relative time.
+      out << (long)secs << "." << std::setw(6) << usec();
+    }
+    out.fill(saved_fill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  // output
+  // Write this time to @p out. A relative time (see gmtime()) is printed
+  // as "<sec>.<usec>"; an absolute one is broken down in UTC and printed
+  // in asctime_r() format with the trailing newline removed.
+  ostream& asctime(ostream& out) const {
+    out.setf(std::ios::right);
+    const char saved_fill = out.fill('0');
+    const time_t secs = sec();
+    if (secs >= ((time_t)(60*60*24*365*10))) {
+      // this looks like an absolute time.
+      struct tm utc;
+      gmtime_r(&secs, &utc);
+
+      char text[128];
+      asctime_r(&utc, text);
+      const int len = strlen(text);
+      char &last = text[len - 1];
+      if (last == '\n')
+        last = '\0';
+      out << text;
+    } else {
+      // raw seconds.  this looks like a relative time.
+      out << (long)secs << "." << std::setw(6) << usec();
+    }
+    out.fill(saved_fill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  // Write this time to @p out using the local time zone. A relative time
+  // (see gmtime()) is printed as "<sec>.<usec>"; an absolute one as
+  // "YYYY-MM-DD HH:MM:SS.uuuuuu" with no zone suffix.
+  ostream& localtime(ostream& out) const {
+    out.setf(std::ios::right);
+    const char saved_fill = out.fill('0');
+    const time_t secs = sec();
+    if (secs >= ((time_t)(60*60*24*365*10))) {
+      // this looks like an absolute time.
+      // aim for http://en.wikipedia.org/wiki/ISO_8601
+      struct tm local;
+      localtime_r(&secs, &local);
+      out << std::setw(4) << (local.tm_year+1900);
+      out << '-' << std::setw(2) << (local.tm_mon+1);
+      out << '-' << std::setw(2) << local.tm_mday;
+      out << ' ';
+      out << std::setw(2) << local.tm_hour;
+      out << ':' << std::setw(2) << local.tm_min;
+      out << ':' << std::setw(2) << local.tm_sec;
+      out << "." << std::setw(6) << usec();
+      //out << '_' << local.tm_zone;
+    } else {
+      // raw seconds.  this looks like a relative time.
+      out << (long)secs << "." << std::setw(6) << usec();
+    }
+    out.fill(saved_fill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  // Format this time in the local zone as "YYYY-MM-DD HH:MM:SS.uuuuuu"
+  // into @p out (at most @p outlen bytes).
+  // @return whatever ::snprintf() returns
+  int sprintf(char *out, int outlen) const {
+    const time_t secs = sec();
+    struct tm local;
+    localtime_r(&secs, &local);
+
+    const int year = local.tm_year + 1900;
+    const int month = local.tm_mon + 1;
+    return ::snprintf(out, outlen,
+                      "%04d-%02d-%02d %02d:%02d:%02d.%06ld",
+                      year, month, local.tm_mday,
+                      local.tm_hour, local.tm_min, local.tm_sec, usec());
+  }
+
+  // Format the plain time_t @p tt in the local zone as
+  // "YYYY-MM-DD HH:MM:SS" into @p out (at most @p outlen bytes).
+  // @return whatever ::snprintf() returns
+  static int snprintf(char *out, int outlen, time_t tt) {
+    struct tm local;
+    localtime_r(&tt, &local);
+
+    const int year = local.tm_year + 1900;
+    const int month = local.tm_mon + 1;
+    return ::snprintf(out, outlen,
+                      "%04d-%02d-%02d %02d:%02d:%02d",
+                      year, month, local.tm_mday,
+                      local.tm_hour, local.tm_min, local.tm_sec);
+  }
+
+  // Run "/bin/date -d <date_str> '+%s %N'" and turn its output (seconds
+  // since the epoch, then nanoseconds) into @p result.
+  // @param date_str text passed to date's -d option
+  // @param result   receives the parsed time on success
+  // @return 0 on success; the negative value from spawn() if the child
+  //         could not be started; -EINVAL if the child failed, printed
+  //         nothing, or printed something that does not start with a number
+  static int invoke_date(const std::string& date_str, utime_t *result) {
+    char buf[256];
+
+    SubProcess bin_date("/bin/date", SubProcess::CLOSE, SubProcess::PIPE,
+                        SubProcess::KEEP);
+    bin_date.add_cmd_args("-d", date_str.c_str(), "+%s %N", NULL);
+
+    int r = bin_date.spawn();
+    if (r < 0) return r;
+
+    ssize_t n = safe_read(bin_date.get_stdout(), buf, sizeof(buf));
+
+    r = bin_date.join();
+    if (r || n <= 0) return -EINVAL;
+
+    // Nothing above NUL-terminates buf, so give the stream exactly the n
+    // bytes that were read instead of treating buf as a C string.
+    const std::string output(buf, n);
+    std::istringstream iss(output);
+
+    uint64_t epoch = 0, nsec = 0;
+    if (!(iss >> epoch))
+      return -EINVAL;
+    // best effort: nsec stays 0 if no nanoseconds field could be read
+    iss >> nsec;
+
+    *result = utime_t(epoch, nsec);
+
+    return 0;
+  }
+
+
+  // Parse a textual date into seconds since the epoch plus nanoseconds.
+  //
+  // Two input forms are handled:
+  //  - "YYYY-MM-DD", optionally followed by ' ' or 'T' and a time of day
+  //    "HH:MM:SS" with an optional ".<digits>" fraction and an optional
+  //    "+"/"-" zone offset (handed to strptime() as %z);
+  //  - otherwise "<sec>.<usec>" given as two decimal integers.
+  //
+  // @param date     text to parse
+  // @param epoch    if non-NULL, receives the seconds since the epoch
+  // @param nsec     if non-NULL, receives the sub-second part in nanoseconds
+  // @param out_date if non-NULL, receives the date formatted with "%F"
+  // @param out_time if non-NULL, receives the time formatted with "%T"
+  // @return 0 on success, -EINVAL if the text could not be parsed
+  static int parse_date(const string& date, uint64_t *epoch, uint64_t *nsec,
+                        string *out_date=NULL, string *out_time=NULL) {
+    struct tm tm;
+    memset(&tm, 0, sizeof(tm));
+
+    if (nsec)
+      *nsec = 0;
+
+    const char *p = strptime(date.c_str(), "%Y-%m-%d", &tm);
+    if (p) {
+      if (*p == ' ' || *p == 'T') {
+        p++;
+        // strptime doesn't understand fractional/decimal seconds, and
+        // it also only takes format chars or literals, so we have to
+        // get creative: copy the rest of the input into fmt and overwrite
+        // the "HH:MM:SS" positions with conversion specifiers. fmt[5] is
+        // deliberately left as copied (the input's second separator), and
+        // everything after the seconds stays in fmt as literal text.
+        char fmt[32] = {0};
+        strncpy(fmt, p, sizeof(fmt) - 1);
+        fmt[0] = '%';
+        fmt[1] = 'H';
+        fmt[2] = ':';
+        fmt[3] = '%';
+        fmt[4] = 'M';
+        fmt[6] = '%';
+        fmt[7] = 'S';
+        const char *subsec = 0;
+        char *q = fmt + 8;
+        if (*q == '.') {
+          // fmt[8] was copied from p[8], so p + 9 is still inside the input
+          subsec = p + 9;
+          q = fmt + 9;
+          // unsigned char cast: isdigit() is undefined for negative values
+          while (*q && isdigit((unsigned char)*q)) {
+            ++q;
+          }
+        }
+        // look for tz...
+        if (*q == '-' || *q == '+') {
+          // "%z" plus its terminator occupies q[0..2]; a sign sitting in
+          // the last copied slot of fmt would push the terminator one
+          // byte past the end of the array, so reject such input.
+          if (q + 2 >= fmt + sizeof(fmt)) {
+            return -EINVAL;
+          }
+          *q = '%';
+          *(q+1) = 'z';
+          *(q+2) = 0;
+        }
+        p = strptime(p, fmt, &tm);
+        if (!p) {
+          return -EINVAL;
+        }
+        if (nsec && subsec) {
+          unsigned i;
+          char buf[10]; /* 9 digit + null termination */
+          // take up to nine fraction digits, then right-pad with zeros so
+          // the text always reads as nanoseconds
+          for (i = 0; (i < sizeof(buf) - 1) && isdigit((unsigned char)*subsec);
+               ++i, ++subsec) {
+            buf[i] = *subsec;
+          }
+          for (; i < sizeof(buf) - 1; ++i) {
+            buf[i] = '0';
+          }
+          buf[i] = '\0';
+          string err;
+          *nsec = (uint64_t)strict_strtol(buf, 10, &err);
+          if (!err.empty()) {
+            return -EINVAL;
+          }
+        }
+      }
+    } else {
+      int sec, usec;
+      int r = sscanf(date.c_str(), "%d.%d", &sec, &usec);
+      if (r != 2) {
+        return -EINVAL;
+      }
+
+      time_t tt = sec;
+      gmtime_r(&tt, &tm);
+
+      if (nsec) {
+        *nsec = (uint64_t)usec * 1000;
+      }
+    }
+
+    // apply the tm_gmtoff manually below, since none of mktime,
+    // gmtime, and localtime seem to do it.  zero it out here just in
+    // case some other libc *does* apply it.  :(
+    auto gmtoff = tm.tm_gmtoff;
+    tm.tm_gmtoff = 0;
+
+    time_t t = internal_timegm(&tm);
+    // epoch is optional: only touch it (including the zone adjustment)
+    // when the caller actually passed a destination
+    if (epoch) {
+      *epoch = (uint64_t)t;
+      *epoch -= gmtoff;
+    }
+
+    if (out_date) {
+      char buf[32];
+      strftime(buf, sizeof(buf), "%F", &tm);
+      *out_date = buf;
+    }
+    if (out_time) {
+      char buf[32];
+      strftime(buf, sizeof(buf), "%T", &tm);
+      *out_time = buf;
+    }
+
+    return 0;
+  }
+
+  // Parse @p s with parse_date() and, on success, replace this value with
+  // the result. On failure the object is left untouched.
+  // @return true if @p s was parsed, false otherwise
+  bool parse(const string& s) {
+    uint64_t parsed_sec, parsed_nsec;
+    if (parse_date(s, &parsed_sec, &parsed_nsec) < 0)
+      return false;
+    *this = utime_t(parsed_sec, parsed_nsec);
+    return true;
+  }
+};
+WRITE_CLASS_ENCODER(utime_t)
+WRITE_CLASS_DENC(utime_t)
+
+// arithmetic operators
+
+// Sum of two times. Seconds saturate at the 32-bit maximum; the summed
+// nanoseconds are handed to the (sec, nsec) constructor.
+inline utime_t operator+(const utime_t& l, const utime_t& r) {
+  const __u64 sec_sum = (__u64)l.sec() + r.sec();
+  const int nsec_sum = l.nsec() + r.nsec();
+  return utime_t(cap_to_u32_max(sec_sum), nsec_sum);
+}
+
+// In-place sum of two times, followed by normalize().
+inline utime_t& operator+=(utime_t& l, const utime_t& r) {
+  const __u64 sec_sum = (__u64)l.sec() + r.sec();
+  l.sec_ref() = cap_to_u32_max(sec_sum);
+  l.nsec_ref() += r.nsec();
+  l.normalize();
+  return l;
+}
+
+// Add @p f seconds in place: the truncated integer part goes to the
+// seconds, the remainder (scaled by 1e9) to the nanoseconds, then
+// normalize() is applied.
+inline utime_t& operator+=(utime_t& l, double f) {
+  const double whole = trunc(f);
+  const double frac_ns = (f - whole) * 1000000000.0;
+  l.sec_ref() = cap_to_u32_max(l.sec() + (__u64)whole);
+  l.nsec_ref() += (long)frac_ns;
+  l.normalize();
+  return l;
+}
+
+// Difference of two times. When l has fewer nanoseconds than r, one
+// second is borrowed.
+inline utime_t operator-(const utime_t& l, const utime_t& r) {
+  const bool borrow = l.nsec() < r.nsec();
+  const time_t sec_diff = l.sec() - r.sec() - (borrow ? 1:0);
+  const int nsec_diff = l.nsec() - r.nsec() + (borrow ? 1000000000:0);
+  return utime_t(sec_diff, nsec_diff);
+}
+
+// In-place difference of two times, with the same borrow rule.
+inline utime_t& operator-=(utime_t& l, const utime_t& r) {
+  const bool borrow = l.nsec() < r.nsec();
+  l.sec_ref() -= r.sec();
+  if (borrow) {
+    l.sec_ref()--;
+    l.nsec_ref() += 1000000000L - r.nsec();
+  } else {
+    l.nsec_ref() -= r.nsec();
+  }
+  return l;
+}
+// Subtract @p f seconds in place: the truncated integer part is taken
+// from the seconds and the remainder (scaled by 1e9) from the
+// nanoseconds.
+//
+// A second is borrowed only when the nanoseconds would really go
+// negative. Borrowing unconditionally and relying on normalize() to carry
+// it back breaks when the seconds are 0: the decrement wraps tv_sec to the
+// 32-bit maximum and normalize()'s saturating carry cannot undo that.
+inline utime_t& operator-=(utime_t& l, double f) {
+  double fs = trunc(f);
+  double ns = (f - fs) * 1000000000.0;
+  l.sec_ref() -= (long)fs;
+  long nsl = (long)ns;
+  // nsl lies strictly between -1e9 and 1e9, so a single borrow or a single
+  // carry is enough for a normalized left-hand side.
+  int64_t nsec = (int64_t)l.nsec_ref() - nsl;
+  if (nsec < 0) {
+    nsec += 1000000000;
+    l.sec_ref()--;
+  } else if (nsec >= 1000000000) {
+    // negative f: the subtraction added nanoseconds
+    nsec -= 1000000000;
+    l.sec_ref() = cap_to_u32_max((__u64)l.sec_ref() + 1);
+  }
+  l.nsec_ref() = (__u32)nsec;
+  // still covers a left-hand side that was not normalized to begin with
+  l.normalize();
+  return l;
+}
+
+
+// comparators: order by seconds first, nanoseconds break ties
+inline bool operator>(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return a.sec() > b.sec();
+  return a.nsec() > b.nsec();
+}
+inline bool operator<=(const utime_t& a, const utime_t& b)
+{
+  return !(a > b);
+}
+inline bool operator<(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return a.sec() < b.sec();
+  return a.nsec() < b.nsec();
+}
+inline bool operator>=(const utime_t& a, const utime_t& b)
+{
+  return !(a < b);
+}
+
+inline bool operator==(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return false;
+  return a.nsec() == b.nsec();
+}
+inline bool operator!=(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return true;
+  return a.nsec() != b.nsec();
+}
+
+
+// output
+
+// ostream: streaming a utime_t prints it through utime_t::localtime()
+inline std::ostream& operator<<(std::ostream& out, const utime_t& t)
+{
+  t.localtime(out);
+  return out;
+}
+
+// Interpret @p age as a duration (seconds plus nanoseconds) and format it
+// with timespan_str().
+inline std::string utimespan_str(const utime_t& age) {
+  const ceph::timespan frac(age.nsec());
+  const std::chrono::seconds whole(age.sec());
+  auto age_ts = frac + whole;
+  return timespan_str(age_ts);
+}
+
+#endif
diff --git a/src/include/uuid.h b/src/include/uuid.h
new file mode 100644
index 00000000..f957f87a
--- /dev/null
+++ b/src/include/uuid.h
@@ -0,0 +1,83 @@
+#ifndef _CEPH_UUID_H
+#define _CEPH_UUID_H
+
+/*
+ * Thin C++ wrapper around boost::uuids.
+ */
+
+#include "encoding.h"
+
+#include <ostream>
+#include <random>
+
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+// Value type holding one boost::uuids::uuid, with helpers for generating,
+// parsing, printing and encoding it.
+struct uuid_d {
+  boost::uuids::uuid uuid;
+
+  // A default-constructed uuid_d holds the nil UUID.
+  uuid_d() {
+    uuid = boost::uuids::nil_generator()();
+  }
+
+  // True while the value is the nil UUID.
+  bool is_zero() const {
+    return uuid.is_nil();
+  }
+
+  // Replace the value with a random UUID drawn from std::random_device.
+  void generate_random() {
+    std::random_device entropy;
+    boost::uuids::basic_random_generator make_uuid(entropy);
+    uuid = make_uuid();
+  }
+
+  // Parse the textual UUID in @p s.
+  // @return true on success; false if the generator threw
+  //         std::runtime_error
+  bool parse(const char *s) {
+    boost::uuids::string_generator from_text;
+    try {
+      uuid = from_text(s);
+    } catch (std::runtime_error& e) {
+      return false;
+    }
+    return true;
+  }
+
+  // Copy the textual form into @p s. Exactly 37 bytes are written, so the
+  // caller must supply a buffer of at least that size.
+  void print(char *s) const {
+    const std::string text = boost::uuids::to_string(uuid);
+    memcpy(s, text.c_str(), 37);
+  }
+
+  std::string to_string() const {
+    return boost::uuids::to_string(uuid);
+  }
+
+  // Raw access to the UUID's bytes (constness is cast away).
+  char *bytes() const {
+    return (char*)uuid.data;
+  }
+
+  void encode(bufferlist& bl) const {
+    ::encode_raw(uuid, bl);
+  }
+
+  // NOTE(review): declared const although it hands uuid to decode_raw() as
+  // the decode destination; confirm whether the qualifier is intended.
+  void decode(bufferlist::const_iterator& p) const {
+    ::decode_raw(uuid, p);
+  }
+};
+WRITE_CLASS_ENCODER(uuid_d)
+
+// Stream the textual form of @p u.
+inline std::ostream& operator<<(std::ostream& out, const uuid_d& u) {
+  return out << u.to_string();
+}
+
+inline bool operator==(const uuid_d& l, const uuid_d& r) {
+  return !(l.uuid != r.uuid);
+}
+inline bool operator!=(const uuid_d& l, const uuid_d& r) {
+  return !(l.uuid == r.uuid);
+}
+// Ordering is defined on the textual forms of the two UUIDs.
+inline bool operator<(const uuid_d& l, const uuid_d& r) {
+  const std::string lhs = l.to_string();
+  const std::string rhs = r.to_string();
+  return lhs < rhs;
+}
+
+
+#endif
diff --git a/src/include/xlist.h b/src/include/xlist.h
new file mode 100644
index 00000000..733a318a
--- /dev/null
+++ b/src/include/xlist.h
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XLIST_H
+#define CEPH_XLIST_H
+
+#include <iterator>
+#include <cstdlib>
+#include <ostream>
+
+#include "include/ceph_assert.h"
+
+template<typename T>
+class xlist {
+public:
+  // Link node: wraps one T together with the prev/next pointers and a
+  // back-pointer to the list it is currently on (null when on none).
+  // Not copyable or movable; must be off every list when destroyed.
+  class item {
+  public:
+    item(T i) : _item(i) {}
+    ~item() {
+      ceph_assert(!is_on_list());
+    }
+
+    item(const item& other) = delete;
+    item(item&& other) = delete;
+    const item& operator= (const item& right) = delete;
+    item& operator= (item&& right) = delete;
+
+    xlist* get_list() {
+      return _list;
+    }
+    bool is_on_list() const {
+      return _list != nullptr;
+    }
+    // Unlink from the owning list, if any.
+    // @return true if the item was on a list, false otherwise
+    bool remove_myself() {
+      if (!_list)
+        return false;
+      _list->remove(this);
+      ceph_assert(_list == 0);
+      return true;
+    }
+    // Re-insert at the head / tail of the list the item is already on.
+    void move_to_front() {
+      ceph_assert(_list);
+      _list->push_front(this);
+    }
+    void move_to_back() {
+      ceph_assert(_list);
+      _list->push_back(this);
+    }
+
+  private:
+    friend xlist;
+    T _item;
+    item *_prev = nullptr, *_next = nullptr;
+    xlist *_list = nullptr;
+  };
+
+ typedef item* value_type;
+ typedef item* const_reference;
+
+private:
+  item *_front, *_back;   // head and tail, both null when empty
+  size_t _size;           // number of linked items
+
+public:
+  // NOTE(review): copies only the head/tail pointers and the count; the
+  // items' _list back-pointers are not updated.
+  xlist(const xlist& other)
+    : _front(other._front), _back(other._back), _size(other._size) {}
+
+  xlist() : _front(0), _back(0), _size(0) {}
+
+  // The list must already be empty when it is destroyed.
+  ~xlist() {
+    ceph_assert(_size == 0);
+    ceph_assert(_front == 0);
+    ceph_assert(_back == 0);
+  }
+
+  // Number of linked items (asserts that head pointer and count agree).
+  size_t size() const {
+    ceph_assert((bool)_front == (bool)_size);
+    return _size;
+  }
+  bool empty() const {
+    ceph_assert((bool)_front == (bool)_size);
+    return !_front;
+  }
+
+  // Unlink every item, front to back. The items themselves are not
+  // destroyed.
+  void clear() {
+    for (item *head = _front; head; head = _front)
+      remove(head);
+    ceph_assert((bool)_front == (bool)_size);
+  }
+
+  // Link @p i in as the new head. An item that is currently on a list
+  // (this one included) is unlinked from it first.
+  void push_front(item *i) {
+    if (xlist *owner = i->_list)
+      owner->remove(i);
+
+    item *old_front = _front;
+    i->_list = this;
+    i->_prev = 0;
+    i->_next = old_front;
+    if (old_front)
+      old_front->_prev = i;
+    else
+      _back = i;   // list was empty: i is the tail as well
+    _front = i;
+    ++_size;
+  }
+  // Link @p i in as the new tail. An item that is currently on a list
+  // (this one included) is unlinked from it first.
+  void push_back(item *i) {
+    if (xlist *owner = i->_list)
+      owner->remove(i);
+
+    item *old_back = _back;
+    i->_list = this;
+    i->_next = 0;
+    i->_prev = old_back;
+    if (old_back)
+      old_back->_next = i;
+    else
+      _front = i;   // list was empty: i is the head as well
+    _back = i;
+    ++_size;
+  }
+  // Unlink @p i, which must currently be on this list, and reset its
+  // link and owner pointers.
+  void remove(item *i) {
+    ceph_assert(i->_list == this);
+
+    item *before = i->_prev;
+    item *after = i->_next;
+    if (before)
+      before->_next = after;
+    else
+      _front = after;   // i was the head
+    if (after)
+      after->_prev = before;
+    else
+      _back = before;   // i was the tail
+    --_size;
+
+    i->_list = 0;
+    i->_next = i->_prev = 0;
+    ceph_assert((bool)_front == (bool)_size);
+  }
+
+  // Value stored in the head / tail item, returned by value. No emptiness
+  // check is made here.
+  T front() {
+    return static_cast<T>(_front->_item);
+  }
+  const T front() const {
+    return static_cast<const T>(_front->_item);
+  }
+
+  T back() {
+    return static_cast<T>(_back->_item);
+  }
+  const T back() const {
+    return static_cast<const T>(_back->_item);
+  }
+
+  // Unlink the head / tail item; the list must not be empty.
+  void pop_front() {
+    ceph_assert(!empty());
+    remove(_front);
+  }
+  void pop_back() {
+    ceph_assert(!empty());
+    remove(_back);
+  }
+
+  // Forward-only cursor over the items; dereferencing yields the stored T
+  // by value. A null cursor is the end position.
+  class iterator: std::iterator<std::forward_iterator_tag, T> {
+  private:
+    item *cur;
+  public:
+    iterator(item *i = 0) : cur(i) {}
+    T operator*() {
+      return static_cast<T>(cur->_item);
+    }
+    // Advance; the current item must exist and still be linked.
+    iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur->_list);
+      cur = cur->_next;
+      return *this;
+    }
+    bool end() const {
+      return !cur;
+    }
+    bool operator==(const iterator& rhs) const {
+      return rhs.cur == cur;
+    }
+    bool operator!=(const iterator& rhs) const {
+      return rhs.cur != cur;
+    }
+  };
+
+  iterator begin() {
+    return iterator(_front);
+  }
+  iterator end() {
+    return iterator(NULL);
+  }
+
+  // Forward-only cursor used when iterating a const xlist; dereferencing
+  // yields the stored T by value. A null cursor is the end position.
+  class const_iterator: std::iterator<std::forward_iterator_tag, T> {
+  private:
+    item *cur;
+  public:
+    const_iterator(item *i = 0) : cur(i) {}
+    // const: dereferencing does not change the cursor
+    const T operator*() const { return static_cast<const T>(cur->_item); }
+    // Advance; the current item must exist and still be linked.
+    const_iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur->_list);
+      cur = cur->_next;
+      return *this;
+    }
+    bool end() const { return cur == 0; }
+    // The comparisons take a const reference so that a temporary such as
+    // the result of end() can appear on the right-hand side
+    // (e.g. "it != list.end()").
+    bool operator==(const const_iterator& rhs) const {
+      return cur == rhs.cur;
+    }
+    bool operator!=(const const_iterator& rhs) const {
+      return cur != rhs.cur;
+    }
+  };
+
+  const_iterator begin() const { return const_iterator(_front); }
+  const_iterator end() const { return const_iterator(NULL); }
+
+  // Print the elements separated by ", ". Each element is dereferenced
+  // before being streamed, so T is expected to be pointer-like.
+  friend std::ostream &operator<<(std::ostream &oss, const xlist<T> &list) {
+    bool need_separator = false;
+    for (const auto &item : list) {
+      if (need_separator)
+        oss << ", ";
+      oss << *item; /* item should be a pointer */
+      need_separator = true;
+    }
+    return oss;
+  }
+};
+
+
+#endif