summaryrefslogtreecommitdiffstats
path: root/src/include
diff options
context:
space:
mode:
Diffstat (limited to 'src/include')
-rw-r--r--src/include/CMakeLists.txt38
-rw-r--r--src/include/CompatSet.h285
-rw-r--r--src/include/Context.h535
-rw-r--r--src/include/Distribution.h73
-rw-r--r--src/include/addr_parsing.h28
-rw-r--r--src/include/alloc_ptr.h91
-rw-r--r--src/include/any.h704
-rw-r--r--src/include/bitmapper.h48
-rw-r--r--src/include/blobhash.h53
-rw-r--r--src/include/btree_map.h68
-rw-r--r--src/include/buffer.h1284
-rw-r--r--src/include/buffer_fwd.h19
-rw-r--r--src/include/buffer_raw.h126
-rw-r--r--src/include/byteorder.h120
-rw-r--r--src/include/ceph_assert.h147
-rw-r--r--src/include/ceph_features.h279
-rw-r--r--src/include/ceph_frag.h109
-rw-r--r--src/include/ceph_fs.h1007
-rw-r--r--src/include/ceph_fuse.h51
-rw-r--r--src/include/ceph_hash.h14
-rw-r--r--src/include/cephfs/ceph_ll_client.h157
-rw-r--r--src/include/cephfs/libcephfs.h2126
-rw-r--r--src/include/cephfs/metrics/Types.h699
-rw-r--r--src/include/cmp.h205
-rw-r--r--src/include/color.h13
-rw-r--r--src/include/common_fwd.h32
-rw-r--r--src/include/compact_map.h383
-rw-r--r--src/include/compact_set.h305
-rw-r--r--src/include/compat.h401
-rw-r--r--src/include/config-h.in.cmake393
-rw-r--r--src/include/coredumpctl.h105
-rw-r--r--src/include/counter.h56
-rw-r--r--src/include/cpp-btree/btree.h2565
-rw-r--r--src/include/cpp-btree/btree_container.h543
-rw-r--r--src/include/cpp-btree/btree_map.h159
-rw-r--r--src/include/cpp-btree/btree_set.h655
-rw-r--r--src/include/crc32c.h57
-rw-r--r--src/include/demangle.h48
-rw-r--r--src/include/denc.h1903
-rw-r--r--src/include/dlfcn_compat.h48
-rw-r--r--src/include/elist.h193
-rw-r--r--src/include/encoding.h1531
-rw-r--r--src/include/err.h31
-rw-r--r--src/include/error.h41
-rw-r--r--src/include/event_type.h24
-rw-r--r--src/include/expected.hpp2282
-rw-r--r--src/include/filepath.h250
-rw-r--r--src/include/frag.h615
-rw-r--r--src/include/fs_types.h172
-rw-r--r--src/include/function2.hpp1581
-rw-r--r--src/include/hash.h64
-rw-r--r--src/include/health.h83
-rw-r--r--src/include/inline_memory.h150
-rw-r--r--src/include/int_types.h56
-rw-r--r--src/include/intarith.h193
-rw-r--r--src/include/interval_set.h824
-rw-r--r--src/include/ipaddr.h47
-rw-r--r--src/include/krbd.h97
-rw-r--r--src/include/libcephsqlite.h73
-rw-r--r--src/include/linux_fiemap.h73
-rw-r--r--src/include/lru.h241
-rw-r--r--src/include/mempool.h548
-rw-r--r--src/include/msgr.h247
-rw-r--r--src/include/neorados/RADOS.hpp1152
-rw-r--r--src/include/neorados/RADOS_Decodable.hpp107
l---------src/include/neorados/buffer_fwd.h1
l---------src/include/neorados/completion.h1
-rw-r--r--src/include/object.h224
-rw-r--r--src/include/on_exit.h49
-rw-r--r--src/include/page.h18
-rw-r--r--src/include/rados.h696
l---------src/include/rados/buffer.h1
l---------src/include/rados/buffer_fwd.h1
l---------src/include/rados/crc32c.h1
l---------src/include/rados/inline_memory.h1
-rw-r--r--src/include/rados/librados.h4135
-rw-r--r--src/include/rados/librados.hpp1556
-rw-r--r--src/include/rados/librados_fwd.hpp34
-rw-r--r--src/include/rados/librgw.h36
-rw-r--r--src/include/rados/objclass.h177
l---------src/include/rados/page.h1
-rw-r--r--src/include/rados/rados_types.h41
-rw-r--r--src/include/rados/rados_types.hpp341
-rw-r--r--src/include/rados/rgw_file.h431
-rw-r--r--src/include/radosstriper/libradosstriper.h610
-rw-r--r--src/include/radosstriper/libradosstriper.hpp241
-rw-r--r--src/include/random.h301
-rw-r--r--src/include/rangeset.h250
-rw-r--r--src/include/rbd/features.h121
-rw-r--r--src/include/rbd/librbd.h1491
-rw-r--r--src/include/rbd/librbd.hpp842
-rw-r--r--src/include/rbd/object_map_types.h13
-rw-r--r--src/include/rbd_types.h159
-rw-r--r--src/include/scope_guard.h47
-rw-r--r--src/include/sock_compat.h43
-rw-r--r--src/include/spinlock.h92
-rw-r--r--src/include/stat.h145
-rw-r--r--src/include/statlite.h74
-rw-r--r--src/include/str_list.h98
-rw-r--r--src/include/str_map.h148
-rw-r--r--src/include/stringify.h33
-rw-r--r--src/include/timegm.h79
-rw-r--r--src/include/types.h626
-rw-r--r--src/include/unordered_map.h11
-rw-r--r--src/include/unordered_set.h10
-rw-r--r--src/include/uses_allocator.h266
-rw-r--r--src/include/util.h114
-rw-r--r--src/include/utime.cc31
-rw-r--r--src/include/utime.h602
-rw-r--r--src/include/uuid.cc36
-rw-r--r--src/include/uuid.h97
-rw-r--r--src/include/win32/arpa/inet.h1
-rw-r--r--src/include/win32/fs_compat.h36
-rw-r--r--src/include/win32/ifaddrs.h39
-rw-r--r--src/include/win32/netdb.h1
-rw-r--r--src/include/win32/netinet/in.h1
-rw-r--r--src/include/win32/netinet/ip.h0
-rw-r--r--src/include/win32/netinet/tcp.h0
-rw-r--r--src/include/win32/poll.h1
-rw-r--r--src/include/win32/sys/errno.h1
-rw-r--r--src/include/win32/sys/select.h0
-rw-r--r--src/include/win32/sys/socket.h1
-rw-r--r--src/include/win32/sys/statvfs.h36
-rw-r--r--src/include/win32/sys/uio.h1
-rw-r--r--src/include/win32/sys/un.h1
-rw-r--r--src/include/win32/syslog.h64
-rw-r--r--src/include/win32/win32_errno.h146
-rw-r--r--src/include/win32/winsock_compat.h39
-rw-r--r--src/include/win32/winsock_wrapper.h27
-rw-r--r--src/include/xlist.h224
130 files changed, 41551 insertions, 0 deletions
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt
new file mode 100644
index 000000000..dc3ecbb9f
--- /dev/null
+++ b/src/include/CMakeLists.txt
@@ -0,0 +1,38 @@
+# libcephsqlite.h is installed directly into the top-level include directory.
+install(FILES
+ libcephsqlite.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+# Headers installed under <includedir>/rados. Note that several generic
+# headers (buffer.h, buffer_fwd.h, inline_memory.h, page.h, crc32c.h) are
+# installed there alongside the librados headers.
+install(FILES
+ rados/librados.h
+ rados/rados_types.h
+ rados/rados_types.hpp
+ rados/librados_fwd.hpp
+ rados/librados.hpp
+ buffer.h
+ buffer_fwd.h
+ inline_memory.h
+ page.h
+ crc32c.h
+ rados/objclass.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
+# libradosstriper headers are only installed when WITH_LIBRADOSSTRIPER is set.
+if(WITH_LIBRADOSSTRIPER)
+ install(FILES
+ radosstriper/libradosstriper.h
+ radosstriper/libradosstriper.hpp
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/radosstriper)
+endif()
+
+# librbd headers are only installed when WITH_RBD is set.
+if(WITH_RBD)
+ install(FILES
+ rbd/features.h
+ rbd/librbd.h
+ rbd/librbd.hpp
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rbd)
+endif()
+
+# The RGW headers live in rados/ in the source tree and are installed into
+# <includedir>/rados as well, but only when WITH_RADOSGW is set.
+if(WITH_RADOSGW)
+ install(FILES
+ rados/librgw.h
+ rados/rgw_file.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
+endif()
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
new file mode 100644
index 000000000..35c7a7738
--- /dev/null
+++ b/src/include/CompatSet.h
@@ -0,0 +1,285 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMPATSET_H
+#define CEPH_COMPATSET_H
+
+#include <iostream>
+#include <map>
+#include <string>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+
+// A CompatSet groups feature flags into three classes (compat, ro_compat,
+// incompat) and answers whether one implementation can read or write data
+// described by another CompatSet.
+struct CompatSet {
+
+ // A single feature: numeric id plus a human-readable name.
+ struct Feature {
+ uint64_t id;
+ std::string name;
+
+ Feature(uint64_t _id, const std::string& _name) : id(_id), name(_name) {}
+ };
+
+ // A set of features stored both as a 64-bit mask (bit `id` set for each
+ // feature) and as an id -> name map. In memory bit 0 of the mask is
+ // always set; see encode()/decode() for why.
+ class FeatureSet {
+ uint64_t mask;
+ std::map<uint64_t, std::string> names;
+
+ public:
+ friend struct CompatSet;
+ friend class CephCompatSet_AllSet_Test;
+ friend class CephCompatSet_other_Test;
+ friend class CephCompatSet_merge_Test;
+ friend std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs);
+ friend std::ostream& operator<<(std::ostream& out, const CompatSet& compat);
+ FeatureSet() : mask(1), names() {}
+ // Add (or rename) a feature. The id must be in [1, 63]: it is used as a
+ // bit position in the 64-bit mask and bit 0 is reserved.
+ void insert(const Feature& f) {
+ ceph_assert(f.id > 0);
+ ceph_assert(f.id < 64);
+ mask |= ((uint64_t)1<<f.id);
+ names[f.id] = f.name;
+ }
+
+ // Membership is decided by the names map (keyed by id), not by the mask;
+ // the Feature's name is not compared.
+ bool contains(const Feature& f) const {
+ return names.count(f.id);
+ }
+ bool contains(uint64_t f) const {
+ return names.count(f);
+ }
+ /**
+ * Getter instead of using name[] to be const safe.
+ * Asserts that the feature id is present.
+ */
+ std::string get_name(uint64_t const f) const {
+ std::map<uint64_t, std::string>::const_iterator i = names.find(f);
+ ceph_assert(i != names.end());
+ return i->second;
+ }
+
+ // Remove a feature by id; a no-op when the id is not in the names map.
+ void remove(uint64_t f) {
+ if (names.count(f)) {
+ names.erase(f);
+ mask &= ~((uint64_t)1<<f);
+ }
+ }
+ void remove(const Feature& f) {
+ remove(f.id);
+ }
+
+ // Encoded form: the mask with bit 0 cleared, followed by the names map.
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ /* See below, mask always has the lowest bit set in memory, but
+ * unset in the encoding */
+ encode(mask & (~(uint64_t)1), bl);
+ encode(names, bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ decode(mask, bl);
+ decode(names, bl);
+ /**
+ * Previously, there was a bug where insert did
+ * mask |= f.id rather than mask |= (1 << f.id).
+ * In FeatureSets from those version, mask always
+ * has the lowest bit set. Since then, masks always
+ * have the lowest bit unset.
+ *
+ * When we encounter such a FeatureSet, we have to
+ * reconstruct the mask from the names map.
+ */
+ if (mask & 1) {
+ // Legacy encoding: rebuild mask and names by re-inserting every
+ // decoded entry (insert() asserts each id is in [1, 63]).
+ mask = 1;
+ std::map<uint64_t, std::string> temp_names;
+ temp_names.swap(names);
+ for (auto i = temp_names.begin(); i != temp_names.end(); ++i) {
+ insert(Feature(i->first, i->second));
+ }
+ } else {
+ // Current encoding: restore the in-memory marker bit.
+ mask |= 1;
+ }
+ }
+
+ // Emit one string field per feature, keyed "feature_<id>" with the
+ // feature name as value. snprintf truncates the key to fit s.
+ void dump(ceph::Formatter *f) const {
+ for (auto p = names.cbegin(); p != names.cend(); ++p) {
+ char s[18];
+ snprintf(s, sizeof(s), "feature_%llu", (unsigned long long)p->first);
+ f->dump_string(s, p->second);
+ }
+ }
+ };
+
+ // These features have no impact on the read / write status
+ FeatureSet compat;
+ // If any of these features are missing, read is possible ( as long
+ // as no incompat feature is missing ) but it is not possible to write
+ FeatureSet ro_compat;
+ // If any of these features are missing, read or write is not possible
+ FeatureSet incompat;
+
+ // Copies the three given feature sets.
+ CompatSet(FeatureSet& _compat, FeatureSet& _ro_compat, FeatureSet& _incompat) :
+ compat(_compat), ro_compat(_ro_compat), incompat(_incompat) {}
+
+ CompatSet() : compat(), ro_compat(), incompat() { }
+
+
+ /* does this filesystem implementation have the
+ features required to read the other?
+ True when every bit of other.incompat.mask is also set in ours. */
+ bool readable(CompatSet const& other) const {
+ return !((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ }
+
+ /* does this filesystem implementation have the
+ features required to write the other?
+ Requires readable(other) plus every ro_compat bit of other. */
+ bool writeable(CompatSet const& other) const {
+ return readable(other) &&
+ !((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ }
+
+ /* Compare this CompatSet to another.
+ * CAREFULLY NOTE: This operation is NOT commutative.
+ * a > b DOES NOT imply that b < a.
+ * If returns:
+ * 0: The CompatSets have the same feature set.
+ * 1: This CompatSet's features are a strict superset of the other's.
+ * -1: This CompatSet is missing at least one feature
+ * described in the other. It may still have more features, though.
+ * Only the masks are compared; feature names are not consulted.
+ */
+ int compare(const CompatSet& other) const {
+ if ((other.compat.mask == compat.mask) &&
+ (other.ro_compat.mask == ro_compat.mask) &&
+ (other.incompat.mask == incompat.mask)) return 0;
+ //okay, they're not the same
+
+ //if we're writeable we have a superset of theirs on incompat and ro_compat
+ if (writeable(other) && !((other.compat.mask ^ compat.mask)
+ & other.compat.mask)) return 1;
+ //if we make it here, we weren't writeable or had a difference compat set
+ return -1;
+ }
+
+ /* Get the features supported by other CompatSet but not this one,
+ * as a CompatSet.
+ * Names are looked up with map::at(), which throws std::out_of_range if
+ * a mask bit has no matching entry in other's names map.
+ */
+ CompatSet unsupported(const CompatSet& other) const {
+ CompatSet diff;
+ // Bits set in other's mask but not in ours, per feature class.
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ // Bit 0 is the in-memory marker, so feature ids start at 1.
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ diff.compat.insert( Feature(id, other.compat.names.at(id)));
+ }
+ if (mask & other_ro_compat) {
+ diff.ro_compat.insert(Feature(id, other.ro_compat.names.at(id)));
+ }
+ if (mask & other_incompat) {
+ diff.incompat.insert( Feature(id, other.incompat.names.at(id)));
+ }
+ }
+ return diff;
+ }
+
+ /* Merge features supported by other CompatSet into this one.
+ * Return: true if some features were merged
+ * Names are looked up with get_name(), which asserts the id is present.
+ */
+ bool merge(CompatSet const & other) {
+ // Bits set in other's mask but not in ours, per feature class.
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ if (!other_compat && !other_ro_compat && !other_incompat)
+ return false;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ compat.insert( Feature(id, other.compat.get_name(id)));
+ }
+ if (mask & other_ro_compat) {
+ ro_compat.insert(Feature(id, other.ro_compat.get_name(id)));
+ }
+ if (mask & other_incompat) {
+ incompat.insert( Feature(id, other.incompat.get_name(id)));
+ }
+ }
+ return true;
+ }
+
+ // Compact form: the three masks in hex; the stream is switched back to
+ // decimal before returning.
+ std::ostream& printlite(std::ostream& o) const {
+ o << "{c=[" << std::hex << compat.mask << "]";
+ o << ",r=[" << std::hex << ro_compat.mask << "]";
+ o << ",i=[" << std::hex << incompat.mask << "]}";
+ o << std::dec;
+ return o;
+ }
+
+ // The three feature sets are encoded back to back, in the order
+ // compat, ro_compat, incompat; decode() reads them in the same order.
+ void encode(ceph::buffer::list& bl) const {
+ compat.encode(bl);
+ ro_compat.encode(bl);
+ incompat.encode(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ compat.decode(bl);
+ ro_compat.decode(bl);
+ incompat.decode(bl);
+ }
+
+ // One object section per feature class, each filled by FeatureSet::dump().
+ void dump(ceph::Formatter *f) const {
+ f->open_object_section("compat");
+ compat.dump(f);
+ f->close_section();
+ f->open_object_section("ro_compat");
+ ro_compat.dump(f);
+ f->close_section();
+ f->open_object_section("incompat");
+ incompat.dump(f);
+ f->close_section();
+ }
+
+ // Appends two heap-allocated instances to o: an empty one and one with a
+ // few features in each class.
+ static void generate_test_instances(std::list<CompatSet*>& o) {
+ o.push_back(new CompatSet);
+ o.push_back(new CompatSet);
+ o.back()->compat.insert(Feature(1, "one"));
+ o.back()->compat.insert(Feature(2, "two"));
+ o.back()->ro_compat.insert(Feature(4, "four"));
+ o.back()->incompat.insert(Feature(3, "three"));
+ }
+};
+WRITE_CLASS_ENCODER(CompatSet)
+
+// Print a feature as F(<id>, "<name>").
+inline std::ostream& operator<<(std::ostream& out, const CompatSet::Feature& f)
+{
+  out << "F(" << f.id;
+  out << ", \"" << f.name << "\")";
+  return out;
+}
+
+// Print a feature set by streaming its id -> name map.
+inline std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs)
+{
+  out << fs.names;
+  return out;
+}
+
+// Print all three feature classes as compat=...,rocompat=...,incompat=...
+inline std::ostream& operator<<(std::ostream& out, const CompatSet& compat)
+{
+  out << "compat=" << compat.compat;
+  out << ",rocompat=" << compat.ro_compat;
+  out << ",incompat=" << compat.incompat;
+  return out;
+}
+
+#endif
diff --git a/src/include/Context.h b/src/include/Context.h
new file mode 100644
index 000000000..bef85ca5b
--- /dev/null
+++ b/src/include/Context.h
@@ -0,0 +1,535 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CONTEXT_H
+#define CEPH_CONTEXT_H
+
+#include "common/dout.h"
+
+#include <functional>
+#include <list>
+#include <memory>
+#include <set>
+
+#include <boost/function.hpp>
+#include <boost/system/error_code.hpp>
+
+#include "common/error_code.h"
+
+#include "include/ceph_assert.h"
+#include "common/ceph_mutex.h"
+
+#define mydout(cct, v) lgeneric_subdout(cct, context, v)
+
+/*
+ * GenContext - abstract callback class
+ */
+template <typename T>
+class GenContext {
+  // Not copyable: complete() ends with `delete this`, so a GenContext must
+  // have exactly one owner. Deleted (rather than private and undefined) so
+  // that misuse is diagnosed at compile time instead of at link time.
+  GenContext(const GenContext& other) = delete;
+  const GenContext& operator=(const GenContext& other) = delete;
+
+ protected:
+  // Callback body supplied by subclasses; invoked exactly once by complete().
+  virtual void finish(T t) = 0;
+
+ public:
+  GenContext() {}
+  virtual ~GenContext() {}       // we want a virtual destructor!!!
+
+  // Run finish() with the given value, then destroy this object. The
+  // object must not be touched after complete() returns.
+  template <typename C>
+  void complete(C &&t) {
+    finish(std::forward<C>(t));
+    delete this;
+  }
+
+  // Function-call form of complete(). Declared noexcept, so an exception
+  // escaping finish() terminates the program.
+  template <typename C>
+  void operator()(C &&t) noexcept {
+    complete(std::forward<C>(t));
+  }
+
+  // Zero-argument call: completes with a value-initialized T. Only
+  // available when T is default constructible.
+  template<typename U = T>
+  auto operator()() noexcept
+    -> typename std::enable_if<std::is_default_constructible<U>::value,
+			       void>::type {
+    complete(T{});
+  }
+
+
+  // Reference wrapper around this object, usable where a copyable callable
+  // is required. It does not extend the object's lifetime.
+  std::reference_wrapper<GenContext> func() {
+    return std::ref(*this);
+  }
+};
+
+// Owning handle for a GenContext<T>.
+template <typename T>
+using GenContextURef = std::unique_ptr<GenContext<T> >;
+
+/*
+ * Context - abstract callback class
+ */
+class Finisher;
+class Context {
+  // Not copyable: complete() ends with `delete this`, so a Context must
+  // have exactly one owner. Deleted (rather than private and undefined) so
+  // that misuse is diagnosed at compile time instead of at link time.
+  Context(const Context& other) = delete;
+  const Context& operator=(const Context& other) = delete;
+
+ protected:
+  // Callback body supplied by subclasses; invoked by complete().
+  virtual void finish(int r) = 0;
+
+  // variant of finish that is safe to call "synchronously."  override should
+  // return true.
+  virtual bool sync_finish(int r) {
+    return false;
+  }
+
+ public:
+  Context() {}
+  virtual ~Context() {}       // we want a virtual destructor!!!
+
+  // Run finish(r), then destroy this object. The object must not be
+  // touched after complete() returns.
+  virtual void complete(int r) {
+    finish(r);
+    delete this;
+  }
+
+  // Try the synchronous path: when sync_finish(r) returns true the object
+  // is destroyed and true is returned; otherwise the object is left alive
+  // and false is returned.
+  virtual bool sync_complete(int r) {
+    if (sync_finish(r)) {
+      delete this;
+      return true;
+    }
+    return false;
+  }
+
+  // Complete with an error_code, converted to an int via
+  // ceph::from_error_code().
+  void complete(boost::system::error_code ec) {
+    complete(ceph::from_error_code(ec));
+  }
+
+  // Function-call forms of complete(). Declared noexcept, so an exception
+  // escaping finish() terminates the program.
+  void operator()(boost::system::error_code ec) noexcept {
+    complete(ec);
+  }
+
+  void operator()() noexcept {
+    complete({});
+  }
+
+  // Reference wrapper around this object, usable where a copyable callable
+  // is required. It does not extend the object's lifetime.
+  std::reference_wrapper<Context> func() {
+    return std::ref(*this);
+  }
+};
+
+/**
+ * Simple context holding a single object
+ */
+template<class T>
+class ContainerContext : public Context {
+  // The held object; it is released when the context is destroyed, i.e.
+  // after complete() has run.
+  T obj;
+public:
+  // Copy (or, when T is a reference type, bind to) an existing object.
+  ContainerContext(T &obj) : obj(obj) {}
+
+  // Take ownership of a temporary. Without this overload
+  // make_container_context(std::move(x)) cannot compile, because an rvalue
+  // does not bind to the `T &` constructor above. Disabled for lvalues and
+  // whenever T is not constructible from U&& (e.g. when T is a reference).
+  template <class U,
+	    class = typename std::enable_if<
+	      !std::is_lvalue_reference<U>::value &&
+	      std::is_constructible<T, U&&>::value>::type>
+  ContainerContext(U &&src) : obj(std::forward<U>(src)) {}
+
+  // Nothing to do: the context exists only to keep obj alive.
+  void finish(int r) override {}
+};
+// Heap-allocate a ContainerContext for the given object; the caller owns the
+// returned context until it is completed.
+template <typename T>
+ContainerContext<T> *make_container_context(T &&t) {
+  auto *holder = new ContainerContext<T>(std::forward<T>(t));
+  return holder;
+}
+
+// Context that carries a value of type T alongside another context and
+// forwards its result to that context (if any) when finished.
+template <class T>
+struct Wrapper : public Context {
+  Context *to_run;
+  T val;
+  Wrapper(Context *next, T held) : to_run(next), val(held) {}
+  void finish(int r) override {
+    if (!to_run) {
+      return;
+    }
+    to_run->complete(r);
+  }
+};
+// Completes the wrapped context (with result 0) when this object is destroyed.
+struct RunOnDelete {
+  Context *to_run;
+  RunOnDelete(Context *ctx) : to_run(ctx) {}
+  ~RunOnDelete() {
+    if (to_run != nullptr) {
+      to_run->complete(0);
+    }
+  }
+};
+using RunOnDeleteRef = std::shared_ptr<RunOnDelete>;
+
+// Context whose finish() invokes a stored callable. The callable receives
+// the result code when it accepts an int, and is called with no arguments
+// otherwise.
+template <typename T>
+class LambdaContext : public Context {
+  T t;
+public:
+  LambdaContext(T &&t) : t(std::forward<T>(t)) {}
+  void finish(int r) override {
+    if constexpr (!std::is_invocable_v<T, int>) {
+      t();
+    } else {
+      t(r);
+    }
+  }
+};
+
+// Heap-allocate a LambdaContext that takes over the given callable.
+template <typename T>
+LambdaContext<T> *make_lambda_context(T &&t) {
+  auto *ctx = new LambdaContext<T>(std::move(t));
+  return ctx;
+}
+
+// GenContext<T> whose finish() forwards its argument to a stored callable.
+template <typename F, typename T>
+struct LambdaGenContext : GenContext<T> {
+  F f;
+
+  LambdaGenContext(F &&fn) : f(std::forward<F>(fn)) {}
+
+  void finish(T value) override {
+    f(std::forward<T>(value));
+  }
+};
+// Wrap a callable in a heap-allocated LambdaGenContext and return it as an
+// owning GenContextURef<T>.
+template <typename T, typename F>
+GenContextURef<T> make_gen_lambda_context(F &&f) {
+  auto *ctx = new LambdaGenContext<F, T>(std::move(f));
+  return GenContextURef<T>(ctx);
+}
+
+/*
+ * finish and destroy a list of Contexts
+ */
+template<class C>
+inline void finish_contexts(CephContext *cct, C& finished, int result = 0)
+{
+  if (!finished.empty()) {
+    // Move the contexts into a local container first: completing one may
+    // append to `finished` again, and those must not be run in this pass.
+    C pending;
+    pending.swap(finished);
+
+    if (cct) {
+      mydout(cct,10) << pending.size() << " contexts to finish with " << result << dendl;
+    }
+    for (Context* ctx : pending) {
+      if (cct) {
+        mydout(cct,10) << "---- " << ctx << dendl;
+      }
+      // complete() destroys the context.
+      ctx->complete(result);
+    }
+  }
+}
+
+// Context that does nothing when finished.
+class C_NoopContext : public Context {
+public:
+  void finish(int /*r*/) override {
+  }
+};
+
+
+// Completes the wrapped context while holding the given mutex. If the
+// wrapped context was never completed it is deleted by the destructor.
+struct C_Lock : public Context {
+  ceph::mutex *lock;
+  Context *fin;
+  C_Lock(ceph::mutex *l, Context *c) : lock(l), fin(c) {}
+  ~C_Lock() override {
+    delete fin;
+  }
+  void finish(int r) override {
+    if (!fin) {
+      return;
+    }
+    std::lock_guard guard{*lock};
+    fin->complete(r);
+    // complete() destroyed it; make sure the destructor does not delete again.
+    fin = nullptr;
+  }
+};
+
+/*
+ * C_Contexts - set of Contexts
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructible.
+ */
+template <class ContextType, class ContextInstanceType, class Container = std::list<ContextType *>>
+class C_ContextsBase : public ContextInstanceType {
+public:
+  CephContext *cct;
+  Container contexts;
+
+  C_ContextsBase(CephContext *cct_) : cct(cct_) {}
+
+  // Contexts that were never finished are deleted, not completed.
+  ~C_ContextsBase() override {
+    for (auto ctx : contexts) {
+      delete ctx;
+    }
+  }
+
+  // Append a single context.
+  void add(ContextType* c) {
+    contexts.push_back(c);
+  }
+
+  // Move every context out of ls (leaving it empty) and append them here.
+  void take(Container& ls) {
+    Container moved;
+    moved.swap(ls);
+    if constexpr (!std::is_same_v<Container, std::list<ContextType *>>) {
+      contexts.insert(contexts.end(), moved.begin(), moved.end());
+    } else {
+      contexts.splice(contexts.end(), moved);
+    }
+  }
+
+  void complete(int r) override {
+    // Neuter any ContextInstanceType custom complete(), because although
+    // I want to look like it, I don't actually want to run its code.
+    Context::complete(r);
+  }
+
+  // Complete every held context with the same result.
+  void finish(int r) override {
+    finish_contexts(cct, contexts, r);
+  }
+
+  bool empty() { return contexts.empty(); }
+
+  // Collapse a container of contexts into a single context: null for an
+  // empty container, the sole element for a container of one, and otherwise
+  // a new C_ContextsBase holding all of them. cs is emptied in the latter
+  // two cases.
+  template<class C>
+  static ContextType *list_to_context(C& cs) {
+    const auto count = cs.size();
+    if (count == 0) {
+      return 0;
+    }
+    if (count == 1) {
+      ContextType *only = cs.front();
+      cs.clear();
+      return only;
+    }
+    auto *combined = new C_ContextsBase<ContextType, ContextInstanceType>(0);
+    combined->take(cs);
+    return combined;
+  }
+};
+
+typedef C_ContextsBase<Context, Context> C_Contexts;
+
+/*
+ * C_Gather
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructible.
+ *
+ * NOTE: only the first sub error (r < 0) is recorded and passed to the
+ * finisher; errors from later subs are ignored.
+ */
+// Collects any number of sub-contexts (created with new_sub()) and completes
+// a single finisher context once activate() has been called and every sub has
+// finished. The object deletes itself after running the finisher.
+template <class ContextType, class ContextInstanceType>
+class C_GatherBase {
+private:
+ CephContext *cct;
+ // First negative result reported by a sub; 0 if none failed.
+ int result = 0;
+ ContextType *onfinish;
+#ifdef DEBUG_GATHER
+ std::set<ContextType*> waitfor;
+#endif
+ // Total subs ever created vs. subs that have not finished yet.
+ int sub_created_count = 0;
+ int sub_existing_count = 0;
+ mutable ceph::recursive_mutex lock =
+ ceph::make_recursive_mutex("C_GatherBase::lock"); // disable lockdep
+ bool activated = false;
+
+ // Called by a sub when it finishes (or is destroyed without finishing).
+ // Records the first error and, if this was the last outstanding sub of an
+ // activated gather, runs the finisher and deletes this object.
+ void sub_finish(ContextType* sub, int r) {
+ lock.lock();
+#ifdef DEBUG_GATHER
+ ceph_assert(waitfor.count(sub));
+ waitfor.erase(sub);
+#endif
+ --sub_existing_count;
+ mydout(cct,10) << "C_GatherBase " << this << ".sub_finish(r=" << r << ") " << sub
+#ifdef DEBUG_GATHER
+ << " (remaining " << waitfor << ")"
+#endif
+ << dendl;
+ if (r < 0 && result == 0)
+ result = r;
+ if ((activated == false) || (sub_existing_count != 0)) {
+ lock.unlock();
+ return;
+ }
+ // The lock is a member of this object, so it must be released before
+ // delete_me() destroys the object.
+ lock.unlock();
+ delete_me();
+ }
+
+ // Complete the finisher (if one is set) with the gathered result, then
+ // destroy this object.
+ void delete_me() {
+ if (onfinish) {
+ onfinish->complete(result);
+ onfinish = 0;
+ }
+ delete this;
+ }
+
+ // The context handed out by new_sub(); reports back to the owning gather.
+ class C_GatherSub : public ContextInstanceType {
+ C_GatherBase *gather;
+ public:
+ C_GatherSub(C_GatherBase *g) : gather(g) {}
+ void complete(int r) override {
+ // Cancel any customized complete() functionality
+ // from the Context subclass we're templated for,
+ // we only want to hit that in onfinish, not at each
+ // sub finish. e.g. MDSInternalContext.
+ Context::complete(r);
+ }
+ void finish(int r) override {
+ gather->sub_finish(this, r);
+ // Cleared so the destructor does not report this sub a second time.
+ gather = 0;
+ }
+ // A sub destroyed without being finished still counts as done, with
+ // result 0.
+ ~C_GatherSub() override {
+ if (gather)
+ gather->sub_finish(this, 0);
+ }
+ };
+
+public:
+ C_GatherBase(CephContext *cct_, ContextType *onfinish_)
+ : cct(cct_), onfinish(onfinish_)
+ {
+ mydout(cct,10) << "C_GatherBase " << this << ".new" << dendl;
+ }
+ ~C_GatherBase() {
+ mydout(cct,10) << "C_GatherBase " << this << ".delete" << dendl;
+ }
+ // Set the finisher; asserts that none has been set yet.
+ void set_finisher(ContextType *onfinish_) {
+ std::lock_guard l{lock};
+ ceph_assert(!onfinish);
+ onfinish = onfinish_;
+ }
+ // Mark the gather as armed (may be called only once). If no subs are
+ // outstanding, the finisher runs and this object is deleted before
+ // activate() returns.
+ void activate() {
+ lock.lock();
+ ceph_assert(activated == false);
+ activated = true;
+ if (sub_existing_count != 0) {
+ lock.unlock();
+ return;
+ }
+ lock.unlock();
+ delete_me();
+ }
+ // Create a new sub-context; only allowed before activate().
+ ContextType *new_sub() {
+ std::lock_guard l{lock};
+ ceph_assert(activated == false);
+ sub_created_count++;
+ sub_existing_count++;
+ ContextType *s = new C_GatherSub(this);
+#ifdef DEBUG_GATHER
+ waitfor.insert(s);
+#endif
+ mydout(cct,10) << "C_GatherBase " << this << ".new_sub is " << sub_created_count << " " << s << dendl;
+ return s;
+ }
+
+ // Number of subs that have not finished yet.
+ inline int get_sub_existing_count() const {
+ std::lock_guard l{lock};
+ return sub_existing_count;
+ }
+
+ // Number of subs created so far.
+ inline int get_sub_created_count() const {
+ std::lock_guard l{lock};
+ return sub_created_count;
+ }
+};
+
+/*
+ * The C_GatherBuilder remembers each C_Context created by
+ * C_GatherBuilder.new_sub() in a C_Gather. When a C_Context created
+ * by new_sub() is complete(), C_Gather forgets about it. When
+ * C_GatherBuilder notices that there are no C_Context left in
+ * C_Gather, it calls complete() on the C_Context provided as the
+ * second argument of the constructor (finisher).
+ *
+ * How to use C_GatherBuilder:
+ *
+ * 1. Create a C_GatherBuilder on the stack
+ * 2. Call gather_bld.new_sub() as many times as you want to create new subs
+ * It is safe to call this 0 times, or 100, or anything in between.
+ * 3. If you didn't supply a finisher in the C_GatherBuilder constructor,
+ * set one with gather_bld.set_finisher(my_finisher)
+ * 4. Call gather_bld.activate()
+ *
+ * Example:
+ *
+ * C_SaferCond all_done;
+ * C_GatherBuilder gb(g_ceph_context, &all_done);
+ * j.submit_entry(1, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * j.submit_entry(2, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * gb.activate(); // consume C_Context as soon as they complete()
+ * all_done.wait(); // all_done is complete() after all new_sub() are complete()
+ *
+ * The finisher may be called at any point after step 4, including immediately
+ * from the activate() function.
+ * The finisher will never be called before activate().
+ *
+ * Note: Currently, subs must be manually freed by the caller (for some reason.)
+ */
+template <class ContextType, class GatherType>
+class C_GatherBuilderBase
+{
+public:
+  // Builder without a finisher; one must be supplied through
+  // set_finisher() before activate() if any sub is created.
+  C_GatherBuilderBase(CephContext *cct_)
+    : C_GatherBuilderBase(cct_, nullptr)
+  {
+  }
+  C_GatherBuilderBase(CephContext *cct_, ContextType *finisher_)
+    : cct(cct_), c_gather(nullptr), finisher(finisher_), activated(false)
+  {
+  }
+  ~C_GatherBuilderBase() {
+    if (!c_gather) {
+      // No gather was ever created, so nothing will consume the finisher.
+      delete finisher;
+      return;
+    }
+    ceph_assert(activated); // Don't forget to activate your C_Gather!
+  }
+
+  // Create a sub-context, lazily creating the gather on first use.
+  ContextType *new_sub() {
+    if (c_gather == nullptr) {
+      c_gather = new GatherType(cct, finisher);
+    }
+    return c_gather->new_sub();
+  }
+
+  // Arm the gather; a no-op when no sub was ever created.
+  void activate() {
+    if (c_gather) {
+      ceph_assert(finisher != NULL);
+      activated = true;
+      c_gather->activate();
+    }
+  }
+
+  // Record the finisher and pass it on to the gather if it already exists.
+  void set_finisher(ContextType *finisher_) {
+    finisher = finisher_;
+    if (c_gather) {
+      c_gather->set_finisher(finisher);
+    }
+  }
+
+  GatherType *get() const {
+    return c_gather;
+  }
+  bool has_subs() const {
+    return c_gather != nullptr;
+  }
+
+  // The two counters below may only be queried before activate().
+  int num_subs_created() {
+    ceph_assert(!activated);
+    return c_gather ? c_gather->get_sub_created_count() : 0;
+  }
+  int num_subs_remaining() {
+    ceph_assert(!activated);
+    return c_gather ? c_gather->get_sub_existing_count() : 0;
+  }
+
+private:
+  CephContext *cct;
+  GatherType *c_gather;
+  ContextType *finisher;
+  bool activated;
+};
+
+typedef C_GatherBase<Context, Context> C_Gather;
+typedef C_GatherBuilderBase<Context, C_Gather > C_GatherBuilder;
+
+// Abstract factory producing ContextType objects.
+template <class ContextType>
+class ContextFactory {
+public:
+  virtual ~ContextFactory() = default;
+
+  // Return a context pointer; implemented by subclasses.
+  virtual ContextType *build() = 0;
+};
+
+// Adapt a Context into a callable taking an error_code. The callable owns
+// the context: invoking it completes the context, and destroying it
+// uninvoked deletes the context.
+inline auto lambdafy(Context *c) {
+  std::unique_ptr<Context> owned(c);
+  return [fin = std::move(owned)](boost::system::error_code ec) mutable {
+    // Give up ownership first; complete() deletes the context itself.
+    Context *ctx = fin.release();
+    ctx->complete(ceph::from_error_code(ec));
+  };
+}
+
+
+#undef mydout
+
+#endif
diff --git a/src/include/Distribution.h b/src/include/Distribution.h
new file mode 100644
index 000000000..e4f0b30b1
--- /dev/null
+++ b/src/include/Distribution.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_DISTRIBUTION_H
+#define CEPH_DISTRIBUTION_H
+
+#include <vector>
+
+// A discrete probability distribution over int values.
+class Distribution {
+  // p[i] is the probability of drawing v[i]; add() and clear() keep the two
+  // vectors the same length. Qualified with std:: because this header
+  // must not rely on a using-directive from whoever includes it.
+  std::vector<float> p;
+  std::vector<int> v;
+
+ public:
+  //Distribution() {
+  //}
+
+  // Number of (value, probability) entries.
+  unsigned get_width() const {
+    return p.size();
+  }
+
+  // Remove all entries.
+  void clear() {
+    p.clear();
+    v.clear();
+  }
+
+  // Append value `val` with probability `pr`. Nothing is renormalized
+  // here; call normalize() if the probabilities do not sum to 1.
+  void add(int val, float pr) {
+    p.push_back(pr);
+    v.push_back(val);
+  }
+
+  // Replace every probability with a random one and normalize them so they
+  // sum to 1. The values are left untouched.
+  void random() {
+    float sum = 0.0;
+    for (auto& pr : p) {
+      pr = (float)(rand() % 10000);
+      sum += pr;
+    }
+    // Every draw may be 0; dividing by a zero sum would fill p with NaN.
+    if (sum == 0)
+      return;
+    for (auto& pr : p)
+      pr /= sum;
+  }
+
+  // Draw a value according to the stored probabilities. Aborts if the walk
+  // over p runs off the end, which happens when the probabilities sum to
+  // less than the random threshold (e.g. an empty or unnormalized
+  // distribution).
+  int sample() {
+    float s = (float)(rand() % 10000) / 10000.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      if (s < p[i]) return v[i];
+      s -= p[i];
+    }
+    ceph_abort();
+    return v[p.size() - 1];  // hmm.  :/
+  }
+
+  // Scale the probabilities so they sum to 1 and return the sum they had
+  // before scaling. When that sum is 0 the probabilities are left as they
+  // are, since dividing would turn them into NaN.
+  float normalize() {
+    float s = 0.0;
+    for (const auto& pr : p)
+      s += pr;
+    if (s != 0) {
+      for (auto& pr : p)
+	pr /= s;
+    }
+    return s;
+  }
+
+};
+
+#endif
diff --git a/src/include/addr_parsing.h b/src/include/addr_parsing.h
new file mode 100644
index 000000000..c205ac75f
--- /dev/null
+++ b/src/include/addr_parsing.h
@@ -0,0 +1,28 @@
+/*
+ * addr_parsing.h
+ *
+ * Created on: Sep 14, 2010
+ * Author: gregf
+ * contains functions used by Ceph to convert named addresses
+ * (e.g. ceph.com) into IP addresses (e.g. 127.0.0.1).
+ */
+
+#ifndef ADDR_PARSING_H_
+#define ADDR_PARSING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * NOTE(review): the implementation is not visible from this header.
+ * Presumably this writes str2 into the buffer *pstr (of size *plen) at
+ * offset pos, growing the buffer if needed -- confirm against the
+ * definition before relying on this.
+ */
+int safe_cat(char **pstr, int *plen, int pos, const char *str2);
+
+/*
+ * returns a string allocated by malloc; caller must free
+ */
+char *resolve_addrs(const char *orig_str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ADDR_PARSING_H_ */
diff --git a/src/include/alloc_ptr.h b/src/include/alloc_ptr.h
new file mode 100644
index 000000000..258c58338
--- /dev/null
+++ b/src/include/alloc_ptr.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ALLOC_PTR_H
+#define CEPH_ALLOC_PTR_H
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+// Owning pointer that allocates its target on demand: get(), operator* and
+// operator-> default-construct the element the first time they are called
+// on an empty alloc_ptr, so they never yield a null pointer. Only
+// operator bool observes whether an element has been allocated yet.
+template <class T>
+class alloc_ptr
+{
+public:
+  typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer;
+  typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type;
+
+  alloc_ptr() : ptr() {}
+
+  // Construct from anything a unique_ptr<element_type> can be built from
+  // (a raw pointer, a unique_ptr rvalue, ...). Disabled for alloc_ptr
+  // itself so that it cannot shadow the copy/move constructors below.
+  template<class U,
+	   typename = typename std::enable_if<
+	     !std::is_same<typename std::decay<U>::type, alloc_ptr>::value>::type>
+  alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {}
+
+  // Movable but not copyable, like the unique_ptr it wraps. These take
+  // alloc_ptr itself; `alloc_ptr<pointer>` would name an unrelated
+  // specialization whose private members are not accessible here.
+  alloc_ptr(alloc_ptr&& rhs) noexcept : ptr(std::move(rhs.ptr)) {}
+  alloc_ptr(const alloc_ptr& rhs) = delete;
+  alloc_ptr& operator=(alloc_ptr&& rhs) noexcept {
+    ptr = std::move(rhs.ptr);
+    return *this;
+  }
+  alloc_ptr& operator=(const alloc_ptr& rhs) = delete;
+
+  void swap (alloc_ptr& rhs) noexcept {
+    ptr.swap(rhs.ptr);
+  }
+  // Give up ownership; returns null if nothing has been allocated.
+  element_type* release() {
+    return ptr.release();
+  }
+  void reset(element_type *p = nullptr) {
+    ptr.reset(p);
+  }
+  // The three accessors below allocate a default-constructed element when
+  // the pointer is empty; that is why ptr is mutable.
+  element_type* get() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return ptr.get();
+  }
+  element_type& operator*() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return *ptr;
+  }
+  element_type* operator->() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return ptr.get();
+  }
+  // True once an element has been allocated; never allocates.
+  operator bool() const {
+    return !!ptr;
+  }
+
+  // Comparisons are made on the pointed-to elements, not on addresses, and
+  // therefore allocate a default element on either side that is empty.
+  friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::less<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::less_equal<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::greater<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return std::greater_equal<element_type>()(*lhs, *rhs);
+  }
+  friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs == *rhs;
+  }
+  friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs != *rhs;
+  }
+private:
+  mutable std::unique_ptr<element_type> ptr;
+};
+
+#endif
diff --git a/src/include/any.h b/src/include/any.h
new file mode 100644
index 000000000..da59c88f4
--- /dev/null
+++ b/src/include/any.h
@@ -0,0 +1,704 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef INCLUDE_STATIC_ANY
+#define INCLUDE_STATIC_ANY
+
+#include <any>
+#include <cstddef>
+#include <initializer_list>
+#include <memory>
+#include <typeinfo>
+#include <type_traits>
+
+#include <boost/smart_ptr/shared_ptr.hpp>
+#include <boost/smart_ptr/make_shared.hpp>
+
+namespace ceph {
+
+namespace _any {
+
+// Shared Functionality
+// --------------------
+//
+// Common implementation details. Most functionality is here. We
+// assume that destructors do not throw. Some of them might and
+// they'll invoke terminate and that's fine.
+//
+// We are using the Curiously Recurring Template Pattern! We require
+// that all classes inheriting from us provide:
+//
+// - `static constexpr size_t capacity`: Maximum capacity. No object
+// larger than this may be
+// stored. `dynamic` for dynamic.
+// - `void* ptr() const noexcept`: returns a pointer to storage.
+// (`alloc_storage` must have been called.
+// `free_storage` must not have been called
+// since.)
+// - `void* alloc_storage(const std::size_t)`: allocate storage
+// - `void free_storage() noexcept`: free storage. Must be idempotent.
+//
+// We provide most of the public interface, as well as the operator function,
+// cast_helper, and the type() call.
+
+// Sentinel for `capacity`: a derived class whose storage has no fixed
+// upper bound sets its `capacity` to this value (all bits set).
+//
+inline constexpr std::size_t dynamic = ~std::size_t{0};
+
+// Driver Function
+// ---------------
+//
+// The usual type-erasure control function trick. This one is simpler
+// than usual since we punt on moving and copying. We could dispense
+// with this and just store a deleter and a pointer to a typeinfo, but
+// that would be twice the space.
+//
+// Moved out here so the type of `func_t` isn't dependent on the
+// enclosing class.
+//
+// Requests a driver function understands.
+enum class op { type, destroy };
+
+// Driver for a stored object of type T.
+//
+// - op::type:    `p` points at a `const std::type_info*`, which is set
+//                to the type_info of T.
+// - op::destroy: `p` points at a live T, whose destructor is run.
+template<typename T>
+inline void op_func(const op o, void* p) noexcept {
+  static const std::type_info& type = typeid(T);
+  if (o == op::type) {
+    auto out = static_cast<const std::type_info**>(p);
+    *out = &type;
+  } else if (o == op::destroy) {
+    static_cast<T*>(p)->~T();
+  }
+}
+
+// Type-independent signature shared by every op_func<T> instantiation.
+using func_t = void (*)(const op, void* p) noexcept;
+
+// The base class
+// --------------
+//
+// The `storage_t` parameter gives the type of the value that manages
+// storage and allocation. We use it to create a protected data member
+// (named `storage`). This sidesteps an initialization-order problem:
+// the exposed constructors would otherwise try to allocate or free
+// storage *before* the data members of the derived class were
+// initialized.
+//
+// Making storage_t a member type of the derived class won't work, due
+// to C++'s rules for nested types being *horrible*. Just downright
+// *horrible*.
+//
+template<typename D, typename storage_t>
+class base {
+  // Make definitions from the derived class visible
+  // ------------------------------------------------
+  //
+  // And check that they fit the requirements. At least those that are
+  // statically checkable.
+  //
+  static constexpr std::size_t capacity = D::capacity;
+
+  void* ptr() const noexcept {
+    static_assert(
+      noexcept(static_cast<const D*>(this)->ptr()) &&
+      std::is_same_v<decltype(static_cast<const D*>(this)->ptr()), void*>,
+      "‘void* ptr() const noexcept’ missing from superclass");
+    return static_cast<const D*>(this)->ptr();
+  }
+
+  void* alloc_storage(const std::size_t z) {
+    static_assert(
+      std::is_same_v<decltype(static_cast<D*>(this)->alloc_storage(z)), void*>,
+      "‘void* alloc_storage(const size_t)’ missing from superclass.");
+    return static_cast<D*>(this)->alloc_storage(z);
+  }
+
+  void free_storage() noexcept {
+    static_assert(
+      noexcept(static_cast<D*>(this)->free_storage()) &&
+      std::is_void_v<decltype(static_cast<D*>(this)->free_storage())>,
+      "‘void free_storage() noexcept’ missing from superclass.");
+    static_cast<D*>(this)->free_storage();
+  }
+
+
+  // Pile O' Templates
+  // -----------------
+  //
+  // These are just verbose and better typed once than twice. They're
+  // used for SFINAE and declaring noexcept.
+  //
+  template<class T>
+  struct is_in_place_type_helper : std::false_type {};
+  template<class T>
+  struct is_in_place_type_helper<std::in_place_type_t<T>> : std::true_type {};
+
+  template<class T>
+  static constexpr bool is_in_place_type_v =
+    is_in_place_type_helper<std::decay_t<T>>::value;
+
+  // SFINAE condition for value initialized
+  // constructors/assigners. This is analogous to the standard's
+  // requirement that this overload only participate in overload
+  // resolution if std::decay_t<T> is not the same type as the
+  // any-type, nor a specialization of std::in_place_type_t
+  //
+  template<typename T>
+  using value_condition_t = std::enable_if_t<
+    !std::is_same_v<std::decay_t<T>, D> &&
+    !is_in_place_type_v<std::decay_t<T>>>;
+
+  // This `noexcept` condition for value construction lets
+  // `immobile_any`'s value constructor/assigner be noexcept, so long
+  // as the type's copy or move constructor cooperates.
+  //
+  template<typename T>
+  static constexpr bool value_noexcept_v =
+    std::is_nothrow_constructible_v<std::decay_t<T>, T> && capacity != dynamic;
+
+  // SFINAE condition for in-place constructors/assigners
+  //
+  template<typename T, typename... Args>
+  using in_place_condition_t = std::enable_if_t<std::is_constructible_v<
+    std::decay_t<T>, Args...>>;
+
+  // Analogous to the above. Give noexcept to immobile_any::emplace
+  // when possible.
+  //
+  template<typename T, typename... Args>
+  static constexpr bool in_place_noexcept_v =
+    std::is_nothrow_constructible_v<std::decay_t<T>, Args...> &&
+    capacity != dynamic;
+
+private:
+
+  // Functionality!
+  // --------------
+
+  // The driver function for the currently stored object. Whether this
+  // is null is the canonical way to know whether an instance has a
+  // value.
+  //
+  func_t func = nullptr;
+
+  // Construct an object within ourselves. Must only be called while
+  // empty (every caller is a constructor or has just called reset()).
+  //
+  // `func` is published only after the object has been built. If
+  // allocation or construction throws we therefore stay empty, hand
+  // the storage back and rethrow; no destructor is ever run on an
+  // object that was never constructed.
+  //
+  template<typename T, typename ...Args>
+  std::decay_t<T>& construct(Args&& ...args) {
+    using Td = std::decay_t<T>;
+    static_assert(capacity == dynamic || sizeof(Td) <= capacity,
+                  "Supplied type is too large for this specialization.");
+    try {
+      Td* obj = new (alloc_storage(sizeof(Td)))
+        Td(std::forward<Args>(args)...);
+      func = &op_func<Td>;
+      return *obj;
+    } catch (...) {
+      free_storage();
+      throw;
+    }
+  }
+
+protected:
+
+  // We hold the storage, even if the derived class manipulates it,
+  // so that its default initialization comes soon enough for us to
+  // use it in our constructors.
+  //
+  storage_t storage;
+
+public:
+
+  base() noexcept = default;
+  ~base() noexcept {
+    reset();
+  }
+
+protected:
+  // Since some of our derived classes /can/ be copied or moved.
+  //
+  // Copying duplicates the driver function and, when storage_t is
+  // copy assignable, the storage handle itself; the stored object is
+  // not copy-constructed.
+  //
+  // NOTE(review): if copying storage_t shares the underlying buffer,
+  // both copies end up with the same `func` and the same ptr(), and
+  // each one's reset() runs the destructor on it. Confirm that this is
+  // acceptable for the types stored in such derived classes.
+  //
+  base(const base& rhs) noexcept : func(rhs.func) {
+    if constexpr (std::is_copy_assignable_v<storage_t>) {
+      storage = rhs.storage;
+    }
+  }
+  base& operator =(const base& rhs) noexcept {
+    reset();
+    func = rhs.func;
+    if constexpr (std::is_copy_assignable_v<storage_t>) {
+      storage = rhs.storage;
+    }
+    return *this;
+  }
+
+  // Moving takes over the driver function and the storage and leaves
+  // `rhs` empty.
+  //
+  base(base&& rhs) noexcept : func(std::move(rhs.func)) {
+    if constexpr (std::is_move_assignable_v<storage_t>) {
+      storage = std::move(rhs.storage);
+    }
+    rhs.func = nullptr;
+  }
+  base& operator =(base&& rhs) noexcept {
+    reset();
+    func = rhs.func;
+    if constexpr (std::is_move_assignable_v<storage_t>) {
+      storage = std::move(rhs.storage);
+    }
+    rhs.func = nullptr;
+    return *this;
+  }
+
+public:
+
+  // Value construct/assign
+  // ----------------------
+  //
+  template<typename T,
+           typename = value_condition_t<T>>
+  base(T&& t) noexcept(value_noexcept_v<T>) {
+    construct<T>(std::forward<T>(t));
+  }
+
+  // On exception, *this is set to empty.
+  //
+  template<typename T,
+           typename = value_condition_t<T>>
+  base& operator =(T&& t) noexcept(value_noexcept_v<T>) {
+    reset();
+    construct<T>(std::forward<T>(t));
+    return *this;
+  }
+
+  // In-place construct/assign
+  // -------------------------
+  //
+  // I really hate the way the C++ standard library treats references
+  // as if they were stepchildren in a Charles Dickens novel. I am
+  // quite upset that std::optional lacks a specialization for
+  // references. There's no legitimate reason for it. The whole
+  // 're-seat or refuse' debate is simply a canard. The optional is
+  // effectively a container, so of course it can be emptied or
+  // reassigned. No, pointers are not an acceptable substitute. A
+  // pointer gives an address in memory which may be null and which
+  // may represent an object or may be a location in which an object
+  // is to be created. An optional reference, on the other hand, is a
+  // reference to an initialized, live object or /empty/. This is an
+  // obvious difference that should be communicable to any programmer
+  // reading the code through the type system.
+  //
+  // `std::any`, even in the case of in-place construction,
+  // only stores the decayed type. I suspect this was to get around
+  // the question of whether, for a std::any holding a T&,
+  // std::any_cast<T> should return a copy or throw
+  // std::bad_any_cast.
+  //
+  // I think the appropriate response in that case would be to make a
+  // copy if the type supports it and fail otherwise. Once a concrete
+  // type is known the problem solves itself.
+  //
+  // If one were inclined, one could easily load the driver function
+  // with a heavy subset of the type traits (those that depend only on
+  // the type in question) and simply /ask/ whether it's a reference.
+  //
+  // At the moment, I'm maintaining compatibility with the standard
+  // library except for copy/move semantics.
+  //
+  template<typename T,
+           typename... Args,
+           typename = in_place_condition_t<T, Args...>>
+  base(std::in_place_type_t<T>,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, Args...>) {
+    construct<T>(std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  // Participates in overload resolution only if the decayed T is
+  // constructible from Args..., matching the in-place constructor.
+  //
+  template<typename T,
+           typename... Args,
+           typename = in_place_condition_t<T, Args...>>
+  std::decay_t<T>& emplace(Args&& ...args) noexcept(in_place_noexcept_v<
+                                                    T, Args...>) {
+    reset();
+    return construct<T>(std::forward<Args>(args)...);
+  }
+
+  template<typename T,
+           typename U,
+           typename... Args,
+           typename = in_place_condition_t<T, std::initializer_list<U>,
+                                           Args...>>
+  base(std::in_place_type_t<T>,
+       std::initializer_list<U> i,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, std::initializer_list<U>,
+                                Args...>) {
+    construct<T>(i, std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  template<typename T,
+           typename U,
+           typename... Args,
+           typename = in_place_condition_t<T, std::initializer_list<U>,
+                                           Args...>>
+  std::decay_t<T>& emplace(std::initializer_list<U> i,
+                           Args&& ...args) noexcept(in_place_noexcept_v<T,
+                                                    std::initializer_list<U>,
+                                                    Args...>) {
+    reset();
+    return construct<T>(i,std::forward<Args>(args)...);
+  }
+
+  // Empty ourselves, using the derived class to free any storage.
+  //
+  void reset() noexcept {
+    if (has_value()) {
+      func(op::destroy, ptr());
+      func = nullptr;
+    }
+    free_storage();
+  }
+
+  // Exchange contents with `rhs`. Only available when the storage
+  // type is swappable; the condition is phrased on the defaulted
+  // parameter `U` so that it is checked when swap is used rather than
+  // when the class is instantiated.
+  //
+  template<typename U = storage_t,
+           typename = std::enable_if_t<std::is_swappable_v<U>>>
+  void swap(base& rhs) noexcept(std::is_nothrow_swappable_v<U>) {
+    using std::swap;
+    swap(func, rhs.func);
+    swap(storage, rhs.storage);
+  }
+
+  // All other functions should use this function to test emptiness
+  // rather than examining `func` directly.
+  //
+  bool has_value() const noexcept {
+    return !!func;
+  }
+
+  // Returns the type of the value stored, or typeid(void) when empty.
+  //
+  const std::type_info& type() const noexcept {
+    if (has_value()) {
+      const std::type_info* t;
+      func(op::type, reinterpret_cast<void*>(&t));
+      return *t;
+    } else {
+      return typeid(void);
+    }
+  }
+
+  template<typename T, typename U, typename V>
+  friend inline void* cast_helper(const base<U, V>& b) noexcept;
+};
+
+// Shared back end of every `any_cast` overload.
+//
+// Yields a void* to the stored object when `b` holds a value whose
+// type is T, and `nullptr` otherwise. The driver-function address is
+// compared first; the type_info comparison is the fallback.
+//
+template<typename T, typename U, typename V>
+inline void* cast_helper(const base<U, V>& b) noexcept {
+  if (!b.func) {
+    return nullptr;
+  }
+  const bool same_driver = (&op_func<T> == b.func);
+  if (same_driver || b.type() == typeid(T)) {
+    return b.ptr();
+  }
+  return nullptr;
+}
+}
+
+// `any_cast`
+// ==========
+//
+// Just the usual gamut of `any_cast` overloads. These get a bit
+// repetitive and it would be nice to think of a way to collapse them
+// down a bit.
+//
+
+// The pointer pair! (mutable flavour)
+//
+// Null in, null out; also null when the held type is not T.
+//
+template<typename T, typename U, typename V>
+inline T* any_cast(_any::base<U, V>* a) noexcept {
+  return a ? static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a))
+           : nullptr;
+}
+
+// Const flavour of the pointer overload: same lookup, read-only result.
+template<typename T, typename U, typename V>
+inline const T* any_cast(const _any::base<U, V>* a) noexcept {
+  return a ? static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a))
+           : nullptr;
+}
+
+// The any itself may not be copyable, but a value extracted from it
+// can be anything the stored type supports: a reference to it, or a
+// copy of it.
+//
+// Throws std::bad_any_cast if `a` does not hold a decayed T.
+//
+template<typename T, typename U, typename V>
+inline T any_cast(_any::base<U, V>& a) {
+  static_assert(std::is_reference_v<T> ||
+                std::is_copy_constructible_v<T>,
+                "The supplied type must be either a reference or "
+                "copy constructible.");
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return static_cast<T>(*held);
+}
+
+// Const-reference overload; throws std::bad_any_cast on a type
+// mismatch or when `a` is empty.
+template<typename T, typename U, typename V>
+inline T any_cast(const _any::base<U, V>& a) {
+  static_assert(std::is_reference_v<T> ||
+                std::is_copy_constructible_v<T>,
+                "The supplied type must be either a reference or "
+                "copy constructible.");
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return static_cast<T>(*held);
+}
+
+// Rvalue overload for a T that is not an rvalue reference: the result
+// is built from the stored object cast to an rvalue. Throws
+// std::bad_any_cast on a type mismatch or when `a` is empty.
+template<typename T, typename U, typename V>
+inline std::enable_if_t<(std::is_move_constructible_v<T> ||
+                         std::is_copy_constructible_v<T>) &&
+                        !std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return std::move(*held);
+}
+
+// Rvalue overload for a T that is itself an rvalue reference: hands
+// back an rvalue reference to the stored object. Throws
+// std::bad_any_cast on a type mismatch or when `a` is empty.
+template<typename T, typename U, typename V>
+inline std::enable_if_t<std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return static_cast<T>(*held);
+}
+
+// `immobile_any`
+// ==============
+//
+// Sometimes, uncopyable objects exist and I want to do things with
+// them. The C++ standard library is really quite keen on insisting
+// things be copyable before it deigns to work. I find this annoying.
+//
+// Also, the allocator, while useful, is really not considerate of
+// other people's time. Every time we go to visit it, it takes us
+// quite an awfully long time to get away again. As such, I've been
+// trying to avoid its company whenever it is convenient and seemly.
+//
+// We accept any type that will fit in the declared capacity. You may
+// store types with throwing destructors, but terminate will be
+// invoked when they throw.
+//
+template<std::size_t S>
+class immobile_any : public _any::base<immobile_any<S>,
+                                       std::aligned_storage_t<S>> {
+  using base = _any::base<immobile_any<S>, std::aligned_storage_t<S>>;
+  friend base;
+
+  using base::storage;
+
+  // What the base class requires of us
+  // ----------------------------------
+  //
+  // The buffer is the fixed-size `storage` member itself, so there is
+  // nothing to allocate or free: we just hand out its address.
+  //
+  static constexpr std::size_t capacity = S;
+
+  static_assert(capacity != _any::dynamic,
+                "That is not a valid size for an immobile_any.");
+
+  void* ptr() const noexcept {
+    const void* raw = &storage;
+    return const_cast<void*>(raw);
+  }
+  void* alloc_storage(std::size_t) noexcept {
+    return ptr();
+  }
+  void free_storage() noexcept {}
+
+public:
+
+  immobile_any() noexcept = default;
+
+  // Neither copyable nor movable nor swappable: the value stays where
+  // it was constructed.
+  immobile_any(const immobile_any&) = delete;
+  immobile_any(immobile_any&&) = delete;
+  immobile_any& operator =(const immobile_any&) = delete;
+  immobile_any& operator =(immobile_any&&) = delete;
+  void swap(immobile_any&) = delete;
+
+  // Value and in-place construction/assignment come from the base.
+  using base::base;
+  using base::operator =;
+};
+
+// Build an immobile_any<S> holding a T constructed in place from
+// `args`. The result is returned as a prvalue, since the type can be
+// neither copied nor moved.
+template<typename T, std::size_t S, typename... Args>
+inline auto make_immobile_any(Args&& ...args) -> immobile_any<S> {
+  return immobile_any<S>(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// As above, for a T whose constructor takes a leading initializer list.
+template<typename T, std::size_t S, typename U, typename... Args>
+inline auto make_immobile_any(std::initializer_list<U> i, Args&& ...args)
+  -> immobile_any<S> {
+  return immobile_any<S>(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `unique_any`
+// ============
+//
+// Oh dear. Now we're getting back into allocation. You don't think
+// the allocator noticed all those mean things we said about it, do
+// you?
+//
+// Well. Okay, allocator. Sometimes when it's the middle of the night
+// and you're writing template code you say things you don't exactly
+// mean. If it weren't for you, we wouldn't have any memory to run all
+// our programs in at all. Really, I'm just being considerate of
+// *your* needs, trying to avoid having to run to you every time we
+// instantiate a type, making a few that can be self-sufficient…uh…
+//
+// **Anyway**, this is movable but not copyable, as you should expect
+// from anything with ‘unique’ in the name.
+//
+class unique_any : public _any::base<unique_any, std::unique_ptr<std::byte[]>> {
+  using base = _any::base<unique_any, std::unique_ptr<std::byte[]>>;
+  friend base;
+
+  using base::storage;
+
+  // Superclass requirements
+  // -----------------------
+  //
+  // Our storage is a single chunk of RAM owned by a
+  // `std::unique_ptr`.
+  //
+  static constexpr std::size_t capacity = _any::dynamic;
+
+  // Address of the current chunk, or null when none is allocated.
+  void* ptr() const noexcept {
+    return static_cast<void*>(storage.get());
+  }
+
+  // Replace the current chunk with a fresh one of `z` bytes.
+  void* alloc_storage(const std::size_t z) {
+    storage.reset(new std::byte[z]);
+    return ptr();
+  }
+
+  // Release the chunk; harmless when there is none.
+  void free_storage() noexcept {
+    storage.reset();
+  }
+
+public:
+
+  unique_any() noexcept = default;
+  ~unique_any() noexcept = default;
+
+  unique_any(const unique_any&) = delete;
+  unique_any& operator =(const unique_any&) = delete;
+
+  // We can rely on the behavior of `unique_ptr` and the base class to
+  // give us a default move constructor that does the right thing.
+  //
+  unique_any(unique_any&& rhs) noexcept = default;
+  unique_any& operator =(unique_any&& rhs) = default;
+
+  using base::base;
+  using base::operator =;
+};
+
+// Free-function swap for unique_any; forwards to the member swap.
+inline void swap(unique_any& lhs, unique_any& rhs) noexcept
+{
+  lhs.swap(rhs);
+}
+
+// Build a unique_any holding a T constructed in place from `args`.
+template<typename T, typename... Args>
+inline auto make_unique_any(Args&& ...args) -> unique_any {
+  return unique_any(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// As above, for a T whose constructor takes a leading initializer list.
+template<typename T, typename U, typename... Args>
+inline auto make_unique_any(std::initializer_list<U> i, Args&& ...args)
+  -> unique_any {
+  return unique_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `shared_any`
+// ============
+//
+// Once more with feeling!
+//
+// This is both copyable *and* movable. In case you need that sort of
+// thing. It seemed a reasonable completion.
+//
+class shared_any : public _any::base<shared_any, boost::shared_ptr<std::byte[]>> {
+  using base = _any::base<shared_any, boost::shared_ptr<std::byte[]>>;
+  friend base;
+
+  using base::storage;
+
+  // What the base class requires of us
+  // ----------------------------------
+  //
+  // Storage is one heap chunk held by a `boost::shared_ptr`, obtained
+  // through `boost::make_shared_noinit`.
+  //
+  // NOTE(review): a copied shared_any holds the same chunk and the
+  // same driver function as its source, and the base class runs the
+  // stored object's destructor from every instance's reset(). Confirm
+  // that this is intended for the types stored here.
+  //
+  static constexpr std::size_t capacity = _any::dynamic;
+
+  void* ptr() const noexcept {
+    std::byte* raw = storage.get();
+    return raw;
+  }
+
+  void* alloc_storage(std::size_t n) {
+    storage = boost::make_shared_noinit<std::byte[]>(n);
+    return ptr();
+  }
+
+  void free_storage() noexcept {
+    storage.reset();
+  }
+
+public:
+
+  shared_any() noexcept = default;
+  ~shared_any() noexcept = default;
+
+  // Both copyable and movable.
+  shared_any(const shared_any& rhs) noexcept = default;
+  shared_any(shared_any&& rhs) noexcept = default;
+  shared_any& operator =(const shared_any&) noexcept = default;
+  shared_any& operator =(shared_any&& rhs) noexcept = default;
+
+  // Value and in-place construction/assignment come from the base.
+  using base::base;
+  using base::operator =;
+};
+
+// Free-function swap for shared_any; forwards to the member swap.
+inline void swap(shared_any& lhs, shared_any& rhs) noexcept
+{
+  lhs.swap(rhs);
+}
+
+// Build a shared_any holding a T constructed in place from `args`.
+template<typename T, typename... Args>
+inline auto make_shared_any(Args&& ...args) -> shared_any {
+  return shared_any(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// As above, for a T whose constructor takes a leading initializer list.
+template<typename T, typename U, typename... Args>
+inline auto make_shared_any(std::initializer_list<U> i, Args&& ...args)
+  -> shared_any {
+  return shared_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+}
+
+#endif // INCLUDE_STATIC_ANY
diff --git a/src/include/bitmapper.h b/src/include/bitmapper.h
new file mode 100644
index 000000000..5a65cc20f
--- /dev/null
+++ b/src/include/bitmapper.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BITMAPPER_H
+#define CEPH_BITMAPPER_H
+
+// Bit-level view over a caller-supplied char buffer. The class stores
+// only the pointer and the byte length it is given; it performs no
+// allocation and no bounds checking.
+class bitmapper {
+  char *_data;
+  int _len;
+
+  // Index of the byte that contains bit `b`.
+  static int byte_of(int b) { return b >> 3; }
+  // Mask selecting bit `b` inside its byte (bit 0 is the lowest bit).
+  static int mask_of(int b) { return 1 << (b & 7); }
+
+ public:
+  bitmapper() : _data(nullptr), _len(0) { }
+  bitmapper(char *data, int len) : _data(data), _len(len) { }
+
+  // Point the view at a different buffer.
+  void set_data(char *data, int len) {
+    _data = data;
+    _len = len;
+  }
+
+  // Size of the view in bytes and in bits.
+  int bytes() const { return _len; }
+  int bits() const { return _len * 8; }
+
+  // Read bit `b`.
+  bool get(int b) const {
+    return (_data[byte_of(b)] & mask_of(b)) != 0;
+  }
+  bool operator[](int b) const {
+    return get(b);
+  }
+
+  // Set, clear or flip bit `b`.
+  void set(int b) {
+    _data[byte_of(b)] |= mask_of(b);
+  }
+  void clear(int b) {
+    _data[byte_of(b)] &= ~mask_of(b);
+  }
+  void toggle(int b) {
+    _data[byte_of(b)] ^= mask_of(b);
+  }
+};
+
+#endif
diff --git a/src/include/blobhash.h b/src/include/blobhash.h
new file mode 100644
index 000000000..303892b13
--- /dev/null
+++ b/src/include/blobhash.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BLOBHASH_H
+#define CEPH_BLOBHASH_H
+
+#include <cstdint>
+#include "hash.h"
+
+// Hashes an arbitrary byte range: the bytes are XOR-folded into one
+// 32-bit accumulator, which is then passed through rjhash.
+class blobhash {
+public:
+  // p:   start of the bytes to hash
+  // len: number of bytes at p
+  uint32_t operator()(const void* p, size_t len) {
+    static rjhash<std::uint32_t> H;
+    const unsigned char* cursor = static_cast<const unsigned char*>(p);
+    std::uint32_t acc = 0;
+
+    // Fold in every complete 32-bit word.
+    for (; len >= sizeof(acc); len -= sizeof(acc), cursor += sizeof(acc)) {
+      acc ^= unaligned_load(cursor);
+    }
+
+    // Fold in the 0-3 bytes left over, highest byte first.
+    if (len == 3) {
+      acc ^= cursor[2] << 16;
+    }
+    if (len >= 2) {
+      acc ^= cursor[1] << 8;
+    }
+    if (len >= 1) {
+      acc ^= cursor[0];
+    }
+    return H(acc);
+  }
+private:
+  // Read a 32-bit word from `p` via memcpy, so `p` need not be aligned.
+  static inline std::uint32_t unaligned_load(const unsigned char* p) {
+    std::uint32_t word;
+    __builtin_memcpy(&word, p, sizeof(word));
+    return word;
+  }
+};
+
+
+#endif
diff --git a/src/include/btree_map.h b/src/include/btree_map.h
new file mode 100644
index 000000000..218835a0f
--- /dev/null
+++ b/src/include/btree_map.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_BTREE_MAP_H
+#define CEPH_INCLUDE_BTREE_MAP_H
+
+#include "include/cpp-btree/btree.h"
+#include "include/cpp-btree/btree_map.h"
+#include "include/ceph_assert.h" // cpp-btree uses system assert, blech
+#include "include/encoding.h"
+
+// Encode a btree_map as a 32-bit entry count followed by each
+// key/value pair in iteration order.
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, ceph::buffer::list& bl)
+{
+  using ceph::encode;
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware variant: same layout, with `features` passed along to
+// the encoder of every key and value.
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, ceph::buffer::list& bl, uint64_t features)
+{
+  using ceph::encode;
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Decode a btree_map: read the 32-bit entry count, empty `m`, then
+// read that many key/value pairs. Each value is decoded directly into
+// the slot `m[key]`.
+template<class T, class U>
+inline void decode(btree::btree_map<T,U>& m, ceph::buffer::list::const_iterator& p)
+{
+  using ceph::decode;
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  for (__u32 remaining = n; remaining != 0; --remaining) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+// Encode only the key/value pairs, without the leading entry count.
+template<class T, class U>
+inline void encode_nohead(const btree::btree_map<T,U>& m, ceph::buffer::list& bl)
+{
+  using ceph::encode;
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Decode `n` key/value pairs into `m` (emptied first); the entry count
+// is supplied by the caller instead of being read from the buffer.
+template<class T, class U>
+inline void decode_nohead(int n, btree::btree_map<T,U>& m, ceph::buffer::list::const_iterator& p)
+{
+  using ceph::decode;
+  m.clear();
+  for (; n != 0; --n) {
+    T key;
+    decode(key, p);
+    decode(m[key], p);
+  }
+}
+
+#endif
diff --git a/src/include/buffer.h b/src/include/buffer.h
new file mode 100644
index 000000000..71cb01935
--- /dev/null
+++ b/src/include/buffer.h
@@ -0,0 +1,1284 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_BUFFER_H
+#define CEPH_BUFFER_H
+
+#if defined(__linux__) || defined(__FreeBSD__)
+#include <stdlib.h>
+#endif
+#include <limits.h>
+
+#ifndef _XOPEN_SOURCE
+# define _XOPEN_SOURCE 600
+#endif
+
+#include <stdio.h>
+#include <sys/uio.h>
+
+#if defined(__linux__) // For malloc(3).
+#include <malloc.h>
+#endif
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !defined(__CYGWIN__) && !defined(_WIN32)
+# include <sys/mman.h>
+#endif
+
+#include <iosfwd>
+#include <iomanip>
+#include <list>
+#include <memory>
+#include <vector>
+#include <string>
+#if __cplusplus >= 201703L
+#include <string_view>
+#endif // __cplusplus >= 201703L
+
+#include <exception>
+#include <type_traits>
+
+#include "page.h"
+#include "crc32c.h"
+#include "buffer_fwd.h"
+
+
+#ifdef __CEPH__
+# include "include/ceph_assert.h"
+#else
+# include <assert.h>
+#endif
+
+#include "inline_memory.h"
+
+#define CEPH_BUFFER_API
+
+#ifdef HAVE_SEASTAR
+namespace seastar {
+template <typename T> class temporary_buffer;
+namespace net {
+class packet;
+}
+}
+#endif // HAVE_SEASTAR
+class deleter;
+
+template<typename T> class DencDumper;
+
+namespace ceph {
+
+/// Deleter that deliberately does nothing: operator() accepts the pointer
+/// and leaves the pointee untouched.
+template <class T>
+struct nop_delete {
+ void operator()(T*) {}
+};
+
+// This is NOT a unique_ptr-like smart pointer! It only signals ownership;
+// it DOES NOT manage the resource, which WILL LEAK unless deleted manually.
+// Treat it as a replacement for a raw pointer rather than for any other
+// smart pointer.
+//
+// Considered options:
+// * unique_ptr with custom deleter implemented in .cc (would provide
+// the non-zero-cost resource management),
+// * GSL's owner<T*> (pretty neat but would impose an extra dependency),
+// * unique_ptr with nop deleter,
+// * raw pointer (doesn't embed ownership enforcement - std::move).
+template <class T>
+struct unique_leakable_ptr : public std::unique_ptr<T, ceph::nop_delete<T>> {
+ // inherit all of unique_ptr's constructors unchanged
+ using std::unique_ptr<T, ceph::nop_delete<T>>::unique_ptr;
+};
+
+namespace buffer CEPH_BUFFER_API {
+inline namespace v15_2_0 {
+
+/// Actual definitions in common/error_code.h
+struct error;
+struct bad_alloc;
+struct end_of_buffer;
+struct malformed_input;
+struct error_code;
+
+ /// count of cached crc hits (matching input)
+ int get_cached_crc();
+ /// count of cached crc hits (mismatching input, required adjustment)
+ int get_cached_crc_adjusted();
+ /// count of crc cache misses
+ int get_missed_crc();
+ /// enable/disable tracking of cached crcs
+ void track_cached_crc(bool b);
+
+ /*
+ * an abstract raw buffer. with a reference count.
+ */
+ class raw;
+ class raw_malloc;
+ class raw_static;
+ class raw_posix_aligned;
+ class raw_hack_aligned;
+ class raw_char;
+ class raw_claimed_char;
+ class raw_unshareable; // diagnostic, unshareable char buffer
+ class raw_combined;
+ class raw_claim_buffer;
+
+
+ /*
+ * named constructors
+ */
+ ceph::unique_leakable_ptr<raw> copy(const char *c, unsigned len);
+ ceph::unique_leakable_ptr<raw> create(unsigned len);
+ ceph::unique_leakable_ptr<raw> create(unsigned len, char c);
+ ceph::unique_leakable_ptr<raw> create_in_mempool(unsigned len, int mempool);
+ ceph::unique_leakable_ptr<raw> claim_char(unsigned len, char *buf);
+ ceph::unique_leakable_ptr<raw> create_malloc(unsigned len);
+ ceph::unique_leakable_ptr<raw> claim_malloc(unsigned len, char *buf);
+ ceph::unique_leakable_ptr<raw> create_static(unsigned len, char *buf);
+ ceph::unique_leakable_ptr<raw> create_aligned(unsigned len, unsigned align);
+ ceph::unique_leakable_ptr<raw> create_aligned_in_mempool(unsigned len, unsigned align, int mempool);
+ ceph::unique_leakable_ptr<raw> create_page_aligned(unsigned len);
+ ceph::unique_leakable_ptr<raw> create_small_page_aligned(unsigned len);
+ ceph::unique_leakable_ptr<raw> claim_buffer(unsigned len, char *buf, deleter del);
+
+#ifdef HAVE_SEASTAR
+ /// create a raw buffer to wrap seastar cpu-local memory, using foreign_ptr to
+ /// make it safe to share between cpus
+ ceph::unique_leakable_ptr<buffer::raw> create_foreign(seastar::temporary_buffer<char>&& buf);
+ /// create a raw buffer to wrap seastar cpu-local memory, without the safety
+ /// of foreign_ptr. the caller must otherwise guarantee that the buffer ptr is
+ /// destructed on this cpu
+ ceph::unique_leakable_ptr<buffer::raw> create(seastar::temporary_buffer<char>&& buf);
+#endif
+
+ /*
+ * a buffer pointer. references (a subsequence of) a raw buffer.
+ */
+ class CEPH_BUFFER_API ptr {
+ friend class list;
+ protected:
+ raw *_raw;
+ unsigned _off, _len;
+ private:
+
+ void release();
+
+ /// Cursor over the bytes of a single ptr. It caches the start, current and
+ /// end addresses taken from the parent ptr at construction time.
+ /// Only ptr can construct one (private constructor + friend).
+ template<bool is_const>
+ class iterator_impl {
+ const ptr *bp; ///< parent ptr
+ const char *start; ///< starting pointer into bp->c_str()
+ const char *pos; ///< pointer into bp->c_str()
+ const char *end_ptr; ///< pointer to bp->end_c_str()
+ const bool deep; ///< if true, do not allow shallow ptr copies
+
+ // Position the cursor `offset` bytes into p; `d` selects deep-copy mode
+ // for get_ptr().
+ iterator_impl(typename std::conditional<is_const, const ptr*, ptr*>::type p,
+ size_t offset, bool d)
+ : bp(p),
+ start(p->c_str() + offset),
+ pos(start),
+ end_ptr(p->end_c_str()),
+ deep(d)
+ {}
+
+ friend class ptr;
+
+ public:
+ using pointer = typename std::conditional<is_const, const char*, char *>::type;
+ /// Return the current position, then advance the cursor by n bytes.
+ pointer get_pos_add(size_t n) {
+ auto r = pos;
+ *this += n;
+ return r;
+ }
+ /// Return a ptr for the next len bytes and advance past them.
+ /// In deep mode the bytes are copied via buffer::copy(); otherwise the
+ /// result is built from the parent ptr with an offset/length
+ /// (presumably sharing its raw buffer -- confirm in the ptr ctor).
+ ptr get_ptr(size_t len) {
+ if (deep) {
+ return buffer::copy(get_pos_add(len), len);
+ } else {
+ // offset is relative to the parent's c_str(), not to `start`
+ size_t off = pos - bp->c_str();
+ *this += len;
+ return ptr(*bp, off, len);
+ }
+ }
+
+ iterator_impl& operator+=(size_t len);
+
+ /// Current read position.
+ const char *get_pos() {
+ return pos;
+ }
+ /// One-past-the-end address captured from the parent ptr.
+ const char *get_end() {
+ return end_ptr;
+ }
+
+ /// Bytes consumed since construction (relative to `start`).
+ size_t get_offset() {
+ return pos - start;
+ }
+
+ /// True once the cursor has reached the end address.
+ bool end() const {
+ return pos == end_ptr;
+ }
+ };
+
+ public:
+ using const_iterator = iterator_impl<true>;
+ using iterator = iterator_impl<false>;
+
+ ptr() : _raw(nullptr), _off(0), _len(0) {}
+ ptr(ceph::unique_leakable_ptr<raw> r);
+ // cppcheck-suppress noExplicitConstructor
+ ptr(unsigned l);
+ ptr(const char *d, unsigned l);
+ ptr(const ptr& p);
+ ptr(ptr&& p) noexcept;
+ ptr(const ptr& p, unsigned o, unsigned l);
+ ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r);
+ ptr& operator= (const ptr& p);
+ ptr& operator= (ptr&& p) noexcept;
+ ~ptr() {
+ // BE CAREFUL: this destructor is called also for hypercombined ptr_node.
+ // After freeing underlying raw, `*this` can become inaccessible as well!
+ release();
+ }
+
+ /// True when this ptr currently references a raw buffer.
+ bool have_raw() const { return _raw != nullptr; }
+
+ ceph::unique_leakable_ptr<raw> clone();
+ void swap(ptr& other) noexcept;
+
+ iterator begin(size_t offset=0) {
+ return iterator(this, offset, false);
+ }
+ const_iterator begin(size_t offset=0) const {
+ return const_iterator(this, offset, false);
+ }
+ const_iterator cbegin() const {
+ return begin();
+ }
+ const_iterator begin_deep(size_t offset=0) const {
+ return const_iterator(this, offset, true);
+ }
+
+ // misc
+ bool is_aligned(unsigned align) const {
+ return ((uintptr_t)c_str() & (align-1)) == 0;
+ }
+ bool is_page_aligned() const { return is_aligned(CEPH_PAGE_SIZE); }
+ bool is_n_align_sized(unsigned align) const
+ {
+ return (length() % align) == 0;
+ }
+ bool is_n_page_sized() const { return is_n_align_sized(CEPH_PAGE_SIZE); }
+ /// True when a raw buffer is attached but this ptr does not span all of
+ /// it, i.e. it starts past offset 0 or ends before raw_length().
+ bool is_partial() const {
+   if (!have_raw()) {
+     return false;
+   }
+   if (start() > 0) {
+     return true;
+   }
+   return end() < raw_length();
+ }
+
+ int get_mempool() const;
+ void reassign_to_mempool(int pool);
+ void try_assign_to_mempool(int pool);
+
+ // accessors
+ const char *c_str() const;
+ char *c_str();
+ const char *end_c_str() const;
+ char *end_c_str();
+ unsigned length() const { return _len; }
+ unsigned offset() const { return _off; }
+ unsigned start() const { return _off; }
+ unsigned end() const { return _off + _len; }
+ unsigned unused_tail_length() const;
+ const char& operator[](unsigned n) const;
+ char& operator[](unsigned n);
+
+ const char *raw_c_str() const;
+ unsigned raw_length() const;
+ int raw_nref() const;
+
+ void copy_out(unsigned o, unsigned l, char *dest) const;
+
+ unsigned wasted() const;
+
+ int cmp(const ptr& o) const;
+ bool is_zero() const;
+
+ // modifiers
+ /// Set the start offset into the raw buffer. Asserts that o does not
+ /// exceed raw_length(); the current length is not re-validated here.
+ void set_offset(unsigned o) {
+#ifdef __CEPH__
+ ceph_assert(raw_length() >= o);
+#else
+ assert(raw_length() >= o);
+#endif
+ _off = o;
+ }
+ /// Set the length of this ptr. Asserts that l does not exceed
+ /// raw_length(); note the check does not take the current offset into
+ /// account.
+ void set_length(unsigned l) {
+#ifdef __CEPH__
+ ceph_assert(raw_length() >= l);
+#else
+ assert(raw_length() >= l);
+#endif
+ _len = l;
+ }
+
+ unsigned append(char c);
+ unsigned append(const char *p, unsigned l);
+#if __cplusplus >= 201703L
+ inline unsigned append(std::string_view s) {
+ return append(s.data(), s.length());
+ }
+#endif // __cplusplus >= 201703L
+ void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset = true);
+ void zero(bool crc_reset = true);
+ void zero(unsigned o, unsigned l, bool crc_reset = true);
+ unsigned append_zeros(unsigned l);
+
+#ifdef HAVE_SEASTAR
+ /// create a temporary_buffer, copying the ptr as its deleter
+ operator seastar::temporary_buffer<char>() &;
+ /// convert to temporary_buffer, stealing the ptr as its deleter
+ operator seastar::temporary_buffer<char>() &&;
+#endif // HAVE_SEASTAR
+
+ };
+
+
+ /// Link element of a singly linked list: it holds nothing but the pointer
+ /// to the next hook.
+ struct ptr_hook {
+ // mutable: the link can be rewritten even through a const ptr_hook
+ mutable ptr_hook* next;
+
+ // NOTE: the defaulted constructor gives `next` no explicit initializer.
+ ptr_hook() = default;
+ ptr_hook(ptr_hook* const next)
+ : next(next) {
+ }
+ };
+
+ /// A ptr that can be linked into a list: it is both a ptr_hook (the link)
+ /// and a ptr (the payload). Instances are obtained through the static
+ /// create() factories and destroyed through `disposer`; assignment and
+ /// swap are deleted.
+ class ptr_node : public ptr_hook, public ptr {
+ public:
+ /// Functor producing a copy of a node (defined out of line).
+ struct cloner {
+ ptr_node* operator()(const ptr_node& clone_this);
+ };
+ /// Functor destroying a node: first gives dispose_if_hypercombined() a
+ /// chance to handle it, and falls back to plain `delete` when that
+ /// returns false.
+ struct disposer {
+ void operator()(ptr_node* const delete_this) {
+ // hint to the compiler that the hypercombined path is the unexpected one
+ if (!__builtin_expect(dispose_if_hypercombined(delete_this), 0)) {
+ delete delete_this;
+ }
+ }
+ };
+
+ ~ptr_node() = default;
+
+ /// Create a node for an existing raw buffer via create_hypercombined().
+ static std::unique_ptr<ptr_node, disposer>
+ create(ceph::unique_leakable_ptr<raw> r) {
+ return create_hypercombined(std::move(r));
+ }
+ /// Create a node over a freshly created raw buffer of l bytes.
+ static std::unique_ptr<ptr_node, disposer>
+ create(const unsigned l) {
+ return create_hypercombined(buffer::create(l));
+ }
+ /// Generic factory: heap-allocates a ptr_node, forwarding the arguments
+ /// to the matching ptr constructor.
+ template <class... Args>
+ static std::unique_ptr<ptr_node, disposer>
+ create(Args&&... args) {
+ return std::unique_ptr<ptr_node, disposer>(
+ new ptr_node(std::forward<Args>(args)...));
+ }
+
+ static ptr_node* copy_hypercombined(const ptr_node& copy_this);
+
+ private:
+ friend list;
+
+ // Construction is private; callers go through create().
+ template <class... Args>
+ ptr_node(Args&&... args) : ptr(std::forward<Args>(args)...) {
+ }
+ ptr_node(const ptr_node&) = default;
+
+ // Nodes are not assignable or swappable, neither as ptr nor as ptr_node.
+ ptr& operator= (const ptr& p) = delete;
+ ptr& operator= (ptr&& p) noexcept = delete;
+ ptr_node& operator= (const ptr_node& p) = delete;
+ ptr_node& operator= (ptr_node&& p) noexcept = delete;
+ void swap(ptr& other) noexcept = delete;
+ void swap(ptr_node& other) noexcept = delete;
+
+ static bool dispose_if_hypercombined(ptr_node* delete_this);
+ static std::unique_ptr<ptr_node, disposer> create_hypercombined(
+ ceph::unique_leakable_ptr<raw> r);
+ };
+ /*
+ * list - the useful bit!
+ */
+
+ class CEPH_BUFFER_API list {
+ public:
+ // this is the very low-level implementation of the singly linked list
+ // that ceph::buffer::list is built on. We don't use the intrusive slist
+ // of Boost (or any other 3rd party) to save extra dependencies
+ // in our public headers.
+ class buffers_t {
+ // _root.next can be thought as _head
+ ptr_hook _root;
+ ptr_hook* _tail;
+
+ public:
+ template <class T>
+ class buffers_iterator {
+ typename std::conditional<
+ std::is_const<T>::value, const ptr_hook*, ptr_hook*>::type cur;
+ template <class U> friend class buffers_iterator;
+ public:
+ using value_type = T;
+ using reference = typename std::add_lvalue_reference<T>::type;
+ using pointer = typename std::add_pointer<T>::type;
+ using difference_type = std::ptrdiff_t;
+ using iterator_category = std::forward_iterator_tag;
+
+ template <class U>
+ buffers_iterator(U* const p)
+ : cur(p) {
+ }
+ template <class U>
+ buffers_iterator(const buffers_iterator<U>& other)
+ : cur(other.cur) {
+ }
+ buffers_iterator() = default;
+
+ T& operator*() const {
+ return *reinterpret_cast<T*>(cur);
+ }
+ T* operator->() const {
+ return reinterpret_cast<T*>(cur);
+ }
+
+ buffers_iterator& operator++() {
+ cur = cur->next;
+ return *this;
+ }
+ buffers_iterator operator++(int) {
+ const auto temp(*this);
+ ++*this;
+ return temp;
+ }
+
+ template <class U>
+ buffers_iterator& operator=(buffers_iterator<U>& other) {
+ cur = other.cur;
+ return *this;
+ }
+
+ bool operator==(const buffers_iterator& rhs) const {
+ return cur == rhs.cur;
+ }
+ bool operator!=(const buffers_iterator& rhs) const {
+ return !(*this==rhs);
+ }
+
+ using citer_t = buffers_iterator<typename std::add_const<T>::type>;
+ operator citer_t() const {
+ return citer_t(cur);
+ }
+ };
+
+ typedef buffers_iterator<const ptr_node> const_iterator;
+ typedef buffers_iterator<ptr_node> iterator;
+
+ typedef const ptr_node& const_reference;
+ typedef ptr_node& reference;
+
+ buffers_t()
+ : _root(&_root),
+ _tail(&_root) {
+ }
+ buffers_t(const buffers_t&) = delete;
+ buffers_t(buffers_t&& other)
+ : _root(other._root.next == &other._root ? &_root : other._root.next),
+ _tail(other._tail == &other._root ? &_root : other._tail) {
+ other._root.next = &other._root;
+ other._tail = &other._root;
+
+ _tail->next = &_root;
+ }
+ buffers_t& operator=(buffers_t&& other) {
+ if (&other != this) {
+ clear_and_dispose();
+ swap(other);
+ }
+ return *this;
+ }
+
+ /// Link item in as the new last element. The list is circular: the last
+ /// element's `next` always points back at _root.
+ void push_back(reference item) {
+ item.next = &_root;
+ // this updates _root.next when called on empty
+ _tail->next = &item;
+ _tail = &item;
+ }
+
+ /// Link item in as the new first element.
+ void push_front(reference item) {
+   item.next = _root.next;
+   _root.next = &item;
+   // on a previously empty list the new head is also the tail
+   if (_tail == &_root) {
+     _tail = &item;
+   }
+ }
+
+ // *_after
+ /// Unlink the element that follows `it` and return an iterator to the
+ /// element now following `it`. The unlinked node is not destroyed here.
+ iterator erase_after(const_iterator it) {
+ const auto* to_erase = it->next;
+
+ // `next` is mutable, so the link can be rewritten through a const_iterator
+ it->next = to_erase->next;
+ // keep head and tail consistent if the erased node was either of them
+ _root.next = _root.next == to_erase ? to_erase->next : _root.next;
+ _tail = _tail == to_erase ? (ptr_hook*)&*it : _tail;
+ return it->next;
+ }
+
+ /// Link item in directly after the position `it`.
+ void insert_after(const_iterator it, reference item) {
+ item.next = it->next;
+ it->next = &item;
+ // inserting after end() (which is &_root) makes item the new head
+ _root.next = it == end() ? &item : _root.next;
+ // inserting after the current tail makes item the new tail
+ _tail = const_iterator(_tail) == it ? &item : _tail;
+ }
+
+ /// Move every element of other to the end of this list, leaving other
+ /// empty. Only links are rewritten; no nodes are copied.
+ void splice_back(buffers_t& other) {
+ if (other.empty()) {
+ return;
+ }
+
+ // close the circle: other's last element now points at our root
+ other._tail->next = &_root;
+ // will update root.next if empty() == true
+ _tail->next = other._root.next;
+ _tail = other._tail;
+
+ // reset other to the empty state (root linked to itself)
+ other._root.next = &other._root;
+ other._tail = &other._root;
+ }
+
+ bool empty() const { return _tail == &_root; }
+
+ const_iterator begin() const {
+ return _root.next;
+ }
+ const_iterator before_begin() const {
+ return &_root;
+ }
+ const_iterator end() const {
+ return &_root;
+ }
+ iterator begin() {
+ return _root.next;
+ }
+ iterator before_begin() {
+ return &_root;
+ }
+ iterator end() {
+ return &_root;
+ }
+
+ reference front() {
+ return reinterpret_cast<reference>(*_root.next);
+ }
+ reference back() {
+ return reinterpret_cast<reference>(*_tail);
+ }
+ const_reference front() const {
+ return reinterpret_cast<const_reference>(*_root.next);
+ }
+ const_reference back() const {
+ return reinterpret_cast<const_reference>(*_tail);
+ }
+
+ void clone_from(const buffers_t& other) {
+ clear_and_dispose();
+ for (auto& node : other) {
+ ptr_node* clone = ptr_node::cloner()(node);
+ push_back(*clone);
+ }
+ }
+ /// Hand every node to ptr_node::disposer and reset the list to empty.
+ void clear_and_dispose() {
+ for (auto it = begin(); it != end(); /* nop */) {
+ auto& node = *it;
+ // step to the successor before the node is disposed of
+ it = it->next;
+ ptr_node::disposer()(&node);
+ }
+ // empty state: root linked to itself, tail at root
+ _root.next = &_root;
+ _tail = &_root;
+ }
+ iterator erase_after_and_dispose(iterator it) {
+ auto* to_dispose = &*std::next(it);
+ auto ret = erase_after(it);
+ ptr_node::disposer()(to_dispose);
+ return ret;
+ }
+
+ /// Exchange the contents of this list and other by rewriting head and
+ /// tail links. An empty side is represented by links to its *own* root,
+ /// so those links are translated to the new owner's root, not copied.
+ void swap(buffers_t& other) {
+ // remember our head before it is overwritten
+ const auto copy_root = _root;
+ _root.next = \
+ other._root.next == &other._root ? &this->_root : other._root.next;
+ other._root.next = \
+ copy_root.next == &_root ? &other._root : copy_root.next;
+
+ // same translation for the tails
+ const auto copy_tail = _tail;
+ _tail = other._tail == &other._root ? &this->_root : other._tail;
+ other._tail = copy_tail == &_root ? &other._root : copy_tail;
+
+ // re-close both circles: each last element points at its new root
+ _tail->next = &_root;
+ other._tail->next = &other._root;
+ }
+ };
+
+ class iterator;
+
+ private:
+ // my private bits
+ buffers_t _buffers;
+
+ // track bufferptr we can modify (especially ::append() to). Not all bptrs
+ // bufferlist holds have this trait -- if somebody ::push_back(const ptr&),
+ // he expects it won't change.
+ ptr_node* _carriage;
+ unsigned _len, _num;
+
+ template <bool is_const>
+ class CEPH_BUFFER_API iterator_impl {
+ protected:
+ typedef typename std::conditional<is_const,
+ const list,
+ list>::type bl_t;
+ typedef typename std::conditional<is_const,
+ const buffers_t,
+ buffers_t >::type list_t;
+ typedef typename std::conditional<is_const,
+ typename buffers_t::const_iterator,
+ typename buffers_t::iterator>::type list_iter_t;
+ bl_t* bl;
+ list_t* ls; // meh.. just here to avoid an extra pointer dereference..
+ list_iter_t p;
+ unsigned off; // in bl
+ unsigned p_off; // in *p
+ friend class iterator_impl<true>;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = typename std::conditional<is_const, const char, char>::type;
+ using difference_type = std::ptrdiff_t;
+ using pointer = typename std::add_pointer<value_type>::type;
+ using reference = typename std::add_lvalue_reference<value_type>::type;
+
+ // constructor. position.
+ iterator_impl()
+ : bl(0), ls(0), off(0), p_off(0) {}
+ iterator_impl(bl_t *l, unsigned o=0);
+ iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+ : bl(l), ls(&bl->_buffers), p(ip), off(o), p_off(po) {}
+ iterator_impl(const list::iterator& i);
+
+ /// get current iterator offset in buffer::list
+ unsigned get_off() const { return off; }
+
+ /// get number of bytes remaining from iterator position to the end of the buffer::list
+ unsigned get_remaining() const { return bl->length() - off; }
+
+ /// true if iterator is at the end of the buffer::list
+ bool end() const {
+ return p == ls->end();
+ //return off == bl->length();
+ }
+ void seek(unsigned o);
+ char operator*() const;
+ iterator_impl& operator+=(unsigned o);
+ iterator_impl& operator++();
+ ptr get_current_ptr() const;
+ bool is_pointing_same_raw(const ptr& other) const;
+
+ bl_t& get_bl() const { return *bl; }
+
+ // copy data out.
+ // note that these all _append_ to dest!
+ void copy(unsigned len, char *dest);
+ // deprecated, use copy_deep()
+ void copy(unsigned len, ptr &dest) __attribute__((deprecated));
+ void copy_deep(unsigned len, ptr &dest);
+ void copy_shallow(unsigned len, ptr &dest);
+ void copy(unsigned len, list &dest);
+ void copy(unsigned len, std::string &dest);
+ void copy_all(list &dest);
+
+ // get a pointer to the current iterator position, return the
+ // number of bytes we can read from that position (up to want),
+ // and advance the iterator by that amount.
+ size_t get_ptr_and_advance(size_t want, const char **p);
+
+ /// calculate crc from iterator position
+ uint32_t crc32c(size_t length, uint32_t crc);
+
+ friend bool operator==(const iterator_impl& lhs,
+ const iterator_impl& rhs) {
+ return &lhs.get_bl() == &rhs.get_bl() && lhs.get_off() == rhs.get_off();
+ }
+ friend bool operator!=(const iterator_impl& lhs,
+ const iterator_impl& rhs) {
+ return &lhs.get_bl() != &rhs.get_bl() || lhs.get_off() != rhs.get_off();
+ }
+ };
+
+ public:
+ typedef iterator_impl<true> const_iterator;
+
+ class CEPH_BUFFER_API iterator : public iterator_impl<false> {
+ public:
+ iterator() = default;
+ iterator(bl_t *l, unsigned o=0);
+ iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po);
+ // copy data in
+ void copy_in(unsigned len, const char *src, bool crc_reset = true);
+ void copy_in(unsigned len, const list& otherl);
+ };
+
+ struct reserve_t {
+ char* bp_data;
+ unsigned* bp_len;
+ unsigned* bl_len;
+ };
+
+ class contiguous_appender {
+ ceph::bufferlist& bl;
+ ceph::bufferlist::reserve_t space;
+ char* pos;
+ bool deep;
+
+ /// running count of bytes appended that are not reflected by @pos
+ size_t out_of_band_offset = 0;
+
+ contiguous_appender(bufferlist& bl, size_t len, bool d)
+ : bl(bl),
+ space(bl.obtain_contiguous_space(len)),
+ pos(space.bp_data),
+ deep(d) {
+ }
+
+ /// Account for the bytes written since the last flush: add their count to
+ /// the lengths referenced by `space`, then move the base of `space` up
+ /// to the current write position.
+ void flush_and_continue() {
+ const size_t l = pos - space.bp_data;
+ *space.bp_len += l;
+ *space.bl_len += l;
+ space.bp_data = pos;
+ }
+
+ friend class list;
+ template<typename Type> friend class ::DencDumper;
+
+ public:
+ ~contiguous_appender() {
+ flush_and_continue();
+ }
+
+ size_t get_out_of_band_offset() const {
+ return out_of_band_offset;
+ }
+ void append(const char* __restrict__ p, size_t l) {
+ maybe_inline_memcpy(pos, p, l, 16);
+ pos += l;
+ }
+ char *get_pos_add(size_t len) {
+ char *r = pos;
+ pos += len;
+ return r;
+ }
+ char *get_pos() const {
+ return pos;
+ }
+
+ /// Append the contents of p. A zero-length p is ignored. In deep mode
+ /// the bytes are copied into the contiguous space; otherwise pending
+ /// bytes are flushed, p is appended to the bufferlist itself and its
+ /// length is added to out_of_band_offset.
+ void append(const bufferptr& p) {
+ const auto plen = p.length();
+ if (!plen) {
+ return;
+ }
+ if (deep) {
+ append(p.c_str(), plen);
+ } else {
+ flush_and_continue();
+ bl.append(p);
+ // NOTE(review): `space` is re-obtained here but `pos` is not re-seated
+ // from it -- confirm obtain_contiguous_space(0) keeps `pos` valid.
+ space = bl.obtain_contiguous_space(0);
+ out_of_band_offset += plen;
+ }
+ }
+ void append(const bufferlist& l) {
+ if (deep) {
+ for (const auto &p : l._buffers) {
+ append(p.c_str(), p.length());
+ }
+ } else {
+ flush_and_continue();
+ bl.append(l);
+ space = bl.obtain_contiguous_space(0);
+ out_of_band_offset += l.length();
+ }
+ }
+
+ size_t get_logical_offset() const {
+ return out_of_band_offset + (pos - space.bp_data);
+ }
+ };
+
+ contiguous_appender get_contiguous_appender(size_t len, bool deep=false) {
+ return contiguous_appender(*this, len, deep);
+ }
+
+ /// Thin write cursor over a char buffer. Its only state is the current
+ /// position; no bounds are tracked or checked. Only buffer::list can
+ /// construct one (private constructor + friend).
+ class contiguous_filler {
+ friend buffer::list;
+ char* pos;
+
+ contiguous_filler(char* const pos) : pos(pos) {}
+
+ public:
+ /// Skip len bytes without writing.
+ void advance(const unsigned len) {
+ pos += len;
+ }
+ /// Copy len bytes from src to the current position and advance past them.
+ void copy_in(const unsigned len, const char* const src) {
+ memcpy(pos, src, len);
+ advance(len);
+ }
+ /// Current write position.
+ char* c_str() {
+ return pos;
+ }
+ };
+ // The contiguous_filler is supposed to be not costlier than a single
+ // pointer. Keep it dumb, please.
+ static_assert(sizeof(contiguous_filler) == sizeof(char*),
+ "contiguous_filler should be no costlier than pointer");
+
+ class page_aligned_appender {
+ bufferlist& bl;
+ unsigned min_alloc;
+
+ page_aligned_appender(list *l, unsigned min_pages)
+ : bl(*l),
+ min_alloc(min_pages * CEPH_PAGE_SIZE) {
+ }
+
+ void _refill(size_t len);
+
+ /// Split an append of len bytes into at most two calls of impl_f: one for
+ /// whatever fits into the unused tail of the current append buffer, and
+ /// one -- after _refill() -- for the remainder.
+ template <class Func>
+ void _append_common(size_t len, Func&& impl_f) {
+   const size_t tail_room = bl.get_append_buffer_unused_tail_length();
+   const size_t head_chunk = std::min(len, tail_room);
+   if (head_chunk != 0) {
+     impl_f(head_chunk);
+   }
+   const size_t rest = len - head_chunk;
+   if (rest == 0) {
+     return;
+   }
+   _refill(rest);
+   impl_f(rest);
+ }
+
+ friend class list;
+
+ public:
+ void append(const bufferlist& l) {
+ bl.append(l);
+ bl.obtain_contiguous_space(0);
+ }
+
+ void append(const char* buf, size_t entire_len) {
+ _append_common(entire_len,
+ [buf, this] (const size_t chunk_len) mutable {
+ bl.append(buf, chunk_len);
+ buf += chunk_len;
+ });
+ }
+
+ void append_zero(size_t entire_len) {
+ _append_common(entire_len, [this] (const size_t chunk_len) {
+ bl.append_zero(chunk_len);
+ });
+ }
+
+ /// Append up to len bytes of bl, starting off bytes into it, by copying
+ /// them buffer by buffer through append(const char*, size_t).
+ /// Fewer than len bytes are appended when bl is exhausted first.
+ void substr_of(const list& bl, unsigned off, unsigned len) {
+   for (const auto& bptr : bl.buffers()) {
+     if (len == 0) {
+       // the requested range is satisfied; walking the remaining buffers
+       // would only issue zero-length appends
+       break;
+     }
+     if (off >= bptr.length()) {
+       // the start offset lies beyond this buffer: skip it entirely
+       off -= bptr.length();
+       continue;
+     }
+     const auto round_size = std::min(bptr.length() - off, len);
+     append(bptr.c_str() + off, round_size);
+     len -= round_size;
+     // every buffer after the first contributing one is read from its start
+     off = 0;
+   }
+ }
+ };
+
+ page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) {
+ return page_aligned_appender(this, min_pages);
+ }
+
+ private:
+ // always_empty_bptr has no underlying raw but its _len is always 0.
+ // This is useful for e.g. get_append_buffer_unused_tail_length() as
+ // it allows to avoid conditionals on hot paths.
+ static ptr_node always_empty_bptr;
+ ptr_node& refill_append_space(const unsigned len);
+
+ // for page_aligned_appender; never ever expose this publicly!
+ // carriage / append_buffer is just an implementation's detail.
+ ptr& get_append_buffer() {
+ return *_carriage;
+ }
+
+ public:
+ // cons/des
+ list()
+ : _carriage(&always_empty_bptr),
+ _len(0),
+ _num(0) {
+ }
+ /// Construct an empty list and immediately reserve() prealloc bytes.
+ /// Intentionally not explicit (hence the cppcheck suppression).
+ // cppcheck-suppress noExplicitConstructor
+ list(unsigned prealloc)
+ : _carriage(&always_empty_bptr),
+ _len(0),
+ _num(0) {
+ reserve(prealloc);
+ }
+
+ list(const list& other)
+ : _carriage(&always_empty_bptr),
+ _len(other._len),
+ _num(other._num) {
+ _buffers.clone_from(other._buffers);
+ }
+
+ list(list&& other) noexcept
+ : _buffers(std::move(other._buffers)),
+ _carriage(other._carriage),
+ _len(other._len),
+ _num(other._num) {
+ other.clear();
+ }
+
+ ~list() {
+ _buffers.clear_and_dispose();
+ }
+
+ list& operator= (const list& other) {
+ if (this != &other) {
+ _carriage = &always_empty_bptr;
+ _buffers.clone_from(other._buffers);
+ _len = other._len;
+ _num = other._num;
+ }
+ return *this;
+ }
+ /// Move-assign: take over other's buffers, carriage and counters, and
+ /// leave other empty. Self-assignment is a no-op; without the guard the
+ /// trailing other.clear() would wipe this very list.
+ list& operator= (list&& other) noexcept {
+   if (this != &other) {
+     _buffers = std::move(other._buffers);
+     _carriage = other._carriage;
+     _len = other._len;
+     _num = other._num;
+     other.clear();
+   }
+   return *this;
+ }
+
+ uint64_t get_wasted_space() const;
+ unsigned get_num_buffers() const { return _num; }
+ const ptr_node& front() const { return _buffers.front(); }
+ const ptr_node& back() const { return _buffers.back(); }
+
+ int get_mempool() const;
+ void reassign_to_mempool(int pool);
+ void try_assign_to_mempool(int pool);
+
+ size_t get_append_buffer_unused_tail_length() const {
+ return _carriage->unused_tail_length();
+ }
+
+ const buffers_t& buffers() const { return _buffers; }
+ buffers_t& mut_buffers() { return _buffers; }
+ void swap(list& other) noexcept;
+ unsigned length() const {
+#if 0
+ // DEBUG: verify _len
+ unsigned len = 0;
+ for (std::list<ptr>::const_iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++) {
+ len += (*it).length();
+ }
+#ifdef __CEPH__
+ ceph_assert(len == _len);
+#else
+ assert(len == _len);
+#endif // __CEPH__
+#endif
+ return _len;
+ }
+
+ bool contents_equal(const buffer::list& other) const;
+ bool contents_equal(const void* other, size_t length) const;
+
+ bool is_provided_buffer(const char *dst) const;
+ bool is_aligned(unsigned align) const;
+ bool is_page_aligned() const;
+ bool is_n_align_sized(unsigned align) const;
+ bool is_n_page_sized() const;
+ bool is_aligned_size_and_memory(unsigned align_size,
+ unsigned align_memory) const;
+
+ bool is_zero() const;
+
+ // modifiers
+ /// Dispose of all buffers and reset the list to its empty state.
+ void clear() noexcept {
+ // point the carriage at the shared empty ptr before the nodes go away
+ _carriage = &always_empty_bptr;
+ _buffers.clear_and_dispose();
+ _len = 0;
+ _num = 0;
+ }
+ /// Append a new node created from bp. Zero-length ptrs are ignored.
+ /// The carriage is left untouched by this overload.
+ void push_back(const ptr& bp) {
+   const unsigned added = bp.length();
+   if (added == 0) {
+     return;
+   }
+   _buffers.push_back(*ptr_node::create(bp).release());
+   _len += added;
+   _num += 1;
+ }
+ void push_back(ptr&& bp) {
+ if (bp.length() == 0)
+ return;
+ _len += bp.length();
+ _num += 1;
+ _buffers.push_back(*ptr_node::create(std::move(bp)).release());
+ _carriage = &always_empty_bptr;
+ }
+ void push_back(const ptr_node&) = delete;
+ void push_back(ptr_node&) = delete;
+ void push_back(ptr_node&&) = delete;
+ void push_back(std::unique_ptr<ptr_node, ptr_node::disposer> bp) {
+ _carriage = bp.get();
+ _len += bp->length();
+ _num += 1;
+ _buffers.push_back(*bp.release());
+ }
+ void push_back(raw* const r) = delete;
+ void push_back(ceph::unique_leakable_ptr<raw> r) {
+ _buffers.push_back(*ptr_node::create(std::move(r)).release());
+ _carriage = &_buffers.back();
+ _len += _buffers.back().length();
+ _num += 1;
+ }
+
+ void zero();
+ void zero(unsigned o, unsigned l);
+
+ bool is_contiguous() const;
+ void rebuild();
+ void rebuild(std::unique_ptr<ptr_node, ptr_node::disposer> nb);
+ bool rebuild_aligned(unsigned align);
+ // max_buffers = 0 means we don't care about _buffers.size(); any other
+ // value requires _buffers.size() <= max_buffers after rebuilding.
+ bool rebuild_aligned_size_and_memory(unsigned align_size,
+ unsigned align_memory,
+ unsigned max_buffers = 0);
+ bool rebuild_page_aligned();
+
+ void reserve(size_t prealloc);
+
+ [[deprecated("in favor of operator=(list&&)")]] void claim(list& bl) {
+ *this = std::move(bl);
+ }
+ void claim_append(list& bl);
+ void claim_append(list&& bl) {
+ claim_append(bl);
+ }
+
+ // copy with explicit volatile-sharing semantics
+ /// Make this list hold the same buffers as bl: the current contents are
+ /// dropped and a new node is created from each of bl's nodes.
+ /// Sharing a list with itself is a no-op.
+ void share(const list& bl)
+ {
+   if (this == &bl) {
+     return;
+   }
+   clear();
+   for (const auto& src_node : bl._buffers) {
+     _buffers.push_back(*ptr_node::create(src_node).release());
+   }
+   _len = bl._len;
+   _num = bl._num;
+ }
+
+#ifdef HAVE_SEASTAR
+ /// convert the bufferlist into a network packet
+ operator seastar::net::packet() &&;
+#endif
+
+ iterator begin(size_t offset=0) {
+ return iterator(this, offset);
+ }
+ iterator end() {
+ return iterator(this, _len, _buffers.end(), 0);
+ }
+
+ const_iterator begin(size_t offset=0) const {
+ return const_iterator(this, offset);
+ }
+ const_iterator cbegin(size_t offset=0) const {
+ return begin(offset);
+ }
+ const_iterator end() const {
+ return const_iterator(this, _len, _buffers.end(), 0);
+ }
+
+ void append(char c);
+ void append(const char *data, unsigned len);
+ void append(std::string s) {
+ append(s.data(), s.length());
+ }
+#if __cplusplus >= 201703L
+ // To forcibly disambiguate between string and string_view in the
+ // case of arrays
+ template<std::size_t N>
+ void append(const char (&s)[N]) {
+ append(s, N);
+ }
+ void append(const char* s) {
+ append(s, strlen(s));
+ }
+ void append(std::string_view s) {
+ append(s.data(), s.length());
+ }
+#endif // __cplusplus >= 201703L
+ void append(const ptr& bp);
+ void append(ptr&& bp);
+ void append(const ptr& bp, unsigned off, unsigned len);
+ void append(const list& bl);
+ /// append each non-empty line from the stream and add '\n',
+ /// so a '\n' will be added even if the stream does not end with EOL.
+ ///
+ /// For example, if the stream contains "ABC\n\nDEF", "ABC\nDEF\n" is
+ /// actually appended.
+ void append(std::istream& in);
+ contiguous_filler append_hole(unsigned len);
+ void append_zero(unsigned len);
+ void prepend_zero(unsigned len);
+
+ reserve_t obtain_contiguous_space(const unsigned len);
+
+ /*
+ * get a char
+ */
+ const char& operator[](unsigned n) const;
+ char *c_str();
+ std::string to_str() const;
+
+ void substr_of(const list& other, unsigned off, unsigned len);
+
+ // funky modifier
+ void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */);
+ void write(int off, int len, std::ostream& out) const;
+
+ void encode_base64(list& o);
+ void decode_base64(list& o);
+
+ void write_stream(std::ostream &out) const;
+ void hexdump(std::ostream &out, bool trailing_newline = true) const;
+ ssize_t pread_file(const char *fn, uint64_t off, uint64_t len, std::string *error);
+ int read_file(const char *fn, std::string *error);
+ ssize_t read_fd(int fd, size_t len);
+ ssize_t recv_fd(int fd, size_t len);
+ int write_file(const char *fn, int mode=0644);
+ int write_fd(int fd) const;
+ int write_fd(int fd, uint64_t offset) const;
+ int send_fd(int fd) const;
+ /// Resize *piov to one entry per buffer and point each entry's
+ /// iov_base/iov_len at the matching buffer. Asserts that the number of
+ /// buffers does not exceed IOV_MAX.
+ template<typename VectorT>
+ void prepare_iov(VectorT *piov) const {
+#ifdef __CEPH__
+   ceph_assert(_num <= IOV_MAX);
+#else
+   assert(_num <= IOV_MAX);
+#endif
+   piov->resize(_num);
+   unsigned idx = 0;
+   for (const auto& node : _buffers) {
+     auto& slot = (*piov)[idx++];
+     // iov_base is not const-qualified, so constness is cast away here
+     slot.iov_base = const_cast<char*>(node.c_str());
+     slot.iov_len = node.length();
+   }
+ }
+ uint32_t crc32c(uint32_t crc) const;
+ void invalidate_crc();
+
+ // These functions return a bufferlist with a pointer to a single
+ // static buffer. They /must/ not outlive the memory they
+ // reference.
+ static list static_from_mem(char* c, size_t l);
+ static list static_from_cstring(char* c);
+ static list static_from_string(std::string& s);
+ };
+
+} // inline namespace v15_2_0
+
+ /*
+ * efficient hash of one or more bufferlists
+ */
+
+ /// Running crc32c over one or more bufferlists.
+ class hash {
+   /// checksum accumulated so far (or the seed, before any update)
+   uint32_t crc;
+
+ public:
+   hash() : crc(0) { }
+   // cppcheck-suppress noExplicitConstructor
+   hash(uint32_t init) : crc(init) { }
+
+   /// Fold the contents of bl into the running checksum.
+   void update(const buffer::list& bl) {
+     crc = bl.crc32c(crc);
+   }
+
+   /// Current checksum value. Const, so it can be read through a const hash.
+   uint32_t digest() const {
+     return crc;
+   }
+ };
+
+/// Two bufferlists are equal when they have the same length and the same
+/// bytes, regardless of how the bytes are split across buffers.
+inline bool operator==(const bufferlist &lhs, const bufferlist &rhs) {
+  // the length check comes first so std::equal never runs past rhs
+  return lhs.length() == rhs.length() &&
+         std::equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+/// Lexicographic byte-wise "less than" (bytes are compared as char).
+/// A list that is a proper prefix of the other one compares as smaller.
+inline bool operator<(const bufferlist& lhs, const bufferlist& rhs) {
+  auto li = lhs.begin();
+  auto ri = rhs.begin();
+  while (li != lhs.end() && ri != rhs.end()) {
+    const char a = *li;
+    const char b = *ri;
+    if (a != b) {
+      return a < b;
+    }
+    ++li;
+    ++ri;
+  }
+  // common prefix exhausted: smaller only if lhs ended and rhs did not
+  return (li == lhs.end()) && (ri != rhs.end());
+}
+
+/// Lexicographic byte-wise "less than or equal" (bytes are compared as
+/// char). A list that is a prefix of the other one, or equal to it,
+/// compares as smaller-or-equal.
+inline bool operator<=(const bufferlist& lhs, const bufferlist& rhs) {
+  auto li = lhs.begin();
+  auto ri = rhs.begin();
+  while (li != lhs.end() && ri != rhs.end()) {
+    const char a = *li;
+    const char b = *ri;
+    if (a != b) {
+      return a < b;
+    }
+    ++li;
+    ++ri;
+  }
+  // common prefix exhausted: true exactly when lhs has no bytes left
+  return li == lhs.end();
+}
+
+inline bool operator!=(const bufferlist &l, const bufferlist &r) {
+ return !(l == r);
+}
+inline bool operator>(const bufferlist& lhs, const bufferlist& rhs) {
+ return rhs < lhs;
+}
+inline bool operator>=(const bufferlist& lhs, const bufferlist& rhs) {
+ return rhs <= lhs;
+}
+
+std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
+
+std::ostream& operator<<(std::ostream& out, const buffer::raw &r);
+
+std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
+
+inline bufferhash& operator<<(bufferhash& l, const bufferlist &r) {
+ l.update(r);
+ return l;
+}
+
+} // namespace buffer
+
+} // namespace ceph
+
+
+#endif
diff --git a/src/include/buffer_fwd.h b/src/include/buffer_fwd.h
new file mode 100644
index 000000000..6de7b1a1f
--- /dev/null
+++ b/src/include/buffer_fwd.h
@@ -0,0 +1,19 @@
+#ifndef BUFFER_FWD_H
+#define BUFFER_FWD_H
+
+// Forward declarations of the buffer types plus their short aliases, for
+// code that only needs the names and not the full definitions.
+namespace ceph {
+ namespace buffer {
+ // ptr and list live in the inline namespace v15_2_0, so they are also
+ // reachable as buffer::ptr / buffer::list.
+ inline namespace v15_2_0 {
+ class ptr;
+ class list;
+ }
+ // hash is declared directly in ceph::buffer, outside the inline namespace.
+ class hash;
+ }
+
+ using bufferptr = buffer::ptr;
+ using bufferlist = buffer::list;
+ using bufferhash = buffer::hash;
+}
+
+#endif
+
diff --git a/src/include/buffer_raw.h b/src/include/buffer_raw.h
new file mode 100644
index 000000000..890fb04d5
--- /dev/null
+++ b/src/include/buffer_raw.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BUFFER_RAW_H
+#define CEPH_BUFFER_RAW_H
+
+#include <map>
+#include <utility>
+#include <type_traits>
+#include "common/ceph_atomic.h"
+#include "include/buffer.h"
+#include "include/mempool.h"
+#include "include/spinlock.h"
+
+namespace ceph::buffer {
+inline namespace v15_2_0 {
+
+ // Abstract base describing one block of buffer memory: a data pointer, a
+ // length, a reference counter, mempool accounting and a one-entry CRC cache.
+ // Subclasses must implement clone_empty(); copying is disabled.
+ class raw {
+ public:
+ // In the future we might want to have a slab allocator here with few
+ // embedded slots. This would allow to avoid the "if" in dtor of ptr_node.
+ // Uninitialized storage sized and aligned to hold exactly one ptr_node.
+ std::aligned_storage<sizeof(ptr_node),
+ alignof(ptr_node)>::type bptr_storage;
+ protected:
+ char *data;
+ unsigned len;
+ public:
+ ceph::atomic<unsigned> nref { 0 };
+ // index of the mempool this buffer's count and bytes are charged to
+ int mempool;
+
+ // Cached CRC: the (from, to) range it was computed over and the value pair
+ // stored by set_crc().  Both offsets at size_t max mean "nothing cached".
+ std::pair<size_t, size_t> last_crc_offset {std::numeric_limits<size_t>::max(), std::numeric_limits<size_t>::max()};
+ std::pair<uint32_t, uint32_t> last_crc_val;
+
+ // guards last_crc_offset / last_crc_val; mutable so get_crc() can be const
+ mutable ceph::spinlock crc_spinlock;
+
+ // Construct with a length only; data starts out null.  Charges one item
+ // and len bytes to the chosen mempool.
+ explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon)
+ : data(nullptr), len(l), nref(0), mempool(mempool) {
+ mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
+ }
+ // Construct around an existing pointer c of length l; same mempool charge.
+ raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon)
+ : data(c), len(l), nref(0), mempool(mempool) {
+ mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
+ }
+ // Undo the mempool charge made at construction (or by a later reassign).
+ virtual ~raw() {
+ mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+ -1, -(int)len);
+ }
+
+ // Change the recorded length, re-charging the mempool: the old length is
+ // subtracted before len is updated and the new length added afterwards.
+ void _set_len(unsigned l) {
+ mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+ -1, -(int)len);
+ len = l;
+ mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
+ }
+
+ // Move this buffer's accounting from its current mempool to `pool`.
+ // Does nothing when it is already charged to `pool`.
+ void reassign_to_mempool(int pool) {
+ if (pool == mempool) {
+ return;
+ }
+ mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+ -1, -(int)len);
+ mempool = pool;
+ mempool::get_pool(mempool::pool_index_t(pool)).adjust_count(1, len);
+ }
+
+ // Reassign to `pool` only if the buffer is still in the default
+ // mempool_buffer_anon pool; otherwise leave it where it is.
+ void try_assign_to_mempool(int pool) {
+ if (mempool == mempool::mempool_buffer_anon) {
+ reassign_to_mempool(pool);
+ }
+ }
+
+private:
+ // no copying.
+ // cppcheck-suppress noExplicitConstructor
+ raw(const raw &other) = delete;
+ const raw& operator=(const raw &other) = delete;
+public:
+ char *get_data() const {
+ return data;
+ }
+ unsigned get_len() const {
+ return len;
+ }
+ // Subclass hook: return a new raw whose contents clone() will fill in.
+ // NOTE(review): clone() writes len bytes into the result, so the returned
+ // object presumably must provide at least len bytes -- confirm in subclasses.
+ virtual raw* clone_empty() = 0;
+ // Copy this buffer: obtain an empty clone and memcpy len bytes into it.
+ ceph::unique_leakable_ptr<raw> clone() {
+ raw* const c = clone_empty();
+ memcpy(c->data, data, len);
+ return ceph::unique_leakable_ptr<raw>(c);
+ }
+ // Look up the cached CRC.  Returns true and fills *crc only when the cached
+ // range equals `fromto`; otherwise returns false and leaves *crc untouched.
+ bool get_crc(const std::pair<size_t, size_t> &fromto,
+ std::pair<uint32_t, uint32_t> *crc) const {
+ std::lock_guard lg(crc_spinlock);
+ if (last_crc_offset == fromto) {
+ *crc = last_crc_val;
+ return true;
+ }
+ return false;
+ }
+ // Replace the single cached CRC entry with (fromto, crc).
+ void set_crc(const std::pair<size_t, size_t> &fromto,
+ const std::pair<uint32_t, uint32_t> &crc) {
+ std::lock_guard lg(crc_spinlock);
+ last_crc_offset = fromto;
+ last_crc_val = crc;
+ }
+ // Drop the cached CRC by resetting the range to the "nothing cached" value.
+ void invalidate_crc() {
+ std::lock_guard lg(crc_spinlock);
+ last_crc_offset.first = std::numeric_limits<size_t>::max();
+ last_crc_offset.second = std::numeric_limits<size_t>::max();
+ }
+ };
+
+} // inline namespace v15_2_0
+} // namespace ceph::buffer
+
+#endif // CEPH_BUFFER_RAW_H
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
new file mode 100644
index 000000000..4062c2d4c
--- /dev/null
+++ b/src/include/byteorder.h
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <type_traits>
+#include "acconfig.h"
+#include "int_types.h"
+
+
+#ifdef __GNUC__
+// Byte-swap overloads backed by the compiler builtins; the overload is
+// selected by sizeof(T) (2, 4 or 8 bytes).
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
+  const auto swapped = __builtin_bswap16(val);
+  return swapped;
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
+  const auto swapped = __builtin_bswap32(val);
+  return swapped;
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
+  const auto swapped = __builtin_bswap64(val);
+  return swapped;
+}
+#else
+// Portable byte-swap overloads for compilers without the GCC builtins; the
+// overload is selected by sizeof(T) (2, 4 or 8 bytes).
+//
+// All shifting is done on the unsigned counterpart of T.  Shifting a negative
+// signed value right sign-extends (and shifting it left is undefined before
+// C++20), which would smear 0xff bytes over the result for signed T such as
+// the types behind ceph_les16/32/64.
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
+  using U = typename std::make_unsigned<T>::type;
+  // widen to unsigned so the integer promotion cannot produce a negative int
+  const unsigned v = static_cast<U>(val);
+  return static_cast<T>(static_cast<U>((v >> 8) | (v << 8)));
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
+  using U = typename std::make_unsigned<T>::type;
+  const U v = static_cast<U>(val);
+  return static_cast<T>(( v >> 24) |
+                        ((v >> 8) & 0xff00u) |
+                        ((v << 8) & 0xff0000u) |
+                        ((v << 24)));
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
+  using U = typename std::make_unsigned<T>::type;
+  const U v = static_cast<U>(val);
+  return static_cast<T>(( v >> 56) |
+                        ((v >> 40) & 0xff00ull) |
+                        ((v >> 24) & 0xff0000ull) |
+                        ((v >> 8) & 0xff000000ull) |
+                        ((v << 8) & 0xff00000000ull) |
+                        ((v << 24) & 0xff0000000000ull) |
+                        ((v << 40) & 0xff000000000000ull) |
+                        ((v << 56)));
+}
+#endif
+
+// mswab == maybe swab (if not LE)
+// Returns val byte-swapped when CEPH_BIG_ENDIAN is defined, and val unchanged
+// otherwise.
+template<typename T>
+inline T mswab(T val) {
+#ifdef CEPH_BIG_ENDIAN
+  return swab(val);
+#else
+  return val;
+#endif
+}
+
+// Packed wrapper around a T that is stored through mswab(): assignment
+// converts on the way in, the conversion operator converts on the way out.
+template<typename T>
+struct ceph_le {
+private:
+  T v;  // stored representation, i.e. mswab() of the logical value
+public:
+  // store nv after passing it through mswab()
+  ceph_le& operator=(T nv) {
+    this->v = mswab(nv);
+    return *this;
+  }
+  // read the value back through mswab()
+  operator T() const {
+    return mswab(this->v);
+  }
+  // compares the stored representations directly, without converting
+  friend inline bool operator==(ceph_le a, ceph_le b) {
+    return a.v == b.v;
+  }
+} __attribute__ ((packed));
+
+// ceph_le instantiations for the unsigned 64/32/16-bit integer types.
+using ceph_le64 = ceph_le<__u64>;
+using ceph_le32 = ceph_le<__u32>;
+using ceph_le16 = ceph_le<__u16>;
+
+// ceph_le instantiations for the signed 64/32/16-bit integer types.
+using ceph_les64 = ceph_le<__s64>;
+using ceph_les32 = ceph_le<__s32>;
+using ceph_les16 = ceph_le<__s16>;
+
+// Helpers that build an unsigned ceph_le value from a plain integer.  ceph_le
+// declares no converting constructor, so each one default-constructs and then
+// assigns.
+inline ceph_le64 init_le64(__u64 x) {
+  ceph_le64 le;
+  le = x;
+  return le;
+}
+inline ceph_le32 init_le32(__u32 x) {
+  ceph_le32 le;
+  le = x;
+  return le;
+}
+inline ceph_le16 init_le16(__u16 x) {
+  ceph_le16 le;
+  le = x;
+  return le;
+}
+
+// Signed counterparts of init_le64/32/16: default-construct the wrapper and
+// assign the plain integer to it.
+inline ceph_les64 init_les64(__s64 x) {
+  ceph_les64 le;
+  le = x;
+  return le;
+}
+inline ceph_les32 init_les32(__s32 x) {
+  ceph_les32 le;
+  le = x;
+  return le;
+}
+inline ceph_les16 init_les16(__s16 x) {
+  ceph_les16 le;
+  le = x;
+  return le;
+}
diff --git a/src/include/ceph_assert.h b/src/include/ceph_assert.h
new file mode 100644
index 000000000..0627894ea
--- /dev/null
+++ b/src/include/ceph_assert.h
@@ -0,0 +1,147 @@
+#ifndef CEPH_ASSERT_H
+#define CEPH_ASSERT_H
+
+#include <cstdlib>
+#include <string>
+
+#ifndef __STRING
+# define __STRING(x) #x
+#endif
+
+#if defined(__linux__)
+#include <features.h>
+
+#elif defined(__FreeBSD__)
+#include <sys/cdefs.h>
+#define __GNUC_PREREQ(minor, major) __GNUC_PREREQ__(minor, major)
+#elif defined(__sun) || defined(_AIX)
+#include "include/compat.h"
+#include <assert.h>
+#endif
+
+#ifdef __CEPH__
+# include "acconfig.h"
+#endif
+
+#include "include/common_fwd.h"
+
+namespace ceph {
+
+struct BackTrace;
+
+/*
+ * Select a function-name variable based on compiler tests, and any compiler
+ * specific overrides.
+ */
+#if defined(HAVE_PRETTY_FUNC)
+# define __CEPH_ASSERT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(HAVE_FUNC)
+# define __CEPH_ASSERT_FUNCTION __func__
+#else
+# define __CEPH_ASSERT_FUNCTION ((__const char *) 0)
+#endif
+
+extern void register_assert_context(CephContext *cct);
+
+// Describes one assertion site.  ceph_assert() fills a static instance with
+// the stringified expression, __FILE__, __LINE__ and the function name, and
+// passes it to the assert_data overload of __ceph_assert_fail().
+struct assert_data {
+ const char *assertion;
+ const char *file;
+ const int line;
+ const char *function;
+};
+
+// Failure handlers used by the assertion macros in this header.  Every one
+// except __ceph_assert_warn is declared noreturn.
+extern void __ceph_assert_fail(const char *assertion, const char *file, int line, const char *function)
+ __attribute__ ((__noreturn__));
+extern void __ceph_assert_fail(const assert_data &ctx)
+ __attribute__ ((__noreturn__));
+
+// printf-style variant: takes a format string and arguments after the location.
+extern void __ceph_assertf_fail(const char *assertion, const char *file, int line, const char *function, const char* msg, ...)
+ __attribute__ ((__noreturn__));
+// Not declared noreturn, unlike the handlers above.
+extern void __ceph_assert_warn(const char *assertion, const char *file, int line, const char *function);
+
+// Handlers behind the ceph_abort*() macros.
+[[noreturn]] void __ceph_abort(const char *file, int line, const char *func,
+ const std::string& msg);
+
+[[noreturn]] void __ceph_abortf(const char *file, int line, const char *func,
+ const char* msg, ...);
+
+#define _CEPH_ASSERT_VOID_CAST static_cast<void>
+
+// Evaluates expr; when it is false, calls __ceph_assert_warn() with the
+// stringified expression and the source location.  That handler is not
+// declared noreturn.
+#define assert_warn(expr) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_warn (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
+
+}
+
+using namespace ceph;
+
+
+/*
+ * ceph_abort aborts the program with a nice backtrace.
+ *
+ * Currently, it's the same as assert(0), but we may one day make assert a
+ * debug-only thing, like it is in many projects.
+ */
+// NOTE: any arguments given to ceph_abort() are accepted but not used; the
+// reported message is always "abort() called".  Use ceph_abort_msg() or
+// ceph_abort_msgf() to supply a message.
+#define ceph_abort(msg, ...) \
+ ::ceph::__ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, "abort() called")
+
+// Abort with the given message.
+#define ceph_abort_msg(msg) \
+ ::ceph::__ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, msg)
+
+// Abort with a printf-style format string and arguments.
+#define ceph_abort_msgf(...) \
+ ::ceph::__ceph_abortf( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)
+
+// ceph_assert(expr): calls __ceph_assert_fail() when expr is false.
+// When __SANITIZE_ADDRESS__ is defined the failure details are passed as
+// separate arguments; otherwise they are stored in a function-local static
+// assert_data and passed by reference.
+#ifdef __SANITIZE_ADDRESS__
+#define ceph_assert(expr) \
+ do { \
+ ((expr)) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
+ } while (false)
+#else
+#define ceph_assert(expr) \
+ do { static const ceph::assert_data assert_data_ctx = \
+ {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_fail(assert_data_ctx)); } while(false)
+#endif
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assert currently doesn't either, but in the future it might.)
+// Currently expands to the same checks as ceph_assert: direct arguments when
+// __SANITIZE_ADDRESS__ is defined, a static assert_data otherwise.
+#ifdef __SANITIZE_ADDRESS__
+#define ceph_assert_always(expr) \
+ do { \
+ ((expr)) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
+ } while(false)
+#else
+#define ceph_assert_always(expr) \
+ do { static const ceph::assert_data assert_data_ctx = \
+ {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_fail(assert_data_ctx)); } while(false)
+#endif
+
+// Named by analogy with printf. Along with an expression, takes a format
+// string and parameters which are printed if the assertion fails.
+// assertf and ceph_assertf have identical expansions.  Unlike ceph_assert,
+// both expand to a single conditional expression, not a do/while statement.
+#define assertf(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+#define ceph_assertf(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assertf currently doesn't either, but in the future it might.)
+// Currently has the same expansion as ceph_assertf.
+#define ceph_assertf_always(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+#endif
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
new file mode 100644
index 000000000..39a32027c
--- /dev/null
+++ b/src/include/ceph_features.h
@@ -0,0 +1,279 @@
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+#include "sys/types.h"
+
+/*
+ * Each time we reclaim bits for reuse we need to specify another
+ * bitmask that, if all bits are set, indicates we have the new
+ * incarnation of that feature. Base case is 1 (first use)
+ */
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
+
+// Defines two constants for feature `name`:
+//   CEPH_FEATURE_<name>     - the single bit (1 << bit)
+//   CEPH_FEATUREMASK_<name> - that bit OR'ed with the marker bits of the given
+//                             incarnation; this is what HAVE_FEATURE() tests
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
+ const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored but still advertised by release *when*
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+ const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored by release *unused* and not advertised by
+// release *unadvertised*
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+// test for a feature. this test is safer than a typical mask against
+// the bit because it ensures that we have the bit AND the marker for the
+// bit's incarnation. this must be used in any case where the features
+// bits may include an old meaning of the bit.
+#define HAVE_FEATURE(x, name) \
+ (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ * - In the first phase we indicate that a feature is DEPRECATED as of
+ * a particular release. This is the first major release X (say,
+ * mimic) that does not depend on its peers advertising the feature.
+ * That is, it safely assumes its peers all have the feature. We
+ * indicate this with the DEPRECATED macro. For example,
+ *
+ * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MON_METADATA, MIMIC)
+ *
+ * because 13.2.z (mimic) did not care if its peers advertised this
+ * feature bit.
+ *
+ * - In the second phase we stop advertising the bit and call it
+ * RETIRED. This can normally be done 2 major releases
+ * following the one in which we marked the feature DEPRECATED. In
+ * the above example, for 15.0.z (octopus) we can say:
+ *
+ * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MON_METADATA, MIMIC, OCTOPUS)
+ *
+ * - The bit can be reused in the next release that will never talk to
+ * a pre-octopus daemon (13 mimic or 14 nautilus) that advertises the
+ * bit: in this case, the 16.y.z (P-release).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
+ */
+
+/*
+ * Notes on the kernel client:
+ *
+ * - "X" means that the feature bit has been advertised and supported
+ * since kernel X
+ *
+ * - "X req" means that the feature bit has been advertised and required
+ * since kernel X
+ *
+ * The remaining feature bits are not and have never been used by the
+ * kernel client.
+ */
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) // 2.6.35 req
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK) // 2.6.36
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) // 4.6 req
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) // 3.10 req
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) // 2.6.38
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64) // 3.9 req
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) // 3.9 req
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC) // 3.9 req
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(16, 3, SERVER_OCTOPUS)
+DEFINE_CEPH_FEATURE(16, 3, OSD_REPOP_MLCOD)
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(17, 3, OS_PERF_STAT_NS)
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) // 3.6
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT)
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(20, 3, SERVER_PACIFIC)
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) // 4.13
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(22, 2, OSD_FIXED_COLLECTION_LIST)
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) // 3.19 req (unless nocephx_require_signatures)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(24, 2, RECOVERY_RESERVATION_2)
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) // 3.9
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) // 3.9
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC) // 4.7
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) // 3.9
+DEFINE_CEPH_FEATURE_DEPRECATED(31, 1, MON_SINGLE_PAXOS, NAUTILUS)
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(32, 3, STRETCH_MODE)
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(34, 3, RANGE_BLOCKLIST)
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) // 3.14
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) // 3.14
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) // 3.14
+DEFINE_CEPH_FEATURE_RETIRED(38, 1, OSD_ERASURE_CODES, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) // 3.15
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) // 3.19
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) // 3.15
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) // 4.3 (for consistency)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) // 4.13
+DEFINE_CEPH_FEATURE_RETIRED(44, 1, ERASURE_CODE_PLUGINS_V2, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+// available
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) // 4.17
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) // 4.1
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+// available
+DEFINE_CEPH_FEATURE_RETIRED(50, 1, MON_METADATA, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(51, 1, OSD_BITWISE_HOBJ_SORT, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(53, 1, ERASURE_CODE_PLUGINS_V3, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(54, 1, OSD_HITSET_GMT, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(55, 1, HAMMER_0_94_4, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) // 4.13 (for pg_pool_t >= v25)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) // 4.13
+DEFINE_CEPH_FEATURE_RETIRED(57, 1, MON_ROUTE_OSDMAP, MIMIC, OCTOPUS) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) // 4.5
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // 4.19, *do not share this bit*
+
+DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinel
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
+
+/*
+ * Features supported. Should be everything above.
+ */
+#define CEPH_FEATURES_ALL \
+ (CEPH_FEATURE_UID | \
+ CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_FLOCK | \
+ CEPH_FEATURE_SUBSCRIBE2 | \
+ CEPH_FEATURE_MONNAMES | \
+ CEPH_FEATURE_RECONNECT_SEQ | \
+ CEPH_FEATURE_DIRLAYOUTHASH | \
+ CEPH_FEATURE_OBJECTLOCATOR | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_INCSUBOSDMAP | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDREPLYMUX | \
+ CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_MONENC | \
+ CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_MSG_AUTH | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_CREATEPOOLID | \
+ CEPH_FEATURE_REPLY_CREATE_INODE | \
+ CEPH_FEATURE_MDSENC | \
+ CEPH_FEATURE_OSDHASHPSPOOL | \
+ CEPH_FEATURE_NEW_OSDOP_ENCODING | \
+ CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \
+ DEPRECATED_CEPH_FEATURE_MON_SINGLE_PAXOS | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_EXPORT_PEER | \
+ CEPH_FEATURE_OSDMAP_ENC | \
+ CEPH_FEATURE_MDS_INLINE_DATA | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
+ CEPH_FEATURE_MSGR_KEEPALIVE2 | \
+ CEPH_FEATURE_OSD_POOLRESEND | \
+ CEPH_FEATURE_OSD_FADVISE_FLAGS | \
+ CEPH_FEATURE_MDS_QUOTA | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATURE_MON_STATEFUL_SUB | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_SERVER_JEWEL | \
+ CEPH_FEATURE_FS_FILE_LAYOUT_V2 | \
+ CEPH_FEATURE_SERVER_KRAKEN | \
+ CEPH_FEATURE_FS_BTIME | \
+ CEPH_FEATURE_FS_CHANGE_ATTR | \
+ CEPH_FEATURE_MSG_ADDR2 | \
+ CEPH_FEATURE_SERVER_LUMINOUS | \
+ CEPH_FEATURE_RESEND_ON_SPLIT | \
+ CEPH_FEATURE_RADOS_BACKOFF | \
+ CEPH_FEATURE_OSD_RECOVERY_DELETES | \
+ CEPH_FEATURE_SERVER_MIMIC | \
+ CEPH_FEATURE_RECOVERY_RESERVATION_2 | \
+ CEPH_FEATURE_SERVER_NAUTILUS | \
+ CEPH_FEATURE_CEPHX_V2 | \
+ CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \
+ CEPH_FEATUREMASK_SERVER_OCTOPUS | \
+ CEPH_FEATUREMASK_STRETCH_MODE | \
+ CEPH_FEATUREMASK_OSD_REPOP_MLCOD | \
+ CEPH_FEATUREMASK_SERVER_PACIFIC | \
+ CEPH_FEATURE_OSD_FIXED_COLLECTION_LIST | \
+ CEPH_FEATURE_RANGE_BLOCKLIST | \
+ 0ULL)
+
+#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
+
+/*
+ * crush related features
+ */
+#define CEPH_FEATURES_CRUSH \
+ (CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS)
+
+/*
+ * make sure we don't try to use the reserved features
+ */
+/*
+ * Compile-time check: a false condition yields an array type of negative
+ * size, which does not compile.  The passing case uses size 1, not 0,
+ * because zero-length arrays are only a compiler extension.
+ */
+#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 1]))
+
+/*
+ * Exists only to host the compile-time check below: the build fails if
+ * CEPH_FEATURES_ALL contains either reserved bit.
+ */
+static inline void ____build_time_check_for_reserved_bits(void) {
+ CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL &
+ (CEPH_FEATURE_RESERVED |
+ DEPRECATED_CEPH_FEATURE_RESERVED_BROKEN)) == 0);
+}
+
+#endif
diff --git a/src/include/ceph_frag.h b/src/include/ceph_frag.h
new file mode 100644
index 000000000..5babb8e95
--- /dev/null
+++ b/src/include/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+/* Encode a frag: b in the upper 8 bits, the top b bits of the 24-bit v below. */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+  /* keeps only the b most significant bits of the 24-bit value field */
+  const __u32 keep = (0xffffffu << (24-b)) & 0xffffffu;
+
+  return (b << 24) | (v & keep);
+}
+/* The "bits" field: the upper 8 bits of the encoded frag. */
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+  const __u32 bits = f >> 24;
+
+  return bits;
+}
+/* The "value" field: the lower 24 bits of the encoded frag. */
+static inline __u32 ceph_frag_value(__u32 f)
+{
+  const __u32 value = f & 0xffffffu;
+
+  return value;
+}
+/* 24-bit mask selecting the significant (top "bits") bits of a value. */
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+  const __u32 shift = 24 - ceph_frag_bits(f);
+
+  return (0xffffffu << shift) & 0xffffffu;
+}
+/* Number of low value bits that are not significant for this frag. */
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+
+  return 24 - bits;
+}
+
+/* Nonzero when v, masked to the frag's significant bits, equals its value. */
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+  const __u32 masked = v & ceph_frag_mask(f);
+
+  return masked == ceph_frag_value(f);
+}
+/* Nonzero when sub is at least as specific as f and lies inside f. */
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+  /* a frag with fewer bits than f cannot be contained by f */
+  if (ceph_frag_bits(sub) < ceph_frag_bits(f))
+    return 0;
+
+  return (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+/* Frag with one bit fewer, value masked with f's mask shifted left by one. */
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+  const __u32 parent_bits = ceph_frag_bits(f) - 1;
+  const __u32 parent_value = ceph_frag_value(f) & (ceph_frag_mask(f) << 1);
+
+  return ceph_frag_make(parent_bits, parent_value);
+}
+/* Nonzero when f has at least one bit and its lowest significant bit is clear. */
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+  /* a zero-bit frag is not a child of anything */
+  if (ceph_frag_bits(f) == 0)
+    return 0;
+
+  return (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+/*
+ * Nonzero when f has at least one bit and its lowest significant bit is set,
+ * i.e. the complement of ceph_frag_is_left_child() for non-root frags.
+ *
+ * The masked result is either 0 or the bit itself (1 << (24 - bits)), so it
+ * must be tested against 0; comparing it with 1 only matched 24-bit frags.
+ */
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+  return ceph_frag_bits(f) > 0 &&
+    (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
+}
+/* Same number of bits as f, with the lowest significant value bit flipped. */
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+
+  return ceph_frag_make(bits, ceph_frag_value(f) ^ (0x1000000 >> bits));
+}
+/* One more bit than f, same value (the newly significant bit stays clear). */
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+  const __u32 child_bits = ceph_frag_bits(f) + 1;
+
+  return ceph_frag_make(child_bits, ceph_frag_value(f));
+}
+/* One more bit than f, with the newly significant value bit set. */
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+
+  return ceph_frag_make(bits + 1,
+                        ceph_frag_value(f) | (0x1000000 >> (1 + bits)));
+}
+/* Child of f with `by` more bits; i is OR'ed in just below the new bit count. */
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+  int newbits = ceph_frag_bits(f) + by;
+  const int shifted = i << (24 - newbits);
+
+  return ceph_frag_make(newbits, ceph_frag_value(f) | shifted);
+}
+/* Nonzero when the frag's value field is zero. */
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+  const __u32 value = ceph_frag_value(f);
+
+  return value == 0;
+}
+/* Nonzero when every significant bit of the value is set (value == mask). */
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+  const __u32 value = ceph_frag_value(f);
+
+  return value == ceph_frag_mask(f);
+}
+/* Same bit count as f, value advanced by one unit of its lowest significant bit. */
+static inline __u32 ceph_frag_next(__u32 f)
+{
+  const __u32 bits = ceph_frag_bits(f);
+
+  return ceph_frag_make(bits, ceph_frag_value(f) + (0x1000000 >> bits));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
new file mode 100644
index 000000000..491931a8b
--- /dev/null
+++ b/src/include/ceph_fs.h
@@ -0,0 +1,1007 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL-2.1 or LGPL-3.0
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * The data structures defined here are shared between Linux kernel and
+ * user space. Also, those data structures are maintained always in
+ * little-endian byte order, even on big-endian systems. This is handled
+ * differently in kernel vs. user space. For use as kernel headers, the
+ * little-endian fields need to use the __le16/__le32/__le64 types. These
+ * are markers that indicate endian conversion routines must be used
+ * whenever such fields are accessed, which can be verified by checker
+ * tools like "sparse". For use as user-space headers, the little-endian
+ * fields instead use types ceph_le16/ceph_le32/ceph_le64, which are C++
+ * classes that implement automatic endian conversion on every access.
+ * To still allow for header sharing, this file uses the __le types, but
+ * redefines those to the ceph_ types when compiled in user space.
+ */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * Subprotocol versions. When specific message types or high-level
+ * protocols change, bump the affected components. We rev the
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSDC_PROTOCOL 24 /* server/client */
+#define CEPH_MDSC_PROTOCOL 32 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+/*
+ * hidden .ceph dir, which is no longer created but
+ * recognised in existing filesystems so that we
+ * don't try to fragment it.
+ */
+#define CEPH_INO_CEPH 2
+#define CEPH_INO_GLOBAL_SNAPREALM 3
+#define CEPH_INO_LOST_AND_FOUND 4 /* reserved ino for use in recovery */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+struct ceph_dir_layout {
+ __u8 dl_dir_hash; /* see ceph_hash.h for ids */
+ __u8 dl_unused1;
+ __u16 dl_unused2;
+ __u32 dl_unused3;
+} __attribute__ ((packed));
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN 0x0
+#define CEPH_CON_MODE_CRC 0x1
+#define CEPH_CON_MODE_SECURE 0x2
+
+extern const char *ceph_con_mode_name(int con_mode);
+
+/* For options with "_", like: GSS_GSS
+ which means: Mode/Protocol to validate "authentication_authorization",
+ where:
+ - Authentication: Verifying the identity of an entity.
+ - Authorization: Verifying that an authenticated entity has
+ the right to access a particular resource.
+*/
+#define CEPH_AUTH_GSS 0x4
+#define CEPH_AUTH_GSS_GSS CEPH_AUTH_GSS
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_MON_GET_OSDMAP 6
+#define CEPH_MSG_MON_METADATA 7
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+#define CEPH_MSG_MON_GET_VERSION 19
+#define CEPH_MSG_MON_GET_VERSION_REPLY 20
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_RECLAIM 27
+#define CEPH_MSG_CLIENT_RECLAIM_REPLY 28
+#define CEPH_MSG_CLIENT_METRICS 29
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+#define CEPH_MSG_CLIENT_QUOTA 0x314
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY 48
+#define CEPH_MSG_POOLOP 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+#define CEPH_MSG_WATCH_NOTIFY 44
+#define CEPH_MSG_OSD_BACKOFF 61
+
+/* FSMap subscribers (see all MDS clusters at once) */
+#define CEPH_MSG_FS_MAP 45
+/* FSMapUser subscribers (get MDS clusters name->ID mapping) */
+#define CEPH_MSG_FS_MAP_USER 103
+
+/* watch-notify operations */
+enum {
+ CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
+};
+
+const char *ceph_watch_event_name(int o);
+
+/* pool operations */
+enum {
+ POOL_OP_CREATE = 0x01,
+ POOL_OP_DELETE = 0x02,
+ POOL_OP_AUID_CHANGE = 0x03,
+ POOL_OP_CREATE_SNAP = 0x11,
+ POOL_OP_DELETE_SNAP = 0x12,
+ POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
+ POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
+};
+
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 pool;
+ __le32 op;
+ __le64 __old_auid; // obsolete
+ __le64 snapid;
+ __le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 reply_code;
+ __le32 epoch;
+ char has_data;
+ char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+ __le64 snapid;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
+
+struct ceph_mon_subscribe_item {
+ __le64 start;
+ __u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_NOT_JOINABLE (1<<0) /* standbys cannot join */
+#define CEPH_MDSMAP_DOWN (CEPH_MDSMAP_NOT_JOINABLE) /* backwards compat */
+#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */
+/* deprecated #define CEPH_MDSMAP_ALLOW_MULTIMDS (1<<2) cluster allowed to have >1 active MDS */
+/* deprecated #define CEPH_MDSMAP_ALLOW_DIRFRAGS (1<<3) cluster allowed to fragment directories */
+#define CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS (1<<4) /* cluster allowed to enable MULTIMDS
+ and SNAPS at the same time */
+#define CEPH_MDSMAP_ALLOW_STANDBY_REPLAY (1<<5) /* cluster allowed to enable standby-replay */
+
+#define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \
+ CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS)
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE -9 /* Legacy, unused */
+#define CEPH_MDS_STATE_NULL -10
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+#define CEPH_MDS_STATE_DAMAGED 15 /* rank not replayable, need repair */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DN (1 << 0)
+#define CEPH_LOCK_DVERSION (1 << 1)
+#define CEPH_LOCK_ISNAP (1 << 4) /* snapshot lock. MDS internal */
+#define CEPH_LOCK_IPOLICY (1 << 5) /* policy lock on dirs. MDS internal */
+#define CEPH_LOCK_IFILE (1 << 6)
+#define CEPH_LOCK_INEST (1 << 7) /* mds internal */
+#define CEPH_LOCK_IDFT (1 << 8) /* dir frag tree */
+#define CEPH_LOCK_IAUTH (1 << 9)
+#define CEPH_LOCK_ILINK (1 << 10)
+#define CEPH_LOCK_IXATTR (1 << 11)
+#define CEPH_LOCK_IFLOCK (1 << 12) /* advisory file locks */
+#define CEPH_LOCK_IVERSION (1 << 13) /* mds internal */
+
+#define CEPH_LOCK_IFIRST CEPH_LOCK_ISNAP
+
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN,
+ CEPH_SESSION_OPEN,
+ CEPH_SESSION_REQUEST_CLOSE,
+ CEPH_SESSION_CLOSE,
+ CEPH_SESSION_REQUEST_RENEWCAPS,
+ CEPH_SESSION_RENEWCAPS,
+ CEPH_SESSION_STALE,
+ CEPH_SESSION_RECALL_STATE,
+ CEPH_SESSION_FLUSHMSG,
+ CEPH_SESSION_FLUSHMSG_ACK,
+ CEPH_SESSION_FORCE_RO,
+ // A response to REQUEST_OPEN indicating that the client should
+ // permanently desist from contacting the MDS
+ CEPH_SESSION_REJECT,
+ CEPH_SESSION_REQUEST_FLUSH_MDLOG
+};
+
+// flags for state reclaim
+#define CEPH_RECLAIM_RESET 1
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op;
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ * & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+ CEPH_MDS_OP_LOOKUPINO = 0x00104,
+ CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+ CEPH_MDS_OP_GETVXATTR = 0x00106,
+ CEPH_MDS_OP_DUMMY = 0x00107,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+ CEPH_MDS_OP_SETFILELOCK= 0x01109,
+ CEPH_MDS_OP_GETFILELOCK= 0x00110,
+ CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x01301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+ CEPH_MDS_OP_RENAMESNAP = 0x01403,
+
+ // internal op
+ CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
+ CEPH_MDS_OP_EXPORTDIR = 0x01501,
+ CEPH_MDS_OP_FLUSH = 0x01502,
+ CEPH_MDS_OP_ENQUEUE_SCRUB = 0x01503,
+ CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504,
+ CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505,
+ CEPH_MDS_OP_RDLOCK_FRAGSSTATS = 0x01507
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+#ifndef CEPH_SETATTR_MODE
+#define CEPH_SETATTR_MODE (1 << 0)
+#define CEPH_SETATTR_UID (1 << 1)
+#define CEPH_SETATTR_GID (1 << 2)
+#define CEPH_SETATTR_MTIME (1 << 3)
+#define CEPH_SETATTR_ATIME (1 << 4)
+#define CEPH_SETATTR_SIZE (1 << 5)
+#define CEPH_SETATTR_CTIME (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME (1 << 9)
+#endif
+#define CEPH_SETATTR_KILL_SGUID (1 << 10)
+
+/*
+ * open request flags
+ */
+#define CEPH_O_RDONLY 00000000
+#define CEPH_O_WRONLY 00000001
+#define CEPH_O_RDWR 00000002
+#define CEPH_O_CREAT 00000100
+#define CEPH_O_EXCL 00000200
+#define CEPH_O_TRUNC 00001000
+#define CEPH_O_LAZY 00020000
+#define CEPH_O_DIRECTORY 00200000
+#define CEPH_O_NOFOLLOW 00400000
+
+int ceph_flags_sys2wire(int flags);
+
+/*
+ * Ceph setxattr request flags.
+ *
+ * NOTE(review): CEPH_XATTR_REMOVE shifts a signed int 1 into bit 31, which
+ * C leaves undefined when int is 32 bits wide. (1U << 31) would avoid
+ * that but changes the constant's type -- confirm against callers before
+ * changing it.
+ */
+#define CEPH_XATTR_CREATE (1 << 0)
+#define CEPH_XATTR_REPLACE (1 << 1)
+#define CEPH_XATTR_REMOVE (1 << 31)
+
+/*
+ * readdir request flags;
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
+#define CEPH_READDIR_HASH_ORDER (1<<9)
+#define CEPH_READDIR_OFFSET_HASH (1<<10)
+
+/* Note that this is embedded within ceph_mds_request_head_legacy. */
+union ceph_mds_request_args_legacy {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 pool; /* if >= 0 and CREATEPOOLID feature */
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size; /* if O_TRUNC */
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ __le32 osdmap_epoch; /* use for set file/dir layout */
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_ASYNC 4 /* request is async */
+
+struct ceph_mds_request_head_legacy {
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_legacy args;
+} __attribute__ ((packed));
+
+/*
+ * Note that this is embedded within ceph_mds_request_head. Also, compatibility
+ * with ceph_mds_request_args_legacy must be maintained!
+ */
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ struct ceph_timespec btime;
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 pool; /* if >= 0 and CREATEPOOLID feature */
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size; /* if O_TRUNC */
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ __le32 osdmap_epoch; /* use for set file/dir layout */
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 snapid;
+ __le64 parent;
+ __le32 hash;
+ } __attribute__ ((packed)) lookupino;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_REQUEST_HEAD_VERSION 1
+
+/*
+ * Note that any change to this structure must ensure that it is compatible
+ * with ceph_mds_request_head_legacy.
+ */
+struct ceph_mds_request_head {
+ __le16 version;
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+/*
+ * Fill the legacy-compatible part of *head from *legacy.
+ *
+ * The fields of ceph_mds_request_head starting at oldest_client_tid are
+ * viewed as a ceph_mds_request_head_legacy and overwritten with a struct
+ * copy of *legacy. head->version, which precedes that field, is not
+ * written.
+ */
+static inline void
+copy_from_legacy_head(struct ceph_mds_request_head *head,
+ struct ceph_mds_request_head_legacy *legacy)
+{
+ *(struct ceph_mds_request_head_legacy *)&head->oldest_client_tid = *legacy;
+}
+
+/*
+ * Extract the legacy-compatible part of *head into *legacy.
+ *
+ * The fields of ceph_mds_request_head starting at oldest_client_tid are
+ * viewed as a ceph_mds_request_head_legacy and struct-copied into
+ * *legacy. head->version is not read and *head is not modified.
+ */
+static inline void
+copy_to_legacy_head(struct ceph_mds_request_head_legacy *legacy,
+ struct ceph_mds_request_head *head)
+{
+ *legacy = *(struct ceph_mds_request_head_legacy *)&head->oldest_client_tid;
+}
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* ask client to release the cap */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */
+#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL 1
+#define CEPH_LOCK_FLOCK 2
+#define CEPH_LOCK_FCNTL_INTR 3
+#define CEPH_LOCK_FLOCK_INTR 4
+
+#define CEPH_LOCK_SHARED 1
+#define CEPH_LOCK_EXCL 2
+#define CEPH_LOCK_UNLOCK 4
+
+struct ceph_filelock {
+ __le64 start;/* file offset to start lock at */
+ __le64 length; /* num bytes to lock; 0 for all following start */
+ __le64 client; /* which client holds the lock */
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id holding the lock on the client */
+ __u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+/* inline data state */
+#define CEPH_INLINE_NONE ((__u64)-1)
+#define CEPH_INLINE_MAX_SIZE CEPH_MIN_STRIPE_UNIT
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+/* note: these definitions are duplicated in mds/locks.c */
+#define CEPH_CAP_GSHARED 1 /* client can read */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+#define CEPH_CAP_SIMPLE_BITS 2
+#define CEPH_CAP_FILE_BITS 8
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) ((x) << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+ CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+/* cap masks async dir operations */
+#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE
+#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD
+#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
+
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT, /* mds->client grant */
+ CEPH_CAP_OP_REVOKE, /* mds->client revoke */
+ CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
+ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
+ CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
+ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/* extra info for cap import/export */
+struct ceph_mds_cap_peer {
+ __le64 cap_id;
+ __le32 seq;
+ __le32 mseq;
+ __le32 mds;
+ __u8 flags;
+} __attribute__ ((packed));
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps_head {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+} __attribute__ ((packed));
+
+struct ceph_mds_caps_non_export_body {
+ /* all except export */
+ /* filelock */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout layout;
+ __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_caps_export_body {
+ /* export message */
+ struct ceph_mds_cap_peer peer;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+ __le32 flock_len; /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE,
+ CEPH_SNAP_OP_DESTROY,
+ CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+ __le32 num_split_realms; /* # child realms under new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/ceph_fuse.h b/src/include/ceph_fuse.h
new file mode 100644
index 000000000..c95fd1940
--- /dev/null
+++ b/src/include/ceph_fuse.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#ifndef CEPH_FUSE_H
+#define CEPH_FUSE_H
+
+/*
+ * The API version that we want to use, regardless of what the
+ * library version is. Note that this must be defined before
+ * fuse.h is included.
+ */
+#ifndef FUSE_USE_VERSION
+#define FUSE_USE_VERSION 35
+#endif
+
+#include <fuse.h>
+#include "acconfig.h"
+
+/*
+ * Redefine the FUSE_VERSION macro defined in the "fuse_common.h"
+ * header file, because its MINOR number was not updated between
+ * libfuse 3.2 and 3.8. We need to fetch the MINOR number from
+ * the pkgconfig file instead.
+ */
+#ifdef FUSE_VERSION
+#undef FUSE_VERSION
+#define FUSE_VERSION FUSE_MAKE_VERSION(CEPH_FUSE_MAJOR_VERSION, CEPH_FUSE_MINOR_VERSION)
+#endif
+
+static inline int filler_compat(fuse_fill_dir_t filler,
+ void *buf, const char *name,
+ const struct stat *stbuf,
+ off_t off)
+{
+ return filler(buf, name, stbuf, off
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , static_cast<enum fuse_fill_dir_flags>(0)
+#endif
+ );
+}
+#endif /* CEPH_FUSE_H */
diff --git a/src/include/ceph_hash.h b/src/include/ceph_hash.h
new file mode 100644
index 000000000..f9d80ac36
--- /dev/null
+++ b/src/include/ceph_hash.h
@@ -0,0 +1,14 @@
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+extern bool ceph_str_hash_valid(int type);
+
+#endif
diff --git a/src/include/cephfs/ceph_ll_client.h b/src/include/cephfs/ceph_ll_client.h
new file mode 100644
index 000000000..7709a6de4
--- /dev/null
+++ b/src/include/cephfs/ceph_ll_client.h
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * scalable distributed file system
+ *
+ * Copyright (C) Jeff Layton <jlayton@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_CEPH_LL_CLIENT_H
+#define CEPH_CEPH_LL_CLIENT_H
+#include <stdint.h>
+
+#ifdef _WIN32
+#include "include/win32/fs_compat.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+
+class Fh;
+
+struct inodeno_t;
+struct vinodeno_t;
+typedef struct vinodeno_t vinodeno;
+
+#else /* __cplusplus */
+
+typedef struct Fh Fh;
+
+typedef struct inodeno_t {
+ uint64_t val;
+} inodeno_t;
+
+typedef struct _snapid_t {
+ uint64_t val;
+} snapid_t;
+
+typedef struct vinodeno_t {
+ inodeno_t ino;
+ snapid_t snapid;
+} vinodeno_t;
+
+#endif /* __cplusplus */
+
+/*
+ * Heavily borrowed from David Howells' draft statx patchset.
+ *
+ * Since the xstat patches are still a work in progress, we borrow its data
+ * structures and #defines to implement ceph_getattrx. Once the xstat stuff
+ * has been merged we should drop this and switch over to using that instead.
+ */
+struct ceph_statx {
+ uint32_t stx_mask;
+ uint32_t stx_blksize;
+ uint32_t stx_nlink;
+ uint32_t stx_uid;
+ uint32_t stx_gid;
+ uint16_t stx_mode;
+ uint64_t stx_ino;
+ uint64_t stx_size;
+ uint64_t stx_blocks;
+ dev_t stx_dev;
+ dev_t stx_rdev;
+ struct timespec stx_atime;
+ struct timespec stx_ctime;
+ struct timespec stx_mtime;
+ struct timespec stx_btime;
+ uint64_t stx_version;
+};
+
+#define CEPH_STATX_MODE 0x00000001U /* Want/got stx_mode */
+#define CEPH_STATX_NLINK 0x00000002U /* Want/got stx_nlink */
+#define CEPH_STATX_UID 0x00000004U /* Want/got stx_uid */
+#define CEPH_STATX_GID 0x00000008U /* Want/got stx_gid */
+#define CEPH_STATX_RDEV 0x00000010U /* Want/got stx_rdev */
+#define CEPH_STATX_ATIME 0x00000020U /* Want/got stx_atime */
+#define CEPH_STATX_MTIME 0x00000040U /* Want/got stx_mtime */
+#define CEPH_STATX_CTIME 0x00000080U /* Want/got stx_ctime */
+#define CEPH_STATX_INO 0x00000100U /* Want/got stx_ino */
+#define CEPH_STATX_SIZE 0x00000200U /* Want/got stx_size */
+#define CEPH_STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */
+#define CEPH_STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
+#define CEPH_STATX_BTIME 0x00000800U /* Want/got stx_btime */
+#define CEPH_STATX_VERSION 0x00001000U /* Want/got stx_version */
+#define CEPH_STATX_ALL_STATS 0x00001fffU /* All supported stats */
+
+/*
+ * Compatibility macros until these defines make their way into glibc
+ */
+#ifndef AT_STATX_DONT_SYNC
+#define AT_STATX_SYNC_TYPE 0x6000
+#define AT_STATX_SYNC_AS_STAT 0x0000
+#define AT_STATX_FORCE_SYNC 0x2000
+#define AT_STATX_DONT_SYNC 0x4000 /* Don't sync attributes with the server */
+#endif
+
+/*
+ * This is deprecated and just for backwards compatibility.
+ * Please use AT_STATX_DONT_SYNC instead.
+ */
+#define AT_NO_ATTR_SYNC AT_STATX_DONT_SYNC /* Deprecated */
+
+/*
+ * The statx interfaces only allow these flags. In order to allow us to add
+ * others in the future, we disallow setting any that aren't recognized.
+ */
+#define CEPH_REQ_FLAG_MASK (AT_SYMLINK_NOFOLLOW|AT_STATX_DONT_SYNC)
+
+/* delegation recalls */
+typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv);
+
+/* inode data/metadata invalidation */
+typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino,
+ int64_t off, int64_t len);
+
+/* dentry invalidation */
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, const char *name,
+ size_t len);
+
+/* remount entire fs */
+typedef int (*client_remount_callback_t)(void *handle);
+
+/* lock request interrupted */
+typedef void (*client_switch_interrupt_callback_t)(void *handle, void *data);
+
+/* fetch umask of actor */
+typedef mode_t (*client_umask_callback_t)(void *handle);
+
+/* request that application release Inode references */
+typedef void (*client_ino_release_t)(void *handle, vinodeno_t ino);
+
+/*
+ * The handle is an opaque value that gets passed to some callbacks. Any fields
+ * set to NULL will be left alone. There is no way to unregister callbacks.
+ */
+struct ceph_client_callback_args {
+ void *handle;
+ client_ino_callback_t ino_cb;
+ client_dentry_callback_t dentry_cb;
+ client_switch_interrupt_callback_t switch_intr_cb;
+ client_remount_callback_t remount_cb;
+ client_umask_callback_t umask_cb;
+ client_ino_release_t ino_release_cb;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_CEPH_LL_CLIENT_H */
+
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
new file mode 100644
index 000000000..185d5e40e
--- /dev/null
+++ b/src/include/cephfs/libcephfs.h
@@ -0,0 +1,2126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIB_H
+#define CEPH_LIB_H
+
+#if defined(__linux__)
+#include <features.h>
+#endif
+#include <utime.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/statvfs.h>
+#include <sys/socket.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#include "ceph_ll_client.h"
+
+#ifdef __cplusplus
+namespace ceph::common {
+ class CephContext;
+}
+using CephContext = ceph::common::CephContext;
+extern "C" {
+#endif
+
+#define LIBCEPHFS_VER_MAJOR 10
+#define LIBCEPHFS_VER_MINOR 0
+#define LIBCEPHFS_VER_EXTRA 2
+
+#define LIBCEPHFS_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* args parenthesized so expression arguments expand safely */
+#define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA)
+
+#if __GNUC__ >= 4
+ #define LIBCEPHFS_DEPRECATED __attribute__((deprecated))
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#else
+ #define LIBCEPHFS_DEPRECATED
+#endif
+
+/*
+ * If using glibc check that file offset is 64-bit.
+ */
+#if defined(__GLIBC__) && !defined(__USE_FILE_OFFSET64)
+# error libceph: glibc must define __USE_FILE_OFFSET64 or readdir results will be corrupted
+#endif
+
+/*
+ * XXXX redeclarations from ceph_fs.h, rados.h, etc. We need more of this
+ * in the interface, but shouldn't be re-typing it (and using different
+ * C data types).
+ */
+#ifndef __cplusplus
+
+#define CEPH_INO_ROOT 1
+#define CEPH_NOSNAP ((uint64_t)(-2))
+
+struct ceph_file_layout {
+ /* file -> object mapping */
+ uint32_t fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ uint32_t fl_stripe_count; /* over this many objects */
+ uint32_t fl_object_size; /* until objects are this big, then move to
+ new objects */
+ uint32_t fl_cas_hash; /* 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ uint32_t fl_object_stripe_unit; /* for per-object parity, if any */
+
+ /* object -> pg layout */
+ uint32_t fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+ uint32_t fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+struct CephContext;
+#endif /* ! __cplusplus */
+
+struct UserPerm;
+typedef struct UserPerm UserPerm;
+
+struct Inode;
+typedef struct Inode Inode;
+
+struct ceph_mount_info;
+struct ceph_dir_result;
+
+// user supplied key,value pair to be associated with a snapshot.
+// callers can supply an array of this struct via ceph_mksnap().
+struct snap_metadata {
+ const char *key;
+ const char *value;
+};
+
+struct snap_info {
+ uint64_t id;
+ size_t nr_snap_metadata;
+ struct snap_metadata *snap_metadata;
+};
+
+/* setattr mask bits */
+#ifndef CEPH_SETATTR_MODE
+# define CEPH_SETATTR_MODE 1
+# define CEPH_SETATTR_UID 2
+# define CEPH_SETATTR_GID 4
+# define CEPH_SETATTR_MTIME 8
+# define CEPH_SETATTR_ATIME 16
+# define CEPH_SETATTR_SIZE 32
+# define CEPH_SETATTR_CTIME 64
+# define CEPH_SETATTR_MTIME_NOW 128
+# define CEPH_SETATTR_ATIME_NOW 256
+# define CEPH_SETATTR_BTIME 512
+#endif
+
+/* define error codes for the mount function */
+# define CEPHFS_ERROR_MON_MAP_BUILD 1000
+# define CEPHFS_ERROR_NEW_CLIENT 1002
+# define CEPHFS_ERROR_MESSENGER_START 1003
+
+/**
+ * Create a UserPerm credential object.
+ *
+ * Some calls (most notably, the ceph_ll_* ones), take a credential object
+ * that represents the credentials that the calling program is using. This
+ * function creates a new credential object for this purpose. Returns a
+ * pointer to the object, or NULL if it can't be allocated.
+ *
+ * Note that the gidlist array is used directly and is not copied. It must
+ * remain valid over the lifetime of the created UserPerm object.
+ *
+ * @param uid uid to be used
+ * @param gid gid to be used
+ * @param ngids number of gids in supplemental grouplist
+ * @param gidlist array of gid_t's in the list of groups
+ */
+UserPerm *ceph_userperm_new(uid_t uid, gid_t gid, int ngids, gid_t *gidlist);
+
+/**
+ * Destroy a UserPerm credential object.
+ *
+ * @param perm pointer to object to be destroyed
+ *
+ * Currently this just frees the object. Note that the gidlist array is not
+ * freed. The caller must do so if it's necessary.
+ */
+void ceph_userperm_destroy(UserPerm *perm);
+
+/**
+ * Get a pointer to the default UserPerm object for the mount.
+ *
+ * @param cmount the mount info handle
+ *
+ * Every cmount has a default set of credentials. This returns a pointer to
+ * that object.
+ *
+ * Unlike with ceph_userperm_new, this object should not be freed.
+ */
+struct UserPerm *ceph_mount_perms(struct ceph_mount_info *cmount);
+
+/**
+ * Set cmount's default permissions
+ *
+ * @param cmount the mount info handle
+ * @param perm permissions to set to default for mount
+ *
+ * Every cmount has a default set of credentials. This does a deep copy of
+ * the given permissions to the ones in the cmount. Must be done after
+ * ceph_init but before ceph_mount.
+ *
+ * Returns 0 on success, and -EISCONN if the cmount is already mounted.
+ */
+int ceph_mount_perms_set(struct ceph_mount_info *cmount, UserPerm *perm);
+
+/**
+ * @defgroup libcephfs_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libcephfs.
+ *
+ * @{
+ */
+
+/**
+ * Get the version of libcephfs.
+ *
+ * The version number is major.minor.patch.
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param patch where to store the extra version number
+ */
+const char *ceph_version(int *major, int *minor, int *patch);
+
+/**
+ * Create a mount handle for interacting with Ceph. All libcephfs
+ * functions operate on a mount info handle.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param id the id of the client. This can be a unique id that identifies
+ * this client, and will get appended onto "client.". Callers can
+ * pass in NULL, and the id will be the process id of the client.
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create(struct ceph_mount_info **cmount, const char * const id);
+
+/**
+ * Create a mount handle from a CephContext, which holds the configuration
+ * for the ceph cluster. A CephContext can be acquired from an existing ceph_mount_info
+ * handle, using the @ref ceph_get_mount_context call. Note that using the same CephContext
+ * for two different mount handles results in the same client entity id being used.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param conf reuse this pre-existing CephContext config
+ * @returns 0 on success, negative error code on failure
+ */
+#ifdef __cplusplus
+int ceph_create_with_context(struct ceph_mount_info **cmount, CephContext *conf);
+#else
+int ceph_create_with_context(struct ceph_mount_info **cmount, struct CephContext *conf);
+#endif
+
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif // VOIDPTR_RADOS_T
+
+/**
+ * Create a mount handle from a rados_t, for using libcephfs in the
+ * same process as librados.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param cluster reference to already-initialized librados handle
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create_from_rados(struct ceph_mount_info **cmount, rados_t cluster);
+
+/**
+ * Initialize the filesystem client (but do not mount the filesystem yet)
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_init(struct ceph_mount_info *cmount);
+
+/**
+ * Optionally set which filesystem to mount, before calling mount.
+ *
+ * An error will be returned if this libcephfs instance is already
+ * mounted. This function is an alternative to setting the global
+ * client_fs setting. Using this function enables multiple libcephfs
+ * instances in the same process to mount different filesystems.
+ *
+ * The filesystem name is *not* validated in this function; that happens during
+ * mount(), where an ENOENT error results if a non-existent filesystem was specified.
+ *
+ * @param cmount the mount info handle
+ * @param fs_name the name of the filesystem to mount
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_select_filesystem(struct ceph_mount_info *cmount, const char *fs_name);
+
+
+/**
+ * Perform a mount using the path for the root of the mount.
+ *
+ * It is optional to call ceph_init before this. If ceph_init has
+ * not already been called, it will be called in the course of this operation.
+ *
+ * @param cmount the mount info handle
+ * @param root the path for the root of the mount. This can be an existing
+ * directory within the ceph cluster, but most likely it will
+ * be "/". Passing in NULL is equivalent to "/".
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_mount(struct ceph_mount_info *cmount, const char *root);
+
+/**
+ * Return cluster ID for a mounted ceph filesystem
+ *
+ * Every ceph filesystem has a filesystem ID associated with it. This
+ * function returns that value. If the ceph_mount_info does not refer to a
+ * mounted filesystem, this returns a negative error code.
+ */
+int64_t ceph_get_fs_cid(struct ceph_mount_info *cmount);
+
+/**
+ * Execute a management command remotely on an MDS.
+ *
+ * Must have called ceph_init or ceph_mount before calling this.
+ *
+ * @param mds_spec string representing rank, MDS name, GID or '*'
+ * @param cmd array of null-terminated strings
+ * @param cmdlen length of cmd array
+ * @param inbuf non-null-terminated input data to command
+ * @param inbuflen length in octets of inbuf
+ * @param outbuf populated with pointer to buffer (command output data)
+ * @param outbuflen length of allocated outbuf
+ * @param outs populated with pointer to buffer (command error strings)
+ * @param outslen length of allocated outs
+ *
+ * @return 0 on success, negative error code on failure
+ *
+ */
+int ceph_mds_command(struct ceph_mount_info *cmount,
+ const char *mds_spec,
+ const char **cmd,
+ size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * Free a buffer, such as those used for output arrays from ceph_mds_command
+ */
+void ceph_buffer_free(char *buf);
+
+/**
+ * Unmount a mount handle.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_unmount(struct ceph_mount_info *cmount);
+
+/**
+ * Abort mds connections
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_abort_conn(struct ceph_mount_info *cmount);
+
+/**
+ * Destroy the mount handle.
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure.
+ */
+int ceph_release(struct ceph_mount_info *cmount);
+
+/**
+ * Deprecated. Unmount and destroy the ceph mount handle. This should be
+ * called on completion of all libcephfs functions.
+ *
+ * Equivalent to ceph_unmount() + ceph_release() without error handling.
+ *
+ * @param cmount the mount handle to shutdown
+ */
+void ceph_shutdown(struct ceph_mount_info *cmount);
+
+/**
+ * Return associated client addresses
+ *
+ * @param cmount the mount handle
+ * @param addrs the output addresses
+ * @returns 0 on success, a negative error code on failure
+ * @note the returned addrs should be freed by the caller
+ */
+int ceph_getaddrs(struct ceph_mount_info *cmount, char** addrs);
+
+/**
+ * Get a global id for current instance
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @returns instance global id
+ */
+uint64_t ceph_get_instance_id(struct ceph_mount_info *cmount);
+
+/**
+ * Extract the CephContext from the mount point handle.
+ *
+ * @param cmount the ceph mount handle to get the context from.
+ * @returns the CephContext associated with the mount handle.
+ */
+#ifdef __cplusplus
+CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount);
+#else
+struct CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount);
+#endif
+/**
+ * Check mount status.
+ *
+ * @returns non-zero value if mounted, otherwise zero.
+ */
+int ceph_is_mounted(struct ceph_mount_info *cmount);
+
+/** @} init */
+
+/**
+ * @defgroup libcephfs_h_config Config
+ * Functions for manipulating the Ceph configuration at runtime.
+ *
+ * @{
+ */
+
+/**
+ * Load the ceph configuration from the specified config file.
+ *
+ * @param cmount the mount handle to load the configuration into.
+ * @param path_list the configuration file path
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list);
+
+/**
+ * Parse the command line arguments and load the configuration parameters.
+ *
+ * @param cmount the mount handle to load the configuration parameters into.
+ * @param argc count of the arguments in argv
+ * @param argv the argument list
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv);
+
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre ceph_mount() has not been called on the handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cmount handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var);
+
+/** Sets a configuration value from a string.
+ *
+ * @param cmount the mount handle to set the configuration value on
+ * @param option the configuration option to set
+ * @param value the value of the configuration option to set
+ *
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_conf_set(struct ceph_mount_info *cmount, const char *option, const char *value);
+
+/** Set mount timeout.
+ *
+ * @param cmount mount handle to set the configuration value on
+ * @param timeout mount timeout interval
+ *
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_set_mount_timeout(struct ceph_mount_info *cmount, uint32_t timeout);
+
+/**
+ * Gets the configuration value as a string.
+ *
+ * @param cmount the mount handle to get the configuration value from
+ * @param option the config option to get
+ * @param buf the buffer to fill with the value
+ * @param len the length of the buffer.
+ * @returns the size of the buffer filled in with the value, or negative error code on failure
+ */
+int ceph_conf_get(struct ceph_mount_info *cmount, const char *option, char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * @defgroup libcephfs_h_fsops File System Operations.
+ * Functions for getting/setting file system wide information specific to a particular
+ * mount handle.
+ *
+ * @{
+ */
+
+/**
+ * Perform a statfs on the ceph file system. This call fills in file system wide statistics
+ * into the passed in buffer.
+ *
+ * @param cmount the ceph mount handle to use for performing the statfs.
+ * @param path can be any path within the mounted filesystem
+ * @param stbuf the file system statistics filled in by this function.
+ * @return 0 on success, negative error code otherwise.
+ */
+int ceph_statfs(struct ceph_mount_info *cmount, const char *path, struct statvfs *stbuf);
+
+/**
+ * Synchronize all filesystem data to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the sync_fs.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_sync_fs(struct ceph_mount_info *cmount);
+
+/**
+ * Get the current working directory.
+ *
+ * @param cmount the ceph mount to get the current working directory for.
+ * @returns the path to the current working directory
+ */
+const char* ceph_getcwd(struct ceph_mount_info *cmount);
+
+/**
+ * Change the current working directory.
+ *
+ * @param cmount the ceph mount to change the current working directory for.
+ * @param path the path to the working directory to change into.
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_chdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} fsops */
+
+/**
+ * @defgroup libcephfs_h_dir Directory Operations.
+ * Functions for manipulating and listing directories.
+ *
+ * @{
+ */
+
+/**
+ * Open the given directory.
+ *
+ * @param cmount the ceph mount handle to use to open the directory
+ * @param name the path name of the directory to open. Must be either an absolute path
+ * or a path relative to the current working directory.
+ * @param dirpp the directory result pointer structure to fill in.
+ * @returns 0 on success or negative error code otherwise.
+ */
+int ceph_opendir(struct ceph_mount_info *cmount, const char *name, struct ceph_dir_result **dirpp);
+
+/**
+ * Open a directory referred to by a file descriptor
+ *
+ * @param cmount the ceph mount handle to use to open the directory
+ * @param dirfd open file descriptor for the directory
+ * @param dirpp the directory result pointer structure to fill in
+ * @returns 0 on success or negative error code otherwise
+ */
+int ceph_fdopendir(struct ceph_mount_info *cmount, int dirfd, struct ceph_dir_result **dirpp);
+
+/**
+ * Close the open directory.
+ *
+ * @param cmount the ceph mount handle to use for closing the directory
+ * @param dirp the directory result pointer (set by ceph_opendir) to close
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_closedir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the next entry in an open directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @returns the next directory entry or NULL if at the end of the directory (or the directory
+ * is empty). This pointer should not be freed by the caller, and is only safe to
+ * access between return and the next call to ceph_readdir or ceph_closedir.
+ */
+struct dirent * ceph_readdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * A safe version of ceph_readdir, where the directory entry struct is allocated by the caller.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ * and a negative error code on failure.
+ */
+int ceph_readdir_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de);
+
+/**
+ * A safe version of ceph_readdir that also returns the file statistics (readdir+stat).
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir_plus_r.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @param stx the stats of the file/directory of the entry returned
+ * @param want mask showing desired inode attrs for returned entry
+ * @param flags bitmask of flags to use when filling out attributes
+ * @param out optional returned Inode argument. If non-NULL, then a reference will be taken on
+ * the inode and the pointer set on success.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ * and a negative error code on failure.
+ */
+int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de,
+ struct ceph_statx *stx, unsigned want, unsigned flags, struct Inode **out);
+
+/**
+ * Gets multiple directory entries.
+ *
+ * @param cmount the ceph mount handle to use for performing the getdents.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry/entries to return.
+ * @param name an array of struct dirent that gets filled in with the returned directory entries.
+ * @param buflen the length of the buffer, which should be the number of dirent structs * sizeof(struct dirent).
+ * @returns the length of the buffer that was filled in, will always be multiples of sizeof(struct dirent), or a
+ * negative error code. If the buffer is not large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdents(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Gets multiple directory names.
+ *
+ * @param cmount the ceph mount handle to use for performing the getdnames.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry/entries to return.
+ * @param name a buffer to fill in with directory entry names.
+ * @param buflen the length of the buffer that can be filled in.
+ * @returns the length of the buffer filled in with entry names, or a negative error code on failure.
+ * If the buffer isn't large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdnames(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Rewind the directory stream to the beginning of the directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rewinddir.
+ * @param dirp the directory stream pointer to rewind.
+ */
+void ceph_rewinddir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the current position of a directory stream.
+ *
+ * @param cmount the ceph mount handle to use for performing the telldir.
+ * @param dirp the directory stream pointer to get the current position of.
+ * @returns the position of the directory stream. Note that the offsets returned
+ * by ceph_telldir do not have a particular order (cannot be compared with
+ * inequality).
+ */
+int64_t ceph_telldir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Move the directory stream to a position specified by the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the seekdir.
+ * @param dirp the directory stream pointer to move.
+ * @param offset the position to move the directory stream to. This offset should be
+ * a value returned by telldir. Note that this value does not refer to the nth
+ * entry in a directory, and can not be manipulated with plus or minus.
+ */
+void ceph_seekdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, int64_t offset);
+
+/**
+ * Create a directory.
+ *
+ * @param cmount the ceph mount handle to use for making the directory.
+ * @param path the path of the directory to create. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdir(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Create a directory relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for making the directory.
+ * @param dirfd open file descriptor for a directory (or CEPHFS_AT_FDCWD)
+ * @param relpath the path of the directory to create.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdirat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, mode_t mode);
+
+/**
+ * Create a snapshot
+ *
+ * @param cmount the ceph mount handle to use for making the snapshot.
+ * @param path the path of the directory to snapshot. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param name snapshot name
+ * @param mode the permissions the directory should have once created.
+ * @param snap_metadata array of snap metadata structs
+ * @param nr_snap_metadata number of snap metadata struct entries
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mksnap(struct ceph_mount_info *cmount, const char *path, const char *name,
+ mode_t mode, struct snap_metadata *snap_metadata, size_t nr_snap_metadata);
+
+/**
+ * Remove a snapshot
+ *
+ * @param cmount the ceph mount handle to use for removing the snapshot.
+ * @param path the path of the directory whose snapshot is to be removed. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param name snapshot name
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_rmsnap(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Create multiple directories at once.
+ *
+ * @param cmount the ceph mount handle to use for making the directories.
+ * @param path the full path of directories and sub-directories that should
+ * be created.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdirs(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Remove a directory.
+ *
+ * @param cmount the ceph mount handle to use for removing directories.
+ * @param path the path of the directory to remove.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_rmdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} dir */
+
+/**
+ * @defgroup libcephfs_h_links Links and Link Handling.
+ * Functions for creating and manipulating hard links and symbolic links.
+ *
+ * @{
+ */
+
+/**
+ * Create a link.
+ *
+ * @param cmount the ceph mount handle to use for creating the link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_link(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/**
+ * Read a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for reading the link.
+ * @param path the path to the symlink to read
+ * @param buf the buffer to hold the path of the file that the symlink points to.
+ * @param size the length of the buffer
+ * @returns number of bytes copied on success or negative error code on failure
+ */
+int ceph_readlink(struct ceph_mount_info *cmount, const char *path, char *buf, int64_t size);
+
+/**
+ * Read a symbolic link relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for reading the link.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path to the symlink to read
+ * @param buf the buffer to hold the path of the file that the symlink points to.
+ * @param size the length of the buffer
+ * @returns number of bytes copied on success or negative error code on failure
+ */
+int ceph_readlinkat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, char *buf,
+ int64_t size);
+
+/**
+ * Creates a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for creating the symbolic link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on failure.
+ */
+int ceph_symlink(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/**
+ * Creates a symbolic link relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for creating the symbolic link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD) that newname is relative to
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on failure.
+ */
+int ceph_symlinkat(struct ceph_mount_info *cmount, const char *existing, int dirfd,
+ const char *newname);
+
+/** @} links */
+
+/**
+ * @defgroup libcephfs_h_files File manipulation and handling.
+ * Functions for creating and manipulating files.
+ *
+ * @{
+ */
+
+
+/**
+ * Checks if deleting a file, link or directory is allowed.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file, link or directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_may_delete(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Removes a file, link, or symbolic link. If the file/link has multiple links to it, the
+ * file will not disappear from the namespace until all references to it are removed.
+ *
+ * @param cmount the ceph mount handle to use for performing the unlink.
+ * @param path the path of the file or link to unlink.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_unlink(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Removes a file, link, or symbolic link relative to a file descriptor.
+ * If the file/link has multiple links to it, the file will not
+ * disappear from the namespace until all references to it are removed.
+ *
+ * @param cmount the ceph mount handle to use for performing the unlink.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path of the file or link to unlink.
+ * @param flags bitfield that can be used to set AT_* modifier flags (only AT_REMOVEDIR)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_unlinkat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, int flags);
+
+/**
+ * Rename a file or directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rename.
+ * @param from the path to the existing file or directory.
+ * @param to the new name of the file or directory
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_rename(struct ceph_mount_info *cmount, const char *from, const char *to);
+
+/**
+ * Get an open file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fstatx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx,
+ unsigned int want, unsigned int flags);
+
+/**
+ * Get attributes of a file relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path to the file/directory to get statistics of
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statxat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statx(struct ceph_mount_info *cmount, const char *path, struct ceph_statx *stx,
+ unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's statistics and attributes.
+ *
+ * ceph_stat() is deprecated, use ceph_statx() instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_stat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf)
+ LIBCEPHFS_DEPRECATED;
+
+/**
+ * Get a file's statistics and attributes, without following symlinks.
+ *
+ * ceph_lstat() is deprecated, use ceph_statx(.., AT_SYMLINK_NOFOLLOW) instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lstat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf)
+ LIBCEPHFS_DEPRECATED;
+
+/**
+ * Get the open file's statistics.
+ *
+ * ceph_fstat() is deprecated, use ceph_fstatx() instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the fstat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stbuf the stat struct of the file's statistics, filled in by the
+ * function.
+ * @returns 0 on success or a negative error code on failure
+ */
+int ceph_fstat(struct ceph_mount_info *cmount, int fd, struct stat *stbuf)
+ LIBCEPHFS_DEPRECATED;
+
+/**
+ * Set a file's attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param relpath the path to the file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct.
+ * @param flags mask of AT_* flags (only AT_SYMLINK_NOFOLLOW is respected for now)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_setattrx(struct ceph_mount_info *cmount, const char *relpath, struct ceph_statx *stx, int mask, int flags);
+
+/**
+ * Set an open file's attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param fd the fd of the open file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fsetattrx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx, int mask);
+
+/**
+ * Change the mode bits (permissions) of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param path the path to the file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_chmod(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of a file/directory. If the path is a
+ * symbolic link, it's not de-referenced.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param path the path of file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lchmod(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of an open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param fd the open file descriptor to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fchmod(struct ceph_mount_info *cmount, int fd, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of a file relative to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chmodat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ mode_t mode, int flags);
+
+/**
+ * Change the ownership of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param fd the fd of the open file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fchown(struct ceph_mount_info *cmount, int fd, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lchown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory relative to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chownat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ uid_t uid, gid_t gid, int flags);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utime(struct ceph_mount_info *cmount, const char *path, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futime(struct ceph_mount_info *cmount, int fd, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lutimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimes(struct ceph_mount_info *cmount, int fd, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimens(struct ceph_mount_info *cmount, int fd, struct timespec times[2]);
+
+/**
+ * Change file/directory last access and modification times relative
+ * to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utimensat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ struct timespec times[2], int flags);
+
+/**
+ * Apply or remove an advisory lock.
+ *
+ * @param cmount the ceph mount handle to use for performing the lock.
+ * @param fd the open file descriptor to change advisory lock.
+ * @param operation the advisory lock operation to be performed on the file
+ * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock),
+ * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a
+ * non-blocking operation.
+ * @param owner the user-supplied owner identifier (an arbitrary integer)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation,
+ uint64_t owner);
+
+/**
+ * Truncate the file to the given size. If this operation causes the
+ * file to expand, the empty bytes will be filled in with zeros.
+ *
+ * @param cmount the ceph mount handle to use for performing the truncate.
+ * @param path the path to the file to truncate.
+ * @param size the new size of the file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_truncate(struct ceph_mount_info *cmount, const char *path, int64_t size);
+
+/**
+ * Make a block or character special file.
+ *
+ * @param cmount the ceph mount handle to use for performing the mknod.
+ * @param path the path to the special file.
+ * @param mode the permissions to use and the type of special file. The type can be
+ * one of S_IFREG, S_IFCHR, S_IFBLK, S_IFIFO.
+ * @param rdev If the file type is S_IFCHR or S_IFBLK then this parameter specifies the
+ * major and minor numbers of the newly created device special file. Otherwise,
+ * it is ignored.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_mknod(struct ceph_mount_info *cmount, const char *path, mode_t mode, dev_t rdev);
+/**
+ * Create and/or open a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open(struct ceph_mount_info *cmount, const char *path, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file relative to a directory
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_openat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file with a specific file layout.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @param stripe_unit the stripe unit size (optional, 0 for default)
+ * @param stripe_count the stripe count (optional, 0 for default)
+ * @param object_size the object size (optional, 0 for default)
+ * @param data_pool name of the target data pool (optional, NULL or empty string for default)
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open_layout(struct ceph_mount_info *cmount, const char *path, int flags,
+ mode_t mode, int stripe_unit, int stripe_count, int object_size,
+ const char *data_pool);
+
+/**
+ * Close the open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the close.
+ * @param fd the file descriptor referring to the open file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_close(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Reposition the open file stream based on the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the lseek.
+ * @param fd the open file descriptor referring to the open file and holding the
+ * current position of the stream.
+ * @param offset the offset to set the stream to
+ * @param whence the flag to indicate what type of seeking to perform:
+ * SEEK_SET: the offset is set to the given offset in the file.
+ * SEEK_CUR: the offset is set to the current location plus @e offset bytes.
+ * SEEK_END: the offset is set to the end of the file plus @e offset bytes.
+ * @returns the resulting offset from the beginning of the file on success or a
+ * negative error code on failure.
+ */
+int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int whence);
+/**
+ * Read data from the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param buf the buffer to read data into
+ * @param size the initial size of the buffer
+ * @param offset the offset in the file to read from. If this value is negative, the
+ * function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into buf, or a negative error code on failure.
+ */
+int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset);
+
+/**
+ * Read data from the file.
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param iov the iov structure to read data into
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to read from. If this value is negative, the
+ * function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into iov, or a negative error code on failure.
+ */
+int ceph_preadv(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+ int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param buf the bytes to write to the file
+ * @param size the size of the buf array
+ * @param offset the offset in the file to write to. If this value is negative, the
+ * function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf, int64_t size,
+ int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param iov the iov structure holding the data to write
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to write to. If this value is negative, the
+ * function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_pwritev(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+ int64_t offset);
+
+/**
+ * Truncate a file to the given size.
+ *
+ * @param cmount the ceph mount handle to use for performing the ftruncate.
+ * @param fd the file descriptor of the file to truncate
+ * @param size the new size of the file
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_ftruncate(struct ceph_mount_info *cmount, int fd, int64_t size);
+
+/**
+ * Synchronize an open file to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param syncdataonly a boolean whether to synchronize metadata and data (0)
+ * or just data (1).
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fsync(struct ceph_mount_info *cmount, int fd, int syncdataonly);
+
+/**
+ * Preallocate or release disk space for the file for the byte range.
+ *
+ * @param cmount the ceph mount handle to use for performing the fallocate.
+ * @param fd the file descriptor of the file to fallocate.
+ * @param mode the flags determines the operation to be performed on the given range.
+ * default operation (0) allocate and initialize to zero the file in the byte range,
+ * and the file size will be changed if offset + length is greater than
+ * the file size. if the FALLOC_FL_KEEP_SIZE flag is specified in the mode,
+ * the file size will not be changed. if the FALLOC_FL_PUNCH_HOLE flag is
+ * specified in the mode, the operation is deallocate space and zero the byte range.
+ * @param offset the byte range starting.
+ * @param length the length of the range.
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fallocate(struct ceph_mount_info *cmount, int fd, int mode,
+ int64_t offset, int64_t length);
+
+/**
+ * Enable/disable lazyio for the file.
+ *
+ * @param cmount the ceph mount handle to use for enabling/disabling lazyio.
+ * @param fd the file descriptor of the file to enable/disable lazyio on.
+ * @param enable a boolean to enable lazyio or disable lazyio.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio(struct ceph_mount_info *cmount, int fd, int enable);
+
+
+/**
+ * Flushes the write buffer for the file thereby propagating the buffered write to the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the propagate.
+ * @param fd the file descriptor of the file to propagate buffered writes for.
+ * @param offset the starting offset of the byte range to propagate.
+ * @param count the length in bytes of the range to propagate.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_propagate(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+
+/**
+ * Flushes the write buffer for the file and invalidate the read cache. This allows a subsequent read operation to read and cache data directly from the file and hence everyone's propagated writes would be visible.
+ *
+ * @param cmount the ceph mount handle to use for performing the synchronize.
+ * @param fd the file descriptor of the file to synchronize.
+ * @param offset the starting offset of the byte range to synchronize.
+ * @param count the length in bytes of the range to synchronize.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_synchronize(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+/** @} file */
+
+/**
+ * @defgroup libcephfs_h_xattr Extended Attribute manipulation and handling.
+ * Functions for creating and manipulating extended attributes on files.
+ *
+ * @{
+ */
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_getxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ void *value, size_t size);
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param fd the open file descriptor referring to the file to get extended attribute from.
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+ void *value, size_t size);
+
+/**
+ * Get an extended attribute without following symbolic links. This function is
+ * identical to ceph_getxattr, but if the path refers to a symbolic link,
+ * we get the extended attributes of the symlink itself rather than the
+ * attributes of the file it points to.
+ *
+ * @param cmount the ceph mount handle to use for performing the lgetxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ void *value, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param fd the open file descriptor referring to the file to list extended attributes on.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size);
+
+/**
+ * Get the list of extended attribute keys on a file, but do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the llistxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param fd the open file descriptor referring to the file to remove extended attribute from.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name);
+
+/**
+ * Remove the extended attribute from a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lremovexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lremovexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute. Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param fd the open file descriptor referring to the file to set extended attribute on.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute. Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+ const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lsetxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute. Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+/** @} xattr */
+
+/**
+ * @defgroup libcephfs_h_filelayout Control File Layout.
+ * Functions for setting and getting the file layout of existing files.
+ *
+ * @{
+ */
+
+/**
+ * Get the file striping unit from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping unit.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file striping count from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping count.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file object size from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file object size.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file pool information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file pool information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the name of the pool an opened file is stored in.
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen);
+
+/**
+ * get the name of a pool by id
+ *
+ * Given a pool's numeric identifier, get the pool's alphanumeric name.
+ *
+ * @param cmount the ceph mount handle to use
+ * @param pool the numeric pool id
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough
+ */
+int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen);
+
+/**
+ * Get the name of the pool a file is stored in
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen);
+
+/**
+ * Get the default pool name of cephfs
+ * Write the name of the default pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ * @param cmount the ceph mount handle to use.
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_default_data_pool_name(struct ceph_mount_info *cmount, char *buf, size_t buflen);
+
+/**
+ * Get the file layout from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file layout.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file replication information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file replication information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the id of the named pool.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_name the name of the pool.
+ * @returns the pool id, or a negative error code on failure.
+ */
+int ceph_get_pool_id(struct ceph_mount_info *cmount, const char *pool_name);
+
+/**
+ * Get the pool replication factor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_id the pool id to look up
+ * @returns the replication factor, or a negative error code on failure.
+ */
+int ceph_get_pool_replication(struct ceph_mount_info *cmount, int pool_id);
+
+/**
+ * Get the OSD address where the primary copy of a file stripe is located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file to get the stripe address of.
+ * @param offset the offset into the file to specify the stripe. The offset can be
+ * anywhere within the stripe unit.
+ * @param addr the address of the OSD holding that stripe
+ * @param naddr the capacity of the address passed in.
+ * @returns the size of the address filled into the @e addr parameter, or a negative
+ * error code on failure.
+ */
+int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t offset,
+ struct sockaddr_storage *addr, int naddr);
+
+/**
+ * Get the list of OSDs where the objects containing a file offset are located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file.
+ * @param offset the offset within the file.
+ * @param length return the number of bytes between the offset and the end of
+ * the stripe unit (optional).
+ * @param osds an integer array to hold the OSD ids.
+ * @param nosds the size of the integer array.
+ * @returns the number of items stored in the output array, or -ERANGE if the
+ * array is not large enough.
+ */
+int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fd,
+ int64_t offset, int64_t *length, int *osds, int nosds);
+
+/**
+ * Get the fully qualified CRUSH location of an OSD.
+ *
+ * Returns (type, name) string pairs for each device in the CRUSH bucket
+ * hierarchy starting from the given osd to the root. Each pair element is
+ * separated by a NULL character.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param osd the OSD id.
+ * @param path buffer to store location.
+ * @param len size of buffer.
+ * @returns the number of bytes written into the buffer, or -ERANGE if the
+ * buffer is not large enough.
+ */
+int ceph_get_osd_crush_location(struct ceph_mount_info *cmount,
+ int osd, char *path, size_t len);
+
+/**
+ * Get the network address of an OSD.
+ *
+ * @param cmount the ceph mount handle.
+ * @param osd the OSD id.
+ * @param addr the OSD network address.
+ * @returns zero on success, otherwise returns a negative error code.
+ */
+int ceph_get_osd_addr(struct ceph_mount_info *cmount, int osd,
+ struct sockaddr_storage *addr);
+
+/**
+ * Get the file layout stripe unit granularity.
+ * @param cmount the ceph mount handle.
+ * @returns the stripe unit granularity or a negative error code on failure.
+ */
+int ceph_get_stripe_unit_granularity(struct ceph_mount_info *cmount);
+
+/** @} filelayout */
+
+/**
+ * No longer available. Do not use.
+ * These functions will return -EOPNOTSUPP.
+ */
+int ceph_set_default_file_stripe_unit(struct ceph_mount_info *cmount, int stripe);
+int ceph_set_default_file_stripe_count(struct ceph_mount_info *cmount, int count);
+int ceph_set_default_object_size(struct ceph_mount_info *cmount, int size);
+int ceph_set_default_preferred_pg(struct ceph_mount_info *cmount, int osd);
+int ceph_set_default_file_replication(struct ceph_mount_info *cmount, int replication);
+
+/**
+ * Read from local replicas when possible.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param val a boolean to set (1) or clear (0) the option to favor local objects
+ * for reads.
+ * @returns 0
+ */
+int ceph_localize_reads(struct ceph_mount_info *cmount, int val);
+
+/**
+ * Get the osd id of the local osd (if any)
+ *
+ * @param cmount the ceph mount handle to use.
+ * @returns the osd (if any) local to the node where this call is made, otherwise
+ * -1 is returned.
+ */
+int ceph_get_local_osd(struct ceph_mount_info *cmount);
+
+/** @} default_filelayout */
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the file descriptor to get the issued capabilities of
+ * @returns the current capabilities issued to this client
+ * for the open file
+ */
+int ceph_debug_get_fd_caps(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path to the file
+ * @returns the current capabilities issued to this client
+ * for the file
+ */
+int ceph_debug_get_file_caps(struct ceph_mount_info *cmount, const char *path);
+
+/* Low Level */
+struct Inode *ceph_ll_get_inode(struct ceph_mount_info *cmount,
+ vinodeno_t vino);
+
+int ceph_ll_lookup_vino(struct ceph_mount_info *cmount, vinodeno_t vino,
+ Inode **inode);
+
+int ceph_ll_lookup_inode(
+ struct ceph_mount_info *cmount,
+ struct inodeno_t ino,
+ Inode **inode);
+
+/**
+ * Get the root inode of FS. Increase counter of references for root Inode. You must call ceph_ll_forget for it!
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param parent pointer to pointer to Inode struct. Pointer to root inode will be returned
+ * @returns 0 if all good
+ */
+int ceph_ll_lookup_root(struct ceph_mount_info *cmount,
+ Inode **parent);
+int ceph_ll_lookup(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, Inode **out, struct ceph_statx *stx,
+ unsigned want, unsigned flags, const UserPerm *perms);
+int ceph_ll_put(struct ceph_mount_info *cmount, struct Inode *in);
+int ceph_ll_forget(struct ceph_mount_info *cmount, struct Inode *in,
+ int count);
+int ceph_ll_walk(struct ceph_mount_info *cmount, const char* name, Inode **i,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags,
+ const UserPerm *perms);
+int ceph_ll_getattr(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags,
+ const UserPerm *perms);
+int ceph_ll_setattr(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_statx *stx, int mask, const UserPerm *perms);
+int ceph_ll_open(struct ceph_mount_info *cmount, struct Inode *in, int flags,
+ struct Fh **fh, const UserPerm *perms);
+off_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ off_t offset, int whence);
+int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ int64_t off, uint64_t len, char* buf);
+int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh,
+ int syncdataonly);
+int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in,
+ int syncdataonly);
+int ceph_ll_fallocate(struct ceph_mount_info *cmount, struct Fh *fh,
+ int mode, int64_t offset, int64_t length);
+int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ int64_t off, uint64_t len, const char *data);
+int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
+ const struct iovec *iov, int iovcnt, int64_t off);
+int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh,
+ const struct iovec *iov, int iovcnt, int64_t off);
+int ceph_ll_close(struct ceph_mount_info *cmount, struct Fh* filehandle);
+int ceph_ll_iclose(struct ceph_mount_info *cmount, struct Inode *in, int mode);
+/**
+ * Get xattr value by xattr name.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param in the inode to get the extended attribute from
+ * @param name name of attribute
+ * @param value pointer to the beginning of the buffer that receives the value
+ * @param size buffer size
+ * @param perms pointer to UserPerm object
+ * @returns size of returned buffer. Negative number in error case
+ */
+int ceph_ll_getxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, void *value, size_t size,
+ const UserPerm *perms);
+int ceph_ll_setxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const void *value, size_t size,
+ int flags, const UserPerm *perms);
+int ceph_ll_listxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ char *list, size_t buf_size, size_t *list_size,
+ const UserPerm *perms);
+int ceph_ll_removexattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms);
+int ceph_ll_create(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, int oflags, Inode **outp,
+ Fh **fhp, struct ceph_statx *stx, unsigned want,
+ unsigned lflags, const UserPerm *perms);
+int ceph_ll_mknod(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, dev_t rdev, Inode **out,
+ struct ceph_statx *stx, unsigned want, unsigned flags,
+ const UserPerm *perms);
+int ceph_ll_mkdir(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, Inode **out,
+ struct ceph_statx *stx, unsigned want,
+ unsigned flags, const UserPerm *perms);
+int ceph_ll_link(struct ceph_mount_info *cmount, struct Inode *in,
+ struct Inode *newparent, const char *name,
+ const UserPerm *perms);
+int ceph_ll_opendir(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_dir_result **dirpp, const UserPerm *perms);
+int ceph_ll_releasedir(struct ceph_mount_info *cmount,
+ struct ceph_dir_result* dir);
+int ceph_ll_rename(struct ceph_mount_info *cmount, struct Inode *parent,
+ const char *name, struct Inode *newparent,
+ const char *newname, const UserPerm *perms);
+int ceph_ll_unlink(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms);
+int ceph_ll_statfs(struct ceph_mount_info *cmount, struct Inode *in,
+ struct statvfs *stbuf);
+int ceph_ll_readlink(struct ceph_mount_info *cmount, struct Inode *in,
+ char *buf, size_t bufsize, const UserPerm *perms);
+int ceph_ll_symlink(struct ceph_mount_info *cmount,
+ Inode *in, const char *name, const char *value,
+ Inode **out, struct ceph_statx *stx,
+ unsigned want, unsigned flags,
+ const UserPerm *perms);
+int ceph_ll_rmdir(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms);
+uint32_t ceph_ll_stripe_unit(struct ceph_mount_info *cmount,
+ struct Inode *in);
+uint32_t ceph_ll_file_layout(struct ceph_mount_info *cmount,
+ struct Inode *in,
+ struct ceph_file_layout *layout);
+uint64_t ceph_ll_snap_seq(struct ceph_mount_info *cmount,
+ struct Inode *in);
+int ceph_ll_get_stripe_osd(struct ceph_mount_info *cmount,
+ struct Inode *in,
+ uint64_t blockno,
+ struct ceph_file_layout* layout);
+int ceph_ll_num_osds(struct ceph_mount_info *cmount);
+int ceph_ll_osdaddr(struct ceph_mount_info *cmount,
+ int osd, uint32_t *addr);
+uint64_t ceph_ll_get_internal_offset(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockno);
+int ceph_ll_read_block(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockid,
+ char* bl, uint64_t offset, uint64_t length,
+ struct ceph_file_layout* layout);
+int ceph_ll_write_block(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockid,
+ char* buf, uint64_t offset,
+ uint64_t length, struct ceph_file_layout* layout,
+ uint64_t snapseq, uint32_t sync);
+int ceph_ll_commit_blocks(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t offset, uint64_t range);
+
+
+int ceph_ll_getlk(struct ceph_mount_info *cmount,
+ Fh *fh, struct flock *fl, uint64_t owner);
+int ceph_ll_setlk(struct ceph_mount_info *cmount,
+ Fh *fh, struct flock *fl, uint64_t owner, int sleep);
+
+int ceph_ll_lazyio(struct ceph_mount_info *cmount, Fh *fh, int enable);
+
+/*
+ * Delegation support
+ *
+ * Delegations are a way for an application to request exclusive or
+ * semi-exclusive access to an Inode. The client requests the delegation and
+ * if it's successful it can reliably cache file data and metadata until the
+ * delegation is recalled.
+ *
+ * Recalls are issued via a callback function, provided by the application.
+ * Callback functions should act something like signal handlers. You want to
+ * do as little as possible in the callback. Any major work should be deferred
+ * in some fashion as it's difficult to predict the context in which this
+ * function will be called.
+ *
+ * Once the delegation has been recalled, the application should return it as
+ * soon as possible. The application has client_deleg_timeout seconds to
+ * return it, after which the cmount structure is forcibly unmounted and
+ * further calls into it fail.
+ *
+ * The application can set the client_deleg_timeout config option to suit its
+ * needs, but it should take care to choose a value that allows it to avoid
+ * forcible eviction from the cluster in the event of an application bug.
+ */
+
+/* Commands for manipulating delegation state */
+#ifndef CEPH_DELEGATION_NONE
+# define CEPH_DELEGATION_NONE 0
+# define CEPH_DELEGATION_RD 1
+# define CEPH_DELEGATION_WR 2
+#endif
+
+/**
+ * Get the amount of time that the client has to return caps
+ * @param cmount the ceph mount handle to use.
+ *
+ * In the event that a client does not return its caps, the MDS may blocklist
+ * it after this timeout. Applications should check this value and ensure
+ * that they set the delegation timeout to a value lower than this.
+ *
+ * This call returns the cap return timeout (in seconds) for this cmount, or
+ * zero if it's not mounted.
+ */
+uint32_t ceph_get_cap_return_timeout(struct ceph_mount_info *cmount);
+
+/**
+ * Set the delegation timeout for the mount (thereby enabling delegations)
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the delegation timeout (in seconds)
+ *
+ * Since the client could end up blocklisted if it doesn't return delegations
+ * in time, we mandate that any application wanting to use delegations
+ * explicitly set the timeout beforehand. Until this call is done on the
+ * mount, attempts to set a delegation will return -ETIME.
+ *
+ * Once a delegation is recalled, if it is not returned in this amount of
+ * time, the cmount will be forcibly unmounted and further access attempts
+ * will fail (usually with -ENOTCONN errors).
+ *
+ * This value is further vetted against the cap return timeout, and this call
+ * can fail with -EINVAL if the timeout value is too long. Delegations can be
+ * disabled again by setting the timeout to 0.
+ */
+int ceph_set_deleg_timeout(struct ceph_mount_info *cmount, uint32_t timeout);
+
+/**
+ * Request a delegation on an open Fh
+ * @param cmount the ceph mount handle to use.
+ * @param fh file handle
+ * @param cmd CEPH_DELEGATION_* command
+ * @param cb callback function for recalling delegation
+ * @param priv opaque token passed back during recalls
+ *
+ * Returns 0 if the delegation was granted, -EAGAIN if there was a conflict
+ * and other error codes if there is a fatal error of some sort (e.g. -ENOMEM,
+ * -ETIME)
+ */
+int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh,
+ unsigned int cmd, ceph_deleg_cb_t cb, void *priv);
+
+mode_t ceph_umask(struct ceph_mount_info *cmount, mode_t mode);
+
+/* state reclaim */
+#define CEPH_RECLAIM_RESET 1
+
+/**
+ * Set ceph client uuid
+ * @param cmount the ceph mount handle to use.
+ * @param uuid the uuid to set
+ *
+ * Must be called before mount.
+ */
+void ceph_set_uuid(struct ceph_mount_info *cmount, const char *uuid);
+
+/**
+ * Set ceph client session timeout
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the timeout to set
+ *
+ * Must be called before mount.
+ */
+void ceph_set_session_timeout(struct ceph_mount_info *cmount, unsigned timeout);
+
+/**
+ * Start to reclaim states of other client
+ * @param cmount the ceph mount handle to use.
+ * @param uuid uuid of client whose states need to be reclaimed
+ * @param flags flags that control how states get reclaimed
+ *
+ * Returns 0 on success, -EOPNOTSUPP if mds does not support the operation,
+ * -ENOENT if CEPH_RECLAIM_RESET is specified and there is no client
+ * with the given uuid, -ENOTRECOVERABLE in all other error cases.
+ */
+int ceph_start_reclaim(struct ceph_mount_info *cmount,
+ const char *uuid, unsigned flags);
+
+/**
+ * Finish reclaiming states of other client.
+ * @param cmount the ceph mount handle to use.
+ */
+void ceph_finish_reclaim(struct ceph_mount_info *cmount);
+
+/**
+ * Register a set of callbacks to be used with this cmount
+ * @param cmount the ceph mount handle on which the cb's should be registered
+ * @param args callback arguments to register with the cmount
+ *
+ * Any fields set to NULL will be ignored. There currently is no way to
+ * unregister these callbacks, so this is a one-way change.
+ */
+void ceph_ll_register_callbacks(struct ceph_mount_info *cmount,
+ struct ceph_client_callback_args *args);
+
+/**
+ * Get snapshot info
+ *
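+ * @param cmount the ceph mount handle to use for getting the snapshot info.
+ * @param path the path of the snapshot. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param snap_info the snapshot info struct to fill in; free its buffers with
+ * ceph_free_snap_info_buffer().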
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_get_snap_info(struct ceph_mount_info *cmount,
+ const char *path, struct snap_info *snap_info);
+
+/**
+ * Free snapshot info buffers
+ *
+ * @param snap_info snapshot info struct (fetched via call to ceph_get_snap_info()).
+ */
+void ceph_free_snap_info_buffer(struct snap_info *snap_info);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/cephfs/metrics/Types.h b/src/include/cephfs/metrics/Types.h
new file mode 100644
index 000000000..a81a69537
--- /dev/null
+++ b/src/include/cephfs/metrics/Types.h
@@ -0,0 +1,699 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H
+#define CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H
+
+#include <string>
+#include <boost/variant.hpp>
+
+#include "common/Formatter.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "include/int_types.h"
+#include "include/stringify.h"
+#include "include/utime.h"
+
+namespace ceph { class Formatter; }
+
+enum ClientMetricType { // kinds of client metric; the implicit values (0, 1, 2, ...) are what ClientMetricMessage encodes as a uint32_t tag, so reordering entries changes the encoded values
+ CLIENT_METRIC_TYPE_CAP_INFO,
+ CLIENT_METRIC_TYPE_READ_LATENCY,
+ CLIENT_METRIC_TYPE_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_DENTRY_LEASE,
+ CLIENT_METRIC_TYPE_OPENED_FILES,
+ CLIENT_METRIC_TYPE_PINNED_ICAPS,
+ CLIENT_METRIC_TYPE_OPENED_INODES,
+ CLIENT_METRIC_TYPE_READ_IO_SIZES,
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY, // from here on: no case in ClientMetricMessage::decode(), so these decode as UnknownPayload
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+};
+// Stream the symbolic name of a metric type (the enumerator name without its
+// CLIENT_METRIC_TYPE_ prefix). A value that matches no enumerator is written
+// as "(UNKNOWN:<numeric value>)". Returns the stream to allow chaining.
+inline std::ostream &operator<<(std::ostream &os, const ClientMetricType &type) {
+  switch (type) {
+  case ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO:
+    return os << "CAP_INFO";
+  case ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY:
+    return os << "READ_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY:
+    return os << "WRITE_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY:
+    return os << "METADATA_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE:
+    return os << "DENTRY_LEASE";
+  case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES:
+    return os << "OPENED_FILES";
+  case ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS:
+    return os << "PINNED_ICAPS";
+  case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES:
+    return os << "OPENED_INODES";
+  case ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES:
+    return os << "READ_IO_SIZES";
+  case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES:
+    return os << "WRITE_IO_SIZES";
+  case ClientMetricType::CLIENT_METRIC_TYPE_AVG_READ_LATENCY:
+    return os << "AVG_READ_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_READ_LATENCY:
+    return os << "STDEV_READ_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY:
+    return os << "AVG_WRITE_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY:
+    return os << "STDEV_WRITE_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY:
+    return os << "AVG_METADATA_LATENCY";
+  case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY:
+    return os << "STDEV_METADATA_LATENCY";
+  default:
+    break;
+  }
+
+  // No enumerator matched: fall back to the raw numeric value.
+  using raw_type = std::underlying_type<ClientMetricType>::type;
+  return os << "(UNKNOWN:" << static_cast<raw_type>(type) << ")";
+}
+
+// Common base of every client metric payload: remembers which
+// ClientMetricType the concrete payload stands for.
+struct ClientMetricPayloadBase {
+  ClientMetricPayloadBase(ClientMetricType type) : metric_type(type) {}
+
+  // Metric type given at construction.
+  ClientMetricType get_type() const {
+    return metric_type;
+  }
+
+  // Write the symbolic name of the metric type to *out.
+  // @param out destination stream; dereferenced without a null check
+  void print_type(std::ostream *out) const {
+    *out << metric_type;
+  }
+
+ private:
+  ClientMetricType metric_type;
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_CAP_INFO: three unsigned counters
+// (cap_hits, cap_misses, nr_caps).
+struct CapInfoPayload : public ClientMetricPayloadBase {
+  uint64_t cap_hits = 0;
+  uint64_t cap_misses = 0;
+  uint64_t nr_caps = 0;
+
+  CapInfoPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO) { }
+  CapInfoPayload(uint64_t cap_hits, uint64_t cap_misses, uint64_t nr_caps)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO),
+      cap_hits(cap_hits), cap_misses(cap_misses), nr_caps(nr_caps) {
+  }
+
+  // Append the counters to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH frame.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(cap_hits, bl);
+    encode(cap_misses, bl);
+    encode(nr_caps, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read the counters back in the order encode() wrote them.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(cap_hits, iter);
+    decode(cap_misses, iter);
+    decode(nr_caps, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the counters through the formatter. They are uint64_t, so they are
+  // dumped as unsigned (as the latency payloads do for their counters)
+  // rather than being narrowed to a signed integer. nr_caps is keyed
+  // "num_caps".
+  void dump(Formatter *f) const {
+    f->dump_unsigned("cap_hits", cap_hits);
+    f->dump_unsigned("cap_misses", cap_misses);
+    f->dump_unsigned("num_caps", nr_caps);
+  }
+
+  // Write "cap_hits: H cap_misses: M num_caps: N" to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "cap_hits: " << cap_hits << " "
+	 << "cap_misses: " << cap_misses << " "
+	 << "num_caps: " << nr_caps;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_READ_LATENCY.
+// Encoding v1 carries only `lat`; v2 adds `mean`, `sq_sum` and `count`.
+struct ReadLatencyPayload : public ClientMetricPayloadBase {
+  utime_t lat;
+  utime_t mean;
+  // Zero-initialized: the default constructor and a v1 decode never assign
+  // these, yet dump()/print() read them.
+  uint64_t sq_sum = 0;  // sum of squares
+  uint64_t count = 0;   // IO count
+
+  ReadLatencyPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY) { }
+  ReadLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY),
+      lat(lat),
+      mean(mean),
+      sq_sum(sq_sum),
+      count(count) {
+  }
+
+  // Append all four fields to bl as struct version 2 (compat 1).
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(2, 1, bl);
+    encode(lat, bl);
+    encode(mean, bl);
+    encode(sq_sum, bl);
+    encode(count, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read `lat` always; read the remaining fields only when the sender
+  // encoded struct version 2 or later.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(2, iter);
+    decode(lat, iter);
+    if (struct_v >= 2) {
+      decode(mean, iter);
+      decode(sq_sum, iter);
+      decode(count, iter);
+    }
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the fields through the formatter.
+  // NOTE(review): lat and mean are utime_t but go through dump_int — confirm
+  // the implicit conversion keeps the precision consumers expect.
+  void dump(Formatter *f) const {
+    f->dump_int("latency", lat);
+    f->dump_int("avg_latency", mean);
+    f->dump_unsigned("sq_sum", sq_sum);
+    f->dump_unsigned("count", count);
+  }
+
+  // Write a one-line summary of the fields to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "latency: " << lat << ", avg_latency: " << mean
+	 << ", sq_sum: " << sq_sum << ", count=" << count;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_WRITE_LATENCY.
+// Encoding v1 carries only `lat`; v2 adds `mean`, `sq_sum` and `count`.
+struct WriteLatencyPayload : public ClientMetricPayloadBase {
+  utime_t lat;
+  utime_t mean;
+  // Zero-initialized: the default constructor and a v1 decode never assign
+  // these, yet dump()/print() read them.
+  uint64_t sq_sum = 0;  // sum of squares
+  uint64_t count = 0;   // IO count
+
+  WriteLatencyPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY) { }
+  WriteLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY),
+      lat(lat),
+      mean(mean),
+      sq_sum(sq_sum),
+      count(count) {
+  }
+
+  // Append all four fields to bl as struct version 2 (compat 1).
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(2, 1, bl);
+    encode(lat, bl);
+    encode(mean, bl);
+    encode(sq_sum, bl);
+    encode(count, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read `lat` always; read the remaining fields only when the sender
+  // encoded struct version 2 or later.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(2, iter);
+    decode(lat, iter);
+    if (struct_v >= 2) {
+      decode(mean, iter);
+      decode(sq_sum, iter);
+      decode(count, iter);
+    }
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the fields through the formatter.
+  // NOTE(review): lat and mean are utime_t but go through dump_int — confirm
+  // the implicit conversion keeps the precision consumers expect.
+  void dump(Formatter *f) const {
+    f->dump_int("latency", lat);
+    f->dump_int("avg_latency", mean);
+    f->dump_unsigned("sq_sum", sq_sum);
+    f->dump_unsigned("count", count);
+  }
+
+  // Write a one-line summary of the fields to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "latency: " << lat << ", avg_latency: " << mean
+	 << ", sq_sum: " << sq_sum << ", count=" << count;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_METADATA_LATENCY.
+// Encoding v1 carries only `lat`; v2 adds `mean`, `sq_sum` and `count`.
+struct MetadataLatencyPayload : public ClientMetricPayloadBase {
+  utime_t lat;
+  utime_t mean;
+  // Zero-initialized: the default constructor and a v1 decode never assign
+  // these, yet dump()/print() read them.
+  uint64_t sq_sum = 0;  // sum of squares
+  uint64_t count = 0;   // IO count
+
+  MetadataLatencyPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY) { }
+  MetadataLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY),
+      lat(lat),
+      mean(mean),
+      sq_sum(sq_sum),
+      count(count) {
+  }
+
+  // Append all four fields to bl as struct version 2 (compat 1).
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(2, 1, bl);
+    encode(lat, bl);
+    encode(mean, bl);
+    encode(sq_sum, bl);
+    encode(count, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read `lat` always; read the remaining fields only when the sender
+  // encoded struct version 2 or later.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(2, iter);
+    decode(lat, iter);
+    if (struct_v >= 2) {
+      decode(mean, iter);
+      decode(sq_sum, iter);
+      decode(count, iter);
+    }
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the fields through the formatter.
+  // NOTE(review): lat and mean are utime_t but go through dump_int — confirm
+  // the implicit conversion keeps the precision consumers expect.
+  void dump(Formatter *f) const {
+    f->dump_int("latency", lat);
+    f->dump_int("avg_latency", mean);
+    f->dump_unsigned("sq_sum", sq_sum);
+    f->dump_unsigned("count", count);
+  }
+
+  // Write a one-line summary of the fields to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "latency: " << lat << ", avg_latency: " << mean
+	 << ", sq_sum: " << sq_sum << ", count=" << count;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_DENTRY_LEASE: three unsigned counters
+// (dlease_hits, dlease_misses, nr_dentries).
+struct DentryLeasePayload : public ClientMetricPayloadBase {
+  uint64_t dlease_hits = 0;
+  uint64_t dlease_misses = 0;
+  uint64_t nr_dentries = 0;
+
+  DentryLeasePayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE) { }
+  DentryLeasePayload(uint64_t dlease_hits, uint64_t dlease_misses, uint64_t nr_dentries)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE),
+      dlease_hits(dlease_hits), dlease_misses(dlease_misses), nr_dentries(nr_dentries) { }
+
+  // Append the counters to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH frame.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(dlease_hits, bl);
+    encode(dlease_misses, bl);
+    encode(nr_dentries, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read the counters back in the order encode() wrote them.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(dlease_hits, iter);
+    decode(dlease_misses, iter);
+    decode(nr_dentries, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the uint64_t counters as unsigned values instead of narrowing them
+  // to a signed integer. nr_dentries is keyed "num_dentries".
+  void dump(Formatter *f) const {
+    f->dump_unsigned("dlease_hits", dlease_hits);
+    f->dump_unsigned("dlease_misses", dlease_misses);
+    f->dump_unsigned("num_dentries", nr_dentries);
+  }
+
+  // Write "dlease_hits: H dlease_misses: M num_dentries: N" to *out
+  // (no null check).
+  void print(std::ostream *out) const {
+    *out << "dlease_hits: " << dlease_hits << " "
+	 << "dlease_misses: " << dlease_misses << " "
+	 << "num_dentries: " << nr_dentries;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_OPENED_FILES: two unsigned counters
+// (opened_files, total_inodes).
+struct OpenedFilesPayload : public ClientMetricPayloadBase {
+  uint64_t opened_files = 0;
+  uint64_t total_inodes = 0;
+
+  OpenedFilesPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES) { }
+  OpenedFilesPayload(uint64_t opened_files, uint64_t total_inodes)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES),
+      opened_files(opened_files), total_inodes(total_inodes) { }
+
+  // Append both counters to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH frame.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(opened_files, bl);
+    encode(total_inodes, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read the counters back in the order encode() wrote them.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(opened_files, iter);
+    decode(total_inodes, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the uint64_t counters as unsigned values instead of narrowing them
+  // to a signed integer.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("opened_files", opened_files);
+    f->dump_unsigned("total_inodes", total_inodes);
+  }
+
+  // Write "opened_files: F total_inodes: T" to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "opened_files: " << opened_files << " "
+	 << "total_inodes: " << total_inodes;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_PINNED_ICAPS: two unsigned counters
+// (pinned_icaps, total_inodes).
+struct PinnedIcapsPayload : public ClientMetricPayloadBase {
+  uint64_t pinned_icaps = 0;
+  uint64_t total_inodes = 0;
+
+  PinnedIcapsPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS) { }
+  PinnedIcapsPayload(uint64_t pinned_icaps, uint64_t total_inodes)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS),
+      pinned_icaps(pinned_icaps), total_inodes(total_inodes) { }
+
+  // Append both counters to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH frame.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(pinned_icaps, bl);
+    encode(total_inodes, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read the counters back in the order encode() wrote them.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(pinned_icaps, iter);
+    decode(total_inodes, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the uint64_t counters as unsigned values instead of narrowing them
+  // to a signed integer.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("pinned_icaps", pinned_icaps);
+    f->dump_unsigned("total_inodes", total_inodes);
+  }
+
+  // Write "pinned_icaps: P total_inodes: T" to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "pinned_icaps: " << pinned_icaps << " "
+	 << "total_inodes: " << total_inodes;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_OPENED_INODES: two unsigned counters
+// (opened_inodes, total_inodes).
+struct OpenedInodesPayload : public ClientMetricPayloadBase {
+  uint64_t opened_inodes = 0;
+  uint64_t total_inodes = 0;
+
+  OpenedInodesPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES) { }
+  OpenedInodesPayload(uint64_t opened_inodes, uint64_t total_inodes)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES),
+      opened_inodes(opened_inodes), total_inodes(total_inodes) { }
+
+  // Append both counters to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH frame.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(opened_inodes, bl);
+    encode(total_inodes, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read the counters back in the order encode() wrote them.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(opened_inodes, iter);
+    decode(total_inodes, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the uint64_t counters as unsigned values instead of narrowing them
+  // to a signed integer.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("opened_inodes", opened_inodes);
+    f->dump_unsigned("total_inodes", total_inodes);
+  }
+
+  // Write "opened_inodes: O total_inodes: T" to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "opened_inodes: " << opened_inodes << " "
+	 << "total_inodes: " << total_inodes;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_READ_IO_SIZES: two unsigned counters
+// (total_ops, total_size).
+struct ReadIoSizesPayload : public ClientMetricPayloadBase {
+  uint64_t total_ops = 0;
+  uint64_t total_size = 0;
+
+  ReadIoSizesPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES) { }
+  ReadIoSizesPayload(uint64_t total_ops, uint64_t total_size)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES),
+      total_ops(total_ops), total_size(total_size) { }
+
+  // Append both counters to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH frame.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(total_ops, bl);
+    encode(total_size, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read the counters back in the order encode() wrote them.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(total_ops, iter);
+    decode(total_size, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the uint64_t counters as unsigned values instead of narrowing them
+  // to a signed integer.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("total_ops", total_ops);
+    f->dump_unsigned("total_size", total_size);
+  }
+
+  // Write "total_ops: N total_size: S" to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "total_ops: " << total_ops << " total_size: " << total_size;
+  }
+};
+
+// Payload tagged CLIENT_METRIC_TYPE_WRITE_IO_SIZES: two unsigned counters
+// (total_ops, total_size).
+struct WriteIoSizesPayload : public ClientMetricPayloadBase {
+  uint64_t total_ops = 0;
+  uint64_t total_size = 0;
+
+  WriteIoSizesPayload()
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES) { }
+  WriteIoSizesPayload(uint64_t total_ops, uint64_t total_size)
+    : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES),
+      total_ops(total_ops), total_size(total_size) {
+  }
+
+  // Append both counters to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH frame.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(total_ops, bl);
+    encode(total_size, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Read the counters back in the order encode() wrote them.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(total_ops, iter);
+    decode(total_size, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Emit the uint64_t counters as unsigned values instead of narrowing them
+  // to a signed integer.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("total_ops", total_ops);
+    f->dump_unsigned("total_size", total_size);
+  }
+
+  // Write "total_ops: N total_size: S" to *out (no null check).
+  void print(std::ostream *out) const {
+    *out << "total_ops: " << total_ops << " total_size: " << total_size;
+  }
+};
+
+// Placeholder payload for metric types that have no dedicated payload
+// struct. It keeps only the type tag: encode(), dump() and print() emit
+// nothing, and decode() skips over the encoded body.
+struct UnknownPayload : public ClientMetricPayloadBase {
+  // Default tag is -1 converted to ClientMetricType.
+  UnknownPayload()
+    : ClientMetricPayloadBase(static_cast<ClientMetricType>(-1)) { }
+  UnknownPayload(ClientMetricType metric_type)
+    : ClientMetricPayloadBase(metric_type) { }
+
+  // Intentionally writes nothing to bl.
+  void encode(bufferlist &bl) const {
+  }
+
+  // Consume one versioned struct from iter without interpreting its fields.
+  // NOTE(review): seek() is handed struct_len as-is — confirm whether it
+  // takes an absolute offset or a relative skip; DECODE_FINISH is presumably
+  // what finally positions the iterator at the end of the struct.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(254, iter);
+    iter.seek(struct_len);
+    DECODE_FINISH(iter);
+  }
+
+  // Nothing to dump.
+  void dump(Formatter *f) const {
+  }
+
+  // Nothing to print.
+  void print(std::ostream *out) const {
+  }
+};
+
+// Closed set of payload alternatives a ClientMetricMessage can hold.
+using ClientMetricPayload = boost::variant<CapInfoPayload,
+					    ReadLatencyPayload,
+					    WriteLatencyPayload,
+					    MetadataLatencyPayload,
+					    DentryLeasePayload,
+					    OpenedFilesPayload,
+					    PinnedIcapsPayload,
+					    OpenedInodesPayload,
+					    ReadIoSizesPayload,
+					    WriteIoSizesPayload,
+					    UnknownPayload>;
+
+// metric update message sent by clients
+//
+// Encoded form: a uint32_t metric type tag followed by the encoding of the
+// payload held in `payload`.
+struct ClientMetricMessage {
+public:
+  ClientMetricMessage(const ClientMetricPayload &payload = UnknownPayload())
+    : payload(payload) {
+  }
+
+  // Writes the payload's type tag as uint32_t, then the payload itself.
+  class EncodePayloadVisitor : public boost::static_visitor<void> {
+  public:
+    explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {
+    }
+
+    template <typename Payload>
+    inline void operator()(const Payload &payload) const {
+      using ceph::encode;
+      encode(static_cast<uint32_t>(payload.get_type()), m_bl);
+      payload.encode(m_bl);
+    }
+
+  private:
+    bufferlist &m_bl;
+  };
+
+  // Decodes the payload body in place; the type tag has already been
+  // consumed by ClientMetricMessage::decode().
+  class DecodePayloadVisitor : public boost::static_visitor<void> {
+  public:
+    DecodePayloadVisitor(bufferlist::const_iterator &iter) : m_iter(iter) {
+    }
+
+    template <typename Payload>
+    inline void operator()(Payload &payload) const {
+      payload.decode(m_iter);
+    }
+
+  private:
+    bufferlist::const_iterator &m_iter;
+  };
+
+  // Dumps the stringified metric type under "client_metric_type", then the
+  // payload's own fields.
+  class DumpPayloadVisitor : public boost::static_visitor<void> {
+  public:
+    explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {
+    }
+
+    template <typename Payload>
+    inline void operator()(const Payload &payload) const {
+      m_formatter->dump_string("client_metric_type", stringify(payload.get_type()));
+      payload.dump(m_formatter);
+    }
+
+  private:
+    Formatter *m_formatter;
+  };
+
+  // Prints "[client_metric_type: <TYPE> <payload fields>]".
+  class PrintPayloadVisitor : public boost::static_visitor<void> {
+  public:
+    explicit PrintPayloadVisitor(std::ostream *out) : _out(out) {
+    }
+
+    template <typename Payload>
+    inline void operator()(const Payload &payload) const {
+      *_out << "[client_metric_type: ";
+      payload.print_type(_out);
+      *_out << " ";
+      payload.print(_out);
+      *_out << "]";
+    }
+
+  private:
+    std::ostream *_out;
+  };
+
+  // Append the type tag and the payload encoding to bl.
+  void encode(bufferlist &bl) const {
+    boost::apply_visitor(EncodePayloadVisitor(bl), payload);
+  }
+
+  // Read the uint32_t type tag, reset `payload` to a default-constructed
+  // payload of the matching kind (UnknownPayload carrying the raw tag when
+  // no case matches), then let that payload decode its own body.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+
+    uint32_t metric_type;
+    decode(metric_type, iter);
+
+    switch (metric_type) {
+    case ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO:
+      payload = CapInfoPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY:
+      payload = ReadLatencyPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY:
+      payload = WriteLatencyPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY:
+      payload = MetadataLatencyPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE:
+      payload = DentryLeasePayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES:
+      payload = OpenedFilesPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS:
+      payload = PinnedIcapsPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES:
+      payload = OpenedInodesPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES:
+      payload = ReadIoSizesPayload();
+      break;
+    case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES:
+      payload = WriteIoSizesPayload();
+      break;
+    default:
+      payload = UnknownPayload(static_cast<ClientMetricType>(metric_type));
+      break;
+    }
+
+    boost::apply_visitor(DecodePayloadVisitor(iter), payload);
+  }
+
+  // Dump the metric type and payload fields through the formatter.
+  void dump(Formatter *f) const {
+    boost::apply_visitor(DumpPayloadVisitor(f), payload);
+  }
+
+  // Print the metric type and payload fields to *out (no null check).
+  void print(std::ostream *out) const {
+    boost::apply_visitor(PrintPayloadVisitor(out), payload);
+  }
+
+  ClientMetricPayload payload;
+};
+WRITE_CLASS_ENCODER(ClientMetricMessage);
+
+#endif // CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H
diff --git a/src/include/cmp.h b/src/include/cmp.h
new file mode 100644
index 000000000..79372fde5
--- /dev/null
+++ b/src/include/cmp.h
@@ -0,0 +1,205 @@
+#ifndef __CEPH_CMP_H
+#define __CEPH_CMP_H
+
+/*
+ * macros to define comparison operators for classes with small numbers of members.
+ */
+
+#define WRITE_EQ_OPERATORS_1(type, a) /* defines operator== and operator!= for `type`, comparing member a only */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a; \
+ }
+
+#define WRITE_CMP_OPERATORS_1(type, a) /* defines operator>, <, >= and <= for `type`, each forwarding to the same operator on member a */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a; \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a; \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a >= r.a; \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a <= r.a; \
+ }
+
+#define WRITE_EQ_OPERATORS_2(type, a, b) /* defines operator== (all listed members equal) and operator!= (any listed member differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b; \
+ }
+
+#define WRITE_CMP_OPERATORS_2(type, a, b) /* defines lexicographic operator>, <, >= and <= for `type`: member a decides, b breaks ties when a compares equal */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b)); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b)); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b >= r.b)); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b <= r.b)); \
+ }
+
+
+#define WRITE_EQ_OPERATORS_3(type, a, b, c) /* defines operator== (all listed members equal) and operator!= (any listed member differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c; \
+ }
+
+#define WRITE_CMP_OPERATORS_3(type, a, b, c) /* defines lexicographic operator>, <, >= and <= for `type` over members a, b, c in that order; a later member is consulted only when all earlier ones compare equal */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c)))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c)))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c >= r.c)))); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c <= r.c)))); \
+ }
+
+#define WRITE_EQ_OPERATORS_4(type, a, b, c, d) /* defines operator== (all listed members equal) and operator!= (any listed member differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d; \
+ }
+
+#define WRITE_CMP_OPERATORS_4(type, a, b, c, d) /* defines lexicographic operator>, <, >= and <= for `type` over members a, b, c, d in that order; a later member is consulted only when all earlier ones compare equal */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d)))))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d)))))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d >= r.d)))))); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d <= r.d)))))); \
+ }
+
+
+
+#define WRITE_EQ_OPERATORS_5(type, a, b, c, d, e) /* defines operator== (all listed members equal) and operator!= (any listed member differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e; \
+ }
+
+#define WRITE_CMP_OPERATORS_5(type, a, b, c, d, e) /* defines lexicographic operator>, <, >= and <= for `type` over members a..e in that order; a later member is consulted only when all earlier ones compare equal */ \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && l.e > r.e))))))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && (l.e < r.e)))))))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && l.e >= r.e))))))); \
+ } \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && l.e <= r.e))))))); \
+ }
+
+#define WRITE_EQ_OPERATORS_7(type, a, b, c, d, e, f, g) /* defines operator== (all listed members equal) and operator!= (any listed member differs) for `type` */ \
+ inline bool operator==(const type &l, const type &r) { \
+ return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e && l.f == r.f && l.g == r.g; \
+ } \
+ inline bool operator!=(const type &l, const type &r) { \
+ return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e || l.f != r.f || l.g != r.g; \
+ }
+#define WRITE_CMP_OPERATORS_7(type, a, b, c, d, e, f, g) /* defines lexicographic operator<=, >=, > and < for `type` over members a..g in that order; a later member is consulted only when all earlier ones compare equal */ \
+ inline bool operator<=(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && (l.e < r.e || \
+ (l.e == r.e && (l.f < r.f || \
+ (l.f == r.f && l.g <= r.g))))))))))); \
+ } \
+ inline bool operator>=(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && (l.e > r.e || \
+ (l.e == r.e && (l.f > r.f || \
+ (l.f == r.f && l.g >= r.g))))))))))); \
+ } \
+ inline bool operator>(const type &l, const type &r) { \
+ return l.a > r.a || \
+ (l.a == r.a && (l.b > r.b || \
+ (l.b == r.b && (l.c > r.c || \
+ (l.c == r.c && (l.d > r.d || \
+ (l.d == r.d && (l.e > r.e || \
+ (l.e == r.e && (l.f > r.f || \
+ (l.f == r.f && l.g > r.g))))))))))); \
+ } \
+ inline bool operator<(const type &l, const type &r) { \
+ return l.a < r.a || \
+ (l.a == r.a && (l.b < r.b || \
+ (l.b == r.b && (l.c < r.c || \
+ (l.c == r.c && (l.d < r.d || \
+ (l.d == r.d && (l.e < r.e || \
+ (l.e == r.e && (l.f < r.f || \
+ (l.f == r.f && l.g < r.g))))))))))); \
+ }
+#endif
diff --git a/src/include/color.h b/src/include/color.h
new file mode 100644
index 000000000..6c8df40e0
--- /dev/null
+++ b/src/include/color.h
@@ -0,0 +1,13 @@
+#ifndef CEPH_COLOR_H
+#define CEPH_COLOR_H
+
+#define TEXT_NORMAL "\033[0m" /* terminal escape sequence (ESC [ ... m): reset text attributes */
+/*#define TEXT_HAZARD "\033[5;31m"*/
+#define TEXT_RED "\033[0;31m" /* the colour macros below reset attributes (0) and then select a foreground colour code (31..36) */
+#define TEXT_GREEN "\033[0;32m"
+#define TEXT_YELLOW "\033[0;33m"
+#define TEXT_BLUE "\033[0;34m"
+#define TEXT_MAGENTA "\033[0;35m"
+#define TEXT_CYAN "\033[0;36m"
+
+#endif
diff --git a/src/include/common_fwd.h b/src/include/common_fwd.h
new file mode 100644
index 000000000..d906aadfa
--- /dev/null
+++ b/src/include/common_fwd.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN) // choose the top-level namespace that holds the common:: types declared below
+#define TOPNSPC crimson
+#else
+#define TOPNSPC ceph
+#endif
+
+namespace TOPNSPC::common { // forward declarations only; nothing is defined in this header
+ class CephContext;
+ class PerfCounters;
+ class PerfCountersBuilder;
+ class PerfCountersCollection;
+ class PerfCountersCollectionImpl;
+ class PerfGuard;
+ class RefCountedObject;
+ class RefCountedObjectSafe;
+ class RefCountedCond;
+ class RefCountedWaitObject;
+ class ConfigProxy;
+}
+using TOPNSPC::common::CephContext; // the using-declarations make each forward-declared name usable unqualified, whichever namespace TOPNSPC expands to
+using TOPNSPC::common::PerfCounters;
+using TOPNSPC::common::PerfCountersBuilder;
+using TOPNSPC::common::PerfCountersCollection;
+using TOPNSPC::common::PerfCountersCollectionImpl;
+using TOPNSPC::common::PerfGuard;
+using TOPNSPC::common::RefCountedObject;
+using TOPNSPC::common::RefCountedObjectSafe;
+using TOPNSPC::common::RefCountedCond;
+using TOPNSPC::common::RefCountedWaitObject;
+using TOPNSPC::common::ConfigProxy;
diff --git a/src/include/compact_map.h b/src/include/compact_map.h
new file mode 100644
index 000000000..21645e3d1
--- /dev/null
+++ b/src/include/compact_map.h
@@ -0,0 +1,383 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_MAP_H
+#define CEPH_COMPACT_MAP_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <map>
+#include <memory>
+
+#include "include/encoding.h"
+
+/**
+ * Lazily-allocated wrapper around an associative container.
+ *
+ * The wrapped Map lives behind a std::unique_ptr that stays null until
+ * something is stored, and is released again by clear(), by an erase()
+ * that removes the last element, or by decoding an empty map.
+ *
+ * Iterators pair a Map iterator with a pointer to the owning
+ * compact_map_base.  While the owner has no Map allocated, all iterators
+ * on that owner compare equal, so begin() == end() holds for an
+ * unallocated container.
+ */
+template <class Key, class T, class Map>
+class compact_map_base {
+protected:
+  // Underlying container; null until alloc_internal() creates it and null
+  // again after free_internal().
+  std::unique_ptr<Map> map;
+  // Create the underlying Map if it does not exist yet.
+  void alloc_internal() {
+    if (!map)
+      map.reset(new Map);
+  }
+  // Destroy the underlying Map together with all of its elements.
+  void free_internal() {
+    map.reset();
+  }
+  // Read-only iterator: a Map iterator of type It plus its owner.
+  template <class It>
+  class const_iterator_base {
+    const compact_map_base *map;
+    It it;
+    const_iterator_base() : map(0) { }
+    const_iterator_base(const compact_map_base* m) : map(m) { }
+    const_iterator_base(const compact_map_base *m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+    friend class iterator_base;
+  public:
+    const_iterator_base(const const_iterator_base& o) {
+      map = o.map;
+      it = o.it;
+    }
+    // Equal when both iterators share an owner and either that owner has
+    // no Map allocated or the wrapped iterators match.  The !map test
+    // keeps a comparison of two default-constructed iterators from
+    // dereferencing a null owner.
+    bool operator==(const const_iterator_base& o) const {
+      return (map == o.map) && (!map || !map->map || it == o.it);
+    }
+    bool operator!=(const const_iterator_base& o) const {
+      return !(*this == o);
+    }
+    const_iterator_base& operator=(const const_iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    const_iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    const_iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const std::pair<const Key,T>& operator*() {
+      return *it;
+    }
+    const std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+  };
+  // Mutable iterator: same layout as const_iterator_base, but dereferences
+  // to a modifiable value and converts to const_iterator_base<It>.
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_map_base* map;
+    It it;
+    iterator_base() : map(0) { }
+    iterator_base(compact_map_base* m) : map(m) { }
+    iterator_base(compact_map_base* m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+  public:
+    iterator_base(const iterator_base& o) {
+      map = o.map;
+      it = o.it;
+    }
+    // Same equality rule as const_iterator_base::operator==, including the
+    // guard against a null owner.
+    bool operator==(const iterator_base& o) const {
+      return (map == o.map) && (!map || !map->map || it == o.it);
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    std::pair<const Key,T>& operator*() {
+      return *it;
+    }
+    std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+    operator const_iterator_base<It>() const {
+      return const_iterator_base<It>(map, it);
+    }
+  };
+
+public:
+  class iterator : public iterator_base<typename Map::iterator> {
+  public:
+    iterator() { }
+    iterator(const iterator_base<typename Map::iterator>& o)
+      : iterator_base<typename Map::iterator>(o) { }
+    iterator(compact_map_base* m) : iterator_base<typename Map::iterator>(m) { }
+    iterator(compact_map_base* m, const typename Map::iterator& i)
+      : iterator_base<typename Map::iterator>(m, i) { }
+  };
+  class const_iterator : public const_iterator_base<typename Map::const_iterator> {
+  public:
+    const_iterator() { }
+    const_iterator(const iterator_base<typename Map::const_iterator>& o)
+      : const_iterator_base<typename Map::const_iterator>(o) { }
+    const_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_iterator>(m) { }
+    const_iterator(const compact_map_base* m, const typename Map::const_iterator& i)
+      : const_iterator_base<typename Map::const_iterator>(m, i) { }
+  };
+  class reverse_iterator : public iterator_base<typename Map::reverse_iterator> {
+  public:
+    reverse_iterator() { }
+    reverse_iterator(const iterator_base<typename Map::reverse_iterator>& o)
+      : iterator_base<typename Map::reverse_iterator>(o) { }
+    reverse_iterator(compact_map_base* m) : iterator_base<typename Map::reverse_iterator>(m) { }
+    reverse_iterator(compact_map_base* m, const typename Map::reverse_iterator& i)
+      : iterator_base<typename Map::reverse_iterator>(m, i) { }
+  };
+  class const_reverse_iterator : public const_iterator_base<typename Map::const_reverse_iterator> {
+  public:
+    const_reverse_iterator() { }
+    // Initialise the actual base class (const_iterator_base); the mutable
+    // iterator_base argument is converted through its conversion operator,
+    // mirroring const_iterator above.
+    const_reverse_iterator(const iterator_base<typename Map::const_reverse_iterator>& o)
+      : const_iterator_base<typename Map::const_reverse_iterator>(o) { }
+    const_reverse_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_reverse_iterator>(m) { }
+    const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i)
+      : const_iterator_base<typename Map::const_reverse_iterator>(m, i) { }
+  };
+  // Deep copy: the Map is duplicated only when the source has one.
+  compact_map_base(const compact_map_base& o) {
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    }
+  }
+  compact_map_base() {}
+  ~compact_map_base() {}
+
+  bool empty() const {
+    return !map || map->empty();
+  }
+  size_t size() const {
+    return map ? map->size() : 0;
+  }
+  // Two empty containers are equal whether or not a Map is allocated.
+  bool operator==(const compact_map_base& o) const {
+    return (empty() && o.empty()) || (map && o.map && *map == *o.map);
+  }
+  bool operator!=(const compact_map_base& o) const {
+    return !(*this == o);
+  }
+  size_t count (const Key& k) const {
+    return map ? map->count(k) : 0;
+  }
+  // Erase the element at p (which must belong to this container) and
+  // return the iterator following it.  Removing the last element frees
+  // the Map, in which case the returned iterator carries only the owner.
+  iterator erase (iterator p) {
+    if (map) {
+      ceph_assert(this == p.map);
+      auto it = map->erase(p.it);
+      if (map->empty()) {
+        free_internal();
+        return iterator(this);
+      } else {
+        return iterator(this, it);
+      }
+    } else {
+      return iterator(this);
+    }
+  }
+  // Erase by key; returns the number of elements removed and frees the
+  // Map when it becomes empty.
+  size_t erase (const Key& k) {
+    if (!map)
+      return 0;
+    size_t r = map->erase(k);
+    if (map->empty())
+      free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_map_base& o) {
+    map.swap(o.map);
+  }
+  compact_map_base& operator=(const compact_map_base& o) {
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    } else
+      free_internal();
+    return *this;
+  }
+  // NOTE(review): this compiles only when Map::insert(value) returns an
+  // iterator (e.g. std::multimap); std::map::insert returns a
+  // std::pair<iterator,bool>.  Use emplace() for unique-key maps.
+  iterator insert(const std::pair<const Key, T>& val) {
+    alloc_internal();
+    return iterator(this, map->insert(val));
+  }
+  template <class... Args>
+  std::pair<iterator,bool> emplace ( Args&&... args ) {
+    alloc_internal();
+    auto em = map->emplace(std::forward<Args>(args)...);
+    return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+  }
+  // The accessors below return an owner-only iterator while no Map is
+  // allocated; such iterators all compare equal to each other.
+  iterator begin() {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->begin());
+  }
+  iterator end() {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->end());
+  }
+  reverse_iterator rbegin() {
+    if (!map)
+      return reverse_iterator(this);
+    return reverse_iterator(this, map->rbegin());
+  }
+  reverse_iterator rend() {
+    if (!map)
+      return reverse_iterator(this);
+    return reverse_iterator(this, map->rend());
+  }
+  iterator find(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->find(k));
+  }
+  iterator lower_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->lower_bound(k));
+  }
+  iterator upper_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->upper_bound(k));
+  }
+  const_iterator begin() const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->begin());
+  }
+  const_iterator end() const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->end());
+  }
+  const_reverse_iterator rbegin() const {
+    if (!map)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, map->rbegin());
+  }
+  const_reverse_iterator rend() const {
+    if (!map)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, map->rend());
+  }
+  const_iterator find(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->find(k));
+  }
+  const_iterator lower_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->lower_bound(k));
+  }
+  const_iterator upper_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->upper_bound(k));
+  }
+  // Encode the wrapped Map, or a lone uint32_t 0 when none is allocated.
+  void encode(ceph::buffer::list &bl) const {
+    using ceph::encode;
+    if (map)
+      encode(*map, bl);
+    else
+      encode((uint32_t)0, bl);
+  }
+  // Feature-aware variant; `features` is forwarded to the Map encoder.
+  void encode(ceph::buffer::list &bl, uint64_t features) const {
+    using ceph::encode;
+    if (map)
+      encode(*map, bl, features);
+    else
+      encode((uint32_t)0, bl);
+  }
+  // Read a uint32_t count first; a zero count frees the Map, otherwise
+  // the entries are decoded into it with decode_nohead().
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    using ceph::decode_nohead;
+    uint32_t n;
+    decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      decode_nohead(n, *map, p);
+    } else
+      free_internal();
+  }
+};
+
+// Free-function encode hook: forwards to compact_map_base::encode(bl).
+template <typename Key, typename T, typename Map>
+inline void encode(const compact_map_base<Key, T, Map>& cm,
+                   ceph::buffer::list& out)
+{
+  cm.encode(out);
+}
+// Feature-aware encode hook: forwards to
+// compact_map_base::encode(bl, features).
+template <typename Key, typename T, typename Map>
+inline void encode(const compact_map_base<Key, T, Map>& cm,
+                   ceph::buffer::list& out,
+                   uint64_t features)
+{
+  cm.encode(out, features);
+}
+// Free-function decode hook: forwards to compact_map_base::decode(p).
+template <typename Key, typename T, typename Map>
+inline void decode(compact_map_base<Key, T, Map>& cm,
+                   ceph::buffer::list::const_iterator& pos)
+{
+  cm.decode(pos);
+}
+
+// compact_map_base specialised for std::map, with operator[] added.
+template <typename Key, typename T, typename Compare = std::less<Key>,
+          typename Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_map : public compact_map_base<Key, T, std::map<Key,T,Compare,Alloc> > {
+public:
+  // Allocates the underlying std::map on demand, then defers to its
+  // operator[].
+  T& operator[](const Key& k) {
+    this->alloc_internal();
+    auto& underlying = *this->map;
+    return underlying[k];
+  }
+};
+
+// Stream a compact_map as "{k1=v1,k2=v2,...}"; an empty map prints "{}".
+template <typename Key, typename T, typename Compare = std::less<Key>,
+          typename Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T, Compare, Alloc>& m)
+{
+  out << "{";
+  auto cur = m.begin();
+  auto last = m.end();
+  if (cur != last) {
+    // First entry has no leading separator; every later one does.
+    out << (*cur).first << "=" << (*cur).second;
+    for (++cur; cur != last; ++cur) {
+      out << ",";
+      out << (*cur).first << "=" << (*cur).second;
+    }
+  }
+  out << "}";
+  return out;
+}
+
+// compact_map_base specialised for std::multimap; adds nothing of its own.
+template <typename Key, typename T, typename Compare = std::less<Key>,
+          typename Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_multimap
+  : public compact_map_base<Key, T, std::multimap<Key,T,Compare,Alloc> >
+{
+};
+
+// Stream a compact_multimap as "{{k1=v1,k2=v2,...}}"; empty prints "{{}}".
+template <typename Key, typename T, typename Compare = std::less<Key>,
+          typename Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T, Compare, Alloc>& m)
+{
+  out << "{{";
+  auto cur = m.begin();
+  auto last = m.end();
+  if (cur != last) {
+    // First entry has no leading separator; every later one does.
+    out << (*cur).first << "=" << (*cur).second;
+    for (++cur; cur != last; ++cur) {
+      out << ",";
+      out << (*cur).first << "=" << (*cur).second;
+    }
+  }
+  out << "}}";
+  return out;
+}
+#endif
diff --git a/src/include/compact_set.h b/src/include/compact_set.h
new file mode 100644
index 000000000..a364fd8c4
--- /dev/null
+++ b/src/include/compact_set.h
@@ -0,0 +1,305 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_SET_H
+#define CEPH_COMPACT_SET_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <memory>
+#include <set>
+
+/**
+ * Lazily-allocated wrapper around a set-like container.
+ *
+ * The wrapped Set lives behind a std::unique_ptr that stays null until
+ * something is stored, and is released again by clear(), by an erase()
+ * that removes the last element, or by decoding an empty set.
+ *
+ * Iterators pair a Set iterator with a pointer to the owning
+ * compact_set_base.  While the owner has no Set allocated, all iterators
+ * on that owner compare equal, so begin() == end() holds for an
+ * unallocated container.
+ */
+template <class T, class Set>
+class compact_set_base {
+protected:
+  // Underlying container; null until alloc_internal() creates it and null
+  // again after free_internal().
+  std::unique_ptr<Set> set;
+  // Create the underlying Set if it does not exist yet.
+  void alloc_internal() {
+    if (!set)
+      set.reset(new Set);
+  }
+  // Destroy the underlying Set together with all of its elements.
+  void free_internal() {
+    set.reset();
+  }
+  // Common iterator implementation: a Set iterator of type It plus its
+  // owner.  Elements are only ever exposed as const T&.
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_set_base* set;
+    It it;
+    iterator_base() : set(0) { }
+    iterator_base(const compact_set_base* s) : set(s) { }
+    iterator_base(const compact_set_base* s, const It& i) : set(s), it(i) { }
+    friend class compact_set_base;
+  public:
+    iterator_base(const iterator_base& o) {
+      set = o.set;
+      it = o.it;
+    }
+    // Equal when both iterators share an owner and either that owner has
+    // no Set allocated or the wrapped iterators match.  The !set test
+    // keeps a comparison of two default-constructed iterators from
+    // dereferencing a null owner.
+    bool operator==(const iterator_base& o) const {
+      return (set == o.set) && (!set || !set->set || it == o.it);
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    // Rebind this iterator to o's owner and position.  (Copies the owner
+    // pointer itself; it must not write through it into the container.)
+    iterator_base& operator=(const iterator_base& o) {
+      set = o.set;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const T& operator*() {
+      return *it;
+    }
+  };
+public:
+  class const_iterator : public iterator_base<typename Set::const_iterator> {
+  public:
+    const_iterator() { }
+    const_iterator(const iterator_base<typename Set::const_iterator>& o)
+      : iterator_base<typename Set::const_iterator>(o) { }
+    const_iterator(const compact_set_base* s) : iterator_base<typename Set::const_iterator>(s) { }
+    const_iterator(const compact_set_base* s, const typename Set::const_iterator& i)
+      : iterator_base<typename Set::const_iterator>(s, i) { }
+  };
+  class iterator : public iterator_base<typename Set::iterator> {
+  public:
+    iterator() { }
+    iterator(const iterator_base<typename Set::iterator>& o)
+      : iterator_base<typename Set::iterator>(o) { }
+    iterator(compact_set_base* s) : iterator_base<typename Set::iterator>(s) { }
+    iterator(compact_set_base* s, const typename Set::iterator& i)
+      : iterator_base<typename Set::iterator>(s, i) { }
+    operator const_iterator() const {
+      return const_iterator(this->set, this->it);
+    }
+  };
+  class const_reverse_iterator : public iterator_base<typename Set::const_reverse_iterator> {
+  public:
+    const_reverse_iterator() { }
+    const_reverse_iterator(const iterator_base<typename Set::const_reverse_iterator>& o)
+      : iterator_base<typename Set::const_reverse_iterator>(o) { }
+    const_reverse_iterator(const compact_set_base* s) : iterator_base<typename Set::const_reverse_iterator>(s) { }
+    const_reverse_iterator(const compact_set_base* s, const typename Set::const_reverse_iterator& i)
+      : iterator_base<typename Set::const_reverse_iterator>(s, i) { }
+  };
+  class reverse_iterator : public iterator_base<typename Set::reverse_iterator> {
+  public:
+    reverse_iterator() { }
+    reverse_iterator(const iterator_base<typename Set::reverse_iterator>& o)
+      : iterator_base<typename Set::reverse_iterator>(o) { }
+    reverse_iterator(compact_set_base* s) : iterator_base<typename Set::reverse_iterator>(s) { }
+    reverse_iterator(compact_set_base* s, const typename Set::reverse_iterator& i)
+      : iterator_base<typename Set::reverse_iterator>(s, i) { }
+    // A reverse iterator converts to its const *reverse* counterpart; a
+    // Set::reverse_iterator cannot initialise a Set::const_iterator.
+    operator const_reverse_iterator() const {
+      return const_reverse_iterator(this->set, this->it);
+    }
+  };
+
+  compact_set_base() {}
+  // Deep copy: the Set is duplicated only when the source has one.
+  compact_set_base(const compact_set_base& o) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    }
+  }
+  ~compact_set_base() {}
+
+
+  bool empty() const {
+    return !set || set->empty();
+  }
+  size_t size() const {
+    return set ? set->size() : 0;
+  }
+  // Two empty containers are equal whether or not a Set is allocated.
+  bool operator==(const compact_set_base& o) const {
+    return (empty() && o.empty()) || (set && o.set && *set == *o.set);
+  }
+  bool operator!=(const compact_set_base& o) const {
+    return !(*this == o);
+  }
+  size_t count(const T& t) const {
+    return set ? set->count(t) : 0;
+  }
+  // Erase the element at p (which must belong to this container) and
+  // return the iterator following it.  Removing the last element frees
+  // the Set, in which case the returned iterator carries only the owner.
+  iterator erase (iterator p) {
+    if (set) {
+      ceph_assert(this == p.set);
+      auto it = set->erase(p.it);
+      if (set->empty()) {
+        free_internal();
+        return iterator(this);
+      } else {
+        return iterator(this, it);
+      }
+    } else {
+      return iterator(this);
+    }
+  }
+  // Erase by value; returns the number of elements removed and frees the
+  // Set when it becomes empty.
+  size_t erase (const T& t) {
+    if (!set)
+      return 0;
+    size_t r = set->erase(t);
+    if (set->empty())
+      free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_set_base& o) {
+    set.swap(o.set);
+  }
+  compact_set_base& operator=(const compact_set_base& o) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    } else
+      free_internal();
+    return *this;
+  }
+  std::pair<iterator,bool> insert(const T& t) {
+    alloc_internal();
+    std::pair<typename Set::iterator,bool> r = set->insert(t);
+    return std::make_pair(iterator(this, r.first), r.second);
+  }
+  template <class... Args>
+  std::pair<iterator,bool> emplace ( Args&&... args ) {
+    alloc_internal();
+    auto em = set->emplace(std::forward<Args>(args)...);
+    return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+  }
+
+  // The accessors below return an owner-only iterator while no Set is
+  // allocated; such iterators all compare equal to each other.
+  iterator begin() {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->begin());
+  }
+  iterator end() {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->end());
+  }
+  reverse_iterator rbegin() {
+    if (!set)
+      return reverse_iterator(this);
+    return reverse_iterator(this, set->rbegin());
+  }
+  reverse_iterator rend() {
+    if (!set)
+      return reverse_iterator(this);
+    return reverse_iterator(this, set->rend());
+  }
+  iterator find(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->find(t));
+  }
+  iterator lower_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->lower_bound(t));
+  }
+  iterator upper_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->upper_bound(t));
+  }
+  const_iterator begin() const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->begin());
+  }
+  const_iterator end() const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->end());
+  }
+  const_reverse_iterator rbegin() const {
+    if (!set)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, set->rbegin());
+  }
+  const_reverse_iterator rend() const {
+    if (!set)
+      return const_reverse_iterator(this);
+    return const_reverse_iterator(this, set->rend());
+  }
+  const_iterator find(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->find(t));
+  }
+  const_iterator lower_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->lower_bound(t));
+  }
+  const_iterator upper_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->upper_bound(t));
+  }
+  // Encode the wrapped Set, or a lone uint32_t 0 when none is allocated.
+  void encode(ceph::buffer::list &bl) const {
+    using ceph::encode;
+    if (set)
+      encode(*set, bl);
+    else
+      encode((uint32_t)0, bl);
+  }
+  // Read a uint32_t count first; a zero count frees the Set, otherwise
+  // the entries are decoded into it with decode_nohead().
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    uint32_t n;
+    decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      ceph::decode_nohead(n, *set, p);
+    } else
+      free_internal();
+  }
+};
+
+// Free-function encode hook: forwards to compact_set_base::encode(bl).
+template <typename T, typename Set>
+inline void encode(const compact_set_base<T, Set>& cs,
+                   ceph::buffer::list& out)
+{
+  cs.encode(out);
+}
+// Free-function decode hook: forwards to compact_set_base::decode(p).
+template <typename T, typename Set>
+inline void decode(compact_set_base<T, Set>& cs,
+                   ceph::buffer::list::const_iterator& pos)
+{
+  cs.decode(pos);
+}
+
+// compact_set_base specialised for std::set; adds nothing of its own.
+template <typename T, typename Compare = std::less<T>,
+          typename Alloc = std::allocator<T> >
+class compact_set
+  : public compact_set_base<T, std::set<T, Compare, Alloc> >
+{
+};
+
+// Stream a compact_set as "v1,v2,..." with no surrounding brackets; an
+// empty set prints nothing.
+template <typename T, typename Compare = std::less<T>,
+          typename Alloc = std::allocator<T> >
+inline std::ostream& operator<<(std::ostream& out, const compact_set<T,Compare,Alloc>& s)
+{
+  auto cur = s.begin();
+  auto last = s.end();
+  if (cur != last) {
+    // First element has no leading separator; every later one does.
+    out << *cur;
+    for (++cur; cur != last; ++cur) {
+      out << ",";
+      out << *cur;
+    }
+  }
+  return out;
+}
+#endif
diff --git a/src/include/compat.h b/src/include/compat.h
new file mode 100644
index 000000000..753741295
--- /dev/null
+++ b/src/include/compat.h
@@ -0,0 +1,401 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_COMPAT_H
+#define CEPH_COMPAT_H
+
+#include "acconfig.h"
+#include <sys/types.h>
+#include <errno.h>
+#include <unistd.h>
+
+#if defined(__linux__)
+#define PROCPREFIX
+#endif
+
+#include <sys/stat.h>
+#ifndef ACCESSPERMS
+#define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO)
+#endif
+
+#if defined(__FreeBSD__)
+
+// FreeBSD supports Linux procfs with its compatibility module
+// And all compatibility stuff is standard mounted on this
+#define PROCPREFIX "/compat/linux"
+
+#ifndef MSG_MORE
+#define MSG_MORE 0
+#endif
+
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+/* And include the extra required include file */
+#include <pthread_np.h>
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#define cpu_set_t cpuset_t
+int sched_setaffinity(pid_t pid, size_t cpusetsize,
+ cpu_set_t *mask);
+
+#endif /* __FreeBSD__ */
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+/* Make sure that ENODATA is defined in the correct way */
+#ifdef ENODATA
+#if (ENODATA == 9919)
+// #warning ENODATA already defined to be 9919, redefining to fix
+// Silencing this warning because it fires at all files where compat.h
+// is included after boost files.
+//
+// This value stems from the definition in the boost library
+// And when this case occurs it is due to the fact that boost files
+// are included before this file. Redefinition might not help in this
+// case since already parsed code has evaluated to the wrong value.
+// This would warrant a definition that would actually be evaluated
+// at the location of usage and report a possible conflict.
+// This is left up to a future improvement
+#elif (ENODATA != 87)
+// #warning ENODATA already defined to a value different from 87 (ENOATTR), redefining to fix
+#endif
+#undef ENODATA
+#endif
+#define ENODATA ENOATTR
+
+// Fix clock accuracy
+#if !defined(CLOCK_MONOTONIC_COARSE)
+#if defined(CLOCK_MONOTONIC_FAST)
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST
+#else
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+#endif
+#endif
+#if !defined(CLOCK_REALTIME_COARSE)
+#if defined(CLOCK_REALTIME_FAST)
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME_FAST
+#else
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#endif
+#endif
+
+/* get PATH_MAX */
+#include <limits.h>
+
+#ifndef EUCLEAN
+#define EUCLEAN 117
+#endif
+#ifndef EREMOTEIO
+#define EREMOTEIO 121
+#endif
+#ifndef EKEYREJECTED
+#define EKEYREJECTED 129
+#endif
+#ifndef XATTR_CREATE
+#define XATTR_CREATE 1
+#endif
+
+#endif /* __APPLE__ || __FreeBSD__ */
+
+#ifndef HOST_NAME_MAX
+#ifdef MAXHOSTNAMELEN
+#define HOST_NAME_MAX MAXHOSTNAMELEN
+#else
+#define HOST_NAME_MAX 255
+#endif
+#endif /* HOST_NAME_MAX */
+
+/* O_LARGEFILE is not defined/required on OSX/FreeBSD */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/* Could be relevant for other platforms */
+#ifndef ERESTART
+#define ERESTART EINTR
+#endif
+
+#ifndef TEMP_FAILURE_RETRY
+/* Fallback used only when TEMP_FAILURE_RETRY is not already defined:
+ * evaluate `expression` again and again for as long as it yields -1 with
+ * errno == EINTR, then produce the last result as the value of the macro.
+ * Relies on the ({ ... }) statement-expression and __typeof compiler
+ * extensions. */
+#define TEMP_FAILURE_RETRY(expression) ({ \
+ __typeof(expression) __result; \
+ do { \
+ __result = (expression); \
+ } while (__result == -1 && errno == EINTR); \
+ __result; })
+#endif
+
+/* TEMP_FAILURE_RETRY with the result explicitly discarded: a static_cast
+ * to void in C++, a (void) cast wrapped in do { } while (0) in C. */
+#ifdef __cplusplus
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+ static_cast<void>(TEMP_FAILURE_RETRY(expression))
+#else
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+ do { (void)TEMP_FAILURE_RETRY(expression); } while (0)
+#endif
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#define lseek64(fd, offset, whence) lseek(fd, offset, whence)
+#endif
+
+/* Definitions supplied here for Solaris (__sun) and AIX builds. */
+#if defined(__sun) || defined(_AIX)
+/* syslog facility codes */
+#define LOG_AUTHPRIV (10<<3)
+#define LOG_FTP (11<<3)
+/* Turn the argument's tokens into a string literal.  The stringizing
+ * operator is required: a parameter name inside a quoted string is not
+ * substituted, so "x" would always expand to the literal "x". */
+#define __STRING(x) #x
+#endif
+
+#if defined(__sun) || defined(_AIX) || defined(_WIN32)
+#define IFTODT(mode) (((mode) & 0170000) >> 12)
+#endif
+
+#if defined(_AIX)
+#define MSG_DONTWAIT MSG_NONBLOCK
+#endif
+
+/* ceph_pthread_setname(thread, name): set a thread's name using whichever
+ * primitive the build detected.  Every macro form below evaluates to an
+ * int. */
+#if defined(HAVE_PTHREAD_SETNAME_NP)
+ #if defined(__APPLE__)
+ /* Here pthread_setname_np is called with the name only, and only when
+  * `thread` is the calling thread; for any other thread the macro does
+  * nothing and yields 0. */
+ #define ceph_pthread_setname(thread, name) ({ \
+ int __result = 0; \
+ if (thread == pthread_self()) \
+ __result = pthread_setname_np(name); \
+ __result; })
+ #else
+ #define ceph_pthread_setname pthread_setname_np
+ #endif
+#elif defined(HAVE_PTHREAD_SET_NAME_NP)
+ /* Fix a small name diff and return 0 */
+ #define ceph_pthread_setname(thread, name) ({ \
+ pthread_set_name_np(thread, name); \
+ 0; })
+#else
+ /* compiler warning free success noop */
+ #define ceph_pthread_setname(thread, name) ({ \
+ int __i = 0; \
+ __i; })
+#endif
+
+/* ceph_pthread_getname(thread, name, len): fetch a thread's name using
+ * whichever primitive the build detected. */
+#if defined(HAVE_PTHREAD_GETNAME_NP)
+ #define ceph_pthread_getname pthread_getname_np
+#elif defined(HAVE_PTHREAD_GET_NAME_NP)
+ /* Same call under a slightly different name; the macro always yields 0. */
+ #define ceph_pthread_getname(thread, name, len) ({ \
+ pthread_get_name_np(thread, name, len); \
+ 0; })
+#else
+ /* compiler warning free success noop */
+ /* (stores an empty string in `name` when it is non-NULL and yields 0) */
+ #define ceph_pthread_getname(thread, name, len) ({ \
+ if (name != NULL) \
+ *name = '\0'; \
+ 0; })
+#endif
+
+int ceph_posix_fallocate(int fd, off_t offset, off_t len);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int pipe_cloexec(int pipefd[2], int flags);
+char *ceph_strerror_r(int errnum, char *buf, size_t buflen);
+unsigned get_page_size();
+// On success, returns the number of bytes written to the buffer. On
+// failure, returns -1.
+ssize_t get_self_exe_path(char* path, int buff_length);
+
+int ceph_memzero_s(void *dest, size_t destsz, size_t count);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if defined(_WIN32)
+
+#include "include/win32/winsock_compat.h"
+
+#include <windows.h>
+#include <time.h>
+
+#include "include/win32/win32_errno.h"
+#include "include/win32/fs_compat.h"
+
+// There are a few name collisions between Windows headers and Ceph.
+// Updating Ceph definitions would be the preferable fix in order to avoid
+// confusion, unless it requires too many changes, in which case we're going
+// to redefine Windows values by adding the "WIN32_" prefix.
+#define WIN32_DELETE 0x00010000L
+#undef DELETE
+
+#define WIN32_ERROR 0
+#undef ERROR
+
+#ifndef uint
+typedef unsigned int uint;
+#endif
+
+typedef _sigset_t sigset_t;
+
+typedef unsigned int uid_t;
+typedef unsigned int gid_t;
+
+typedef unsigned int blksize_t;
+typedef unsigned __int64 blkcnt_t;
+typedef unsigned short nlink_t;
+
+typedef long long loff_t;
+
+/* Windows stand-in for cpu_set_t: CPU_SETSIZE is the number of bits in a
+ * size_t, and the union stores one bit per CPU in a byte array of
+ * CPU_SETSIZE/8 bytes that shares storage with a size_t. */
+#define CPU_SETSIZE (sizeof(size_t)*8)
+
+typedef union
+{
+ char cpuset[CPU_SETSIZE/8];
+ size_t _align;
+} cpu_set_t;
+
+/* Buffer descriptor taken by the readv()/writev() prototypes declared
+ * further down in this Windows section: a base pointer and a length. */
+struct iovec {
+ void *iov_base;
+ size_t iov_len;
+};
+
+#define SHUT_RD SD_RECEIVE
+#define SHUT_WR SD_SEND
+#define SHUT_RDWR SD_BOTH
+
+#ifndef SIGINT
+#define SIGINT 2
+#endif
+
+#ifndef SIGKILL
+#define SIGKILL 9
+#endif
+
+#define IOV_MAX 1024
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ssize_t readv(int fd, const struct iovec *iov, int iov_cnt);
+ssize_t writev(int fd, const struct iovec *iov, int iov_cnt);
+
+int fsync(int fd);
+ssize_t pread(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
+
+long int lrand48(void);
+int random();
+
+int pipe(int pipefd[2]);
+
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+char *strptime(const char *s, const char *format, struct tm *tm);
+
+int chown(const char *path, uid_t owner, gid_t group);
+int fchown(int fd, uid_t owner, gid_t group);
+int lchown(const char *path, uid_t owner, gid_t group);
+int setenv(const char *name, const char *value, int overwrite);
+
+int geteuid();
+int getegid();
+int getuid();
+int getgid();
+
+#define unsetenv(name) _putenv_s(name, "")
+
+int win_socketpair(int socks[2]);
+
+#ifdef __MINGW32__
+extern _CRTIMP errno_t __cdecl _putenv_s(const char *_Name,const char *_Value);
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define htobe16(x) __builtin_bswap16(x)
+#define htole16(x) (x)
+#define be16toh(x) __builtin_bswap16(x)
+#define le16toh(x) (x)
+
+#define htobe32(x) __builtin_bswap32(x)
+#define htole32(x) (x)
+#define be32toh(x) __builtin_bswap32(x)
+#define le32toh(x) (x)
+
+#define htobe64(x) __builtin_bswap64(x)
+#define htole64(x) (x)
+#define be64toh(x) __builtin_bswap64(x)
+#define le64toh(x) (x)
+#endif // defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+
+#endif // __MINGW32__
+
+#ifdef __cplusplus
+}
+#endif
+
+#define compat_closesocket closesocket
+// Use "aligned_free" when freeing memory allocated using posix_memalign or
+// _aligned_malloc. Using "free" will crash.
+#define aligned_free(ptr) _aligned_free(ptr)
+
+// O_CLOEXEC is not defined on Windows. Since handles aren't inherited
+// with subprocesses unless explicitly requested, we'll define this
+// flag as a no-op.
+#define O_CLOEXEC 0
+#define SOCKOPT_VAL_TYPE char*
+
+#define DEV_NULL "nul"
+
+#else /* WIN32 */
+
+#define SOCKOPT_VAL_TYPE void*
+
+#define aligned_free(ptr) free(ptr)
+/* Non-Windows compat_closesocket(): hands the descriptor to close() and
+ * returns whatever close() returned. */
+static inline int compat_closesocket(int fildes)
+{
+  const int rc = close(fildes);
+  return rc;
+}
+
+#define DEV_NULL "/dev/null"
+
+#endif /* WIN32 */
+
+/* Supplies code to be run at startup time before invoking main().
+ * Use as:
+ *
+ * CEPH_CONSTRUCTOR(my_constructor) {
+ * ...some code...
+ * }
+ */
+#ifdef _MSC_VER
+/* MSVC variant: declare f, store a pointer to it (named f_) in the
+ * ".CRT$XCU" section, then open f's definition so the user-supplied body
+ * follows the macro.
+ * NOTE(review): presumably the CRT invokes the pointers found in that
+ * section during startup -- confirm against the MSVC CRT documentation. */
+#pragma section(".CRT$XCU",read)
+#define CEPH_CONSTRUCTOR(f) \
+ static void __cdecl f(void); \
+ __declspec(allocate(".CRT$XCU")) static void (__cdecl*f##_)(void) = f; \
+ static void __cdecl f(void)
+#else
+/* Other compilers: declare f with __attribute__((constructor)), then open
+ * its definition so the user-supplied body follows the macro. */
+#define CEPH_CONSTRUCTOR(f) \
+ static void f(void) __attribute__((constructor)); \
+ static void f(void)
+#endif
+
+/* This should only be used with the socket API. */
+static inline int ceph_sock_errno() {
+#ifdef _WIN32
+ return wsae_to_errno(WSAGetLastError());
+#else
+ return errno;
+#endif
+}
+
+// Needed on Windows when handling binary files. Without it, line
+// endings will be replaced and certain characters can be treated as
+// EOF.
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+#endif /* !CEPH_COMPAT_H */
diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake
new file mode 100644
index 000000000..50c3f0bad
--- /dev/null
+++ b/src/include/config-h.in.cmake
@@ -0,0 +1,393 @@
+/* config.h file expanded by Cmake for build */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+/* fallocate(2) is supported */
+#cmakedefine CEPH_HAVE_FALLOCATE
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#cmakedefine HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#cmakedefine HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `syncfs' function. */
+#cmakedefine HAVE_SYS_SYNCFS 1
+
+/* sync_file_range(2) is supported */
+#cmakedefine HAVE_SYNC_FILE_RANGE
+
+/* Define if you have mallinfo */
+#cmakedefine HAVE_MALLINFO
+
+/* Define to 1 if you have the `pwritev' function. */
+#cmakedefine HAVE_PWRITEV 1
+
+/* Define to 1 if you have the <sys/mount.h> header file. */
+#cmakedefine HAVE_SYS_MOUNT_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#cmakedefine HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#cmakedefine HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/vfs.h> header file. */
+#cmakedefine HAVE_SYS_VFS_H 1
+
+/* Define to 1 if you have the <execinfo.h> header file. */
+#cmakedefine HAVE_EXECINFO_H 1
+
+/* Define to 1 if the system has the type `__s16'. */
+#cmakedefine HAVE___S16 1
+
+/* Define to 1 if the system has the type `__s32'. */
+#cmakedefine HAVE___S32 1
+
+/* Define to 1 if the system has the type `__s64'. */
+#cmakedefine HAVE___S64 1
+
+/* Define to 1 if the system has the type `__s8'. */
+#cmakedefine HAVE___S8 1
+
+/* Define to 1 if the system has the type `__u16'. */
+#cmakedefine HAVE___U16 1
+
+/* Define to 1 if the system has the type `__u32'. */
+#cmakedefine HAVE___U32 1
+
+/* Define to 1 if the system has the type `__u64'. */
+#cmakedefine HAVE___U64 1
+
+/* Define to 1 if the system has the type `__u8'. */
+#cmakedefine HAVE___U8 1
+
+/* Define if the system has the type `in_addr_t' */
+#cmakedefine HAVE_IN_ADDR_T
+
+/* Define if you have res_nquery */
+#cmakedefine HAVE_RES_NQUERY
+
+/* Defined if you have LZ4 */
+#cmakedefine HAVE_LZ4
+
+/* Defined if you have BROTLI */
+#cmakedefine HAVE_BROTLI
+
+/* Defined if you have libaio */
+#cmakedefine HAVE_LIBAIO
+
+/* Defined if you have libzbd */
+#cmakedefine HAVE_LIBZBD
+
+/* Defined if you have liburing */
+#cmakedefine HAVE_LIBURING
+
+/* Defined if you have POSIX AIO */
+#cmakedefine HAVE_POSIXAIO
+
+/* Defined if OpenLDAP enabled */
+#cmakedefine HAVE_OPENLDAP
+
+/* Define if you have fuse */
+#cmakedefine HAVE_LIBFUSE
+
+/* Define version major */
+#define CEPH_FUSE_MAJOR_VERSION @FUSE_MAJOR_VERSION@
+
+/* Define version minor */
+#define CEPH_FUSE_MINOR_VERSION @FUSE_MINOR_VERSION@
+
+/* Define to 1 if you have libxfs */
+#cmakedefine HAVE_LIBXFS 1
+
+/* SPDK conditional compilation */
+#cmakedefine HAVE_SPDK
+
+/* DPDK conditional compilation */
+#cmakedefine HAVE_DPDK
+
+/* PMEM_DEVICE (OSD) conditional compilation */
+#cmakedefine HAVE_BLUESTORE_PMEM
+
+/* Defined if LevelDB supports bloom filters */
+#cmakedefine HAVE_LEVELDB_FILTER_POLICY
+
+/* Define if you have tcmalloc */
+#cmakedefine HAVE_LIBTCMALLOC
+#cmakedefine LIBTCMALLOC_MISSING_ALIGNED_ALLOC
+
+/* Define if have curl_multi_wait() */
+#cmakedefine HAVE_CURL_MULTI_WAIT 1
+
+/* AsyncMessenger RDMA conditional compilation */
+#cmakedefine HAVE_RDMA
+
+/* ibverbs experimental conditional compilation */
+#cmakedefine HAVE_IBV_EXP
+
+/* define if bluestore enabled */
+#cmakedefine WITH_BLUESTORE
+
+/* define if cephfs enabled */
+#cmakedefine WITH_CEPHFS
+
+/* define if GSSAPI/KRB5 enabled */
+#cmakedefine HAVE_GSSAPI
+
+/* define if rbd enabled */
+#cmakedefine WITH_RBD
+
+/* define if kernel rbd enabled */
+#cmakedefine WITH_KRBD
+
+/* define if key-value-store is enabled */
+#cmakedefine WITH_KVS
+
+/* define if radosgw enabled */
+#cmakedefine WITH_RADOSGW
+
+/* define if radosgw's FastCGI frontend enabled */
+#cmakedefine WITH_RADOSGW_FCGI_FRONTEND
+
+/* define if leveldb is enabled */
+#cmakedefine WITH_LEVELDB
+
+/* define if radosgw's beast frontend enabled */
+#cmakedefine WITH_RADOSGW_BEAST_FRONTEND
+
+/* define if radosgw has openssl support */
+#cmakedefine WITH_CURL_OPENSSL
+
+/* define if HAVE_THREAD_SAFE_RES_QUERY */
+#cmakedefine HAVE_THREAD_SAFE_RES_QUERY
+
+/* define if HAVE_REENTRANT_STRSIGNAL */
+#cmakedefine HAVE_REENTRANT_STRSIGNAL
+
+/* Define if you want to use LTTng */
+#cmakedefine WITH_LTTNG
+
+/* Define if you want to use Jaeger */
+#cmakedefine HAVE_JAEGER
+
+/* Define if you want to use EVENTTRACE */
+#cmakedefine WITH_EVENTTRACE
+
+/* Define if you want OSD function instrumentation */
+#cmakedefine WITH_OSD_INSTRUMENT_FUNCTIONS
+
+/* Define if you want to use Babeltrace */
+#cmakedefine WITH_BABELTRACE
+
+/* Define to 1 if you have the <babeltrace/babeltrace.h> header file. */
+#cmakedefine HAVE_BABELTRACE_BABELTRACE_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/events.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_EVENTS_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/iterator.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_ITERATOR_H 1
+
+/* Define to 1 if you have the <arpa/nameser_compat.h> header file. */
+#cmakedefine HAVE_ARPA_NAMESER_COMPAT_H 1
+
+/* FastCGI headers are in /usr/include/fastcgi */
+#cmakedefine FASTCGI_INCLUDE_DIR
+
+/* splice(2) is supported */
+#cmakedefine CEPH_HAVE_SPLICE
+
+/* Define if you want C_Gather debugging */
+#cmakedefine DEBUG_GATHER
+
+/* Define to 1 if you have the `getgrouplist' function. */
+#cmakedefine HAVE_GETGROUPLIST 1
+
+/* LTTng is disabled, so define this macro to be nothing. */
+#cmakedefine tracepoint
+
+/* Define to 1 if you have fdatasync. */
+#cmakedefine HAVE_FDATASYNC 1
+
+/* Define to 1 if you have the <valgrind/helgrind.h> header file. */
+#cmakedefine HAVE_VALGRIND_HELGRIND_H 1
+
+/* Define to 1 if you have the <sys/prctl.h> header file. */
+#cmakedefine HAVE_SYS_PRCTL_H 1
+
+/* Define to 1 if you have the <linux/types.h> header file. */
+#cmakedefine HAVE_LINUX_TYPES_H 1
+
+/* Define to 1 if you have the <linux/version.h> header file. */
+#cmakedefine HAVE_LINUX_VERSION_H 1
+
+/* Define to 1 if you have sched.h. */
+#cmakedefine HAVE_SCHED 1
+
+/* Define to 1 if you have sigdescr_np. */
+#cmakedefine HAVE_SIGDESCR_NP 1
+
+/* Support SSE (Streaming SIMD Extensions) instructions */
+#cmakedefine HAVE_SSE
+
+/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
+#cmakedefine HAVE_SSE2
+
+/* Define to 1 if you have the `pipe2' function. */
+#cmakedefine HAVE_PIPE2 1
+
+/* Support NEON instructions */
+#cmakedefine HAVE_NEON
+
+/* Define if you have pthread_spin_init */
+#cmakedefine HAVE_PTHREAD_SPINLOCK
+
+/* name_to_handle_at exists */
+#cmakedefine HAVE_NAME_TO_HANDLE_AT
+
+/* we have a recent nasm and are x86_64 */
+#cmakedefine HAVE_NASM_X64
+
+/* nasm can also build the isa-l:avx2 */
+#cmakedefine HAVE_NASM_X64_AVX2
+
+/* nasm can also build the isa-l:avx512 */
+#cmakedefine HAVE_NASM_X64_AVX512
+
+/* Define if isa-l is compiled for arm64 */
+#cmakedefine HAVE_ARMV8_SIMD
+
+/* Define to 1 if strerror_r returns char *. */
+#cmakedefine STRERROR_R_CHAR_P 1
+
+/* Defined if you have libzfs enabled */
+#cmakedefine HAVE_LIBZFS
+
+/* Define if the C compiler supports __func__ */
+#cmakedefine HAVE_FUNC
+
+/* Define if the C compiler supports __PRETTY_FUNCTION__ */
+#cmakedefine HAVE_PRETTY_FUNC
+
+/* Define if the C compiler supports __attribute__((__symver__ (".."))) */
+#cmakedefine HAVE_ATTR_SYMVER
+
+/* Define if the C compiler supports __asm__(".symver ..") */
+#cmakedefine HAVE_ASM_SYMVER
+
+/* Have eventfd extension. */
+#cmakedefine HAVE_EVENTFD
+
+/* Define if enabling coverage. */
+#cmakedefine ENABLE_COVERAGE
+
+/* Defined if you want pg ref debugging */
+#cmakedefine PG_DEBUG_REFS
+
+/* Support ARMv8 CRC instructions */
+#cmakedefine HAVE_ARMV8_CRC
+
+/* Support ARMv8 CRYPTO instructions */
+#cmakedefine HAVE_ARMV8_CRYPTO
+
+/* Support ARMv8 CRC and CRYPTO intrinsics */
+#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
+/* Define if you have struct stat.st_mtimespec.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIMESPEC_TV_NSEC
+
+/* Define if you have struct stat.st_mtim.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIM_TV_NSEC
+
+/* Define if compiler supports static_cast<> */
+#cmakedefine HAVE_STATIC_CAST
+
+/* Version number of package */
+#cmakedefine PROJECT_VERSION "@PROJECT_VERSION@"
+
+/* Defined if pthread_setname_np() is available */
+#cmakedefine HAVE_PTHREAD_SETNAME_NP 1
+
+/* Defined if pthread_rwlockattr_setkind_np() is available */
+#cmakedefine HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP
+
+/* Defined if blkin enabled */
+#cmakedefine WITH_BLKIN
+
+/* Defined if pthread_set_name_np() is available */
+#cmakedefine HAVE_PTHREAD_SET_NAME_NP
+
+/* Defined if pthread_getname_np() is available */
+#cmakedefine HAVE_PTHREAD_GETNAME_NP 1
+
+/* Support POWER8 instructions */
+#cmakedefine HAVE_POWER8
+
+/* Define if endian type is big endian */
+#cmakedefine CEPH_BIG_ENDIAN
+
+/* Define if endian type is little endian */
+#cmakedefine CEPH_LITTLE_ENDIAN
+
+#cmakedefine MGR_PYTHON_EXECUTABLE "@MGR_PYTHON_EXECUTABLE@"
+
+/* the default value of the "mgr_disabled_modules" option */
+#cmakedefine MGR_DISABLED_MODULES "@MGR_DISABLED_MODULES@"
+
+/* Define to 1 if you have the `getprogname' function. */
+#cmakedefine HAVE_GETPROGNAME 1
+
+/* Defined if getentropy() is available */
+#cmakedefine HAVE_GETENTROPY
+
+/* Defined if libradosstriper is enabled: */
+#cmakedefine WITH_LIBRADOSSTRIPER
+
+/* Defined if OpenSSL is available for the rgw beast frontend */
+#cmakedefine WITH_RADOSGW_BEAST_OPENSSL
+
+/* Defined if rabbitmq-c is available for rgw amqp push endpoint */
+#cmakedefine WITH_RADOSGW_AMQP_ENDPOINT
+
+/* Defined if librdkafka is available for rgw kafka push endpoint */
+#cmakedefine WITH_RADOSGW_KAFKA_ENDPOINT
+
+/* Defined if lua packages can be installed by radosgw */
+#cmakedefine WITH_RADOSGW_LUA_PACKAGES
+
+/* Defined if std::map::merge() is supported */
+#cmakedefine HAVE_STDLIB_MAP_SPLICING
+
+/* Defined if Intel QAT compress/decompress is supported */
+#cmakedefine HAVE_QATZIP
+
+/* Define if seastar is available. */
+#cmakedefine HAVE_SEASTAR
+
+/* Define if unit tests are built. */
+#cmakedefine UNIT_TESTS_BUILT
+
+/* Define if RBD QCOW migration format is enabled */
+#cmakedefine WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+
+/* Define if RWL is enabled */
+#cmakedefine WITH_RBD_RWL
+
+/* Define if PWL-SSD is enabled */
+#cmakedefine WITH_RBD_SSD_CACHE
+
+/* Define if libcryptsetup version < 2.0.5 */
+#cmakedefine LIBCRYPTSETUP_LEGACY_DATA_ALIGNMENT
+
+/* Define if libcryptsetup can be used (linux only) */
+#cmakedefine HAVE_LIBCRYPTSETUP
+
+/* Shared library extension, such as .so, .dll or .dylib */
+#cmakedefine CMAKE_SHARED_LIBRARY_SUFFIX "@CMAKE_SHARED_LIBRARY_SUFFIX@"
+
+/* libexec directory path */
+#cmakedefine CMAKE_INSTALL_LIBEXECDIR "@CMAKE_INSTALL_LIBEXECDIR@"
+
+#endif /* CONFIG_H */
diff --git a/src/include/coredumpctl.h b/src/include/coredumpctl.h
new file mode 100644
index 000000000..60b91e999
--- /dev/null
+++ b/src/include/coredumpctl.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <iostream>
+#include <sys/prctl.h>
+#include "common/errno.h"
+
+// Scoped guard around the process "dumpable" flag (prctl PR_GET_DUMPABLE /
+// PR_SET_DUMPABLE): the constructor switches the flag to new_state and the
+// destructor puts the previous value back if the switch succeeded.
+class PrCtl {
+  // Flag value to restore in the destructor; -1 means "nothing to restore".
+  int saved_state = -1;
+
+  // Returns the current dumpable flag, or -1 (after printing a warning) if
+  // it could not be read.
+  static int get_dumpable() {
+    int r = prctl(PR_GET_DUMPABLE);
+    if (r == -1) {
+      // errno is only needed for the message. The -1 must be returned as is:
+      // returning errno here would make the constructor mistake the error
+      // code for a valid flag value and later "restore" it.
+      const int err = errno;
+      std::cerr << "warning: unable to get dumpable flag: " << cpp_strerror(err)
+                << std::endl;
+    }
+    return r;
+  }
+
+  // Sets the dumpable flag. Returns 0 on success or -errno (after printing a
+  // warning) on failure.
+  static int set_dumpable(bool new_state) {
+    int r = prctl(PR_SET_DUMPABLE, new_state);
+    if (r) {
+      r = -errno;
+      std::cerr << "warning: unable to " << (new_state ? "set" : "unset")
+                << " dumpable flag: " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+public:
+  // new_state: desired value of the dumpable flag while this object lives.
+  PrCtl(int new_state = 0) {
+    int r = get_dumpable();
+    if (r == -1) {
+      return;
+    }
+    if (r != new_state) {
+      // Remember the old value only if the flag really was changed.
+      if (!set_dumpable(new_state)) {
+        saved_state = r;
+      }
+    }
+  }
+  ~PrCtl() {
+    if (saved_state < 0) {
+      return;
+    }
+    set_dumpable(saved_state);
+  }
+};
+
+#else
+#ifdef RLIMIT_CORE
+#include <sys/resource.h>
+#include <iostream>
+#include <sys/resource.h>
+#include "common/errno.h"
+
+// Scoped guard around the RLIMIT_CORE resource limit, used where prctl() is
+// not available: the constructor raises or zeroes the core-file size limit
+// according to new_state and the destructor restores the saved limit.
+class PrCtl {
+  // Limit read in the constructor; only meaningful when have_saved is true.
+  rlimit saved_lim{};
+  // True once getrlimit() succeeded, i.e. there is something to restore.
+  bool have_saved = false;
+
+  // Reads RLIMIT_CORE into *saved. Returns 0 on success, or -1 (after
+  // printing a warning) on failure.
+  static int get_dumpable(rlimit* saved) {
+    int r = getrlimit(RLIMIT_CORE, saved);
+    if (r) {
+      const int err = errno;
+      std::cerr << "warning: unable to getrlimit(): " << cpp_strerror(err)
+                << std::endl;
+      // Report failure with the value the constructor checks for.
+      r = -1;
+    }
+    return r;
+  }
+
+  // Applies rlim to RLIMIT_CORE; failures are reported but not propagated.
+  static void set_dumpable(const rlimit& rlim) {
+    int r = setrlimit(RLIMIT_CORE, &rlim);
+    if (r) {
+      r = -errno;
+      std::cerr << "warning: unable to setrlimit(): " << cpp_strerror(r)
+                << std::endl;
+    }
+  }
+public:
+  // new_state != 0: raise the soft limit to the hard limit.
+  // new_state == 0: set both the soft and the hard limit to zero.
+  PrCtl(int new_state = 0) {
+    if (get_dumpable(&saved_lim) == -1) {
+      return;
+    }
+    have_saved = true;
+    // Start from the current limits so rlim_max is never left uninitialized.
+    rlimit new_lim = saved_lim;
+    if (new_state) {
+      new_lim.rlim_cur = saved_lim.rlim_max;
+    } else {
+      // NOTE(review): zeroing the hard limit may not be reversible for an
+      // unprivileged process, in which case the destructor only warns.
+      new_lim.rlim_cur = new_lim.rlim_max = 0;
+    }
+    if (new_lim.rlim_cur == saved_lim.rlim_cur) {
+      return;
+    }
+    set_dumpable(new_lim);
+  }
+  ~PrCtl() {
+    // Never feed an unread (failed getrlimit) limit back into setrlimit().
+    if (have_saved) {
+      set_dumpable(saved_lim);
+    }
+  }
+};
+#else
+// No-op fallback used when neither <sys/prctl.h> nor RLIMIT_CORE is
+// available.
+struct PrCtl {
+  // A user-provided constructor silences the Wunused-variable warning for
+  // guard objects. The (ignored) state argument mirrors the constructors of
+  // the other PrCtl variants in this header so callers compile everywhere.
+  PrCtl(int = 0) {}
+};
+
+#endif // RLIMIT_CORE
+#endif
diff --git a/src/include/counter.h b/src/include/counter.h
new file mode 100644
index 000000000..61ed7409c
--- /dev/null
+++ b/src/include/counter.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COUNTER_H
+#define CEPH_COUNTER_H
+
+#include <atomic>
+
+// Instance counter. Every distinct T gets its own pair of static counters:
+// count() is the number of Counter<T> objects currently alive,
+// increments() the number ever constructed, and decrements() the number
+// destroyed so far.
+template <typename T>
+class Counter {
+public:
+  Counter() {
+    _count()++;
+    _increments()++;
+  }
+  Counter(const Counter &) {
+    _count()++;
+    _increments()++;
+  }
+  // A move-constructed object is a new live instance whose destructor will
+  // run (as will the moved-from object's), so it has to be counted too;
+  // otherwise count() underflows after any move.
+  Counter(Counter &&) noexcept {
+    _count()++;
+    _increments()++;
+  }
+  ~Counter() {
+    _count()--;
+  }
+  // Number of live instances.
+  static uint64_t count() {
+    return _count();
+  }
+  // Total number of constructions.
+  static uint64_t increments() {
+    return _increments();
+  }
+  // Total number of destructions, derived from the two counters above.
+  static uint64_t decrements() {
+    return increments()-count();
+  }
+
+private:
+  // Function-local statics give one counter per T without needing an
+  // out-of-class definition.
+  static std::atomic<uint64_t> &_count() {
+    static std::atomic<uint64_t> c{0};
+    return c;
+  }
+  static std::atomic<uint64_t> &_increments() {
+    static std::atomic<uint64_t> i{0};
+    return i;
+  }
+};
+
+#endif
diff --git a/src/include/cpp-btree/btree.h b/src/include/cpp-btree/btree.h
new file mode 100644
index 000000000..f00abc868
--- /dev/null
+++ b/src/include/cpp-btree/btree.h
@@ -0,0 +1,2565 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A btree implementation of the STL set and map interfaces. A btree is smaller
+// and generally also faster than STL set/map (refer to the benchmarks below).
+// The red-black tree implementation of STL set/map has an overhead of 3
+// pointers (left, right and parent) plus the node color information for each
+// stored value. So a set<int32_t> consumes 40 bytes for each value stored in
+// 64-bit mode. This btree implementation stores multiple values on fixed
+// size nodes (usually 256 bytes) and doesn't store child pointers for leaf
+// nodes. The result is that a btree_set<int32_t> may use much less memory per
+// stored value. For the random insertion benchmark in btree_bench.cc, a
+// btree_set<int32_t> with node-size of 256 uses 5.1 bytes per stored value.
+//
+// The packing of multiple values on to each node of a btree has another effect
+// besides better space utilization: better cache locality due to fewer cache
+// lines being accessed. Better cache locality translates into faster
+// operations.
+//
+// CAVEATS
+//
+// Insertions and deletions on a btree can cause splitting, merging or
+// rebalancing of btree nodes. And even without these operations, insertions
+// and deletions on a btree will move values around within a node. In both
+// cases, the result is that insertions and deletions can invalidate iterators
+// pointing to values other than the one being inserted/deleted. Therefore, this
+// container does not provide pointer stability. This is notably different from
+// STL set/map which takes care to not invalidate iterators on insert/erase
+// except, of course, for iterators pointing to the value being erased. A
+// partial workaround when erasing is available: erase() returns an iterator
+// pointing to the item just after the one that was erased (or end() if none
+// exists).
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <experimental/type_traits>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+namespace btree::internal {
+
+// Trait: true when calling Compare with two T arguments yields a signed
+// type (a three-way "compare-to" result) rather than e.g. bool.
+template <typename Compare, typename T>
+using btree_is_key_compare_to =
+  std::is_signed<typename std::invoke_result<Compare, T, T>::type>;
+
+// compare_to_t<T> is the result type of `t.compare(const T&)`;
+// has_compare_to<T> tells whether such a member call is well-formed.
+template<typename T>
+using compare_to_t = decltype(std::declval<T&>().compare(std::declval<const T&>()));
+template<typename T>
+inline constexpr bool has_compare_to = std::experimental::is_detected_v<compare_to_t, T>;
+
+// A helper class to convert a boolean comparison into a three-way "compare-to"
+// comparison that returns a negative value to indicate less-than, zero to
+// indicate equality and a positive value to indicate greater-than.
+//
+// The primary template leaves the comparator untouched. It is specialized
+// for std::less<K> when K offers a compare() member (e.g. std::string) and
+// when K is an arithmetic type, so that btree users automatically get the
+// more efficient compare-to code with these common key types.
+template <typename Compare, typename=void>
+struct key_compare_to_adapter {
+  using type = Compare;
+};
+
+// Keys with a three-way compare() member: forward to it.
+template <typename K>
+struct key_compare_to_adapter<std::less<K>, std::enable_if_t<has_compare_to<K>>>
+{
+  struct type {
+    inline int operator()(const K& lhs, const K& rhs) const noexcept {
+      return lhs.compare(rhs);
+    }
+  };
+};
+
+// Arithmetic keys (signed, unsigned and floating point).
+//
+// The result is computed with comparisons instead of `lhs - rhs`: a
+// subtraction overflows for distant signed values, and its result is
+// truncated when it is narrowed back to K or to int, either of which can
+// flip the sign (or yield 0) and thereby corrupt the ordering.
+template <typename K>
+struct key_compare_to_adapter<std::less<K>, std::enable_if_t<std::is_arithmetic_v<K>>>
+{
+  struct type {
+    inline int operator()(const K& lhs, const K& rhs) const noexcept {
+      if (lhs < rhs) {
+        return -1;
+      } else if (lhs > rhs) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  };
+};
+
+// Type parameters and compile-time constants shared by the set and map
+// parameter structures of the btree.
+template <typename Key, typename Compare, typename Alloc,
+          int TargetNodeSize, int ValueSize,
+          bool Multi>
+struct common_params {
+  using key_type = Key;
+  using allocator_type = Alloc;
+  using size_type = std::make_signed_t<size_t>;
+  using difference_type = ptrdiff_t;
+
+  // The comparator the tree really uses: Compare itself, or the compare-to
+  // functor key_compare_to_adapter substitutes for it.
+  using key_compare = typename key_compare_to_adapter<Compare>::type;
+  // Tells whether key_compare is a key-compare-to functor or a plain old
+  // key-compare functor.
+  using is_key_compare_to = btree_is_key_compare_to<key_compare, Key>;
+
+  // True for multiset / multimap.
+  using is_multi_container = std::bool_constant<Multi>;
+
+  static constexpr int kTargetNodeSize = TargetNodeSize;
+  static constexpr int kValueSize = ValueSize;
+  // Upper bound on the bytes of a node that can hold values: the target
+  // size minus the assumed minimum per-node overhead (one pointer plus four
+  // bytes).
+  static constexpr int kNodeValueSpace =
+    TargetNodeSize - /*minimum overhead=*/(sizeof(void *) + 4);
+
+  // Smallest of uint8_t / uint16_t able to count as many ValueSize-sized
+  // values as fit into kNodeValueSpace.
+  using node_count_type =
+    std::conditional_t<((std::numeric_limits<uint8_t>::max)() <
+                        kNodeValueSpace / ValueSize),
+                       uint16_t,
+                       uint8_t>;
+};
+
+// The internal storage type
+//
+// It is convenient for the value_type of a btree_map<K, V> to be
+// pair<const K, V>; the "const K" prevents accidental modification of the key
+// when dealing with the reference returned from find() and similar methods.
+// However, this creates other problems; we want to be able to emplace(K, V)
+// efficiently with move operations, and similarly be able to move a
+// pair<K, V> in insert().
+//
+// The solution is this union, which aliases the const and non-const versions
+// of the pair. This also allows flat_hash_map<const K, V> to work, even though
+// that has the same efficiency issues with move in emplace() and insert() -
+// but people do it anyway.
+// Storage for one map element, viewable as pair<const K, V> (`value`),
+// pair<K, V> (`mutable_value`) or just the key (`key`).
+template <class K, class V>
+union map_slot_type {
+ // Deliberately empty: no member is constructed here.
+ map_slot_type() {}
+ // Slots are never destroyed as a whole; map_params::destroy() destroys
+ // `mutable_value` through the allocator instead.
+ ~map_slot_type() = delete;
+ // Assignment goes through the mutable view because `value` has a const key.
+ map_slot_type& operator=(const map_slot_type& slot) {
+ mutable_value = slot.mutable_value;
+ return *this;
+ }
+ map_slot_type& operator=(map_slot_type&& slot) {
+ mutable_value = std::move(slot.mutable_value);
+ return *this;
+ }
+ using value_type = std::pair<const K, V>;
+ using mutable_value_type = std::pair<K, V>;
+
+ // View handed out to users (key is const).
+ value_type value;
+ // View used internally for construction, assignment and destruction.
+ mutable_value_type mutable_value;
+ // Direct access to the key, used by map_params::key(const slot_type*).
+ K key;
+};
+
+// Swaps two slots by swapping their mutable (non-const key) pair views.
+template <class K, class V>
+void swap(map_slot_type<K, V>& lhs, map_slot_type<K, V>& rhs) {
+ std::swap(lhs.mutable_value, rhs.mutable_value);
+}
+
+// A parameters structure for holding the type parameters for a btree_map.
+// Compare and Alloc should be nothrow copy-constructible.
+//
+// Besides the type aliases it provides the static slot operations
+// (key/element access, construct, move, destroy) for map_slot_type slots.
+template <typename Key, typename Data, typename Compare, typename Alloc,
+ int TargetNodeSize, bool Multi>
+struct map_params : common_params<Key, Compare, Alloc, TargetNodeSize,
+ sizeof(Key) + sizeof(Data), Multi> {
+ using super_type = typename map_params::common_params;
+ using mapped_type = Data;
+ using value_type = std::pair<const Key, mapped_type>;
+ using mutable_value_type = std::pair<Key, mapped_type>;
+ using slot_type = map_slot_type<Key, mapped_type>;
+ using pointer = value_type*;
+ using const_pointer = const value_type *;
+ using reference = value_type &;
+ using const_reference = const value_type &;
+ using key_compare = typename super_type::key_compare;
+ using init_type = mutable_value_type;
+
+ // Hides the `int kValueSize` of common_params with a size_t of the same
+ // value (sizeof(Key) + sizeof(Data)).
+ static constexpr size_t kValueSize = sizeof(Key) + sizeof(mapped_type);
+
+ // Compares two pair-like values by their `first` members.
+ // Inherit from key_compare for empty base class optimization.
+ struct value_compare : private key_compare {
+ value_compare() = default;
+ explicit value_compare(const key_compare &cmp) : key_compare(cmp) {}
+
+ template <typename T, typename U>
+ auto operator()(const T &left, const U &right) const
+ -> decltype(std::declval<key_compare>()(left.first, right.first)) {
+ return key_compare::operator()(left.first, right.first);
+ }
+ };
+ using is_map_container = std::true_type;
+
+ // Accessors for the key / mapped value / whole element of a value or slot.
+ static const Key &key(const value_type &value) { return value.first; }
+ static mapped_type &value(value_type *value) { return value->second; }
+ static const Key &key(const slot_type *slot) { return slot->key; }
+ static value_type& element(slot_type* slot) { return slot->value; }
+ static const value_type& element(const slot_type* slot) { return slot->value; }
+ // Constructs the slot's mutable pair in place from args, via the allocator.
+ template <class... Args>
+ static void construct(Alloc *alloc, slot_type *slot, Args &&... args) {
+ std::allocator_traits<Alloc>::construct(*alloc,
+ &slot->mutable_value,
+ std::forward<Args>(args)...);
+ }
+ // Construct this slot by moving from another slot.
+ static void construct(Alloc* alloc, slot_type* slot, slot_type* other) {
+ emplace(slot);
+ std::allocator_traits<Alloc>::construct(*alloc, &slot->value,
+ std::move(other->value));
+ }
+ // Move-assigns src into the already constructed dest; alloc is unused.
+ static void move(Alloc *alloc, slot_type *src, slot_type *dest) {
+ dest->mutable_value = std::move(src->mutable_value);
+ }
+ // Destroys the slot's mutable pair via the allocator.
+ static void destroy(Alloc *alloc, slot_type *slot) {
+ std::allocator_traits<Alloc>::destroy(*alloc, &slot->mutable_value);
+ }
+
+private:
+ static void emplace(slot_type* slot) {
+ // The construction of union doesn't do anything at runtime but it allows us
+ // to access its members without violating aliasing rules.
+ new (slot) slot_type;
+ }
+};
+
+// Type parameters and slot operations for a btree_set: the stored value,
+// the slot and the key are all the same type, Key.
+template <typename Key, typename Compare, typename Alloc, int TargetNodeSize, bool Multi>
+struct set_params
+  : public common_params<Key, Compare, Alloc, TargetNodeSize,
+                         sizeof(Key), Multi> {
+  using value_type = Key;
+  using mutable_value_type = value_type;
+  using init_type = mutable_value_type;
+  using slot_type = Key;
+  using pointer = value_type *;
+  using const_pointer = const value_type *;
+  using reference = value_type &;
+  using const_reference = const value_type &;
+  using value_compare = typename set_params::common_params::key_compare;
+  using is_map_container = std::false_type;
+
+  // Key / element accessors: a set element is its own key.
+  static const Key &key(const value_type &x) { return x; }
+  static const Key &key(const slot_type *slot) { return *slot; }
+  static value_type &element(slot_type *slot) { return *slot; }
+  static const value_type &element(const slot_type *slot) { return *slot; }
+
+  // Constructs *slot in place from args, via the allocator.
+  template <class... Args>
+  static void construct(Alloc *alloc, slot_type *slot, Args &&... args) {
+    using traits = std::allocator_traits<Alloc>;
+    traits::construct(*alloc, slot, std::forward<Args>(args)...);
+  }
+  // Constructs *slot by moving from *other.
+  static void construct(Alloc *alloc, slot_type *slot, slot_type *other) {
+    using traits = std::allocator_traits<Alloc>;
+    traits::construct(*alloc, slot, std::move(*other));
+  }
+  // Move-assigns *src into the already constructed *dest.
+  static void move(Alloc *alloc, slot_type *src, slot_type *dest) {
+    *dest = std::move(*src);
+  }
+  // Destroys *slot via the allocator.
+  static void destroy(Alloc *alloc, slot_type *slot) {
+    using traits = std::allocator_traits<Alloc>;
+    traits::destroy(*alloc, slot);
+  }
+};
+
+// Turns the result of either kind of comparator into "is less than":
+// a signed (three-way) result means less-than when it is negative, any
+// other result is returned converted to bool.
+template <typename Result>
+constexpr bool compare_result_as_less_than(const Result r) {
+  if constexpr (!std::is_signed_v<Result>) {
+    return r;
+  } else {
+    return r < 0;
+  }
+}
+// Wraps a lower-bound style comparator so that it can drive an upper-bound
+// search. No compare-to specific version is needed: an upper bound (the
+// first value greater than the input) is never an exact match.
+template <typename Compare>
+struct upper_bound_adapter {
+  explicit upper_bound_adapter(const Compare &c) : cmp_(c) {}
+  // True when a is not greater than b, i.e. when b does not order before a.
+  template <typename K, typename LK>
+  bool operator()(const K &a, const LK &b) const {
+    const bool b_before_a = compare_result_as_less_than(cmp_(b, a));
+    return !b_before_a;
+  }
+private:
+  // Held by reference: the wrapped comparator must outlive the adapter.
+  const Compare& cmp_;
+};
+
+// Whether a search hit an element equal to the key (kEq) or not (kNe).
+enum class MatchKind : uint8_t { kEq, kNe };
+
+// Result of a search performed with a compare-to comparator: the value found
+// plus the information whether it was an exact match.
+template <typename V, bool IsCompareTo>
+struct SearchResult {
+  V value;
+  MatchKind match;
+
+  static constexpr bool has_match = true;
+  bool IsEq() const { return MatchKind::kEq == match; }
+};
+
+// Result of a search performed with a plain boolean comparator. `match` is
+// left out on purpose so that callers cannot rely on information that was
+// never computed; IsEq() is therefore always false.
+template <typename V>
+struct SearchResult<V, false> {
+  V value;
+
+  static constexpr bool has_match = false;
+  static constexpr bool IsEq() { return false; }
+};
+
+// A node in the btree holding. The same node type is used for both internal
+// and leaf nodes in the btree, though the nodes are allocated in such a way
+// that the children array is only valid in internal nodes.
+template <typename Params>
+class btree_node {
+ using is_key_compare_to = typename Params::is_key_compare_to;
+ using is_multi_container = typename Params::is_multi_container;
+ using field_type = typename Params::node_count_type;
+ using allocator_type = typename Params::allocator_type;
+ using slot_type = typename Params::slot_type;
+
+ public:
+ using params_type = Params;
+ using key_type = typename Params::key_type;
+ using value_type = typename Params::value_type;
+ using mutable_value_type = typename Params::mutable_value_type;
+ using pointer = typename Params::pointer;
+ using const_pointer = typename Params::const_pointer;
+ using reference = typename Params::reference;
+ using const_reference = typename Params::const_reference;
+ using key_compare = typename Params::key_compare;
+ using size_type = typename Params::size_type;
+ using difference_type = typename Params::difference_type;
+
+ // Btree decides whether to use linear node search as follows:
+ // - If the key is arithmetic and the comparator is std::less or
+ // std::greater, choose linear.
+ // - Otherwise, choose binary.
+ // TODO(ezb): Might make sense to add condition(s) based on node-size.
+ using use_linear_search = std::integral_constant<
+ bool,
+ std::is_arithmetic_v<key_type> &&
+ (std::is_same_v<std::less<key_type>, key_compare> ||
+ std::is_same_v<std::greater<key_type>, key_compare>)>;
+
+ ~btree_node() = default;
+ btree_node(const btree_node&) = delete;
+ btree_node& operator=(const btree_node&) = delete;
+
+ protected:
+ btree_node() = default;
+
+ private:
+ constexpr static size_type SizeWithNValues(size_type n) {
+ return sizeof(base_fields) + n * sizeof(value_type);;
+ }
+ // A lower bound for the overhead of fields other than values in a leaf node.
+ constexpr static size_type MinimumOverhead() {
+ return SizeWithNValues(1) - sizeof(value_type);
+ }
+
+ // Compute how many values we can fit onto a leaf node taking into account
+ // padding.
+ constexpr static size_type NodeTargetValues(const int begin, const int end) {
+ return begin == end ? begin
+ : SizeWithNValues((begin + end) / 2 + 1) >
+ params_type::kTargetNodeSize
+ ? NodeTargetValues(begin, (begin + end) / 2)
+ : NodeTargetValues((begin + end) / 2 + 1, end);
+ }
+
+ constexpr static int kValueSize = params_type::kValueSize;
+ constexpr static int kTargetNodeSize = params_type::kTargetNodeSize;
+ constexpr static int kNodeTargetValues = NodeTargetValues(0, kTargetNodeSize);
+
+ // We need a minimum of 3 values per internal node in order to perform
+ // splitting (1 value for the two nodes involved in the split and 1 value
+ // propagated to the parent as the delimiter for the split).
+ constexpr static size_type kNodeValues = std::max(kNodeTargetValues, 3);
+
+ // The node is internal (i.e. is not a leaf node) if and only if `max_count`
+ // has this value.
+ constexpr static size_type kInternalNodeMaxCount = 0;
+
+ struct base_fields {
+ // A pointer to the node's parent.
+ btree_node *parent;
+ // The position of the node in the node's parent.
+ field_type position;
+ // The count of the number of values in the node.
+ field_type count;
+ // The maximum number of values the node can hold.
+ field_type max_count;
+ };
+
+ struct leaf_fields : public base_fields {
+ // The array of values. Only the first count of these values have been
+ // constructed and are valid.
+ slot_type values[kNodeValues];
+ };
+
+ struct internal_fields : public leaf_fields {
+ // The array of child pointers. The keys in children_[i] are all less than
+ // key(i). The keys in children_[i + 1] are all greater than key(i). There
+ // are always count + 1 children.
+ btree_node *children[kNodeValues + 1];
+ };
+
+ constexpr static size_type LeafSize(const int max_values = kNodeValues) {
+ return SizeWithNValues(max_values);
+ }
+ constexpr static size_type InternalSize() {
+ return sizeof(internal_fields);
+ }
+
+ // Returns a reference to the field selected by the pointer-to-member MemPtr,
+ // viewing this node's storage as an internal_fields object.
+ template<auto MemPtr>
+ auto& GetField() {
+   auto *const fields = reinterpret_cast<internal_fields*>(this);
+   return fields->*MemPtr;
+ }
+
+ // Const overload of the accessor above.
+ template<auto MemPtr>
+ auto& GetField() const {
+   const auto *const fields = reinterpret_cast<const internal_fields*>(this);
+   return fields->*MemPtr;
+ }
+
+ // Raw field accessors. Each one reads or writes exactly one member of the
+ // node's field structs through GetField.
+ void set_parent(btree_node *p) {
+   GetField<&base_fields::parent>() = p;
+ }
+ field_type &mutable_count() {
+   return GetField<&base_fields::count>();
+ }
+ // Address of the i-th value slot. No bounds check is performed.
+ slot_type *slot(int i) {
+   return GetField<&leaf_fields::values>() + i;
+ }
+ const slot_type *slot(int i) const {
+   return GetField<&leaf_fields::values>() + i;
+ }
+ void set_position(field_type v) {
+   GetField<&base_fields::position>() = v;
+ }
+ void set_count(field_type v) {
+   GetField<&base_fields::count>() = v;
+ }
+ // This method is only called by the node init methods.
+ void set_max_count(field_type v) {
+   GetField<&base_fields::max_count>() = v;
+ }
+
+public:
+ // Alignment required for any node allocation. Leaf and internal layouts must
+ // agree on it, which is enforced at compile time.
+ constexpr static size_type Alignment() {
+   constexpr auto internal_alignment = alignof(internal_fields);
+   static_assert(alignof(leaf_fields) == internal_alignment,
+                 "Alignment of all nodes must be equal.");
+   return internal_alignment;
+ }
+
+ // Getter for whether this is a leaf node, i.e. whether its max_count field differs from
+ // the kInternalNodeMaxCount sentinel. This value doesn't change after the node is created.
+ bool leaf() const { return GetField<&base_fields::max_count>() != kInternalNodeMaxCount; }
+
+ // Getter for the position of this node in its parent (its index among the parent's children).
+ field_type position() const { return GetField<&base_fields::position>(); }
+
+ // Getter for the number of values currently stored in this node.
+ field_type count() const { return GetField<&base_fields::count>(); }
+ // Capacity of this node in values. Internal nodes keep the sentinel
+ // kInternalNodeMaxCount in the field and always report kNodeValues; leaf
+ // nodes store their real capacity, which lies in [1, kNodeValues].
+ field_type max_count() const {
+   const field_type stored = GetField<&base_fields::max_count>();
+   if (stored == field_type{kInternalNodeMaxCount}) {
+     return field_type{kNodeValues};
+   }
+   return stored;
+ }
+
+ // Getter for the parent of this node.
+ btree_node* parent() const {
+   return GetField<&base_fields::parent>();
+ }
+ // Whether this node is the root of the tree. The root's parent is the
+ // leftmost node of the tree, which is guaranteed to be a leaf, so a node is
+ // the root exactly when its parent is a leaf.
+ bool is_root() const {
+   const btree_node *const up = parent();
+   return up->leaf();
+ }
+ // Turns this node into the root by adopting the parent pointer of its
+ // current parent, which must itself be the root (checked by the assert).
+ void make_root() {
+   btree_node *const old_root = parent();
+   assert(old_root->is_root());
+   set_parent(old_root->parent());
+ }
+
+ // Getters for the key/value held in slot i of the node. No bounds checking is done here.
+ const key_type& key(int i) const { return params_type::key(slot(i)); }
+ reference value(int i) { return params_type::element(slot(i)); }
+ const_reference value(int i) const { return params_type::element(slot(i)); }
+
+ // Accessors for the child pointer at position i. The children array is a
+ // member of internal_fields.
+ btree_node* child(int i) const {
+   return GetField<&internal_fields::children>()[i];
+ }
+ btree_node*& mutable_child(int i) {
+   return GetField<&internal_fields::children>()[i];
+ }
+ // Zeroes child pointer i in debug builds; does nothing when NDEBUG is defined.
+ void clear_child(int i) {
+#ifndef NDEBUG
+   btree_node *&entry = mutable_child(i);
+   memset(&entry, 0, sizeof(entry));
+#endif
+ }
+ // Stores c as child i and records i as c's position. c's parent pointer is
+ // left untouched.
+ void set_child(int i, btree_node *c) {
+   btree_node *&entry = mutable_child(i);
+   entry = c;
+   c->set_position(i);
+ }
+ // Like set_child(), but additionally makes this node c's parent.
+ void init_child(int i, btree_node *c) {
+   mutable_child(i) = c;
+   c->set_position(i);
+   c->set_parent(this);
+ }
+ // Returns the position of the first value whose key is not less than k.
+ // use_linear_search selects between the linear and the binary strategy.
+ template <typename K>
+ SearchResult<int, is_key_compare_to::value> lower_bound(
+     const K &k, const key_compare &comp) const {
+   if (use_linear_search::value) {
+     return linear_search(k, comp);
+   }
+   return binary_search(k, comp);
+ }
+ // Returns the position of the first value whose key is greater than k. The
+ // search is the same as for lower_bound(), run with the comparator wrapped
+ // in upper_bound_adapter.
+ template <typename K>
+ int upper_bound(const K &k, const key_compare &comp) const {
+   auto adapted = upper_bound_adapter<key_compare>(comp);
+   if (use_linear_search::value) {
+     return linear_search(k, adapted).value;
+   }
+   return binary_search(k, adapted).value;
+ }
+
+ // Linear search over all count() values of the node. Dispatches to the
+ // implementation matching whether Compare is a compare-to style comparator.
+ template <typename K, typename Compare>
+ SearchResult<int, btree_is_key_compare_to<Compare, key_type>::value>
+ linear_search(const K &k, const Compare &comp) const {
+   using compare_to_tag = btree_is_key_compare_to<Compare, key_type>;
+   return linear_search_impl(k, 0, count(), comp, compare_to_tag());
+ }
+
+ // Binary search over all count() values of the node, with the same dispatch
+ // as linear_search().
+ template <typename K, typename Compare>
+ SearchResult<int, btree_is_key_compare_to<Compare, key_type>::value>
+ binary_search(const K &k, const Compare &comp) const {
+   using compare_to_tag = btree_is_key_compare_to<Compare, key_type>;
+   return binary_search_impl(k, 0, count(), comp, compare_to_tag());
+ }
+ // Linear search with a plain (boolean) comparator: scans [s, e) and returns
+ // the position of the first value whose key is not less than k, or e if
+ // every key in the range is less than k.
+ template <typename K, typename Compare>
+ SearchResult<int, false> linear_search_impl(
+     const K &k, int s, const int e, const Compare &comp,
+     std::false_type /* IsCompareTo */) const {
+   for (; s < e; ++s) {
+     if (!comp(key(s), k)) {
+       break;
+     }
+   }
+   return {s};
+ }
+
+ // Linear search with a compare-to comparator: scans [s, e) and returns the
+ // position of the first value whose key is not less than k, together with
+ // whether that key compared equal to k.
+ template <typename K, typename Compare>
+ SearchResult<int, true> linear_search_impl(
+     const K &k, int s, const int e, const Compare &comp,
+     std::true_type /* IsCompareTo */) const {
+   for (; s < e; ++s) {
+     const auto order = comp(key(s), k);
+     if (order == 0) {
+       return {s, MatchKind::kEq};
+     }
+     if (order > 0) {
+       break;
+     }
+   }
+   return {s, MatchKind::kNe};
+ }
+
+ // Binary search with a plain (boolean) comparator: narrows [s, e) until it
+ // is empty and returns the position of the first value whose key is not
+ // less than k.
+ template <typename K, typename Compare>
+ SearchResult<int, false> binary_search_impl(
+     const K &k, int s, int e, const Compare &comp,
+     std::false_type /* IsCompareTo */) const {
+   while (s != e) {
+     const int mid = (s + e) >> 1;
+     if (!comp(key(mid), k)) {
+       // key(mid) is not less than k, so the answer is at or before mid.
+       e = mid;
+     } else {
+       s = mid + 1;
+     }
+   }
+   return {s};
+ }
+
+ // Binary search with a compare-to comparator: returns the position of the
+ // first value whose key is not less than k, together with whether a key
+ // equal to k was encountered.
+ template <typename K, typename CompareTo>
+ SearchResult<int, true> binary_search_impl(
+     const K &k, int s, int e, const CompareTo &comp,
+     std::true_type /* IsCompareTo */) const {
+   if constexpr (!is_multi_container::value) {
+     // Not a multi-container: the search stops at the first exact match.
+     while (s != e) {
+       const int mid = (s + e) >> 1;
+       const auto order = comp(key(mid), k);
+       if (order < 0) {
+         s = mid + 1;
+         continue;
+       }
+       if (order > 0) {
+         e = mid;
+         continue;
+       }
+       return {mid, MatchKind::kEq};
+     }
+     return {s, MatchKind::kNe};
+   } else {
+     // Multi-container: we need the first value whose key is not less than
+     // k, so an exact match is only recorded and the search keeps narrowing.
+     MatchKind match = MatchKind::kNe;
+     while (s != e) {
+       const int mid = (s + e) >> 1;
+       const auto order = comp(key(mid), k);
+       if (order < 0) {
+         s = mid + 1;
+         continue;
+       }
+       e = mid;
+       if (order == 0) {
+         match = MatchKind::kEq;
+       }
+     }
+     return {s, match};
+   }
+ }
+
+ // Emplaces a value at position i, shifting all existing values and
+ // children at positions >= i to the right by 1.
+ template <typename... Args>
+ void emplace_value(size_type i, allocator_type *alloc, Args &&... args);
+
+ // Removes the value at position i, shifting all existing values and children
+ // at positions > i to the left by 1.
+ void remove_value(const int i, allocator_type *alloc);
+
+ // Removes the values at positions [i, i + to_erase), shifting all values
+ // after that range to the left by to_erase. Does not change children at all.
+ void remove_values_ignore_children(int i, int to_erase,
+ allocator_type *alloc);
+
+ // Rebalances a node with its right sibling.
+ void rebalance_right_to_left(const int to_move, btree_node *right,
+ allocator_type *alloc);
+ void rebalance_left_to_right(const int to_move, btree_node *right,
+ allocator_type *alloc);
+
+ // Splits a node, moving a portion of the node's values to its right sibling.
+ void split(const int insert_position, btree_node *dest, allocator_type *alloc);
+
+ // Merges a node with its right sibling, moving all of the values and the
+ // delimiting key in the parent node onto itself.
+ void merge(btree_node *sibling, allocator_type *alloc);
+
+ // Swap the contents of "this" and "src".
+ void swap(btree_node *src, allocator_type *alloc);
+
+ // Node initialization routines; they fill in the header fields of storage
+ // that has already been allocated.
+ //
+ // Initializes n as an empty leaf with the given parent and capacity, at
+ // position 0. Returns n.
+ static btree_node *init_leaf(btree_node *n, btree_node *parent,
+                              int max_count) {
+   n->set_parent(parent);
+   n->set_position(0);
+   n->set_count(0);
+   n->set_max_count(max_count);
+   return n;
+ }
+ // Initializes n as an empty internal node with the given parent. Returns n.
+ static btree_node *init_internal(btree_node *n, btree_node *parent) {
+   btree_node *const node = init_leaf(n, parent, kNodeValues);
+   // Overwrite `max_count` with the sentinel that marks the node as
+   // internal.
+   node->set_max_count(kInternalNodeMaxCount);
+   return node;
+ }
+ // Destroys every value currently stored in the node. The node's own storage
+ // is not released here.
+ void destroy(allocator_type *alloc) {
+   int index = 0;
+   while (index < count()) {
+     value_destroy(index, alloc);
+     ++index;
+   }
+ }
+
+ private:
+ // Constructs a value in slot i from args, via params_type::construct.
+ template <typename... Args>
+ void value_init(const size_type i, allocator_type *alloc, Args &&... args) {
+   slot_type *const target = slot(i);
+   params_type::construct(alloc, target, std::forward<Args>(args)...);
+ }
+ // Destroys the value in slot i, via params_type::destroy.
+ void value_destroy(const size_type i, allocator_type *alloc) {
+   slot_type *const target = slot(i);
+   params_type::destroy(alloc, target);
+ }
+
+ // Constructs n values in node x, starting at slot j, from the n slots of this
+ // node starting at slot i. The source slots are handed to
+ // params_type::construct and are not destroyed here.
+ void uninitialized_move_n(const size_type n, const size_type i,
+                           const size_type j, btree_node *x,
+                           allocator_type *alloc) {
+   slot_type *from = slot(i);
+   slot_type *to = x->slot(j);
+   slot_type *const from_end = from + n;
+   while (from != from_end) {
+     params_type::construct(alloc, to, from);
+     ++from;
+     ++to;
+   }
+ }
+
+ // Destroys a range of n values, starting at index i.
+ // The loop index has the same type as the bound, so the comparison does not
+ // mix signed and unsigned operands.
+ void value_destroy_n(const size_type i, const size_type n,
+                      allocator_type *alloc) {
+   for (size_type j = 0; j < n; ++j) {
+     value_destroy(i + j, alloc);
+   }
+ }
+
+private:
+ template <typename P>
+ friend class btree;
+ template <typename N, typename R, typename P>
+ friend struct btree_iterator;
+};
+
+// Bidirectional iterator over the values of a btree, represented as a
+// (node, position) pair. Reference and Pointer select the mutable or the const
+// flavour of the iterator.
+template <typename Node, typename Reference, typename Pointer>
+struct btree_iterator {
+ private:
+  using key_type = typename Node::key_type;
+  using size_type = typename Node::size_type;
+  using params_type = typename Node::params_type;
+
+  using node_type = Node;
+  using normal_node = typename std::remove_const<Node>::type;
+  using const_node = const Node;
+  using normal_pointer = typename params_type::pointer;
+  using normal_reference = typename params_type::reference;
+  using const_pointer = typename params_type::const_pointer;
+  using const_reference = typename params_type::const_reference;
+  using slot_type = typename params_type::slot_type;
+
+  using iterator =
+      btree_iterator<normal_node, normal_reference, normal_pointer>;
+  using const_iterator =
+      btree_iterator<const_node, const_reference, const_pointer>;
+
+ public:
+  // These aliases are public for std::iterator_traits.
+  using difference_type = typename Node::difference_type;
+  using value_type = typename params_type::value_type;
+  using pointer = Pointer;
+  using reference = Reference;
+  using iterator_category = std::bidirectional_iterator_tag;
+
+  // A default-constructed iterator has a null node and position -1.
+  btree_iterator() = default;
+  btree_iterator(Node *n, int p) : node(n), position(p) {}
+
+  // Implicit conversion from iterator to const_iterator. It is written as a
+  // constrained template rather than a copy constructor so that
+  // btree_iterator stays trivially copyable, for performance and binary size
+  // reasons.
+  template<typename N, typename R, typename P,
+           std::enable_if_t<
+             std::is_same_v<btree_iterator<N, R, P>, iterator> &&
+             std::is_same_v<btree_iterator, const_iterator>,
+             int> = 0>
+  btree_iterator(const btree_iterator<N, R, P> &x)
+    : node(x.node), position(x.position) {}
+
+ private:
+  // Explicit conversion from const_iterator to iterator, again without
+  // defining a copy constructor.
+  // NOTE: the const_cast is safe because this constructor is only called by
+  // non-const methods and the container owns the nodes.
+  template <typename N, typename R, typename P,
+            std::enable_if_t<
+              std::is_same_v<btree_iterator<N, R, P>, const_iterator> &&
+              std::is_same_v<btree_iterator, iterator>,
+              int> = 0>
+  explicit btree_iterator(const btree_iterator<N, R, P> &x)
+    : node(const_cast<node_type *>(x.node)), position(x.position) {}
+
+  // Advances the iterator. Stepping to another value of the same leaf is
+  // handled inline; every other case goes through increment_slow().
+  void increment() {
+    if (node->leaf()) {
+      ++position;
+      if (position < node->count()) {
+        return;
+      }
+    }
+    increment_slow();
+  }
+  void increment_slow();
+
+  // Moves the iterator back. Stepping to an earlier value of the same leaf is
+  // handled inline; every other case goes through decrement_slow().
+  void decrement() {
+    if (node->leaf()) {
+      --position;
+      if (position >= 0) {
+        return;
+      }
+    }
+    decrement_slow();
+  }
+  void decrement_slow();
+
+ public:
+  // Two iterators are equal when they refer to the same node and position.
+  bool operator==(const const_iterator &x) const {
+    if (node != x.node) {
+      return false;
+    }
+    return position == x.position;
+  }
+  bool operator!=(const const_iterator &x) const {
+    return !(node == x.node && position == x.position);
+  }
+
+  // Accessors for the value the iterator is pointing at.
+  reference operator*() const {
+    return node->value(position);
+  }
+  pointer operator->() const {
+    auto &current = node->value(position);
+    return &current;
+  }
+
+  // Pre-increment / pre-decrement.
+  btree_iterator& operator++() {
+    increment();
+    return *this;
+  }
+  btree_iterator& operator--() {
+    decrement();
+    return *this;
+  }
+  // Post-increment / post-decrement: return the iterator's previous state.
+  btree_iterator operator++(int) {
+    btree_iterator previous = *this;
+    increment();
+    return previous;
+  }
+  btree_iterator operator--(int) {
+    btree_iterator previous = *this;
+    decrement();
+    return previous;
+  }
+
+ private:
+  template <typename Params>
+  friend class btree;
+  template <typename Tree>
+  friend class btree_container;
+  template <typename Tree>
+  friend class btree_set_container;
+  template <typename Tree>
+  friend class btree_map_container;
+  template <typename Tree>
+  friend class btree_multiset_container;
+  template <typename N, typename R, typename P>
+  friend struct btree_iterator;
+
+  // Key and slot of the value the iterator is pointing at.
+  const key_type &key() const { return node->key(position); }
+  slot_type *slot() { return node->slot(position); }
+
+  // The node in the tree the iterator is pointing at.
+  Node *node = nullptr;
+  // The position within the node of the tree the iterator is pointing at.
+  int position = -1;
+};
+
+// Allocates and frees raw storage through a copy of Alloc rebound to M, an
+// empty struct aligned to Alignment. Sizes are given in bytes and rounded up
+// to a whole number of M objects.
+template <size_t Alignment, class Alloc>
+class AlignedAlloc {
+  struct alignas(Alignment) M {};
+  using alloc_t =
+      typename std::allocator_traits<Alloc>::template rebind_alloc<M>;
+  using traits_t =
+      typename std::allocator_traits<Alloc>::template rebind_traits<M>;
+  // Number of M objects needed to cover `size` bytes, rounded up.
+  static constexpr size_t num_aligned_objects(size_t size) {
+    constexpr size_t unit = sizeof(M);
+    return (size + unit - 1) / unit;
+  }
+public:
+  // Returns storage for at least `size` bytes. In debug builds the result is
+  // asserted to be aligned to Alignment.
+  static void* allocate(Alloc* alloc, size_t size) {
+    alloc_t rebound(*alloc);
+    const size_t units = num_aligned_objects(size);
+    void* p = traits_t::allocate(rebound, units);
+    assert(reinterpret_cast<uintptr_t>(p) % Alignment == 0 &&
+           "allocator does not respect alignment");
+    return p;
+  }
+  // Releases storage obtained from allocate(); `size` must be the byte size
+  // that was requested so that the same number of M objects is computed.
+  static void deallocate(Alloc* alloc, void* p, size_t size) {
+    alloc_t rebound(*alloc);
+    const size_t units = num_aligned_objects(size);
+    traits_t::deallocate(rebound, static_cast<M*>(p), units);
+  }
+};
+
+template <typename Params>
+class btree {
+ using node_type = btree_node<Params>;
+ using is_key_compare_to = typename Params::is_key_compare_to;
+
+ // We use a static empty node for the root/leftmost/rightmost of empty btrees
+ // in order to avoid branching in begin()/end(). NOTE(review): the members below presumably mirror base_fields so node accessors work on it - confirm.
+ struct alignas(node_type::Alignment()) EmptyNodeType : node_type {
+ using field_type = typename node_type::field_type;
+ node_type *parent;
+ field_type position = 0;
+ field_type count = 0;
+ // max_count must be != kInternalNodeMaxCount (so that this node is regarded
+ // as a leaf node). max_count() is never called when the tree is empty.
+ field_type max_count = node_type::kInternalNodeMaxCount + 1;
+
+ constexpr EmptyNodeType(node_type *p) : parent(p) {}
+ };
+
+ static node_type *EmptyNode() {  // Returns the function-local static empty node.
+ static constexpr EmptyNodeType empty_node(
+ const_cast<EmptyNodeType *>(&empty_node));  // The empty node is constructed with itself as its parent.
+ return const_cast<EmptyNodeType *>(&empty_node);  // Constness is cast away to fit the node_type* interface.
+ }
+
+ constexpr static int kNodeValues = node_type::kNodeValues;
+ constexpr static int kMinNodeValues = kNodeValues / 2;
+ constexpr static int kValueSize = node_type::kValueSize;
+
+ // A helper class to get the empty base class optimization for 0-size
+ // comparators and allocators. Base1 is key_compare and Base2 is allocator_type
+ // (i.e. empty_base_handle<key_compare, allocator_type, node_type*>). If a base is
+ // 0-size, the compiler doesn't have to reserve any space for it; if both are,
+ // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base
+ // class optimization] for more details.
+ template <typename Base1, typename Base2, typename Data>
+ struct empty_base_handle : public Base1, Base2 {
+ empty_base_handle(const Base1 &b1, const Base2 &b2, const Data &d)
+ : Base1(b1),
+ Base2(b2),
+ data(d) {}
+ Data data;
+ };
+
+ // Pair of counters, one for leaf nodes and one for internal nodes, as
+ // produced by internal_stats().
+ struct node_stats {
+   using size_type = typename Params::size_type;
+
+   node_stats(size_type l, size_type i) : leaf_nodes(l), internal_nodes(i) {}
+
+   // Adds the counters of x to this object's counters.
+   node_stats& operator+=(const node_stats &x) {
+     internal_nodes += x.internal_nodes;
+     leaf_nodes += x.leaf_nodes;
+     return *this;
+   }
+
+   size_type leaf_nodes;
+   size_type internal_nodes;
+ };
+
+ public:
+ using key_type = typename Params::key_type;
+ using value_type = typename Params::value_type;
+ using size_type = typename Params::size_type;
+ using difference_type = typename Params::difference_type;
+ using key_compare = typename Params::key_compare;
+ using value_compare = typename Params::value_compare;
+ using allocator_type = typename Params::allocator_type;
+ using reference = typename Params::reference;
+ using const_reference = typename Params::const_reference;
+ using pointer = typename Params::pointer;
+ using const_pointer = typename Params::const_pointer;
+ using iterator = btree_iterator<node_type, reference, pointer>;
+ using const_iterator = typename iterator::const_iterator;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+ // Internal types made public for use by btree_container types.
+ using params_type = Params;
+
+ private:
+ // For use in copy_or_move_values_in_order: a const_iterator yields a const reference, an iterator an rvalue.
+ const value_type &maybe_move_from_iterator(const_iterator x) { return *x; }
+ value_type &&maybe_move_from_iterator(iterator x) { return std::move(*x); }
+
+ // Copies or moves (depending on the template parameter) the values in
+ // x into this btree in their order in x. This btree must be empty before this
+ // method is called. This method is used in copy construction, copy
+ // assignment, and move assignment.
+ template <typename Btree>
+ void copy_or_move_values_in_order(Btree *x);
+
+ // Validates that various assumptions/requirements are true at compile time.
+ constexpr static bool static_assert_validation();
+
+ public:
+ btree(const key_compare &comp, const allocator_type &alloc);
+
+ btree(const btree &x);
+ btree(btree &&x) noexcept  // Move constructor: takes over x's nodes and leaves x empty.
+ : root_(std::move(x.root_)),
+ rightmost_(std::exchange(x.rightmost_, EmptyNode())),  // x's rightmost is reset to the static empty node.
+ size_(std::exchange(x.size_, 0)) {
+ x.mutable_root() = EmptyNode();  // x's root is reset to the static empty node as well.
+ }
+
+ ~btree() {
+ // The static_asserts are placed in the destructor to avoid triggering them
+ // before the type is complete. clear() then deletes all values in the tree.
+ static_assert(static_assert_validation(), "This call must be elided.");
+ clear();
+ }
+
+ // Assign the contents of x to *this.
+ btree &operator=(const btree &x);
+ btree &operator=(btree &&x) noexcept;
+
+ // Iterator to the first value: position 0 of the leftmost node.
+ iterator begin() {
+   node_type *const first = leftmost();
+   return iterator(first, 0);
+ }
+ const_iterator begin() const {
+   const node_type *const first = leftmost();
+   return const_iterator(first, 0);
+ }
+ // Past-the-end iterator: one past the last value of the rightmost node.
+ iterator end() {
+   node_type *const last = rightmost_;
+   return iterator(last, last->count());
+ }
+ const_iterator end() const {
+   const node_type *const last = rightmost_;
+   return const_iterator(last, last->count());
+ }
+ // Reverse iterators are built directly on top of end() and begin().
+ reverse_iterator rbegin() {
+   return reverse_iterator(end());
+ }
+ const_reverse_iterator rbegin() const {
+   return const_reverse_iterator(end());
+ }
+ reverse_iterator rend() {
+   return reverse_iterator(begin());
+ }
+ const_reverse_iterator rend() const {
+   return const_reverse_iterator(begin());
+ }
+
+ // Finds the first element whose key is not less than key, or end() if there is none.
+ template <typename K>
+ iterator lower_bound(const K &key) {
+ return internal_end(internal_lower_bound(key));
+ }
+ template <typename K>
+ const_iterator lower_bound(const K &key) const {
+ return internal_end(internal_lower_bound(key));
+ }
+
+ // Finds the first element whose key is greater than key, or end() if there is none.
+ template <typename K>
+ iterator upper_bound(const K &key) {
+ return internal_end(internal_upper_bound(key));
+ }
+ template <typename K>
+ const_iterator upper_bound(const K &key) const {
+ return internal_end(internal_upper_bound(key));
+ }
+
+ // Finds the range of values which compare equal to key. The first member of
+ // the returned pair is equal to lower_bound(key). The second member of
+ // the pair is equal to upper_bound(key).
+ template <typename K>
+ std::pair<iterator, iterator> equal_range(const K &key) {
+ return {lower_bound(key), upper_bound(key)};
+ }
+ template <typename K>
+ std::pair<const_iterator, const_iterator> equal_range(const K &key) const {
+ return {lower_bound(key), upper_bound(key)};
+ }
+
+ // Inserts a value into the btree only if it does not already exist. The
+ // boolean return value indicates whether insertion succeeded or failed.
+ // Requirement: if `key` already exists in the btree, does not consume `args`.
+ // Requirement: `key` is never referenced after consuming `args`.
+ template <typename... Args>
+ std::pair<iterator, bool> insert_unique(const key_type &key, Args &&... args);
+
+ // Inserts with hint. Checks to see if the value should be placed immediately
+ // before `position` in the tree. If so, then the insertion will take
+ // amortized constant time. If not, the insertion will take amortized
+ // logarithmic time as if a call to insert_unique() were made.
+ // Requirement: if `key` already exists in the btree, does not consume `args`.
+ // Requirement: `key` is never referenced after consuming `args`.
+ template <typename... Args>
+ std::pair<iterator, bool> insert_hint_unique(iterator position,
+ const key_type &key,
+ Args &&... args);
+
+ // Insert a range of values into the btree.
+ template <typename InputIterator>
+ void insert_iterator_unique(InputIterator b, InputIterator e);
+
+ // Inserts a value into the btree.
+ template <typename ValueType>
+ iterator insert_multi(const key_type &key, ValueType &&v);
+
+ // Inserts a value into the btree, keyed by the key that params_type::key
+ // extracts from v. Forwards to the two-argument overload.
+ template <typename ValueType>
+ iterator insert_multi(ValueType &&v) {
+   const auto &extracted_key = params_type::key(v);
+   return insert_multi(extracted_key, std::forward<ValueType>(v));
+ }
+
+ // Insert with hint. Check to see if the value should be placed immediately
+ // before position in the tree. If it does, then the insertion will take
+ // amortized constant time. If not, the insertion will take amortized
+ // logarithmic time as if a call to insert_multi(v) were made.
+ template <typename ValueType>
+ iterator insert_hint_multi(iterator position, ValueType &&v);
+
+ // Insert a range of values into the btree.
+ template <typename InputIterator>
+ void insert_iterator_multi(InputIterator b, InputIterator e);
+
+ // Erase the specified iterator from the btree. The iterator must be valid
+ // (i.e. not equal to end()). Return an iterator pointing to the node after
+ // the one that was erased (or end() if none exists).
+ // Requirement: does not read the value at `*iter`.
+ iterator erase(iterator iter);
+
+ // Erases range. Returns the number of keys erased and an iterator pointing
+ // to the element after the last erased element.
+ std::pair<size_type, iterator> erase(iterator begin, iterator end);
+
+ // Erases the specified key from the btree. Returns 1 if an element was
+ // erased and 0 otherwise.
+ template <typename K>
+ size_type erase_unique(const K &key);
+
+ // Erases all of the entries matching the specified key from the
+ // btree. Returns the number of elements erased.
+ template <typename K>
+ size_type erase_multi(const K &key);
+
+ // Finds the iterator corresponding to a key or returns end() if the key is
+ // not present (internal_end() maps a null-node result of internal_find() to end()).
+ template <typename K>
+ iterator find(const K &key) {
+ return internal_end(internal_find(key));
+ }
+ template <typename K>
+ const_iterator find(const K &key) const {
+ return internal_end(internal_find(key));
+ }
+
+ // Returns the number of times the key appears in a btree with unique keys:
+ // 1 if internal_find() locates it, 0 otherwise.
+ template <typename K>
+ size_type count_unique(const K &key) const {
+   const iterator found = internal_find(key);
+   // A null node means the key doesn't exist in the tree.
+   return found.node == nullptr ? 0 : 1;
+ }
+ // Returns the number of times the key appears in a btree that allows
+ // duplicate keys, measured as the length of equal_range(key).
+ template <typename K>
+ size_type count_multi(const K &key) const {
+   const auto matches = equal_range(key);
+   return std::distance(matches.first, matches.second);
+ }
+
+ // Clear the btree, deleting all of the values it contains.
+ void clear();
+
+ // Swap the contents of *this and x.
+ void swap(btree &x);
+
+ const key_compare &key_comp() const noexcept {  // The comparator is stored as a base class of root_.
+ return *static_cast<const key_compare*>(&root_);
+ }
+ template <typename K, typename LK>
+ bool compare_keys(const K &x, const LK &y) const {  // Presumably true when x orders before y; the helper below is defined elsewhere.
+ return compare_result_as_less_than(key_comp()(x, y));
+ }
+
+ // Verifies the structure of the btree.
+ void verify() const;
+
+ // Size routines. size_ holds the number of values in the tree.
+ size_type size() const { return size_; }
+ size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+ bool empty() const { return size_ == 0; }
+
+ // The height of the btree. An empty tree will have height 0.
+ size_type height() const {
+   if (empty()) {
+     return 0;
+   }
+   // Count the length of the chain from the leftmost node up to the root. We
+   // actually start at the root, hop to its parent (the leftmost node) and
+   // follow parent pointers until we are back at the root; the count is the
+   // same because of the circularity of that traversal.
+   const node_type *const top = root();
+   const node_type *current = top;
+   size_type levels = 0;
+   do {
+     ++levels;
+     current = current->parent();
+   } while (current != top);
+   return levels;
+ }
+
+ // The number of internal, leaf and total nodes used by the btree. Each call walks the tree via internal_stats().
+ size_type leaf_nodes() const {
+ return internal_stats(root()).leaf_nodes;
+ }
+ size_type internal_nodes() const {
+ return internal_stats(root()).internal_nodes;
+ }
+ size_type nodes() const {
+ node_stats stats = internal_stats(root());
+ return stats.leaf_nodes + stats.internal_nodes;
+ }
+
+ // The total number of bytes used by the btree: the btree object itself plus
+ // the storage of all of its nodes.
+ size_type bytes_used() const {
+   const node_stats stats = internal_stats(root());
+   const bool single_leaf =
+       stats.leaf_nodes == 1 && stats.internal_nodes == 0;
+   if (single_leaf) {
+     // A lone leaf root is sized by its own max_count, which may be smaller
+     // than kNodeValues.
+     return sizeof(*this) +
+            node_type::LeafSize(root()->max_count());
+   }
+   return sizeof(*this) +
+          stats.leaf_nodes * node_type::LeafSize() +
+          stats.internal_nodes * node_type::InternalSize();
+ }
+
+ // An estimate of the average number of bytes used per value stored in the btree.
+ static double average_bytes_per_value() {
+ // Returns the number of bytes per value on a full-size leaf node that is 75%
+ // full. Experimentally, this matches up nicely with the computed number of
+ // bytes per value in trees that had their values inserted in random order.
+ return node_type::LeafSize() / (kNodeValues * 0.75);
+ }
+
+ // The fullness of the btree. Computed as the number of elements in the btree
+ // divided by the maximum number of elements a tree with the current number
+ // of nodes could hold. A value of 1 indicates perfect space
+ // utilization. Smaller values indicate space wastage.
+ // Returns 0 for empty trees.
+ double fullness() const {
+ if (empty()) return 0.0;
+ return static_cast<double>(size()) / (nodes() * kNodeValues);
+ }
+ // The overhead of the btree structure in bytes per element. Computed as the
+ // total number of bytes used by the btree minus the number of bytes used for
+ // storing elements, divided by the number of elements.
+ // Returns 0 for empty trees.
+ double overhead() const {
+ if (empty()) return 0.0;
+ return (bytes_used() - size() * sizeof(value_type)) /
+ static_cast<double>(size());
+ }
+
+ // Returns a copy of the allocator used by the btree.
+ allocator_type get_allocator() const {
+ return allocator();
+ }
+
+ private:
+ // Internal accessor routines. The root pointer lives in root_.data; the comparator is a base class of root_.
+ node_type *root() { return root_.data; }
+ const node_type *root() const { return root_.data; }
+ node_type *&mutable_root() { return root_.data; }
+ key_compare *mutable_key_comp() noexcept {
+ return static_cast<key_compare*>(&root_);
+ }
+
+ node_type* rightmost() {
+ return rightmost_;
+ }
+ const node_type* rightmost() const {
+ return rightmost_;
+ }
+ // The leftmost node is stored as the parent of the root node. Yields nullptr
+ // if the root pointer is null.
+ node_type* leftmost() { return root() ? root()->parent() : nullptr; }
+ const node_type* leftmost() const { return root() ? root()->parent() : nullptr; }
+
+ // The number of values is kept in the size_ member of the tree; this returns
+ // a pointer to it for in-place updates.
+ size_type* mutable_size() { return &size_; }
+
+ // Allocator routines. The allocator is stored as a base class of root_ (see empty_base_handle).
+ allocator_type* mutable_allocator() noexcept {
+ return static_cast<allocator_type*>(&root_);
+ }
+ const allocator_type& allocator() const noexcept {
+ return *static_cast<const allocator_type*>(&root_);
+ }
+
+ // Obtains `size` bytes of node-aligned storage from the tree's allocator.
+ // The node header is not initialized here; see init_leaf/init_internal.
+ node_type *allocate(const size_type size) {
+   using node_alloc =
+       AlignedAlloc<node_type::Alignment(), allocator_type>;
+   void *const raw = node_alloc::allocate(mutable_allocator(), size);
+   return static_cast<node_type*>(raw);
+ }
+
+ // Node creation routines: allocate storage of the right size, then
+ // initialize the node header.
+ //
+ // New, empty internal node with the given parent.
+ node_type* new_internal_node(node_type *parent) {
+   return node_type::init_internal(allocate(node_type::InternalSize()),
+                                   parent);
+ }
+ // New, empty full-capacity leaf node with the given parent.
+ node_type* new_leaf_node(node_type *parent) {
+   return node_type::init_leaf(allocate(node_type::LeafSize()), parent,
+                               kNodeValues);
+ }
+ // New, empty leaf node with room for max_count values that is its own
+ // parent.
+ node_type *new_leaf_root_node(const int max_count) {
+   node_type *const fresh = allocate(node_type::LeafSize(max_count));
+   return node_type::init_leaf(fresh, fresh, max_count);
+ }
+
+ // Deletion helper routines.
+ void erase_same_node(iterator begin, iterator end);
+ iterator erase_from_leaf_node(iterator begin, size_type to_erase);
+ iterator rebalance_after_delete(iterator iter);
+
+ // Returns a node's storage to the allocator. `size` is the node's size in
+ // bytes, as it was passed to allocate().
+ void deallocate(const size_type size, node_type *node) {
+   using node_alloc =
+       AlignedAlloc<node_type::Alignment(), allocator_type>;
+   allocator_type *const alloc = mutable_allocator();
+   node_alloc::deallocate(alloc, node, size);
+ }
+
+ // Destroys the values held by an internal node and frees its storage.
+ // Children are not touched.
+ void delete_internal_node(node_type *node) {
+   allocator_type *const alloc = mutable_allocator();
+   node->destroy(alloc);
+   deallocate(node_type::InternalSize(), node);
+ }
+ // Destroys the values held by a leaf node and frees its storage, whose
+ // size depends on the leaf's max_count.
+ void delete_leaf_node(node_type *node) {
+   allocator_type *const alloc = mutable_allocator();
+   node->destroy(alloc);
+   deallocate(node_type::LeafSize(node->max_count()), node);
+ }
+
+ // Rebalances or splits the node iter points to.
+ void rebalance_or_split(iterator *iter);
+
+ // Merges the values of left, right and the delimiting key on their parent
+ // onto left, removing the delimiting key and deleting right.
+ void merge_nodes(node_type *left, node_type *right);
+
+ // Tries to merge node with its left or right sibling, and failing that,
+ // rebalance with its left or right sibling. Returns true if a merge
+ // occurred, at which point it is no longer valid to access node. Returns
+ // false if no merging took place.
+ bool try_merge_or_rebalance(iterator *iter);
+
+ // Tries to shrink the height of the tree by 1.
+ void try_shrink();
+
+ // Maps the "not found" result of the internal lookup routines (an iterator
+ // with a null node) to end(); any other iterator is returned unchanged.
+ iterator internal_end(iterator iter) {
+   if (iter.node == nullptr) {
+     return end();
+   }
+   return iter;
+ }
+ const_iterator internal_end(const_iterator iter) const {
+   if (iter.node == nullptr) {
+     return end();
+   }
+   return iter;
+ }
+
+ // Emplaces a value into the btree immediately before iter. Requires that
+ // key(v) <= iter.key() and (--iter).key() <= key(v).
+ template <typename... Args>
+ iterator internal_emplace(iterator iter, Args &&... args);
+
+ // Returns an iterator pointing to the first value >= the value "iter" is
+ // pointing at. Note that "iter" might be pointing to an invalid location as
+ // iter.position == iter.node->count(). This routine simply moves iter up in
+ // the tree to a valid location.
+ // Requires: iter.node is non-null.
+ template <typename IterType>
+ static IterType internal_last(IterType iter);
+
+ // Returns an iterator pointing to the leaf position at which key would
+ // reside in the tree. We provide 2 versions of internal_locate. The first
+ // version uses a less-than comparator and is incapable of distinguishing when
+ // there is an exact match. The second version is for the key-compare-to
+ // specialization and distinguishes exact matches. The key-compare-to
+ // specialization allows the caller to avoid a subsequent comparison to
+ // determine if an exact match was made, which is important for keys with
+ // expensive comparison, such as strings.
+ template <typename K>
+ SearchResult<iterator, is_key_compare_to::value> internal_locate(
+ const K &key) const;
+
+ template <typename K>
+ SearchResult<iterator, false> internal_locate_impl(
+ const K &key, std::false_type /* IsCompareTo */) const;
+
+ template <typename K>
+ SearchResult<iterator, true> internal_locate_impl(
+ const K &key, std::true_type /* IsCompareTo */) const;
+
+ // Internal routine which implements lower_bound().
+ template <typename K>
+ iterator internal_lower_bound(const K &key) const;
+
+ // Internal routine which implements upper_bound().
+ template <typename K>
+ iterator internal_upper_bound(const K &key) const;
+
+ // Internal routine which implements find().
+ template <typename K>
+ iterator internal_find(const K &key) const;
+
+ // Deletes a node and all of its children.
+ void internal_clear(node_type *node);
+
+ // Verifies the tree structure of node.
+ int internal_verify(const node_type *node,
+ const key_type *lo, const key_type *hi) const;
+
+  // Recursively accumulates node_stats for the subtree rooted at `node`:
+  // node_stats(0, 0) for a null node or the root of an empty tree,
+  // node_stats(1, 0) for a leaf, and node_stats(0, 1) plus the stats of every
+  // child for an internal node.
+  node_stats internal_stats(const node_type *node) const {
+    // The root can be a static empty node.
+    const bool nothing_to_count =
+        node == nullptr || (node == root() && empty());
+    if (nothing_to_count) {
+      return node_stats(0, 0);
+    }
+    if (node->leaf()) {
+      return node_stats(1, 0);
+    }
+    node_stats total(0, 1);
+    // An internal node with count() values has count() + 1 children.
+    for (int child_idx = 0; child_idx <= node->count(); ++child_idx) {
+      total += internal_stats(node->child(child_idx));
+    }
+    return total;
+  }
+
+ private:
+ empty_base_handle<key_compare, allocator_type, node_type*> root_;
+
+ // A pointer to the rightmost node. Note that the leftmost node is stored as
+ // the root's parent.
+ node_type *rightmost_;
+
+ // Number of values.
+ size_type size_;
+};
+
+////
+// btree_node methods
+template <typename P>  // emplace_value: constructs a value from `args` at index i of this node.
+template <typename... Args>
+inline void btree_node<P>::emplace_value(const size_type i,
+ allocator_type *alloc,
+ Args &&... args) {
+ assert(i <= count());
+ // When i is not the end, open a gap at i: construct slot count() from the
+ // last value, move [i, count() - 1) up by one slot, then destroy old slot i.
+ if (i < count()) {
+ value_init(count(), alloc, slot(count() - 1));
+ std::copy_backward(std::make_move_iterator(slot(i)),
+ std::make_move_iterator(slot(count() - 1)),
+ slot(count()));
+ value_destroy(i, alloc);
+ }
+ value_init(i, alloc, std::forward<Args>(args)...);
+ set_count(count() + 1);
+
+ if (!leaf() && count() > i + 1) {  // internal node: shift the child pointers after i + 1 right by one
+ for (int j = count(); j > i + 1; --j) {
+ set_child(j, child(j - 1));
+ }
+ clear_child(i + 1);  // child slot right of the new value is cleared (split() fills it via init_child)
+ }
+}
+
+// Removes the value at index i. On an internal node the child to the right of
+// that value (asserted to be empty) is dropped first by shifting the later
+// child pointers left by one.
+template <typename P>
+inline void btree_node<P>::remove_value(const int i, allocator_type *alloc) {
+  const bool must_shift_children = !leaf() && count() > i + 1;
+  if (must_shift_children) {
+    assert(child(i + 1)->count() == 0);
+    size_type dst = i + 1;
+    while (dst < count()) {
+      set_child(dst, child(dst + 1));
+      ++dst;
+    }
+    clear_child(count());
+  }
+
+  // The child pointers are already fixed up; now drop the value itself.
+  remove_values_ignore_children(i, /*to_erase=*/1, alloc);
+}
+
+// Erases `to_erase` values starting at index i without touching any child
+// pointers: the tail is move-assigned down over the erased range, the vacated
+// trailing slots are destroyed, and the count is reduced.
+template <typename P>
+inline void btree_node<P>::remove_values_ignore_children(
+    const int i, const int to_erase, allocator_type *alloc) {
+  assert(to_erase >= 0);
+  auto tail_first = std::make_move_iterator(slot(i + to_erase));
+  auto tail_last = std::make_move_iterator(slot(count()));
+  std::copy(tail_first, tail_last, slot(i));
+  // The last `to_erase` slots are now moved-from leftovers.
+  value_destroy_n(count() - to_erase, to_erase, alloc);
+  set_count(count() - to_erase);
+}
+
+template <typename P>  // Moves to_move values from `right` onto the end of this (left) node, rotating through the parent.
+void btree_node<P>::rebalance_right_to_left(const int to_move,
+ btree_node *right,
+ allocator_type *alloc) {
+ assert(parent() == right->parent());
+ assert(position() + 1 == right->position());  // `right` must be the immediate right sibling
+ assert(right->count() >= count());
+ assert(to_move >= 1);
+ assert(to_move <= right->count());
+
+ // 1) Move the delimiting value in the parent to the left node.
+ value_init(count(), alloc, parent()->slot(position()));
+
+ // 2) Move the (to_move - 1) values from the right node to the left node.
+ right->uninitialized_move_n(to_move - 1, 0, count() + 1, this, alloc);
+
+ // 3) Move the new delimiting value to the parent from the right node.
+ params_type::move(alloc, right->slot(to_move - 1),
+ parent()->slot(position()));
+
+ // 4) Shift the values in the right node to their correct position.
+ std::copy(std::make_move_iterator(right->slot(to_move)),
+ std::make_move_iterator(right->slot(right->count())),
+ right->slot(0));
+
+ // 5) Destroy the now-empty to_move entries in the right node.
+ right->value_destroy_n(right->count() - to_move, to_move, alloc);
+
+ if (!leaf()) {
+ // Move the child pointers from the right to the left node.
+ for (int i = 0; i < to_move; ++i) {
+ init_child(count() + i + 1, right->child(i));
+ }
+ for (int i = 0; i <= right->count() - to_move; ++i) {  // both counts still hold their pre-move values here
+ assert(i + to_move <= right->max_count());
+ right->init_child(i, right->child(i + to_move));
+ right->clear_child(i + to_move);
+ }
+ }
+
+ // Fixup the counts on the left and right nodes.
+ set_count(count() + to_move);
+ right->set_count(right->count() - to_move);
+}
+
+template <typename P>  // Moves to_move values from the end of this (left) node to the front of `right`, rotating through the parent.
+void btree_node<P>::rebalance_left_to_right(const int to_move,
+ btree_node *right,
+ allocator_type *alloc) {
+ assert(parent() == right->parent());
+ assert(position() + 1 == right->position());  // `right` must be the immediate right sibling
+ assert(count() >= right->count());
+ assert(to_move >= 1);
+ assert(to_move <= count());
+
+ // Values in the right node are shifted to the right to make room for the
+ // new to_move values. Then, the delimiting value in the parent and the
+ // other (to_move - 1) values in the left node are moved into the right node.
+ // Lastly, a new delimiting value is moved from the left node into the
+ // parent, and the remaining empty left node entries are destroyed.
+
+ if (right->count() >= to_move) {
+ // The original location of the right->count() values are sufficient to hold
+ // the new to_move entries from the parent and left node.
+
+ // 1) Shift existing values in the right node to their correct positions.
+ right->uninitialized_move_n(to_move, right->count() - to_move,
+ right->count(), right, alloc);
+ std::copy_backward(std::make_move_iterator(right->slot(0)),
+ std::make_move_iterator(right->slot(right->count() - to_move)),
+ right->slot(right->count()));
+
+ // 2) Move the delimiting value in the parent to the right node.
+ params_type::move(alloc, parent()->slot(position()),
+ right->slot(to_move - 1));
+
+ // 3) Move the (to_move - 1) values from the left node to the right node.
+ std::copy(std::make_move_iterator(slot(count() - (to_move - 1))),
+ std::make_move_iterator(slot(count())),
+ right->slot(0));
+ } else {
+ // The right node does not have enough initialized space to hold the new
+ // to_move entries, so part of them will move to uninitialized space.
+
+ // 1) Shift existing values in the right node to their correct positions.
+ right->uninitialized_move_n(right->count(), 0, to_move, right, alloc);
+
+ // 2) Move the delimiting value in the parent to the right node.
+ right->value_init(to_move - 1, alloc, parent()->slot(position()));
+
+ // 3) Move the (to_move - 1) values from the left node to the right node.
+ const size_type uninitialized_remaining = to_move - right->count() - 1;  // slots [right->count(), to_move - 1) of right
+ uninitialized_move_n(uninitialized_remaining,
+ count() - uninitialized_remaining, right->count(),
+ right, alloc);
+ std::copy(std::make_move_iterator(slot(count() - (to_move - 1))),
+ std::make_move_iterator(slot(count() - uninitialized_remaining)),
+ right->slot(0));
+ }
+
+ // 4) Move the new delimiting value to the parent from the left node.
+ params_type::move(alloc, slot(count() - to_move), parent()->slot(position()));
+
+ // 5) Destroy the now-empty to_move entries in the left node.
+ value_destroy_n(count() - to_move, to_move, alloc);
+
+ if (!leaf()) {
+ // Move the child pointers from the left to the right node.
+ for (int i = right->count(); i >= 0; --i) {  // walk backwards so no pointer is overwritten before it is moved
+ right->init_child(i + to_move, right->child(i));
+ right->clear_child(i);
+ }
+ for (int i = 1; i <= to_move; ++i) {
+ right->init_child(i - 1, child(count() - to_move + i));
+ clear_child(count() - to_move + i);
+ }
+ }
+
+ // Fixup the counts on the left and right nodes.
+ set_count(count() - to_move);
+ right->set_count(right->count() + to_move);
+}
+
+template <typename P>  // Splits this full node: upper values go to the empty node `dest`, the split key goes to the parent.
+void btree_node<P>::split(const int insert_position, btree_node *dest,
+ allocator_type *alloc) {
+ assert(dest->count() == 0);
+ assert(max_count() == kNodeValues);
+
+ // We bias the split based on the position being inserted. If we're
+ // inserting at the beginning of the left node then bias the split to put
+ // more values on the right node. If we're inserting at the end of the
+ // right node then bias the split to put more values on the left node.
+ if (insert_position == 0) {
+ dest->set_count(count() - 1);
+ } else if (insert_position == kNodeValues) {
+ dest->set_count(0);  // dest receives no existing values
+ } else {
+ dest->set_count(count() / 2);
+ }
+ set_count(count() - dest->count());
+ assert(count() >= 1);  // at least one value must remain to become the split key
+
+ // Move values from the left sibling to the right sibling.
+ uninitialized_move_n(dest->count(), count(), 0, dest, alloc);
+
+ // Destroy the now-empty entries in the left node.
+ value_destroy_n(count(), dest->count(), alloc);
+
+ // The split key is the largest value in the left sibling.
+ set_count(count() - 1);
+ parent()->emplace_value(position(), alloc, slot(count()));
+ value_destroy(count(), alloc);
+ parent()->init_child(position() + 1, dest);  // dest becomes the child right of the split key
+
+ if (!leaf()) {
+ for (int i = 0; i <= dest->count(); ++i) {  // hand the upper dest->count() + 1 children over to dest
+ assert(child(count() + i + 1) != nullptr);
+ dest->init_child(i, child(count() + i + 1));
+ clear_child(count() + i + 1);
+ }
+ }
+}
+
+template <typename P>  // Appends the parent's delimiting value and all of `src` (the right sibling) to this node.
+void btree_node<P>::merge(btree_node *src, allocator_type *alloc) {
+ assert(parent() == src->parent());
+ assert(position() + 1 == src->position());  // `src` must be the immediate right sibling
+
+ // Move the delimiting value to the left node.
+ value_init(count(), alloc, parent()->slot(position()));
+
+ // Move the values from the right to the left node.
+ src->uninitialized_move_n(src->count(), 0, count() + 1, this, alloc);
+
+ // Destroy the now-empty entries in the right node.
+ src->value_destroy_n(0, src->count(), alloc);
+
+ if (!leaf()) {
+ // Move the child pointers from the right to the left node.
+ for (int i = 0; i <= src->count(); ++i) {
+ init_child(count() + i + 1, src->child(i));
+ src->clear_child(i);
+ }
+ }
+
+ // Fixup the counts on the src and dest nodes.
+ set_count(1 + count() + src->count());  // +1 for the delimiting value taken from the parent
+ src->set_count(0);
+
+ // Remove the value on the parent node.
+ parent()->remove_value(position(), alloc);  // src is now empty, as remove_value() asserts for the dropped child
+}
+
+template <typename P>  // Exchanges values, child pointers and counts between this node and x (both leaf or both internal).
+void btree_node<P>::swap(btree_node *x, allocator_type *alloc) {
+ using std::swap;
+ assert(leaf() == x->leaf());
+
+ // Determine which is the smaller/larger node.
+ btree_node *smaller = this, *larger = x;
+ if (smaller->count() > larger->count()) {
+ swap(smaller, larger);
+ }
+
+ // Swap the values.
+ std::swap_ranges(smaller->slot(0), smaller->slot(smaller->count()),
+ larger->slot(0));
+
+ // Move values that can't be swapped.
+ const size_type to_move = larger->count() - smaller->count();
+ larger->uninitialized_move_n(to_move, smaller->count(), smaller->count(),
+ smaller, alloc);
+ larger->value_destroy_n(smaller->count(), to_move, alloc);
+
+ if (!leaf()) {
+ // Swap the child pointers.
+ std::swap_ranges(&smaller->mutable_child(0),
+ &smaller->mutable_child(smaller->count() + 1),
+ &larger->mutable_child(0));
+ // Update swapped children's parent pointers.
+ int i = 0;
+ for (; i <= smaller->count(); ++i) {
+ smaller->child(i)->set_parent(smaller);
+ larger->child(i)->set_parent(larger);
+ }
+ // Move the child pointers that couldn't be swapped.
+ for (; i <= larger->count(); ++i) {  // i continues from where the previous loop stopped
+ smaller->init_child(i, larger->child(i));
+ larger->clear_child(i);
+ }
+ }
+
+ // Swap the counts.
+ swap(mutable_count(), x->mutable_count());  // counts are exchanged last; the code above uses the old counts
+}
+
+////
+// btree_iterator methods
+// Out-of-line part of iterator increment. On an internal node, steps down to
+// the first value of the leftmost leaf under child(position + 1). On a leaf
+// whose position has run off the end, climbs towards the root until a parent
+// position holding a value is found; if none exists the iterator is restored
+// to its state on entry.
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_slow() {
+  if (!node->leaf()) {
+    assert(position < node->count());
+    node = node->child(position + 1);
+    while (!node->leaf()) {
+      node = node->child(0);
+    }
+    position = 0;
+    return;
+  }
+
+  assert(position >= node->count());
+  const btree_iterator on_entry(*this);
+  while (position == node->count() && !node->is_root()) {
+    assert(node->parent()->child(node->position()) == node);
+    position = node->position();
+    node = node->parent();
+  }
+  if (position == node->count()) {
+    // Reached the root without finding a next value: undo the climb.
+    *this = on_entry;
+  }
+}
+
+// Out-of-line part of iterator decrement. On an internal node, steps down to
+// the last value of the rightmost leaf under child(position). On a leaf whose
+// position has gone below zero, climbs towards the root until a parent
+// position holding a value is found; if none exists the iterator is restored
+// to its state on entry.
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::decrement_slow() {
+  if (!node->leaf()) {
+    assert(position >= 0);
+    node = node->child(position);
+    while (!node->leaf()) {
+      node = node->child(node->count());
+    }
+    position = node->count() - 1;
+    return;
+  }
+
+  assert(position <= -1);
+  const btree_iterator on_entry(*this);
+  while (position < 0 && !node->is_root()) {
+    assert(node->parent()->child(node->position()) == node);
+    position = node->position() - 1;
+    node = node->parent();
+  }
+  if (position < 0) {
+    // Reached the root without finding a previous value: undo the climb.
+    *this = on_entry;
+  }
+}
+
+////
+// btree methods
+// Fills this (empty) tree with the values of *x, visiting them in x's order.
+// Each value is passed through maybe_move_from_iterator(), and Btree may be
+// either `btree` or `const btree`.
+template <typename P>
+template <typename Btree>
+void btree<P>::copy_or_move_values_in_order(Btree *x) {
+  static_assert(std::is_same_v<btree, Btree>||
+                std::is_same_v<const btree, Btree>,
+                "Btree type must be same or const.");
+  assert(empty());
+
+  auto src = x->begin();
+  if (src == x->end()) {
+    return;
+  }
+  // The first value goes through the regular insertion path.
+  insert_multi(maybe_move_from_iterator(src));
+  // The source is already ordered, so every later value belongs at the end
+  // of this tree and no key comparisons are needed.
+  for (++src; src != x->end(); ++src) {
+    internal_emplace(end(), maybe_move_from_iterator(src));
+  }
+}
+
+template <typename P>  // Compile-time checks on the tree's parameters; always returns true.
+constexpr bool btree<P>::static_assert_validation() {
+ static_assert(std::is_nothrow_copy_constructible_v<key_compare>,
+ "Key comparison must be nothrow copy constructible");
+ static_assert(std::is_nothrow_copy_constructible_v<allocator_type>,
+ "Allocator must be nothrow copy constructible");
+ static_assert(std::is_trivially_copyable_v<iterator>,
+ "iterator not trivially copyable.");
+
+ // Note: We assert that kNodeValues is smaller than 2^(number of bits in
+ // node_type::field_type), i.e. that it is representable in that type.
+ static_assert(
+ kNodeValues < (1 << (8 * sizeof(typename node_type::field_type))),
+ "target node size too large");
+
+ // Verify that key_compare returns bool or a signed (compare-to style) type.
+ using compare_result_type =
+ std::invoke_result_t<key_compare, key_type, key_type>;
+ static_assert(
+ std::is_same_v<compare_result_type, bool> ||
+ std::is_signed_v<compare_result_type>,
+ "key comparison function must return a signed value or "
+ "bool.");
+
+ // Test the assumption made in setting kNodeValueSpace.
+ static_assert(node_type::MinimumOverhead() >= sizeof(void *) + 4,
+ "node space assumption incorrect");
+
+ return true;
+}
+
+template <typename P>  // Constructs an empty tree: root and rightmost both refer to EmptyNode(), size is 0.
+btree<P>::btree(const key_compare &comp, const allocator_type &alloc)
+ : root_(comp, alloc, EmptyNode()), rightmost_(EmptyNode()), size_(0) {}
+
+template <typename P>  // Copy constructor: starts empty with x's comparator and allocator, then copies x's values in order.
+btree<P>::btree(const btree &x) : btree(x.key_comp(), x.allocator()) {
+ copy_or_move_values_in_order(&x);  // x is const here, so Btree is deduced as `const btree`
+}
+
+template <typename P>  // insert_unique: emplaces a value built from `args` unless `key` is already present.
+template <typename... Args>
+auto btree<P>::insert_unique(const key_type &key, Args &&... args)
+ -> std::pair<iterator, bool> {
+ if (empty()) {
+ mutable_root() = rightmost_ = new_leaf_root_node(1);  // first insertion: start with a leaf root created via new_leaf_root_node(1)
+ }
+
+ auto res = internal_locate(key);
+ iterator &iter = res.value;
+
+ if constexpr (res.has_match) {  // compare-to comparator: the search result already says whether key matched
+ if (res.IsEq()) {
+ // The key already exists in the tree, do nothing.
+ return {iter, false};
+ }
+ } else {  // less-than comparator: one more comparison against the value at or after iter is needed
+ iterator last = internal_last(iter);
+ if (last.node && !compare_keys(key, last.key())) {
+ // The key already exists in the tree, do nothing.
+ return {last, false};
+ }
+ }
+ return {internal_emplace(iter, std::forward<Args>(args)...), true};
+}
+
+template <typename P>  // insert_hint_unique: like insert_unique(), but first tries to insert next to the hint `position`.
+template <typename... Args>
+inline auto btree<P>::insert_hint_unique(iterator position, const key_type &key,
+ Args &&... args)
+ -> std::pair<iterator, bool> {
+ if (!empty()) {
+ if (position == end() || compare_keys(key, position.key())) {  // key sorts before the hint
+ iterator prev = position;
+ if (position == begin() || compare_keys((--prev).key(), key)) {
+ // prev.key() < key < position.key()
+ return {internal_emplace(position, std::forward<Args>(args)...), true};
+ }
+ } else if (compare_keys(position.key(), key)) {  // key sorts after the hint
+ ++position;
+ if (position == end() || compare_keys(key, position.key())) {
+ // {original `position`}.key() < key < {current `position`}.key()
+ return {internal_emplace(position, std::forward<Args>(args)...), true};
+ }
+ } else {
+ // position.key() == key
+ return {position, false};
+ }
+ }
+ return insert_unique(key, std::forward<Args>(args)...);  // hint was not usable: fall back to a full lookup
+}
+
+// Inserts every element of [b, e) with unique-key semantics, passing end()
+// as the position hint for each element.
+template <typename P>
+template <typename InputIterator>
+void btree<P>::insert_iterator_unique(InputIterator b, InputIterator e) {
+  while (b != e) {
+    insert_hint_unique(end(), params_type::key(*b), *b);
+    ++b;
+  }
+}
+
+// Inserts `v` with multi-key semantics, placing it at the upper bound of
+// `key`, and returns an iterator to the new value.
+template <typename P>
+template <typename ValueType>
+auto btree<P>::insert_multi(const key_type &key, ValueType&& v) -> iterator {
+  if (empty()) {
+    // First insertion: start from a leaf root created via new_leaf_root_node(1).
+    rightmost_ = new_leaf_root_node(1);
+    mutable_root() = rightmost_;
+  }
+
+  // A null-node result from the upper-bound search is replaced with end().
+  iterator where = internal_end(internal_upper_bound(key));
+  return internal_emplace(where, std::forward<ValueType>(v));
+}
+
+template <typename P>  // insert_hint_multi: like insert_multi(), but first tries to insert next to the hint `position`.
+template <typename ValueType>
+auto btree<P>::insert_hint_multi(iterator position, ValueType &&v) -> iterator {
+ if (!empty()) {
+ const key_type &key = params_type::key(v);
+ if (position == end() || !compare_keys(position.key(), key)) {  // key <= hint's key
+ iterator prev = position;
+ if (position == begin() || !compare_keys(key, (--prev).key())) {
+ // prev.key() <= key <= position.key()
+ return internal_emplace(position, std::forward<ValueType>(v));
+ }
+ } else {  // hint's key < key: try the slot just after the hint
+ iterator next = position;
+ ++next;
+ if (next == end() || !compare_keys(next.key(), key)) {
+ // position.key() < key <= next.key()
+ return internal_emplace(next, std::forward<ValueType>(v));
+ }
+ }
+ }
+ return insert_multi(std::forward<ValueType>(v));  // hint was not usable: fall back to a full lookup
+}
+
+// Inserts every element of [b, e) with multi-key semantics, passing end() as
+// the position hint for each element.
+template <typename P>
+template <typename InputIterator>
+void btree<P>::insert_iterator_multi(InputIterator b, InputIterator e) {
+  while (b != e) {
+    insert_hint_multi(end(), *b);
+    ++b;
+  }
+}
+
+// Copy assignment. Clears this tree, adopts x's key comparator (and x's
+// allocator when the allocator propagates on copy assignment), then copies
+// x's values in order. Self-assignment is a no-op.
+template <typename P>
+auto btree<P>::operator=(const btree &x) -> btree & {
+  if (this == &x) {
+    return *this;
+  }
+
+  clear();
+
+  *mutable_key_comp() = x.key_comp();
+  using alloc_traits = std::allocator_traits<allocator_type>;
+  if constexpr (alloc_traits::propagate_on_container_copy_assignment::value) {
+    *mutable_allocator() = x.allocator();
+  }
+
+  copy_or_move_values_in_order(&x);
+  return *this;
+}
+
+// Move assignment. Clears this tree and then takes over x's contents.
+//
+// Which path is taken depends on the allocator's
+// propagate_on_container_move_assignment trait (the trait the standard
+// defines for move assignment; the copy-assignment trait does not apply):
+//  * propagating allocator: swap everything, including the allocator and
+//    the key comparator stored inside `root_`;
+//  * non-propagating but equal allocators: swap the root pointer, key
+//    comparator, rightmost pointer and size, keeping each allocator in place;
+//  * non-propagating and unequal allocators: x's memory cannot be adopted,
+//    so x's values are moved over one by one.
+// Self-assignment is a no-op.
+template <typename P>
+auto btree<P>::operator=(btree &&x) noexcept -> btree & {
+  if (this != &x) {
+    clear();
+
+    using std::swap;
+    if constexpr (std::allocator_traits<
+        allocator_type>::propagate_on_container_move_assignment::value) {
+      // Note: `root_` also contains the allocator and the key comparator.
+      swap(root_, x.root_);
+      swap(rightmost_, x.rightmost_);
+      swap(size_, x.size_);
+    } else {
+      if (allocator() == x.allocator()) {
+        swap(mutable_root(), x.mutable_root());
+        swap(*mutable_key_comp(), *x.mutable_key_comp());
+        swap(rightmost_, x.rightmost_);
+        swap(size_, x.size_);
+      } else {
+        // We aren't allowed to propagate the allocator and the allocator is
+        // different so we can't take over its memory. We must move each element
+        // individually. We need both `x` and `this` to have `x`s key comparator
+        // while moving the values so we can't swap the key comparators.
+        *mutable_key_comp() = x.key_comp();
+        copy_or_move_values_in_order(&x);
+      }
+    }
+  }
+  return *this;
+}
+
+template <typename P>  // erase: removes the value at `iter` and returns an iterator to the value that followed it.
+auto btree<P>::erase(iterator iter) -> iterator {
+ bool internal_delete = false;
+ if (!iter.node->leaf()) {
+ // Deletion of a value on an internal node. First, move the largest value
+ // from our left child here, then delete that position (in remove_value()
+ // below). We can get to the largest value from our left child by
+ // decrementing iter.
+ iterator internal_iter(iter);
+ --iter;
+ assert(iter.node->leaf());
+ params_type::move(mutable_allocator(), iter.node->slot(iter.position),
+ internal_iter.node->slot(internal_iter.position));
+ internal_delete = true;
+ }
+
+ // Delete the key from the leaf.
+ iter.node->remove_value(iter.position, mutable_allocator());
+ --size_;
+
+ // We want to return the next value after the one we just erased. If we
+ // erased from an internal node (internal_delete == true), then the next
+ // value is ++(++iter). If we erased from a leaf node (internal_delete ==
+ // false) then the next value is ++iter. Note that ++iter may point to an
+ // internal node and the value in the internal node may move to a leaf node
+ // (iter.node) when rebalancing is performed at the leaf level.
+
+ iterator res = rebalance_after_delete(iter);
+
+ // If we erased from an internal node, advance the iterator.
+ if (internal_delete) {
+ ++res;
+ }
+ return res;
+}
+
+template <typename P>  // Restores node fill levels after values were removed at `iter`; returns the adjusted iterator.
+auto btree<P>::rebalance_after_delete(iterator iter) -> iterator {
+ // Merge/rebalance as we walk back up the tree.
+ iterator res(iter);
+ bool first_iteration = true;
+ for (;;) {
+ if (iter.node == root()) {
+ try_shrink();
+ if (empty()) {
+ return end();
+ }
+ break;
+ }
+ if (iter.node->count() >= kMinNodeValues) {  // node is full enough: nothing more to fix
+ break;
+ }
+ bool merged = try_merge_or_rebalance(&iter);
+ // On the first iteration, we should update `res` with `iter` because `res`
+ // may have been invalidated.
+ if (first_iteration) {
+ res = iter;
+ first_iteration = false;
+ }
+ if (!merged) {
+ break;
+ }
+ iter.position = iter.node->position();  // a merge took a value from the parent: re-check one level up
+ iter.node = iter.node->parent();
+ }
+
+ // Adjust our return value. If we're pointing at the end of a node, advance
+ // the iterator.
+ if (res.position == res.node->count()) {
+ res.position = res.node->count() - 1;
+ ++res;
+ }
+
+ return res;
+}
+
+// Erases the values in [begin, end). Returns the number of values erased and
+// an iterator to the value that followed the erased range.
+template <typename P>
+auto btree<P>::erase(iterator begin, iterator end)
+    -> std::pair<size_type, iterator> {
+  const difference_type distance = std::distance(begin, end);
+  assert(distance >= 0);
+  // Work with an unsigned count from here on; it is only compared against
+  // and subtracted from the unsigned size_.
+  const size_type count = static_cast<size_type>(distance);
+
+  if (count == 0) {
+    return {0, begin};
+  }
+
+  // Erasing everything is just a clear().
+  if (count == size_) {
+    clear();
+    return {count, this->end()};
+  }
+
+  // Range confined to a single node: erase it in one step.
+  if (begin.node == end.node) {
+    erase_same_node(begin, end);
+    size_ -= count;
+    return {count, rebalance_after_delete(begin)};
+  }
+
+  // General case: erase leaf runs in bulk and internal-node values one at a
+  // time until the tree has shrunk by `count` values.
+  const size_type target_size = size_ - count;
+  while (size_ > target_size) {
+    if (!begin.node->leaf()) {
+      begin = erase(begin);
+      continue;
+    }
+    const size_type remaining_to_erase = size_ - target_size;
+    const size_type remaining_in_node = begin.node->count() - begin.position;
+    begin = erase_from_leaf_node(
+        begin, std::min(remaining_to_erase, remaining_in_node));
+  }
+  return {count, begin};
+}
+
+template <typename P>  // Erases [begin, end) where both iterators are on the same node; does not adjust size_ or rebalance.
+void btree<P>::erase_same_node(iterator begin, iterator end) {
+ assert(begin.node == end.node);
+ assert(end.position > begin.position);
+
+ node_type *node = begin.node;
+ size_type to_erase = end.position - begin.position;
+ if (!node->leaf()) {
+ // Delete all children between begin and end.
+ for (size_type i = 0; i < to_erase; ++i) {
+ internal_clear(node->child(begin.position + i + 1));  // the child to the right of each erased value
+ }
+ // Rotate children after end into new positions.
+ for (size_type i = begin.position + to_erase + 1; i <= node->count(); ++i) {
+ node->set_child(i - to_erase, node->child(i));
+ node->clear_child(i);
+ }
+ }
+ node->remove_values_ignore_children(begin.position, to_erase,
+ mutable_allocator());  // child pointers were already handled above
+
+ // Do not need to update rightmost_, because
+ // * either end == this->end(), and therefore node == rightmost_, and still
+ // exists
+ // * or end != this->end(), and therefore rightmost_ hasn't been erased, since
+ // it wasn't covered in [begin, end)
+}
+
+// Erases `to_erase` consecutive values from the leaf that `begin` points
+// into, starting at begin.position, updates size_, and rebalances. Returns
+// the iterator produced by rebalance_after_delete().
+template <typename P>
+auto btree<P>::erase_from_leaf_node(iterator begin, size_type to_erase)
+    -> iterator {
+  node_type *const leaf_node = begin.node;
+  assert(leaf_node->leaf());
+  assert(leaf_node->count() > begin.position);
+  assert(begin.position + to_erase <= leaf_node->count());
+
+  // A leaf has no children, so the child-ignoring removal is sufficient.
+  leaf_node->remove_values_ignore_children(begin.position, to_erase,
+                                           mutable_allocator());
+  size_ -= to_erase;
+
+  return rebalance_after_delete(begin);
+}
+
+// Erases the value matching `key`, if there is one. Returns the number of
+// values erased (0 or 1).
+template <typename P>
+template <typename K>
+auto btree<P>::erase_unique(const K &key) -> size_type {
+  const iterator found = internal_find(key);
+  if (found.node != nullptr) {
+    erase(found);
+    return 1;
+  }
+  // The key doesn't exist in the tree, return nothing done.
+  return 0;
+}
+
+// Erases every value in [lower_bound(key), upper_bound(key)) and returns how
+// many values were erased.
+template <typename P>
+template <typename K>
+auto btree<P>::erase_multi(const K &key) -> size_type {
+  const iterator first = internal_lower_bound(key);
+  if (first.node == nullptr) {
+    // The key doesn't exist in the tree, return nothing done.
+    return 0;
+  }
+  // A null-node upper bound is replaced with end() before erasing the range.
+  const iterator last = internal_end(internal_upper_bound(key));
+  return erase(first, last).first;
+}
+
+// Removes all values: deletes every node of a non-empty tree, then resets
+// the root and rightmost pointers to EmptyNode() and the size to zero.
+template <typename P>
+void btree<P>::clear() {
+  if (!empty()) {
+    internal_clear(root());
+  }
+  size_ = 0;
+  rightmost_ = EmptyNode();
+  mutable_root() = EmptyNode();
+}
+
+// Exchanges the contents of this tree and x.
+//
+// The allocator's propagate_on_container_swap trait is a compile-time
+// constant, so the branch is selected with `if constexpr` (as the assignment
+// operators do) and only the chosen branch is instantiated:
+//  * propagating allocator: swap `root_` wholesale, which also swaps the
+//    allocator and the key comparator stored in it;
+//  * otherwise: swap only the root pointer and the key comparator; the
+//    allocators must already compare equal.
+// The rightmost pointer and the size are swapped in both cases.
+template <typename P>
+void btree<P>::swap(btree &x) {
+  using std::swap;
+  if constexpr (std::allocator_traits<
+      allocator_type>::propagate_on_container_swap::value) {
+    // Note: `root_` also contains the allocator and the key comparator.
+    swap(root_, x.root_);
+  } else {
+    // It's undefined behavior if the allocators are unequal here.
+    assert(allocator() == x.allocator());
+    swap(mutable_root(), x.mutable_root());
+    swap(*mutable_key_comp(), *x.mutable_key_comp());
+  }
+  swap(rightmost_, x.rightmost_);
+  swap(size_, x.size_);
+}
+
+template <typename P>  // Debug self-check of the tree's invariants; every check is an assert().
+void btree<P>::verify() const {
+ assert(root() != nullptr);
+ assert(leftmost() != nullptr);
+ assert(rightmost_ != nullptr);
+ assert(empty() || size() == internal_verify(root(), nullptr, nullptr));  // internal_verify's result must equal size()
+ assert(leftmost() == (++const_iterator(root(), -1)).node);  // stepping forward from before the root's first slot reaches the leftmost node
+ assert(rightmost_ == (--const_iterator(root(), root()->count())).node);  // stepping back from past the root's last slot reaches the rightmost node
+ assert(leftmost()->leaf());
+ assert(rightmost_->leaf());
+}
+
+template <typename P>  // Makes room in the full node *iter points into, by shifting values to a sibling or by splitting; updates *iter.
+void btree<P>::rebalance_or_split(iterator *iter) {
+ node_type *&node = iter->node;  // references: writes below update the caller's iterator
+ int &insert_position = iter->position;
+ assert(node->count() == node->max_count());
+ assert(kNodeValues == node->max_count());
+
+ // First try to make room on the node by rebalancing.
+ node_type *parent = node->parent();
+ if (node != root()) {
+ if (node->position() > 0) {
+ // Try rebalancing with our left sibling.
+ node_type *left = parent->child(node->position() - 1);
+ assert(left->max_count() == kNodeValues);
+ if (left->count() < kNodeValues) {
+ // We bias rebalancing based on the position being inserted. If we're
+ // inserting at the end of the right node then we bias rebalancing to
+ // fill up the left node.
+ int to_move = (kNodeValues - left->count()) /
+ (1 + (insert_position < kNodeValues));
+ to_move = std::max(1, to_move);
+
+ if (((insert_position - to_move) >= 0) ||
+ ((left->count() + to_move) < kNodeValues)) {
+ left->rebalance_right_to_left(to_move, node, mutable_allocator());
+
+ assert(node->max_count() - node->count() == to_move);
+ insert_position = insert_position - to_move;
+ if (insert_position < 0) {  // the insertion point moved into the left sibling
+ insert_position = insert_position + left->count() + 1;
+ node = left;
+ }
+
+ assert(node->count() < node->max_count());
+ return;
+ }
+ }
+ }
+
+ if (node->position() < parent->count()) {
+ // Try rebalancing with our right sibling.
+ node_type *right = parent->child(node->position() + 1);
+ assert(right->max_count() == kNodeValues);
+ if (right->count() < kNodeValues) {
+ // We bias rebalancing based on the position being inserted. If we're
+ // inserting at the beginning of the left node then we bias rebalancing
+ // to fill up the right node.
+ int to_move =
+ (kNodeValues - right->count()) / (1 + (insert_position > 0));
+ to_move = (std::max)(1, to_move);
+
+ if ((insert_position <= (node->count() - to_move)) ||
+ ((right->count() + to_move) < kNodeValues)) {
+ node->rebalance_left_to_right(to_move, right, mutable_allocator());
+
+ if (insert_position > node->count()) {  // the insertion point moved into the right sibling
+ insert_position = insert_position - node->count() - 1;
+ node = right;
+ }
+
+ assert(node->count() < node->max_count());
+ return;
+ }
+ }
+ }
+
+ // Rebalancing failed, make sure there is room on the parent node for a new
+ // value.
+ assert(parent->max_count() == kNodeValues);
+ if (parent->count() == kNodeValues) {
+ iterator parent_iter(node->parent(), node->position());
+ rebalance_or_split(&parent_iter);  // recursion: make room in the parent first
+ }
+ } else {
+ // Rebalancing not possible because this is the root node.
+ // Create a new root node and set the current root node as the child of the
+ // new root.
+ parent = new_internal_node(parent);
+ parent->init_child(0, root());
+ mutable_root() = parent;
+ // If the former root was a leaf node, then it's now the rightmost node.
+ assert(!parent->child(0)->leaf() || parent->child(0) == rightmost_);
+ }
+
+ // Split the node.
+ node_type *split_node;
+ if (node->leaf()) {
+ split_node = new_leaf_node(parent);
+ node->split(insert_position, split_node, mutable_allocator());
+ if (rightmost_ == node) rightmost_ = split_node;  // the new right half becomes the rightmost leaf
+ } else {
+ split_node = new_internal_node(parent);
+ node->split(insert_position, split_node, mutable_allocator());
+ }
+
+ if (insert_position > node->count()) {  // the insertion point belongs to the new right half
+ insert_position = insert_position - node->count() - 1;
+ node = split_node;
+ }
+}
+
+// Merges `right` (together with the delimiting value in the parent) into
+// `left`, then frees `right`. If `right` was the rightmost leaf, `left`
+// takes over that role.
+template <typename P>
+void btree<P>::merge_nodes(node_type *left, node_type *right) {
+  left->merge(right, mutable_allocator());
+  if (!right->leaf()) {
+    delete_internal_node(right);
+    return;
+  }
+  if (rightmost_ == right) {
+    rightmost_ = left;
+  }
+  delete_leaf_node(right);
+}
+
+template <typename P>  // Tries to merge iter->node with a sibling, else to rebalance with one; returns true only if a merge happened.
+bool btree<P>::try_merge_or_rebalance(iterator *iter) {
+ node_type *parent = iter->node->parent();
+ if (iter->node->position() > 0) {
+ // Try merging with our left sibling.
+ node_type *left = parent->child(iter->node->position() - 1);
+ assert(left->max_count() == kNodeValues);
+ if ((1 + left->count() + iter->node->count()) <= kNodeValues) {
+ iter->position += 1 + left->count();  // our values land after left's values and the delimiting value
+ merge_nodes(left, iter->node);
+ iter->node = left;
+ return true;
+ }
+ }
+ if (iter->node->position() < parent->count()) {
+ // Try merging with our right sibling.
+ node_type *right = parent->child(iter->node->position() + 1);
+ assert(right->max_count() == kNodeValues);
+ if ((1 + iter->node->count() + right->count()) <= kNodeValues) {
+ merge_nodes(iter->node, right);  // iter stays valid: values are appended after our own
+ return true;
+ }
+ // Try rebalancing with our right sibling. We don't perform rebalancing if
+ // we deleted the first element from iter->node and the node is not
+ // empty. This is a small optimization for the common pattern of deleting
+ // from the front of the tree.
+ if ((right->count() > kMinNodeValues) &&
+ ((iter->node->count() == 0) ||
+ (iter->position > 0))) {
+ int to_move = (right->count() - iter->node->count()) / 2;
+ to_move = std::min(to_move, right->count() - 1);  // always leave at least one value in the sibling
+ iter->node->rebalance_right_to_left(to_move, right, mutable_allocator());
+ return false;
+ }
+ }
+ if (iter->node->position() > 0) {
+ // Try rebalancing with our left sibling. We don't perform rebalancing if
+ // we deleted the last element from iter->node and the node is not
+ // empty. This is a small optimization for the common pattern of deleting
+ // from the back of the tree.
+ node_type *left = parent->child(iter->node->position() - 1);
+ if ((left->count() > kMinNodeValues) &&
+ ((iter->node->count() == 0) ||
+ (iter->position < iter->node->count()))) {
+ int to_move = (left->count() - iter->node->count()) / 2;
+ to_move = std::min(to_move, left->count() - 1);  // always leave at least one value in the sibling
+ left->rebalance_left_to_right(to_move, iter->node, mutable_allocator());
+ iter->position += to_move;  // to_move values were inserted in front of our position
+ return false;
+ }
+ }
+ return false;
+}
+
+// Reduces the height of the tree by one when the root holds no values. An
+// empty internal root is replaced by its only child; an empty leaf root is
+// freed and the tree goes back to the EmptyNode() state.
+template <typename P>
+void btree<P>::try_shrink() {
+  if (root()->count() > 0) {
+    return;
+  }
+  // Deleted the last item on the root node, shrink the height of the tree.
+  if (!root()->leaf()) {
+    node_type *promoted = root()->child(0);
+    promoted->make_root();
+    delete_internal_node(root());
+    mutable_root() = promoted;
+    return;
+  }
+  assert(size() == 0);
+  delete_leaf_node(root());
+  mutable_root() = EmptyNode();
+  rightmost_ = EmptyNode();
+}
+
+ // Normalizes an iterator that may sit one past the last value of its node.
+ // While `iter.position` equals the node's count, the iterator is moved to the
+ // node's slot in its parent. If that parent reports leaf(), the walk has run
+ // off the end of the tree and `iter.node` is set to nullptr.
+ template <typename P>
+ template <typename IterType>
+ inline IterType btree<P>::internal_last(IterType iter) {
+   assert(iter.node != nullptr);
+   for (;;) {
+     if (iter.position != iter.node->count()) {
+       // Positioned on a real value; nothing more to do.
+       return iter;
+     }
+     // Step up: continue from this node's position within its parent.
+     iter.position = iter.node->position();
+     iter.node = iter.node->parent();
+     if (iter.node->leaf()) {
+       iter.node = nullptr;
+       return iter;
+     }
+   }
+ }
+
+ // Constructs a new value from `args` at the position denoted by `iter` and
+ // returns an iterator to it. Values are only ever inserted into leaf nodes;
+ // a full leaf is first grown (small root) or rebalanced/split.
+ template <typename P>
+ template <typename... Args>
+ inline auto btree<P>::internal_emplace(iterator iter, Args &&... args)
+     -> iterator {
+   if (!iter.node->leaf()) {
+     // Insertion on an internal node is not possible. Insert right after the
+     // previous value instead, which is guaranteed to live on a leaf node.
+     --iter;
+     ++iter.position;
+   }
+   const int capacity = iter.node->max_count();
+   const bool node_is_full = (iter.node->count() == capacity);
+   if (node_is_full && capacity < kNodeValues) {
+     // The full node is a root smaller than a regular node: replace it with
+     // a leaf root of up to twice the capacity and move the contents over.
+     assert(iter.node == root());
+     iter.node = new_leaf_root_node(std::min(kNodeValues, 2 * capacity));
+     iter.node->swap(root(), mutable_allocator());
+     delete_leaf_node(root());
+     mutable_root() = iter.node;
+     rightmost_ = iter.node;
+   } else if (node_is_full) {
+     // Regular full leaf: make room by rebalancing or splitting.
+     rebalance_or_split(&iter);
+   }
+   iter.node->emplace_value(iter.position, mutable_allocator(),
+                            std::forward<Args>(args)...);
+   ++size_;
+   return iter;
+ }
+
+ // Locates `key` in the tree by dispatching to the internal_locate_impl
+ // overload selected by the `is_key_compare_to` tag type.
+ template <typename P>
+ template <typename K>
+ inline auto btree<P>::internal_locate(const K &key) const
+     -> SearchResult<iterator, is_key_compare_to::value> {
+   using dispatch_tag = is_key_compare_to;
+   return internal_locate_impl(key, dispatch_tag());
+ }
+
+ // internal_locate for the std::false_type tag (the result carries no match
+ // information). Always descends to a leaf: stopping early on an equal key
+ // would need an extra comparison per level, and in the expected case the
+ // search has to reach the leaf anyway.
+ template <typename P>
+ template <typename K>
+ inline auto btree<P>::internal_locate_impl(
+     const K &key, std::false_type /* IsCompareTo */) const
+     -> SearchResult<iterator, false> {
+   iterator iter(const_cast<node_type *>(root()), 0);
+   iter.position = iter.node->lower_bound(key, key_comp()).value;
+   while (!iter.node->leaf()) {
+     // Follow the child selected by the lower bound and search it in turn.
+     iter.node = iter.node->child(iter.position);
+     iter.position = iter.node->lower_bound(key, key_comp()).value;
+   }
+   return {iter};
+ }
+
+ // internal_locate for the std::true_type tag: each node search also reports
+ // whether it hit an exact match, so the descent stops as soon as one is
+ // found. Returns MatchKind::kNe if a leaf is reached without an exact match.
+ template <typename P>
+ template <typename K>
+ inline auto btree<P>::internal_locate_impl(
+     const K &key, std::true_type /* IsCompareTo */) const
+     -> SearchResult<iterator, true> {
+   iterator iter(const_cast<node_type *>(root()), 0);
+   while (true) {
+     SearchResult<int, true> probe = iter.node->lower_bound(key, key_comp());
+     iter.position = probe.value;
+     if (probe.match == MatchKind::kEq) {
+       return {iter, MatchKind::kEq};
+     }
+     if (!iter.node->leaf()) {
+       iter.node = iter.node->child(iter.position);
+       continue;
+     }
+     return {iter, MatchKind::kNe};
+   }
+ }
+
+ // Descends from the root to a leaf, taking each node's lower_bound position
+ // for `key`, then normalizes the resulting iterator with internal_last().
+ template <typename P>
+ template <typename K>
+ auto btree<P>::internal_lower_bound(const K &key) const -> iterator {
+   iterator iter(const_cast<node_type *>(root()), 0);
+   iter.position = iter.node->lower_bound(key, key_comp()).value;
+   while (!iter.node->leaf()) {
+     iter.node = iter.node->child(iter.position);
+     iter.position = iter.node->lower_bound(key, key_comp()).value;
+   }
+   return internal_last(iter);
+ }
+
+ // Descends from the root to a leaf, taking each node's upper_bound position
+ // for `key`, then normalizes the resulting iterator with internal_last().
+ template <typename P>
+ template <typename K>
+ auto btree<P>::internal_upper_bound(const K &key) const -> iterator {
+   iterator iter(const_cast<node_type *>(root()), 0);
+   iter.position = iter.node->upper_bound(key, key_comp());
+   while (!iter.node->leaf()) {
+     iter.node = iter.node->child(iter.position);
+     iter.position = iter.node->upper_bound(key, key_comp());
+   }
+   return internal_last(iter);
+ }
+
+ // Returns an iterator to a value whose key is equivalent to `key`, or the
+ // iterator {nullptr, 0} when there is none.
+ template <typename P>
+ template <typename K>
+ auto btree<P>::internal_find(const K &key) const -> iterator {
+   auto located = internal_locate(key);
+   if constexpr (located.has_match) {
+     // The search result already states whether the key matched exactly.
+     if (located.IsEq()) {
+       return located.value;
+     }
+   } else {
+     // No match information: normalize the position, then confirm equivalence
+     // by checking that `key` does not compare before the candidate's key.
+     const iterator candidate = internal_last(located.value);
+     const bool found =
+         candidate.node != nullptr && !compare_keys(key, candidate.key());
+     if (found) {
+       return candidate;
+     }
+   }
+   return {nullptr, 0};
+ }
+
+ // Recursively deletes `node` and everything beneath it. Children are cleared
+ // before their internal parent is deleted.
+ template <typename P>
+ void btree<P>::internal_clear(node_type *node) {
+   if (node->leaf()) {
+     delete_leaf_node(node);
+     return;
+   }
+   // An internal node has count() + 1 children.
+   for (int child_idx = 0; child_idx <= node->count(); ++child_idx) {
+     internal_clear(node->child(child_idx));
+   }
+   delete_internal_node(node);
+ }
+
+ // Asserts the structural invariants of the subtree rooted at `node` and
+ // returns the number of values it contains. `lo` and `hi`, when non-null,
+ // bound the keys permitted in this subtree: no key may compare before `*lo`
+ // and `*hi` may not compare before any key.
+ template <typename P>
+ int btree<P>::internal_verify(
+     const node_type *node, const key_type *lo, const key_type *hi) const {
+   assert(node->count() > 0);
+   assert(node->count() <= node->max_count());
+   if (lo != nullptr) {
+     assert(!compare_keys(node->key(0), *lo));
+   }
+   if (hi != nullptr) {
+     assert(!compare_keys(*hi, node->key(node->count() - 1)));
+   }
+   // Keys inside a node must be in non-descending order.
+   for (int k = 1; k < node->count(); ++k) {
+     assert(!compare_keys(node->key(k), node->key(k - 1)));
+   }
+   int total = node->count();
+   if (node->leaf()) {
+     return total;
+   }
+   for (int c = 0; c <= node->count(); ++c) {
+     // Each child must exist and point back at this node and slot.
+     assert(node->child(c) != nullptr);
+     assert(node->child(c)->parent() == node);
+     assert(node->child(c)->position() == c);
+     // Child c holds the keys between this node's keys c - 1 and c; the
+     // outermost children inherit the bounds of this node.
+     const key_type *child_lo = (c == 0) ? lo : &node->key(c - 1);
+     const key_type *child_hi = (c == node->count()) ? hi : &node->key(c);
+     total += internal_verify(node->child(c), child_lo, child_hi);
+   }
+   return total;
+ }
+
+} // namespace btree::internal
diff --git a/src/include/cpp-btree/btree_container.h b/src/include/cpp-btree/btree_container.h
new file mode 100644
index 000000000..bcbd1ff45
--- /dev/null
+++ b/src/include/cpp-btree/btree_container.h
@@ -0,0 +1,543 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+#include "btree.h"
+
+namespace btree::internal {
+
+ // Shared base class of btree_set, btree_map, btree_multiset and
+ // btree_multimap. It owns the underlying `Tree` and forwards to it every
+ // operation that does not depend on key uniqueness: iteration, lookup,
+ // erase-by-iterator, size queries, comparisons and comparator/allocator
+ // access.
+ template <typename Tree>
+ class btree_container {
+   using params_type = typename Tree::params_type;
+
+  protected:
+   // Detection archetype: well-formed only when `Compare` declares a nested
+   // `is_transparent` type.
+   template <class Compare>
+   using is_transparent_t = typename Compare::is_transparent;
+
+   // Parameter type of the heterogeneous lookup functions. `key_arg<K>` is `K`
+   // when the tree's comparator is transparent and `key_type` otherwise, which
+   // permits template argument deduction on `K` in the transparent case.
+   template <class K>
+   using key_arg = std::conditional_t<
+       std::experimental::is_detected_v<is_transparent_t,
+                                        typename Tree::key_compare>,
+       K, typename Tree::key_type>;
+
+  public:
+   // Member types, all taken from the underlying tree.
+   using key_type = typename Tree::key_type;
+   using value_type = typename Tree::value_type;
+   using size_type = typename Tree::size_type;
+   using difference_type = typename Tree::difference_type;
+   using key_compare = typename Tree::key_compare;
+   using value_compare = typename Tree::value_compare;
+   using allocator_type = typename Tree::allocator_type;
+   using reference = typename Tree::reference;
+   using const_reference = typename Tree::const_reference;
+   using pointer = typename Tree::pointer;
+   using const_pointer = typename Tree::const_pointer;
+   using iterator = typename Tree::iterator;
+   using const_iterator = typename Tree::const_iterator;
+   using reverse_iterator = typename Tree::reverse_iterator;
+   using const_reverse_iterator = typename Tree::const_reverse_iterator;
+
+   // Construction and assignment.
+   btree_container() : tree_(key_compare(), allocator_type()) {}
+   explicit btree_container(const key_compare &comp,
+                            const allocator_type &alloc = allocator_type())
+       : tree_(comp, alloc) {}
+   btree_container(const btree_container &x) = default;
+   btree_container(btree_container &&x) noexcept = default;
+   btree_container &operator=(const btree_container &x) = default;
+   btree_container &operator=(btree_container &&x) noexcept(
+       std::is_nothrow_move_assignable<Tree>::value) = default;
+
+   // Forward iteration.
+   iterator begin() { return tree_.begin(); }
+   const_iterator begin() const { return tree_.begin(); }
+   const_iterator cbegin() const { return tree_.begin(); }
+   iterator end() { return tree_.end(); }
+   const_iterator end() const { return tree_.end(); }
+   const_iterator cend() const { return tree_.end(); }
+
+   // Reverse iteration.
+   reverse_iterator rbegin() { return tree_.rbegin(); }
+   const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+   const_reverse_iterator crbegin() const { return tree_.rbegin(); }
+   reverse_iterator rend() { return tree_.rend(); }
+   const_reverse_iterator rend() const { return tree_.rend(); }
+   const_reverse_iterator crend() const { return tree_.rend(); }
+
+   // Lookup. Every function takes `key_arg<K>` so that a transparent
+   // comparator can be searched with a key of a different type.
+   template <typename K = key_type>
+   iterator find(const key_arg<K> &key) {
+     return tree_.find(key);
+   }
+   template <typename K = key_type>
+   const_iterator find(const key_arg<K> &key) const {
+     return tree_.find(key);
+   }
+   // True when find() locates `key`.
+   template <typename K = key_type>
+   bool contains(const key_arg<K> &key) const {
+     return find(key) != end();
+   }
+   template <typename K = key_type>
+   iterator lower_bound(const key_arg<K> &key) {
+     return tree_.lower_bound(key);
+   }
+   template <typename K = key_type>
+   const_iterator lower_bound(const key_arg<K> &key) const {
+     return tree_.lower_bound(key);
+   }
+   template <typename K = key_type>
+   iterator upper_bound(const key_arg<K> &key) {
+     return tree_.upper_bound(key);
+   }
+   template <typename K = key_type>
+   const_iterator upper_bound(const key_arg<K> &key) const {
+     return tree_.upper_bound(key);
+   }
+   template <typename K = key_type>
+   std::pair<iterator, iterator> equal_range(const key_arg<K> &key) {
+     return tree_.equal_range(key);
+   }
+   template <typename K = key_type>
+   std::pair<const_iterator, const_iterator> equal_range(
+       const key_arg<K> &key) const {
+     return tree_.equal_range(key);
+   }
+
+   // Deletion by iterator. Erase-by-key lives in the derived containers
+   // because it differs between the unique and the multi variants.
+
+   // Erases the element at `iter`, which must be valid (i.e. not equal to
+   // end()). Returns an iterator to the element after the erased one, or
+   // end() if none exists.
+   iterator erase(const_iterator iter) {
+     return tree_.erase(iterator(iter));
+   }
+   iterator erase(iterator iter) {
+     return tree_.erase(iter);
+   }
+   // Erases the range [first, last) and returns the `second` member of the
+   // pair produced by the tree's range erase.
+   iterator erase(const_iterator first, const_iterator last) {
+     return tree_.erase(iterator(first), iterator(last)).second;
+   }
+
+   // Utility routines.
+   void clear() { tree_.clear(); }
+   void swap(btree_container &x) { tree_.swap(x.tree_); }
+   void verify() const { tree_.verify(); }
+
+   // Size routines.
+   size_type size() const { return tree_.size(); }
+   size_type max_size() const { return tree_.max_size(); }
+   bool empty() const { return tree_.empty(); }
+
+   // Comparisons. Equality checks the sizes first and then compares the
+   // elements pairwise; ordering is lexicographical over the elements.
+   friend bool operator==(const btree_container &x, const btree_container &y) {
+     return x.size() == y.size() &&
+            std::equal(x.begin(), x.end(), y.begin());
+   }
+
+   friend bool operator!=(const btree_container &x, const btree_container &y) {
+     return !(x == y);
+   }
+
+   friend bool operator<(const btree_container &x, const btree_container &y) {
+     return std::lexicographical_compare(x.begin(), x.end(),
+                                         y.begin(), y.end());
+   }
+
+   friend bool operator>(const btree_container &x, const btree_container &y) {
+     return y < x;
+   }
+
+   friend bool operator<=(const btree_container &x, const btree_container &y) {
+     return !(y < x);
+   }
+
+   friend bool operator>=(const btree_container &x, const btree_container &y) {
+     return !(x < y);
+   }
+
+   // The allocator used by the btree.
+   allocator_type get_allocator() const { return tree_.get_allocator(); }
+
+   // The key and value comparators used by the btree.
+   key_compare key_comp() const { return tree_.key_comp(); }
+   value_compare value_comp() const { return tree_.value_comp(); }
+
+  protected:
+   // The underlying tree; derived containers reach it through `this->tree_`.
+   Tree tree_;
+ };
+
+ // A common base class for btree_set and btree_map: the containers whose keys
+ // are unique. Adds the unique-key flavours of count, insert, emplace,
+ // erase-by-key and merge on top of btree_container.
+ template <typename Tree>
+ class btree_set_container : public btree_container<Tree> {
+   using super_type = btree_container<Tree>;
+   using params_type = typename Tree::params_type;
+   using init_type = typename params_type::init_type;
+   using is_key_compare_to = typename params_type::is_key_compare_to;
+   friend class BtreeNodePeer;
+
+  protected:
+   // Re-exported so heterogeneous lookup signatures can be spelled here.
+   template <class K>
+   using key_arg = typename super_type::template key_arg<K>;
+
+  public:
+   using key_type = typename Tree::key_type;
+   using value_type = typename Tree::value_type;
+   using size_type = typename Tree::size_type;
+   using key_compare = typename Tree::key_compare;
+   using allocator_type = typename Tree::allocator_type;
+   using iterator = typename Tree::iterator;
+   using const_iterator = typename Tree::const_iterator;
+
+   // Inherit constructors.
+   using super_type::super_type;
+   btree_set_container() {}
+
+   // Range constructor: inserts every element of [b, e).
+   template <class InputIterator>
+   btree_set_container(InputIterator b, InputIterator e,
+                       const key_compare &comp = key_compare(),
+                       const allocator_type &alloc = allocator_type())
+       : super_type(comp, alloc) {
+     insert(b, e);
+   }
+
+   // Initializer list constructor; delegates to the range constructor.
+   btree_set_container(std::initializer_list<init_type> init,
+                       const key_compare &comp = key_compare(),
+                       const allocator_type &alloc = allocator_type())
+       : btree_set_container(init.begin(), init.end(), comp, alloc) {}
+
+   // Lookup routines.
+   template <typename K = key_type>
+   size_type count(const key_arg<K> &key) const {
+     return this->tree_.count_unique(key);
+   }
+
+   // Insertion routines. The pair-returning overloads yield the iterator and
+   // flag produced by the tree's insert_unique.
+   std::pair<iterator, bool> insert(const value_type &x) {
+     return this->tree_.insert_unique(params_type::key(x), x);
+   }
+   std::pair<iterator, bool> insert(value_type &&x) {
+     return this->tree_.insert_unique(params_type::key(x), std::move(x));
+   }
+   // Builds the value up front so that its key can be extracted for lookup.
+   template <typename... Args>
+   std::pair<iterator, bool> emplace(Args &&... args) {
+     init_type value(std::forward<Args>(args)...);
+     return this->tree_.insert_unique(params_type::key(value),
+                                      std::move(value));
+   }
+
+   // Hinted insertion: `position` is passed to the tree as the hint.
+   iterator insert(const_iterator position, const value_type &x) {
+     auto result = this->tree_.insert_hint_unique(iterator(position),
+                                                  params_type::key(x), x);
+     return result.first;
+   }
+   iterator insert(const_iterator position, value_type &&x) {
+     auto result = this->tree_.insert_hint_unique(
+         iterator(position), params_type::key(x), std::move(x));
+     return result.first;
+   }
+   template <typename... Args>
+   iterator emplace_hint(const_iterator position, Args &&... args) {
+     init_type value(std::forward<Args>(args)...);
+     auto result = this->tree_.insert_hint_unique(
+         iterator(position), params_type::key(value), std::move(value));
+     return result.first;
+   }
+
+   // Bulk insertion.
+   template <typename InputIterator>
+   void insert(InputIterator b, InputIterator e) {
+     this->tree_.insert_iterator_unique(b, e);
+   }
+   void insert(std::initializer_list<init_type> init) {
+     this->tree_.insert_iterator_unique(init.begin(), init.end());
+   }
+
+   // Deletion routines. The iterator-based overloads come from the base.
+   template <typename K = key_type>
+   size_type erase(const key_arg<K> &key) {
+     return this->tree_.erase_unique(key);
+   }
+   using super_type::erase;
+
+   // Merge routines.
+   // Moves elements from `src` into `this`. If the element already exists in
+   // `this`, it is left unmodified in `src`.
+   template <
+       typename T,
+       typename std::enable_if_t<
+           std::conjunction_v<
+               std::is_same<value_type, typename T::value_type>,
+               std::is_same<allocator_type, typename T::allocator_type>,
+               std::is_same<typename params_type::is_map_container,
+                            typename T::params_type::is_map_container>>,
+           int> = 0>
+   void merge(btree_container<T> &src) {  // NOLINT
+     auto src_it = src.begin();
+     while (src_it != src.end()) {
+       const bool inserted = insert(std::move(*src_it)).second;
+       if (inserted) {
+         // Now owned by `this`: drop it from `src` and continue after it.
+         src_it = src.erase(src_it);
+       } else {
+         ++src_it;
+       }
+     }
+   }
+
+   // Rvalue overload; forwards to the lvalue merge above.
+   template <
+       typename T,
+       typename std::enable_if_t<
+           std::conjunction_v<
+               std::is_same<value_type, typename T::value_type>,
+               std::is_same<allocator_type, typename T::allocator_type>,
+               std::is_same<typename params_type::is_map_container,
+                            typename T::params_type::is_map_container>>,
+           int> = 0>
+   void merge(btree_container<T> &&src) {
+     merge(src);
+   }
+ };
+
+ // Base class for btree_map. Extends btree_set_container with the map-only
+ // operations: try_emplace, operator[] and at.
+ template <typename Tree>
+ class btree_map_container : public btree_set_container<Tree> {
+   using super_type = btree_set_container<Tree>;
+   using params_type = typename Tree::params_type;
+
+  protected:
+   template <class K>
+   using key_arg = typename super_type::template key_arg<K>;
+
+  public:
+   using key_type = typename Tree::key_type;
+   using mapped_type = typename params_type::mapped_type;
+   using value_type = typename Tree::value_type;
+   using key_compare = typename Tree::key_compare;
+   using allocator_type = typename Tree::allocator_type;
+   using iterator = typename Tree::iterator;
+   using const_iterator = typename Tree::const_iterator;
+
+   // Inherit constructors.
+   using super_type::super_type;
+   btree_map_container() {}
+
+   // Insertion routines.
+
+   // Passes `k` and `args` to insert_unique as piecewise-construction tuples
+   // for the key and the mapped value.
+   template <typename... Args>
+   std::pair<iterator, bool> try_emplace(const key_type &k, Args &&... args) {
+     return this->tree_.insert_unique(
+         k, std::piecewise_construct, std::forward_as_tuple(k),
+         std::forward_as_tuple(std::forward<Args>(args)...));
+   }
+   template <typename... Args>
+   std::pair<iterator, bool> try_emplace(key_type &&k, Args &&... args) {
+     // Note: `key_ref` exists to avoid a ClangTidy warning about moving from
+     // `k` and then using `k` unsequenced. This is safe because the move is
+     // into a forwarding reference and insert_unique guarantees that `key` is
+     // never referenced after consuming `args`.
+     const key_type &key_ref = k;
+     return this->tree_.insert_unique(
+         key_ref, std::piecewise_construct,
+         std::forward_as_tuple(std::move(k)),
+         std::forward_as_tuple(std::forward<Args>(args)...));
+   }
+
+   // Hinted variants: `hint` is passed to the tree's insert_hint_unique.
+   template <typename... Args>
+   iterator try_emplace(const_iterator hint, const key_type &k,
+                        Args &&... args) {
+     auto result = this->tree_.insert_hint_unique(
+         iterator(hint), k, std::piecewise_construct,
+         std::forward_as_tuple(k),
+         std::forward_as_tuple(std::forward<Args>(args)...));
+     return result.first;
+   }
+   template <typename... Args>
+   iterator try_emplace(const_iterator hint, key_type &&k, Args &&... args) {
+     // Note: `key_ref` exists to avoid a ClangTidy warning about moving from
+     // `k` and then using `k` unsequenced. This is safe because the move is
+     // into a forwarding reference and insert_hint_unique guarantees that
+     // `key` is never referenced after consuming `args`.
+     const key_type &key_ref = k;
+     auto result = this->tree_.insert_hint_unique(
+         iterator(hint), key_ref, std::piecewise_construct,
+         std::forward_as_tuple(std::move(k)),
+         std::forward_as_tuple(std::forward<Args>(args)...));
+     return result.first;
+   }
+
+   // Element access through try_emplace with no mapped-value arguments.
+   mapped_type &operator[](const key_type &k) {
+     return try_emplace(k).first->second;
+   }
+   mapped_type &operator[](key_type &&k) {
+     return try_emplace(std::move(k)).first->second;
+   }
+
+   // Checked element access: throws std::out_of_range when `key` is not
+   // found.
+   template <typename K = key_type>
+   mapped_type &at(const key_arg<K> &key) {
+     auto it = this->find(key);
+     if (it != this->end()) {
+       return it->second;
+     }
+     throw std::out_of_range("btree_map::at");
+   }
+   template <typename K = key_type>
+   const mapped_type &at(const key_arg<K> &key) const {
+     auto it = this->find(key);
+     if (it != this->end()) {
+       return it->second;
+     }
+     throw std::out_of_range("btree_map::at");
+   }
+ };
+
+ // A common base class for btree_multiset and btree_multimap: the containers
+ // that allow several elements with equivalent keys. Adds the multi-key
+ // flavours of count, insert, emplace, erase-by-key and merge on top of
+ // btree_container.
+ //
+ // This class has no node-handle (`node_type`) API. btree_container declares
+ // no `node_type` member, so a `node_type` alias taken from it, and insert()
+ // overloads that accept one, cannot be declared without making every
+ // instantiation of this class ill-formed.
+ template <typename Tree>
+ class btree_multiset_container : public btree_container<Tree> {
+   using super_type = btree_container<Tree>;
+   using params_type = typename Tree::params_type;
+   using init_type = typename params_type::init_type;
+   using is_key_compare_to = typename params_type::is_key_compare_to;
+
+   template <class K>
+   using key_arg = typename super_type::template key_arg<K>;
+
+  public:
+   using key_type = typename Tree::key_type;
+   using value_type = typename Tree::value_type;
+   using size_type = typename Tree::size_type;
+   using key_compare = typename Tree::key_compare;
+   using allocator_type = typename Tree::allocator_type;
+   using iterator = typename Tree::iterator;
+   using const_iterator = typename Tree::const_iterator;
+
+   // Inherit constructors.
+   using super_type::super_type;
+   btree_multiset_container() {}
+
+   // Range constructor: inserts every element of [b, e).
+   template <class InputIterator>
+   btree_multiset_container(InputIterator b, InputIterator e,
+                            const key_compare &comp = key_compare(),
+                            const allocator_type &alloc = allocator_type())
+       : super_type(comp, alloc) {
+     insert(b, e);
+   }
+
+   // Initializer list constructor; delegates to the range constructor.
+   btree_multiset_container(std::initializer_list<init_type> init,
+                            const key_compare &comp = key_compare(),
+                            const allocator_type &alloc = allocator_type())
+       : btree_multiset_container(init.begin(), init.end(), comp, alloc) {}
+
+   // Lookup routines.
+   template <typename K = key_type>
+   size_type count(const key_arg<K> &key) const {
+     return this->tree_.count_multi(key);
+   }
+
+   // Insertion routines. Each returns the iterator produced by the tree's
+   // insert_multi / insert_hint_multi.
+   iterator insert(const value_type &x) { return this->tree_.insert_multi(x); }
+   iterator insert(value_type &&x) {
+     return this->tree_.insert_multi(std::move(x));
+   }
+   // Hinted insertion: `position` is passed to the tree as the hint.
+   iterator insert(const_iterator position, const value_type &x) {
+     return this->tree_.insert_hint_multi(iterator(position), x);
+   }
+   iterator insert(const_iterator position, value_type &&x) {
+     return this->tree_.insert_hint_multi(iterator(position), std::move(x));
+   }
+   // Bulk insertion.
+   template <typename InputIterator>
+   void insert(InputIterator b, InputIterator e) {
+     this->tree_.insert_iterator_multi(b, e);
+   }
+   void insert(std::initializer_list<init_type> init) {
+     this->tree_.insert_iterator_multi(init.begin(), init.end());
+   }
+   // Constructs an `init_type` from `args` and inserts it.
+   template <typename... Args>
+   iterator emplace(Args &&... args) {
+     return this->tree_.insert_multi(init_type(std::forward<Args>(args)...));
+   }
+   template <typename... Args>
+   iterator emplace_hint(const_iterator position, Args &&... args) {
+     return this->tree_.insert_hint_multi(
+         iterator(position), init_type(std::forward<Args>(args)...));
+   }
+
+   // Deletion routines. The iterator-based overloads come from the base.
+   template <typename K = key_type>
+   size_type erase(const key_arg<K> &key) {
+     return this->tree_.erase_multi(key);
+   }
+   using super_type::erase;
+
+   // Merge routines.
+   // Moves all elements from `src` into `this`, then clears `src`.
+   template <
+       typename T,
+       typename std::enable_if_t<
+           std::conjunction_v<
+               std::is_same<value_type, typename T::value_type>,
+               std::is_same<allocator_type, typename T::allocator_type>,
+               std::is_same<typename params_type::is_map_container,
+                            typename T::params_type::is_map_container>>,
+           int> = 0>
+   void merge(btree_container<T> &src) {  // NOLINT
+     insert(std::make_move_iterator(src.begin()),
+            std::make_move_iterator(src.end()));
+     src.clear();
+   }
+
+   // Rvalue overload; forwards to the lvalue merge above.
+   template <
+       typename T,
+       typename std::enable_if_t<
+           std::conjunction_v<
+               std::is_same<value_type, typename T::value_type>,
+               std::is_same<allocator_type, typename T::allocator_type>,
+               std::is_same<typename params_type::is_map_container,
+                            typename T::params_type::is_map_container>>,
+           int> = 0>
+   void merge(btree_container<T> &&src) {
+     merge(src);
+   }
+ };
+
+ // Base class for btree_multimap. Adds only the `mapped_type` member type to
+ // btree_multiset_container.
+ template <typename Tree>
+ class btree_multimap_container : public btree_multiset_container<Tree> {
+   using super_type = btree_multiset_container<Tree>;
+   using params_type = typename Tree::params_type;
+
+  public:
+   // Type of the value associated with each key.
+   typedef typename params_type::mapped_type mapped_type;
+
+   // Inherit constructors.
+   using super_type::super_type;
+   btree_multimap_container() {}
+ };
+} // namespace btree::internal
diff --git a/src/include/cpp-btree/btree_map.h b/src/include/cpp-btree/btree_map.h
new file mode 100644
index 000000000..749c2bbcd
--- /dev/null
+++ b/src/include/cpp-btree/btree_map.h
@@ -0,0 +1,159 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: btree_map.h
+// -----------------------------------------------------------------------------
+//
+// This header file defines B-tree maps: sorted associative containers mapping
+// keys to values.
+//
+// * `btree::btree_map<>`
+// * `btree::btree_multimap<>`
+//
+// These B-tree types are similar to the corresponding types in the STL
+// (`std::map` and `std::multimap`) and generally conform to the STL interfaces
+// of those types. However, because they are implemented using B-trees, they
+// are more efficient in most situations.
+//
+// Unlike `std::map` and `std::multimap`, which are commonly implemented using
+// red-black tree nodes, B-tree maps use more generic B-tree nodes able to hold
+// multiple values per node. Holding multiple values per node often makes
+// B-tree maps perform better than their `std::map` counterparts, because
+// multiple entries can be checked within the same cache hit.
+//
+// However, these types should not be considered drop-in replacements for
+// `std::map` and `std::multimap` as there are some API differences, which are
+// noted in this header file.
+//
+// Importantly, insertions and deletions may invalidate outstanding iterators,
+// pointers, and references to elements. Such invalidations are typically only
+// an issue if insertion and deletion operations are interleaved with the use of
+// more than one iterator, pointer, or reference simultaneously. For this
+// reason, `insert()` and `erase()` return a valid iterator at the current
+// position.
+
+#pragma once
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// btree::btree_map<>
+//
+// A `btree::btree_map<K, V>` is an ordered associative container of
+// unique keys and associated values designed to be a more efficient replacement
+// for `std::map` (in most cases).
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// A `btree::btree_map<K, V>` uses a default allocator of
+// `std::allocator<std::pair<const K, V>>` to allocate (and deallocate)
+// nodes, and construct and destruct values within those nodes. You may
+// instead specify a custom allocator `A` (which in turn requires specifying a
+// custom comparator `C`) as in `btree::btree_map<K, V, C, A>`.
+//
+ template <typename Key, typename Value, typename Compare = std::less<Key>,
+           typename Alloc = std::allocator<std::pair<const Key, Value>>>
+ class btree_map
+     : public internal::btree_map_container<internal::btree<
+           internal::map_params<Key, Value, Compare, Alloc,
+                                /*TargetNodeSize=*/256, /*Multi=*/false>>> {
+   // The base class, named through its injected-class-name so the long
+   // template argument list above is not repeated.
+   using Base = typename btree_map::btree_map_container;
+
+  public:
+   // Default constructor.
+   btree_map() = default;
+
+   // All other constructors are inherited from the container base.
+   using Base::Base;
+ };
+
+ // btree::swap(btree::btree_map<>, btree::btree_map<>)
+ //
+ // Exchanges the contents of `x` and `y` by calling the member swap().
+ template <typename K, typename V, typename C, typename A>
+ void swap(btree_map<K, V, C, A> &x, btree_map<K, V, C, A> &y) {
+   x.swap(y);
+ }
+
+ // btree::erase_if(btree::btree_map<>, Pred)
+ //
+ // Removes from `map` every element for which `pred` returns true.
+ template <typename K, typename V, typename C, typename A, typename Pred>
+ void erase_if(btree_map<K, V, C, A> &map, Pred pred) {
+   auto it = map.begin();
+   while (it != map.end()) {
+     if (!pred(*it)) {
+       ++it;
+       continue;
+     }
+     // Continue from the iterator that erase() returns.
+     it = map.erase(it);
+   }
+ }
+
+// btree::btree_multimap
+//
+// A `btree::btree_multimap<K, V>` is an ordered associative container of
+// keys and associated values designed to be a more efficient replacement for
+// `std::multimap` (in most cases). Unlike `btree::btree_map`, a B-tree multimap
+// allows multiple elements with equivalent keys.
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// A `btree::btree_multimap<K, V>` uses a default allocator of
+// `std::allocator<std::pair<const K, V>>` to allocate (and deallocate)
+// nodes, and construct and destruct values within those nodes. You may
+// instead specify a custom allocator `A` (which in turn requires specifying a
+// custom comparator `C`) as in `btree::btree_multimap<K, V, C, A>`.
+//
+ template <typename Key, typename Value, typename Compare = std::less<Key>,
+           typename Alloc = std::allocator<std::pair<const Key, Value>>>
+ class btree_multimap
+     : public internal::btree_multimap_container<internal::btree<
+           internal::map_params<Key, Value, Compare, Alloc,
+                                /*TargetNodeSize=*/256, /*Multi=*/true>>> {
+   // The base class, named through its injected-class-name so the long
+   // template argument list above is not repeated.
+   using Base = typename btree_multimap::btree_multimap_container;
+
+  public:
+   // Default constructor.
+   btree_multimap() = default;
+
+   // All other constructors are inherited from the container base.
+   using Base::Base;
+ };
+
+ // btree::swap(btree::btree_multimap<>, btree::btree_multimap<>)
+ //
+ // Exchanges the contents of `x` and `y` by calling the member swap().
+ template <typename K, typename V, typename C, typename A>
+ void swap(btree_multimap<K, V, C, A> &x, btree_multimap<K, V, C, A> &y) {
+   x.swap(y);
+ }
+
+ // btree::erase_if(btree::btree_multimap<>, Pred)
+ //
+ // Removes from `map` every element for which `pred` returns true.
+ template <typename K, typename V, typename C, typename A, typename Pred>
+ void erase_if(btree_multimap<K, V, C, A> &map, Pred pred) {
+   auto it = map.begin();
+   while (it != map.end()) {
+     if (!pred(*it)) {
+       ++it;
+       continue;
+     }
+     // Continue from the iterator that erase() returns.
+     it = map.erase(it);
+   }
+ }
+
+} // namespace btree
diff --git a/src/include/cpp-btree/btree_set.h b/src/include/cpp-btree/btree_set.h
new file mode 100644
index 000000000..7d59887dc
--- /dev/null
+++ b/src/include/cpp-btree/btree_set.h
@@ -0,0 +1,655 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: btree_set.h
+// -----------------------------------------------------------------------------
+//
+// This header file defines B-tree sets: sorted associative containers of
+// values.
+//
+// * `btree::btree_set<>`
+// * `btree::btree_multiset<>`
+//
+// These B-tree types are similar to the corresponding types in the STL
+// (`std::set` and `std::multiset`) and generally conform to the STL interfaces
+// of those types. However, because they are implemented using B-trees, they
+// are more efficient in most situations.
+//
+// Unlike `std::set` and `std::multiset`, which are commonly implemented using
+// red-black tree nodes, B-tree sets use more generic B-tree nodes able to hold
+// multiple values per node. Holding multiple values per node often makes
+// B-tree sets perform better than their `std::set` counterparts, because
+// multiple entries can be checked within the same cache hit.
+//
+// However, these types should not be considered drop-in replacements for
+// `std::set` and `std::multiset` as there are some API differences, which are
+// noted in this header file.
+//
+// Importantly, insertions and deletions may invalidate outstanding iterators,
+// pointers, and references to elements. Such invalidations are typically only
+// an issue if insertion and deletion operations are interleaved with the use of
+// more than one iterator, pointer, or reference simultaneously. For this
+// reason, `insert()` and `erase()` return a valid iterator at the current
+// position.
+
+#pragma once
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// btree::btree_set<>
+//
+// A `btree::btree_set<K>` is an ordered associative container of unique key
+// values designed to be a more efficient replacement for `std::set` (in most
+// cases).
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// A `btree::btree_set<K>` uses a default allocator of `std::allocator<K>` to
+// allocate (and deallocate) nodes, and construct and destruct values within
+// those nodes. You may instead specify a custom allocator `A` (which in turn
+// requires specifying a custom comparator `C`) as in
+// `btree::btree_set<K, C, A>`.
+//
+template <typename Key, typename Compare = std::less<Key>,
+ typename Alloc = std::allocator<Key>>
+class btree_set
+ : public internal::btree_set_container<
+ internal::btree<internal::set_params<
+ Key, Compare, Alloc, /*TargetNodeSize=*/256,
+ /*Multi=*/false>>> {
+ using Base = typename btree_set::btree_set_container;
+
+ public:
+ // Constructors and Assignment Operators
+ //
+ // A `btree_set` supports the same overload set as `std::set`
+ // for construction and assignment:
+ //
+ // * Default constructor
+ //
+ // btree::btree_set<std::string> set1;
+ //
+ // * Initializer List constructor
+ //
+ // btree::btree_set<std::string> set2 =
+ // {{"huey"}, {"dewey"}, {"louie"},};
+ //
+ // * Copy constructor
+ //
+ // btree::btree_set<std::string> set3(set2);
+ //
+ // * Copy assignment operator
+ //
+ // btree::btree_set<std::string> set4;
+ // set4 = set3;
+ //
+ // * Move constructor
+ //
+ // // Move is guaranteed efficient
+ // btree::btree_set<std::string> set5(std::move(set4));
+ //
+ // * Move assignment operator
+ //
+ // // May be efficient if allocators are compatible
+ // btree::btree_set<std::string> set6;
+ // set6 = std::move(set5);
+ //
+ // * Range constructor
+ //
+ // std::vector<std::string> v = {"a", "b"};
+ // btree::btree_set<std::string> set7(v.begin(), v.end());
+ btree_set() {}
+ using Base::Base;
+
+ // btree_set::begin()
+ //
+ // Returns an iterator to the beginning of the `btree_set`.
+ using Base::begin;
+
+ // btree_set::cbegin()
+ //
+ // Returns a const iterator to the beginning of the `btree_set`.
+ using Base::cbegin;
+
+ // btree_set::end()
+ //
+ // Returns an iterator to the end of the `btree_set`.
+ using Base::end;
+
+ // btree_set::cend()
+ //
+ // Returns a const iterator to the end of the `btree_set`.
+ using Base::cend;
+
+ // btree_set::empty()
+ //
+ // Returns whether or not the `btree_set` is empty.
+ using Base::empty;
+
+ // btree_set::max_size()
+ //
+ // Returns the largest theoretical possible number of elements within a
+ // `btree_set` under current memory constraints. This value can be thought
+ // of as the largest value of `std::distance(begin(), end())` for a
+ // `btree_set<Key>`.
+ using Base::max_size;
+
+ // btree_set::size()
+ //
+ // Returns the number of elements currently within the `btree_set`.
+ using Base::size;
+
+ // btree_set::clear()
+ //
+ // Removes all elements from the `btree_set`. Invalidates any references,
+ // pointers, or iterators referring to contained elements.
+ using Base::clear;
+
+ // btree_set::erase()
+ //
+ // Erases elements within the `btree_set`. Overloads are listed below.
+ //
+ // iterator erase(iterator position):
+ // iterator erase(const_iterator position):
+ //
+ // Erases the element at `position` of the `btree_set`, returning
+ // the iterator pointing to the element after the one that was erased
+ // (or end() if none exists).
+ //
+ // iterator erase(const_iterator first, const_iterator last):
+ //
+ // Erases the elements in the half-open interval [`first`, `last`), returning
+ // the iterator pointing to the element after the interval that was erased
+ // (or end() if none exists).
+ //
+ // template <typename K> size_type erase(const K& key):
+ //
+ // Erases the element with the matching key, if it exists, returning the
+ // number of elements erased.
+ using Base::erase;
+
+ // btree_set::insert()
+ //
+ // Inserts an element of the specified value into the `btree_set`,
+ // returning an iterator pointing to the newly inserted element, provided that
+ // an element with the given key does not already exist. If an insertion
+ // occurs, any references, pointers, or iterators are invalidated.
+ // Overloads are listed below.
+ //
+ // std::pair<iterator,bool> insert(const value_type& value):
+ //
+ // Inserts a value into the `btree_set`. Returns a pair consisting of an
+ // iterator to the inserted element (or to the element that prevented the
+ // insertion) and a bool denoting whether the insertion took place.
+ //
+ // std::pair<iterator,bool> insert(value_type&& value):
+ //
+ // Inserts a moveable value into the `btree_set`. Returns a pair
+ // consisting of an iterator to the inserted element (or to the element that
+ // prevented the insertion) and a bool denoting whether the insertion took
+ // place.
+ //
+ // iterator insert(const_iterator hint, const value_type& value):
+ // iterator insert(const_iterator hint, value_type&& value):
+ //
+ // Inserts a value, using the position of `hint` as a non-binding suggestion
+ // for where to begin the insertion search. Returns an iterator to the
+ // inserted element, or to the existing element that prevented the
+ // insertion.
+ //
+ // void insert(InputIterator first, InputIterator last):
+ //
+ // Inserts a range of values [`first`, `last`).
+ //
+ // void insert(std::initializer_list<init_type> ilist):
+ //
+ // Inserts the elements within the initializer list `ilist`.
+ using Base::insert;
+
+ // btree_set::emplace()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_set`, provided that no element with the given key
+ // already exists.
+ //
+ // The element may be constructed even if there already is an element with the
+ // key in the container, in which case the newly constructed element will be
+ // destroyed immediately.
+ //
+ // If an insertion occurs, any references, pointers, or iterators are
+ // invalidated.
+ using Base::emplace;
+
+ // btree_set::emplace_hint()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_set`, using the position of `hint` as a non-binding
+ // suggestion for where to begin the insertion search, and only inserts
+ // provided that no element with the given key already exists.
+ //
+ // The element may be constructed even if there already is an element with the
+ // key in the container, in which case the newly constructed element will be
+ // destroyed immediately.
+ //
+ // If an insertion occurs, any references, pointers, or iterators are
+ // invalidated.
+ using Base::emplace_hint;
+
+ // btree_set::merge()
+ //
+ // Extracts elements from a given `source` btree_set into this
+ // `btree_set`. If the destination `btree_set` already contains an
+ // element with an equivalent key, that element is not extracted.
+ using Base::merge;
+
+ // btree_set::swap(btree_set& other)
+ //
+ // Exchanges the contents of this `btree_set` with those of the `other`
+ // btree_set, avoiding invocation of any move, copy, or swap operations on
+ // individual elements.
+ //
+ // All iterators and references on the `btree_set` remain valid, excepting
+ // for the past-the-end iterator, which is invalidated.
+ using Base::swap;
+
+ // btree_set::contains()
+ //
+ // template <typename K> bool contains(const K& key) const:
+ //
+ // Determines whether an element comparing equal to the given `key` exists
+ // within the `btree_set`, returning `true` if so or `false` otherwise.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::contains;
+
+ // btree_set::count()
+ //
+ // template <typename K> size_type count(const K& key) const:
+ //
+ // Returns the number of elements comparing equal to the given `key` within
+ // the `btree_set`. Note that this function will return either `1` or `0`
+ // since duplicate elements are not allowed within a `btree_set`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::count;
+
+ // btree_set::equal_range()
+ //
+ // Returns a half-open range [first, last), defined by a `std::pair` of two
+ // iterators, containing all elements with the passed key in the
+ // `btree_set`.
+ using Base::equal_range;
+
+ // btree_set::find()
+ //
+ // template <typename K> iterator find(const K& key):
+ // template <typename K> const_iterator find(const K& key) const:
+ //
+ // Finds an element with the passed `key` within the `btree_set`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::find;
+
+ // btree_set::get_allocator()
+ //
+ // Returns the allocator associated with this `btree_set`.
+ using Base::get_allocator;
+
+ // btree_set::key_comp();
+ //
+ // Returns the key comparator associated with this `btree_set`.
+ using Base::key_comp;
+
+ // btree_set::value_comp();
+ //
+ // Returns the value comparator associated with this `btree_set`. The keys to
+ // sort the elements are the values themselves, therefore `value_comp` and its
+ // sibling member function `key_comp` are equivalent.
+ using Base::value_comp;
+};
+
+// btree::swap(btree::btree_set<>, btree::btree_set<>)
+//
+// Non-member swap: exchanges the contents of `x` and `y` by forwarding to the
+// member `swap()` of `btree::btree_set`.
+template <typename K, typename C, typename A>
+void swap(btree_set<K, C, A> &x, btree_set<K, C, A> &y) {
+  x.swap(y);
+}
+
+// btree::erase_if(btree::btree_set<>, Pred)
+//
+// Walks `set` from begin() to end() and removes every element for which
+// `pred` returns true.
+template <typename K, typename C, typename A, typename Pred>
+void erase_if(btree_set<K, C, A> &set, Pred pred) {
+  auto pos = set.begin();
+  while (pos != set.end()) {
+    if (pred(*pos)) {
+      // Resume from the iterator erase() returns; `pos` itself is stale now.
+      pos = set.erase(pos);
+      continue;
+    }
+    ++pos;
+  }
+}
+
+// btree::btree_multiset<>
+//
+// A `btree::btree_multiset<K>` is an ordered associative container of
+// keys designed to be a more efficient replacement
+// for `std::multiset` (in most cases). Unlike `btree::btree_set`, a B-tree
+// multiset allows equivalent elements.
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// A `btree::btree_multiset<K>` uses a default allocator of `std::allocator<K>`
+// to allocate (and deallocate) nodes, and construct and destruct values within
+// those nodes. You may instead specify a custom allocator `A` (which in turn
+// requires specifying a custom comparator `C`) as in
+// `btree::btree_multiset<K, C, A>`.
+//
+template <typename Key, typename Compare = std::less<Key>,
+ typename Alloc = std::allocator<Key>>
+class btree_multiset
+ : public internal::btree_multiset_container<
+ internal::btree<internal::set_params<
+ Key, Compare, Alloc, /*TargetNodeSize=*/256,
+ /*Multi=*/true>>> {
+ using Base = typename btree_multiset::btree_multiset_container;
+
+ public:
+ // Constructors and Assignment Operators
+ //
+ // A `btree_multiset` supports the same overload set as `std::multiset`
+ // for construction and assignment:
+ //
+ // * Default constructor
+ //
+ // btree::btree_multiset<std::string> set1;
+ //
+ // * Initializer List constructor
+ //
+ // btree::btree_multiset<std::string> set2 =
+ // {{"huey"}, {"dewey"}, {"louie"},};
+ //
+ // * Copy constructor
+ //
+ // btree::btree_multiset<std::string> set3(set2);
+ //
+ // * Copy assignment operator
+ //
+ // btree::btree_multiset<std::string> set4;
+ // set4 = set3;
+ //
+ // * Move constructor
+ //
+ // // Move is guaranteed efficient
+ // btree::btree_multiset<std::string> set5(std::move(set4));
+ //
+ // * Move assignment operator
+ //
+ // // May be efficient if allocators are compatible
+ // btree::btree_multiset<std::string> set6;
+ // set6 = std::move(set5);
+ //
+ // * Range constructor
+ //
+ // std::vector<std::string> v = {"a", "b"};
+ // btree::btree_multiset<std::string> set7(v.begin(), v.end());
+ btree_multiset() {}
+ using Base::Base;
+
+ // btree_multiset::begin()
+ //
+ // Returns an iterator to the beginning of the `btree_multiset`.
+ using Base::begin;
+
+ // btree_multiset::cbegin()
+ //
+ // Returns a const iterator to the beginning of the `btree_multiset`.
+ using Base::cbegin;
+
+ // btree_multiset::end()
+ //
+ // Returns an iterator to the end of the `btree_multiset`.
+ using Base::end;
+
+ // btree_multiset::cend()
+ //
+ // Returns a const iterator to the end of the `btree_multiset`.
+ using Base::cend;
+
+ // btree_multiset::empty()
+ //
+ // Returns whether or not the `btree_multiset` is empty.
+ using Base::empty;
+
+ // btree_multiset::max_size()
+ //
+ // Returns the largest theoretical possible number of elements within a
+ // `btree_multiset` under current memory constraints. This value can be
+ // thought of as the largest value of `std::distance(begin(), end())` for a
+ // `btree_multiset<Key>`.
+ using Base::max_size;
+
+ // btree_multiset::size()
+ //
+ // Returns the number of elements currently within the `btree_multiset`.
+ using Base::size;
+
+ // btree_multiset::clear()
+ //
+ // Removes all elements from the `btree_multiset`. Invalidates any references,
+ // pointers, or iterators referring to contained elements.
+ using Base::clear;
+
+ // btree_multiset::erase()
+ //
+ // Erases elements within the `btree_multiset`. Overloads are listed below.
+ //
+ // iterator erase(iterator position):
+ // iterator erase(const_iterator position):
+ //
+ // Erases the element at `position` of the `btree_multiset`, returning
+ // the iterator pointing to the element after the one that was erased
+ // (or end() if none exists).
+ //
+ // iterator erase(const_iterator first, const_iterator last):
+ //
+ // Erases the elements in the half-open interval [`first`, `last`), returning
+ // the iterator pointing to the element after the interval that was erased
+ // (or end() if none exists).
+ //
+ // template <typename K> size_type erase(const K& key):
+ //
+ // Erases the elements matching the key, if any exist, returning the
+ // number of elements erased.
+ using Base::erase;
+
+ // btree_multiset::insert()
+ //
+ // Inserts an element of the specified value into the `btree_multiset`,
+ // returning an iterator pointing to the newly inserted element.
+ // Any references, pointers, or iterators are invalidated. Overloads are
+ // listed below.
+ //
+ // iterator insert(const value_type& value):
+ //
+ // Inserts a value into the `btree_multiset`, returning an iterator to the
+ // inserted element.
+ //
+ // iterator insert(value_type&& value):
+ //
+ // Inserts a moveable value into the `btree_multiset`, returning an iterator
+ // to the inserted element.
+ //
+ // iterator insert(const_iterator hint, const value_type& value):
+ // iterator insert(const_iterator hint, value_type&& value):
+ //
+ // Inserts a value, using the position of `hint` as a non-binding suggestion
+ // for where to begin the insertion search. Returns an iterator to the
+ // inserted element.
+ //
+ // void insert(InputIterator first, InputIterator last):
+ //
+ // Inserts a range of values [`first`, `last`).
+ //
+ // void insert(std::initializer_list<init_type> ilist):
+ //
+ // Inserts the elements within the initializer list `ilist`.
+ using Base::insert;
+
+ // btree_multiset::emplace()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_multiset`. Any references, pointers, or iterators are
+ // invalidated.
+ using Base::emplace;
+
+ // btree_multiset::emplace_hint()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_multiset`, using the position of `hint` as a non-binding
+ // suggestion for where to begin the insertion search.
+ //
+ // Any references, pointers, or iterators are invalidated.
+ using Base::emplace_hint;
+
+ // btree_multiset::extract()
+ //
+ // Extracts the indicated element, erasing it in the process, and returns it
+ // as a C++17-compatible node handle. Overloads are listed below.
+ //
+ // node_type extract(const_iterator position):
+ //
+ // Extracts the element at the indicated position and returns a node handle
+ // owning that extracted data.
+ //
+ // template <typename K> node_type extract(const K& x):
+ //
+ // Extracts the element with the key matching the passed key value and
+ // returns a node handle owning that extracted data. If the `btree_multiset`
+ // does not contain an element with a matching key, this function returns an
+ // empty node handle.
+ //
+ // NOTE: In this context, `node_type` refers to the C++17 concept of a
+ // move-only type that owns and provides access to the elements in associative
+ // containers (https://en.cppreference.com/w/cpp/container/node_handle).
+ // It does NOT refer to the data layout of the underlying btree.
+ using Base::extract;
+
+ // btree_multiset::merge()
+ //
+ // Extracts elements from a given `source` btree_multiset into this
+ // `btree_multiset`. NOTE(review): a multiset accepts equivalent keys, so
+ // presumably every element is extracted -- confirm against the base merge().
+ using Base::merge;
+
+ // btree_multiset::swap(btree_multiset& other)
+ //
+ // Exchanges the contents of this `btree_multiset` with those of the `other`
+ // btree_multiset, avoiding invocation of any move, copy, or swap operations
+ // on individual elements.
+ //
+ // All iterators and references on the `btree_multiset` remain valid,
+ // excepting for the past-the-end iterator, which is invalidated.
+ using Base::swap;
+
+ // btree_multiset::contains()
+ //
+ // template <typename K> bool contains(const K& key) const:
+ //
+ // Determines whether an element comparing equal to the given `key` exists
+ // within the `btree_multiset`, returning `true` if so or `false` otherwise.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::contains;
+
+ // btree_multiset::count()
+ //
+ // template <typename K> size_type count(const K& key) const:
+ //
+ // Returns the number of elements comparing equal to the given `key` within
+ // the `btree_multiset`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::count;
+
+ // btree_multiset::equal_range()
+ //
+ // Returns a half-open range [first, last), defined by a `std::pair` of two
+ // iterators, containing all elements with the passed key in the
+ // `btree_multiset`.
+ using Base::equal_range;
+
+ // btree_multiset::find()
+ //
+ // template <typename K> iterator find(const K& key):
+ // template <typename K> const_iterator find(const K& key) const:
+ //
+ // Finds an element with the passed `key` within the `btree_multiset`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::find;
+
+ // btree_multiset::get_allocator()
+ //
+ // Returns the allocator associated with this `btree_multiset`.
+ using Base::get_allocator;
+
+ // btree_multiset::key_comp();
+ //
+ // Returns the key comparator associated with this `btree_multiset`.
+ using Base::key_comp;
+
+ // btree_multiset::value_comp();
+ //
+ // Returns the value comparator associated with this `btree_multiset`. The
+ // keys to sort the elements are the values themselves, therefore `value_comp`
+ // and its sibling member function `key_comp` are equivalent.
+ using Base::value_comp;
+};
+
+// btree::swap(btree::btree_multiset<>, btree::btree_multiset<>)
+//
+// Non-member swap: exchanges the contents of `x` and `y` by forwarding to the
+// member `swap()` of `btree::btree_multiset`.
+template <typename K, typename C, typename A>
+void swap(btree_multiset<K, C, A> &x, btree_multiset<K, C, A> &y) {
+  x.swap(y);
+}
+
+// btree::erase_if(btree::btree_multiset<>, Pred)
+//
+// Walks `set` from begin() to end() and removes every element for which
+// `pred` returns true.
+template <typename K, typename C, typename A, typename Pred>
+void erase_if(btree_multiset<K, C, A> &set, Pred pred) {
+  auto pos = set.begin();
+  while (pos != set.end()) {
+    if (pred(*pos)) {
+      // Resume from the iterator erase() returns; `pos` itself is stale now.
+      pos = set.erase(pos);
+      continue;
+    }
+    ++pos;
+  }
+}
+
+} // namespace btree
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
new file mode 100644
index 000000000..dd4ede666
--- /dev/null
+++ b/src/include/crc32c.h
@@ -0,0 +1,57 @@
+#ifndef CEPH_CRC32C_H
+#define CEPH_CRC32C_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length); /* pointer to a crc32c routine: (initial crc, data, length) -> crc */
+
+/*
+ * global function pointer (defined elsewhere) holding the chosen crc32c
+ * implementation for the given architecture; ceph_crc32c() calls through it.
+ */
+extern ceph_crc32c_func_t ceph_crc32c_func;
+
+extern ceph_crc32c_func_t ceph_choose_crc32(void); /* returns a crc32c routine; presumably the source of ceph_crc32c_func -- confirm */
+
+/**
+ * calculate crc32c for data that is entirely 0 (ZERO)
+ *
+ * Note: works the same as ceph_crc32c_func called with data == nullptr,
+ * but is faster than the optimized assembly on certain architectures:
+ * it is faster than the intel optimized assembly, but not as fast as
+ * the ppc64le optimized assembly.
+ *
+ * @param crc initial value
+ * @param length length of the (implicit, all-zero) buffer
+ */
+uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
+
+/**
+ * calculate crc32c over a buffer
+ *
+ * Note: a NULL data pointer stands for a zero-filled buffer of the given
+ * length; the crc is calculated as if the data were all zeros.
+ *
+ * @param crc initial value
+ * @param data pointer to data buffer (may be NULL, see note)
+ * @param length length of buffer
+ */
+static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
+{
+#ifndef HAVE_POWER8
+  /* zero-filled input longer than 16 bytes goes to the dedicated helper */
+  if (length > 16 && !data) {
+    return ceph_crc32c_zeros(crc, length);
+  }
+#endif /* HAVE_POWER8 */
+
+  /* everything else goes through the implementation chosen at runtime */
+  return ceph_crc32c_func(crc, data, length);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/demangle.h b/src/include/demangle.h
new file mode 100644
index 000000000..9e46d952f
--- /dev/null
+++ b/src/include/demangle.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INCLUDE_DEMANGLE
+#define CEPH_INCLUDE_DEMANGLE
+
+//// Stole this code from http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
+// std::string is used by both branches below, so include it unconditionally.
+#include <string>
+
+#ifdef __GNUG__
+#include <cstdlib>
+#include <memory>
+#include <cxxabi.h>
+
+// Returns the demangled form of the mangled symbol `name`, or `name` itself
+// when abi::__cxa_demangle() reports a failure (non-zero status).
+static std::string ceph_demangle(const char* name)
+{
+  int status = -4; // some arbitrary value to eliminate the compiler warning
+
+  // __cxa_demangle() allocates its result with malloc(), so the buffer is
+  // owned by a unique_ptr that releases it with std::free.
+  std::unique_ptr<char, void(*)(void*)> res {
+    abi::__cxa_demangle(name, nullptr, nullptr, &status),
+    std::free
+  };
+
+  return (status == 0) ? res.get() : name;
+}
+
+#else
+
+// does nothing if not g++: the name is returned unchanged.  This must carry
+// the same name as the g++ variant so that callers build on every compiler.
+static std::string ceph_demangle(const char* name)
+{
+  return name;
+}
+
+// Previous name of the fallback, kept so existing non-g++ callers still build.
+static std::string demangle(const char* name)
+{
+  return ceph_demangle(name);
+}
+
+#endif
+
+
+#endif
diff --git a/src/include/denc.h b/src/include/denc.h
new file mode 100644
index 000000000..10fa8d0c8
--- /dev/null
+++ b/src/include/denc.h
@@ -0,0 +1,1903 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// If you #include "include/encoding.h" you get the old-style *and*
+// the new-style definitions. (The old-style needs denc_traits<> in
+// order to disable the container helpers when new-style traits are
+// present.)
+
+// You can also just #include "include/denc.h" and get only the
+// new-style helpers. The eventual goal is to drop the legacy
+// definitions.
+
+#ifndef _ENC_DEC_H
+#define _ENC_DEC_H
+
+#include <array>
+#include <cstring>
+#include <map>
+#include <optional>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/small_vector.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/optional.hpp>
+
+#include "include/compat.h"
+#include "include/intarith.h"
+#include "include/int_types.h"
+#include "include/scope_guard.h"
+
+#include "buffer.h"
+#include "byteorder.h"
+
+#include "common/convenience.h"
+#include "common/error_code.h"
+
+// Primary denc_traits<> template: the fallback selected for every type that
+// has no specialization.  It flags the type as not denc-capable.
+template<typename T, typename Enable = void>
+struct denc_traits {
+  static constexpr bool supported = false;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = true;
+};
+
+// Shorthand for denc_traits<T>::supported.
+template<typename T>
+inline constexpr bool denc_supported = denc_traits<T>::supported;
+
+
+// hack for debug only; FIXME
+//#include <iostream>
+//using std::cout;
+
+// Define this to compile in a dump of all encoded objects to disk to
+// populate ceph-object-corpus. Note that there is an almost
+// identical implementation in encoding.h, but you only need to define
+// ENCODE_DUMP_PATH here.
+//
+// See src/test/encoding/generate-corpus-objects.sh.
+//
+//#define ENCODE_DUMP_PATH /tmp/something
+
+#ifdef ENCODE_DUMP_PATH
+# include <cstdio>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+
+# define ENCODE_STR(x) #x
+# define ENCODE_STRINGIFY(x) ENCODE_STR(x)
+
+// Debugging helper, compiled in only when ENCODE_DUMP_PATH is defined.  An
+// instance records where the appender stood at construction time; when it
+// is destroyed it may write what was appended since then to a file below
+// ENCODE_DUMP_PATH.  Only a sample of the instances dump, see do_sample().
+// The counter used for sampling and for file names is kept per T.
+template<typename T>
+class DencDumper {
+public:
+  DencDumper(const char* name,
+             const ceph::bufferlist::contiguous_appender& appender)
+    : name{name},
+      appender{appender},
+      bl_offset{appender.bl.length()},
+      space_offset{space_size()},
+      start{appender.get_pos()}
+  {}
+  ~DencDumper() {
+    if (!do_sample()) {
+      return;
+    }
+    dump();
+  }
+private:
+  // Advance the counter and report whether it now has at most two bits set.
+  static bool do_sample() {
+    // counting the set bits is merely a trick to get a semi-reasonable
+    // distribution across time: somewhat exponential, but not quite.
+    ++i;
+    unsigned remaining = i;
+    int set_bits = 0;
+    while (remaining) {
+      remaining &= remaining - 1;   // clears the lowest set bit
+      ++set_bits;
+    }
+    return set_bits <= 2;
+  }
+  size_t space_size() const {
+    const auto logical = appender.get_logical_offset();
+    return logical - appender.get_out_of_band_offset();
+  }
+  // Write the sampled bytes to "<ENCODE_DUMP_PATH>/<name>__<pid>.<counter>".
+  void dump() const {
+    char path[PATH_MAX];
+    ::snprintf(path, sizeof(path),
+               ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", name,
+               getpid(), i++);
+    const int fd = ::open(path, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC|O_BINARY, 0644);
+    if (fd < 0) {
+      // best effort only: nothing is dumped when the file cannot be opened
+      return;
+    }
+    auto close_fd = make_scope_guard([fd] { ::close(fd); });
+    const auto bl_delta = appender.bl.length() - bl_offset;
+    if (bl_delta > 0) {
+      // the bufferlist grew since construction: gather that part of it,
+      // then add what currently sits in the appender's own space
+      ceph::bufferlist dump_bl;
+      auto from = appender.bl.begin(bl_offset + space_offset);
+      from.copy(bl_delta - space_offset, dump_bl);
+      const size_t space_len = space_size();
+      dump_bl.append(appender.get_pos() - space_len, space_len);
+      dump_bl.write_fd(fd);
+    } else {
+      // the bufferlist did not grow: write the span between the recorded
+      // start position and the appender's current position
+      size_t len = appender.get_pos() - start;
+      [[maybe_unused]] int r = ::write(fd, start, len);
+    }
+  }
+  const char* name;
+  const ceph::bufferlist::contiguous_appender& appender;
+  const size_t bl_offset;
+  const size_t space_offset;
+  const char* start;
+  static int i;
+};
+
+template<typename T> int DencDumper<T>::i = 0;
+
+# define DENC_DUMP_PRE(Type) \
+ DencDumper<Type> _denc_dumper{#Type, p};
+#else
+# define DENC_DUMP_PRE(Type)
+#endif
+
+
+/*
+
+ top level functions look like so
+ ================================
+
+ inline void denc(const T& o, size_t& p, uint64_t features=0);
+ inline void denc(const T& o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t features=0);
+ inline void denc(T& o, ceph::buffer::ptr::const_iterator& p, uint64_t features=0);
+
+ or (for featured objects)
+
+ inline void denc(const T& o, size_t& p, uint64_t features);
+ inline void denc(const T& o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t features);
+ inline void denc(T& o, ceph::buffer::ptr::const_iterator& p, uint64_t features);
+
+ - These are symmetrical, so that they can be used from the magic DENC
+ method of writing the bound_encode/encode/decode methods all in one go;
+ they differ only in the type of p.
+
+ - These are automatically fabricated via a template that calls into
+ the denc_traits<> methods (see below), provided denc_traits<T>::supported
+ is defined and true. They never need to be written explicitly.
+
+
+ static denc_traits<> definitions look like so
+ =============================================
+
+ template<>
+ struct denc_traits<T> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const T &o, size_t& p, uint64_t f=0);
+ static void encode(const T &o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f=0);
+ static void decode(T& o, ceph::buffer::ptr::const_iterator &p, uint64_t f=0);
+ };
+
+ or (for featured objects)
+
+ template<>
+ struct denc_traits<T> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = true;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const T &o, size_t& p, uint64_t f);
+ static void encode(const T &o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f);
+ static void decode(T& o, ceph::buffer::ptr::const_iterator &p, uint64_t f=0);
+ };
+
+ - denc_traits<T> is normally declared via the WRITE_CLASS_DENC(type) macro,
+ which is used in place of the old-style WRITE_CLASS_ENCODER(type) macro.
+ There are _FEATURED and _BOUNDED variants. The class traits simply call
+ into class methods of the same name (see below).
+
+ - denc_traits<T> can also be written explicitly for some type to indicate
+ how it should be encoded. This is the "source of truth" for how a type
+ is encoded.
+
+ - denc_traits<T> are declared for the base integer types, string, ceph::buffer::ptr,
+ and ceph::buffer::list base types.
+
+ - denc_traits<std::foo<T>>-like traits are declared for standard container
+ types.
+
+
+ class methods look like so
+ ==========================
+
+ void bound_encode(size_t& p) const;
+ void encode(ceph::buffer::list::contiguous_appender& p) const;
+ void decode(ceph::buffer::ptr::const_iterator &p);
+
+ or (for featured objects)
+
+ void bound_encode(size_t& p, uint64_t f) const;
+ void encode(ceph::buffer::list::contiguous_appender& p, uint64_t f) const;
+ void decode(ceph::buffer::ptr::const_iterator &p);
+
+ - These are normally invoked by the denc_traits<> methods that are
+ declared via WRITE_CLASS_DENC, although you can also invoke them explicitly
+ in your code.
+
+ - These methods are optimised for a contiguous buffer, but denc() will try
+ to rebuild a contiguous one if the decoded ceph::buffer::list is segmented. If you are
+ concerned about the cost, you might want to define yet another method:
+
+ void decode(ceph::buffer::list::iterator &p);
+
+ - These can be defined either explicitly (as above), or can be "magically"
+ defined all in one go using the DENC macro and DENC_{START,FINISH} helpers
+ (which work like the legacy {ENCODE,DECODE}_{START,FINISH} macros):
+
+ class foo_t {
+ ...
+ DENC(foo_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.foo, p);
+ denc(v.bar, p);
+ denc(v.baz, p);
+ DENC_FINISH(p);
+ }
+ ...
+ };
+ WRITE_CLASS_DENC(foo_t)
+
+ */
+
+// ---------------------------------------------------------------------
+// raw types
+namespace _denc {
+// True when T is exactly one of the listed types Us...
+template<typename T, typename... Us>
+inline constexpr bool is_any_of = std::disjunction_v<std::is_same<T, Us>...>;
+
+// Like std::underlying_type, except that a non-enum T maps to itself.
+template<typename T, typename Enable = void>
+struct underlying_type {
+  using type = T;
+};
+// enum case: take the member "type" from the standard trait
+template<typename T>
+struct underlying_type<T, std::enable_if_t<std::is_enum<T>::value>>
+  : std::underlying_type<T> {};
+template<typename T>
+using underlying_type_t = typename underlying_type<T>::type;
+}
+
+// Classifies the "p" argument handed to denc(): derives from std::true_type
+// when It::pointer points to a const object, from std::false_type otherwise.
+template<class It>
+struct is_const_iterator
+  : std::bool_constant<
+      std::is_const_v<std::remove_pointer_t<typename It::pointer>>>
+{};
+// size_t (the accumulator of the bound pass) has no ::pointer; never const
+template<>
+struct is_const_iterator<size_t> : std::false_type {};
+template<>
+struct is_const_iterator<ceph::buffer::list::contiguous_appender> : std::false_type {
+  // appender is used for *changing* the buffer
+};
+// Shorthand for is_const_iterator<It>::value.
+template<class It>
+inline constexpr bool is_const_iterator_v = is_const_iterator<It>::value;
+
+// Read-side accessor (const iterators): asks the iterator to advance by
+// sizeof(T) and views the pointer it returns as a const T.
+template<typename T, class It>
+std::enable_if_t<is_const_iterator_v<It>, const T&>
+get_pos_add(It& i) {
+  auto raw = i.get_pos_add(sizeof(T));
+  return *reinterpret_cast<const T*>(raw);
+}
+
+// Write-side accessor (non-const iterators): asks the iterator to advance
+// by sizeof(T) and views the pointer it returns as a writable T.
+template<typename T, class It>
+std::enable_if_t<!is_const_iterator_v<It>, T&>
+get_pos_add(It& i) {
+  auto raw = i.get_pos_add(sizeof(T));
+  return *reinterpret_cast<T*>(raw);
+}
+
+// denc_traits for types stored byte-for-byte: ceph_le64/32/16, uint8_t
+// (plus int8_t unless _CHAR_IS_SIGNED is defined) and enums whose
+// underlying type is one of those.  The object is copied as-is, so the
+// encoded size is always sizeof(T).
+template<typename T>
+struct denc_traits<
+  T,
+  std::enable_if_t<
+    _denc::is_any_of<_denc::underlying_type_t<T>,
+                     ceph_le64, ceph_le32, ceph_le16, uint8_t
+#ifndef _CHAR_IS_SIGNED
+                     , int8_t
+#endif
+                     >>> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  // bound pass: fixed size
+  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
+    p += sizeof(T);
+  }
+  // encode pass: store o into the next sizeof(T) bytes of p
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const T &o, It& p, uint64_t f=0) {
+    T& slot = get_pos_add<T>(p);
+    slot = o;
+  }
+  // decode pass over a contiguous buffer
+  template<class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(T& o, It& p, uint64_t f=0) {
+    const T& src = get_pos_add<T>(p);
+    o = src;
+  }
+  // decode pass over a bufferlist iterator: copy the bytes into o
+  static void decode(T& o, ceph::buffer::list::const_iterator &p) {
+    char* const dst = reinterpret_cast<char*>(&o);
+    p.copy(sizeof(T), dst);
+  }
+};
+
+
+// -----------------------------------------------------------------------
+// integer types
+
+// itype == internal type
+// otype == external type, i.e., the type on the wire
+
+// NOTE: the overload resolution ensures that the legacy encode/decode methods
+// defined for int types are preferred to the ones defined using the specialized
+// template, and hence get selected. This machinery prevents these from
+// getting glued into the legacy encode/decode methods; the overhead of setting
+// up a contiguous_appender etc is likely to be slower.
+namespace _denc {
+
+// ExtType<T>::type names the external (on-the-wire) type used for the
+// internal type T.  void means "no mapping".
+template<typename T, typename=void> struct ExtType {
+  using type = void;
+};
+
+// 16-bit integers
+template<typename T>
+struct ExtType<T, std::enable_if_t<is_any_of<T, int16_t, uint16_t>>> {
+  using type = ceph_le16;
+};
+
+// 32-bit integers
+template<typename T>
+struct ExtType<T, std::enable_if_t<is_any_of<T, int32_t, uint32_t>>> {
+  using type = ceph_le32;
+};
+
+// 64-bit integers
+template<typename T>
+struct ExtType<T, std::enable_if_t<is_any_of<T, int64_t, uint64_t>>> {
+  using type = ceph_le64;
+};
+
+// bool travels as a single byte
+template<>
+struct ExtType<bool> {
+  using type = uint8_t;
+};
+template<typename T>
+using ExtType_t = typename ExtType<T>::type;
+} // namespace _denc
+
+// denc_traits for every T that _denc::ExtType maps to a wire type (etype).
+// Values are converted to etype on encode and back to T on decode.
+template<typename T>
+struct denc_traits<T, std::enable_if_t<!std::is_void_v<_denc::ExtType_t<T>>>>
+{
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  using etype = _denc::ExtType_t<T>;
+  // bound pass: the wire type has a fixed size
+  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
+    p += sizeof(etype);
+  }
+  // encode pass: assign o to the next etype slot of p
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const T &o, It& p, uint64_t f=0) {
+    etype& slot = get_pos_add<etype>(p);
+    slot = o;
+  }
+  // decode pass over a contiguous buffer
+  template<class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(T& o, It &p, uint64_t f=0) {
+    const etype& wire = get_pos_add<etype>(p);
+    o = wire;
+  }
+  // decode pass over a bufferlist iterator: copy into a temporary etype
+  static void decode(T& o, ceph::buffer::list::const_iterator &p) {
+    etype wire;
+    p.copy(sizeof(etype), reinterpret_cast<char*>(&wire));
+    o = wire;
+  }
+};
+
+// varint
+//
+// high bit of each byte indicates another byte follows.
+// Bound pass for a varint of type T: add the worst-case encoded length to p.
+// Each output byte carries 7 payload bits, so a T needs at most
+// ceil(bits(T) / 7) bytes.  Note that sizeof(T) + 1 would be one byte short
+// for 64-bit types, whose largest values occupy 10 bytes.
+template<typename T>
+inline void denc_varint(T v, size_t& p) {
+  constexpr size_t value_bits = sizeof(T) * 8;
+  p += (value_bits + 6) / 7;
+}
+
+// Encode pass: emit v seven bits at a time, lowest bits first.  Every byte
+// except the last has its high bit set.
+template<typename T>
+inline void denc_varint(T v, ceph::buffer::list::contiguous_appender& p) {
+  for (;;) {
+    uint8_t chunk = v & 0x7f;
+    v >>= 7;
+    if (!v) {
+      // last byte: high bit stays clear
+      get_pos_add<__u8>(p) = chunk;
+      return;
+    }
+    chunk |= 0x80;
+    get_pos_add<__u8>(p) = chunk;
+  }
+}
+
+// Decode pass: rebuild v from 7-bit groups, lowest group first, for as long
+// as the byte just read has its high bit set.
+template<typename T>
+inline void denc_varint(T& v, ceph::buffer::ptr::const_iterator& p) {
+  uint8_t byte = *(__u8*)p.get_pos_add(1);
+  v = byte & 0x7f;
+  for (int shift = 7; byte & 0x80; shift += 7) {
+    byte = get_pos_add<__u8>(p);
+    v |= (T)(byte & 0x7f) << shift;
+  }
+}
+
+
+// signed varint encoding
+//
+// low bit = 1 = negative, 0 = positive
+// high bit of every byte indicates whether another byte follows.
+// Bound pass: add the reservation for one signed varint (10 bytes) to p.
+inline void denc_signed_varint(int64_t v, size_t& p) {
+  p += 2 + sizeof(int64_t);
+}
+// Encode pass: the sign goes into bit 0 and the magnitude above it; the
+// result is then written as a plain varint.
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_signed_varint(int64_t v, It& p) {
+  const bool negative = v < 0;
+  v = negative ? ((-v << 1) | 1) : (v << 1);
+  denc_varint(v, p);
+}
+
+// Decode pass: read a plain varint, then take the sign from bit 0 and the
+// magnitude from the bits above it.
+template<typename T, class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_signed_varint(T& v, It& p)
+{
+  int64_t raw = 0;
+  denc_varint(raw, p);
+  const int64_t magnitude = raw >> 1;
+  v = (raw & 1) ? -magnitude : magnitude;
+}
+
+// varint + lowz encoding
+//
+// first(low) 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 5 bits data in first byte, 7 bits data thereafter)
+// Bound pass: add the reservation for one lowz varint (10 bytes) to p.
+inline void denc_varint_lowz(uint64_t v, size_t& p) {
+  p += 2 + sizeof(uint64_t);
+}
+// Encode pass: strip up to three low zero nibbles from v, record how many
+// were stripped in the two lowest bits, and write the result as a varint.
+inline void denc_varint_lowz(uint64_t v,
+                             ceph::buffer::list::contiguous_appender& p) {
+  int lowznib = 0;
+  if (v) {
+    lowznib = ctz(v) / 4;
+    if (lowznib > 3) {
+      lowznib = 3;    // only two bits are available for the count
+    }
+  }
+  v >>= lowznib * 4;
+  v <<= 2;
+  v |= lowznib;
+  denc_varint(v, p);
+}
+
+// Decode pass: read a varint, take the nibble count from its two lowest
+// bits, and shift the remaining bits back up by that many nibbles.
+template<typename T>
+inline void denc_varint_lowz(T& v, ceph::buffer::ptr::const_iterator& p)
+{
+  uint64_t raw = 0;
+  denc_varint(raw, p);
+  const int lowznib = (raw & 3);
+  raw >>= 2;
+  raw <<= lowznib * 4;
+  v = raw;
+}
+
+// signed varint + lowz encoding
+//
+// first low bit = 1 for negative, 0 for positive
+// next 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 4 bits data in first byte, 7 bits data thereafter)
+// Bound pass: add the reservation for one signed lowz varint (10 bytes) to p.
+inline void denc_signed_varint_lowz(int64_t v, size_t& p) {
+  p += 2 + sizeof(int64_t);
+}
+// Encode pass: bit 0 holds the sign, bits 1-2 the number of low zero
+// nibbles stripped from the magnitude (at most three), and the stripped
+// magnitude sits above them.  The result is written as a plain varint.
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_signed_varint_lowz(int64_t v, It& p) {
+  const bool negative = v < 0;
+  if (negative) {
+    v = -v;
+  }
+  unsigned lowznib = v ? (ctz(v) / 4) : 0u;
+  if (lowznib > 3) {
+    lowznib = 3;
+  }
+  v >>= lowznib * 4;
+  v <<= 3;
+  v |= lowznib << 1;
+  v |= (int)negative;
+  denc_varint(v, p);
+}
+
+// Decode pass: read a plain varint, split off the sign (bit 0) and the
+// nibble count (bits 1-2), then shift the magnitude back into place.
+template<typename T, class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_signed_varint_lowz(T& v, It& p)
+{
+  int64_t raw = 0;
+  denc_varint(raw, p);
+  const int lowznib = (raw & 6) >> 1;
+  const bool negative = raw & 1;
+  raw >>= 3;
+  raw <<= lowznib * 4;
+  v = negative ? -raw : raw;
+}
+
+
+// LBA
+//
+// first 1-3 bits = how many low zero bits
+// *0 = 12 (common 4 K alignment case)
+// *01 = 16
+// *011 = 20
+// *111 = byte
+// then 28-30 bits of data
+// then last bit = another byte follows
+// high bit of each subsequent byte = another byte follows
+// Bound pass: add the reservation for one encoded LBA (10 bytes) to p.
+inline void denc_lba(uint64_t v, size_t& p) {
+  p += 2 + sizeof(uint64_t);
+}
+
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_lba(uint64_t v, It& p) { // encode pass: one 32-bit little-endian word, then 7-bit continuation bytes while bits of v remain
+ int low_zero_nibbles = v ? (int)(ctz(v) / 4) : 0; // count of zero nibbles at the low end of v
+ int pos; // number of low bits of the word taken by the tag
+ uint32_t word; // starts out holding just the tag bits
+ int t = low_zero_nibbles - 3; // t >= 0 means at least 12 low zero bits
+ if (t < 0) { // fewer than 12 low zero bits: tag 111, v is stored unshifted
+ pos = 3;
+ word = 0x7;
+ } else if (t < 3) { // 12, 16 or 20 low zero bits dropped: tag 0, 01 or 011
+ v >>= (low_zero_nibbles * 4);
+ pos = t + 1;
+ word = (1 << t) - 1;
+ } else { // 24 or more low zero bits: only 20 are dropped, tag 011
+ v >>= 20;
+ pos = 3;
+ word = 0x3;
+ }
+ word |= (v << pos) & 0x7fffffff; // payload sits above the tag; bit 31 stays clear for the continuation flag
+ v >>= 31 - pos; // discard the payload bits that went into the word
+ if (!v) { // everything fit: emit the word without the continuation flag
+ *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
+ return;
+ }
+ word |= 0x80000000; // continuation flag: more bytes follow the word
+ *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
+ uint8_t byte = v & 0x7f;
+ v >>= 7;
+ while (v) { // remaining bits go out 7 per byte; high bit set on all but the last
+ byte |= 0x80;
+ *(__u8*)p.get_pos_add(1) = byte;
+ byte = (v & 0x7f);
+ v >>= 7;
+ }
+ *(__u8*)p.get_pos_add(1) = byte; // final byte, high bit clear
+}
+
+template<class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_lba(uint64_t& v, It& p) { // decode pass: reads the 32-bit word, then continuation bytes while the high bit is set
+ uint32_t word = *(ceph_le32*)p.get_pos_add(sizeof(uint32_t));
+ int shift; // bit position in v where the first continuation byte lands
+ switch (word & 7) { // the low three bits of the word select the tag
+ case 0:
+ case 2:
+ case 4:
+ case 6: // bit 0 clear: 1-bit tag, 12 low zero bits, 30 payload bits
+ v = (uint64_t)(word & 0x7ffffffe) << (12 - 1);
+ shift = 12 + 30;
+ break;
+ case 1:
+ case 5: // low bits 01: 2-bit tag, 16 low zero bits, 29 payload bits
+ v = (uint64_t)(word & 0x7ffffffc) << (16 - 2);
+ shift = 16 + 29;
+ break;
+ case 3: // low bits 011: 3-bit tag, 20 low zero bits, 28 payload bits
+ v = (uint64_t)(word & 0x7ffffff8) << (20 - 3);
+ shift = 20 + 28;
+ break;
+ case 7: // low bits 111: 3-bit tag, no zero bits were dropped, 28 payload bits
+ v = (uint64_t)(word & 0x7ffffff8) >> 3;
+ shift = 28;
+ }
+ uint8_t byte = word >> 24; // top byte of the word; its high bit is the word's bit 31 (continuation flag)
+ while (byte & 0x80) { // each continuation byte adds 7 more bits above those already decoded
+ byte = *(__u8*)p.get_pos_add(1);
+ v |= (uint64_t)(byte & 0x7f) << shift;
+ shift += 7;
+ }
+}
+
+
+// ---------------------------------------------------------------------
+// denc top-level methods that call into denc_traits<T> methods
+
+// Bound pass entry point: forwards to traits::bound_encode(), passing the
+// feature bits along only when the traits are featured.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported> denc(
+  const T& o,
+  size_t& p,
+  uint64_t f=0)
+{
+  if constexpr (!traits::featured) {
+    traits::bound_encode(o, p);
+  } else {
+    traits::bound_encode(o, p, f);
+  }
+}
+
+// Encode pass entry point (p is not a const iterator): forwards to
+// traits::encode(), passing the feature bits along only when the traits
+// are featured.
+template<typename T, class It, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !is_const_iterator_v<It>>
+denc(const T& o,
+     It& p,
+     uint64_t features=0)
+{
+  if constexpr (!traits::featured) {
+    traits::encode(o, p);
+  } else {
+    traits::encode(o, p, features);
+  }
+}
+
+// Decode pass entry point (p is a const iterator): forwards to
+// traits::decode(), passing the feature bits along only when the traits
+// are featured.
+template<typename T, class It, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && is_const_iterator_v<It>>
+denc(T& o,
+     It& p,
+     uint64_t features=0)
+{
+  if constexpr (!traits::featured) {
+    traits::decode(o, p);
+  } else {
+    traits::decode(o, p, features);
+  }
+}
+
+namespace _denc {
+// Answers whether T can be decoded straight from a
+// ceph::buffer::list::const_iterator and, if so, provides a decode() that
+// does it.  Default answer: no.
+template<typename T, typename = void>
+struct has_legacy_denc : std::false_type {};
+// T has a member decode(ceph::buffer::list::const_iterator&) returning void
+template<typename T>
+struct has_legacy_denc<T, decltype(std::declval<T&>()
+                                   .decode(std::declval<
+                                           ceph::buffer::list::const_iterator&>()))>
+  : std::true_type {
+  static void decode(T& obj, ceph::buffer::list::const_iterator& it) {
+    obj.decode(it);
+  }
+};
+// denc_traits<T> does not insist on a contiguous buffer
+template<typename T>
+struct has_legacy_denc<T,
+                       std::enable_if_t<
+                         !denc_traits<T>::need_contiguous>> : std::true_type {
+  static void decode(T& obj, ceph::buffer::list::const_iterator& it) {
+    denc_traits<T>::decode(obj, it);
+  }
+};
+}
+
+// Decode entry point for a bufferlist iterator.  Takes part in overload
+// resolution only when T is denc-capable and _denc::has_legacy_denc<T>
+// says it can be decoded from such an iterator.
+template<typename T,
+         typename traits=denc_traits<T>,
+         typename has_legacy_denc=_denc::has_legacy_denc<T>>
+inline std::enable_if_t<has_legacy_denc::value &&
+                        traits::supported> denc(
+  T& o,
+  ceph::buffer::list::const_iterator& p)
+{
+  has_legacy_denc::decode(o, p);
+}
+
+// ---------------------------------------------------------------------
+// base types and containers
+
+//
+// std::string
+//
+// denc_traits for std::basic_string<char> with any allocator.  Wire form:
+// a 32-bit length followed by the characters.  The *_nohead variants handle
+// the characters only, the length being known by other means.
+template<typename A>
+struct denc_traits<std::basic_string<char,std::char_traits<char>,A>> {
+private:
+  using value_type = std::basic_string<char,std::char_traits<char>,A>;
+
+public:
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+
+  static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) {
+    p += s.size() + sizeof(uint32_t);
+  }
+  template<class It>
+  static void encode(const value_type& s,
+                     It& p,
+                     uint64_t f=0) {
+    const auto len = s.size();
+    denc((uint32_t)len, p);
+    memcpy(p.get_pos_add(len), s.data(), len);
+  }
+  template<class It>
+  static void decode(value_type& s,
+                     It& p,
+                     uint64_t f=0) {
+    uint32_t len;
+    denc(len, p);
+    decode_nohead(len, s, p);
+  }
+  static void decode(value_type& s, ceph::buffer::list::const_iterator& p)
+  {
+    uint32_t len;
+    denc(len, p);
+    decode_nohead(len, s, p);
+  }
+  // contiguous source: take len characters straight from the buffer
+  template<class It>
+  static void decode_nohead(size_t len, value_type& s, It& p) {
+    s.clear();
+    if (!len) {
+      return;
+    }
+    s.append(p.get_pos_add(len), len);
+  }
+  // bufferlist source: let the iterator copy len characters out
+  static void decode_nohead(size_t len, value_type& s,
+                            ceph::buffer::list::const_iterator& p) {
+    if (!len) {
+      s.clear();
+      return;
+    }
+    if constexpr (std::is_same_v<value_type, std::string>) {
+      s.clear();
+      p.copy(len, s);
+    } else {
+      s.resize(len);
+      p.copy(len, s.data());
+    }
+  }
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode_nohead(const value_type& s, It& p) {
+    const auto len = s.length();
+    maybe_inline_memcpy(p.get_pos_add(len), s.data(), len, 16);
+  }
+};
+
+//
+// ceph::buffer::ptr
+//
+// denc_traits for ceph::buffer::ptr.  Wire form: a 32-bit length followed
+// by the bytes.
+template<>
+struct denc_traits<ceph::buffer::ptr> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+  static void bound_encode(const ceph::buffer::ptr& v, size_t& p, uint64_t f=0) {
+    p += v.length() + sizeof(uint32_t);
+  }
+  template <class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const ceph::buffer::ptr& v, It& p, uint64_t f=0) {
+    const auto len = (uint32_t)v.length();
+    denc(len, p);
+    p.append(v);
+  }
+  template <class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(ceph::buffer::ptr& v, It& p, uint64_t f=0) {
+    uint32_t len;
+    denc(len, p);
+    v = p.get_ptr(len);
+  }
+  static void decode(ceph::buffer::ptr& v, ceph::buffer::list::const_iterator& p) {
+    uint32_t len;
+    denc(len, p);
+    ceph::buffer::list s;
+    p.copy(len, s);
+    if (!len) {
+      // NOTE(review): v keeps whatever it held before for a zero-length
+      // ptr, unlike the contiguous overload which always assigns; confirm
+      // that this is intended.
+      return;
+    }
+    if (s.get_num_buffers() == 1) {
+      // a single underlying buffer: use it directly
+      v = s.front();
+    } else {
+      // several buffers: copy them into one
+      v = ceph::buffer::copy(s.c_str(), s.length());
+    }
+  }
+};
+
+//
+// ceph::buffer::list
+//
+template<>
+struct denc_traits<ceph::buffer::list> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = false;
+ static constexpr bool need_contiguous = false;
+ static void bound_encode(const ceph::buffer::list& v, size_t& p, uint64_t f=0) {
+ p += sizeof(uint32_t) + v.length();
+ }
+ static void encode(const ceph::buffer::list& v, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f=0) {
+ denc((uint32_t)v.length(), p);
+ p.append(v);
+ }
+ static void decode(ceph::buffer::list& v, ceph::buffer::ptr::const_iterator& p, uint64_t f=0) {
+ uint32_t len;
+ denc(len, p);
+ v.clear();
+ v.push_back(p.get_ptr(len));
+ }
+ static void decode(ceph::buffer::list& v, ceph::buffer::list::const_iterator& p) {
+ uint32_t len;
+ denc(len, p);
+ v.clear();
+ p.copy(len, v);
+ }
+ static void encode_nohead(const ceph::buffer::list& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ p.append(v);
+ }
+ static void decode_nohead(size_t len, ceph::buffer::list& v,
+ ceph::buffer::ptr::const_iterator& p) {
+ v.clear();
+ if (len) {
+ v.append(p.get_ptr(len));
+ }
+ }
+ static void decode_nohead(size_t len, ceph::buffer::list& v,
+ ceph::buffer::list::const_iterator& p) {
+ v.clear();
+ p.copy(len, v);
+ }
+};
+
+//
+// std::pair<A, B>
+//
+// denc_traits for std::pair<A, B>: first is processed before second.  A may
+// be const-qualified (as in a map's value_type); decode casts the const
+// away in order to fill in first.
+template<typename A, typename B>
+struct denc_traits<
+  std::pair<A, B>,
+  std::enable_if_t<denc_supported<std::remove_const_t<A>> && denc_supported<B>>> {
+  // NOTE(review): a_traits is instantiated with A as given, i.e. possibly
+  // const-qualified, while the enable_if above strips const before checking
+  // support; confirm this selects the intended traits.
+  using a_traits = denc_traits<A>;
+  using b_traits = denc_traits<B>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = a_traits::featured || b_traits::featured ;
+  static constexpr bool bounded = a_traits::bounded && b_traits::bounded;
+  static constexpr bool need_contiguous = (a_traits::need_contiguous ||
+                                           b_traits::need_contiguous);
+
+  static void bound_encode(const std::pair<A,B>& v, size_t& p, uint64_t f = 0) {
+    if constexpr (!featured) {
+      denc(v.first, p);
+      denc(v.second, p);
+    } else {
+      denc(v.first, p, f);
+      denc(v.second, p, f);
+    }
+  }
+
+  static void encode(const std::pair<A,B>& v, ceph::buffer::list::contiguous_appender& p,
+                     uint64_t f = 0) {
+    if constexpr (!featured) {
+      denc(v.first, p);
+      denc(v.second, p);
+    } else {
+      denc(v.first, p, f);
+      denc(v.second, p, f);
+    }
+  }
+
+  static void decode(std::pair<A,B>& v, ceph::buffer::ptr::const_iterator& p, uint64_t f=0) {
+    auto& first = const_cast<std::remove_const_t<A>&>(v.first);
+    denc(first, p, f);
+    denc(v.second, p, f);
+  }
+  // only offered when neither member requires a contiguous buffer
+  template<typename AA=A>
+  static std::enable_if_t<!!sizeof(AA) && !need_contiguous>
+  decode(std::pair<A,B>& v, ceph::buffer::list::const_iterator& p,
+         uint64_t f = 0) {
+    auto& first = const_cast<std::remove_const_t<AA>&>(v.first);
+    denc(first, p);
+    denc(v.second, p);
+  }
+};
+
+namespace _denc {
+  // Shared denc_traits implementation for containers of the form C<Ts...>.
+  // Details supplies the element type (Details::T) and the reserve()/insert()
+  // hooks used while decoding.  Wire form: a 32-bit element count followed
+  // by the elements; the *_nohead variants handle the elements only.
+  template<template<class...> class C, typename Details, typename ...Ts>
+  struct container_base {
+  private:
+    using container = C<Ts...>;
+    using T = typename Details::T;
+
+  public:
+    using traits = denc_traits<T>;
+
+    static constexpr bool supported = true;
+    static constexpr bool featured = traits::featured;
+    static constexpr bool bounded = false;
+    static constexpr bool need_contiguous = traits::need_contiguous;
+
+    template<typename U=T>
+    static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+      p += sizeof(uint32_t);
+      if constexpr (!traits::bounded) {
+        // element sizes vary: ask every element
+        for (const T& e : s) {
+          if constexpr (traits::featured) {
+            denc(e, p, f);
+          } else {
+            denc(e, p);
+          }
+        }
+      } else {
+        // bounded elements: size up the first one and multiply by the count
+#if _GLIBCXX_USE_CXX11_ABI
+        // intentionally not calling container's empty() method to not prohibit
+        // compiler from optimizing the check if it and the ::size() operate on
+        // different memory (observed when std::list::empty() works on pointers,
+        // not the size field).
+        if (const auto elem_num = s.size(); elem_num > 0) {
+#else
+        if (!s.empty()) {
+          const auto elem_num = s.size();
+#endif
+          // STL containers use weird element types like std::pair<const K, V>;
+          // cast to something we have denc_traits for.
+          const T& first = static_cast<const T&>(*s.begin());
+          size_t elem_size = 0;
+          if constexpr (traits::featured) {
+            denc(first, elem_size, f);
+          } else {
+            denc(first, elem_size);
+          }
+          p += elem_size * elem_num;
+        }
+      }
+    }
+
+    template<typename U=T>
+    static void encode(const container& s,
+                       ceph::buffer::list::contiguous_appender& p,
+                       uint64_t f = 0) {
+      const auto count = (uint32_t)s.size();
+      denc(count, p);
+      if constexpr (traits::featured) {
+        encode_nohead(s, p, f);
+      } else {
+        encode_nohead(s, p);
+      }
+    }
+    static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+                       uint64_t f = 0) {
+      uint32_t count;
+      denc(count, p);
+      decode_nohead(count, s, p, f);
+    }
+    // only offered when the element type does not need a contiguous buffer
+    template<typename U=T>
+    static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+    decode(container& s, ceph::buffer::list::const_iterator& p) {
+      uint32_t count;
+      denc(count, p);
+      decode_nohead(count, s, p);
+    }
+
+    // nohead
+    static void encode_nohead(const container& s, ceph::buffer::list::contiguous_appender& p,
+                              uint64_t f = 0) {
+      for (const T& e : s) {
+        if constexpr (traits::featured) {
+          denc(e, p, f);
+        } else {
+          denc(e, p);
+        }
+      }
+    }
+    static void decode_nohead(size_t num, container& s,
+                              ceph::buffer::ptr::const_iterator& p,
+                              uint64_t f=0) {
+      s.clear();
+      Details::reserve(s, num);
+      for (; num; --num) {
+        T elem;
+        denc(elem, p, f);
+        Details::insert(s, std::move(elem));
+      }
+    }
+    template<typename U=T>
+    static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+    decode_nohead(size_t num, container& s,
+                  ceph::buffer::list::const_iterator& p) {
+      s.clear();
+      Details::reserve(s, num);
+      for (; num; --num) {
+        T elem;
+        denc(elem, p);
+        Details::insert(s, std::move(elem));
+      }
+    }
+  };
+
+  // Detects whether the container type T offers a member callable as
+  // reserve(T::size_type).
+  //
+  // The probe is made on T itself, via expression SFINAE.  Probing
+  // denc_traits<T> for a static function of type T(*)(size_type) instead
+  // cannot succeed for an ordinary container, which would make
+  // container_has_reserve_v false everywhere and turn every
+  // container_details_base::reserve() call into a no-op.
+  template<typename T>
+  class container_has_reserve {
+    // viable only when the reserve() call expression is well-formed
+    template<typename U>
+    static auto test(int)
+      -> decltype(std::declval<U&>().reserve(
+                    std::declval<typename U::size_type>()),
+                  std::true_type{});
+
+    // fallback, chosen when the overload above drops out
+    template<typename U>
+    static std::false_type test(...);
+
+  public:
+    static constexpr bool value = decltype(test<T>(0))::value;
+  };
+  // Shorthand for container_has_reserve<T>::value.
+  template<typename T>
+  inline constexpr bool container_has_reserve_v =
+    container_has_reserve<T>::value;
+
+
+  // Common part of the per-container "Details" helpers: names the element
+  // type and provides reserve().
+  template<typename Container>
+  struct container_details_base {
+    using T = typename Container::value_type;
+    // Reserve room for s elements when container_has_reserve_v says the
+    // container can do so; otherwise do nothing.
+    static void reserve(Container& c, size_t s) {
+      if constexpr (!container_has_reserve_v<Container>) {
+        return;
+      } else {
+        c.reserve(s);
+      }
+    }
+  };
+
+  // Details for containers that are filled from the back: insert()
+  // forwards its arguments to emplace_back().
+  template<typename Container>
+  struct pushback_details : public container_details_base<Container> {
+    template<typename ...Vs>
+    static void insert(Container& c, Vs&& ...vs) {
+      c.emplace_back(std::forward<Vs>(vs)...);
+    }
+  };
+}
+
+// std::list<T, ...> of a denc-capable T: container_base with back insertion
+template<typename T, typename ...Ts>
+struct denc_traits<
+  std::list<T, Ts...>,
+  std::enable_if_t<denc_supported<T>>>
+  : public _denc::container_base<std::list,
+                                 _denc::pushback_details<std::list<T, Ts...>>,
+                                 T, Ts...> {};
+
+// std::vector<T, ...> of a denc-capable T: container_base with back insertion
+template<typename T, typename ...Ts>
+struct denc_traits<
+  std::vector<T, Ts...>,
+  std::enable_if_t<denc_supported<T>>>
+  : public _denc::container_base<std::vector,
+                                 _denc::pushback_details<std::vector<T, Ts...>>,
+                                 T, Ts...> {};
+
+// denc_traits for boost::container::small_vector<T, N, ...> of a
+// denc-capable T.  Wire form: a 32-bit element count followed by the
+// elements; the *_nohead variants handle the elements only.
+template<typename T, std::size_t N, typename ...Ts>
+struct denc_traits<
+  boost::container::small_vector<T, N, Ts...>,
+  std::enable_if_t<denc_supported<T>>> {
+private:
+  using container = boost::container::small_vector<T, N, Ts...>;
+public:
+  using traits = denc_traits<T>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = traits::featured;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = traits::need_contiguous;
+
+  template<typename U=T>
+  static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+    p += sizeof(uint32_t);
+    if constexpr (!traits::bounded) {
+      // element sizes vary: ask every element
+      for (const T& e : s) {
+        if constexpr (traits::featured) {
+          denc(e, p, f);
+        } else {
+          denc(e, p);
+        }
+      }
+    } else {
+      // bounded elements: size up the first one and multiply by the count
+      if (s.empty()) {
+        return;
+      }
+      const auto elem_num = s.size();
+      size_t elem_size = 0;
+      if constexpr (traits::featured) {
+        denc(*s.begin(), elem_size, f);
+      } else {
+        denc(*s.begin(), elem_size);
+      }
+      p += elem_size * elem_num;
+    }
+  }
+
+  template<typename U=T>
+  static void encode(const container& s,
+                     ceph::buffer::list::contiguous_appender& p,
+                     uint64_t f = 0) {
+    const auto count = (uint32_t)s.size();
+    denc(count, p);
+    if constexpr (traits::featured) {
+      encode_nohead(s, p, f);
+    } else {
+      encode_nohead(s, p);
+    }
+  }
+  static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+                     uint64_t f = 0) {
+    uint32_t count;
+    denc(count, p);
+    decode_nohead(count, s, p, f);
+  }
+  // only offered when the element type does not need a contiguous buffer
+  template<typename U=T>
+  static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+  decode(container& s, ceph::buffer::list::const_iterator& p) {
+    uint32_t count;
+    denc(count, p);
+    decode_nohead(count, s, p);
+  }
+
+  // nohead
+  static void encode_nohead(const container& s, ceph::buffer::list::contiguous_appender& p,
+                            uint64_t f = 0) {
+    for (const T& e : s) {
+      if constexpr (traits::featured) {
+        denc(e, p, f);
+      } else {
+        denc(e, p);
+      }
+    }
+  }
+  static void decode_nohead(size_t num, container& s,
+                            ceph::buffer::ptr::const_iterator& p,
+                            uint64_t f=0) {
+    s.clear();
+    s.reserve(num);
+    for (; num; --num) {
+      T elem;
+      denc(elem, p, f);
+      s.push_back(std::move(elem));
+    }
+  }
+  template<typename U=T>
+  static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+  decode_nohead(size_t num, container& s,
+                ceph::buffer::list::const_iterator& p) {
+    s.clear();
+    s.reserve(num);
+    for (; num; --num) {
+      T elem;
+      denc(elem, p);
+      s.push_back(std::move(elem));
+    }
+  }
+};
+
+namespace _denc {
+  // Details for set-like containers: insert() emplaces with the end of the
+  // container as the position hint.
+  template<typename Container>
+  struct setlike_details : public container_details_base<Container> {
+    using T = typename Container::value_type;
+    template<typename ...Vs>
+    static void insert(Container& c, Vs&& ...vs) {
+      const auto hint = c.cend();
+      c.emplace_hint(hint, std::forward<Vs>(vs)...);
+    }
+  };
+}
+
+// denc_traits for std::set<T, ...>. Enabled only when T itself is
+// denc-supported; everything is inherited from _denc::container_base
+// parameterized with the set-like insertion policy.
+template<typename T, typename ...Ts>
+struct denc_traits<
+ std::set<T, Ts...>,
+ std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<std::set,
+ _denc::setlike_details<std::set<T, Ts...>>,
+ T, Ts...> {};
+
+// denc_traits for boost::container::flat_set<T, ...>. Enabled only when T
+// is denc-supported; reuses _denc::container_base with the set-like
+// insertion policy, exactly like the std::set specialization.
+template<typename T, typename ...Ts>
+struct denc_traits<
+ boost::container::flat_set<T, Ts...>,
+ std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<
+ boost::container::flat_set,
+ _denc::setlike_details<boost::container::flat_set<T, Ts...>>,
+ T, Ts...> {};
+
+namespace _denc {
+ // Insertion policy for map-like containers. T is the container's
+ // value_type; new entries are emplaced with cend() as the position hint.
+ template<typename Container>
+ struct maplike_details : public container_details_base<Container> {
+ using T = typename Container::value_type;
+ template<typename ...Args>
+ static void insert(Container& c, Args&& ...args) {
+ c.emplace_hint(c.cend(), std::forward<Args>(args)...);
+ }
+ };
+}
+
+// denc_traits for std::map<A, B, ...>. Enabled only when both the key type
+// A and the mapped type B are denc-supported; everything is inherited from
+// _denc::container_base with the map-like insertion policy.
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+ std::map<A, B, Ts...>,
+ std::enable_if_t<denc_traits<A>::supported &&
+ denc_traits<B>::supported>>
+ : public _denc::container_base<std::map,
+ _denc::maplike_details<std::map<A, B, Ts...>>,
+ A, B, Ts...> {};
+
+// denc_traits for boost::container::flat_map<A, B, ...>. Enabled only when
+// both A and B are denc-supported; reuses _denc::container_base with the
+// map-like insertion policy, exactly like the std::map specialization.
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+ boost::container::flat_map<A, B, Ts...>,
+ std::enable_if_t<denc_traits<A>::supported &&
+ denc_traits<B>::supported>>
+ : public _denc::container_base<
+ boost::container::flat_map,
+ _denc::maplike_details<boost::container::flat_map<
+ A, B, Ts...>>,
+ A, B, Ts...> {};
+
+// denc_traits for std::array<T, N>. The array length is part of the type,
+// so no element count is written: encode/decode simply process the N
+// elements in order. featured/bounded/need_contiguous are taken from the
+// element traits.
+template<typename T, size_t N>
+struct denc_traits<
+  std::array<T, N>,
+  std::enable_if_t<denc_traits<T>::supported>> {
+private:
+  using container = std::array<T, N>;
+public:
+  using traits = denc_traits<T>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = traits::featured;
+  static constexpr bool bounded = traits::bounded;
+  static constexpr bool need_contiguous = traits::need_contiguous;
+
+  // Add the size estimate for the whole array to `p`.
+  // For bounded element types the first element is sized once and the
+  // result multiplied by N; otherwise every element is sized individually.
+  static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+    if constexpr (traits::bounded) {
+      // A zero-length array has no first element to look at; dereferencing
+      // begin() there is undefined, so both the featured and the
+      // non-featured path are guarded.
+      if (!s.empty()) {
+        size_t elem_size = 0;
+        if constexpr (traits::featured) {
+          denc(*s.begin(), elem_size, f);
+        } else {
+          denc(*s.begin(), elem_size);
+        }
+        p += elem_size * N;
+      }
+    } else {
+      for (const auto& e : s) {
+        if constexpr (traits::featured) {
+          denc(e, p, f);
+        } else {
+          denc(e, p);
+        }
+      }
+    }
+  }
+
+  // Encode each element in order; `f` is forwarded only for featured
+  // element types.
+  static void encode(const container& s, ceph::buffer::list::contiguous_appender& p,
+                     uint64_t f = 0) {
+    for (const auto& e : s) {
+      if constexpr (traits::featured) {
+        denc(e, p, f);
+      } else {
+        denc(e, p);
+      }
+    }
+  }
+  // Decode each element in place from a contiguous buffer iterator.
+  static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+                     uint64_t f = 0) {
+    for (auto& e : s)
+      denc(e, p, f);
+  }
+  // Decode each element in place from a bufferlist iterator; only offered
+  // when the element type does not require a contiguous buffer.
+  template<typename U=T>
+  static std::enable_if_t<!!sizeof(U) &&
+                          !need_contiguous>
+  decode(container& s, ceph::buffer::list::const_iterator& p) {
+    for (auto& e : s) {
+      denc(e, p);
+    }
+  }
+};
+
+// denc_traits for std::tuple<Ts...>. Enabled only when every element type
+// is denc-supported. The members are visited in order via ceph::for_each;
+// no header is written.
+template<typename... Ts>
+struct denc_traits<
+ std::tuple<Ts...>,
+ std::enable_if_t<(denc_traits<Ts>::supported && ...)>> {
+
+private:
+ static_assert(sizeof...(Ts) > 0,
+ "Zero-length tuples are not supported.");
+ using container = std::tuple<Ts...>;
+
+public:
+
+ // The tuple is featured / needs a contiguous buffer if ANY member does,
+ // and is bounded only if ALL members are.
+ static constexpr bool supported = true;
+ static constexpr bool featured = (denc_traits<Ts>::featured || ...);
+ static constexpr bool bounded = (denc_traits<Ts>::bounded && ...);
+ static constexpr bool need_contiguous =
+ (denc_traits<Ts>::need_contiguous || ...);
+
+ // Featured tuples: `f` is mandatory and is passed only to those members
+ // whose own traits are featured.
+ template<typename U = container>
+ static std::enable_if_t<denc_traits<U>::featured>
+ bound_encode(const container& s, size_t& p, uint64_t f) {
+ ceph::for_each(s, [&p, f] (const auto& e) {
+ if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ });
+ }
+ // Non-featured tuples: no feature argument at all.
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::featured>
+ bound_encode(const container& s, size_t& p) {
+ ceph::for_each(s, [&p] (const auto& e) {
+ denc(e, p);
+ });
+ }
+
+ // encode mirrors bound_encode: one overload per featured-ness.
+ template<typename U = container>
+ static std::enable_if_t<denc_traits<U>::featured>
+ encode(const container& s, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f) {
+ ceph::for_each(s, [&p, f] (const auto& e) {
+ if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ });
+ }
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::featured>
+ encode(const container& s, ceph::buffer::list::contiguous_appender& p) {
+ ceph::for_each(s, [&p] (const auto& e) {
+ denc(e, p);
+ });
+ }
+
+ // Both decode overloads accept `f` but do not forward it: the lambdas
+ // capture only `p` and call the two-argument denc().
+ static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ ceph::for_each(s, [&p] (auto& e) {
+ denc(e, p);
+ });
+ }
+
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::need_contiguous>
+ decode(container& s, ceph::buffer::list::const_iterator& p, uint64_t f = 0) {
+ ceph::for_each(s, [&p] (auto& e) {
+ denc(e, p);
+ });
+ }
+};
+
+//
+// boost::optional<T>
+//
+// Encoding of boost::optional<T>: a bool presence flag, followed by the
+// value itself when one is present. Never bounded, because the size depends
+// on whether a value is held.
+template<typename T>
+struct denc_traits<
+  boost::optional<T>,
+  std::enable_if_t<denc_traits<T>::supported>> {
+  using traits = denc_traits<T>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = traits::featured;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = traits::need_contiguous;
+
+  // Size estimate: the flag, plus the held value if there is one.
+  static void bound_encode(const boost::optional<T>& v, size_t& p,
+                           uint64_t f = 0) {
+    p += sizeof(bool);
+    if (!v) {
+      return;
+    }
+    if constexpr (featured) {
+      denc(*v, p, f);
+    } else {
+      denc(*v, p);
+    }
+  }
+
+  // Write the presence flag, then the value when present.
+  static void encode(const boost::optional<T>& v,
+                     ceph::buffer::list::contiguous_appender& p,
+                     uint64_t f = 0) {
+    const bool present = static_cast<bool>(v);
+    denc(present, p);
+    if (!present) {
+      return;
+    }
+    if constexpr (featured) {
+      denc(*v, p, f);
+    } else {
+      denc(*v, p);
+    }
+  }
+
+  // Read the presence flag; either decode into a fresh T or reset to none.
+  static void decode(boost::optional<T>& v, ceph::buffer::ptr::const_iterator& p,
+                     uint64_t f = 0) {
+    bool present;
+    denc(present, p, f);
+    if (!present) {
+      v = boost::none;
+      return;
+    }
+    v = T{};
+    denc(*v, p, f);
+  }
+
+  // Bufferlist-iterator variant; only offered when T does not need a
+  // contiguous buffer.
+  template<typename U = T>
+  static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+  decode(boost::optional<T>& v, ceph::buffer::list::const_iterator& p) {
+    bool present;
+    denc(present, p);
+    if (!present) {
+      v = boost::none;
+      return;
+    }
+    v = T{};
+    denc(*v, p);
+  }
+
+  // Write only the value (no presence flag); nothing at all when empty.
+  template<typename U = T>
+  static void encode_nohead(const boost::optional<T>& v,
+                            ceph::buffer::list::contiguous_appender& p,
+                            uint64_t f = 0) {
+    if (!v) {
+      return;
+    }
+    if constexpr (featured) {
+      denc(*v, p, f);
+    } else {
+      denc(*v, p);
+    }
+  }
+
+  // `num` plays the role of the presence flag supplied by the caller.
+  static void decode_nohead(bool num, boost::optional<T>& v,
+                            ceph::buffer::ptr::const_iterator& p, uint64_t f = 0) {
+    if (!num) {
+      v = boost::none;
+      return;
+    }
+    v = T();
+    denc(*v, p, f);
+  }
+};
+
+// denc_traits for boost::none_t: encodes as a single `false` bool, i.e.
+// the same bytes the boost::optional<T> encoder writes for an empty
+// optional. Encode-only: no decode is provided.
+template<>
+struct denc_traits<boost::none_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+
+ // Size estimate: just the presence flag.
+ static void bound_encode(const boost::none_t& v, size_t& p) {
+ p += sizeof(bool);
+ }
+
+ // Writes the `false` presence flag; `v` carries no data.
+ static void encode(const boost::none_t& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ denc(false, p);
+ }
+};
+
+//
+// std::optional<T>
+//
+// Encoding of std::optional<T>: a bool presence flag, followed by the value
+// itself when one is present. Never bounded, because the size depends on
+// whether a value is held.
+template<typename T>
+struct denc_traits<
+  std::optional<T>,
+  std::enable_if_t<denc_traits<T>::supported>> {
+  using traits = denc_traits<T>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = traits::featured;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = traits::need_contiguous;
+
+  // Size estimate: the flag, plus the held value if there is one.
+  static void bound_encode(const std::optional<T>& v, size_t& p,
+                           uint64_t f = 0) {
+    p += sizeof(bool);
+    if (!v.has_value()) {
+      return;
+    }
+    if constexpr (featured) {
+      denc(*v, p, f);
+    } else {
+      denc(*v, p);
+    }
+  }
+
+  // Write the presence flag, then the value when present.
+  static void encode(const std::optional<T>& v,
+                     ceph::buffer::list::contiguous_appender& p,
+                     uint64_t f = 0) {
+    const bool present = v.has_value();
+    denc(present, p);
+    if (!present) {
+      return;
+    }
+    if constexpr (featured) {
+      denc(*v, p, f);
+    } else {
+      denc(*v, p);
+    }
+  }
+
+  // Read the presence flag; either decode into a fresh T or disengage.
+  static void decode(std::optional<T>& v, ceph::buffer::ptr::const_iterator& p,
+                     uint64_t f = 0) {
+    bool present;
+    denc(present, p, f);
+    if (!present) {
+      v.reset();
+      return;
+    }
+    v = T{};
+    denc(*v, p, f);
+  }
+
+  // Bufferlist-iterator variant; only offered when T does not need a
+  // contiguous buffer.
+  template<typename U = T>
+  static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+  decode(std::optional<T>& v, ceph::buffer::list::const_iterator& p) {
+    bool present;
+    denc(present, p);
+    if (!present) {
+      v.reset();
+      return;
+    }
+    v = T{};
+    denc(*v, p);
+  }
+
+  // Write only the value (no presence flag); nothing at all when empty.
+  static void encode_nohead(const std::optional<T>& v,
+                            ceph::buffer::list::contiguous_appender& p,
+                            uint64_t f = 0) {
+    if (!v.has_value()) {
+      return;
+    }
+    if constexpr (featured) {
+      denc(*v, p, f);
+    } else {
+      denc(*v, p);
+    }
+  }
+
+  // `num` plays the role of the presence flag supplied by the caller.
+  static void decode_nohead(bool num, std::optional<T>& v,
+                            ceph::buffer::ptr::const_iterator& p, uint64_t f = 0) {
+    if (!num) {
+      v.reset();
+      return;
+    }
+    v = T();
+    denc(*v, p, f);
+  }
+};
+
+// denc_traits for std::nullopt_t: encodes as a single `false` bool, i.e.
+// the same bytes the std::optional<T> encoder writes for an empty optional.
+// Encode-only: no decode is provided.
+template<>
+struct denc_traits<std::nullopt_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+
+ // Size estimate: just the presence flag.
+ static void bound_encode(const std::nullopt_t& v, size_t& p) {
+ p += sizeof(bool);
+ }
+
+ // Writes the `false` presence flag; `v` carries no data.
+ static void encode(const std::nullopt_t& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ denc(false, p);
+ }
+};
+
+// ----------------------------------------------------------------------
+// class helpers
+
+// Write denc_traits<> for a class that defines bound_encode/encode/decode
+// methods.
+
+// WRITE_CLASS_DENC(T) / WRITE_CLASS_DENC_BOUNDED(T) expand to a full
+// specialization denc_traits<T> that forwards to T's own member functions
+// bound_encode(p), encode(p) and decode(p). The generated traits are never
+// featured: the `f` argument is accepted for signature compatibility and
+// ignored. `b` selects the value of `bounded`; need_contiguous is true
+// unless _denc::has_legacy_denc<T>::value holds.
+#define WRITE_CLASS_DENC(T) _DECLARE_CLASS_DENC(T, false)
+#define WRITE_CLASS_DENC_BOUNDED(T) _DECLARE_CLASS_DENC(T, true)
+#define _DECLARE_CLASS_DENC(T, b) \
+ template<> struct denc_traits<T> { \
+ static constexpr bool supported = true; \
+ static constexpr bool featured = false; \
+ static constexpr bool bounded = b; \
+ static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+ static void bound_encode(const T& v, size_t& p, uint64_t f=0) { \
+ v.bound_encode(p); \
+ } \
+ static void encode(const T& v, ::ceph::buffer::list::contiguous_appender& p, \
+ uint64_t f=0) { \
+ v.encode(p); \
+ } \
+ static void decode(T& v, ::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ v.decode(p); \
+ } \
+ };
+
+// Featured counterpart of WRITE_CLASS_DENC: the generated denc_traits<T>
+// has featured = true and passes the feature bits `f` through to T's member
+// bound_encode(p, f), encode(p, f) and decode(p, f). `f` has no default for
+// bound_encode/encode, so callers must supply it; decode defaults it to 0.
+#define WRITE_CLASS_DENC_FEATURED(T) _DECLARE_CLASS_DENC_FEATURED(T, false)
+#define WRITE_CLASS_DENC_FEATURED_BOUNDED(T) _DECLARE_CLASS_DENC_FEATURED(T, true)
+#define _DECLARE_CLASS_DENC_FEATURED(T, b) \
+ template<> struct denc_traits<T> { \
+ static constexpr bool supported = true; \
+ static constexpr bool featured = true; \
+ static constexpr bool bounded = b; \
+ static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+ static void bound_encode(const T& v, size_t& p, uint64_t f) { \
+ v.bound_encode(p, f); \
+ } \
+ static void encode(const T& v, ::ceph::buffer::list::contiguous_appender& p, \
+ uint64_t f) { \
+ v.encode(p, f); \
+ } \
+ static void decode(T& v, ::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ v.decode(p, f); \
+ } \
+ };
+
+// ----------------------------------------------------------------------
+// encoded_sizeof_wrapper
+
+namespace ceph {
+
+// Size reported by traits::bound_encode for a default-constructed T.
+// Only available for supported, bounded types.
+template <typename T, typename traits=denc_traits<T>>
+constexpr std::enable_if_t<traits::supported && traits::bounded, size_t>
+encoded_sizeof_bounded() {
+  size_t total = 0;
+  traits::bound_encode(T(), total);
+  return total;
+}
+
+// Size reported by traits::bound_encode for the given object.
+template <typename T, typename traits=denc_traits<T>>
+std::enable_if_t<traits::supported, size_t>
+encoded_sizeof(const T &t) {
+  size_t total = 0;
+  traits::bound_encode(t, total);
+  return total;
+}
+
+} // namespace ceph
+
+
+// ----------------------------------------------------------------------
+// encode/decode wrappers
+
+// These glue the new-style denc world into old-style calls to encode
+// and decode by calling into denc_traits<> methods (when present).
+
+namespace ceph {
+// Old-style encode() entry point for non-featured denc types: ask the
+// traits for a size estimate, obtain a contiguous appender of that size
+// from the bufferlist, and encode into it. The feature argument is ignored.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> encode(
+  const T& o,
+  ceph::buffer::list& bl,
+  uint64_t features_unused=0)
+{
+  size_t estimate = 0;
+  traits::bound_encode(o, estimate);
+  auto appender = bl.get_contiguous_appender(estimate);
+  traits::encode(o, appender);
+}
+
+// Old-style encode() entry point for featured denc types: same as the
+// non-featured variant, but the (mandatory) feature bits are passed to both
+// bound_encode and encode.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::featured> encode(
+  const T& o, ::ceph::buffer::list& bl,
+  uint64_t features)
+{
+  size_t estimate = 0;
+  traits::bound_encode(o, estimate, features);
+  auto appender = bl.get_contiguous_appender(estimate);
+  traits::encode(o, appender, features);
+}
+
+// Old-style decode() entry point for denc types that can decode from a
+// non-contiguous bufferlist. Throws end_of_buffer if the iterator is
+// already at the end.
+template<typename T,
+         typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::need_contiguous> decode(
+  T& o,
+  ::ceph::buffer::list::const_iterator& p)
+{
+  if (p.end()) {
+    throw ::ceph::buffer::end_of_buffer();
+  }
+  const auto& bl = p.get_bl();
+  const auto remaining = bl.length() - p.get_off();
+  // it is expensive to rebuild a contiguous buffer and drop it, so avoid
+  // this: when we are not in the last raw buffer and more than a page
+  // remains, decode directly from the list iterator.
+  const bool use_list_iterator =
+    !p.is_pointing_same_raw(bl.back()) && remaining > CEPH_PAGE_SIZE;
+  if (use_list_iterator) {
+    traits::decode(o, p);
+    return;
+  }
+  // ensure we get a contiguous buffer... until the end of the
+  // ceph::buffer::list. we don't really know how much we'll need here,
+  // unfortunately. hopefully it is already contiguous and we're just
+  // bumping the raw ref and initializing the ptr tmp fields.
+  ceph::buffer::ptr contiguous;
+  auto scratch = p;
+  scratch.copy_shallow(remaining, contiguous);
+  auto cursor = std::cbegin(contiguous);
+  traits::decode(o, cursor);
+  // advance the caller's iterator by however much the decode consumed
+  p += cursor.get_offset();
+}
+
+// Old-style decode() entry point for denc types that can only decode from a
+// contiguous buffer. Throws end_of_buffer if the iterator is already at the
+// end.
+template<typename T,
+         typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::need_contiguous> decode(
+  T& o,
+  ceph::buffer::list::const_iterator& p)
+{
+  if (p.end()) {
+    throw ceph::buffer::end_of_buffer();
+  }
+  // ensure we get a contiguous buffer... until the end of the
+  // ceph::buffer::list. we don't really know how much we'll need here,
+  // unfortunately. hopefully it is already contiguous and we're just
+  // bumping the raw ref and initializing the ptr tmp fields.
+  const auto remaining = p.get_bl().length() - p.get_off();
+  ceph::buffer::ptr contiguous;
+  auto scratch = p;
+  scratch.copy_shallow(remaining, contiguous);
+  auto cursor = std::cbegin(contiguous);
+  traits::decode(o, cursor);
+  // advance the caller's iterator by however much the decode consumed
+  p += cursor.get_offset();
+}
+
+// nohead variants
+// Header-less encode for non-featured denc types. The appender is sized
+// with the regular bound_encode estimate.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported &&
+                        !traits::featured> encode_nohead(
+  const T& o,
+  ceph::buffer::list& bl)
+{
+  size_t estimate = 0;
+  traits::bound_encode(o, estimate);
+  auto appender = bl.get_contiguous_appender(estimate);
+  traits::encode_nohead(o, appender);
+}
+
+// Header-less decode of `num` elements for non-featured denc types.
+// Does nothing when num is zero; throws end_of_buffer if elements are
+// expected but the iterator is already at the end.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> decode_nohead(
+  size_t num,
+  T& o,
+  ceph::buffer::list::const_iterator& p)
+{
+  if (num == 0) {
+    return;
+  }
+  if (p.end()) {
+    throw ceph::buffer::end_of_buffer();
+  }
+  if constexpr (!traits::need_contiguous) {
+    traits::decode_nohead(num, o, p);
+  } else {
+    using elem_type = typename T::value_type;
+    using elem_traits = denc_traits<elem_type>;
+    ceph::buffer::ptr contiguous;
+    auto scratch = p;
+    if constexpr (elem_traits::bounded) {
+      // bounded elements: size one default-constructed element and map
+      // only num * that many bytes
+      size_t element_size = 0;
+      elem_type sample;
+      elem_traits::bound_encode(sample, element_size);
+      scratch.copy_shallow(num * element_size, contiguous);
+    } else {
+      // unbounded elements: map everything up to the end of the list
+      scratch.copy_shallow(p.get_bl().length() - p.get_off(), contiguous);
+    }
+    auto cursor = std::cbegin(contiguous);
+    traits::decode_nohead(num, o, cursor);
+    p += cursor.get_offset();
+  }
+}
+}
+
+
+// ----------------------------------------------------------------
+// DENC
+
+// These are some class methods we need to do the version and length
+// wrappers for DENC_{START,FINISH} for inter-version
+// interoperability.
+
+// DENC_HELPERS injects three overload pairs of _denc_start/_denc_finish,
+// selected by the type of `p`:
+//  - size_t& (bound_encode): start adds 2 + 4 bytes for the two version
+//    bytes and the 32-bit length; finish does nothing.
+//  - contiguous_appender& (encode): start writes struct_v and
+//    struct_compat, reserves 4 bytes for the length and remembers where
+//    they are plus the current out-of-band offset; finish back-patches the
+//    length as (bytes appended since the reserved slot) + (out-of-band
+//    bytes added since start).
+//  - ptr::const_iterator& (decode): start reads struct_v, struct_compat and
+//    the length and records the payload start; finish throws
+//    malformed_input if decoding ran past the recorded end, and otherwise
+//    skips any bytes that were not consumed.
+// The last two pointer arguments are scratch storage whose meaning differs
+// per overload (see the parameter names below).
+#define DENC_HELPERS \
+ /* bound_encode */ \
+ static void _denc_start(size_t& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **, uint32_t *) { \
+ p += 2 + 4; \
+ } \
+ static void _denc_finish(size_t& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **, uint32_t *) { } \
+ /* encode */ \
+ static void _denc_start(::ceph::buffer::list::contiguous_appender& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **len_pos, \
+ uint32_t *start_oob_off) { \
+ denc(*struct_v, p); \
+ denc(*struct_compat, p); \
+ *len_pos = p.get_pos_add(4); \
+ *start_oob_off = p.get_out_of_band_offset(); \
+ } \
+ static void _denc_finish(::ceph::buffer::list::contiguous_appender& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **len_pos, \
+ uint32_t *start_oob_off) { \
+ *(ceph_le32*)*len_pos = p.get_pos() - *len_pos - sizeof(uint32_t) + \
+ p.get_out_of_band_offset() - *start_oob_off; \
+ } \
+ /* decode */ \
+ static void _denc_start(::ceph::buffer::ptr::const_iterator& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **start_pos, \
+ uint32_t *struct_len) { \
+ denc(*struct_v, p); \
+ denc(*struct_compat, p); \
+ denc(*struct_len, p); \
+ *start_pos = const_cast<char*>(p.get_pos()); \
+ } \
+ static void _denc_finish(::ceph::buffer::ptr::const_iterator& p, \
+ __u8 *struct_v, __u8 *struct_compat, \
+ char **start_pos, \
+ uint32_t *struct_len) { \
+ const char *pos = p.get_pos(); \
+ char *end = *start_pos + *struct_len; \
+ if (pos > end) { \
+ throw ::ceph::buffer::malformed_input(__PRETTY_FUNCTION__); \
+ } \
+ if (pos < end) { \
+ p += end - pos; \
+ } \
+ }
+
+// Helpers for versioning the encoding. These correspond to the
+// {ENCODE,DECODE}_{START,FINISH} macros.
+
+// DENC_START declares the locals struct_v / struct_compat (initialized from
+// `v` and `compat`) plus two scratch variables, calls the _denc_start
+// overload matching `p`, and opens a `do {` block. DENC_FINISH closes that
+// block with `} while (false);` and calls the matching _denc_finish, so the
+// two macros must always be used as a pair within one function body.
+#define DENC_START(v, compat, p) \
+ __u8 struct_v = v; \
+ __u8 struct_compat = compat; \
+ char *_denc_pchar; \
+ uint32_t _denc_u32; \
+ _denc_start(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); \
+ do {
+
+#define DENC_FINISH(p) \
+ } while (false); \
+ _denc_finish(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32);
+
+
+// ----------------------------------------------------------------------
+
+// Helpers for writing a unified bound_encode/encode/decode
+// implementation that won't screw up buffer size estimations.
+
+// DENC(Type, v, p): for use inside the definition of class Type. Expands to
+// the DENC_HELPERS, member functions bound_encode/encode/decode that all
+// forward to a single friend template _denc_friend(v, p), and finally the
+// declarator of that friend -- the macro ends without a body, so the code
+// block written right after the macro invocation becomes the shared
+// implementation. `v` is the object (const for bound_encode/encode) and `p`
+// is a size_t, a contiguous_appender or a ptr::const_iterator.
+#define DENC(Type, v, p) \
+ DENC_HELPERS \
+ void bound_encode(size_t& p) const { \
+ _denc_friend(*this, p); \
+ } \
+ void encode(::ceph::buffer::list::contiguous_appender& p) const { \
+ DENC_DUMP_PRE(Type); \
+ _denc_friend(*this, p); \
+ } \
+ void decode(::ceph::buffer::ptr::const_iterator& p) { \
+ _denc_friend(*this, p); \
+ } \
+ template<typename T, typename P> \
+ friend std::enable_if_t<std::is_same_v<T, Type> || \
+ std::is_same_v<T, const Type>> \
+ _denc_friend(T& v, P& p)
+
+// DENC_FEATURED(Type, v, p, f): same scheme as DENC, but every generated
+// member and the shared _denc_friend(v, p, f) additionally carry the
+// feature bits `f`. `f` is mandatory for bound_encode/encode and defaults
+// to 0 for decode. As with DENC, the macro ends on the friend's declarator
+// and the user-supplied block that follows is its body.
+#define DENC_FEATURED(Type, v, p, f) \
+ DENC_HELPERS \
+ void bound_encode(size_t& p, uint64_t f) const { \
+ _denc_friend(*this, p, f); \
+ } \
+ void encode(::ceph::buffer::list::contiguous_appender& p, uint64_t f) const { \
+ DENC_DUMP_PRE(Type); \
+ _denc_friend(*this, p, f); \
+ } \
+ void decode(::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ _denc_friend(*this, p, f); \
+ } \
+ template<typename T, typename P> \
+ friend std::enable_if_t<std::is_same_v<T, Type> || \
+ std::is_same_v<T, const Type>> \
+ _denc_friend(T& v, P& p, uint64_t f)
+
+#endif
diff --git a/src/include/dlfcn_compat.h b/src/include/dlfcn_compat.h
new file mode 100644
index 000000000..95fd64e51
--- /dev/null
+++ b/src/include/dlfcn_compat.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef DLFCN_COMPAT_H
+#define DLFCN_COMPAT_H
+
+#include "acconfig.h"
+
+#define SHARED_LIB_SUFFIX CMAKE_SHARED_LIBRARY_SUFFIX
+
+// Platform split: on Windows this header declares its own dlopen-style API
+// and the RTLD_* constants; everywhere else it simply pulls in <dlfcn.h>.
+// dl_errmsg_t is the type used for dlerror() results on each platform:
+// std::string on Windows, char* otherwise.
+#ifdef _WIN32
+ #include <string>
+
+ using dl_errmsg_t = std::string;
+
+ // The load mode flags will be ignored on Windows. We keep the same
+ // values for debugging purposes though.
+ #define RTLD_LAZY 0x00001
+ #define RTLD_NOW 0x00002
+ #define RTLD_BINDING_MASK 0x3
+ #define RTLD_NOLOAD 0x00004
+ #define RTLD_DEEPBIND 0x00008
+ #define RTLD_GLOBAL 0x00100
+ #define RTLD_LOCAL 0
+ #define RTLD_NODELETE 0x01000
+
+ // Declarations only; the definitions are not part of this header.
+ void* dlopen(const char *filename, int flags);
+ int dlclose(void* handle);
+ dl_errmsg_t dlerror();
+ void* dlsym(void* handle, const char* symbol);
+#else
+ #include <dlfcn.h>
+
+ using dl_errmsg_t = char*;
+#endif /* _WIN32 */
+
+#endif /* DLFCN_COMPAT_H */
diff --git a/src/include/elist.h b/src/include/elist.h
new file mode 100644
index 000000000..38be35dbf
--- /dev/null
+++ b/src/include/elist.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ELIST_H
+#define CEPH_ELIST_H
+
+/*
+ * elist: embedded list.
+ *
+ * requirements:
+ * - elist<T>::item be embedded in the parent class
+ * - items are _always_ added to the list via the same elist<T>::item at the same
+ * fixed offset in the class.
+ * - begin(), front(), back() methods take the member offset as an argument for traversal.
+ *
+ */
+
+// Byte offset of `member` inside `cls`, computed by taking the member's
+// address through a fake object pointer at address 1 and subtracting 1
+// again. The result is what elist<T>::item::get_item() subtracts from an
+// item's address.
+#define member_offset(cls, member) ((size_t)(&((cls*)1)->member) - 1)
+
+// Embedded (intrusive) circular doubly-linked list.
+//
+// The links live in an elist<T>::item that is a member of the listed
+// object; the list itself owns only a sentinel item (_head) and the byte
+// offset of the item member inside the object. T is the handle type that
+// get_item() casts the recovered address to.
+//
+// Neither items nor lists are copyable. Both assert on destruction that
+// they are no longer linked / no longer contain anything.
+template<typename T>
+class elist {
+public:
+  struct item {
+    item *_prev, *_next;
+
+    // A fresh item links to itself, which is the "not on any list" state.
+    // The argument is unused.
+    item(T i=0) : _prev(this), _next(this) {}
+    ~item() {
+      ceph_assert(!is_on_list());
+    }
+
+    item(const item& other) = delete;
+    const item& operator= (const item& right) = delete;
+
+
+    bool empty() const { return _prev == this; }
+    bool is_on_list() const { return !empty(); }
+
+    // Unlink this item from whatever list it is on.
+    // Returns false if it was not linked, true if it was removed.
+    bool remove_myself() {
+      if (_next == this) {
+        ceph_assert(_prev == this);
+        return false;
+      }
+      _next->_prev = _prev;
+      _prev->_next = _next;
+      _prev = _next = this;
+      return true;
+    }
+
+    // Link `other` (which must be unlinked) directly after this item.
+    void insert_after(item *other) {
+      ceph_assert(other->empty());
+      other->_prev = this;
+      other->_next = _next;
+      _next->_prev = other;
+      _next = other;
+    }
+    // Link `other` (which must be unlinked) directly before this item.
+    void insert_before(item *other) {
+      ceph_assert(other->empty());
+      other->_next = this;
+      other->_prev = _prev;
+      _prev->_next = other;
+      _prev = other;
+    }
+
+    // Recover the enclosing object: step back `offset` bytes from this
+    // item's address and cast to T. `offset` must be non-zero.
+    T get_item(size_t offset) {
+      ceph_assert(offset);
+      return (T)(((char *)this) - offset);
+    }
+  };
+
+private:
+  item _head;          // sentinel; _head._next is the front, _head._prev the back
+  size_t item_offset;  // default member offset used when callers pass 0
+
+public:
+  // Lists are not copyable: copying the sentinel would leave the linked
+  // items pointing at the original head. Deleted explicitly (as item
+  // already does) so misuse is reported at compile time instead of as an
+  // unresolved symbol at link time.
+  elist(const elist& other) = delete;
+  const elist& operator=(const elist& other) = delete;
+
+  elist(size_t o) : _head(NULL), item_offset(o) {}
+  ~elist() {
+    ceph_assert(_head.empty());
+  }
+
+  bool empty() const {
+    return _head.empty();
+  }
+
+  // Unlink every item. The items themselves are not destroyed.
+  void clear() {
+    while (!_head.empty())
+      pop_front();
+  }
+
+  // Move `i` to the front of this list, unlinking it first if needed.
+  void push_front(item *i) {
+    if (!i->empty())
+      i->remove_myself();
+    _head.insert_after(i);
+  }
+  // Move `i` to the back of this list, unlinking it first if needed.
+  void push_back(item *i) {
+    if (!i->empty())
+      i->remove_myself();
+    _head.insert_before(i);
+  }
+
+  // First / last element; the list must not be empty. A non-zero `o`
+  // overrides the list's default member offset.
+  T front(size_t o=0) {
+    ceph_assert(!_head.empty());
+    return _head._next->get_item(o ? o : item_offset);
+  }
+  T back(size_t o=0) {
+    ceph_assert(!_head.empty());
+    return _head._prev->get_item(o ? o : item_offset);
+  }
+
+  void pop_front() {
+    ceph_assert(!empty());
+    _head._next->remove_myself();
+  }
+  void pop_back() {
+    ceph_assert(!empty());
+    _head._prev->remove_myself();
+  }
+
+  // Same effect as clear().
+  void clear_list() {
+    while (!empty())
+      pop_front();
+  }
+
+  // How iterator::operator++ picks the next position:
+  //   MAGIC      - use the cached next pointer if the current item has been
+  //                unlinked in the meantime, otherwise follow cur->_next
+  //   CURRENT    - always follow cur->_next
+  //   CACHE_NEXT - always use the next pointer cached before the step
+  enum mode_t {
+    MAGIC, CURRENT, CACHE_NEXT
+  };
+
+  class iterator {
+  private:
+    item *head;
+    item *cur, *next;
+    size_t item_offset;
+    mode_t mode;
+  public:
+    // Starts at the first element after the sentinel `h`; `o` must be a
+    // non-zero member offset.
+    iterator(item *h, size_t o, mode_t m) :
+      head(h), cur(h->_next), next(cur->_next), item_offset(o),
+      mode(m) {
+      ceph_assert(item_offset > 0);
+    }
+    T operator*() {
+      return cur->get_item(item_offset);
+    }
+    iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur != head);
+      if (mode == MAGIC) {
+        // if 'cur' appears to be valid, use that.  otherwise,
+        // use cached 'next'.
+        // this is a bit magic, and probably a bad idea... :/
+        if (cur->empty())
+          cur = next;
+        else
+          cur = cur->_next;
+      } else if (mode == CURRENT)
+        cur = cur->_next;
+      else if (mode == CACHE_NEXT)
+        cur = next;
+      else
+        ceph_abort();
+      next = cur->_next;
+      return *this;
+    }
+    // True once the iterator has wrapped around to the sentinel.
+    bool end() const {
+      return cur == head;
+    }
+  };
+
+  // Iterator factories, one per stepping mode. A non-zero `o` overrides
+  // the list's default member offset.
+  iterator begin(size_t o=0) {
+    return iterator(&_head, o ? o : item_offset, MAGIC);
+  }
+  iterator begin_use_current(size_t o=0) {
+    return iterator(&_head, o ? o : item_offset, CURRENT);
+  }
+  iterator begin_cache_next(size_t o=0) {
+    return iterator(&_head, o ? o : item_offset, CACHE_NEXT);
+  }
+};
+
+
+#endif
diff --git a/src/include/encoding.h b/src/include/encoding.h
new file mode 100644
index 000000000..49f2f77be
--- /dev/null
+++ b/src/include/encoding.h
@@ -0,0 +1,1531 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_ENCODING_H
+#define CEPH_ENCODING_H
+
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <map>
+#include <optional>
+#include <set>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <vector>
+#include <boost/container/small_vector.hpp>
+#include <boost/optional/optional_io.hpp>
+#include <boost/tuple/tuple.hpp>
+
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+#include "common/ceph_time.h"
+
+#include "include/int_types.h"
+
+#include "common/convenience.h"
+
+#include "byteorder.h"
+#include "buffer.h"
+
+// pull in the new-style encoding so that we get the denc_traits<> definition.
+#include "denc.h"
+
+#include "assert.h"
+
+using namespace ceph;
+
+namespace ceph {
+
+/*
+ * Notes on feature encoding:
+ *
+ * - The default encode() methods have a features argument with a default parameter
+ * (which goes to zero).
+ * - Normal classes will use WRITE_CLASS_ENCODER, with that features=0 default.
+ * - Classes that _require_ features will use WRITE_CLASS_ENCODER_FEATURES, which
+ * does not define the default. Any caller must explicitly pass it in.
+ * - STL container macros have two encode variants: one with a features arg, and one
+ * without.
+ *
+ * The result:
+ * - A feature encode() method will fail to compile if a value is not
+ * passed in.
+ * - The feature variant of the STL templates will be used when the feature arg is
+ * provided. It will be passed through to any template arg types, but it will be
+ * ignored when not needed.
+ */
+
+// --------------------------------------
+// base types
+
+// Append the in-memory bytes of `t` to the bufferlist verbatim.
+template<class T>
+inline void encode_raw(const T& t, bufferlist& bl)
+{
+  const char *bytes = reinterpret_cast<const char*>(&t);
+  bl.append(bytes, sizeof(T));
+}
+// Copy sizeof(T) bytes from the iterator straight into the storage of `t`.
+template<class T>
+inline void decode_raw(T& t, bufferlist::const_iterator &p)
+{
+  char *dest = reinterpret_cast<char*>(&t);
+  p.copy(sizeof(T), dest);
+}
+
+// WRITE_RAW_ENCODER(type) defines encode()/decode() overloads for `type`
+// that copy the value's bytes as-is via encode_raw/decode_raw. The encode
+// overload accepts a `features` argument and ignores it. Instantiated below
+// for single-byte types and for the ceph_le* wrapper types.
+#define WRITE_RAW_ENCODER(type) \
+ inline void encode(const type &v, ::ceph::bufferlist& bl, uint64_t features=0) { ::ceph::encode_raw(v, bl); } \
+ inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { ::ceph::decode_raw(v, p); }
+
+WRITE_RAW_ENCODER(__u8)
+#ifndef _CHAR_IS_SIGNED
+WRITE_RAW_ENCODER(__s8)
+#endif
+WRITE_RAW_ENCODER(char)
+WRITE_RAW_ENCODER(ceph_le64)
+WRITE_RAW_ENCODER(ceph_le32)
+WRITE_RAW_ENCODER(ceph_le16)
+
+// A bool is encoded as one byte holding 0 or 1.
+inline void encode(const bool &v, bufferlist& bl) {
+  const __u8 as_byte = v ? 1 : 0;
+  encode_raw(as_byte, bl);
+}
+// Reads one byte; any non-zero value decodes as true.
+inline void decode(bool &v, bufferlist::const_iterator& p) {
+  __u8 as_byte;
+  decode_raw(as_byte, p);
+  v = (as_byte != 0);
+}
+
+
+// -----------------------------------
+// int types
+
+// WRITE_INTTYPE_ENCODER(type, etype) defines encode()/decode() overloads
+// for the integer `type` that go through the ceph_<etype> wrapper: encode
+// assigns the value to a ceph_<etype> and raw-encodes that, decode
+// raw-decodes a ceph_<etype> and assigns it back. The encode overload
+// accepts a `features` argument and ignores it.
+#define WRITE_INTTYPE_ENCODER(type, etype) \
+ inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+ ceph_##etype e; \
+ e = v; \
+ ::ceph::encode_raw(e, bl); \
+ } \
+ inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \
+ ceph_##etype e; \
+ ::ceph::decode_raw(e, p); \
+ v = e; \
+ }
+
+WRITE_INTTYPE_ENCODER(uint64_t, le64)
+WRITE_INTTYPE_ENCODER(int64_t, le64)
+WRITE_INTTYPE_ENCODER(uint32_t, le32)
+WRITE_INTTYPE_ENCODER(int32_t, le32)
+WRITE_INTTYPE_ENCODER(uint16_t, le16)
+WRITE_INTTYPE_ENCODER(int16_t, le16)
+
+// -----------------------------------
+// float types
+//
+// NOTE: The following code assumes all supported platforms use IEEE binary32
+// as float and IEEE binary64 as double floating-point format. The assumption
+// is verified by the assertions below.
+//
+// Under this assumption, we can use raw encoding of floating-point types
+// on little-endian machines, but we still need to perform a byte swap
+// on big-endian machines to ensure cross-architecture compatibility.
+// To achieve that, we reinterpret the values as integers first, which are
+// byte-swapped via the ceph_le types as above. The extra conversions
+// are optimized away on little-endian machines by the compiler.
+// WRITE_FLTTYPE_ENCODER(type, itype, etype) defines encode()/decode() for
+// the floating-point `type` by moving its bit pattern through the
+// same-sized unsigned integer `itype` and the ceph_<etype> wrapper.
+// The bit pattern is transferred with std::memcpy rather than by
+// dereferencing a reinterpret_cast'ed pointer, which would access a
+// float/double object through an integer lvalue (a strict-aliasing
+// violation). The static_asserts guarantee both types have the same size,
+// so the copies are exact. The encode overload accepts a `features`
+// argument and ignores it.
+#define WRITE_FLTTYPE_ENCODER(type, itype, etype) \
+  static_assert(sizeof(type) == sizeof(itype)); \
+  static_assert(std::numeric_limits<type>::is_iec559, \
+                "floating-point type not using IEEE754 format"); \
+  inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+    itype bits; \
+    std::memcpy(&bits, &v, sizeof(bits)); \
+    ceph_##etype e; \
+    e = bits; \
+    ::ceph::encode_raw(e, bl); \
+  } \
+  inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \
+    ceph_##etype e; \
+    ::ceph::decode_raw(e, p); \
+    const itype bits = e; \
+    std::memcpy(&v, &bits, sizeof(v)); \
+  }
+
+WRITE_FLTTYPE_ENCODER(float, uint32_t, le32)
+WRITE_FLTTYPE_ENCODER(double, uint64_t, le64)
+
+// see denc.h for ENCODE_DUMP_PATH discussion and definition.
+// Optional debugging aid, compiled in only when ENCODE_DUMP_PATH is defined;
+// otherwise both macros expand to nothing.
+//  - ENCODE_DUMP_PRE() remembers the bufferlist length before encoding.
+//  - ENCODE_DUMP_POST(cl) writes the bytes appended since then to the file
+//    <ENCODE_DUMP_PATH>/<cl>__<pid>.<counter in hex>.
+// Each expansion keeps its own static call counter and skips the dump when
+// the counter has more than two bits set. Note that the counter is
+// incremented twice on a call that dumps: once on entry and once more in
+// the snprintf() argument list.
+// Both macros expect a bufferlist named `bl` in the enclosing scope.
+#ifdef ENCODE_DUMP_PATH
+# define ENCODE_DUMP_PRE() \
+ unsigned pre_off = bl.length()
+# define ENCODE_DUMP_POST(cl) \
+ do { \
+ static int i = 0; \
+ i++; \
+ int bits = 0; \
+ for (unsigned t = i; t; bits++) \
+ t &= t - 1; \
+ if (bits > 2) \
+ break; \
+ char fn[PATH_MAX]; \
+ snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #cl, getpid(), i++); \
+ int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC|O_BINARY, 0644); \
+ if (fd >= 0) { \
+ ::ceph::bufferlist sub; \
+ sub.substr_of(bl, pre_off, bl.length() - pre_off); \
+ sub.write_fd(fd); \
+ ::close(fd); \
+ } \
+ } while (0)
+#else
+# define ENCODE_DUMP_PRE()
+# define ENCODE_DUMP_POST(cl)
+#endif
+
+
+// Glue macros that define encode()/decode() overloads for class `cl` by
+// forwarding to its member functions. Every encode is bracketed by
+// ENCODE_DUMP_PRE/ENCODE_DUMP_POST.
+//  - WRITE_CLASS_ENCODER: `features` is optional and not passed on; calls
+//    c.encode(bl).
+//  - WRITE_CLASS_MEMBER_ENCODER: same forwarding, but the encode overload
+//    has no features argument and is const-qualified, so it is meant to be
+//    expanded inside a class body.
+//  - WRITE_CLASS_ENCODER_FEATURES: `features` is mandatory and is passed to
+//    c.encode(bl, features).
+//  - WRITE_CLASS_ENCODER_OPTIONAL_FEATURES: as above, but `features`
+//    defaults to 0.
+// decode always forwards to c.decode(p).
+#define WRITE_CLASS_ENCODER(cl) \
+ inline void encode(const cl& c, ::ceph::buffer::list &bl, uint64_t features=0) { \
+ ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+#define WRITE_CLASS_MEMBER_ENCODER(cl) \
+ inline void encode(const cl &c, ::ceph::bufferlist &bl) const { \
+ ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+#define WRITE_CLASS_ENCODER_FEATURES(cl) \
+ inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features) { \
+ ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+#define WRITE_CLASS_ENCODER_OPTIONAL_FEATURES(cl) \
+ inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features = 0) { \
+ ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \
+ inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+
+// string
+// Strings are encoded as a 32-bit length followed by the raw characters
+// (no terminator). The features argument is ignored.
+inline void encode(std::string_view s, bufferlist& bl, uint64_t features=0)
+{
+  const __u32 nbytes = s.length();
+  encode(nbytes, bl);
+  if (nbytes == 0) {
+    return;
+  }
+  bl.append(s.data(), nbytes);
+}
+// std::string is encoded exactly like the equivalent string_view.
+inline void encode(const std::string& s, bufferlist& bl, uint64_t features=0)
+{
+  const std::string_view view(s);
+  encode(view, bl, features);
+}
+// Reads the 32-bit length, then replaces `s` with that many bytes taken
+// from the iterator.
+inline void decode(std::string& s, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+  s.clear();
+  p.copy(nbytes, s);
+}
+
+// Append the characters of `s` with no length prefix.
+inline void encode_nohead(std::string_view s, bufferlist& bl)
+{
+  const auto nbytes = s.length();
+  bl.append(s.data(), nbytes);
+}
+// std::string overload: delegates to the string_view variant.
+inline void encode_nohead(const std::string& s, bufferlist& bl)
+{
+  const std::string_view view(s);
+  encode_nohead(view, bl);
+}
+// Replace `s` with `len` bytes copied from the iterator; the length is
+// supplied by the caller rather than read from the stream.
+inline void decode_nohead(int len, std::string& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+
+  p.copy(len, s);
+}
+
+// const char* (encode only, string compatible)
+// Encode a NUL-terminated C string using the same wire format as std::string.
+// `s` must not be null: its length is measured with strlen().
+inline void encode(const char *s, bufferlist& bl)
+{
+  const std::string_view view(s, strlen(s));
+  encode(view, bl);
+}
+
+
+// -----------------------------
+// buffers
+
+// bufferptr (encapsulated)
+// Write a __u32 length followed by the buffer contents; nothing is appended
+// after the length when the buffer is empty.
+inline void encode(const buffer::ptr& bp, bufferlist& bl)
+{
+  __u32 nbytes = bp.length();
+  encode(nbytes, bl);
+  if (nbytes != 0) {
+    bl.append(bp);
+  }
+}
+// Read a __u32 length and that many bytes. When the bytes arrive as a single
+// buffer that buffer is assigned to `bp`; otherwise they are copied into a
+// new buffer. A zero length leaves `bp` as it was.
+inline void decode(buffer::ptr& bp, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+
+  bufferlist payload;
+  p.copy(nbytes, payload);
+
+  if (nbytes == 0) {
+    return;
+  }
+  if (payload.get_num_buffers() == 1) {
+    bp = payload.front();
+  } else {
+    bp = buffer::copy(payload.c_str(), payload.length());
+  }
+}
+
+// bufferlist (encapsulated)
+// Nest one bufferlist inside another: __u32 length, then the contents of `s`.
+inline void encode(const bufferlist& s, bufferlist& bl)
+{
+  __u32 nbytes = s.length();
+  encode(nbytes, bl);
+
+  bl.append(s);
+}
+// Same wire format as encode(const bufferlist&), but hands the contents of
+// `s` to `bl` through claim_append() instead of append().
+inline void encode_destructively(bufferlist& s, bufferlist& bl)
+{
+  __u32 nbytes = s.length();
+  encode(nbytes, bl);
+
+  bl.claim_append(s);
+}
+// Read a __u32 length, then replace the contents of `s` with that many bytes
+// from the iterator.
+inline void decode(bufferlist& s, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+
+  s.clear();
+  p.copy(nbytes, s);
+}
+
+// Append the contents of `s` to `bl` without a length prefix.
+inline void encode_nohead(const bufferlist& s, bufferlist& bl)
+{
+  bl.append(s);
+}
+// Replace `s` with `len` bytes taken from the iterator; the caller supplies
+// the length.
+inline void decode_nohead(int len, bufferlist& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+
+  p.copy(len, s);
+}
+
+// Time, since the templates are defined in std::chrono
+
+// Encode a time_point of a timespec-convertible clock as two uint32_t values:
+// seconds, then nanoseconds. Both timespec fields are narrowed to 32 bits.
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void encode(const std::chrono::time_point<Clock, Duration>& t,
+            ceph::bufferlist &bl) {
+  const auto spec = Clock::to_timespec(t);
+  // A 32 bit count of seconds causes me vast unhappiness.
+  uint32_t secs = static_cast<uint32_t>(spec.tv_sec);
+  uint32_t nsecs = static_cast<uint32_t>(spec.tv_nsec);
+  encode(secs, bl);
+  encode(nsecs, bl);
+}
+
+// Read seconds and nanoseconds (two uint32_t values), build a timespec from
+// them and convert it with Clock::from_timespec().
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void decode(std::chrono::time_point<Clock, Duration>& t,
+            bufferlist::const_iterator& p) {
+  uint32_t secs;
+  uint32_t nsecs;
+  decode(secs, p);
+  decode(nsecs, p);
+
+  struct timespec spec = {
+    static_cast<time_t>(secs),
+    static_cast<long int>(nsecs)};
+  t = Clock::from_timespec(spec);
+}
+
+// Encode an integral duration as two int32_t values: whole seconds, then the
+// sub-second remainder in nanoseconds.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void encode(const std::chrono::duration<Rep, Period>& d,
+            ceph::bufferlist &bl) {
+  namespace chr = std::chrono;
+  int32_t secs = chr::duration_cast<chr::seconds>(d).count();
+  int32_t nsecs =
+    (chr::duration_cast<chr::nanoseconds>(d) % chr::seconds(1)).count();
+  encode(secs, bl);
+  encode(nsecs, bl);
+}
+
+// Decode an integral duration written by the matching encode(): an int32_t
+// count of seconds followed by an int32_t count of nanoseconds.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void decode(std::chrono::duration<Rep, Period>& d,
+            bufferlist::const_iterator& p) {
+  int32_t s;
+  int32_t ns;
+  decode(s, p);
+  decode(ns, p);
+  // The sum is in nanoseconds. An implicit conversion to a coarser integral
+  // duration (e.g. std::chrono::seconds) is ill-formed, so convert explicitly;
+  // for targets the implicit conversion already accepted the value is the same.
+  d = std::chrono::duration_cast<std::chrono::duration<Rep, Period>>(
+        std::chrono::seconds(s) + std::chrono::nanoseconds(ns));
+}
+
+// -----------------------------
+// STL container types
+
+// Forward declarations for boost::optional, std::optional and three-element
+// boost::tuple; the definitions appear further down in this header.
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl);
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp);
+template<typename T>
+inline void encode(const std::optional<T> &p, bufferlist &bl);
+template<typename T>
+inline void decode(std::optional<T> &p, bufferlist::const_iterator &bp);
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl);
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp);
+// std::pair: these overloads are enabled only when at least one of A and B
+// lacks denc support (denc_traits<>::supported is false). The defaulted
+// traits parameters are supplied here, not on the definitions.
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+ !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+ !b_traits::supported>
+decode(std::pair<A,B> &pa, bufferlist::const_iterator &p);
+// std::list<T> (only when T lacks denc support) and std::list<shared_ptr<T>>.
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T, Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist& bl);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist::const_iterator& p);
+// std::set<T>, enabled only when T lacks denc support. These declarations
+// carry the defaulted `traits` argument, so each signature has to match its
+// definition exactly; decode_nohead therefore takes a const_iterator, as the
+// definition does.
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+// boost::container::flat_set<T>, enabled only when T lacks denc support.
+// NOTE(review): decode_nohead takes bufferlist::iterator&, whereas the other
+// decoders declared here take bufferlist::const_iterator& -- confirm intended.
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+ bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+ bufferlist::iterator& p);
+// std::multiset<T>: declared unconditionally (no denc_traits gate).
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+// std::vector<T> (only when T lacks denc support) and
+// std::vector<shared_ptr<T>>.
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl,
+ uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist::const_iterator& p);
+// small_vector
+// boost::container::small_vector<T,N>: same five entry points as std::vector,
+// enabled only when T lacks denc support.
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+// std::map
+// Enabled only when the key type or the mapped type lacks denc support;
+// decode_noclear is declared without that gate.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported ||
+ !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+// boost::container::flat_map<T,U>: mirrors the std::map declarations, with the
+// same denc_traits gate on everything except decode_noclear.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+ uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p);
+// std::multimap<T,U>: declared unconditionally (no denc_traits gate).
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+// unordered_map and ceph::unordered_set: declared unconditionally.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+ uint64_t features);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+// std::deque<T>: declared unconditionally, with and without a feature mask.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p);
+// std::array<T,N>, enabled only when T lacks denc support.
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p);
+
+// full bl decoder
+// Decode `o` starting at the beginning of `bl`, then assert that the decoder
+// consumed the buffer all the way to its end.
+template<class T>
+inline void decode(T &o, const bufferlist& bl)
+{
+  auto cursor = bl.begin();
+  decode(o, cursor);
+  ceph_assert(cursor.end());
+}
+
+// boost optional
+// Write a one-byte presence flag, followed by the value when one is held.
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl)
+{
+  __u8 has_value = static_cast<bool>(p);
+  encode(has_value, bl);
+  if (!p) {
+    return;
+  }
+  encode(p.get(), bl);
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+// Read the presence flag; if set, reset `p` to a value-initialized T and
+// decode into it, otherwise clear `p`. -Wuninitialized is suppressed for the
+// extent of this function only.
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp)
+{
+  __u8 has_value;
+  decode(has_value, bp);
+  if (!has_value) {
+    p = boost::none;
+    return;
+  }
+  p = T{};
+  decode(p.get(), bp);
+}
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+// std optional
+// Write a one-byte presence flag, followed by the value when one is held.
+template<typename T>
+inline void encode(const std::optional<T> &p, bufferlist &bl)
+{
+  __u8 has_value = static_cast<bool>(p);
+  encode(has_value, bl);
+  if (!p) {
+    return;
+  }
+  encode(*p, bl);
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+// Read the presence flag; if set, reset `p` to a value-initialized T and
+// decode into it, otherwise clear `p`.
+template<typename T>
+inline void decode(std::optional<T> &p, bufferlist::const_iterator &bp)
+{
+  __u8 present;
+  decode(present, bp);
+  if (present) {
+    p = T{};
+    decode(*p, bp);
+  } else {
+    p = std::nullopt;
+  }
+}
+// Balance the push above and re-enable -Wpragmas, exactly as the
+// boost::optional decoder does; without this both suppressions leak into
+// every line that follows in the including translation unit.
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+// std::tuple
+// Encode each tuple element by handing it to ceph::for_each(); no element
+// count is written.
+template<typename... Ts>
+inline void encode(const std::tuple<Ts...> &t, bufferlist& bl)
+{
+  ceph::for_each(t, [&bl](const auto& element) { encode(element, bl); });
+}
+// Decode each tuple element in place by handing it to ceph::for_each().
+template<typename... Ts>
+inline void decode(std::tuple<Ts...> &t, bufferlist::const_iterator &bp)
+{
+  ceph::for_each(t, [&bp](auto& element) { decode(element, bp); });
+}
+
+//triple boost::tuple
+// Encode the three members of a boost::tuple in index order.
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl)
+{
+  const auto& first = boost::get<0>(t);
+  const auto& second = boost::get<1>(t);
+  const auto& third = boost::get<2>(t);
+  encode(first, bl);
+  encode(second, bl);
+  encode(third, bl);
+}
+// Decode the three members of a boost::tuple in index order.
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp)
+{
+  auto& first = boost::get<0>(t);
+  auto& second = boost::get<1>(t);
+  auto& third = boost::get<2>(t);
+  decode(first, bp);
+  decode(second, bp);
+  decode(third, bp);
+}
+
+// std::pair<A,B>
+// Encode both halves of a pair, passing the feature mask to each.
+template<class A, class B,
+         typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features)
+{
+  const auto& [head, tail] = p;
+  encode(head, bl, features);
+  encode(tail, bl, features);
+}
+// Encode both halves of a pair: first, then second.
+template<class A, class B,
+         typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl)
+{
+  const auto& [head, tail] = p;
+  encode(head, bl);
+  encode(tail, bl);
+}
+// Decode both halves of a pair in place: first, then second.
+template<class A, class B, typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+decode(std::pair<A,B> &pa, bufferlist::const_iterator &p)
+{
+  auto& [head, tail] = pa;
+  decode(head, p);
+  decode(tail, p);
+}
+
+// std::list<T>
+// Write a __u32 element count followed by every element in list order.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T, Alloc>& ls, bufferlist& bl)
+{
+  // c++11 std::list::size() is O(1)
+  __u32 count = static_cast<__u32>(ls.size());
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl);
+  }
+}
+// Feature-aware list encoder. Space for the ceph_le32 element count is
+// reserved first with append_hole(); the elements are counted while they are
+// encoded, and the count is written back into the hole at the end.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+  using count_le_t = ceph_le32;
+  auto hole = bl.append_hole(sizeof(count_le_t));
+  unsigned seen = 0;
+  // we count on our own because of buggy std::list::size() implementation
+  // which doesn't follow the O(1) complexity constraint C++11 has brought.
+  for (auto it = ls.begin(); it != ls.end(); ++it) {
+    ++seen;
+    encode(*it, bl, features);
+  }
+  count_le_t encoded_count;
+  encoded_count = seen;
+  hole.copy_in(sizeof(encoded_count),
+               reinterpret_cast<char*>(&encoded_count));
+}
+
+// Read a __u32 count, empty the list, then append and decode that many
+// default-constructed elements in place.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    ls.emplace_back();
+    decode(ls.back(), p);
+  }
+}
+
+// std::list<std::shared_ptr<T>>
+// Write a __u32 count followed by each pointee. Every pointer is
+// dereferenced without a check, so the list must not hold null pointers.
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+                   bufferlist& bl)
+{
+  // c++11 std::list::size() is O(1)
+  __u32 count = static_cast<__u32>(ls.size());
+  encode(count, bl);
+  for (auto it = ls.begin(); it != ls.end(); ++it) {
+    encode(**it, bl);
+  }
+}
+// Feature-aware variant: __u32 count followed by each pointee. Every pointer
+// is dereferenced without a check, so the list must not hold null pointers.
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+                   bufferlist& bl, uint64_t features)
+{
+  // c++11 std::list::size() is O(1)
+  __u32 count = static_cast<__u32>(ls.size());
+  encode(count, bl);
+  for (auto it = ls.begin(); it != ls.end(); ++it) {
+    encode(**it, bl, features);
+  }
+}
+// Read a __u32 count, empty the list, then decode that many freshly
+// allocated T objects and append the owning pointers.
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+                   bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    auto item = std::make_shared<T>();
+    decode(*item, p);
+    ls.emplace_back(std::move(item));
+  }
+}
+
+// std::set<T>
+// Write a __u32 element count followed by every element in set order.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (const auto& item : s) {
+    encode(item, bl);
+  }
+}
+// Read a __u32 count, empty the set, then decode and insert that many
+// elements.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  while (n--) {
+    T v;
+    decode(v, p);
+    // v is not used again: move it into the set instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// Encode every element in set order without a leading count.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  for (const auto& item : s) {
+    encode(item, bl);
+  }
+}
+// Decode `len` elements and insert them into `s`. The set is not cleared
+// first, so existing elements are kept.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    // v is not used again: move it into the set instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// boost::container::flat_set<T>
+// Write a __u32 element count followed by every element in container order.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// Read a __u32 count, empty the set, reserve room for that many elements,
+// then decode and insert them.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  s.reserve(n);
+  while (n--) {
+    T v;
+    decode(v, p);
+    // v is not used again: move it into the set instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// Encode every element in container order without a leading count.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+              bufferlist& bl)
+{
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// Reserve room for `len` elements, then decode and insert that many. The set
+// is not cleared first.
+// NOTE(review): this overload takes bufferlist::iterator&, unlike the other
+// decoders in this header, which take const_iterator& -- confirm intended.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+              bufferlist::iterator& p)
+{
+  s.reserve(len);
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    // v is not used again: move it into the set instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// multiset
+// Write a __u32 element count followed by every element in multiset order.
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (const auto& item : s) {
+    encode(item, bl);
+  }
+}
+// Read a __u32 count, empty the multiset, then decode and insert that many
+// elements.
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  while (n--) {
+    T v;
+    decode(v, p);
+    // v is not used again: move it into the multiset instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// Write a __u32 element count, then each element with the feature mask.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& item : v) {
+    encode(item, bl, features);
+  }
+}
+// Write a __u32 element count followed by each element in index order.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& item : v) {
+    encode(item, bl);
+  }
+}
+// Read a __u32 count, resize the vector to it, then decode every slot in
+// place.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.resize(count);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    decode(*it, p);
+  }
+}
+
+// Encode each element in index order without a leading count.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  for (const auto& item : v) {
+    encode(item, bl);
+  }
+}
+// Resize the vector to the caller-supplied `len`, then decode every slot in
+// place.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    decode(*it, p);
+  }
+}
+
+// small vector
+// Write a __u32 element count, then each element with the feature mask.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl, features);
+  }
+}
+// Write a __u32 element count followed by each element in index order.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// Read a __u32 count, resize the container to it, then decode every slot in
+// place.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.resize(count);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    decode(*it, p);
+  }
+}
+
+// Encode each element in index order without a leading count.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// Resize the container to the caller-supplied `len`, then decode every slot
+// in place.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    decode(*it, p);
+  }
+}
+
+
+// vector (shared_ptr)
+// Write a __u32 count followed by each pointee, passing the feature mask.
+// A null pointer is encoded as a default-constructed T.
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+                   bufferlist& bl,
+                   uint64_t features)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (!*it) {
+      encode(T(), bl, features);
+      continue;
+    }
+    encode(**it, bl, features);
+  }
+}
+// Write a __u32 count followed by each pointee. A null pointer is encoded as
+// a default-constructed T.
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+                   bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (!*it) {
+      encode(T(), bl);
+      continue;
+    }
+    encode(**it, bl);
+  }
+}
+// Read a __u32 count, empty the vector and reserve room, then decode that
+// many freshly allocated T objects and append the owning pointers.
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+                   bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.clear();
+  v.reserve(count);
+  for (__u32 i = 0; i < count; ++i) {
+    auto item = std::make_shared<T>();
+    decode(*item, p);
+    v.emplace_back(std::move(item));
+  }
+}
+
+// map
+// Write a __u32 entry count, then each key followed by its value in map
+// order.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware variant: __u32 entry count, then each key and value encoded
+// with the feature mask.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Read a __u32 count, empty the map, then for each entry decode the key and
+// decode the value directly into m[key].
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again: let operator[] move it into the new node
+    decode(m[std::move(k)], p);
+  }
+}
+// Like decode(), but existing entries are kept: decoded entries are merged
+// into `m`, and a value whose key is already present is decoded into the
+// existing element.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again: let operator[] move it if it has to insert
+    decode(m[std::move(k)], p);
+  }
+}
+// Encode each key followed by its value in map order, without a leading
+// count.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware variant: each key and value encoded with the feature mask,
+// without a leading count.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Empty the map, then decode `n` key/value entries; the count is supplied by
+// the caller rather than read from the stream.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again: let operator[] move it into the new node
+    decode(m[std::move(k)], p);
+  }
+}
+
+// boost::container::flat-map
+// Write a __u32 entry count, then each key followed by its value in
+// container order.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  __u32 n = (__u32)(m.size());
+  encode(n, bl);
+  // Deduce the iterator type: spelling it as flat_map<T,U,Comp>::const_iterator
+  // dropped Alloc and named the wrong type for non-default allocators.
+  for (auto p = m.begin(); p != m.end(); ++p) {
+    encode(p->first, bl);
+    encode(p->second, bl);
+  }
+}
+// Feature-aware variant: __u32 entry count, then each key and value encoded
+// with the feature mask.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+       uint64_t features)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Read a __u32 count, empty the map and reserve room, then for each entry
+// decode the key and decode the value directly into m[key].
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  m.reserve(n);
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again: let operator[] move it into the new slot
+    decode(m[std::move(k)], p);
+  }
+}
+// Like decode(), but existing entries are kept: capacity is grown by the
+// decoded count and entries are merged into `m`.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+                           bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.reserve(m.size() + n);
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again: let operator[] move it if it has to insert
+    decode(m[std::move(k)], p);
+  }
+}
+// Encode each key followed by its value in container order, without a
+// leading count.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+              bufferlist& bl)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware variant: each key and value encoded with the feature mask,
+// without a leading count.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+              bufferlist& bl, uint64_t features)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Empty the map, then decode `n` key/value entries; the count is supplied by
+// the caller rather than read from the stream.
+template<class T, class U, class Comp, class Alloc,
+         typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+              bufferlist::const_iterator& p)
+{
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again: let operator[] move it into the new slot
+    decode(m[std::move(k)], p);
+  }
+}
+
+// multimap
+// Write a __u32 entry count, then each key followed by its value in
+// multimap order.
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Read a __u32 count and empty the multimap. For each entry the key is
+// decoded into a value-initialized pair, the pair is inserted, and the value
+// is then decoded directly into the inserted element.
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    std::pair<T,U> entry = std::pair<T,U>();
+    decode(entry.first, p);
+    auto inserted = m.insert(entry);
+    decode(inserted->second, p);
+  }
+}
+
+// ceph::unordered_map
+// Feature-aware encode of an unordered_map: a 32-bit element count, then
+// each key/value pair in the container's iteration order, with `features`
+// forwarded to every element encoder.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+                   uint64_t features)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Encode an unordered_map as a 32-bit element count followed by each
+// key/value pair in the container's iteration order.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Decode an unordered_map: read a 32-bit element count, clear m, then
+// decode each key and decode its mapped value in place via operator[].
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    T key;
+    decode(key, p);
+    auto& mapped = m[key];
+    decode(mapped, p);
+  }
+}
+
+// ceph::unordered_set
+// Encode an unordered_set as a 32-bit element count followed by each
+// element in the container's iteration order.
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& item : m) {
+    encode(item, bl);
+  }
+}
+// Decode an unordered_set: read a 32-bit element count, clear m, then
+// decode and insert that many elements.
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  m.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    T item;
+    decode(item, p);
+    m.insert(item);
+  }
+}
+
+// deque
+// Feature-aware encode of a deque: a 32-bit element count, then every
+// element front to back with `features` forwarded to its encoder.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+  __u32 count = ls.size();
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl, features);
+  }
+}
+// Encode a deque as a 32-bit element count followed by every element,
+// front to back.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl)
+{
+  __u32 count = ls.size();
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl);
+  }
+}
+// Decode a deque: read a 32-bit element count, clear ls, then append that
+// many default-constructed elements, decoding each one in place.
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    ls.emplace_back();
+    auto& slot = ls.back();
+    decode(slot, p);
+  }
+}
+
+// std::array<T, N>
+// Feature-aware encode of a fixed-size array: the N elements are written
+// in index order with no length prefix.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features)
+{
+  for (size_t i = 0; i < N; ++i) {
+    encode(v[i], bl, features);
+  }
+}
+// Encode a fixed-size array: the N elements are written in index order
+// with no length prefix.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl)
+{
+  for (size_t i = 0; i < N; ++i) {
+    encode(v[i], bl);
+  }
+}
+// Decode a fixed-size array: each of the N elements is decoded in index
+// order directly into its slot; no length prefix is read.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p)
+{
+  for (size_t i = 0; i < N; ++i) {
+    decode(v[i], p);
+  }
+}
+}
+
+/*
+ * guards
+ */
+
+/**
+ * start encoding block
+ *
+ * Declares the locals struct_v, struct_compat, struct_len, filler and
+ * starting_bl_len in the enclosing scope, appends a hole to bl sized for
+ * the version/compat/length header, and opens a `do {` block. The block
+ * must be closed with ENCODE_FINISH or ENCODE_FINISH_NEW_COMPAT, which
+ * fill the hole in.
+ *
+ * @param v current (code) version of the encoding
+ * @param compat oldest code version that can decode it
+ * @param bl bufferlist to encode to
+ *
+ */
+#define ENCODE_START(v, compat, bl) \
+ __u8 struct_v = v; \
+ __u8 struct_compat = compat; \
+ ceph_le32 struct_len; \
+ auto filler = (bl).append_hole(sizeof(struct_v) + \
+ sizeof(struct_compat) + sizeof(struct_len)); \
+ const auto starting_bl_len = (bl).length(); \
+ using ::ceph::encode; \
+ do {
+
+/**
+ * finish encoding block
+ *
+ * Closes the `do {` opened by ENCODE_START, replaces struct_compat when
+ * new_struct_compat is non-zero, sets struct_len to the number of bytes
+ * appended to bl since ENCODE_START, and copies struct_v, struct_compat
+ * and struct_len (in that order) into the reserved header hole.
+ *
+ * @param bl bufferlist we were encoding to
+ * @param new_struct_compat struct-compat value to use
+ */
+#define ENCODE_FINISH_NEW_COMPAT(bl, new_struct_compat) \
+ } while (false); \
+ if (new_struct_compat) { \
+ struct_compat = new_struct_compat; \
+ } \
+ struct_len = (bl).length() - starting_bl_len; \
+ filler.copy_in(sizeof(struct_v), (char *)&struct_v); \
+ filler.copy_in(sizeof(struct_compat), \
+ (char *)&struct_compat); \
+ filler.copy_in(sizeof(struct_len), (char *)&struct_len);
+
+/* Finish an encoding block, keeping the struct_compat value that was
+ * given to ENCODE_START (a new_struct_compat of 0 means "do not override"). */
+#define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0)
+
+/* Build the std::string message used when an encoding is too old/new to
+ * decode. Note that `#v` stringifies the *text* of the v argument as
+ * written at the call site, not its runtime value; compatv is formatted
+ * at runtime with std::to_string. */
+#define DECODE_ERR_OLDVERSION(func, v, compatv) \
+ (std::string(func) + " no longer understand old encoding version " #v " < " + std::to_string(compatv))
+
+/* Build the std::string message used when decoding ran beyond, or the
+ * declared length exceeds, the available struct encoding. */
+#define DECODE_ERR_PAST(func) \
+ (std::string(func) + " decode past end of struct encoding")
+
+/**
+ * check for very old encoding
+ *
+ * If the encoded data is older than oldestv, raise an exception.
+ *
+ * Must be used inside a decode block, where the local struct_v (the
+ * version read from the encoded data) is in scope. The exception message
+ * reports the actual struct_v value. It is built here rather than via
+ * DECODE_ERR_OLDVERSION because that macro stringifies the text of its
+ * version argument, which would yield the literal "v" at this call site.
+ *
+ * @param oldestv oldest version of the code we can successfully decode.
+ * @throws ::ceph::buffer::malformed_input if struct_v < oldestv
+ */
+#define DECODE_OLDEST(oldestv) \
+  if (struct_v < oldestv) \
+    throw ::ceph::buffer::malformed_input( \
+      std::string(__PRETTY_FUNCTION__) + \
+      " no longer understand old encoding version " + \
+      std::to_string(struct_v) + " < " + std::to_string(oldestv));
+
+/**
+ * start a decoding block
+ *
+ * Declares the locals struct_v, struct_compat, struct_len and struct_end,
+ * reads the version, compat and length header from bl, and opens a
+ * `do {` block that must be closed with DECODE_FINISH.
+ *
+ * Throws ::ceph::buffer::malformed_input when v is lower than the encoded
+ * struct_compat, or when the encoded length exceeds what remains in bl.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param bl bufferlist::iterator for the encoded data
+ */
+#define DECODE_START(v, bl) \
+ __u8 struct_v, struct_compat; \
+ using ::ceph::decode; \
+ decode(struct_v, bl); \
+ decode(struct_compat, bl); \
+ if (v < struct_compat) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+ __u32 struct_len; \
+ decode(struct_len, bl); \
+ if (struct_len > bl.get_remaining()) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ unsigned struct_end = bl.get_off() + struct_len; \
+ do {
+
+/* BEWARE: any change to this macro MUST be also reflected in the duplicative
+ * DECODE_START_LEGACY_COMPAT_LEN! */
+/* Shared implementation behind the _32 and _16 legacy variants.
+ *
+ * Reads struct_v. If struct_v >= compatv a struct_compat byte follows and
+ * is checked against v; otherwise, when skip_v is non-zero, skip_v bytes
+ * are skipped (after checking that many remain). If struct_v >= lenv a
+ * __u32 length follows and struct_end is set from it; otherwise struct_end
+ * stays 0, which makes DECODE_FINISH skip its bounds handling. Opens a
+ * `do {` block that must be closed with DECODE_FINISH.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param skip_v number of bytes to skip after struct_v for encodings older
+ *               than compatv
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+#define __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, skip_v, bl) \
+ using ::ceph::decode; \
+ __u8 struct_v; \
+ decode(struct_v, bl); \
+ if (struct_v >= compatv) { \
+ __u8 struct_compat; \
+ decode(struct_compat, bl); \
+ if (v < struct_compat) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+ } else if (skip_v) { \
+ if (bl.get_remaining() < skip_v) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ bl += skip_v; \
+ } \
+ unsigned struct_end = 0; \
+ if (struct_v >= lenv) { \
+ __u32 struct_len; \
+ decode(struct_len, bl); \
+ if (struct_len > bl.get_remaining()) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ struct_end = bl.get_off() + struct_len; \
+ } \
+ do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * The old encoding schemes had only a __u8 struct_v, or lacked either
+ * the compat version or the length. Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+
+/* BEWARE: this duplicates __DECODE_START_LEGACY_COMPAT_LEN (minus the
+ * skip_v branch); the two MUST always be changed together. For the
+ * rationale behind the code duplication, please `git blame` and refer to
+ * the commit message.
+ *
+ * Declares struct_v and struct_end, conditionally reads struct_compat and
+ * the length wrapper, and opens a `do {` block that must be closed with
+ * DECODE_FINISH. struct_end stays 0 when no length wrapper is present. */
+#define DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, bl) \
+ using ::ceph::decode; \
+ __u8 struct_v; \
+ decode(struct_v, bl); \
+ if (struct_v >= compatv) { \
+ __u8 struct_compat; \
+ decode(struct_compat, bl); \
+ if (v < struct_compat) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION( \
+ __PRETTY_FUNCTION__, v, struct_compat)); \
+ } \
+ unsigned struct_end = 0; \
+ if (struct_v >= lenv) { \
+ __u32 struct_len; \
+ decode(struct_len, bl); \
+ if (struct_len > bl.get_remaining()) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ struct_end = bl.get_off() + struct_len; \
+ } \
+ do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * This version of the macro assumes the legacy encoding had a 32 bit
+ * version
+ *
+ * The old encoding schemes had only a __u8 struct_v, or lacked either
+ * the compat version or the length. Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+/* For encodings older than compatv, 3 bytes are skipped after the one-byte
+ * struct_v (presumably the rest of the 32-bit legacy version field). */
+#define DECODE_START_LEGACY_COMPAT_LEN_32(v, compatv, lenv, bl) \
+ __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 3u, bl)
+
+/* Same as DECODE_START_LEGACY_COMPAT_LEN_32, but skips only 1 byte after
+ * struct_v (presumably the rest of a 16-bit legacy version field). */
+#define DECODE_START_LEGACY_COMPAT_LEN_16(v, compatv, lenv, bl) \
+ __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 1u, bl)
+
+/**
+ * finish decode block
+ *
+ * Closes the `do {` opened by one of the DECODE_START* macros. When
+ * struct_end is non-zero: throws ::ceph::buffer::malformed_input if the
+ * iterator has moved past struct_end, otherwise advances the iterator to
+ * struct_end so that trailing fields this code does not know about are
+ * skipped. When struct_end is 0 (no length wrapper) nothing is checked.
+ *
+ * @param bl bufferlist::iterator we were decoding from
+ */
+#define DECODE_FINISH(bl) \
+ } while (false); \
+ if (struct_end) { \
+ if (bl.get_off() > struct_end) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ if (bl.get_off() < struct_end) \
+ bl += struct_end - bl.get_off(); \
+ }
+
+namespace ceph {
+
+/*
+ * Encoders/decoders to read from current offset in a file handle and
+ * encode/decode the data according to argument types.
+ */
+// Read a length word and then that many further bytes from fd into a
+// scratch bufferlist, decode the whole buffer into str, and return the
+// buffer's final length.
+// NOTE(review): the results of both read_fd calls are not checked.
+inline ssize_t decode_file(int fd, std::string &str)
+{
+  bufferlist buf;
+  __u32 payload_len = 0;
+  buf.read_fd(fd, sizeof(payload_len));
+  decode(payload_len, buf);
+  buf.read_fd(fd, payload_len);
+  decode(str, buf);
+  return buf.length();
+}
+
+// Read a length word and then that many further bytes from fd into a
+// scratch bufferlist, decode bp from the start of that buffer, and return
+// the buffer's final length.
+// NOTE(review): the results of both read_fd calls are not checked.
+inline ssize_t decode_file(int fd, bufferptr &bp)
+{
+  bufferlist buf;
+  __u32 payload_len = 0;
+  buf.read_fd(fd, sizeof(payload_len));
+  decode(payload_len, buf);
+  buf.read_fd(fd, payload_len);
+
+  auto cursor = std::cbegin(buf);
+  decode(bp, cursor);
+  return buf.length();
+}
+}
+
+#endif
diff --git a/src/include/err.h b/src/include/err.h
new file mode 100644
index 000000000..c188e9753
--- /dev/null
+++ b/src/include/err.h
@@ -0,0 +1,31 @@
+#ifndef CEPH_ERR_H
+#define CEPH_ERR_H
+
+/*
+ * adapted from linux 2.6.24 include/linux/err.h
+ */
+/* Largest errno magnitude that can be carried in a pointer value. */
+#define MAX_ERRNO 4095
+/* True when x (an unsigned pointer-sized value) lies in the top MAX_ERRNO
+ * values of the uintptr_t range, i.e. equals (uintptr_t)-e for some e in
+ * [1, MAX_ERRNO]. */
+#define IS_ERR_VALUE(x) ((x) >= (uintptr_t)-MAX_ERRNO)
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+/* this generates a warning in c++; caller can do the cast manually
+static inline void *ERR_PTR(long error)
+{
+ return (void *) error;
+}
+*/
+
+/* Return the pointer's bit pattern as a signed integer. For a pointer that
+ * satisfies IS_ERR() this is a negative value in [-MAX_ERRNO, -1]. */
+static inline intptr_t PTR_ERR(const void *ptr)
+{
+ return (intptr_t) ptr;
+}
+
+/* Return true when ptr's value falls in the range tested by IS_ERR_VALUE,
+ * i.e. the pointer encodes an error code rather than an address. */
+static inline bool IS_ERR(const void *ptr)
+{
+ return IS_ERR_VALUE((uintptr_t)ptr);
+}
+
+#endif
diff --git a/src/include/error.h b/src/include/error.h
new file mode 100644
index 000000000..a548d9756
--- /dev/null
+++ b/src/include/error.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Call syserror() with the current source file and line number. */
+#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__)
+
+/* Expression-style assertion: when c is false, calls exiterror() with the
+ * current source file and line number. The `, 1` keeps the right-hand
+ * operand of `||` a valid scalar expression even though exiterror()
+ * returns void. */
+#define ASSERT(c) \
+ ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1))
+
+/* All functions below take a format string followed by variadic arguments
+ * (SYSERROR and ASSERT pass printf-style "%s:%d" formats). */
+
+/* print usage error message and exit */
+extern void userror(const char *use, const char *fmt, ...);
+
+/* print system error message and exit */
+extern void syserror(const char *fmt, ...);
+
+/* print error message and exit */
+extern void exiterror(const char *fmt, ...);
+
+/* print error message */
+extern void error(const char *fmt, ...);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/src/include/event_type.h b/src/include/event_type.h
new file mode 100644
index 000000000..aa6ddedb4
--- /dev/null
+++ b/src/include/event_type.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_EVENT_TYPE_H
+#define CEPH_COMMON_EVENT_TYPE_H
+
+// Integer identifiers for event socket types.
+// NOTE(review): judging by the names these select between no notification
+// socket, a pipe, and an eventfd -- confirm against the users of this header.
+#define EVENT_SOCKET_TYPE_NONE 0
+#define EVENT_SOCKET_TYPE_PIPE 1
+#define EVENT_SOCKET_TYPE_EVENTFD 2
+
+#endif
diff --git a/src/include/expected.hpp b/src/include/expected.hpp
new file mode 100644
index 000000000..740c6ad24
--- /dev/null
+++ b/src/include/expected.hpp
@@ -0,0 +1,2282 @@
+///
+// expected - An implementation of std::expected with extensions
+// Written in 2017 by Simon Brand (@TartanLlama)
+//
+// To the extent possible under law, the author(s) have dedicated all
+// copyright and related and neighboring rights to this software to the
+// public domain worldwide. This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication
+// along with this software. If not, see
+// <http://creativecommons.org/publicdomain/zero/1.0/>.
+///
+
+#ifndef TL_EXPECTED_HPP
+#define TL_EXPECTED_HPP
+
+#define TL_EXPECTED_VERSION_MAJOR 0
+#define TL_EXPECTED_VERSION_MINOR 2
+
+#include <exception>
+#include <functional>
+#include <type_traits>
+#include <utility>
+
+#if defined(__EXCEPTIONS) || defined(_CPPUNWIND)
+#define TL_EXPECTED_EXCEPTIONS_ENABLED
+#endif
+
+#if (defined(_MSC_VER) && _MSC_VER == 1900)
+/// \exclude
+#define TL_EXPECTED_MSVC2015
+#define TL_EXPECTED_MSVC2015_CONSTEXPR
+#else
+#define TL_EXPECTED_MSVC2015_CONSTEXPR constexpr
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \
+ !defined(__clang__))
+/// \exclude
+#define TL_EXPECTED_GCC49
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 4 && \
+ !defined(__clang__))
+/// \exclude
+#define TL_EXPECTED_GCC54
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 5 && \
+ !defined(__clang__))
+/// \exclude
+#define TL_EXPECTED_GCC55
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \
+ !defined(__clang__))
+// GCC < 5 doesn't support overloading on const&& for member functions
+/// \exclude
+#define TL_EXPECTED_NO_CONSTRR
+
+// GCC < 5 doesn't support some standard C++11 type traits
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+ std::has_trivial_copy_constructor<T>
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+ std::has_trivial_copy_assign<T>
+
+// This one will be different for GCC 5.7 if it's ever supported
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+ std::is_trivially_destructible<T>
+
+// GCC 5 < v < 8 has a bug in is_trivially_copy_constructible which breaks std::vector
+// for non-copyable types
+#elif (defined(__GNUC__) && __GNUC__ < 8 && \
+ !defined(__clang__))
+#ifndef TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+#define TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+namespace tl {
+ namespace detail {
+ // Replacement for std::is_trivially_copy_constructible used on the
+ // compilers selected by the enclosing #elif. By default it simply
+ // inherits the standard trait's answer.
+ template<class T>
+ struct is_trivially_copy_constructible : std::is_trivially_copy_constructible<T>{};
+#ifdef _GLIBCXX_VECTOR
+ // Only when <vector> has already been included: for std::vector<T,A>
+ // report the trait of the element type T instead of the vector itself.
+ template<class T, class A>
+ struct is_trivially_copy_constructible<std::vector<T,A>>
+ : std::is_trivially_copy_constructible<T>{};
+#endif
+ }
+}
+#endif
+
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+ tl::detail::is_trivially_copy_constructible<T>
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+ std::is_trivially_copy_assignable<T>
+#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>
+#else
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+ std::is_trivially_copy_constructible<T>
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+ std::is_trivially_copy_assignable<T>
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+ std::is_trivially_destructible<T>
+#endif
+
+#if __cplusplus > 201103L
+/// \exclude
+#define TL_EXPECTED_CXX14
+#endif
+
+#ifdef TL_EXPECTED_GCC49
+#define TL_EXPECTED_GCC49_CONSTEXPR
+#else
+#define TL_EXPECTED_GCC49_CONSTEXPR constexpr
+#endif
+
+#if (__cplusplus == 201103L || defined(TL_EXPECTED_MSVC2015) || \
+ defined(TL_EXPECTED_GCC49))
+/// \exclude
+#define TL_EXPECTED_11_CONSTEXPR
+#else
+/// \exclude
+#define TL_EXPECTED_11_CONSTEXPR constexpr
+#endif
+
+namespace tl {
+template <class T, class E> class expected;
+
+#ifndef TL_MONOSTATE_INPLACE_MUTEX
+#define TL_MONOSTATE_INPLACE_MUTEX
+/// \brief Used to represent an expected with no data
+class monostate {};
+
+/// \brief A tag type to tell expected to construct its value in-place
+/// \details The default constructor is explicit, so an in_place_t cannot be
+/// created by copy-list-initialization from an empty `{}`.
+struct in_place_t {
+ explicit in_place_t() = default;
+};
+/// \brief A tag to tell expected to construct its value in-place
+static constexpr in_place_t in_place{};
+#endif
+
+/// Used as a wrapper to store the unexpected value
+///
+/// Holds exactly one `E` by value. It cannot be default-constructed and
+/// `E` must not be `void`.
+template <class E> class unexpected {
+public:
+ static_assert(!std::is_same<E, void>::value, "E must not be void");
+
+ unexpected() = delete;
+ /// Copies `e` into the wrapper.
+ constexpr explicit unexpected(const E &e) : m_val(e) {}
+
+ /// Moves `e` into the wrapper.
+ constexpr explicit unexpected(E &&e) : m_val(std::move(e)) {}
+
+ /// \returns the contained value
+ /// \group unexpected_value
+ constexpr const E &value() const & { return m_val; }
+ /// \group unexpected_value
+ TL_EXPECTED_11_CONSTEXPR E &value() & { return m_val; }
+ /// \group unexpected_value
+ TL_EXPECTED_11_CONSTEXPR E &&value() && { return std::move(m_val); }
+ /// \exclude
+ constexpr const E &&value() const && { return std::move(m_val); }
+
+private:
+ // The wrapped error value.
+ E m_val;
+};
+
+/// \brief Compares two unexpected objects
+/// \details Simply compares lhs.value() to rhs.value(). Each operator in
+/// this group forwards to the same operator on the contained values, so it
+/// is only usable when `E` provides that operator.
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator==(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() == rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator!=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() != rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator<(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() < rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator<=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() <= rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator>(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() > rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator>=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() >= rhs.value();
+}
+
+/// Build an `unexpected` holding `e`; the wrapped type is the decayed type
+/// of the argument, so callers do not have to spell it out.
+///
+/// *Example:*
+/// auto e1 = tl::make_unexpected(42);
+/// unexpected<int> e2 (42); //same semantics
+template <class E>
+unexpected<typename std::decay<E>::type> make_unexpected(E &&e) {
+  using wrapped_t = unexpected<typename std::decay<E>::type>;
+  return wrapped_t(std::forward<E>(e));
+}
+
+/// \brief A tag type to tell expected to construct the unexpected value
+/// \details Passed as the first constructor argument to select the
+/// error-constructing overloads (see the `unexpect_t` constructors of
+/// `expected_storage_base`).
+struct unexpect_t {
+ unexpect_t() = default;
+};
+/// \brief A tag to tell expected to construct the unexpected value
+static constexpr unexpect_t unexpect{};
+
+/// \exclude
+namespace detail {
+// Throws `e` when exceptions are enabled (TL_EXPECTED_EXCEPTIONS_ENABLED).
+// When they are disabled nothing can be thrown, so the call site is marked
+// unreachable with the compiler's intrinsic instead; reaching it in that
+// configuration is undefined behaviour.
+template<typename E>
+[[noreturn]] TL_EXPECTED_11_CONSTEXPR void throw_exception(E &&e) {
+#ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+ throw std::forward<E>(e);
+#else
+ #ifdef _MSC_VER
+ __assume(0);
+ #else
+ __builtin_unreachable();
+ #endif
+#endif
+}
+
+#ifndef TL_TRAITS_MUTEX
+#define TL_TRAITS_MUTEX
+// C++14-style aliases for brevity
+template <class T> using remove_const_t = typename std::remove_const<T>::type;
+template <class T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <class T> using decay_t = typename std::decay<T>::type;
+template <bool E, class T = void>
+using enable_if_t = typename std::enable_if<E, T>::type;
+template <bool B, class T, class F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+
+// std::conjunction from C++17
+// Empty pack: true. One trait: that trait. Otherwise: the first trait whose
+// ::value is false, or the last trait if none is (later traits are not
+// instantiated once a false one is found).
+template <class...> struct conjunction : std::true_type {};
+template <class B> struct conjunction<B> : B {};
+template <class B, class... Bs>
+struct conjunction<B, Bs...>
+ : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {};
+
+// std::invoke from C++17
+// https://stackoverflow.com/questions/38288042/c11-14-invoke-workaround
+//
+// Overload for pointers to members: the callable is wrapped with
+// std::mem_fn and invoked with the forwarded arguments. The extra
+// `int = 0` template parameter presumably exists to keep this template's
+// signature distinct from the overload below.
+template <typename Fn, typename... Args,
+ typename = enable_if_t<std::is_member_pointer<decay_t<Fn>>{}>,
+ int = 0>
+constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
+ noexcept(std::mem_fn(f)(std::forward<Args>(args)...)))
+ -> decltype(std::mem_fn(f)(std::forward<Args>(args)...)) {
+ return std::mem_fn(f)(std::forward<Args>(args)...);
+}
+
+// Overload for everything that is not a pointer to member: the callable is
+// forwarded and called directly.
+template <typename Fn, typename... Args,
+ typename = enable_if_t<!std::is_member_pointer<decay_t<Fn>>{}>>
+constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
+ noexcept(std::forward<Fn>(f)(std::forward<Args>(args)...)))
+ -> decltype(std::forward<Fn>(f)(std::forward<Args>(args)...)) {
+ return std::forward<Fn>(f)(std::forward<Args>(args)...);
+}
+
+// std::invoke_result from C++17
+// Primary template is declared but not defined, so `invoke_result<F, Us...>`
+// has a member `type` only when detail::invoke(F, Us...) is well-formed.
+template <class F, class, class... Us> struct invoke_result_impl;
+
+// Selected when the invoke expression is valid: the second parameter is
+// then `void` (the comma expression ends in `void()`), matching the `void`
+// passed in by the invoke_result alias below.
+template <class F, class... Us>
+struct invoke_result_impl<
+ F, decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...), void()),
+ Us...> {
+ using type = decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...));
+};
+
+template <class F, class... Us>
+using invoke_result = invoke_result_impl<F, void, Us...>;
+
+// The type returned by invoking F with arguments Us...
+template <class F, class... Us>
+using invoke_result_t = typename invoke_result<F, Us...>::type;
+#endif
+
+// Trait for checking if a type is a tl::expected
+// is_expected<T> decays T first, so references to and cv-qualified
+// versions of an expected are also recognised.
+template <class T> struct is_expected_impl : std::false_type {};
+template <class T, class E>
+struct is_expected_impl<expected<T, E>> : std::true_type {};
+template <class T> using is_expected = is_expected_impl<decay_t<T>>;
+
+// SFINAE helper: names `void` only when T is constructible from U&& and
+// the decayed U is none of in_place_t, expected<T, E> or unexpected<E>.
+template <class T, class E, class U>
+using expected_enable_forward_value = detail::enable_if_t<
+ std::is_constructible<T, U &&>::value &&
+ !std::is_same<detail::decay_t<U>, in_place_t>::value &&
+ !std::is_same<expected<T, E>, detail::decay_t<U>>::value &&
+ !std::is_same<unexpected<E>, detail::decay_t<U>>::value>;
+
+// SFINAE helper: names `void` only when T is constructible from UR and E
+// from GR, while T is neither constructible nor convertible from any
+// reference form of expected<U, G> itself.
+template <class T, class E, class U, class G, class UR, class GR>
+using expected_enable_from_other = detail::enable_if_t<
+ std::is_constructible<T, UR>::value &&
+ std::is_constructible<E, GR>::value &&
+ !std::is_constructible<T, expected<U, G> &>::value &&
+ !std::is_constructible<T, expected<U, G> &&>::value &&
+ !std::is_constructible<T, const expected<U, G> &>::value &&
+ !std::is_constructible<T, const expected<U, G> &&>::value &&
+ !std::is_convertible<expected<U, G> &, T>::value &&
+ !std::is_convertible<expected<U, G> &&, T>::value &&
+ !std::is_convertible<const expected<U, G> &, T>::value &&
+ !std::is_convertible<const expected<U, G> &&, T>::value>;
+
+// std::true_type when T is void, otherwise the trait U.
+template <class T, class U>
+using is_void_or = conditional_t<std::is_void<T>::value, std::true_type, U>;
+
+// The four traits below treat `void` as satisfying the property and
+// otherwise defer to the corresponding standard trait.
+template <class T>
+using is_copy_constructible_or_void =
+ is_void_or<T, std::is_copy_constructible<T>>;
+
+template <class T>
+using is_move_constructible_or_void =
+ is_void_or<T, std::is_move_constructible<T>>;
+
+template <class T>
+using is_copy_assignable_or_void =
+ is_void_or<T, std::is_copy_assignable<T>>;
+
+
+template <class T>
+using is_move_assignable_or_void =
+ is_void_or<T, std::is_move_assignable<T>>;
+
+
+} // namespace detail
+
+/// \exclude
+namespace detail {
+// Tag selecting the expected_storage_base constructors that construct
+// neither the value nor the error and leave m_has_val false.
+struct no_init_t {};
+static constexpr no_init_t no_init{};
+
+// Implements the storage of the values, and ensures that the destructor is
+// trivial if it can be.
+//
+// This specialization is for where neither `T` nor `E` is trivially
+// destructible, so the destructors must be called on destruction of the
+// `expected`
+//
+// The value and the error share a union; m_has_val records which of the
+// two is meant to be live.
+template <class T, class E, bool = std::is_trivially_destructible<T>::value,
+ bool = std::is_trivially_destructible<E>::value>
+struct expected_storage_base {
+ // Default: holds a value-initialized T.
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ // no_init: only the placeholder byte is initialized; m_has_val is false.
+ constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {}
+
+ // in_place: construct the value from args.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ // in_place: construct the value from an initializer list plus args.
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ // unexpect: construct the error from args.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ // unexpect: construct the error from an initializer list plus args.
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ // Destroys whichever member m_has_val says is live.
+ // NOTE(review): after the no_init constructor m_has_val is false, so this
+ // would destroy m_unexpect; presumably callers always construct one of the
+ // members before destruction -- confirm.
+ ~expected_storage_base() {
+ if (m_has_val) {
+ m_val.~T();
+ } else {
+ m_unexpect.~unexpected<E>();
+ }
+ }
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// This specialization is for when both `T` and `E` are trivially-destructible,
+// so the destructor of the `expected` can be trivial.
+// Constructors mirror the primary template; only the destructor differs.
+template <class T, class E> struct expected_storage_base<T, E, true, true> {
+ // Default: holds a value-initialized T.
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ // no_init: only the placeholder byte is initialized; m_has_val is false.
+ constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {}
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ // Nothing to destroy by hand: both alternatives are trivially destructible.
+ ~expected_storage_base() = default;
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// T is trivial, E is not.
+// ("Trivial" here means trivially destructible, per the template arguments.)
+// Constructors mirror the primary template; the destructor only has to
+// destroy the error.
+template <class T, class E> struct expected_storage_base<T, E, true, false> {
+ // Default: holds a value-initialized T.
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ // no_init: only the placeholder byte is initialized; m_has_val is false.
+ TL_EXPECTED_MSVC2015_CONSTEXPR expected_storage_base(no_init_t)
+ : m_no_init(), m_has_val(false) {}
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ // Only the error needs an explicit destructor call.
+ ~expected_storage_base() {
+ if (!m_has_val) {
+ m_unexpect.~unexpected<E>();
+ }
+ }
+
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// E is trivial, T is not.
+// ("Trivial" here means trivially destructible, per the template arguments.)
+// Constructors mirror the primary template; the destructor only has to
+// destroy the value.
+template <class T, class E> struct expected_storage_base<T, E, false, true> {
+ // Default: holds a value-initialized T.
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ // no_init: only the placeholder byte is initialized; m_has_val is false.
+ constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {}
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ // Only the value needs an explicit destructor call.
+ ~expected_storage_base() {
+ if (m_has_val) {
+ m_val.~T();
+ }
+ }
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// `T` is `void`, `E` is trivially-destructible
+// There is no value to store, so the union pairs the error with an empty
+// `dummy` placeholder named m_val.
+template <class E> struct expected_storage_base<void, E, false, true> {
+ // Default: "has value" state; no union member is initialized here.
+ TL_EXPECTED_MSVC2015_CONSTEXPR expected_storage_base() : m_has_val(true) {}
+ // no_init: placeholder initialized, m_has_val false.
+ constexpr expected_storage_base(no_init_t) : m_val(), m_has_val(false) {}
+
+ // in_place: same "has value" state as the default constructor.
+ constexpr expected_storage_base(in_place_t) : m_has_val(true) {}
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ // E is trivially destructible, so nothing needs destroying by hand.
+ ~expected_storage_base() = default;
+ struct dummy {};
+ union {
+ dummy m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// `T` is `void`, `E` is not trivially-destructible
+// There is no value to store, so the union pairs the error with a
+// placeholder byte m_dummy.
+template <class E> struct expected_storage_base<void, E, false, false> {
+ // Default: "has value" state with the placeholder initialized.
+ constexpr expected_storage_base() : m_dummy(), m_has_val(true) {}
+ // no_init: placeholder initialized, m_has_val false.
+ constexpr expected_storage_base(no_init_t) : m_dummy(), m_has_val(false) {}
+
+ // in_place: same "has value" state as the default constructor.
+ constexpr expected_storage_base(in_place_t) : m_dummy(), m_has_val(true) {}
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ // Destroys the error whenever m_has_val is false.
+ // NOTE(review): that includes the state left by the no_init constructor,
+ // where only m_dummy was initialized; presumably callers construct the
+ // error before destruction in that case -- confirm.
+ ~expected_storage_base() {
+ if (!m_has_val) {
+ m_unexpect.~unexpected<E>();
+ }
+ }
+
+ union {
+ char m_dummy;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// Adds construction, destruction, assignment and access helpers on top of
+// expected_storage_base<T, E> for non-void `T`. Classes further down the
+// hierarchy use these helpers to implement their special member functions.
+template <class T, class E>
+struct expected_operations_base : expected_storage_base<T, E> {
+  using expected_storage_base<T, E>::expected_storage_base;
+
+  // Placement-constructs the value from `args...` and sets the has-value
+  // flag. The caller must make sure no other union member is still alive.
+  // This is noexcept only when `T`'s constructor is: an exception thrown by
+  // the constructor must be able to reach the recovery handlers in assign()
+  // instead of ending in std::terminate.
+  template <class... Args>
+  void construct(Args &&... args) noexcept(
+      std::is_nothrow_constructible<T, Args &&...>::value) {
+    new (std::addressof(this->m_val)) T(std::forward<Args>(args)...);
+    this->m_has_val = true;
+  }
+
+  // Like construct(), but takes the value from `rhs.get()`, preserving the
+  // value category of `rhs`. Conditionally noexcept for the same reason.
+  template <class Rhs>
+  void construct_with(Rhs &&rhs) noexcept(
+      std::is_nothrow_constructible<
+          T, decltype(std::declval<Rhs>().get())>::value) {
+    new (std::addressof(this->m_val)) T(std::forward<Rhs>(rhs).get());
+    this->m_has_val = true;
+  }
+
+  // Placement-constructs the stored `unexpected<E>` from `args...` and clears
+  // the has-value flag.
+  template <class... Args> void construct_error(Args &&... args) noexcept {
+    new (std::addressof(this->m_unexpect))
+        unexpected<E>(std::forward<Args>(args)...);
+    this->m_has_val = false;
+  }
+
+  #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+
+  // These assign overloads ensure that the most efficient assignment
+  // implementation is used while maintaining the strong exception guarantee.
+  // The problematic case is where rhs has a value, but *this does not.
+  //
+  // This overload handles the case where we can just copy-construct `T`
+  // directly into place without throwing.
+  template <class U = T,
+            detail::enable_if_t<std::is_nothrow_copy_constructible<U>::value>
+                * = nullptr>
+  void assign(const expected_operations_base &rhs) noexcept {
+    if (!this->m_has_val && rhs.m_has_val) {
+      geterr().~unexpected<E>();
+      construct(rhs.get());
+    } else {
+      assign_common(rhs);
+    }
+  }
+
+  // This overload handles the case where we can attempt to create a copy of
+  // `T`, then no-throw move it into place if the copy was successful.
+  template <class U = T,
+            detail::enable_if_t<!std::is_nothrow_copy_constructible<U>::value &&
+                                std::is_nothrow_move_constructible<U>::value>
+                * = nullptr>
+  void assign(const expected_operations_base &rhs) noexcept {
+    if (!this->m_has_val && rhs.m_has_val) {
+      T tmp = rhs.get();
+      geterr().~unexpected<E>();
+      construct(std::move(tmp));
+    } else {
+      assign_common(rhs);
+    }
+  }
+
+  // This overload is the worst-case, where we have to move-construct the
+  // unexpected value into temporary storage, then try to copy the T into
+  // place. If the construction succeeds, then everything is fine, but if it
+  // throws, then we re-create the old unexpected value in place before
+  // rethrowing the exception.
+  template <class U = T,
+            detail::enable_if_t<!std::is_nothrow_copy_constructible<U>::value &&
+                                !std::is_nothrow_move_constructible<U>::value>
+                * = nullptr>
+  void assign(const expected_operations_base &rhs) {
+    if (!this->m_has_val && rhs.m_has_val) {
+      auto tmp = std::move(geterr());
+      geterr().~unexpected<E>();
+
+      try {
+        construct(rhs.get());
+      } catch (...) {
+        // The error object was destroyed above, so it has to be constructed
+        // again; assigning to it would operate on a dead object.
+        construct_error(std::move(tmp));
+        throw;
+      }
+    } else {
+      assign_common(rhs);
+    }
+  }
+
+  // These overloads do the same as above, but for rvalues
+  template <class U = T,
+            detail::enable_if_t<std::is_nothrow_move_constructible<U>::value>
+                * = nullptr>
+  void assign(expected_operations_base &&rhs) noexcept {
+    if (!this->m_has_val && rhs.m_has_val) {
+      geterr().~unexpected<E>();
+      construct(std::move(rhs).get());
+    } else {
+      assign_common(std::move(rhs));
+    }
+  }
+
+  template <class U = T,
+            detail::enable_if_t<!std::is_nothrow_move_constructible<U>::value>
+                * = nullptr>
+  void assign(expected_operations_base &&rhs) {
+    if (!this->m_has_val && rhs.m_has_val) {
+      auto tmp = std::move(geterr());
+      geterr().~unexpected<E>();
+      try {
+        construct(std::move(rhs).get());
+      } catch (...) {
+        // See the copy overload: the destroyed error must be re-constructed.
+        construct_error(std::move(tmp));
+        throw;
+      }
+    } else {
+      assign_common(std::move(rhs));
+    }
+  }
+
+  #else
+
+  // If exceptions are disabled then we can just copy-construct
+  void assign(const expected_operations_base &rhs) noexcept {
+    if (!this->m_has_val && rhs.m_has_val) {
+      geterr().~unexpected<E>();
+      construct(rhs.get());
+    } else {
+      assign_common(rhs);
+    }
+  }
+
+  // ... or move-construct. `rhs` is a named variable and therefore an lvalue
+  // here, so it must be cast back to an rvalue for assign_common to move
+  // (and for move-only `T` to compile at all).
+  void assign(expected_operations_base &&rhs) noexcept {
+    if (!this->m_has_val && rhs.m_has_val) {
+      geterr().~unexpected<E>();
+      construct(std::move(rhs).get());
+    } else {
+      assign_common(std::move(rhs));
+    }
+  }
+
+  #endif
+
+  // The common part of move/copy assigning: covers every state combination
+  // except "*this holds an error, rhs holds a value", which the assign()
+  // overloads deal with themselves.
+  template <class Rhs> void assign_common(Rhs &&rhs) {
+    if (this->m_has_val) {
+      if (rhs.m_has_val) {
+        get() = std::forward<Rhs>(rhs).get();
+      } else {
+        destroy_val();
+        construct_error(std::forward<Rhs>(rhs).geterr());
+      }
+    } else {
+      if (!rhs.m_has_val) {
+        geterr() = std::forward<Rhs>(rhs).geterr();
+      }
+    }
+  }
+
+  bool has_value() const { return this->m_has_val; }
+
+  // Unchecked access to the value member, one overload per value category.
+  TL_EXPECTED_11_CONSTEXPR T &get() & { return this->m_val; }
+  constexpr const T &get() const & { return this->m_val; }
+  TL_EXPECTED_11_CONSTEXPR T &&get() && { return std::move(this->m_val); }
+#ifndef TL_EXPECTED_NO_CONSTRR
+  constexpr const T &&get() const && { return std::move(this->m_val); }
+#endif
+
+  // Unchecked access to the error member, one overload per value category.
+  TL_EXPECTED_11_CONSTEXPR unexpected<E> &geterr() & {
+    return this->m_unexpect;
+  }
+  constexpr const unexpected<E> &geterr() const & { return this->m_unexpect; }
+  TL_EXPECTED_11_CONSTEXPR unexpected<E> &&geterr() && {
+    return std::move(this->m_unexpect);
+  }
+#ifndef TL_EXPECTED_NO_CONSTRR
+  constexpr const unexpected<E> &&geterr() const && {
+    return std::move(this->m_unexpect);
+  }
+#endif
+
+  // Runs the value's destructor; the has-value flag is left untouched.
+  // Uses the same macro as the non-const accessors above because a plain
+  // `constexpr` function returning `void` is not valid C++11.
+  TL_EXPECTED_11_CONSTEXPR void destroy_val() {
+    get().~T();
+  }
+};
+
+// Specialization of expected_operations_base for `T = void`. It offers the
+// same helper names as the general template so that derived classes can be
+// written without caring whether `T` is void; there is simply no value object
+// to construct, access or destroy.
+template <class E>
+struct expected_operations_base<void, E> : expected_storage_base<void, E> {
+  using expected_storage_base<void, E>::expected_storage_base;
+
+  // "Constructing" a void value only sets the has-value flag.
+  template <class... Args> void construct() noexcept { this->m_has_val = true; }
+
+  // This function doesn't use its argument, but needs it so that code in
+  // levels above this can work independently of whether T is void
+  template <class Rhs> void construct_with(Rhs &&) noexcept {
+    this->m_has_val = true;
+  }
+
+  // Placement-constructs the stored `unexpected<E>` from `args...` and clears
+  // the has-value flag.
+  template <class... Args> void construct_error(Args &&... args) noexcept {
+    new (std::addressof(this->m_unexpect))
+        unexpected<E>(std::forward<Args>(args)...);
+    this->m_has_val = false;
+  }
+
+  // Copy/move assignment from another storage object:
+  //   error <- value : destroy our error, switch to the value state
+  //   error <- error : assign the error
+  //   value <- error : construct the error in place
+  //   value <- value : nothing to do
+  template <class Rhs> void assign(Rhs &&rhs) noexcept {
+    if (!this->m_has_val) {
+      if (rhs.m_has_val) {
+        geterr().~unexpected<E>();
+        construct();
+      } else {
+        geterr() = std::forward<Rhs>(rhs).geterr();
+      }
+    } else {
+      if (!rhs.m_has_val) {
+        construct_error(std::forward<Rhs>(rhs).geterr());
+      }
+    }
+  }
+
+  bool has_value() const { return this->m_has_val; }
+
+  // Unchecked access to the error member, one overload per value category.
+  TL_EXPECTED_11_CONSTEXPR unexpected<E> &geterr() & {
+    return this->m_unexpect;
+  }
+  constexpr const unexpected<E> &geterr() const & { return this->m_unexpect; }
+  TL_EXPECTED_11_CONSTEXPR unexpected<E> &&geterr() && {
+    return std::move(this->m_unexpect);
+  }
+#ifndef TL_EXPECTED_NO_CONSTRR
+  constexpr const unexpected<E> &&geterr() const && {
+    return std::move(this->m_unexpect);
+  }
+#endif
+
+  // Nothing to destroy for `void`. Declared with the same macro as the
+  // non-const accessors above because a plain `constexpr` function returning
+  // `void` is not valid C++11.
+  TL_EXPECTED_11_CONSTEXPR void destroy_val() {
+    // no-op
+  }
+};
+
+// expected_copy_base decides whether the copy constructor can stay compiler-generated. This primary template is chosen when `T` is void or
+// trivially copy constructible and `E` is trivially copy constructible; it only inherits the base constructors and declares no copy constructor itself.
+template <class T, class E,
+ bool = is_void_or<T, TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)>::
+ value &&TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(E)::value>
+struct expected_copy_base : expected_operations_base<T, E> {
+ using expected_operations_base<T, E>::expected_operations_base;
+};
+
+// Chosen when `T` or `E` is not trivially copy constructible: the copy
+// constructor is written by hand, every other special member is defaulted.
+template <class T, class E>
+struct expected_copy_base<T, E, false> : expected_operations_base<T, E> {
+  using expected_operations_base<T, E>::expected_operations_base;
+
+  expected_copy_base() = default;
+
+  // Starts from `no_init` storage and then copies whichever alternative
+  // (value or error) `other` reports through has_value().
+  expected_copy_base(const expected_copy_base &other)
+      : expected_operations_base<T, E>(no_init) {
+    if (!other.has_value()) {
+      this->construct_error(other.geterr());
+      return;
+    }
+    this->construct_with(other);
+  }
+
+  expected_copy_base(expected_copy_base &&) = default;
+  expected_copy_base &operator=(const expected_copy_base &) = default;
+  expected_copy_base &operator=(expected_copy_base &&) = default;
+};
+
+// expected_move_base decides whether the move constructor can stay trivial.
+// This primary template is chosen when `T` is void or trivially move
+// constructible and `E` is trivially move constructible; it adds nothing. On
+// GCC < 5 (TL_EXPECTED_GCC49) std::is_trivially_move_constructible is missing,
+// so only a declaration defaulting the bool to false is provided there.
+#ifndef TL_EXPECTED_GCC49
+template <class T, class E,
+ bool = is_void_or<T, std::is_trivially_move_constructible<T>>::value
+ &&std::is_trivially_move_constructible<E>::value>
+struct expected_move_base : expected_copy_base<T, E> {
+ using expected_copy_base<T, E>::expected_copy_base;
+};
+#else
+template <class T, class E, bool = false> struct expected_move_base;
+#endif
+// Chosen when the move constructor cannot be trivial: it is written by hand,
+// every other special member is defaulted.
+template <class T, class E>
+struct expected_move_base<T, E, false> : expected_copy_base<T, E> {
+  using expected_copy_base<T, E>::expected_copy_base;
+
+  expected_move_base() = default;
+  expected_move_base(const expected_move_base &) = default;
+
+  // Starts from `no_init` storage and then moves whichever alternative
+  // (value or error) `other` reports through has_value(). The exception
+  // specification follows `T`'s move constructor.
+  expected_move_base(expected_move_base &&other) noexcept(
+      std::is_nothrow_move_constructible<T>::value)
+      : expected_copy_base<T, E>(no_init) {
+    if (!other.has_value()) {
+      this->construct_error(std::move(other.geterr()));
+      return;
+    }
+    this->construct_with(std::move(other));
+  }
+
+  expected_move_base &operator=(const expected_move_base &) = default;
+  expected_move_base &operator=(expected_move_base &&) = default;
+};
+
+// Primary template: copy assignment stays compiler-generated when T (or void) and E are each trivially copy assignable, copy constructible and destructible
+template <class T, class E,
+ bool = is_void_or<
+ T, conjunction<TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T),
+ TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T),
+ TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T)>>::value
+ &&TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(E)::value
+ &&TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(E)::value
+ &&TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(E)::value>
+struct expected_copy_assign_base : expected_move_base<T, E> {
+ using expected_move_base<T, E>::expected_move_base;
+};
+
+// Chosen when copy assignment cannot be trivial: operator=(const&) forwards
+// to the inherited assign(); every other special member is defaulted.
+template <class T, class E>
+struct expected_copy_assign_base<T, E, false> : expected_move_base<T, E> {
+  using expected_move_base<T, E>::expected_move_base;
+
+  expected_copy_assign_base() = default;
+  expected_copy_assign_base(const expected_copy_assign_base &) = default;
+  expected_copy_assign_base(expected_copy_assign_base &&) = default;
+
+  // Copy assignment: all the work happens in assign().
+  expected_copy_assign_base &operator=(const expected_copy_assign_base &other) {
+    this->assign(other);
+    return *this;
+  }
+
+  expected_copy_assign_base &operator=(expected_copy_assign_base &&) = default;
+};
+
+// expected_move_assign_base decides whether move assignment can stay trivial.
+// This primary template is chosen when `T` (or void) and `E` are each trivially
+// destructible, move constructible and move assignable; it adds nothing. On
+// GCC < 5 (TL_EXPECTED_GCC49) those move traits are missing, so only a
+// declaration defaulting the bool to false is provided there.
+#ifndef TL_EXPECTED_GCC49
+template <class T, class E,
+ bool =
+ is_void_or<T, conjunction<std::is_trivially_destructible<T>,
+ std::is_trivially_move_constructible<T>,
+ std::is_trivially_move_assignable<T>>>::
+ value &&std::is_trivially_destructible<E>::value
+ &&std::is_trivially_move_constructible<E>::value
+ &&std::is_trivially_move_assignable<E>::value>
+struct expected_move_assign_base : expected_copy_assign_base<T, E> {
+ using expected_copy_assign_base<T, E>::expected_copy_assign_base;
+};
+#else
+template <class T, class E, bool = false> struct expected_move_assign_base;
+#endif
+
+// Chosen when move assignment cannot be trivial: operator=(&&) forwards to
+// the inherited assign(); every other special member is defaulted.
+template <class T, class E>
+struct expected_move_assign_base<T, E, false>
+    : expected_copy_assign_base<T, E> {
+  using expected_copy_assign_base<T, E>::expected_copy_assign_base;
+
+  expected_move_assign_base() = default;
+  expected_move_assign_base(const expected_move_assign_base &) = default;
+  expected_move_assign_base(expected_move_assign_base &&) = default;
+
+  expected_move_assign_base &
+  operator=(const expected_move_assign_base &) = default;
+
+  // Move assignment: all the work happens in assign(). The exception
+  // specification follows `T`'s move constructor and move assignment.
+  expected_move_assign_base &
+  operator=(expected_move_assign_base &&other) noexcept(
+      std::is_nothrow_move_constructible<T>::value
+          &&std::is_nothrow_move_assignable<T>::value) {
+    this->assign(std::move(other));
+    return *this;
+  }
+};
+
+// expected_delete_ctor_base is an empty mix-in whose copy and move
+// constructors are defaulted or deleted according to EnableCopy / EnableMove.
+// The flags default to "T is void or copy/move constructible, and E is
+// copy/move constructible". This primary template covers the case where both
+// are enabled, so everything is defaulted.
+template <class T, class E,
+          bool EnableCopy = (is_copy_constructible_or_void<T>::value &&
+                             std::is_copy_constructible<E>::value),
+          bool EnableMove = (is_move_constructible_or_void<T>::value &&
+                             std::is_move_constructible<E>::value)>
+struct expected_delete_ctor_base {
+  expected_delete_ctor_base() = default;
+
+  // Constructors: copy and move both available.
+  expected_delete_ctor_base(const expected_delete_ctor_base &) = default;
+  expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = default;
+
+  // Assignment is never restricted by this mix-in.
+  expected_delete_ctor_base &
+  operator=(const expected_delete_ctor_base &) = default;
+  expected_delete_ctor_base &
+  operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+// EnableCopy = true, EnableMove = false: the move constructor is deleted.
+template <class T, class E>
+struct expected_delete_ctor_base<T, E, true, false> {
+  expected_delete_ctor_base() = default;
+
+  // Constructors: copy available, move deleted.
+  expected_delete_ctor_base(const expected_delete_ctor_base &) = default;
+  expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = delete;
+
+  // Assignment is never restricted by this mix-in.
+  expected_delete_ctor_base &
+  operator=(const expected_delete_ctor_base &) = default;
+  expected_delete_ctor_base &
+  operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+// EnableCopy = false, EnableMove = true: the copy constructor is deleted.
+template <class T, class E>
+struct expected_delete_ctor_base<T, E, false, true> {
+  expected_delete_ctor_base() = default;
+
+  // Constructors: copy deleted, move available.
+  expected_delete_ctor_base(const expected_delete_ctor_base &) = delete;
+  expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = default;
+
+  // Assignment is never restricted by this mix-in.
+  expected_delete_ctor_base &
+  operator=(const expected_delete_ctor_base &) = default;
+  expected_delete_ctor_base &
+  operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+// EnableCopy = false, EnableMove = false: both constructors are deleted.
+template <class T, class E>
+struct expected_delete_ctor_base<T, E, false, false> {
+  expected_delete_ctor_base() = default;
+
+  // Constructors: copy and move both deleted.
+  expected_delete_ctor_base(const expected_delete_ctor_base &) = delete;
+  expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = delete;
+
+  // Assignment is never restricted by this mix-in.
+  expected_delete_ctor_base &
+  operator=(const expected_delete_ctor_base &) = default;
+  expected_delete_ctor_base &
+  operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+// expected_delete_assign_base is an empty mix-in whose copy and move
+// ASSIGNMENT OPERATORS are defaulted or deleted according to EnableCopy /
+// EnableMove. The flags default to "T (or void) and E are both
+// copy/move constructible and copy/move assignable". This primary template
+// covers the case where both are enabled, so everything is defaulted.
+template <class T, class E,
+          bool EnableCopy = (is_copy_constructible_or_void<T>::value &&
+                             std::is_copy_constructible<E>::value &&
+                             is_copy_assignable_or_void<T>::value &&
+                             std::is_copy_assignable<E>::value),
+          bool EnableMove = (is_move_constructible_or_void<T>::value &&
+                             std::is_move_constructible<E>::value &&
+                             is_move_assignable_or_void<T>::value &&
+                             std::is_move_assignable<E>::value)>
+struct expected_delete_assign_base {
+  expected_delete_assign_base() = default;
+
+  // Construction is never restricted by this mix-in.
+  expected_delete_assign_base(const expected_delete_assign_base &) = default;
+  expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+      default;
+
+  // Assignment: copy and move both available.
+  expected_delete_assign_base &
+  operator=(const expected_delete_assign_base &) = default;
+  expected_delete_assign_base &
+  operator=(expected_delete_assign_base &&) noexcept = default;
+};
+
+// EnableCopy = true, EnableMove = false: move assignment is deleted.
+template <class T, class E>
+struct expected_delete_assign_base<T, E, true, false> {
+  expected_delete_assign_base() = default;
+
+  // Construction is never restricted by this mix-in.
+  expected_delete_assign_base(const expected_delete_assign_base &) = default;
+  expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+      default;
+
+  // Assignment: copy available, move deleted.
+  expected_delete_assign_base &
+  operator=(const expected_delete_assign_base &) = default;
+  expected_delete_assign_base &
+  operator=(expected_delete_assign_base &&) noexcept = delete;
+};
+
+// EnableCopy = false, EnableMove = true: copy assignment is deleted.
+template <class T, class E>
+struct expected_delete_assign_base<T, E, false, true> {
+  expected_delete_assign_base() = default;
+
+  // Construction is never restricted by this mix-in.
+  expected_delete_assign_base(const expected_delete_assign_base &) = default;
+  expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+      default;
+
+  // Assignment: copy deleted, move available.
+  expected_delete_assign_base &
+  operator=(const expected_delete_assign_base &) = delete;
+  expected_delete_assign_base &
+  operator=(expected_delete_assign_base &&) noexcept = default;
+};
+
+// EnableCopy = false, EnableMove = false: both assignment operators are
+// deleted.
+template <class T, class E>
+struct expected_delete_assign_base<T, E, false, false> {
+  expected_delete_assign_base() = default;
+
+  // Construction is never restricted by this mix-in.
+  expected_delete_assign_base(const expected_delete_assign_base &) = default;
+  expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+      default;
+
+  // Assignment: copy and move both deleted.
+  expected_delete_assign_base &
+  operator=(const expected_delete_assign_base &) = delete;
+  expected_delete_assign_base &
+  operator=(expected_delete_assign_base &&) noexcept = delete;
+};
+
+// Tag type taken by the explicit constructor of expected_default_ctor_base, so
+// that base can still be initialised when its default constructor is deleted.
+struct default_constructor_tag {
+ explicit constexpr default_constructor_tag() = default;
+};
+
+// expected_default_ctor_base is an empty mix-in that ensures expected has a
+// deleted default constructor if T is not default constructible.
+// This primary template is for when T is default constructible or void, so
+// the default constructor is available.
+template <class T, class E,
+          bool Enable =
+              std::is_default_constructible<T>::value || std::is_void<T>::value>
+struct expected_default_ctor_base {
+  constexpr expected_default_ctor_base() noexcept = default;
+
+  // Always-available way to initialise this base from a derived constructor.
+  constexpr explicit expected_default_ctor_base(default_constructor_tag) {}
+
+  constexpr expected_default_ctor_base(
+      expected_default_ctor_base const &) noexcept = default;
+  constexpr expected_default_ctor_base(expected_default_ctor_base &&) noexcept =
+      default;
+
+  expected_default_ctor_base &
+  operator=(expected_default_ctor_base const &) noexcept = default;
+  expected_default_ctor_base &
+  operator=(expected_default_ctor_base &&) noexcept = default;
+};
+
+// Enable = false (T is neither default constructible nor void): the default
+// constructor is deleted, everything else matches the primary template.
+template <class T, class E> struct expected_default_ctor_base<T, E, false> {
+  constexpr expected_default_ctor_base() noexcept = delete;
+
+  // Always-available way to initialise this base from a derived constructor.
+  constexpr explicit expected_default_ctor_base(default_constructor_tag) {}
+
+  constexpr expected_default_ctor_base(
+      expected_default_ctor_base const &) noexcept = default;
+  constexpr expected_default_ctor_base(expected_default_ctor_base &&) noexcept =
+      default;
+
+  expected_default_ctor_base &
+  operator=(expected_default_ctor_base const &) noexcept = default;
+  expected_default_ctor_base &
+  operator=(expected_default_ctor_base &&) noexcept = default;
+};
+} // namespace detail
+
+template <class E> class bad_expected_access : public std::exception {
+public:
+ explicit bad_expected_access(E e) : m_val(std::move(e)) {}
+
+ virtual const char *what() const noexcept override {
+ return "Bad expected access";
+ }
+
+ const E &error() const & { return m_val; }
+ E &error() & { return m_val; }
+ const E &&error() const && { return std::move(m_val); }
+ E &&error() && { return std::move(m_val); }
+
+private:
+ E m_val;
+};
+
+/// An `expected<T, E>` object is an object that contains the storage for
+/// another object and manages the lifetime of this contained object `T`.
+/// Alternatively it could contain the storage for another unexpected object
+/// `E`. The contained object may not be initialized after the expected object
+/// has been initialized, and may not be destroyed before the expected object
+/// has been destroyed. The initialization state of the contained object is
+/// tracked by the expected object.
+template <class T, class E>
+class expected : private detail::expected_move_assign_base<T, E>,
+ private detail::expected_delete_ctor_base<T, E>,
+ private detail::expected_delete_assign_base<T, E>,
+ private detail::expected_default_ctor_base<T, E> {
+  // Compile-time restrictions on the template arguments. The tag and
+  // `unexpected<E>` checks are made against the cv-stripped `T` through the
+  // trait's nested `::type`; naming the `std::remove_cv<...>` trait class
+  // itself would never compare equal to `T` and the assertion could not fire.
+  static_assert(!std::is_reference<T>::value, "T must not be a reference");
+  static_assert(
+      !std::is_same<typename std::remove_cv<T>::type, in_place_t>::value,
+      "T must not be in_place_t");
+  static_assert(
+      !std::is_same<typename std::remove_cv<T>::type, unexpect_t>::value,
+      "T must not be unexpect_t");
+  static_assert(
+      !std::is_same<typename std::remove_cv<T>::type, unexpected<E>>::value,
+      "T must not be unexpected<E>");
+  static_assert(!std::is_reference<E>::value, "E must not be a reference");
+
+ T *valptr() { return std::addressof(this->m_val); }
+ const T *valptr() const { return std::addressof(this->m_val); }
+ unexpected<E> *errptr() { return std::addressof(this->m_unexpect); }
+ const unexpected<E> *errptr() const { return std::addressof(this->m_unexpect); }
+
+  // References to the value member of the inherited storage. These are
+  // templates so that they drop out of overload resolution when `T` is void.
+  // Neither overload looks at `m_has_val`.
+  template <class U = T,
+            detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+  U &val() {
+    return this->m_val;
+  }
+  template <class U = T,
+            detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+  const U &val() const {
+    return this->m_val;
+  }
+
+  // References to the error member of the inherited storage, also unchecked.
+  unexpected<E> &err() { return this->m_unexpect; }
+  const unexpected<E> &err() const { return this->m_unexpect; }
+
+  // Shorthands for the two bases that the constructors initialise explicitly.
+  using impl_base = detail::expected_move_assign_base<T, E>;
+  using ctor_base = detail::expected_default_ctor_base<T, E>;
+
+public:
+  // Public member types.
+  using value_type = T;
+  using error_type = E;
+  using unexpected_type = unexpected<E>;
+
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+ /// \group and_then
+ /// Chains an operation which itself returns an expected onto the stored value
+ /// if there is one. \requires `std::invoke(std::forward<F>(f), value())`
+ /// returns an `expected<U, E>` for some `U`. \returns Let `U` be the result
+ /// of `std::invoke(std::forward<F>(f), value())`. Returns an
+ /// `expected<U, E>`. Forwards to `and_then_impl`; if `*this` holds an error it
+ /// is presumably propagated without calling `f` (confirm there),
+ /// otherwise the result of the invocation is returned.
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) & {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) && {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+ template <class F> constexpr auto and_then(F &&f) const & {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+ template <class F> constexpr auto and_then(F &&f) const && {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+
+#else
+ /// \group and_then
+ /// Chains an operation which itself returns an expected onto the stored value
+ /// if there is one. \requires `std::invoke(std::forward<F>(f), value())`
+ /// returns an `expected<U, E>` for some `U`. \returns Let `U` be the result
+ /// of `std::invoke(std::forward<F>(f), value())`. Returns an
+ /// `expected<U, E>`. Forwards to `and_then_impl`; if `*this` holds an error it
+ /// is presumably propagated without calling `f` (confirm there),
+ /// otherwise the result of the invocation is returned.
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR auto
+ and_then(F &&f) & -> decltype(and_then_impl(*this, std::forward<F>(f))) {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) && -> decltype(
+ and_then_impl(std::move(*this), std::forward<F>(f))) {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+ template <class F>
+ constexpr auto and_then(F &&f) const & -> decltype(
+ and_then_impl(*this, std::forward<F>(f))) {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+ template <class F>
+ constexpr auto and_then(F &&f) const && -> decltype(
+ and_then_impl(std::move(*this), std::forward<F>(f))) {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+#endif
+
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+ /// \brief Carries out some operation on the stored object if there is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// value())`. If `U` is `void`, returns an `expected<monostate,E>`, otherwise
+ /// returns an `expected<U,E>`. If `*this` is unexpected, the
+ /// result is `*this`, otherwise an `expected<U,E>` is constructed from the
+ /// return value of `std::invoke(std::forward<F>(f), value())` and is
+ /// returned.
+ ///
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map(F &&f) & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map(F &&f) && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &;
+ template <class F> constexpr auto map(F &&f) const & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &&;
+ template <class F> constexpr auto map(F &&f) const && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+#else
+ /// \brief Carries out some operation on the stored object if there is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// value())`. If `U` is `void`, returns an `expected<monostate,E>`, otherwise
+ /// returns an `expected<U,E>`. If `*this` is unexpected, the
+ /// result is `*this`, otherwise an `expected<U,E>` is constructed from the
+ /// return value of `std::invoke(std::forward<F>(f), value())` and is
+ /// returned.
+ ///
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(
+ expected_map_impl(std::declval<expected &>(), std::declval<F &&>()))
+ map(F &&f) & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(
+ expected_map_impl(std::declval<expected>(), std::declval<F &&>()))
+ map(F &&f) && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &;
+ template <class F>
+ constexpr decltype(expected_map_impl(std::declval<const expected &>(),
+ std::declval<F &&>()))
+ map(F &&f) const & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &&;
+ template <class F>
+ constexpr decltype(expected_map_impl(std::declval<const expected &&>(),
+ std::declval<F &&>()))
+ map(F &&f) const && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+#endif
+
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+ /// \brief Carries out some operation on the stored unexpected object if there
+ /// is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// error())`. If `U` is `void`, returns an `expected<T,monostate>`, otherwise
+ /// returns an `expected<T,U>`. If `*this` has an expected
+ /// value, the result is `*this`, otherwise an `expected<T,U>` is constructed
+ /// from `make_unexpected(std::invoke(std::forward<F>(f), error()))` and is
+ /// returned.
+ ///
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map_error(F &&f) & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &&;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map_error(F &&f) && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &;
+ template <class F> constexpr auto map_error(F &&f) const & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &&;
+ template <class F> constexpr auto map_error(F &&f) const && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+#else
+ /// \brief Carries out some operation on the stored unexpected object if there
+ /// is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// error())`. Returns an `expected<T,U>`. If `*this` has an expected
+ /// value, the result is `*this`, otherwise an `expected<T,U>` is constructed
+ /// from `make_unexpected(std::invoke(std::forward<F>(f), error()))` and is
+ /// returned.
+ ///
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(map_error_impl(std::declval<expected &>(),
+ std::declval<F &&>()))
+ map_error(F &&f) & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &&;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(map_error_impl(std::declval<expected &&>(),
+ std::declval<F &&>()))
+ map_error(F &&f) && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &;
+ template <class F>
+ constexpr decltype(map_error_impl(std::declval<const expected &>(),
+ std::declval<F &&>()))
+ map_error(F &&f) const & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &&;
+ template <class F>
+ constexpr decltype(map_error_impl(std::declval<const expected &&>(),
+ std::declval<F &&>()))
+ map_error(F &&f) const && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+#endif
+
+ /// \brief Calls `f` if the expected is in the unexpected state
+ /// \requires `F` is invokable with `E`, and `std::invoke_result_t<F>`
+ /// must be void or convertible to `expected<T,E>`.
+ /// \effects If `*this` has a value, returns `*this`.
+ /// Otherwise, if `f` returns `void`, calls `std::forward<F>(f)(E)` and returns
+ /// presumably `*this` (see `or_else_impl`). Otherwise, returns `std::forward<F>(f)(E)`.
+ ///
+ /// \group or_else
+ template <class F> expected TL_EXPECTED_11_CONSTEXPR or_else(F &&f) & {
+ return or_else_impl(*this, std::forward<F>(f));
+ }
+
+ template <class F> expected TL_EXPECTED_11_CONSTEXPR or_else(F &&f) && {
+ return or_else_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ template <class F> expected constexpr or_else(F &&f) const & {
+ return or_else_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ template <class F> expected constexpr or_else(F &&f) const && {
+ return or_else_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+ // Default construction, copy/move construction and copy/move assignment are
+ // all compiler-generated here.
+ // NOTE(review): the effective behaviour (and whether each one is deleted)
+ // presumably comes from the base classes such as impl_base / ctor_base,
+ // which are declared outside this section -- confirm there.
+ constexpr expected() = default;
+ constexpr expected(const expected &rhs) = default;
+ constexpr expected(expected &&rhs) = default;
+ expected &operator=(const expected &rhs) = default;
+ expected &operator=(expected &&rhs) = default;
+
+ // In-place value constructor: forwards `args` (together with the in_place
+ // tag) to impl_base. Only participates in overload resolution when `T` is
+ // constructible from `Args &&...`.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected(in_place_t, Args &&... args)
+ : impl_base(in_place, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ // In-place value constructor taking a leading initializer list. Only
+ // participates when `T` is constructible from
+ // `std::initializer_list<U> &, Args &&...`.
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected(in_place_t, std::initializer_list<U> il, Args &&... args)
+ : impl_base(in_place, il, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \group unexpected_ctor
+ /// \synopsis EXPLICIT constexpr expected(const unexpected<G> &e);
+ // Copying constructor from an unexpected: stores `e.value()` as the error.
+ // This overload is the explicit one, selected when `E` is constructible
+ // from `const G &` but `const G &` is not implicitly convertible to `E`.
+ template <class G = E,
+ detail::enable_if_t<std::is_constructible<E, const G &>::value> * =
+ nullptr,
+ detail::enable_if_t<!std::is_convertible<const G &, E>::value> * =
+ nullptr>
+ explicit constexpr expected(const unexpected<G> &e)
+ : impl_base(unexpect, e.value()),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \exclude
+ // Implicit counterpart of the constructor above, selected when
+ // `const G &` is implicitly convertible to `E`.
+ template <
+ class G = E,
+ detail::enable_if_t<std::is_constructible<E, const G &>::value> * =
+ nullptr,
+ detail::enable_if_t<std::is_convertible<const G &, E>::value> * = nullptr>
+ constexpr expected(unexpected<G> const &e)
+ : impl_base(unexpect, e.value()),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \group unexpected_ctor
+ /// \synopsis EXPLICIT constexpr expected(unexpected<G> &&e);
+ // Moving constructor from an unexpected: the error is built from
+ // `std::move(e.value())`. Explicit overload, selected when `G &&` is not
+ // implicitly convertible to `E`. noexcept exactly when `E` is nothrow
+ // constructible from `G &&`.
+ template <
+ class G = E,
+ detail::enable_if_t<std::is_constructible<E, G &&>::value> * = nullptr,
+ detail::enable_if_t<!std::is_convertible<G &&, E>::value> * = nullptr>
+ explicit constexpr expected(unexpected<G> &&e) noexcept(
+ std::is_nothrow_constructible<E, G &&>::value)
+ : impl_base(unexpect, std::move(e.value())),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \exclude
+ // Implicit counterpart of the moving constructor above, selected when
+ // `G &&` is implicitly convertible to `E`.
+ template <
+ class G = E,
+ detail::enable_if_t<std::is_constructible<E, G &&>::value> * = nullptr,
+ detail::enable_if_t<std::is_convertible<G &&, E>::value> * = nullptr>
+ constexpr expected(unexpected<G> &&e) noexcept(
+ std::is_nothrow_constructible<E, G &&>::value)
+ : impl_base(unexpect, std::move(e.value())),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ // In-place error constructor: forwards `args` (together with the unexpect
+ // tag) to impl_base. Always explicit; only participates when `E` is
+ // constructible from `Args &&...`.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected(unexpect_t, Args &&... args)
+ : impl_base(unexpect, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \exclude
+ // In-place error constructor taking a leading initializer list. Only
+ // participates when `E` is constructible from
+ // `std::initializer_list<U> &, Args &&...`.
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected(unexpect_t, std::initializer_list<U> il,
+ Args &&... args)
+ : impl_base(unexpect, il, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ // Converting copy constructor from `expected<U, G>` (explicit flavour).
+ // Chosen when at least one of `U const & -> T` / `G const & -> E` is not an
+ // implicit conversion. Copies the value of `rhs` if it has one, otherwise
+ // copies its error.
+ // NOTE(review): detail::expected_enable_from_other is defined outside this
+ // section; it presumably rejects ambiguous conversions -- confirm there.
+ template <class U, class G,
+ detail::enable_if_t<!(std::is_convertible<U const &, T>::value &&
+ std::is_convertible<G const &, E>::value)> * =
+ nullptr,
+ detail::expected_enable_from_other<T, E, U, G, const U &, const G &>
+ * = nullptr>
+ explicit TL_EXPECTED_11_CONSTEXPR expected(const expected<U, G> &rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(*rhs);
+ } else {
+ this->construct_error(rhs.error());
+ }
+ }
+
+ /// \exclude
+ // Implicit flavour of the converting copy constructor: chosen when both
+ // `U const & -> T` and `G const & -> E` are implicit conversions.
+ template <class U, class G,
+ detail::enable_if_t<(std::is_convertible<U const &, T>::value &&
+ std::is_convertible<G const &, E>::value)> * =
+ nullptr,
+ detail::expected_enable_from_other<T, E, U, G, const U &, const G &>
+ * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR expected(const expected<U, G> &rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(*rhs);
+ } else {
+ this->construct_error(rhs.error());
+ }
+ }
+
+ // Converting move constructor from `expected<U, G>` (explicit flavour).
+ // Moves the value of `rhs` if it has one, otherwise moves its error.
+ template <
+ class U, class G,
+ detail::enable_if_t<!(std::is_convertible<U &&, T>::value &&
+ std::is_convertible<G &&, E>::value)> * = nullptr,
+ detail::expected_enable_from_other<T, E, U, G, U &&, G &&> * = nullptr>
+ explicit TL_EXPECTED_11_CONSTEXPR expected(expected<U, G> &&rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(std::move(*rhs));
+ } else {
+ this->construct_error(std::move(rhs.error()));
+ }
+ }
+
+ /// \exclude
+ // Implicit flavour of the converting move constructor: chosen when both
+ // `U && -> T` and `G && -> E` are implicit conversions.
+ template <
+ class U, class G,
+ detail::enable_if_t<(std::is_convertible<U &&, T>::value &&
+ std::is_convertible<G &&, E>::value)> * = nullptr,
+ detail::expected_enable_from_other<T, E, U, G, U &&, G &&> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR expected(expected<U, G> &&rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(std::move(*rhs));
+ } else {
+ this->construct_error(std::move(rhs.error()));
+ }
+ }
+
+ // Value constructor (explicit flavour): delegates to the in_place
+ // constructor with `v` forwarded. Chosen when `U &&` is not implicitly
+ // convertible to `T`.
+ // NOTE(review): detail::expected_enable_forward_value is defined outside
+ // this section; see it for the remaining participation conditions.
+ template <
+ class U = T,
+ detail::enable_if_t<!std::is_convertible<U &&, T>::value> * = nullptr,
+ detail::expected_enable_forward_value<T, E, U> * = nullptr>
+ explicit TL_EXPECTED_MSVC2015_CONSTEXPR expected(U &&v)
+ : expected(in_place, std::forward<U>(v)) {}
+
+ /// \exclude
+ // Implicit flavour of the value constructor: chosen when `U &&` is
+ // implicitly convertible to `T`.
+ template <
+ class U = T,
+ detail::enable_if_t<std::is_convertible<U &&, T>::value> * = nullptr,
+ detail::expected_enable_forward_value<T, E, U> * = nullptr>
+ TL_EXPECTED_MSVC2015_CONSTEXPR expected(U &&v)
+ : expected(in_place, std::forward<U>(v)) {}
+
+ // Value assignment for the case where constructing `T` from `U &&` cannot
+ // throw. If a value is already held it is assigned to; otherwise the stored
+ // error is destroyed and a `T` is constructed in its place. No rollback is
+ // needed because this overload is only enabled for nothrow construction.
+ // Returns `*this`.
+ template <
+ class U = T, class G = T,
+ detail::enable_if_t<std::is_nothrow_constructible<T, U &&>::value> * =
+ nullptr,
+ detail::enable_if_t<!std::is_void<G>::value> * = nullptr,
+ detail::enable_if_t<
+ (!std::is_same<expected<T, E>, detail::decay_t<U>>::value &&
+ !detail::conjunction<std::is_scalar<T>,
+ std::is_same<T, detail::decay_t<U>>>::value &&
+ std::is_constructible<T, U>::value &&
+ std::is_assignable<G &, U>::value &&
+ std::is_nothrow_move_constructible<E>::value)> * = nullptr>
+ expected &operator=(U &&v) {
+ if (has_value()) {
+ val() = std::forward<U>(v);
+ } else {
+ // Switch from the error state to the value state.
+ err().~unexpected<E>();
+ ::new (valptr()) T(std::forward<U>(v));
+ this->m_has_val = true;
+ }
+
+ return *this;
+ }
+
+ /// \exclude
+ // Value assignment for the case where constructing `T` from `U &&` may
+ // throw. If a value is already held it is assigned to. Otherwise the stored
+ // error is moved aside and destroyed, and a `T` is constructed in its place;
+ // if that construction throws, the error is re-created from the saved copy
+ // so `*this` is left in its original (error) state. Returns `*this`.
+ //
+ // `v` is a forwarding reference and is therefore passed on with
+ // std::forward: an lvalue argument is copied from, never moved from.
+ template <
+     class U = T, class G = T,
+     detail::enable_if_t<!std::is_nothrow_constructible<T, U &&>::value> * =
+         nullptr,
+     detail::enable_if_t<!std::is_void<U>::value> * = nullptr,
+     detail::enable_if_t<
+         (!std::is_same<expected<T, E>, detail::decay_t<U>>::value &&
+          !detail::conjunction<std::is_scalar<T>,
+                               std::is_same<T, detail::decay_t<U>>>::value &&
+          std::is_constructible<T, U>::value &&
+          std::is_assignable<G &, U>::value &&
+          std::is_nothrow_move_constructible<E>::value)> * = nullptr>
+ expected &operator=(U &&v) {
+   if (has_value()) {
+     val() = std::forward<U>(v);
+   } else {
+     // Keep the error alive outside the storage for a possible rollback.
+     auto tmp = std::move(err());
+     err().~unexpected<E>();
+
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+     try {
+       ::new (valptr()) T(std::forward<U>(v));
+       this->m_has_val = true;
+     } catch (...) {
+       // The old error object was destroyed above, so it must be
+       // re-constructed in place; assigning to it would touch a dead object.
+       ::new (errptr()) unexpected<E>(std::move(tmp));
+       throw;
+     }
+ #else
+     ::new (valptr()) T(std::forward<U>(v));
+     this->m_has_val = true;
+ #endif
+   }
+
+   return *this;
+ }
+
+ // Assigns an error by copy. If an error is already held it is assigned to;
+ // otherwise the stored value is destroyed and an `unexpected<E>` is
+ // copy-constructed in its place. Enabled only when `G` is nothrow copy
+ // constructible and assignable from `G`. Returns `*this`.
+ template <class G = E,
+ detail::enable_if_t<std::is_nothrow_copy_constructible<G>::value &&
+ std::is_assignable<G &, G>::value> * = nullptr>
+ expected &operator=(const unexpected<G> &rhs) {
+ if (!has_value()) {
+ err() = rhs;
+ } else {
+ // Switch from the value state to the error state.
+ this->destroy_val();
+ ::new (errptr()) unexpected<E>(rhs);
+ this->m_has_val = false;
+ }
+
+ return *this;
+ }
+
+ // Assigns an error by move; same structure as the copy overload above but
+ // moving from `rhs`. Enabled only when `G` is nothrow move constructible and
+ // move assignable. Declared noexcept.
+ template <class G = E,
+ detail::enable_if_t<std::is_nothrow_move_constructible<G>::value &&
+ std::is_move_assignable<G>::value> * = nullptr>
+ expected &operator=(unexpected<G> &&rhs) noexcept {
+ if (!has_value()) {
+ err() = std::move(rhs);
+ } else {
+ // Switch from the value state to the error state.
+ this->destroy_val();
+ ::new (errptr()) unexpected<E>(std::move(rhs));
+ this->m_has_val = false;
+ }
+
+ return *this;
+ }
+
+ // Replaces the contents with a `T` built from `args`, for the case where
+ // that construction cannot throw. With a value held, a temporary `T` is
+ // built and assigned over it; with an error held, the error is destroyed
+ // and the `T` is constructed directly in the storage.
+ template <class... Args, detail::enable_if_t<std::is_nothrow_constructible<
+ T, Args &&...>::value> * = nullptr>
+ void emplace(Args &&... args) {
+ if (has_value()) {
+ val() = T(std::forward<Args>(args)...);
+ } else {
+ // Switch from the error state to the value state.
+ err().~unexpected<E>();
+ ::new (valptr()) T(std::forward<Args>(args)...);
+ this->m_has_val = true;
+ }
+ }
+
+ /// \exclude
+ // Replaces the contents with a `T` built from `args`, for the case where
+ // that construction may throw. With a value held, a temporary `T` is built
+ // and assigned over it. With an error held, the error is moved aside and
+ // destroyed before the `T` is constructed in the storage; if construction
+ // throws, the error is re-created from the saved copy so `*this` stays in
+ // its original (error) state, and the exception is rethrown.
+ template <class... Args, detail::enable_if_t<!std::is_nothrow_constructible<
+                              T, Args &&...>::value> * = nullptr>
+ void emplace(Args &&... args) {
+   if (has_value()) {
+     val() = T(std::forward<Args>(args)...);
+   } else {
+     // Keep the error alive outside the storage for a possible rollback.
+     auto tmp = std::move(err());
+     err().~unexpected<E>();
+
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+     try {
+       ::new (valptr()) T(std::forward<Args>(args)...);
+       this->m_has_val = true;
+     } catch (...) {
+       // The old error object was destroyed above, so it must be
+       // re-constructed in place; assigning to it would touch a dead object.
+       ::new (errptr()) unexpected<E>(std::move(tmp));
+       throw;
+     }
+ #else
+     ::new (valptr()) T(std::forward<Args>(args)...);
+     this->m_has_val = true;
+ #endif
+   }
+ }
+
+ // Initializer-list flavour of emplace for the case where constructing `T`
+ // from `il, args...` cannot throw. With a value held, a local `T` is built
+ // and move-assigned over it; with an error held, the error is destroyed and
+ // the `T` is constructed directly in the storage.
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_nothrow_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ void emplace(std::initializer_list<U> il, Args &&... args) {
+ if (has_value()) {
+ T t(il, std::forward<Args>(args)...);
+ val() = std::move(t);
+ } else {
+ // Switch from the error state to the value state.
+ err().~unexpected<E>();
+ ::new (valptr()) T(il, std::forward<Args>(args)...);
+ this->m_has_val = true;
+ }
+ }
+
+ /// \exclude
+ // Initializer-list flavour of emplace for the case where constructing `T`
+ // from `il, args...` may throw. With a value held, a local `T` is built and
+ // move-assigned over it. With an error held, the error is moved aside and
+ // destroyed before the `T` is constructed in the storage; if construction
+ // throws, the error is re-created from the saved copy so `*this` stays in
+ // its original (error) state, and the exception is rethrown.
+ template <class U, class... Args,
+           detail::enable_if_t<!std::is_nothrow_constructible<
+               T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ void emplace(std::initializer_list<U> il, Args &&... args) {
+   if (has_value()) {
+     T t(il, std::forward<Args>(args)...);
+     val() = std::move(t);
+   } else {
+     // Keep the error alive outside the storage for a possible rollback.
+     auto tmp = std::move(err());
+     err().~unexpected<E>();
+
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+     try {
+       ::new (valptr()) T(il, std::forward<Args>(args)...);
+       this->m_has_val = true;
+     } catch (...) {
+       // The old error object was destroyed above, so it must be
+       // re-constructed in place; assigning to it would touch a dead object.
+       ::new (errptr()) unexpected<E>(std::move(tmp));
+       throw;
+     }
+ #else
+     ::new (valptr()) T(il, std::forward<Args>(args)...);
+     this->m_has_val = true;
+ #endif
+   }
+ }
+
+ // TODO SFINAE
+ // Exchanges the contents of `*this` and `rhs`.
+ //  - both hold values:  the values are swapped;
+ //  - both hold errors:  the errors are swapped;
+ //  - mixed:             the value is moved into the side that held the
+ //                       error, the error into the side that held the value,
+ //                       and the two has-value flags are exchanged.
+ // In the mixed case every object that is replaced is destroyed first, so no
+ // live object is ever overwritten by placement new.
+ // NOTE(review): the exception specification is kept exactly as it was; the
+ // unqualified `swap` inside it is looked up in class scope, which may find
+ // this member rather than a free swap -- confirm it instantiates as intended.
+ void swap(expected &rhs) noexcept(
+     std::is_nothrow_move_constructible<T>::value &&noexcept(
+         swap(std::declval<T &>(), std::declval<T &>())) &&
+     std::is_nothrow_move_constructible<E>::value &&
+     noexcept(swap(std::declval<E &>(), std::declval<E &>()))) {
+   using std::swap;
+   if (has_value() && rhs.has_value()) {
+     swap(val(), rhs.val());
+   } else if (!has_value() && !rhs.has_value()) {
+     swap(err(), rhs.err());
+   } else if (!has_value()) {
+     // Mixed state with the value on the other side: let `rhs` drive so the
+     // branch below only has to handle "value here, error there".
+     rhs.swap(*this);
+   } else {
+     // `*this` holds a value and `rhs` holds an error.
+     auto temp = std::move(rhs.err());
+     rhs.err().~unexpected<E>();
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+     try {
+       ::new (rhs.valptr()) T(std::move(val()));
+     } catch (...) {
+       // Put the error back so `rhs` is left as it was before the call.
+       ::new (rhs.errptr()) unexpected_type(std::move(temp));
+       throw;
+     }
+ #else
+     ::new (rhs.valptr()) T(std::move(val()));
+ #endif
+     // The value now lives in `rhs`; release the moved-from one here before
+     // its storage is reused for the error.
+     this->destroy_val();
+     ::new (errptr()) unexpected_type(std::move(temp));
+     std::swap(this->m_has_val, rhs.m_has_val);
+   }
+ }
+
+ /// \returns a pointer to the stored value
+ /// \requires a value is stored
+ /// \group pointer
+ // No check is performed here: the pointer from valptr() is returned as is.
+ constexpr const T *operator->() const { return valptr(); }
+ /// \group pointer
+ TL_EXPECTED_11_CONSTEXPR T *operator->() { return valptr(); }
+
+ /// \returns the stored value
+ /// \requires a value is stored
+ /// \group deref
+ // All four overloads are templates on `U = T` so they can be disabled when
+ // `T` is void; none of them checks has_value().
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ constexpr const U &operator*() const & {
+ return val();
+ }
+ /// \group deref
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &operator*() & {
+ return val();
+ }
+ /// \group deref
+ // Rvalue overloads return an rvalue reference to the stored value.
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ constexpr const U &&operator*() const && {
+ return std::move(val());
+ }
+ /// \group deref
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &&operator*() && {
+ return std::move(val());
+ }
+
+ /// \returns whether or not the expected has a value
+ /// \group has_value
+ constexpr bool has_value() const noexcept { return this->m_has_val; }
+ /// \group has_value
+ // Explicit, so it applies in boolean contexts such as `if (e)` but not in
+ // implicit conversions.
+ constexpr explicit operator bool() const noexcept { return this->m_has_val; }
+
+ /// \returns the contained value if there is one, otherwise throws
+ /// [bad_expected_access]
+ ///
+ /// \group value
+ // Checked access: when no value is held, a `bad_expected_access<E>` built
+ // from the stored error is handed to detail::throw_exception. Disabled
+ // when `T` is void.
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR const U &value() const & {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return val();
+ }
+ /// \group value
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &value() & {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return val();
+ }
+ /// \group value
+ // Rvalue overloads perform the same check and return an rvalue reference.
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR const U &&value() const && {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return std::move(val());
+ }
+ /// \group value
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &&value() && {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return std::move(val());
+ }
+
+ /// \returns the unexpected value
+ /// \requires there is an unexpected value
+ /// \group error
+ // Unchecked access: none of the overloads tests has_value() first.
+ constexpr const E &error() const & { return err().value(); }
+ /// \group error
+ TL_EXPECTED_11_CONSTEXPR E &error() & { return err().value(); }
+ /// \group error
+ // Rvalue overloads return an rvalue reference to the stored error.
+ constexpr const E &&error() const && { return std::move(err().value()); }
+ /// \group error
+ TL_EXPECTED_11_CONSTEXPR E &&error() && { return std::move(err().value()); }
+
+ /// \returns the stored value if there is one, otherwise returns `v`
+ /// \group value_or
+ // Lvalue overload: the stored value is copied out. `v` is converted to `T`
+ // only when no value is held.
+ template <class U> constexpr T value_or(U &&v) const & {
+ static_assert(std::is_copy_constructible<T>::value &&
+ std::is_convertible<U &&, T>::value,
+ "T must be copy-constructible and convertible to from U&&");
+ return bool(*this) ? **this : static_cast<T>(std::forward<U>(v));
+ }
+ /// \group value_or
+ // Rvalue overload: the stored value is moved out instead of copied.
+ template <class U> TL_EXPECTED_11_CONSTEXPR T value_or(U &&v) && {
+ static_assert(std::is_move_constructible<T>::value &&
+ std::is_convertible<U &&, T>::value,
+ "T must be move-constructible and convertible to from U&&");
+ return bool(*this) ? std::move(**this) : static_cast<T>(std::forward<U>(v));
+ }
+};
+
+/// \exclude
+namespace detail {
+// exp_t<Exp>: the value_type of the (possibly cv/ref-qualified) expected Exp.
+template <class Exp> using exp_t = typename detail::decay_t<Exp>::value_type;
+// err_t<Exp>: the error_type of the (possibly cv/ref-qualified) expected Exp.
+template <class Exp> using err_t = typename detail::decay_t<Exp>::error_type;
+// ret_t<Exp, Ret>: an expected holding Ret with the same error type as Exp.
+template <class Exp, class Ret> using ret_t = expected<Ret, err_t<Exp>>;
+
+// and_then_impl(exp, f)
+//
+// If `exp` holds a value, returns the result of invoking `f` (with the value,
+// or with no arguments when the value type is void). Otherwise returns a
+// `Ret` holding the error of `exp`. `Ret`, the return type of `f`, must be an
+// expected; this is enforced with a static_assert.
+//
+// The error is taken from `std::forward<Exp>(exp)` so that an rvalue `exp`
+// hands its error over by move instead of copying it, matching what
+// expected_map_impl and map_error_impl do. Only one arm of each conditional
+// is evaluated, so `exp` is never read after being moved from.
+//
+// Two variants are provided: a C++14 one using return type deduction and a
+// C++11 one spelling the return type as `Ret`.
+#ifdef TL_EXPECTED_CXX14
+template <class Exp, class F,
+          detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Exp>()))>
+constexpr auto and_then_impl(Exp &&exp, F &&f) {
+  static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+  return exp.has_value()
+             ? detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp))
+             : Ret(unexpect, std::forward<Exp>(exp).error());
+}
+
+// Overload for expected<void, E>: `f` is invoked without arguments.
+template <class Exp, class F,
+          detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+          class Ret = decltype(detail::invoke(std::declval<F>()))>
+constexpr auto and_then_impl(Exp &&exp, F &&f) {
+  static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+  return exp.has_value() ? detail::invoke(std::forward<F>(f))
+                         : Ret(unexpect, std::forward<Exp>(exp).error());
+}
+#else
+template <class> struct TC;
+template <class Exp, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Exp>())),
+          detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr>
+auto and_then_impl(Exp &&exp, F &&f) -> Ret {
+  static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+  return exp.has_value()
+             ? detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp))
+             : Ret(unexpect, std::forward<Exp>(exp).error());
+}
+
+// Overload for expected<void, E>: `f` is invoked without arguments.
+template <class Exp, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>())),
+          detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr>
+constexpr auto and_then_impl(Exp &&exp, F &&f) -> Ret {
+  static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+  return exp.has_value() ? detail::invoke(std::forward<F>(f))
+                         : Ret(unexpect, std::forward<Exp>(exp).error());
+}
+#endif
+
+// expected_map_impl(exp, f)
+//
+// If `exp` holds a value, invokes `f` (with the value, or with no arguments
+// when the value type is void) and wraps the result in an expected that keeps
+// the error type of `exp`. Otherwise the error of `exp` is forwarded into the
+// result. When `f` returns void the result type is expected<void, E>.
+//
+// There are four overloads (value type void or not, times `f` returning void
+// or not), each provided once for C++14 (deduced return type) and once for
+// C++11 (explicit trailing return type).
+#ifdef TL_EXPECTED_CXX14
+// Non-void value, non-void result.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f),
+ *std::forward<Exp>(exp)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+// Non-void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = expected<void, err_t<Exp>>;
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp));
+ return result();
+ }
+
+ return result(unexpect, std::forward<Exp>(exp).error());
+}
+
+// Void value, non-void result.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+// Void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = expected<void, err_t<Exp>>;
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f));
+ return result();
+ }
+
+ return result(unexpect, std::forward<Exp>(exp).error());
+}
+#else
+// C++11 variants: same four cases with the return type written out.
+// Non-void value, non-void result.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+
+constexpr auto expected_map_impl(Exp &&exp, F &&f)
+ -> ret_t<Exp, detail::decay_t<Ret>> {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f),
+ *std::forward<Exp>(exp)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+// Non-void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+
+auto expected_map_impl(Exp &&exp, F &&f) -> expected<void, err_t<Exp>> {
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp));
+ return {};
+ }
+
+ return unexpected<err_t<Exp>>(std::forward<Exp>(exp).error());
+}
+
+// Void value, non-void result.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+
+constexpr auto expected_map_impl(Exp &&exp, F &&f)
+ -> ret_t<Exp, detail::decay_t<Ret>> {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+// Void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+
+auto expected_map_impl(Exp &&exp, F &&f) -> expected<void, err_t<Exp>> {
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f));
+ return {};
+ }
+
+ return unexpected<err_t<Exp>>(std::forward<Exp>(exp).error());
+}
+#endif
+
+// map_error_impl(exp, f)
+//
+// If `exp` holds a value, returns an expected with the same value type that
+// holds that value (or is simply value-initialised when the value type is
+// void). Otherwise invokes `f` on the error of `exp`:
+//  - if `f` returns non-void, the decayed result becomes the new error;
+//  - if `f` returns void, the result is an expected<T, monostate> in the
+//    error state.
+//
+// The deduced-return-type variants are used for C++14 builds except on the
+// GCC versions flagged by TL_EXPECTED_GCC49/54/55; everything else uses the
+// variants with explicit trailing return types.
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+// Non-void value, `f` returns non-void.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+ return exp.has_value()
+ ? result(*std::forward<Exp>(exp))
+ : result(unexpect, detail::invoke(std::forward<F>(f),
+ std::forward<Exp>(exp).error()));
+}
+// Non-void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, monostate>;
+ if (exp.has_value()) {
+ return result(*std::forward<Exp>(exp));
+ }
+
+ detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+ return result(unexpect, monostate{});
+}
+// Void value, `f` returns non-void.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+ return exp.has_value()
+ ? result()
+ : result(unexpect, detail::invoke(std::forward<F>(f),
+ std::forward<Exp>(exp).error()));
+}
+// Void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, monostate>;
+ if (exp.has_value()) {
+ return result();
+ }
+
+ detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+ return result(unexpect, monostate{});
+}
+#else
+// Variants with explicit trailing return types: same four cases as above.
+// Non-void value, `f` returns non-void.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f)
+ -> expected<exp_t<Exp>, detail::decay_t<Ret>> {
+ using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+
+ return exp.has_value()
+ ? result(*std::forward<Exp>(exp))
+ : result(unexpect, detail::invoke(std::forward<F>(f),
+ std::forward<Exp>(exp).error()));
+}
+
+// Non-void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) -> expected<exp_t<Exp>, monostate> {
+ using result = expected<exp_t<Exp>, monostate>;
+ if (exp.has_value()) {
+ return result(*std::forward<Exp>(exp));
+ }
+
+ detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+ return result(unexpect, monostate{});
+}
+
+// Void value, `f` returns non-void.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f)
+ -> expected<exp_t<Exp>, detail::decay_t<Ret>> {
+ using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+
+ return exp.has_value()
+ ? result()
+ : result(unexpect, detail::invoke(std::forward<F>(f),
+ std::forward<Exp>(exp).error()));
+}
+
+// Void value, `f` returns void.
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) -> expected<exp_t<Exp>, monostate> {
+ using result = expected<exp_t<Exp>, monostate>;
+ if (exp.has_value()) {
+ return result();
+ }
+
+ detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+ return result(unexpect, monostate{});
+}
+#endif
+
+// or_else_impl(exp, f)
+//
+// If `exp` holds a value, `exp` itself is forwarded as the result. Otherwise
+// `f` is invoked with the error of `exp`:
+//  - if `f` returns non-void, that result is returned (it must be an
+//    expected; enforced with a static_assert);
+//  - if `f` returns void, `f` is called for its side effects and `exp` is
+//    forwarded as the result.
+//
+// As elsewhere, a C++14 variant and a C++11 variant are provided.
+#ifdef TL_EXPECTED_CXX14
+// `f` returns an expected.
+template <class Exp, class F,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto or_else_impl(Exp &&exp, F &&f) {
+ static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+ return exp.has_value()
+ ? std::forward<Exp>(exp)
+ : detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+}
+
+// `f` returns void: the comma expression runs `f` first, then yields `exp`.
+template <class Exp, class F,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+detail::decay_t<Exp> or_else_impl(Exp &&exp, F &&f) {
+ return exp.has_value()
+ ? std::forward<Exp>(exp)
+ : (detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()),
+ std::forward<Exp>(exp));
+}
+#else
+// `f` returns an expected.
+template <class Exp, class F,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+auto or_else_impl(Exp &&exp, F &&f) -> Ret {
+ static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+ return exp.has_value()
+ ? std::forward<Exp>(exp)
+ : detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+}
+
+// `f` returns void: the comma expression runs `f` first, then yields `exp`.
+template <class Exp, class F,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+detail::decay_t<Exp> or_else_impl(Exp &&exp, F &&f) {
+ return exp.has_value()
+ ? std::forward<Exp>(exp)
+ : (detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()),
+ std::forward<Exp>(exp));
+}
+#endif
+} // namespace detail
+
+// Two expecteds are equal when they are in the same state and their contents
+// (both values, or both errors) compare equal. A value never equals an error.
+template <class T, class E, class U, class F>
+constexpr bool operator==(const expected<T, E> &lhs,
+                          const expected<U, F> &rhs) {
+  return lhs.has_value() == rhs.has_value() &&
+         (lhs.has_value() ? *lhs == *rhs : lhs.error() == rhs.error());
+}
+// Two expecteds differ when their states differ, or when their contents
+// (both values, or both errors) compare unequal.
+template <class T, class E, class U, class F>
+constexpr bool operator!=(const expected<T, E> &lhs,
+                          const expected<U, F> &rhs) {
+  return lhs.has_value() != rhs.has_value() ||
+         (lhs.has_value() ? *lhs != *rhs : lhs.error() != rhs.error());
+}
+
+// Comparison against a plain value: an expected in the error state is never
+// equal to a value; otherwise the stored value is compared with `v`.
+template <class T, class E, class U>
+constexpr bool operator==(const expected<T, E> &x, const U &v) {
+  return x.has_value() && *x == v;
+}
+template <class T, class E, class U>
+constexpr bool operator==(const U &v, const expected<T, E> &x) {
+  return x.has_value() && *x == v;
+}
+// An expected in the error state always differs from a value.
+template <class T, class E, class U>
+constexpr bool operator!=(const expected<T, E> &x, const U &v) {
+  return !x.has_value() || *x != v;
+}
+template <class T, class E, class U>
+constexpr bool operator!=(const U &v, const expected<T, E> &x) {
+  return !x.has_value() || *x != v;
+}
+
+// Comparison against an unexpected: an expected holding a value is never
+// equal to an unexpected; otherwise the stored error is compared with
+// `e.value()`.
+template <class T, class E>
+constexpr bool operator==(const expected<T, E> &x, const unexpected<E> &e) {
+  return !x.has_value() && x.error() == e.value();
+}
+template <class T, class E>
+constexpr bool operator==(const unexpected<E> &e, const expected<T, E> &x) {
+  return !x.has_value() && x.error() == e.value();
+}
+// An expected holding a value always differs from an unexpected.
+template <class T, class E>
+constexpr bool operator!=(const expected<T, E> &x, const unexpected<E> &e) {
+  return x.has_value() || x.error() != e.value();
+}
+template <class T, class E>
+constexpr bool operator!=(const unexpected<E> &e, const expected<T, E> &x) {
+  return x.has_value() || x.error() != e.value();
+}
+
+// TODO is_swappable
+// Non-member swap for expected: simply calls the member `lhs.swap(rhs)` and
+// is noexcept exactly when that member call is. Only participates in
+// overload resolution when both `T` and `E` are move constructible.
+template <class T, class E,
+ detail::enable_if_t<std::is_move_constructible<T>::value &&
+ std::is_move_constructible<E>::value> * = nullptr>
+void swap(expected<T, E> &lhs,
+ expected<T, E> &rhs) noexcept(noexcept(lhs.swap(rhs))) {
+ lhs.swap(rhs);
+}
+} // namespace tl
+
+#define TL_OPTIONAL_EXPECTED_MUTEX
+#endif
diff --git a/src/include/filepath.h b/src/include/filepath.h
new file mode 100644
index 000000000..d0965ad0c
--- /dev/null
+++ b/src/include/filepath.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEPATH_H
+#define CEPH_FILEPATH_H
+
+/*
+ * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string.
+ * -> should it be different? how? should this[0] be "", with depth 4?
+ *
+ */
+
+
+#include <iosfwd>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "buffer.h"
+#include "encoding.h"
+#include "include/types.h"
+#include "include/fs_types.h"
+
+#include "common/Formatter.h"
+
+
+/**
+ * filepath -- a base inode number plus a '/'-separated path string.
+ *
+ * The string form ("path") is authoritative; the per-component breakdown
+ * ("bits") is a cache that is filled in on demand and must be dropped
+ * whenever the string is replaced.
+ */
+class filepath {
+  inodeno_t ino = 0;   // base inode.  ino=0 implies pure relative path.
+  std::string path;    // relative path.
+
+  /** bits - path segments
+   * this is ['a', 'b', 'c'] for both the absolute and relative case.
+   *
+   * NOTE: this value is LAZILY maintained... i.e. it's a cache
+   */
+  mutable std::vector<std::string> bits;
+  bool encoded = false;  // set by decode(); makes parse_bits() keep empty components
+
+  // fill the bits cache from path if it has not been built yet
+  void ensure_bits() const {
+    if (bits.empty() && path.length() > 0)
+      parse_bits();
+  }
+
+  // regenerate the string form from the component cache
+  void rebuild_path() {
+    path.clear();
+    for (unsigned i=0; i<bits.size(); i++) {
+      if (i) path += "/";
+      path += bits[i];
+    }
+  }
+
+  // split path on '/' into bits
+  void parse_bits() const {
+    bits.clear();
+    std::string::size_type off = 0;
+    while (off < path.length()) {
+      std::string::size_type nextslash = path.find('/', off);
+      if (nextslash == std::string::npos)
+        nextslash = path.length();  // no more slashes
+      if ((nextslash > off) || encoded) {
+        // skip empty components unless they were introduced deliberately
+        // see commit message for more detail
+        bits.push_back( path.substr(off,nextslash-off) );
+      }
+      off = nextslash+1;
+    }
+  }
+
+ public:
+  filepath() = default;
+  filepath(std::string_view p, inodeno_t i) : ino(i), path(p) {}
+  filepath(const filepath& o)
+    : ino(o.ino), path(o.path), bits(o.bits), encoded(o.encoded) {}
+  filepath(inodeno_t i) : ino(i) {}
+  filepath& operator=(const char* path) {
+    set_path(path);
+    return *this;
+  }
+
+  /*
+   * if we are fed a path as a bare string, set ino=0 (strictly relative)
+   * when it has no leading '/', or ino=1 (absolute) when it does.  the
+   * leading '/' is thrown out.
+   */
+  filepath(std::string_view s) { set_path(s); }
+  filepath(const char* s) { set_path(s); }
+
+  // replace both the path string and the base inode
+  void set_path(std::string_view s, inodeno_t b) {
+    path = s;
+    ino = b;
+    bits.clear();   // the cached components belong to the old path
+  }
+  // replace the path string; the base inode is derived from a leading '/'
+  void set_path(std::string_view s) {
+    // check for emptiness first: indexing an empty string_view is undefined
+    if (!s.empty() && s[0] == '/') {
+      path = s.substr(1);
+      ino = 1;
+    } else {
+      ino = 0;
+      path = s;
+    }
+    bits.clear();
+  }
+
+
+  // accessors
+  inodeno_t get_ino() const { return ino; }
+  const std::string& get_path() const { return path; }
+  const char *c_str() const { return path.c_str(); }
+
+  int length() const { return path.length(); }
+  // number of path components
+  unsigned depth() const {
+    ensure_bits();
+    return bits.size();
+  }
+  bool empty() const { return path.length() == 0 && ino == 0; }
+
+  bool absolute() const { return ino == 1; }
+  bool pure_relative() const { return ino == 0; }
+  bool ino_relative() const { return ino > 0; }
+
+  // i-th component; i is not range checked
+  const std::string& operator[](int i) const {
+    ensure_bits();
+    return bits[i];
+  }
+
+  const std::string& last_dentry() const {
+    ensure_bits();
+    ceph_assert(!bits.empty());
+    return bits[ bits.size()-1 ];
+  }
+
+  // path made of the first s components, on the same base inode.
+  // s is not range checked.
+  filepath prefixpath(int s) const {
+    ensure_bits();   // bits may not have been parsed yet
+    filepath t(ino);
+    for (int i=0; i<s; i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+  // pure relative path made of the components from index s onwards
+  filepath postfixpath(int s) const {
+    ensure_bits();   // bits may not have been parsed yet
+    filepath t;
+    for (unsigned i=s; i<bits.size(); i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+
+
+  // modifiers
+  //  string can be relative "a/b/c" (ino=0) or absolute "/a/b/c" (ino=1)
+  void _set_ino(inodeno_t i) { ino = i; }
+  void clear() {
+    ino = 0;
+    path = "";
+    bits.clear();
+  }
+
+  // drop the last component; the path must have at least one
+  void pop_dentry() {
+    ensure_bits();
+    ceph_assert(!bits.empty());   // pop_back() on an empty vector is undefined
+    bits.pop_back();
+    rebuild_path();
+  }
+  // append one component
+  void push_dentry(std::string_view s) {
+    ensure_bits();
+    if (!bits.empty())
+      path += "/";
+    path += s;
+    bits.emplace_back(s);
+  }
+  void push_dentry(const std::string& s) {
+    push_dentry(std::string_view(s));
+  }
+  void push_dentry(const char *cs) {
+    push_dentry(std::string_view(cs));
+  }
+  // prepend one component
+  void push_front_dentry(const std::string& s) {
+    // parse first: rebuild_path() regenerates path from bits, so inserting
+    // into an unparsed cache would throw the existing components away
+    ensure_bits();
+    bits.insert(bits.begin(), s);
+    rebuild_path();
+  }
+  // append every component of a pure relative path
+  void append(const filepath& a) {
+    ceph_assert(a.pure_relative());
+    for (unsigned i=0; i<a.depth(); i++)
+      push_dentry(a[i]);
+  }
+
+  // encoding
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    __u8 struct_v = 1;
+    encode(struct_v, bl);
+    encode(ino, bl);
+    encode(path, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& blp) {
+    using ceph::decode;
+    bits.clear();
+    __u8 struct_v;
+    decode(struct_v, blp);
+    decode(ino, blp);
+    decode(path, blp);
+    encoded = true;   // from now on parse_bits() keeps empty components
+  }
+  void dump(ceph::Formatter *f) const {
+    f->dump_unsigned("base_ino", ino);
+    f->dump_string("relative_path", path);
+  }
+  static void generate_test_instances(std::list<filepath*>& o) {
+    o.push_back(new filepath);
+    o.push_back(new filepath("/usr/bin", 0));
+    o.push_back(new filepath("/usr/sbin", 1));
+    o.push_back(new filepath("var/log", 1));
+    o.push_back(new filepath("foo/bar", 101));
+  }
+
+  // true if the final component is "." or ".."
+  bool is_last_dot_or_dotdot() const {
+    if (depth() > 0) {
+      const std::string& dname = last_dentry();
+      if (dname == "." || dname == "..") {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  bool is_last_snap() const {
+    // walk into snapdir?
+    // NOTE(review): despite the name this inspects the FIRST component
+    // (bits[0]), not the last one -- confirm that is intended.
+    return depth() > 0 && bits[0].length() == 0;
+  }
+};
+
+WRITE_CLASS_ENCODER(filepath)
+
+/// Print a filepath as "#<ino>/<path>" when it has a non-zero base inode
+/// (the '/' is omitted for an empty path), or as just "<path>" otherwise.
+inline std::ostream& operator<<(std::ostream& out, const filepath& path)
+{
+  const inodeno_t base = path.get_ino();
+  if (base) {
+    out << '#' << base;
+    if (path.length() != 0)
+      out << '/';
+  }
+  out << path.get_path();
+  return out;
+}
+
+#endif
diff --git a/src/include/frag.h b/src/include/frag.h
new file mode 100644
index 000000000..ec18bddfb
--- /dev/null
+++ b/src/include/frag.h
@@ -0,0 +1,615 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FRAG_H
+#define CEPH_FRAG_H
+
+#include <boost/container/small_vector.hpp>
+
+#include <iostream>
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "buffer.h"
+#include "compact_map.h"
+
+#include "ceph_frag.h"
+#include "include/encoding.h"
+#include "include/ceph_assert.h"
+
+#include "common/dout.h"
+
+/*
+ *
+ * the goal here is to use a binary split strategy to partition a namespace.
+ * frag_t represents a particular fragment. bits() tells you the size of the
+ * fragment, and value() its name. this is roughly analogous to an ip address
+ * and netmask.
+ *
+ * fragtree_t represents an entire namespace and its partition. it essentially
+ * tells you where fragments are split into other fragments, and by how much
+ * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
+ *
+ * this vaguely resembles a btree, in that when a fragment becomes large or small
+ * we can split or merge, except that there is no guarantee of being balanced.
+ *
+ * presumably we are partitioning the output of a (perhaps specialized) hash
+ * function.
+ */
+
+/**
+ * frag_t
+ *
+ * description of an individual fragment. that is, a particular piece
+ * of the overall namespace.
+ *
+ * this is conceptually analogous to an ip address and netmask.
+ *
+ * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
+ *
+ * we write it as v/b, where v is a value and b is the number of bits.
+ * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
+ * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
+ *
+ * this makes the right most bit of v the "most significant", which is the
+ * opposite of what we usually see.
+ */
+
+/*
+ * TODO:
+ * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
+ * iteration efficient (see, e.g., try_assimilate_children())
+ * - rework frag_t so that we mask the left-most (most significant) bits instead of
+ * the right-most (least significant) bits. just because it's more intuitive, and
+ * matches the network/netmask concept.
+ */
+
+class frag_t {
+  /*
+   * encoding is dictated by frag_* functions in ceph_fs.h.  use those
+   * helpers _exclusively_.
+   */
+public:
+  using _frag_t = uint32_t;
+
+  frag_t() = default;
+  // build from a value and a number of significant bits
+  frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
+  // wrap an already-encoded fragment
+  frag_t(_frag_t e) : _enc(e) { }
+
+  // constructors
+  void from_unsigned(unsigned e) { _enc = e; }
+
+  // accessors
+  unsigned value() const { return ceph_frag_value(_enc); }
+  unsigned bits() const { return ceph_frag_bits(_enc); }
+  unsigned mask() const { return ceph_frag_mask(_enc); }
+  unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
+
+  // raw encoded form
+  operator _frag_t() const { return _enc; }
+
+  // tests
+  bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
+  bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
+  bool is_root() const { return bits() == 0; }
+  // the root has no parent
+  frag_t parent() const {
+    ceph_assert(bits() > 0);
+    return frag_t(ceph_frag_parent(_enc));
+  }
+
+  // splitting
+  // i-th child (0 <= i < 2^nb) of an nb-bit split of this fragment
+  frag_t make_child(int i, int nb) const {
+    ceph_assert(i < (1<<nb));
+    return frag_t(ceph_frag_make_child(_enc, nb, i));
+  }
+  // append all 2^nb children of an nb-bit split to fragments
+  template<typename T>
+  void split(int nb, T& fragments) const {
+    ceph_assert(nb > 0);
+    unsigned nway = 1 << nb;
+    for (unsigned i=0; i<nway; i++)
+      fragments.push_back(make_child(i, nb));
+  }
+
+  // binary splitting
+  frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
+  frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
+
+  bool is_left() const { return ceph_frag_is_left_child(_enc); }
+  bool is_right() const { return ceph_frag_is_right_child(_enc); }
+  // the root has no sibling
+  frag_t get_sibling() const {
+    ceph_assert(!is_root());
+    return frag_t(ceph_frag_sibling(_enc));
+  }
+
+  // sequencing
+  bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
+  bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
+  // the rightmost fragment has no successor
+  frag_t next() const {
+    ceph_assert(!is_rightmost());
+    return frag_t(ceph_frag_next(_enc));
+  }
+
+  // parse
+  /**
+   * parse "<hex value>/<decimal bits>" into this fragment.
+   *
+   * @return true and update *this on success; false (leaving *this
+   *         untouched) if the string does not match or the bit count
+   *         is out of range.
+   */
+  bool parse(const char *s) {
+    unsigned pvalue;   // %x stores through an unsigned int *
+    int pbits;
+    int r = sscanf(s, "%x/%d", &pvalue, &pbits);
+    // the value occupies the low 24 bits of the encoding, so a bit count
+    // outside [0, 24] cannot be represented; reject it instead of handing
+    // it to ceph_frag_make().
+    if (r == 2 && pbits >= 0 && pbits <= 24) {
+      *this = frag_t(pvalue, static_cast<unsigned>(pbits));
+      return true;
+    }
+    return false;
+  }
+
+  void encode(ceph::buffer::list& bl) const {
+    ceph::encode_raw(_enc, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    __u32 v;
+    ceph::decode_raw(v, p);
+    _enc = v;
+  }
+  // order by value first, then by number of bits
+  bool operator<(const frag_t& b) const
+  {
+    if (value() != b.value())
+      return value() < b.value();
+    else
+      return bits() < b.bits();
+  }
+private:
+  _frag_t _enc = 0;
+};
+WRITE_CLASS_ENCODER(frag_t)
+
+/// Print a fragment as its significant bits (taken from bit 23 downwards,
+/// one '0'/'1' character per bit) followed by '*'.  The root prints as "*".
+inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
+{
+  const unsigned nbits = hb.bits();
+  if (nbits != 0) {
+    const unsigned val = hb.value();
+    unsigned bit = 23;
+    for (unsigned left = nbits; left != 0; --left, --bit) {
+      const char c = (val & (1<<bit)) ? '1' : '0';
+      out << c;
+    }
+  }
+  out << '*';
+  return out;
+}
+
+
+using frag_vec_t = boost::container::small_vector<frag_t, 4>;
+
+/**
+ * fragtree_t -- partition an entire namespace into one or more frag_t's.
+ */
+class fragtree_t {
+  // pairs <f, b>:
+  //  frag_t f is split by b bits.
+  //  if child frag_t does not appear, it is not split.
+public:
+  compact_map<frag_t,int32_t> _splits;
+
+public:
+  // -------------
+  // basics
+  void swap(fragtree_t& other) {
+    _splits.swap(other._splits);
+  }
+  void clear() {
+    _splits.clear();
+  }
+
+  // -------------
+  // accessors
+  bool empty() const {
+    return _splits.empty();
+  }
+  /**
+   * get_split -- number of bits frag @a hb is split by, or 0 if it has
+   * no entry in the split map
+   */
+  int get_split(const frag_t hb) const {
+    compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
+    if (p == _splits.end())
+      return 0;
+    else
+      return p->second;
+  }
+
+
+  /**
+   * is_leaf -- true if the only leaf found under @a x is @a x itself
+   */
+  bool is_leaf(frag_t x) const {
+    frag_vec_t s;
+    get_leaves_under(x, s);
+    return s.size() == 1 && s.front() == x;
+  }
+
+  /**
+   * get_leaves -- list all leaves
+   */
+  template<typename T>
+  void get_leaves(T& c) const {
+    return get_leaves_under_split(frag_t(), c);
+  }
+
+  /**
+   * get_leaves_under_split -- list all leaves under a known split point (or root)
+   */
+  template<typename T>
+  void get_leaves_under_split(frag_t under, T& c) const {
+    frag_vec_t s;
+    s.push_back(under);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb)
+        t.split(nb, s);   // queue up children
+      else
+        c.push_back(t);   // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * get_branch -- get branch point at OR above frag @a x
+   *  - may be @a x itself, if @a x is a split
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      if (get_split(x)) return x;   // found it!
+      x = x.parent();
+    }
+  }
+
+  /**
+   * get_branch_above -- get a branch point above frag @a x
+   *  - may be root (frag_t())
+   *  - may NOT be @a x, even if @a x is a split.
+   */
+  frag_t get_branch_above(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      x = x.parent();
+      if (get_split(x)) return x;   // found it!
+    }
+  }
+
+
+  /**
+   * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
+   *  - may be @a x itself, if @a x is a split or leaf
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch_or_leaf(frag_t x) const {
+    frag_t branch = get_branch(x);
+    int nb = get_split(branch);
+    if (nb > 0 &&                                  // if branch is a split, and
+        branch.bits() + nb <= x.bits())            // one of the children is or contains x
+      return frag_t(x.value(), branch.bits()+nb);  // then return that child (it's a leaf)
+    else
+      return branch;
+  }
+
+  /**
+   * get_leaves_under(x, ls) -- search for any leaves fully contained by x
+   */
+  template<typename T>
+  void get_leaves_under(frag_t x, T& c) const {
+    frag_vec_t s;
+    s.push_back(get_branch_or_leaf(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&   // if t is more specific than x, and
+          !x.contains(t))           // x does not contain t,
+        continue;                   // then skip
+      int nb = get_split(t);
+      if (nb)
+        t.split(nb, s);   // queue up children
+      else if (x.contains(t))
+        c.push_back(t);   // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * contains(fg) -- does fragtree contain the specific frag @a x
+   */
+  bool contains(frag_t x) const {
+    frag_vec_t s;
+    s.push_back(get_branch(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&   // if t is more specific than x, and
+          !x.contains(t))           // x does not contain t,
+        continue;                   // then skip
+      int nb = get_split(t);
+      if (nb) {
+        if (t == x) return false;   // it's split.
+        t.split(nb, s);             // queue up children
+      } else {
+        if (t == x) return true;    // it's there.
+      }
+    }
+    return false;
+  }
+
+  /**
+   * operator[] -- map a (hash?) value to a frag
+   */
+  frag_t operator[](unsigned v) const {
+    frag_t t;
+    while (1) {
+      ceph_assert(t.contains(v));
+      int nb = get_split(t);
+
+      // is this a leaf?
+      if (nb == 0) return t;  // done.
+
+      // pick appropriate child fragment.
+      unsigned nway = 1 << nb;
+      unsigned i;
+      for (i=0; i<nway; i++) {
+        frag_t n = t.make_child(i, nb);
+        if (n.contains(v)) {
+          t = n;
+          break;
+        }
+      }
+      ceph_assert(i < nway);
+    }
+  }
+
+
+  // ---------------
+  // modifiers
+  /**
+   * split -- record that leaf @a x is split by @a b bits
+   */
+  void split(frag_t x, int b, bool simplify=true) {
+    ceph_assert(is_leaf(x));
+    _splits[x] = b;
+
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+  /**
+   * merge -- remove the @a b bit split recorded for @a x
+   */
+  void merge(frag_t x, int b, bool simplify=true) {
+    ceph_assert(!is_leaf(x));
+    // look the split up without operator[], which would insert a
+    // default entry for a frag that has none
+    ceph_assert(get_split(x) == b);
+    _splits.erase(x);
+
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+
+  /*
+   * if all of a given split's children are identically split,
+   * then the children can be assimilated.
+   */
+  void try_assimilate_children(frag_t x) {
+    int nb = get_split(x);
+    if (!nb) return;
+    frag_vec_t children;
+    x.split(nb, children);
+    int childbits = 0;
+    for (auto& frag : children) {
+      int cb = get_split(frag);
+      if (!cb) return;  // nope.
+      if (childbits && cb != childbits) return;  // not the same
+      childbits = cb;
+    }
+    // all children are split with childbits!
+    for (auto& frag : children)
+      _splits.erase(frag);
+    _splits[x] += childbits;
+  }
+
+  /**
+   * force_to_leaf -- reshape the tree so that @a x becomes a leaf
+   *
+   * @return false if @a x already was a leaf, true if the tree was changed
+   */
+  bool force_to_leaf(CephContext *cct, frag_t x) {
+    if (is_leaf(x))
+      return false;
+
+    lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
+
+    frag_t parent = get_branch_or_leaf(x);
+    ceph_assert(parent.bits() <= x.bits());
+    lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
+
+    // do we need to split from parent to x?
+    if (parent.bits() < x.bits()) {
+      int spread = x.bits() - parent.bits();
+      int nb = get_split(parent);
+      lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
+      if (nb == 0) {
+        // easy: split parent (a leaf) by the difference
+        lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
+        split(parent, spread);
+        ceph_assert(is_leaf(x));
+        return true;
+      }
+      ceph_assert(nb > spread);
+
+      // add an intermediary split
+      merge(parent, nb, false);
+      split(parent, spread, false);
+
+      frag_vec_t subs;
+      parent.split(spread, subs);
+      for (auto& frag : subs) {
+        lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl;
+        split(frag, nb - spread, false);
+      }
+    }
+
+    // x is now a leaf or split.
+    // hoover up any children.
+    frag_vec_t s;
+    s.push_back(x);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb) {
+        lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
+        merge(t, nb, false);    // merge this point, and
+        t.split(nb, s);         // queue up children
+      }
+    }
+
+    lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
+    ceph_assert(is_leaf(x));
+    return true;
+  }
+
+  // encoding
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(_splits, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    decode(_splits, p);
+  }
+  // encode the <frag, bits> pairs back to back, with no leading count
+  void encode_nohead(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+         p != _splits.end();
+         ++p) {
+      encode(p->first, bl);
+      encode(p->second, bl);
+    }
+  }
+  // replace the contents with n <frag, bits> pairs read from p
+  void decode_nohead(int n, ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    _splits.clear();
+    while (n-- > 0) {
+      frag_t f;
+      decode(f, p);
+      decode(_splits[f], p);
+    }
+  }
+
+  // multi-line dump of the tree: each non-root frag goes on its own line,
+  // indented by one space per bit; split points are followed by " %<bits>"
+  void print(std::ostream& out) const {
+    out << "fragtree_t(";
+    frag_vec_t s;
+    s.push_back(frag_t());
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      // newline + indent?
+      if (t.bits()) {
+        out << std::endl;
+        for (unsigned i=0; i<t.bits(); i++) out << ' ';
+      }
+      int nb = get_split(t);
+      if (nb) {
+        out << t << " %" << nb;
+        t.split(nb, s);   // queue up children
+      } else {
+        out << t;
+      }
+    }
+    out << ")";
+  }
+
+  // emit a "splits" array holding one {"frag", "children"} object per entry
+  void dump(ceph::Formatter *f) const {
+    f->open_array_section("splits");
+    for (auto p = _splits.begin(); p != _splits.end(); ++p) {
+      f->open_object_section("split");
+      std::ostringstream frag_str;
+      frag_str << p->first;
+      f->dump_string("frag", frag_str.str());
+      f->dump_int("children", p->second);
+      f->close_section(); // split
+    }
+    f->close_section(); // splits
+  }
+};
+WRITE_CLASS_ENCODER(fragtree_t)
+
+/// Two fragtrees are equal when their split maps compare equal.
+inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
+  const auto& lhs = l._splits;
+  const auto& rhs = r._splits;
+  return lhs == rhs;
+}
+/// Two fragtrees differ when their split maps compare unequal.
+inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
+  const auto& lhs = l._splits;
+  const auto& rhs = r._splits;
+  return lhs != rhs;
+}
+
+/// One-line form: "fragtree_t(" + space-separated "<frag>^<bits>" entries + ")".
+inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
+{
+  out << "fragtree_t(";
+
+  bool first = true;
+  for (auto p = ft._splits.begin(); p != ft._splits.end(); ++p) {
+    if (first)
+      first = false;
+    else
+      out << " ";
+    out << p->first << "^" << p->second;
+  }
+  out << ")";
+  return out;
+}
+
+/**
+ * fragset_t -- a set of fragments
+ */
+class fragset_t {
+  std::set<frag_t> _set;
+
+public:
+  // read-only access to the underlying set
+  const std::set<frag_t> &get() const { return _set; }
+  std::set<frag_t>::const_iterator begin() const { return _set.begin(); }
+  std::set<frag_t>::const_iterator end() const { return _set.end(); }
+
+  bool empty() const { return _set.empty(); }
+
+  // true if f, or any ancestor of f up to the root, is a member
+  bool contains(frag_t f) const {
+    for (;;) {
+      if (_set.find(f) != _set.end())
+        return true;
+      if (f.is_root())
+        return false;
+      f = f.parent();
+    }
+  }
+
+  void clear() { _set.clear(); }
+
+  // add f as-is, without merging siblings
+  void insert_raw(frag_t f) { _set.insert(f); }
+
+  // add f, then merge sibling pairs
+  void insert(frag_t f) {
+    insert_raw(f);
+    simplify();
+  }
+
+  // replace every pair of sibling frags by their parent, repeatedly
+  void simplify() {
+    for (auto cur = _set.begin(); cur != _set.end(); ) {
+      // the root has no sibling; otherwise look for the sibling in the set
+      if (cur->is_root() || _set.count(cur->get_sibling()) == 0) {
+        ++cur;
+        continue;
+      }
+      _set.erase(cur->get_sibling());
+      const auto merged = _set.insert(cur->parent());
+      _set.erase(cur);
+      // carry on from the parent: it may itself have a sibling in the set
+      cur = merged.first;
+    }
+  }
+
+  void encode(ceph::buffer::list& bl) const { ceph::encode(_set, bl); }
+  void decode(ceph::buffer::list::const_iterator& p) { ceph::decode(_set, p); }
+};
+WRITE_CLASS_ENCODER(fragset_t)
+
+
+/// Print as "fragset_t(" + the underlying set + ")".
+inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
+{
+  out << "fragset_t(";
+  out << fs.get();
+  out << ")";
+  return out;
+}
+
+#endif
diff --git a/src/include/fs_types.h b/src/include/fs_types.h
new file mode 100644
index 000000000..fc34e702a
--- /dev/null
+++ b/src/include/fs_types.h
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_INCLUDE_FS_TYPES_H
+#define CEPH_INCLUDE_FS_TYPES_H
+
+#include "types.h"
+class JSONObj;
+
+#define CEPHFS_EBLOCKLISTED 108
+#define CEPHFS_EPERM 1
+#define CEPHFS_ESTALE 116
+#define CEPHFS_ENOSPC 28
+#define CEPHFS_ETIMEDOUT 110
+#define CEPHFS_EIO 5
+#define CEPHFS_ENOTCONN 107
+#define CEPHFS_EEXIST 17
+#define CEPHFS_EINTR 4
+#define CEPHFS_EINVAL 22
+#define CEPHFS_EBADF 9
+#define CEPHFS_EROFS 30
+#define CEPHFS_EAGAIN 11
+#define CEPHFS_EACCES 13
+#define CEPHFS_ELOOP 40
+#define CEPHFS_EISDIR 21
+#define CEPHFS_ENOENT 2
+#define CEPHFS_ENOTDIR 20
+#define CEPHFS_ENAMETOOLONG 36
+#define CEPHFS_EBUSY 16
+#define CEPHFS_EDQUOT 122
+#define CEPHFS_EFBIG 27
+#define CEPHFS_ERANGE 34
+#define CEPHFS_ENXIO 6
+#define CEPHFS_ECANCELED 125
+#define CEPHFS_ENODATA 61
+#define CEPHFS_EOPNOTSUPP 95
+#define CEPHFS_EXDEV 18
+#define CEPHFS_ENOMEM 12
+#define CEPHFS_ENOTRECOVERABLE 131
+#define CEPHFS_ENOSYS 38
+#define CEPHFS_EWOULDBLOCK CEPHFS_EAGAIN
+#define CEPHFS_ENOTEMPTY 39
+#define CEPHFS_EDEADLK 35
+#define CEPHFS_EDEADLOCK CEPHFS_EDEADLK
+#define CEPHFS_EDOM 33
+#define CEPHFS_EMLINK 31
+#define CEPHFS_ETIME 62
+#define CEPHFS_EOLDSNAPC 85
+
+// taken from linux kernel: include/uapi/linux/fcntl.h
+#define CEPHFS_AT_FDCWD -100 /* Special value used to indicate
+ openat should use the current
+ working directory. */
+
+// --------------------------------------
+// ino
+
+typedef uint64_t _inodeno_t;
+
+// thin wrapper around a 64-bit inode number, implicitly convertible to and
+// from the raw integer
+struct inodeno_t {
+  _inodeno_t val;
+  inodeno_t() : val(0) {}
+  // cppcheck-suppress noExplicitConstructor
+  inodeno_t(_inodeno_t v) : val(v) {}
+  // returns *this by reference, like the built-in compound assignment, so
+  // that chained use acts on this object rather than on a copy
+  inodeno_t& operator+=(inodeno_t o) { val += o.val; return *this; }
+  operator _inodeno_t() const { return val; }
+
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(val, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    decode(val, p);
+  }
+} __attribute__ ((__may_alias__));
+WRITE_CLASS_ENCODER(inodeno_t)
+
+// denc support for inodeno_t: every operation forwards to denc() on the
+// wrapped integer.
+template<>
+struct denc_traits<inodeno_t> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = true;
+
+  static void bound_encode(const inodeno_t &ino, size_t& len)
+  {
+    denc(ino.val, len);
+  }
+
+  static void encode(const inodeno_t &ino,
+                     ceph::buffer::list::contiguous_appender& app)
+  {
+    denc(ino.val, app);
+  }
+
+  static void decode(inodeno_t& ino, ceph::buffer::ptr::const_iterator &it)
+  {
+    denc(ino.val, it);
+  }
+};
+
+/// Print an inode number as "0x" followed by its hex digits; the stream is
+/// left in decimal mode afterwards.
+inline std::ostream& operator<<(std::ostream& out, const inodeno_t& ino) {
+  out << std::hex;
+  out << "0x" << ino.val;
+  out << std::dec;
+  return out;
+}
+
+namespace std {
+// std::hash for inodeno_t: hashes the wrapped 64-bit value with rjhash
+template<>
+struct hash<inodeno_t> {
+  size_t operator()(const inodeno_t& x) const
+  {
+    static rjhash<uint64_t> hasher;
+    const size_t digest = hasher(x.val);
+    return digest;
+  }
+};
+} // namespace std
+
+
+// file modes
+
+/// True when the CEPH_FILE_MODE_WR bit is not set in mode.
+inline bool file_mode_is_readonly(int mode) {
+  const bool writable = (mode & CEPH_FILE_MODE_WR) != 0;
+  return !writable;
+}
+
+
+// dentries
+#define MAX_DENTRY_LEN 255
+
+// --
+namespace ceph {
+ class Formatter;
+}
+void dump(const ceph_file_layout& l, ceph::Formatter *f);
+void dump(const ceph_dir_layout& l, ceph::Formatter *f);
+
+
+
+// file_layout_t
+
+struct file_layout_t {
+  // file -> object mapping
+  uint32_t stripe_unit;    ///< stripe unit, in bytes,
+  uint32_t stripe_count;   ///< over this many objects
+  uint32_t object_size;    ///< until objects are this big
+
+  int64_t pool_id = -1;    ///< rados pool id
+  std::string pool_ns;     ///< rados pool namespace
+
+  /// all three striping parameters default to 0; the pool id starts at -1
+  file_layout_t(uint32_t su=0, uint32_t sc=0, uint32_t os=0)
+    : stripe_unit(su), stripe_count(sc), object_size(os)
+  {}
+
+  /// layout with a 4 MiB stripe unit, a stripe count of 1 and 4 MiB objects
+  static file_layout_t get_default() {
+    const uint32_t four_mib = 1<<22;
+    return file_layout_t(four_mib, 1, four_mib);
+  }
+
+  /// stripe_count * object_size, computed in 64 bits
+  uint64_t get_period() const {
+    const uint64_t count = stripe_count;
+    return count * object_size;
+  }
+
+  void from_legacy(const ceph_file_layout& fl);
+  void to_legacy(ceph_file_layout *fl) const;
+
+  bool is_valid() const;
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<file_layout_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(file_layout_t)
+
+WRITE_EQ_OPERATORS_5(file_layout_t, stripe_unit, stripe_count, object_size, pool_id, pool_ns);
+
+std::ostream& operator<<(std::ostream& out, const file_layout_t &layout);
+
+#endif
diff --git a/src/include/function2.hpp b/src/include/function2.hpp
new file mode 100644
index 000000000..613e651c7
--- /dev/null
+++ b/src/include/function2.hpp
@@ -0,0 +1,1581 @@
+
+// Copyright 2015-2018 Denis Blank <denis.blank at outlook dot com>
+// Distributed under the Boost Software License, Version 1.0
+// (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef FU2_INCLUDED_FUNCTION2_HPP_
+#define FU2_INCLUDED_FUNCTION2_HPP_
+
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+// Defines:
+// - FU2_HAS_DISABLED_EXCEPTIONS
+#if defined(FU2_WITH_DISABLED_EXCEPTIONS) || \
+ defined(FU2_MACRO_DISABLE_EXCEPTIONS)
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#else // FU2_WITH_DISABLED_EXCEPTIONS
+#if defined(_MSC_VER)
+#if !defined(_HAS_EXCEPTIONS) || (_HAS_EXCEPTIONS == 0)
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#endif
+#elif defined(__clang__)
+#if !(__EXCEPTIONS && __has_feature(cxx_exceptions))
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#endif
+#elif defined(__GNUC__)
+#if !__EXCEPTIONS
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#endif
+#endif
+#endif // FU2_WITH_DISABLED_EXCEPTIONS
+// - FU2_HAS_NO_FUNCTIONAL_HEADER
+#if !defined(FU2_WITH_NO_FUNCTIONAL_HEADER) || \
+ !defined(FU2_NO_FUNCTIONAL_HEADER) || \
+ !defined(FU2_HAS_DISABLED_EXCEPTIONS)
+#define FU2_HAS_NO_FUNCTIONAL_HEADER
+#include <functional>
+#endif
+// - FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#if defined(FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE)
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#else // FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE
+#if defined(_MSC_VER)
+#if defined(_HAS_CXX17) && _HAS_CXX17
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#endif
+#elif defined(__cpp_noexcept_function_type)
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#elif defined(__cplusplus) && (__cplusplus >= 201703L)
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#endif
+#endif // FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE
+
+#if !defined(FU2_HAS_DISABLED_EXCEPTIONS)
+#include <exception>
+#endif
+
+namespace fu2 {
+inline namespace abi_310 {
+namespace detail {
+template <typename Config, typename Property>
+class function;
+
+template <typename...>
+struct identity {}; // Empty tag type used to pass a pack of types around
+
+// Equivalent to C++17's std::void_t, written this way to work around a bug
+// in GCC that prevents correct SFINAE behavior.
+// See http://stackoverflow.com/questions/35753920 for details.
+template <typename...>
+struct deduce_to_void : std::common_type<void> {};
+
+template <typename... T>
+using void_t = typename deduce_to_void<T...>::type;
+
+// Base-class helper: copyable<false> is move-only, copyable<true> is unrestricted
+template <bool /*Copyable*/>
+struct copyable {};
+template <>
+struct copyable<false> {
+  copyable() = default;
+  copyable(copyable&&) = default;
+  copyable& operator=(copyable&&) = default;
+  copyable(copyable const&) = delete;
+  copyable& operator=(copyable const&) = delete;
+  ~copyable() = default;
+};
+
+/// Configuration trait to configure the function_base class.
+template <bool Owning, bool Copyable, std::size_t Capacity>
+struct config {
+ // Is true if the function is owning.
+ static constexpr auto const is_owning = Owning;
+
+ // Is true if the function is copyable.
+ static constexpr auto const is_copyable = Copyable;
+
+ // The internal capacity of the function
+ // used in small functor optimization.
+ static constexpr auto const capacity = Capacity;
+};
+
+/// A config which isn't compatible to other configs
+template <bool Throws, bool HasStrongExceptGuarantee, typename... Args>
+struct property {
+  // Is true when the function throws an exception on empty invocation.
+  static constexpr auto const is_throwing = Throws;
+  // Is true when the strong exception guarantee is requested for the function.
+  static constexpr auto const is_strong_exception_guaranteed =
+      HasStrongExceptGuarantee;
+};
+
+/// Provides utilities for invoking callable objects
+namespace invocation {
+/// Invokes the given callable object with the given arguments
+template <typename Callable, typename... Args>
+constexpr auto invoke(Callable&& callable, Args&&... args) noexcept(
+ noexcept(std::forward<Callable>(callable)(std::forward<Args>(args)...)))
+ -> decltype(std::forward<Callable>(callable)(std::forward<Args>(args)...)) {
+
+ return std::forward<Callable>(callable)(std::forward<Args>(args)...);
+}
+/// Invokes the given member function pointer by reference
+template <typename T, typename Type, typename Self, typename... Args>
+constexpr auto invoke(Type T::*member, Self&& self, Args&&... args) noexcept(
+ noexcept((std::forward<Self>(self).*member)(std::forward<Args>(args)...)))
+ -> decltype((std::forward<Self>(self).*
+ member)(std::forward<Args>(args)...)) {
+ return (std::forward<Self>(self).*member)(std::forward<Args>(args)...);
+}
+/// Invokes the given member function pointer by pointer
+template <typename T, typename Type, typename Self, typename... Args>
+constexpr auto invoke(Type T::*member, Self&& self, Args&&... args) noexcept(
+ noexcept((std::forward<Self>(self)->*member)(std::forward<Args>(args)...)))
+ -> decltype(
+ (std::forward<Self>(self)->*member)(std::forward<Args>(args)...)) {
+ return (std::forward<Self>(self)->*member)(std::forward<Args>(args)...);
+}
+/// Invokes the given pointer to a scalar member by reference
+template <typename T, typename Type, typename Self>
+constexpr auto
+invoke(Type T::*member,
+ Self&& self) noexcept(noexcept(std::forward<Self>(self).*member))
+ -> decltype(std::forward<Self>(self).*member) {
+ return (std::forward<Self>(self).*member);
+}
+/// Invokes the given pointer to a scalar member by pointer
+template <typename T, typename Type, typename Self>
+constexpr auto
+invoke(Type T::*member,
+ Self&& self) noexcept(noexcept(std::forward<Self>(self)->*member))
+ -> decltype(std::forward<Self>(self)->*member) {
+ return std::forward<Self>(self)->*member;
+}
+
+/// Deduces to a true type if the callable object can be invoked with
+/// the given arguments (passed as identity<Args...>).
+/// We don't use invoke here because MSVC can't evaluate the nested expression
+/// SFINAE here.
+template <typename T, typename Args, typename = void>
+struct can_invoke : std::false_type {};
+template <typename T, typename... Args>
+struct can_invoke<T, identity<Args...>,
+ decltype((void)std::declval<T>()(std::declval<Args>()...))>
+ : std::true_type {};
+template <typename Pointer, typename T, typename... Args>
+struct can_invoke<Pointer, identity<T&, Args...>,
+ decltype((void)((std::declval<T&>().*std::declval<Pointer>())(
+ std::declval<Args>()...)))> : std::true_type {};
+template <typename Pointer, typename T, typename... Args>
+struct can_invoke<Pointer, identity<T&&, Args...>,
+ decltype(
+ (void)((std::declval<T&&>().*std::declval<Pointer>())(
+ std::declval<Args>()...)))> : std::true_type {};
+template <typename Pointer, typename T, typename... Args>
+struct can_invoke<Pointer, identity<T*, Args...>,
+ decltype(
+ (void)((std::declval<T*>()->*std::declval<Pointer>())(
+ std::declval<Args>()...)))> : std::true_type {};
+template <typename Pointer, typename T>
+struct can_invoke<Pointer, identity<T&>,
+ decltype((void)(std::declval<T&>().*std::declval<Pointer>()))>
+ : std::true_type {};
+template <typename Pointer, typename T>
+struct can_invoke<Pointer, identity<T&&>,
+ decltype((void)(std::declval<T&&>().*
+ std::declval<Pointer>()))> : std::true_type {
+};
+template <typename Pointer, typename T>
+struct can_invoke<Pointer, identity<T*>,
+ decltype(
+ (void)(std::declval<T*>()->*std::declval<Pointer>()))>
+ : std::true_type {};
+
+template <bool RequiresNoexcept, typename T, typename Args>
+struct is_noexcept_correct : std::true_type {}; // noexcept not required
+template <typename T, typename... Args>
+struct is_noexcept_correct<true, T, identity<Args...>>
+ : std::integral_constant<bool, noexcept(invoke(std::declval<T>(),
+ std::declval<Args>()...))> {
+};
+} // end namespace invocation
+
+namespace overloading {
+template <typename... Args>
+struct overload_impl; // Merges the call operators of all given callables
+template <typename Current, typename Next, typename... Rest>
+struct overload_impl<Current, Next, Rest...> : Current,
+ overload_impl<Next, Rest...> {
+ explicit overload_impl(Current current, Next next, Rest... rest)
+ : Current(std::move(current)), overload_impl<Next, Rest...>(
+ std::move(next), std::move(rest)...) {
+ }
+
+ using Current::operator();
+ using overload_impl<Next, Rest...>::operator();
+};
+template <typename Current> // End of the recursion: a single callable
+struct overload_impl<Current> : Current {
+ explicit overload_impl(Current current) : Current(std::move(current)) {
+ }
+
+ using Current::operator();
+};
+
+template <typename... T> // Builds an overload_impl from decayed callables
+constexpr auto overload(T&&... callables) {
+ return overload_impl<std::decay_t<T>...>{std::forward<T>(callables)...};
+}
+} // namespace overloading
+
+/// Declares the namespace which provides the functionality to work with a
+/// type-erased object.
+namespace type_erasure {
+/// Primary template: erases a callable object by taking its address
+template <typename T, typename = void>
+struct address_taker {
+ template <typename O>
+ static void* take(O&& obj) {
+ return std::addressof(obj);
+ }
+ static T& restore(void* ptr) { // one overload per cv-qualified void*
+ return *static_cast<T*>(ptr);
+ }
+ static T const& restore(void const* ptr) {
+ return *static_cast<T const*>(ptr);
+ }
+ static T volatile& restore(void volatile* ptr) {
+ return *static_cast<T volatile*>(ptr);
+ }
+ static T const volatile& restore(void const volatile* ptr) {
+ return *static_cast<T const volatile*>(ptr);
+ }
+};
+/// Specialization for pointer types: the pointer value itself is erased
+template <typename T>
+struct address_taker<T, std::enable_if_t<std::is_pointer<T>::value>> {
+ template <typename O>
+ static void* take(O&& obj) {
+ return reinterpret_cast<void*>(obj);
+ }
+ template <typename O>
+ static T restore(O ptr) {
+ return reinterpret_cast<T>(const_cast<void*>(ptr)); // drops cv from void*
+ }
+};
+
+template <typename Box>
+struct box_factory;
+/// Copyable box: holds the value and keeps its allocator as a private base
+template <bool IsCopyable, typename T, typename Allocator>
+struct box : private Allocator {
+  friend box_factory<box>;
+
+  T value_;
+
+  /// Takes both arguments by value and moves them into place
+  explicit box(T value, Allocator allocator)
+      : Allocator(std::move(allocator)), value_(std::move(value)) {}
+
+  box(box const&) = default;
+  box(box&&) = default;
+  box& operator=(box const&) = default;
+  box& operator=(box&&) = default;
+  ~box() = default;
+};
+template <typename T, typename Allocator>
+struct box<false, T, Allocator> : private Allocator { // move-only variant
+  friend box_factory<box>;
+
+  T value_;
+
+  /// Takes both arguments by value and moves them into place
+  explicit box(T value, Allocator allocator)
+      : Allocator(std::move(allocator)), value_(std::move(value)) {}
+
+  box(box const&) = delete;
+  box(box&&) = default;
+  box& operator=(box const&) = delete;
+  box& operator=(box&&) = default;
+  ~box() = default;
+};
+
+template <bool IsCopyable, typename T, typename Allocator>
+struct box_factory<box<IsCopyable, T, Allocator>> {
+ using real_allocator =
+ typename std::allocator_traits<std::decay_t<Allocator>>::
+ template rebind_alloc<box<IsCopyable, T, Allocator>>;
+
+ /// Allocates raw space for one box through a copy of the boxed allocator
+ static box<IsCopyable, T, Allocator>*
+ box_allocate(box<IsCopyable, T, Allocator> const* me) {
+ real_allocator allocator(*static_cast<Allocator const*>(me));
+
+ return static_cast<box<IsCopyable, T, Allocator>*>(
+ std::allocator_traits<real_allocator>::allocate(allocator, 1U));
+ }
+
+ /// Destroys the box and releases its space through the boxed allocator
+ static void box_deallocate(box<IsCopyable, T, Allocator>* me) {
+ real_allocator allocator(*static_cast<Allocator const*>(me));
+
+ me->~box(); // the allocator was copied out above, before this destruction
+ std::allocator_traits<real_allocator>::deallocate(allocator, me, 1U);
+ }
+};
+
+/// Creates a box containing the given value and allocator (both decayed)
+template <bool IsCopyable, typename T,
+ typename Allocator = std::allocator<std::decay_t<T>>>
+auto make_box(std::integral_constant<bool, IsCopyable>, T&& value,
+ Allocator&& allocator = Allocator{}) {
+ return box<IsCopyable, std::decay_t<T>, std::decay_t<Allocator>>{
+ std::forward<T>(value), std::forward<Allocator>(allocator)};
+}
+
+template <typename T> // Trait: true only for specializations of box
+struct is_box : std::false_type {};
+template <bool IsCopyable, typename T, typename Allocator>
+struct is_box<box<IsCopyable, T, Allocator>> : std::true_type {};
+
+/// Provides access to the pointer to a heap allocated erased object
+/// as well as to the inplace storage.
+union data_accessor {
+ data_accessor() = default;
+ explicit constexpr data_accessor(std::nullptr_t) noexcept : ptr_(nullptr) {
+ }
+ explicit constexpr data_accessor(void* ptr) noexcept : ptr_(ptr) {
+ }
+
+ /// The pointer we use if the object is on the heap
+ void* ptr_;
+ /// The first field of the inplace storage
+ std::size_t inplace_storage_;
+};
+
+/// Stores the flag as 0 or 1 in the accessor; see opcode::op_fetch_empty
+constexpr void write_empty(data_accessor* accessor, bool empty) noexcept {
+  accessor->inplace_storage_ = static_cast<std::size_t>(empty);
+}
+
+template <typename From, typename To> // const To if From's pointee is const
+using transfer_const_t = typename std::conditional<
+    std::is_const<typename std::remove_pointer<From>::type>::value,
+    typename std::add_const<To>::type, To>::type;
+template <typename From, typename To> // same rule for the volatile qualifier
+using transfer_volatile_t = typename std::conditional<
+    std::is_volatile<typename std::remove_pointer<From>::type>::value,
+    typename std::add_volatile<To>::type, To>::type;
+
+/// The retriever when the object is allocated inplace
+template <typename T, typename Accessor>
+constexpr auto retrieve(std::true_type /*is_inplace*/, Accessor from,
+ std::size_t from_capacity) {
+ using type = transfer_const_t<Accessor, transfer_volatile_t<Accessor, void>>*;
+
+ /// std::align yields nullptr when an aligned T does not fit from_capacity
+ auto storage = &(from->inplace_storage_);
+ auto inplace = const_cast<void*>(static_cast<type>(storage));
+ return type(std::align(alignof(T), sizeof(T), inplace, from_capacity));
+}
+
+/// The retriever which is used when the object is allocated
+/// through the allocator
+template <typename T, typename Accessor>
+constexpr auto retrieve(std::false_type /*is_inplace*/, Accessor from,
+ std::size_t /*from_capacity*/) {
+
+ return from->ptr_; // the accessor holds the pointer to the object
+}
+
+namespace invocation_table {
+#if !defined(FU2_HAS_DISABLED_EXCEPTIONS)
+#if defined(FU2_HAS_NO_FUNCTIONAL_HEADER)
+/// Exception type used for empty calls when std::bad_function_call is not
+struct bad_function_call : std::exception {
+  bad_function_call() noexcept {}
+
+  char const* what() const noexcept override {
+    return "bad function call";
+  }
+};
+#else // FU2_HAS_NO_FUNCTIONAL_HEADER
+using std::bad_function_call;
+#endif
+#endif
+
+#ifdef FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE // F(CONST, VOLATILE, NOEXCEPT, OVL_REF, REF)
+#define FU2_EXPAND_QUALIFIERS_NOEXCEPT(F) \
+ F(, , noexcept, , &) \
+ F(const, , noexcept, , &) \
+ F(, volatile, noexcept, , &) \
+ F(const, volatile, noexcept, , &) \
+ F(, , noexcept, &, &) \
+ F(const, , noexcept, &, &) \
+ F(, volatile, noexcept, &, &) \
+ F(const, volatile, noexcept, &, &) \
+ F(, , noexcept, &&, &&) \
+ F(const, , noexcept, &&, &&) \
+ F(, volatile, noexcept, &&, &&) \
+ F(const, volatile, noexcept, &&, &&)
+#else // FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#define FU2_EXPAND_QUALIFIERS_NOEXCEPT(F)
+#endif // FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+
+#define FU2_EXPAND_QUALIFIERS(F) \
+ F(, , , , &) \
+ F(const, , , , &) \
+ F(, volatile, , , &) \
+ F(const, volatile, , , &) \
+ F(, , , &, &) \
+ F(const, , , &, &) \
+ F(, volatile, , &, &) \
+ F(const, volatile, , &, &) \
+ F(, , , &&, &&) \
+ F(const, , , &&, &&) \
+ F(, volatile, , &&, &&) \
+ F(const, volatile, , &&, &&) \
+ FU2_EXPAND_QUALIFIERS_NOEXCEPT(F)
+
+/// Used by noexcept signatures: an empty call aborts, whatever the tag says
+template <bool IsNoexcept>
+[[noreturn]] void throw_or_abortnoexcept(
+    std::integral_constant<bool, IsNoexcept> /*throwing_tag*/) noexcept {
+  std::abort();
+}
+/// Non-throwing configuration: an empty call aborts the program
+[[noreturn]] inline void
+throw_or_abort(std::false_type /*throwing_tag*/) noexcept {
+  std::abort();
+}
+/// Throwing configuration: throws bad_function_call (aborts without exceptions)
+[[noreturn]] inline void throw_or_abort(std::true_type /*throwing_tag*/) {
+#ifndef FU2_HAS_DISABLED_EXCEPTIONS
+  throw bad_function_call{};
+#else
+  throw_or_abort(std::false_type{});
+#endif
+}
+
+template <typename T>
+struct function_trait; // specialized per qualified signature by the macro below
+
+using is_noexcept_ = std::false_type; // is_noexcept_##NOEXCEPT, NOEXCEPT empty
+using is_noexcept_noexcept = std::true_type; // is_noexcept_##NOEXCEPT otherwise
+
+#define FU2_DEFINE_FUNCTION_TRAIT(CONST, VOLATILE, NOEXCEPT, OVL_REF, REF) \
+ template <typename Ret, typename... Args> \
+ struct function_trait<Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT> { \
+ using pointer_type = Ret (*)(data_accessor CONST VOLATILE*, \
+ std::size_t capacity, Args...); \
+ template <typename T, bool IsInplace> \
+ struct internal_invoker { \
+ static Ret invoke(data_accessor CONST VOLATILE* data, \
+ std::size_t capacity, Args... args) NOEXCEPT { \
+ auto obj = retrieve<T>(std::integral_constant<bool, IsInplace>{}, \
+ data, capacity); \
+ auto box = static_cast<T CONST VOLATILE*>(obj); \
+ return invocation::invoke( \
+ static_cast<std::decay_t<decltype(box->value_)> CONST VOLATILE \
+ REF>(box->value_), \
+ std::forward<Args>(args)...); \
+ } \
+ }; \
+ \
+ template <typename T> \
+ struct view_invoker { \
+ static Ret invoke(data_accessor CONST VOLATILE* data, std::size_t, \
+ Args... args) NOEXCEPT { \
+ \
+ auto ptr = static_cast<void CONST VOLATILE*>(data->ptr_); \
+ return invocation::invoke(address_taker<T>::restore(ptr), \
+ std::forward<Args>(args)...); \
+ } \
+ }; \
+ \
+ template <typename T> \
+ using callable = T CONST VOLATILE REF; \
+ \
+ using arguments = identity<Args...>; \
+ \
+ using is_noexcept = is_noexcept_##NOEXCEPT; \
+ \
+ template <bool Throws> \
+ struct empty_invoker { \
+ static Ret invoke(data_accessor CONST VOLATILE* /*data*/, \
+ std::size_t /*capacity*/, Args... /*args*/) NOEXCEPT { \
+ throw_or_abort##NOEXCEPT(std::integral_constant<bool, Throws>{}); \
+ } \
+ }; \
+ };
+
+FU2_EXPAND_QUALIFIERS(FU2_DEFINE_FUNCTION_TRAIT)
+#undef FU2_DEFINE_FUNCTION_TRAIT
+
+/// Deduces to function_trait<Signature>::pointer_type for the given signature
+template <typename Signature>
+using function_pointer_of = typename function_trait<Signature>::pointer_type;
+
+template <typename... Args>
+struct invoke_table;
+
+/// We optimize the vtable_t in case there is a single function overload
+template <typename First>
+struct invoke_table<First> {
+ using type = function_pointer_of<First>;
+
+ /// Return the function pointer itself
+ template <std::size_t Index>
+ static constexpr auto fetch(type pointer) noexcept {
+ static_assert(Index == 0U, "The index should be 0 here!");
+ return pointer;
+ }
+
+ /// Returns the thunk of a single overloaded callable
+ template <typename T, bool IsInplace>
+ static constexpr type get_invocation_table_of() noexcept {
+ return &function_trait<First>::template internal_invoker<T,
+ IsInplace>::invoke;
+ }
+ /// Returns the view thunk of a single overloaded callable
+ template <typename T>
+ static constexpr type get_invocation_view_table_of() noexcept {
+ return &function_trait<First>::template view_invoker<T>::invoke;
+ }
+ /// Returns the thunk of an empty single overloaded callable
+ template <bool IsThrowing>
+ static constexpr type get_empty_invocation_table() noexcept {
+ return &function_trait<First>::template empty_invoker<IsThrowing>::invoke;
+ }
+};
+/// We generate a table in case of multiple function overloads
+template <typename First, typename Second, typename... Args>
+struct invoke_table<First, Second, Args...> {
+ using type =
+ std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
+ function_pointer_of<Args>...> const*;
+
+ /// Return the function pointer at the particular index
+ template <std::size_t Index>
+ static constexpr auto fetch(type table) noexcept {
+ return std::get<Index>(*table);
+ }
+
+ /// The invocation vtable for a present object
+ template <typename T, bool IsInplace>
+ struct invocation_vtable : public std::tuple<function_pointer_of<First>,
+ function_pointer_of<Second>,
+ function_pointer_of<Args>...> {
+ constexpr invocation_vtable() noexcept
+ : std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
+ function_pointer_of<Args>...>(std::make_tuple(
+ &function_trait<First>::template internal_invoker<
+ T, IsInplace>::invoke,
+ &function_trait<Second>::template internal_invoker<
+ T, IsInplace>::invoke,
+ &function_trait<Args>::template internal_invoker<
+ T, IsInplace>::invoke...)) {
+ }
+ };
+
+ /// Returns the table of thunks of a multi overloaded callable
+ template <typename T, bool IsInplace>
+ static type get_invocation_table_of() noexcept {
+ static invocation_vtable<T, IsInplace> const table;
+ return &table;
+ }
+
+ /// The invocation vtable for a viewed object (built from view_invoker)
+ template <typename T>
+ struct invocation_view_vtable
+ : public std::tuple<function_pointer_of<First>,
+ function_pointer_of<Second>,
+ function_pointer_of<Args>...> {
+ constexpr invocation_view_vtable() noexcept
+ : std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
+ function_pointer_of<Args>...>(std::make_tuple(
+ &function_trait<First>::template view_invoker<T>::invoke,
+ &function_trait<Second>::template view_invoker<T>::invoke,
+ &function_trait<Args>::template view_invoker<T>::invoke...)) {
+ }
+ };
+
+ /// Returns the table of view thunks of a multi overloaded callable
+ template <typename T>
+ static type get_invocation_view_table_of() noexcept {
+ static invocation_view_vtable<T> const table;
+ return &table;
+ }
+
+ /// The invocation table for an empty wrapper
+ template <bool IsThrowing>
+ struct empty_vtable : public std::tuple<function_pointer_of<First>,
+ function_pointer_of<Second>,
+ function_pointer_of<Args>...> {
+ constexpr empty_vtable() noexcept
+ : std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
+ function_pointer_of<Args>...>(
+ std::make_tuple(&function_trait<First>::template empty_invoker<
+ IsThrowing>::invoke,
+ &function_trait<Second>::template empty_invoker<
+ IsThrowing>::invoke,
+ &function_trait<Args>::template empty_invoker<
+ IsThrowing>::invoke...)) {
+ }
+ };
+
+ /// Returns the table of thunks of an empty multi overloaded callable
+ template <bool IsThrowing>
+ static type get_empty_invocation_table() noexcept {
+ static empty_vtable<IsThrowing> const table;
+ return &table;
+ }
+};
+
+template <std::size_t Index, typename Function, typename... Signatures>
+class operator_impl; // one operator() per signature; last level gates copying
+
+#define FU2_DEFINE_FUNCTION_TRAIT(CONST, VOLATILE, NOEXCEPT, OVL_REF, REF) \
+ template <std::size_t Index, typename Function, typename Ret, \
+ typename... Args, typename Next, typename... Signatures> \
+ class operator_impl<Index, Function, \
+ Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT, Next, \
+ Signatures...> \
+ : operator_impl<Index + 1, Function, Next, Signatures...> { \
+ \
+ template <std::size_t, typename, typename...> \
+ friend class operator_impl; \
+ \
+ protected: \
+ operator_impl() = default; \
+ ~operator_impl() = default; \
+ operator_impl(operator_impl const&) = default; \
+ operator_impl(operator_impl&&) = default; \
+ operator_impl& operator=(operator_impl const&) = default; \
+ operator_impl& operator=(operator_impl&&) = default; \
+ \
+ using operator_impl<Index + 1, Function, Next, Signatures...>::operator(); \
+ \
+ Ret operator()(Args... args) CONST VOLATILE OVL_REF NOEXCEPT { \
+ auto parent = static_cast<Function CONST VOLATILE*>(this); \
+ using erasure_t = std::decay_t<decltype(parent->erasure_)>; \
+ \
+ return erasure_t::template invoke<Index>( \
+ static_cast<erasure_t CONST VOLATILE REF>(parent->erasure_), \
+ std::forward<Args>(args)...); \
+ } \
+ }; \
+ template <std::size_t Index, typename Config, typename Property, \
+ typename Ret, typename... Args> \
+ class operator_impl<Index, function<Config, Property>, \
+ Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT> \
+ : copyable<!Config::is_owning || Config::is_copyable> { \
+ \
+ template <std::size_t, typename, typename...> \
+ friend class operator_impl; \
+ \
+ protected: \
+ operator_impl() = default; \
+ ~operator_impl() = default; \
+ operator_impl(operator_impl const&) = default; \
+ operator_impl(operator_impl&&) = default; \
+ operator_impl& operator=(operator_impl const&) = default; \
+ operator_impl& operator=(operator_impl&&) = default; \
+ \
+ Ret operator()(Args... args) CONST VOLATILE OVL_REF NOEXCEPT { \
+ auto parent = \
+ static_cast<function<Config, Property> CONST VOLATILE*>(this); \
+ using erasure_t = std::decay_t<decltype(parent->erasure_)>; \
+ \
+ return erasure_t::template invoke<Index>( \
+ static_cast<erasure_t CONST VOLATILE REF>(parent->erasure_), \
+ std::forward<Args>(args)...); \
+ } \
+ };
+
+FU2_EXPAND_QUALIFIERS(FU2_DEFINE_FUNCTION_TRAIT)
+#undef FU2_DEFINE_FUNCTION_TRAIT
+} // namespace invocation_table
+
+namespace tables {
+/// Identifies the action which is dispatched on the erased object
+enum class opcode {
+ op_move, ///< Move the object and set the vtable
+ op_copy, ///< Copy the object and set the vtable
+ op_destroy, ///< Destroy the object and reset the vtable
+ op_weak_destroy, ///< Destroy the object without resetting the vtable
+ op_fetch_empty, ///< Stores true or false into the `to` accessor
+ ///< to indicate emptiness
+};
+
+/// Abstraction for a vtable together with a command table
+/// TODO Add optimization for a single formal argument
+/// TODO Add optimization to merge both tables if the function is size
+/// optimized
+template <typename Property>
+class vtable;
+template <bool IsThrowing, bool HasStrongExceptGuarantee,
+          typename... FormalArgs>
+class vtable<property<IsThrowing, HasStrongExceptGuarantee, FormalArgs...>> {
+  using command_function_t = void (*)(vtable* /*this*/, opcode /*op*/,
+                                      data_accessor* /*from*/,
+                                      std::size_t /*from_capacity*/,
+                                      data_accessor* /*to*/,
+                                      std::size_t /*to_capacity*/);
+
+  using invoke_table_t = invocation_table::invoke_table<FormalArgs...>;
+
+  command_function_t cmd_;
+  typename invoke_table_t::type vtable_;
+
+  template <typename T>
+  struct trait {
+    static_assert(is_box<T>::value,
+                  "The trait must be specialized with a box!");
+
+    /// The command table for a stored box of type T
+    template <bool IsInplace>
+    static void process_cmd(vtable* to_table, opcode op, data_accessor* from,
+                            std::size_t from_capacity, data_accessor* to,
+                            std::size_t to_capacity) {
+
+      switch (op) {
+        case opcode::op_move: {
+          /// Retrieve the pointer to the object
+          auto box = static_cast<T*>(retrieve<T>(
+              std::integral_constant<bool, IsInplace>{}, from, from_capacity));
+          assert(box && "The object must not be over aligned or null!");
+
+          if (!IsInplace) {
+            // Just hand over the pointer if we allocated on the heap
+            to->ptr_ = from->ptr_;
+
+#ifndef NDEBUG
+            // Nulling the source pointer is only a debugging aid:
+            // ownership is tracked through the vtable, and vtable::move
+            // sets the source vtable to empty after this command returns.
+            from->ptr_ = nullptr;
+#endif
+
+            to_table->template set_allocated<T>();
+
+          }
+          // The object is allocated inplace
+          else {
+            construct(std::true_type{}, std::move(*box), to_table, to,
+                      to_capacity);
+            box->~T();
+          }
+          return;
+        }
+        case opcode::op_copy: {
+          auto box = static_cast<T const*>(retrieve<T>(
+              std::integral_constant<bool, IsInplace>{}, from, from_capacity));
+          assert(box && "The object must not be over aligned or null!");
+
+          assert(std::is_copy_constructible<T>::value &&
+                 "The box is required to be copyable here!");
+
+          // Try to allocate the object inplace
+          construct(std::is_copy_constructible<T>{}, *box, to_table, to,
+                    to_capacity);
+          return;
+        }
+        case opcode::op_destroy:
+        case opcode::op_weak_destroy: {
+
+          assert(!to && !to_capacity && "Arg overflow!");
+          auto box = static_cast<T*>(retrieve<T>(
+              std::integral_constant<bool, IsInplace>{}, from, from_capacity));
+
+          if (IsInplace) {
+            box->~T();
+          } else {
+            box_factory<T>::box_deallocate(box);
+          }
+
+          if (op == opcode::op_destroy) {
+            to_table->set_empty();
+          }
+          return;
+        }
+        case opcode::op_fetch_empty: {
+          write_empty(to, false);
+          return;
+        }
+      }
+
+      // TODO Use an unreachable intrinsic
+      assert(false && "Unreachable!");
+      std::exit(-1);
+    }
+
+    template <typename Box>
+    static void
+    construct(std::true_type /*apply*/, Box&& box, vtable* to_table,
+              data_accessor* to,
+              std::size_t to_capacity) noexcept(HasStrongExceptGuarantee) {
+      // Try to place the object inplace (storage is null if it does not fit)
+      void* storage = retrieve<T>(std::true_type{}, to, to_capacity);
+      if (storage) {
+        to_table->template set_inplace<T>();
+      } else {
+        // Allocate the object through the allocator
+        to->ptr_ = storage =
+            box_factory<std::decay_t<Box>>::box_allocate(std::addressof(box));
+        to_table->template set_allocated<T>();
+      }
+      new (storage) T(std::forward<Box>(box));
+    }
+
+    template <typename Box>
+    static void
+    construct(std::false_type /*apply*/, Box&& /*box*/, vtable* /*to_table*/,
+              data_accessor* /*to*/,
+              std::size_t /*to_capacity*/) noexcept(HasStrongExceptGuarantee) {
+    }
+  };
+
+  /// The command table of an empty vtable
+  static void empty_cmd(vtable* to_table, opcode op, data_accessor* /*from*/,
+                        std::size_t /*from_capacity*/, data_accessor* to,
+                        std::size_t /*to_capacity*/) {
+
+    switch (op) {
+      case opcode::op_move:
+      case opcode::op_copy: {
+        to_table->set_empty();
+        break;
+      }
+      case opcode::op_destroy:
+      case opcode::op_weak_destroy: {
+        // Do nothing
+        break;
+      }
+      case opcode::op_fetch_empty: {
+        write_empty(to, true);
+        break;
+      }
+    }
+  }
+
+public:
+  vtable() noexcept = default;
+
+  /// Initialize an object at the given position
+  template <typename T>
+  static void init(vtable& table, T&& object, data_accessor* to,
+                   std::size_t to_capacity) {
+
+    trait<std::decay_t<T>>::construct(std::true_type{}, std::forward<T>(object),
+                                      &table, to, to_capacity);
+  }
+
+  /// Moves the object at the given position and empties this vtable
+  void move(vtable& to_table, data_accessor* from, std::size_t from_capacity,
+            data_accessor* to,
+            std::size_t to_capacity) noexcept(HasStrongExceptGuarantee) {
+    cmd_(&to_table, opcode::op_move, from, from_capacity, to, to_capacity);
+    set_empty();
+  }
+
+  /// Copies the object at the given position
+  void copy(vtable& to_table, data_accessor const* from,
+            std::size_t from_capacity, data_accessor* to,
+            std::size_t to_capacity) const {
+    cmd_(&to_table, opcode::op_copy, const_cast<data_accessor*>(from),
+         from_capacity, to, to_capacity);
+  }
+
+  /// Destroys the object at the given position
+  void destroy(data_accessor* from,
+               std::size_t from_capacity) noexcept(HasStrongExceptGuarantee) {
+    cmd_(this, opcode::op_destroy, from, from_capacity, nullptr, 0U);
+  }
+
+  /// Destroys the object at the given position without invalidating the
+  /// vtable
+  void
+  weak_destroy(data_accessor* from,
+               std::size_t from_capacity) noexcept(HasStrongExceptGuarantee) {
+    cmd_(this, opcode::op_weak_destroy, from, from_capacity, nullptr, 0U);
+  }
+
+  /// Returns true when the vtable doesn't hold any erased object
+  bool empty() const noexcept {
+    data_accessor data;
+    cmd_(nullptr, opcode::op_fetch_empty, nullptr, 0U, &data, 0U);
+    return bool(data.inplace_storage_);
+  }
+
+  /// Invoke the function at the given index
+  template <std::size_t Index, typename... Args>
+  constexpr auto invoke(Args&&... args) const {
+    auto thunk = invoke_table_t::template fetch<Index>(vtable_);
+    return thunk(std::forward<Args>(args)...);
+  }
+  /// Invoke the function at the given index
+  template <std::size_t Index, typename... Args>
+  constexpr auto invoke(Args&&... args) const volatile {
+    auto thunk = invoke_table_t::template fetch<Index>(vtable_);
+    return thunk(std::forward<Args>(args)...);
+  }
+
+  template <typename T>
+  void set_inplace() noexcept {
+    using type = std::decay_t<T>;
+    vtable_ = invoke_table_t::template get_invocation_table_of<type, true>();
+    cmd_ = &trait<type>::template process_cmd<true>;
+  }
+
+  template <typename T>
+  void set_allocated() noexcept {
+    using type = std::decay_t<T>;
+    vtable_ = invoke_table_t::template get_invocation_table_of<type, false>();
+    cmd_ = &trait<type>::template process_cmd<false>;
+  }
+
+  void set_empty() noexcept {
+    vtable_ = invoke_table_t::template get_empty_invocation_table<IsThrowing>();
+    cmd_ = &empty_cmd;
+  }
+};
+} // namespace tables
+
+/// A union which makes the pointer to the heap object share the
+/// same space with the internal capacity.
+/// The storage type is distinguished by multiple versions of the
+/// control and vtable.
+///
+/// This primary template is used whenever the specialization for
+/// `Capacity < sizeof(void*)` does not apply.
+template <std::size_t Capacity, typename = void>
+struct internal_capacity {
+ /// We extend the union through a technique similar to the tail object hack
+ typedef union {
+ /// Tag to access the structure in a type-safe way
+ data_accessor accessor_;
+ /// The internal capacity we use to allocate in-place
+ std::aligned_storage_t<Capacity> capacity_;
+ } type;
+};
+/// Specialization for capacities smaller than a pointer: the storage
+/// consists of the accessor only, without a separate in-place buffer member.
+template <std::size_t Capacity>
+struct internal_capacity<Capacity,
+ std::enable_if_t<(Capacity < sizeof(void*))>> {
+ typedef struct {
+ /// Tag to access the structure in a type-safe way
+ data_accessor accessor_;
+ } type;
+};
+
+/// Holds the storage selected by `internal_capacity<Capacity>` and exposes
+/// it as a `data_accessor` pointer for every cv-qualification of the holder.
+template <std::size_t Capacity>
+class internal_capacity_holder {
+ // The storage itself (accessor, plus the in-place buffer when the
+ // primary `internal_capacity` template is selected)
+ typename internal_capacity<Capacity>::type storage_;
+
+public:
+ constexpr internal_capacity_holder() = default;
+
+ /// Returns the address of the accessor member of the storage; the
+ /// overloads below only differ in cv-qualification.
+ constexpr data_accessor* opaque_ptr() noexcept {
+ return &storage_.accessor_;
+ }
+ constexpr data_accessor const* opaque_ptr() const noexcept {
+ return &storage_.accessor_;
+ }
+ constexpr data_accessor volatile* opaque_ptr() volatile noexcept {
+ return &storage_.accessor_;
+ }
+ constexpr data_accessor const volatile* opaque_ptr() const volatile noexcept {
+ return &storage_.accessor_;
+ }
+
+ /// Size in bytes of the whole storage object (`sizeof(storage_)`), which
+ /// may differ from the requested `Capacity`.
+ static constexpr std::size_t capacity() noexcept {
+ return sizeof(storage_);
+ }
+};
+
+/// An owning erasure
+///
+/// Keeps the erased object inside the storage inherited from
+/// `internal_capacity_holder` and drives every operation (move, copy,
+/// destroy, invoke) through `vtable_`.
+template <bool IsOwning /* = true*/, typename Config, typename Property>
+class erasure : internal_capacity_holder<Config::capacity> {
+ template <bool, typename, typename>
+ friend class erasure;
+ template <std::size_t, typename, typename...>
+ friend class operator_impl;
+
+ using vtable_t = tables::vtable<Property>;
+
+ vtable_t vtable_;
+
+public:
+ /// Returns the capacity of this erasure
+ static constexpr std::size_t capacity() noexcept {
+ return internal_capacity_holder<Config::capacity>::capacity();
+ }
+
+ /// Constructs an empty erasure
+ constexpr erasure() noexcept {
+ vtable_.set_empty();
+ }
+
+ /// Constructs an empty erasure
+ constexpr erasure(std::nullptr_t) noexcept {
+ vtable_.set_empty();
+ }
+
+ /// Asks the vtable of `right` to move its object into this storage and to
+ /// set up `vtable_` accordingly.
+ constexpr erasure(erasure&& right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ }
+
+ /// Asks the vtable of `right` to copy its object into this storage and to
+ /// set up `vtable_` accordingly.
+ constexpr erasure(erasure const& right) {
+ right.vtable_.copy(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ }
+
+ /// Converting constructor from an owning erasure with another config;
+ /// `right` is taken by value and then moved from.
+ template <typename OtherConfig>
+ constexpr erasure(erasure<true, OtherConfig, Property> right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ }
+
+ /// Erases `callable`: it is boxed together with `allocator` and the box is
+ /// handed to `vtable_t::init` along with this storage.
+ template <typename T, typename Allocator = std::allocator<std::decay_t<T>>>
+ constexpr erasure(T&& callable, Allocator&& allocator = Allocator{}) {
+ vtable_t::init(vtable_,
+ type_erasure::make_box(
+ std::integral_constant<bool, Config::is_copyable>{},
+ std::forward<T>(callable),
+ std::forward<Allocator>(allocator)),
+ this->opaque_ptr(), capacity());
+ }
+
+ ~erasure() {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ }
+
+ /// Destroys the held object (if any) via the vtable's `destroy` command
+ constexpr erasure&
+ operator=(std::nullptr_t) noexcept(Property::is_strong_exception_guaranteed) {
+ vtable_.destroy(this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ /// Move assignment: destroys the current object, then moves from `right`.
+ constexpr erasure& operator=(erasure&& right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ // Self-assignment must be a no-op: otherwise the object would be
+ // destroyed below and then moved from its own destroyed storage.
+ if (this == &right) {
+ return *this;
+ }
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ /// Copy assignment: destroys the current object, then copies from `right`.
+ constexpr erasure& operator=(erasure const& right) {
+ // Self-assignment must be a no-op: otherwise the object would be
+ // destroyed below and then copied from its own destroyed storage.
+ if (this == &right) {
+ return *this;
+ }
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ right.vtable_.copy(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ /// Converting assignment from an owning erasure with another config;
+ /// `right` is a by-value parameter, so it can never alias `*this`.
+ template <typename OtherConfig>
+ constexpr erasure&
+ operator=(erasure<true, OtherConfig, Property> right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ /// Replaces the held object by a box created from `callable`
+ template <typename T>
+ constexpr erasure& operator=(T&& callable) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ vtable_t::init(vtable_,
+ type_erasure::make_box(
+ std::integral_constant<bool, Config::is_copyable>{},
+ std::forward<T>(callable)),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ /// Replaces the held object by a box created from `callable` and
+ /// `allocator`
+ template <typename T, typename Allocator>
+ void assign(T&& callable, Allocator&& allocator) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ vtable_t::init(vtable_,
+ type_erasure::make_box(
+ std::integral_constant<bool, Config::is_copyable>{},
+ std::forward<T>(callable),
+ std::forward<Allocator>(allocator)),
+ this->opaque_ptr(), capacity());
+ }
+
+ /// Returns true when the erasure doesn't hold any erased object
+ constexpr bool empty() const noexcept {
+ return vtable_.empty();
+ }
+
+ /// Invoke the function of the erasure at the given index
+ ///
+ /// We define this out of class to be able to forward the qualified
+ /// erasure correctly.
+ template <std::size_t Index, typename Erasure, typename... Args>
+ static constexpr auto invoke(Erasure&& erasure, Args&&... args) {
+ auto const capacity = erasure.capacity();
+ return erasure.vtable_.template invoke<Index>(
+ std::forward<Erasure>(erasure).opaque_ptr(), capacity,
+ std::forward<Args>(args)...);
+ }
+};
+
+// A non owning erasure
+//
+// Stores only an invocation table and a `data_accessor` view of an object
+// that lives elsewhere; nothing is destroyed by this class.
+template </*bool IsOwning = false, */ typename Config, bool IsThrowing,
+ bool HasStrongExceptGuarantee, typename... Args>
+class erasure<false, Config,
+ property<IsThrowing, HasStrongExceptGuarantee, Args...>> {
+ template <bool, typename, typename>
+ friend class erasure;
+ template <std::size_t, typename, typename...>
+ friend class operator_impl;
+
+ using property_t = property<IsThrowing, HasStrongExceptGuarantee, Args...>;
+
+ using invoke_table_t = invocation_table::invoke_table<Args...>;
+ typename invoke_table_t::type invoke_table_;
+
+ /// The internal pointer to the non owned object
+ data_accessor view_;
+
+public:
+ /// Constructs an empty view (empty invocation table, null pointer)
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+ constexpr erasure() noexcept
+ : invoke_table_(
+ invoke_table_t::template get_empty_invocation_table<IsThrowing>()),
+ view_(nullptr) {
+ }
+
+ /// Constructs an empty view (empty invocation table, null pointer)
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+ constexpr erasure(std::nullptr_t) noexcept
+ : invoke_table_(
+ invoke_table_t::template get_empty_invocation_table<IsThrowing>()),
+ view_(nullptr) {
+ }
+
+ /// Copies table and view from `right`; unlike move assignment below,
+ /// `right` is left untouched.
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+ constexpr erasure(erasure&& right) noexcept
+ : invoke_table_(right.invoke_table_), view_(right.view_) {
+ }
+
+ constexpr erasure(erasure const& /*right*/) = default;
+
+ /// Converting constructor from a non owning erasure with another config
+ template <typename OtherConfig>
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+ constexpr erasure(erasure<false, OtherConfig, property_t> right) noexcept
+ : invoke_table_(right.invoke_table_), view_(right.view_) {
+ }
+
+ /// Creates a view of `object`: the table is the view table of the decayed
+ /// type and the pointer comes from `address_taker`.
+ template <typename T>
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+ constexpr erasure(T&& object)
+ : invoke_table_(invoke_table_t::template get_invocation_view_table_of<
+ std::decay_t<T>>()),
+ view_(address_taker<std::decay_t<T>>::take(std::forward<T>(object))) {
+ }
+
+ ~erasure() = default;
+
+ /// Resets to the empty state
+ constexpr erasure&
+ operator=(std::nullptr_t) noexcept(HasStrongExceptGuarantee) {
+ invoke_table_ =
+ invoke_table_t::template get_empty_invocation_table<IsThrowing>();
+ view_.ptr_ = nullptr;
+ return *this;
+ }
+
+ /// Takes over table and view from `right` and then resets `right` to the
+ /// empty state.
+ constexpr erasure& operator=(erasure&& right) noexcept {
+ invoke_table_ = right.invoke_table_;
+ view_ = right.view_;
+ right = nullptr;
+ return *this;
+ }
+
+ constexpr erasure& operator=(erasure const& /*right*/) = default;
+
+ // NOTE(review): this takes `erasure<true, ...>` while the matching
+ // converting constructor above takes `erasure<false, ...>`, and the body
+ // reads `invoke_table_`/`view_`, which the owning erasure shown earlier in
+ // this file does not declare. Looks like it was meant to be `false` --
+ // confirm.
+ template <typename OtherConfig>
+ constexpr erasure&
+ operator=(erasure<true, OtherConfig, property_t> right) noexcept {
+ invoke_table_ = right.invoke_table_;
+ view_ = right.view_;
+ return *this;
+ }
+
+ /// Rebinds the view to `object`
+ template <typename T>
+ constexpr erasure& operator=(T&& object) {
+ invoke_table_ = invoke_table_t::template get_invocation_view_table_of<
+ std::decay_t<T>>();
+ view_.ptr_ = address_taker<std::decay_t<T>>::take(std::forward<T>(object));
+ return *this;
+ }
+
+ /// Returns true when the erasure doesn't hold any erased object
+ constexpr bool empty() const noexcept {
+ return view_.ptr_ == nullptr;
+ }
+
+ /// Fetches the thunk at `Index` and calls it with the address of the view,
+ /// a capacity of `0UL` and the forwarded arguments.
+ template <std::size_t Index, typename Erasure, typename... T>
+ static constexpr auto invoke(Erasure&& erasure, T&&... args) {
+ auto thunk = invoke_table_t::template fetch<Index>(erasure.invoke_table_);
+ return thunk(&(erasure.view_), 0UL, std::forward<T>(args)...);
+ }
+};
+} // namespace type_erasure
+
+/// Deduces to a true_type if the type T provides the given signature and the
+/// signature is noexcept correct callable.
+///
+/// Both conditions are evaluated on `Trait::callable<T>` with
+/// `Trait::arguments`: `invocation::can_invoke` and
+/// `invocation::is_noexcept_correct` (the latter parameterized with
+/// `Trait::is_noexcept`).
+template <typename T, typename Signature,
+ typename Trait =
+ type_erasure::invocation_table::function_trait<Signature>>
+struct accepts_one
+ : std::integral_constant<
+ bool, invocation::can_invoke<typename Trait::template callable<T>,
+ typename Trait::arguments>::value &&
+ invocation::is_noexcept_correct<
+ Trait::is_noexcept::value,
+ typename Trait::template callable<T>,
+ typename Trait::arguments>::value> {};
+
+/// Deduces to a true_type if the type T provides all signatures
+///
+/// The primary template is the false case; the partial specialization is
+/// only viable when `accepts_one<T, S>` holds for every `S` of the pack
+/// wrapped in `identity<...>`.
+template <typename T, typename Signatures, typename = void>
+struct accepts_all : std::false_type {};
+template <typename T, typename... Signatures>
+struct accepts_all<
+ T, identity<Signatures...>,
+ void_t<std::enable_if_t<accepts_one<T, Signatures>::value>...>>
+ : std::true_type {};
+
+/// Compile-time check: when `Config::is_copyable` is set, the decayed `T`
+/// must be copy constructible. `type` is `void` so the check can be used in
+/// a template parameter list.
+template <typename Config, typename T>
+struct assert_wrong_copy_assign {
+ static_assert(!Config::is_copyable ||
+ std::is_copy_constructible<std::decay_t<T>>::value,
+ "Can't wrap a non copyable object into a unique function!");
+
+ using type = void;
+};
+
+/// Compile-time check: when `IsStrongExceptGuaranteed` is set, `T` must be
+/// nothrow move constructible and nothrow destructible. `type` is `void` so
+/// the check can be used in a template parameter list.
+template <bool IsStrongExceptGuaranteed, typename T>
+struct assert_no_strong_except_guarantee {
+ static_assert(
+ !IsStrongExceptGuaranteed ||
+ (std::is_nothrow_move_constructible<T>::value &&
+ std::is_nothrow_destructible<T>::value),
+ "Can't wrap a object an object that has no strong exception guarantees "
+ "if this is required by the wrapper!");
+
+ using type = void;
+};
+
+/// SFINAES out if the given callable is not copyable correct to the left one.
+/// That is: a copyable left side requires a copyable right side, while a
+/// non copyable left side accepts both.
+template <typename LeftConfig, typename RightConfig>
+using enable_if_copyable_correct_t =
+ std::enable_if_t<(!LeftConfig::is_copyable || RightConfig::is_copyable)>;
+
+/// True when both configs agree on `is_owning`
+template <typename LeftConfig, typename RightConfig>
+using is_owning_correct =
+ std::integral_constant<bool,
+ (LeftConfig::is_owning == RightConfig::is_owning)>;
+
+/// SFINAES out if the given function2 is not owning correct to this one,
+/// i.e. when `is_owning_correct` is false for the two configs.
+template <typename LeftConfig, typename RightConfig>
+using enable_if_owning_correct_t =
+ std::enable_if_t<is_owning_correct<LeftConfig, RightConfig>::value>;
+
+/// The function wrapper itself: a thin layer around `erasure_t` which adds
+/// the SFINAE constraints and compile-time checks for construction and
+/// assignment, and inherits its call operators from `operator_impl`.
+template <typename Config, bool IsThrowing, bool HasStrongExceptGuarantee,
+ typename... Args>
+class function<Config, property<IsThrowing, HasStrongExceptGuarantee, Args...>>
+ : type_erasure::invocation_table::operator_impl<
+ 0U,
+ function<Config,
+ property<IsThrowing, HasStrongExceptGuarantee, Args...>>,
+ Args...> {
+
+ template <typename, typename>
+ friend class function;
+
+ template <std::size_t, typename, typename...>
+ friend class type_erasure::invocation_table::operator_impl;
+
+ using property_t = property<IsThrowing, HasStrongExceptGuarantee, Args...>;
+ /// Owning or non owning erasure, selected by `Config::is_owning`
+ using erasure_t =
+ type_erasure::erasure<Config::is_owning, Config, property_t>;
+
+ /// Enabled when the decayed `T` accepts every signature in `Args...`
+ template <typename T>
+ using enable_if_can_accept_all_t =
+ std::enable_if_t<accepts_all<std::decay_t<T>, identity<Args...>>::value>;
+
+ /// True for `function` types with the same property whose config is both
+ /// copyable correct and owning correct with respect to this one.
+ template <typename Function, typename = void>
+ struct is_convertible_to_this : std::false_type {};
+ template <typename RightConfig>
+ struct is_convertible_to_this<
+ function<RightConfig, property_t>,
+ void_t<enable_if_copyable_correct_t<Config, RightConfig>,
+ enable_if_owning_correct_t<Config, RightConfig>>>
+ : std::true_type {};
+
+ /// Keeps the generic callable overloads from competing with the
+ /// function-to-function conversions.
+ template <typename T>
+ using enable_if_not_convertible_to_this =
+ std::enable_if_t<!is_convertible_to_this<std::decay_t<T>>::value>;
+
+ /// Enabled only for owning configs; `is_same<T, T>` merely makes the
+ /// condition depend on `T`.
+ template <typename T>
+ using enable_if_owning_t =
+ std::enable_if_t<std::is_same<T, T>::value && Config::is_owning>;
+
+ template <typename T>
+ using assert_wrong_copy_assign_t =
+ typename assert_wrong_copy_assign<Config, std::decay_t<T>>::type;
+
+ template <typename T>
+ using assert_no_strong_except_guarantee_t =
+ typename assert_no_strong_except_guarantee<HasStrongExceptGuarantee,
+ std::decay_t<T>>::type;
+
+ erasure_t erasure_;
+
+public:
+ /// Default constructor which empty constructs the function
+ function() = default;
+ ~function() = default;
+
+ explicit constexpr function(function const& /*right*/) = default;
+ explicit constexpr function(function&& /*right*/) = default;
+
+ /// Copy construction from another copyable function
+ template <typename RightConfig,
+ std::enable_if_t<RightConfig::is_copyable>* = nullptr,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ constexpr function(function<RightConfig, property_t> const& right)
+ : erasure_(right.erasure_) {
+ }
+
+ /// Move construction from another function
+ template <typename RightConfig,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ constexpr function(function<RightConfig, property_t>&& right)
+ : erasure_(std::move(right.erasure_)) {
+ }
+
+ /// Construction from a callable object which overloads the `()` operator
+ template <typename T, //
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ constexpr function(T&& callable) : erasure_(std::forward<T>(callable)) {
+ }
+ /// Construction from a callable object and an allocator; only available
+ /// for owning configs (see `enable_if_owning_t`).
+ template <typename T, typename Allocator, //
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ enable_if_owning_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ constexpr function(T&& callable, Allocator&& allocator)
+ : erasure_(std::forward<T>(callable),
+ std::forward<Allocator>(allocator)) {
+ }
+
+ /// Empty constructs the function
+ constexpr function(std::nullptr_t np) : erasure_(np) {
+ }
+
+ function& operator=(function const& /*right*/) = default;
+ function& operator=(function&& /*right*/) = default;
+
+ /// Copy assigning from another copyable function
+ template <typename RightConfig,
+ std::enable_if_t<RightConfig::is_copyable>* = nullptr,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ function& operator=(function<RightConfig, property_t> const& right) {
+ erasure_ = right.erasure_;
+ return *this;
+ }
+
+ /// Move assigning from another function
+ template <typename RightConfig,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ function& operator=(function<RightConfig, property_t>&& right) {
+ erasure_ = std::move(right.erasure_);
+ return *this;
+ }
+
+ /// Move assigning from a callable object
+ template <typename T, // ...
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ function& operator=(T&& callable) {
+ erasure_ = std::forward<T>(callable);
+ return *this;
+ }
+
+ /// Clears the function
+ function& operator=(std::nullptr_t np) {
+ erasure_ = np;
+ return *this;
+ }
+
+ /// Returns true when the function is empty
+ bool empty() const noexcept {
+ return erasure_.empty();
+ }
+
+ /// Returns true when the function isn't empty
+ explicit operator bool() const noexcept {
+ return !empty();
+ }
+
+ /// Assigns a new target with an optional allocator
+ template <typename T, typename Allocator = std::allocator<std::decay_t<T>>,
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ void assign(T&& callable, Allocator&& allocator = Allocator{}) {
+ erasure_.assign(std::forward<T>(callable),
+ std::forward<Allocator>(allocator));
+ }
+
+ /// Swaps this function with the given function
+ ///
+ /// Swapping with itself is a no-op; otherwise the swap is a three step
+ /// move through a temporary.
+ void swap(function& other) noexcept(HasStrongExceptGuarantee) {
+ if (&other == this) {
+ return;
+ }
+
+ function cache = std::move(other);
+ other = std::move(*this);
+ *this = std::move(cache);
+ }
+
+ /// Swaps the left function with the right one
+ friend void swap(function& left,
+ function& right) noexcept(HasStrongExceptGuarantee) {
+ left.swap(right);
+ }
+
+ /// Calls the wrapped callable object
+ using type_erasure::invocation_table::operator_impl<
+ 0U, function<Config, property_t>, Args...>::operator();
+};
+
+/// A function compares equal to `nullptr` exactly when it is empty.
+template <typename Config, typename Property>
+bool operator==(function<Config, Property> const& f, std::nullptr_t) {
+  return f.empty();
+}
+
+/// A function compares unequal to `nullptr` exactly when it holds a target.
+template <typename Config, typename Property>
+bool operator!=(function<Config, Property> const& f, std::nullptr_t) {
+  return !f.empty();
+}
+
+/// `nullptr == f` holds exactly when `f` is empty.
+template <typename Config, typename Property>
+bool operator==(std::nullptr_t, function<Config, Property> const& f) {
+  return f.empty();
+}
+
+/// `nullptr != f` holds exactly when `f` holds a target.
+template <typename Config, typename Property>
+bool operator!=(std::nullptr_t, function<Config, Property> const& f) {
+  return !f.empty();
+}
+
+// Default object size of the function (32 bytes)
+using object_size = std::integral_constant<std::size_t, 32U>;
+
+// Default capacity for small functor optimization: the default object size
+// minus the size of two pointers
+using default_capacity =
+ std::integral_constant<std::size_t,
+ object_size::value - (2 * sizeof(void*))>;
+} // namespace detail
+} // namespace abi_310
+
+/// Adaptable function wrapper base for arbitrary functional types.
+template <
+ /// Defines whether the function owns its target. The `function_view`
+ /// alias in this header passes `false` here, so the original remark that
+ /// this is only "a placeholder for future non owning support" appears to
+ /// be outdated.
+ bool IsOwning,
+ /// Defines whether the function is copyable or not
+ bool IsCopyable,
+ /// Defines the internal capacity of the function
+ /// for small functor optimization.
+ /// The size of the whole function object will be the capacity plus
+ /// the size of two pointers.
+ /// If the capacity is zero, the size will increase through one additional
+ /// pointer so the whole object has the size of 3 * sizeof(void*).
+ std::size_t Capacity,
+ /// Defines whether the function throws an exception on empty function
+ /// call, `std::abort` is called otherwise.
+ bool IsThrowing,
+ /// Defines whether all objects satisfy the strong exception guarantees,
+ /// which means the function type will satisfy the strong exception
+ /// guarantees too.
+ bool HasStrongExceptGuarantee,
+ /// Defines the signature of the function wrapper
+ typename... Signatures>
+using function_base = detail::function<
+ detail::config<IsOwning, IsCopyable, Capacity>,
+ detail::property<IsThrowing, HasStrongExceptGuarantee, Signatures...>>;
+
+/// An owning copyable function wrapper for arbitrary callable types.
+/// Uses the default capacity, `IsThrowing = true` and
+/// `HasStrongExceptGuarantee = false`.
+template <typename... Signatures>
+using function = function_base<true, true, detail::default_capacity::value,
+ true, false, Signatures...>;
+
+/// An owning non copyable function wrapper for arbitrary callable types.
+/// Uses the default capacity, `IsThrowing = true` and
+/// `HasStrongExceptGuarantee = false`.
+template <typename... Signatures>
+using unique_function =
+ function_base<true, false, detail::default_capacity::value, true, false,
+ Signatures...>;
+
+/// A non owning copyable function wrapper for arbitrary callable types.
+/// Uses the default capacity, `IsThrowing = true` and
+/// `HasStrongExceptGuarantee = false`.
+template <typename... Signatures>
+using function_view =
+ function_base<false, true, detail::default_capacity::value, true, false,
+ Signatures...>;
+
+#if !defined(FU2_HAS_DISABLED_EXCEPTIONS)
+/// Exception type that is thrown when invoking empty function objects
+/// and exception support isn't disabled.
+///
+/// Exception support is enabled if
+/// the template parameter 'Throwing' is set to true (default).
+///
+/// This type will default to std::bad_function_call if the
+/// functional header is used, otherwise the library provides its own type.
+///
+/// You may disable the inclusion of the functional header
+/// through defining `FU2_WITH_NO_FUNCTIONAL_HEADER`.
+///
+using detail::type_erasure::invocation_table::bad_function_call;
+#endif
+
+/// Returns a callable object, which unifies all callable objects
+/// that were passed to this function.
+///
+/// ```cpp
+/// auto overloaded = fu2::overload([](std::true_type) { return true; },
+/// [](std::false_type) { return false; });
+/// ```
+///
+/// \param callables A pack of callable objects with arbitrary signatures.
+///
+/// \returns A callable object which exposes the call operators of all
+/// passed callables (as built by `detail::overloading::overload`,
+/// to which the arguments are perfectly forwarded).
+///
+template <typename... T>
+constexpr auto overload(T&&... callables) {
+ return detail::overloading::overload(std::forward<T>(callables)...);
+}
+} // namespace fu2
+
+#undef FU2_EXPAND_QUALIFIERS
+#undef FU2_EXPAND_QUALIFIERS_NOEXCEPT
+
+#endif // FU2_INCLUDED_FUNCTION2_HPP_
diff --git a/src/include/hash.h b/src/include/hash.h
new file mode 100644
index 000000000..2ab95448b
--- /dev/null
+++ b/src/include/hash.h
@@ -0,0 +1,64 @@
+#ifndef CEPH_HASH_H
+#define CEPH_HASH_H
+
+#include "acconfig.h"
+
+// Robert Jenkins' function for mixing 32-bit values
+// http://burtleburtle.net/bob/hash/evahash.html
+// a, b = random bits, c = input and output
+//
+// All three arguments are assigned to, so they must be modifiable lvalues.
+// NOTE(review): the expansion is a bare sequence of statements that already
+// ends in ';' and the arguments are not parenthesized, so this is not safe
+// as the body of an unbraced if/else or with expression arguments.
+
+#define hashmix(a,b,c) \
+ a=a-b; a=a-c; a=a^(c>>13); \
+ b=b-c; b=b-a; b=b^(a<<8); \
+ c=c-a; c=c-b; c=c^(b>>13); \
+ a=a-b; a=a-c; a=a^(c>>12); \
+ b=b-c; b=b-a; b=b^(a<<16); \
+ c=c-a; c=c-b; c=c^(b>>5); \
+ a=a-b; a=a-c; a=a^(c>>3); \
+ b=b-c; b=b-a; b=b^(a<<10); \
+ c=c-a; c=c-b; c=c^(b>>15);
+
+
+//namespace ceph {
+
+// Hash functor template; the primary template is intentionally empty and
+// only the explicit specializations in this header provide operator().
+template <class _Key> struct rjhash { };
+
+// Mixes a 64-bit key through a fixed sequence of shift/add/xor rounds and
+// returns the scrambled value. All arithmetic wraps modulo 2^64.
+inline uint64_t rjhash64(uint64_t key) {
+  key = ~key + (key << 21);        // same as (key << 21) - key - 1
+  key ^= key >> 24;
+  key += (key << 3) + (key << 8);  // key * 265
+  key ^= key >> 14;
+  key += (key << 2) + (key << 4);  // key * 21
+  key ^= key >> 28;
+  key += key << 31;
+  return key;
+}
+
+// Mixes a 32-bit value through six add/xor/shift rounds with fixed
+// constants and returns the scrambled value. Arithmetic wraps modulo 2^32.
+inline uint32_t rjhash32(uint32_t a) {
+  a += 0x7ed55d16 + (a << 12);
+  a ^= 0xc761c23c ^ (a >> 19);
+  a += 0x165667b1 + (a << 5);
+  a = (a + 0xd3a2646c) ^ (a << 9);
+  a += 0xfd7046c5 + (a << 3);
+  a ^= 0xb55a4f09 ^ (a >> 16);
+  return a;
+}
+
+
+// Hash functor for 32-bit keys: forwards to rjhash32().
+template<> struct rjhash<uint32_t> {
+  size_t operator()(const uint32_t x) const {
+    return static_cast<size_t>(rjhash32(x));
+  }
+};
+
+// Hash functor for 64-bit keys: forwards to rjhash64().
+template<> struct rjhash<uint64_t> {
+  size_t operator()(const uint64_t x) const {
+    return static_cast<size_t>(rjhash64(x));
+  }
+};
+
+//}
+
+
+
+#endif
diff --git a/src/include/health.h b/src/include/health.h
new file mode 100644
index 000000000..03191eff7
--- /dev/null
+++ b/src/include/health.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "include/encoding.h"
+
+// health_status_t
+//
+// The numeric values are what the encode()/denc helpers in this header
+// write as a single uint8_t, so they must not be renumbered.
+enum health_status_t {
+ HEALTH_ERR = 0,
+ HEALTH_WARN = 1,
+ HEALTH_OK = 2,
+};
+
+// Appends the status to bl as a single uint8_t.
+inline void encode(health_status_t hs, ceph::buffer::list& bl) {
+  using ceph::encode;
+  uint8_t raw = static_cast<uint8_t>(hs);
+  encode(raw, bl);
+}
+// Reads one uint8_t from p and stores it in hs; the byte is converted
+// without any range check.
+inline void decode(health_status_t& hs, ceph::buffer::list::const_iterator& p) {
+  using ceph::decode;
+  uint8_t raw;
+  decode(raw, p);
+  hs = static_cast<health_status_t>(raw);
+}
+// denc support for health_status_t: the value is always carried as exactly
+// one uint8_t (bounded, not feature dependent, no contiguous buffer needed).
+template<>
+struct denc_traits<health_status_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+ // Accounts for the single encoded byte.
+ // NOTE(review): the first parameter is a ceph::buffer::ptr rather than a
+ // health_status_t, unlike encode()/decode() below -- confirm this is
+ // intended.
+ static void bound_encode(const ceph::buffer::ptr& v, size_t& p, uint64_t f=0) {
+ p++;
+ }
+ static void encode(const health_status_t& v,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f=0) {
+ ::denc((uint8_t)v, p);
+ }
+ // Decodes one byte and converts it without a range check.
+ static void decode(health_status_t& v, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f=0) {
+ uint8_t tmp;
+ ::denc(tmp, p);
+ v = health_status_t(tmp);
+ }
+ // Same as above, for a buffer::list iterator.
+ static void decode(health_status_t& v, ceph::buffer::list::const_iterator& p,
+ uint64_t f=0) {
+ uint8_t tmp;
+ ::denc(tmp, p);
+ v = health_status_t(tmp);
+ }
+};
+
+// Streams the symbolic name of the status ("HEALTH_ERR", "HEALTH_WARN" or
+// "HEALTH_OK"). Values outside the enumeration write nothing.
+inline std::ostream& operator<<(std::ostream &oss, const health_status_t status) {
+  const char *label = nullptr;
+  switch (status) {
+  case HEALTH_ERR:
+    label = "HEALTH_ERR";
+    break;
+  case HEALTH_WARN:
+    label = "HEALTH_WARN";
+    break;
+  case HEALTH_OK:
+    label = "HEALTH_OK";
+    break;
+  }
+  if (label != nullptr) {
+    oss << label;
+  }
+  return oss;
+}
+
+// Returns a short tag for the status: "ERR", "WRN" or "OK", and "???" for
+// any value outside the enumeration.
+inline const char *short_health_string(const health_status_t status) {
+  if (status == HEALTH_ERR) {
+    return "ERR";
+  }
+  if (status == HEALTH_WARN) {
+    return "WRN";
+  }
+  if (status == HEALTH_OK) {
+    return "OK";
+  }
+  return "???";
+}
diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h
new file mode 100644
index 000000000..48d889763
--- /dev/null
+++ b/src/include/inline_memory.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_INLINE_MEMORY_H
+#define CEPH_INLINE_MEMORY_H
+
+#if defined(__GNUC__)
+
+// Copies l bytes from src to dest and returns dest.
+//
+// Copies longer than inline_len go straight to memcpy(). Shorter ones are
+// the common case: the sizes 8, 4, 3, 2 and 1 use a fixed-size builtin copy,
+// every other size is copied in 8-byte, then 4-byte, then 1-byte steps.
+static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+                                        size_t inline_len)
+  __attribute__((always_inline));
+
+void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+                          size_t inline_len)
+{
+  if (l > inline_len) {
+    return memcpy(dest, src, l);
+  }
+  switch (l) {
+  case 8:
+    return __builtin_memcpy(dest, src, 8);
+  case 4:
+    return __builtin_memcpy(dest, src, 4);
+  case 3:
+    return __builtin_memcpy(dest, src, 3);
+  case 2:
+    return __builtin_memcpy(dest, src, 2);
+  case 1:
+    return __builtin_memcpy(dest, src, 1);
+  default: {
+    char *out = (char*)dest;
+    const char *in = (const char*)src;
+    // widest chunks first
+    for (; l >= sizeof(uint64_t); l -= sizeof(uint64_t)) {
+      __builtin_memcpy(out, in, sizeof(uint64_t));
+      out += sizeof(uint64_t);
+      in += sizeof(uint64_t);
+    }
+    for (; l >= sizeof(uint32_t); l -= sizeof(uint32_t)) {
+      __builtin_memcpy(out, in, sizeof(uint32_t));
+      out += sizeof(uint32_t);
+      in += sizeof(uint32_t);
+    }
+    // remaining tail, one byte at a time
+    for (; l > 0; --l) {
+      *out++ = *in++;
+    }
+  }
+  }
+  return dest;
+}
+
+#else
+
+#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l)
+
+#endif
+
+
+#if defined(__GNUC__) && defined(__x86_64__)
+
+// 128-bit unsigned integer type, declared through the GCC "TI" machine mode
+// and made visible in the global namespace as well.
+namespace ceph {
+typedef unsigned uint128_t __attribute__ ((mode (TI)));
+}
+using ceph::uint128_t;
+
+// Returns true when all len bytes starting at data are zero (and for
+// len == 0). x86-64/GCC variant that checks 16 bytes at a time where it can.
+static inline bool mem_is_zero(const char *data, size_t len)
+ __attribute__((always_inline));
+
+bool mem_is_zero(const char *data, size_t len)
+{
+ // we do have XMM registers in x86-64, so if we need to check at least
+ // 16 bytes, make use of them
+ if (len / sizeof(uint128_t) > 0) {
+ // align data pointer to 16 bytes, otherwise it'll segfault due to bug
+ // in (at least some) GCC versions (using MOVAPS instead of MOVUPS).
+ // check up to 15 first bytes while at it.
+ while (((unsigned long long)data) & 15) {
+ if (*(uint8_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint8_t);
+ --len;
+ }
+
+ // data is now 16-byte aligned; scan as many whole 16-byte words as the
+ // remaining length allows
+ const char* data_start = data;
+ const char* max128 = data + (len / sizeof(uint128_t))*sizeof(uint128_t);
+
+ while (data < max128) {
+ if (*(uint128_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint128_t);
+ }
+ // account for the bytes consumed by the 16-byte loop
+ len -= (data - data_start);
+ }
+
+ // remainder (or the whole buffer when len < 16): 4 bytes at a time, then
+ // single bytes
+ const char* max = data + len;
+ const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t);
+ while (data < max32) {
+ if (*(uint32_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint32_t);
+ }
+ while (data < max) {
+ if (*(uint8_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint8_t);
+ }
+ return true;
+}
+
+#else // gcc and x86_64
+
+// Returns true when all len bytes starting at data are zero (and for
+// len == 0). Portable variant: checks 8 bytes at a time, then the tail
+// byte by byte.
+static inline bool mem_is_zero(const char *data, size_t len) {
+  const char *end = data + len;
+  const char *end64 = data + (len / sizeof(uint64_t)) * sizeof(uint64_t);
+
+  while (data < end64) {
+    // data carries no alignment guarantee, so load the word with memcpy
+    // instead of dereferencing a casted pointer: a misaligned uint64_t load
+    // is undefined behavior and can trap on strict-alignment targets.
+    uint64_t word;
+    memcpy(&word, data, sizeof(word));
+    if (word != 0) {
+      return false;
+    }
+    data += sizeof(word);
+  }
+
+  while (data < end) {
+    if (*data != 0) {
+      return false;
+    }
+    ++data;
+  }
+  return true;
+}
+
+#endif // !x86_64
+
+#endif
diff --git a/src/include/int_types.h b/src/include/int_types.h
new file mode 100644
index 000000000..a704ba71d
--- /dev/null
+++ b/src/include/int_types.h
@@ -0,0 +1,56 @@
+#ifndef CEPH_INTTYPES_H
+#define CEPH_INTTYPES_H
+
+#include "acconfig.h"
+
+#include <inttypes.h>
+
+// On Linux the kernel-style fixed width types (__u8 ... __s64) come from
+// <linux/types.h>; elsewhere each one is defined here unless the matching
+// HAVE___* macro says it already exists.
+#ifdef __linux__
+#include <linux/types.h>
+#else
+#ifndef HAVE___U8
+typedef uint8_t __u8;
+#endif
+
+#ifndef HAVE___S8
+typedef int8_t __s8;
+#endif
+
+#ifndef HAVE___U16
+typedef uint16_t __u16;
+#endif
+
+#ifndef HAVE___S16
+typedef int16_t __s16;
+#endif
+
+#ifndef HAVE___U32
+typedef uint32_t __u32;
+#endif
+
+#ifndef HAVE___S32
+typedef int32_t __s32;
+#endif
+
+#ifndef HAVE___U64
+typedef uint64_t __u64;
+#endif
+
+#ifndef HAVE___S64
+typedef int64_t __s64;
+#endif
+#endif /* !__linux__ */
+
+// Boost.MPL configuration defaults; each is only set when not already
+// defined by the including translation unit.
+#ifndef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#endif
+
+#ifndef BOOST_MPL_LIMIT_VECTOR_SIZE
+#define BOOST_MPL_LIMIT_VECTOR_SIZE 30 // or whatever you need
+#endif
+
+#ifndef BOOST_MPL_LIMIT_MAP_SIZE
+#define BOOST_MPL_LIMIT_MAP_SIZE 30 // or whatever you need
+#endif
+
+#endif
diff --git a/src/include/intarith.h b/src/include/intarith.h
new file mode 100644
index 000000000..e912cbe7b
--- /dev/null
+++ b/src/include/intarith.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INTARITH_H
+#define CEPH_INTARITH_H
+
+#include <type_traits>
+
+// Return ceil(n / d) for non-negative n and positive d.
+//
+// Computed as the quotient plus one when there is a remainder, rather than
+// (n + d - 1) / d, so that no intermediate sum can overflow (or wrap, for
+// unsigned types) when n is close to the maximum of its type.
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> div_round_up(T n, U d) {
+  return n / d + (n % d != 0 ? 1 : 0);
+}
+
+
+// Round n up to the next multiple of d.  A value that is already a multiple
+// of d is returned unchanged; d does not have to be a power of 2.
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> round_up_to(T n, U d) {
+  const auto rem = n % d;
+  if (rem == 0) {
+    return n;
+  }
+  return n + d - rem;
+}
+
+// Return ceil(x / 2^y): x shifted right by y bits, rounding up.
+//
+// The rounding bias is formed as T(1) << y so the shift happens in x's own
+// type.  A plain `1 << y` is an int shift, which is undefined once y reaches
+// the width of int even when x is a 64-bit value.
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> shift_round_up(T x, U y) {
+  return (x + (static_cast<T>(1) << y) - 1) >> y;
+}
+
+/*
+ * Test whether x has at most one bit set, i.e. whether it is a power of 2.
+ * Note that 0 has no bits set and is therefore reported as true as well.
+ */
+template<typename T>
+constexpr inline bool isp2(T x) {
+  return !(x & (x - 1));
+}
+
+/*
+ * Wrappers for various sorts of alignment and rounding. The "align" must
+ * be a power of 2. Often times it is a block, sector, or page.
+ */
+
+/*
+ * Round x down to a multiple of align (align must be a power of 2) by
+ * clearing the bits below the alignment.
+ * eg, p2align(1200, 1024) == 1024 (1*align)
+ * eg, p2align(1024, 1024) == 1024 (1*align)
+ * eg, p2align(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, p2align(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2align(T x, T align) {
+  // ~(align - 1) is the same bit pattern as -align
+  return x & ~(align - 1);
+}
+
+/*
+ * Offset of x within its align-sized block, i.e. x % align for a
+ * power-of-2 align.
+ * eg, p2phase(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, p2phase(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+template<typename T>
+constexpr inline T p2phase(T x, T align) {
+  const T low_bits = align - 1;  // mask of the bits below the alignment
+  return x & low_bits;
+}
+
+/*
+ * Distance from x up to the next align boundary (align must be a power of
+ * 2); 0 when x already sits on a boundary.
+ * eg, p2nphase(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, p2nphase(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+template<typename T>
+constexpr inline T p2nphase(T x, T align) {
+  const T low_bits = align - 1;  // mask of the bits below the alignment
+  return -x & low_bits;
+}
+
+/*
+ * Round x up to a multiple of align (align must be a power of 2); a value
+ * already on a boundary is returned unchanged.
+ * eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, p2roundup(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2roundup(T x, T align) {
+  // rounding -x down to a boundary and negating again rounds x up
+  const T neg_floor = -x & -align;
+  return -neg_floor;
+}
+
+// count trailing zeros.
+// NOTE: the __builtin_ctz*() result is undefined for a 0 argument, so 0 is
+// answered up front with the full bit width of T.
+//
+// Three overloads follow, one per builtin; the enable_if conditions select
+// the one whose operand (unsigned, unsigned long, unsigned long long) is the
+// narrowest that holds T.
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned> ctz(T v) {
+  return v != 0 ? static_cast<unsigned>(__builtin_ctz(v))
+                : static_cast<unsigned>(sizeof(v) * 8);
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned> ctz(T v) {
+  return v != 0 ? static_cast<unsigned>(__builtin_ctzl(v))
+                : static_cast<unsigned>(sizeof(v) * 8);
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned> ctz(T v) {
+  return v != 0 ? static_cast<unsigned>(__builtin_ctzll(v))
+                : static_cast<unsigned>(sizeof(v) * 8);
+}
+
+// count leading zeros
+// NOTE: the __builtin_clz*() result is undefined for a 0 argument, so 0 is
+// answered up front with the full bit width of T.
+//
+// The count is always relative to T's own width.  For types narrower than
+// unsigned that means clz(uint8_t(1)) == 7, which is consistent with the 8
+// returned for uint8_t(0); counting in the promoted 32-bit operand would
+// give 31 there.
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned> clz(T v) {
+  constexpr unsigned type_bits = sizeof(T) * 8;
+  constexpr unsigned pad_bits = sizeof(unsigned) * 8 - type_bits;
+  // __builtin_clz() works on a full unsigned: keep only T's own bits (this
+  // drops the sign extension of negative narrow values) and discount the
+  // padding bits above them.
+  const unsigned u = static_cast<unsigned>(v) & (~0u >> pad_bits);
+  if (u == 0)
+    return type_bits;
+  return __builtin_clz(u) - pad_bits;
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned> clz(T v) {
+  if (v == 0)
+    return sizeof(v) * 8;
+  return __builtin_clzl(v);
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned> clz(T v) {
+  if (v == 0)
+    return sizeof(v) * 8;
+  return __builtin_clzll(v);
+}
+
+// count bits: the number of bits needed to represent v, i.e. the position
+// of the highest set bit plus one (0 for v == 0).
+//
+// For types narrower than unsigned the leading-zero count is taken from the
+// full unsigned operand that __builtin_clz() sees and subtracted from that
+// operand's width.  Subtracting it from sizeof(T) * 8 would underflow
+// (8 - 31 for uint8_t(1)).
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned> cbits(T v) {
+  constexpr unsigned word_bits = sizeof(unsigned) * 8;
+  // keep only T's own bits so a negative narrow value is not sign-extended
+  const unsigned u =
+    static_cast<unsigned>(v) & (~0u >> (word_bits - sizeof(T) * 8));
+  if (u == 0)
+    return 0;
+  return word_bits - __builtin_clz(u);
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned> cbits(T v) {
+  if (v == 0)
+    return 0;
+  return (sizeof(v) * 8) - __builtin_clzl(v);
+}
+
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned> cbits(T v) {
+  if (v == 0)
+    return 0;
+  return (sizeof(v) * 8) - __builtin_clzll(v);
+}
+
+#endif
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
new file mode 100644
index 000000000..dfb2a306c
--- /dev/null
+++ b/src/include/interval_set.h
@@ -0,0 +1,824 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_INTERVAL_SET_H
+#define CEPH_INTERVAL_SET_H
+
+#include <iterator>
+#include <map>
+#include <ostream>
+
+#include "encoding.h"
+
+/*
+ * *** NOTE ***
+ *
+ * This class is written to work with a variety of map-like containers,
+ * *including* ones that invalidate iterators when they are modified (e.g.,
+ * flat_map and btree_map).
+ */
+
+template<typename T, template<typename, typename, typename ...> class C = std::map>
+class interval_set {
+ public:
+ using Map = C<T, T>;
+ using value_type = typename Map::value_type;
+ using offset_type = T;
+ using length_type = T;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using size_type = typename Map::size_type;
+
+ class const_iterator;
+
+ class iterator
+ {
+ public:
+ using difference_type = ssize_t;
+ using value_type = typename Map::value_type;
+ using pointer = typename Map::value_type*;
+ using reference = typename Map::value_type&;
+ using iterator_category = std::forward_iterator_tag;
+
+ explicit iterator(typename Map::iterator iter)
+ : _iter(iter)
+ { }
+
+ // For the copy constructor and assignment operator, the compiler-generated functions, which
+ // perform simple bitwise copying, should be fine.
+
+ bool operator==(const iterator& rhs) const {
+ return (_iter == rhs._iter);
+ }
+
+ bool operator!=(const iterator& rhs) const {
+ return (_iter != rhs._iter);
+ }
+
+ // Dereference this iterator to get a pair.
+ reference operator*() const {
+ return *_iter;
+ }
+
+ // Return the interval start.
+ offset_type get_start() const {
+ return _iter->first;
+ }
+
+ // Return the interval length.
+ length_type get_len() const {
+ return _iter->second;
+ }
+
+ offset_type get_end() const {
+ return _iter->first + _iter->second;
+ }
+
+ // Set the interval length.
+ void set_len(const length_type& len) {
+ _iter->second = len;
+ }
+
+ // Preincrement
+ iterator& operator++()
+ {
+ ++_iter;
+ return *this;
+ }
+
+ // Postincrement
+ iterator operator++(int)
+ {
+ iterator prev(_iter);
+ ++_iter;
+ return prev;
+ }
+
+ // Predecrement
+ iterator& operator--()
+ {
+ --_iter;
+ return *this;
+ }
+
+ // Postdecrement
+ iterator operator--(int)
+ {
+ iterator prev(_iter);
+ --_iter;
+ return prev;
+ }
+
+ friend class interval_set::const_iterator;
+
+ protected:
+ typename Map::iterator _iter;
+ friend class interval_set;
+ };
+
+ class const_iterator
+ {
+ public:
+ using difference_type = ssize_t;
+ using value_type = const typename Map::value_type;
+ using pointer = const typename Map::value_type*;
+ using reference = const typename Map::value_type&;
+ using iterator_category = std::forward_iterator_tag;
+
+ explicit const_iterator(typename Map::const_iterator iter)
+ : _iter(iter)
+ { }
+
+ const_iterator(const iterator &i)
+ : _iter(i._iter)
+ { }
+
+ // For the copy constructor and assignment operator, the compiler-generated functions, which
+ // perform simple bitwise copying, should be fine.
+
+ bool operator==(const const_iterator& rhs) const {
+ return (_iter == rhs._iter);
+ }
+
+ bool operator!=(const const_iterator& rhs) const {
+ return (_iter != rhs._iter);
+ }
+
+ // Dereference this iterator to get a pair.
+ reference operator*() const {
+ return *_iter;
+ }
+
+ // Return the interval start.
+ offset_type get_start() const {
+ return _iter->first;
+ }
+ offset_type get_end() const {
+ return _iter->first + _iter->second;
+ }
+
+ // Return the interval length.
+ length_type get_len() const {
+ return _iter->second;
+ }
+
+ // Preincrement
+ const_iterator& operator++()
+ {
+ ++_iter;
+ return *this;
+ }
+
+ // Postincrement
+ const_iterator operator++(int)
+ {
+ const_iterator prev(_iter);
+ ++_iter;
+ return prev;
+ }
+
+    // Predecrement: step back to the previous interval.
+    // The result type is const_iterator&; *this is a const_iterator and
+    // cannot be bound to a reference to the mutable iterator type.
+    const_iterator& operator--()
+    {
+      --_iter;
+      return *this;
+    }
+
+    // Postdecrement: step back to the previous interval and return a copy
+    // of the position held before the step.
+    // The copy is a const_iterator; the mutable iterator type cannot be
+    // constructed from the Map::const_iterator stored in _iter.
+    const_iterator operator--(int)
+    {
+      const_iterator prev(_iter);
+      --_iter;
+      return prev;
+    }
+
+ protected:
+ typename Map::const_iterator _iter;
+ };
+
+ interval_set() = default;
+ interval_set(Map&& other) {
+ m.swap(other);
+ for (const auto& p : m) {
+ _size += p.second;
+ }
+ }
+
+ size_type num_intervals() const
+ {
+ return m.size();
+ }
+
+ iterator begin() {
+ return iterator(m.begin());
+ }
+
+ iterator lower_bound(T start) {
+ return iterator(find_inc_m(start));
+ }
+
+ iterator end() {
+ return iterator(m.end());
+ }
+
+ const_iterator begin() const {
+ return const_iterator(m.begin());
+ }
+
+ const_iterator lower_bound(T start) const {
+ return const_iterator(find_inc(start));
+ }
+
+ const_iterator end() const {
+ return const_iterator(m.end());
+ }
+
+ // helpers
+ private:
+ auto find_inc(T start) const {
+ auto p = m.lower_bound(start); // p->first >= start
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might overlap?
+ if (p->first + p->second <= start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+ auto find_inc_m(T start) {
+ auto p = m.lower_bound(start);
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might overlap?
+ if (p->first + p->second <= start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+ auto find_adj(T start) const {
+ auto p = m.lower_bound(start);
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might touch?
+ if (p->first + p->second < start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+ auto find_adj_m(T start) {
+ auto p = m.lower_bound(start);
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might touch?
+ if (p->first + p->second < start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+  // Add the intersection of s and l to this set.  intersection_of() calls
+  // this right after clear(), passing the smaller set as s and the larger
+  // one as l; s must not be empty (asserted).
+  //
+  // Instead of walking l sequentially, each step looks up the position in l
+  // for the current offset with find_inc(), so long stretches of l that
+  // cannot intersect s are skipped.
+  void intersection_size_asym(const interval_set &s, const interval_set &l) {
+    auto ps = s.m.begin();
+    ceph_assert(ps != s.m.end());
+    auto offset = ps->first;
+    auto mi = m.begin();  // insertion hint; results are produced in order
+
+    while (1) {
+      // interval of l containing offset, or the first one after it
+      auto pl = l.find_inc(offset);
+      if (pl == l.m.end())
+        break;
+      // drop intervals of s that end before pl begins
+      while (ps != s.m.end() && ps->first + ps->second <= pl->first)
+        ++ps;
+      if (ps == s.m.end())
+        break;
+      offset = pl->first + pl->second;
+      if (offset <= ps->first) {
+        // pl ends before ps begins: look l up again from ps
+        offset = ps->first;
+        continue;
+      }
+
+      // fast path: copy runs of identical intervals as they are
+      if (*ps == *pl) {
+        do {
+          mi = m.insert(mi, *ps);
+          _size += ps->second;
+          ++ps;
+          ++pl;
+        } while (ps != s.m.end() && pl != l.m.end() && *ps == *pl);
+        if (ps == s.m.end())
+          break;
+        offset = ps->first;
+        continue;
+      }
+
+      // partial overlap: emit the common part of ps and pl
+      auto start = std::max<T>(ps->first, pl->first);
+      auto en = std::min<T>(ps->first + ps->second, offset);
+      ceph_assert(en > start);
+      mi = m.emplace_hint(mi, start, en - start);
+      _size += mi->second;
+      if (ps->first + ps->second <= offset) {
+        // ps is used up; otherwise keep it and continue past the end of pl
+        ++ps;
+        if (ps == s.m.end())
+          break;
+        offset = ps->first;
+      }
+    }
+  }
+
+ bool subset_size_sym(const interval_set &b) const {
+ auto pa = m.begin(), pb = b.m.begin();
+ const auto a_end = m.end(), b_end = b.m.end();
+
+ while (pa != a_end && pb != b_end) {
+ while (pb->first + pb->second <= pa->first) {
+ ++pb;
+ if (pb == b_end)
+ return false;
+ }
+
+ if (*pa == *pb) {
+ do {
+ ++pa;
+ ++pb;
+ } while (pa != a_end && pb != b_end && *pa == *pb);
+ continue;
+ }
+
+ // interval begins before other
+ if (pa->first < pb->first)
+ return false;
+ // interval is longer than other
+ if (pa->first + pa->second > pb->first + pb->second)
+ return false;
+
+ ++pa;
+ }
+
+ return pa == a_end;
+ }
+
+ public:
+ bool operator==(const interval_set& other) const {
+ return _size == other._size && m == other.m;
+ }
+
+ uint64_t size() const {
+ return _size;
+ }
+
+ void bound_encode(size_t& p) const {
+ denc_traits<Map>::bound_encode(m, p);
+ }
+ void encode(ceph::buffer::list::contiguous_appender& p) const {
+ denc(m, p);
+ }
+  // Decode the start -> len map from a contiguous buffer, then rebuild the
+  // cached _size (encode() writes only the map) by summing the lengths.
+  void decode(ceph::buffer::ptr::const_iterator& p) {
+    denc(m, p);
+    _size = 0;
+    for (const auto& [start, len] : m) {
+      _size += len;
+    }
+  }
+  // Decode the start -> len map from a buffer list iterator, then rebuild
+  // the cached _size (encode() writes only the map) by summing the lengths.
+  void decode(ceph::buffer::list::iterator& p) {
+    denc(m, p);
+    _size = 0;
+    for (const auto& [start, len] : m) {
+      _size += len;
+    }
+  }
+
+ void encode_nohead(ceph::buffer::list::contiguous_appender& p) const {
+ denc_traits<Map>::encode_nohead(m, p);
+ }
+  // Decode n map entries that were written by encode_nohead(), then rebuild
+  // the cached _size by summing the interval lengths.
+  void decode_nohead(int n, ceph::buffer::ptr::const_iterator& p) {
+    denc_traits<Map>::decode_nohead(n, m, p);
+    _size = 0;
+    for (const auto& [start, len] : m) {
+      _size += len;
+    }
+  }
+
+ void clear() {
+ m.clear();
+ _size = 0;
+ }
+
+  // Return true if the value i lies inside one of the intervals.  On success
+  // the start and length of that interval are stored through pstart and
+  // plen, when those are non-null.
+  bool contains(T i, T *pstart=0, T *plen=0) const {
+    const auto it = find_inc(i);
+    if (it == m.end() ||
+        it->first > i ||
+        it->first + it->second <= i)
+      return false;
+    ceph_assert(it->first <= i && it->first + it->second > i);
+    if (pstart)
+      *pstart = it->first;
+    if (plen)
+      *plen = it->second;
+    return true;
+  }
+  // Return true if the whole range [start, start+len) lies inside a single
+  // interval of the set.
+  bool contains(T start, T len) const {
+    const auto it = find_inc(start);
+    if (it == m.end() ||
+        it->first > start ||
+        it->first + it->second <= start)
+      return false;
+    ceph_assert(it->first <= start && it->first + it->second > start);
+    // the interval holding start must also reach the end of the range
+    return !(it->first + it->second < start + len);
+  }
+  // Return true if [start, start+len) shares at least one value with the
+  // set.  len must be greater than 0.
+  //
+  // find_inc() yields the interval containing start or, failing that, the
+  // first interval beginning after it.  The range intersects the set exactly
+  // when such an interval exists and begins before start+len.  This needs a
+  // single lookup, with no temporary sets and no full intersection pass.
+  bool intersects(T start, T len) const {
+    ceph_assert(len > 0);  // previously enforced by insert(start, len) on the temporary set
+    auto p = find_inc(start);
+    return p != m.end() && p->first < start + len;
+  }
+
+ // outer range of set
+ bool empty() const {
+ return m.empty();
+ }
+ offset_type range_start() const {
+ ceph_assert(!empty());
+ auto p = m.begin();
+ return p->first;
+ }
+ offset_type range_end() const {
+ ceph_assert(!empty());
+ auto p = m.rbegin();
+ return p->first + p->second;
+ }
+
+  // Is there an interval that starts after i?  i itself must not be in the
+  // set (asserted).
+  bool starts_after(T i) const {
+    ceph_assert(!contains(i));
+    return find_inc(i) != m.end();
+  }
+  // Return the start of the first interval after i.  i must not be in the
+  // set, and an interval after it must exist (see starts_after()); both are
+  // asserted so the end() iterator is never dereferenced.
+  offset_type start_after(T i) const {
+    ceph_assert(!contains(i));
+    auto p = find_inc(i);
+    ceph_assert(p != m.end());
+    return p->first;
+  }
+
+ // interval end that contains start
+ offset_type end_after(T start) const {
+ ceph_assert(contains(start));
+ auto p = find_inc(start);
+ return p->first+p->second;
+ }
+
+ void insert(T val) {
+ insert(val, 1);
+ }
+ /*
+ * Add the range [start, start+len) to the set.
+ *
+ * len must be greater than 0 (asserted).  The range must not overlap an
+ * existing interval: an overlap with the interval before it aborts, and
+ * anything but a clear gap or an exact touch with the interval after it
+ * trips the assert below.  A range that touches the interval before and/or
+ * after it is merged with that interval.
+ *
+ * If pstart/plen are non-null they receive the start and length of the
+ * interval that holds the range once the insert (and any merge) is done.
+ *
+ * Note that _size is increased before the overlap checks run.
+ */
+ void insert(T start, T len, T *pstart=0, T *plen=0) {
+ //cout << "insert " << start << "~" << len << endl;
+ ceph_assert(len > 0);
+ _size += len;
+ auto p = find_adj_m(start); // interval touching/containing start, else the first one after it
+ if (p == m.end()) {
+ m[start] = len; // new interval
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len;
+ } else {
+ if (p->first < start) {
+
+ if (p->first + p->second != start) { // p reaches past start: overlap
+ //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl;
+ ceph_abort();
+ }
+
+ p->second += len; // append to end
+
+ auto n = p;
+ ++n;
+ if (pstart)
+ *pstart = p->first;
+ if (n != m.end() &&
+ start+len == n->first) { // combine with next, too!
+ p->second += n->second;
+ if (plen)
+ *plen = p->second;
+ m.erase(n); // p is not used after this erase
+ } else {
+ if (plen)
+ *plen = p->second;
+ }
+ } else {
+ if (start+len == p->first) {
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len + p->second;
+ T psecond = p->second; // saved because p is erased before the new entry is written
+ m.erase(p);
+ m[start] = len + psecond; // append to front
+ } else {
+ ceph_assert(p->first > start+len);
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len;
+ m[start] = len; // new interval
+ }
+ }
+ }
+ }
+
+ void swap(interval_set& other) {
+ m.swap(other.m);
+ std::swap(_size, other._size);
+ }
+
+ void erase(const iterator &i) {
+ _size -= i.get_len();
+ m.erase(i._iter);
+ }
+
+ void erase(T val) {
+ erase(val, 1);
+ }
+ /*
+ * Remove the range [start, start+len) from the set.
+ *
+ * The range must lie entirely inside one existing interval (asserted).
+ * Whatever is left of that interval before and after the range normally
+ * stays in the set.  If claim is set, it is called with (offset, length)
+ * for each leftover piece; when it returns true that piece is removed from
+ * the set (and from _size) as well.
+ */
+ void erase(T start, T len,
+ std::function<bool(T, T)> claim = {}) {
+ auto p = find_inc_m(start);
+
+ _size -= len;
+
+ ceph_assert(p != m.end());
+ ceph_assert(p->first <= start);
+
+ T before = start - p->first; // length of the piece left in front of the range
+ ceph_assert(p->second >= before+len);
+ T after = p->second - before - len; // length of the piece left behind the range
+ if (before) {
+ if (claim && claim(p->first, before)) {
+ _size -= before;
+ m.erase(p);
+ } else {
+ p->second = before; // shorten bit before
+ }
+ } else {
+ m.erase(p);
+ }
+ if (after) {
+ if (claim && claim(start + len, after)) {
+ _size -= after;
+ } else {
+ m[start + len] = after; // keep the tail as an interval of its own
+ }
+ }
+ }
+
+ void subtract(const interval_set &a) {
+ for (const auto& [start, len] : a.m) {
+ erase(start, len);
+ }
+ }
+
+ void insert(const interval_set &a) {
+ for (const auto& [start, len] : a.m) {
+ insert(start, len);
+ }
+ }
+
+
+ void intersection_of(const interval_set &a, const interval_set &b) {
+ ceph_assert(&a != this);
+ ceph_assert(&b != this);
+ clear();
+
+ const interval_set *s, *l;
+
+ if (a.size() < b.size()) {
+ s = &a;
+ l = &b;
+ } else {
+ s = &b;
+ l = &a;
+ }
+
+ if (!s->size())
+ return;
+
+ /*
+ * Use the lower_bound algorithm for larger size ratios
+ * where it performs better, but not for smaller size
+ * ratios where sequential search performs better.
+ */
+ if (l->size() / s->size() >= 10) {
+ intersection_size_asym(*s, *l);
+ return;
+ }
+
+ auto pa = a.m.begin();
+ auto pb = b.m.begin();
+ auto mi = m.begin();
+
+ while (pa != a.m.end() && pb != b.m.end()) {
+ // passing?
+ if (pa->first + pa->second <= pb->first)
+ { pa++; continue; }
+ if (pb->first + pb->second <= pa->first)
+ { pb++; continue; }
+
+ if (*pa == *pb) {
+ do {
+ mi = m.insert(mi, *pa);
+ _size += pa->second;
+ ++pa;
+ ++pb;
+ } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb);
+ continue;
+ }
+
+ T start = std::max(pa->first, pb->first);
+ T en = std::min(pa->first+pa->second, pb->first+pb->second);
+ ceph_assert(en > start);
+ mi = m.emplace_hint(mi, start, en - start);
+ _size += mi->second;
+ if (pa->first+pa->second > pb->first+pb->second)
+ pb++;
+ else
+ pa++;
+ }
+ }
+ void intersection_of(const interval_set& b) {
+ interval_set a;
+ swap(a);
+ intersection_of(a, b);
+ }
+
+ void union_of(const interval_set &a, const interval_set &b) {
+ ceph_assert(&a != this);
+ ceph_assert(&b != this);
+ clear();
+
+ //cout << "union_of" << endl;
+
+ // a
+ m = a.m;
+ _size = a._size;
+
+ // - (a*b)
+ interval_set ab;
+ ab.intersection_of(a, b);
+ subtract(ab);
+
+ // + b
+ insert(b);
+ return;
+ }
+ void union_of(const interval_set &b) {
+ interval_set a;
+ swap(a);
+ union_of(a, b);
+ }
+ void union_insert(T off, T len) {
+ interval_set a;
+ a.insert(off, len);
+ union_of(a);
+ }
+
+ bool subset_of(const interval_set &big) const {
+ if (!size())
+ return true;
+ if (size() > big.size())
+ return false;
+ if (range_end() > big.range_end())
+ return false;
+
+ /*
+ * Use the lower_bound algorithm for larger size ratios
+ * where it performs better, but not for smaller size
+ * ratios where sequential search performs better.
+ */
+ if (big.size() / size() < 10)
+ return subset_size_sym(big);
+
+ for (const auto& [start, len] : m) {
+ if (!big.contains(start, len)) return false;
+ }
+ return true;
+ }
+
+ /*
+ * build a subset of @other, starting at or after @start, and including
+ * @len worth of values, skipping holes. e.g.,
+ * span_of([5~10,20~5], 8, 5) -> [8~2,20~3]
+ */
+ void span_of(const interval_set &other, T start, T len) {
+ clear();
+ auto p = other.find_inc(start);
+ if (p == other.m.end())
+ return;
+ if (p->first < start) {
+ if (p->first + p->second < start)
+ return;
+ if (p->first + p->second < start + len) {
+ T howmuch = p->second - (start - p->first);
+ insert(start, howmuch);
+ len -= howmuch;
+ p++;
+ } else {
+ insert(start, len);
+ return;
+ }
+ }
+ while (p != other.m.end() && len > 0) {
+ if (p->second < len) {
+ insert(p->first, p->second);
+ len -= p->second;
+ p++;
+ } else {
+ insert(p->first, len);
+ return;
+ }
+ }
+ }
+
+ /*
+ * Move contents of m into another Map. Use that instead of
+ * encoding interval_set into bufferlist then decoding it back into Map.
+ */
+ Map detach() && {
+ return std::move(m);
+ }
+
+private:
+ // data
+ uint64_t _size = 0;
+ Map m; // map start -> len
+};
+
+// declare traits explicitly because (1) it's templatized, and (2) we
+// want to include _nohead variants. Every function below forwards to the interval_set member of the same name.
+template<typename T, template<typename, typename, typename ...> class C>
+struct denc_traits<interval_set<T, C>> {
+private:
+ using container_t = interval_set<T, C>;
+public:
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = denc_traits<T, C<T,T>>::need_contiguous; // NOTE(review): C<T,T> is passed as a second template argument next to T; was denc_traits<C<T,T>> intended? confirm against denc.h
+ static void bound_encode(const container_t& v, size_t& p) {
+ v.bound_encode(p);
+ }
+ static void encode(const container_t& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ v.encode(p);
+ }
+ static void decode(container_t& v, ceph::buffer::ptr::const_iterator& p) {
+ v.decode(p);
+ }
+ template<typename U=T> // the enable_if below offers this overload only when need_contiguous is false
+ static typename std::enable_if<sizeof(U) && !need_contiguous>::type
+ decode(container_t& v, ceph::buffer::list::iterator& p) {
+ v.decode(p);
+ }
+ static void encode_nohead(const container_t& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ v.encode_nohead(p);
+ }
+ static void decode_nohead(size_t n, container_t& v,
+ ceph::buffer::ptr::const_iterator& p) {
+ v.decode_nohead(n, p); // n narrows from size_t to the int that interval_set::decode_nohead() takes
+ }
+};
+
+
+// Print the set as "[start~len,start~len,...]"; an empty set prints "[]".
+template<typename T, template<typename, typename, typename ...> class C>
+inline std::ostream& operator<<(std::ostream& out, const interval_set<T,C> &s) {
+  out << "[";
+  bool need_comma = false;
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    if (need_comma)
+      out << ",";
+    out << it.get_start() << "~" << it.get_len();
+    need_comma = true;
+  }
+  out << "]";
+  return out;
+}
+
+
+#endif
diff --git a/src/include/ipaddr.h b/src/include/ipaddr.h
new file mode 100644
index 000000000..bf06cfc93
--- /dev/null
+++ b/src/include/ipaddr.h
@@ -0,0 +1,47 @@
+#ifndef CEPH_IPADDR_H
+#define CEPH_IPADDR_H
+
+class entity_addr_t;
+
+/*
+ * Check if an IP address is in the wanted subnet.
+ */
+bool matches_ipv4_in_subnet(const struct ifaddrs& addrs,
+ const struct sockaddr_in* net,
+ unsigned int prefix_len);
+bool matches_ipv6_in_subnet(const struct ifaddrs& addrs,
+ const struct sockaddr_in6* net,
+ unsigned int prefix_len);
+
+/*
+ * Validate and parse IPv4 or IPv6 network
+ *
+ * Given a network (e.g. "192.168.0.0/24") and pointers to a sockaddr_storage
+ * struct and an unsigned int:
+ *
+ * if the network string is valid, return true and populate sockaddr_storage
+ * and prefix_len;
+ *
+ * if the network string is invalid, return false.
+ */
+bool parse_network(const char *s,
+ struct sockaddr_storage *network,
+ unsigned int *prefix_len);
+bool parse_network(const char *s,
+ entity_addr_t *network,
+ unsigned int *prefix_len);
+
+void netmask_ipv6(const struct in6_addr *addr,
+ unsigned int prefix_len,
+ struct in6_addr *out);
+
+void netmask_ipv4(const struct in_addr *addr,
+ unsigned int prefix_len,
+ struct in_addr *out);
+
+bool network_contains(
+ const struct entity_addr_t& network,
+ unsigned int prefix_len,
+ const struct entity_addr_t& addr);
+
+#endif
diff --git a/src/include/krbd.h b/src/include/krbd.h
new file mode 100644
index 000000000..977d45fe2
--- /dev/null
+++ b/src/include/krbd.h
@@ -0,0 +1,97 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_KRBD_H
+#define CEPH_KRBD_H
+
+#include "rados/librados.h"
+
+/*
+ * Don't wait for udev add uevents in krbd_map() and udev remove
+ * uevents in krbd_unmap*(). Instead, make do with the respective
+ * kernel uevents and return as soon as they are received.
+ *
+ * systemd-udevd sends out udev uevents after it finishes processing
+ * the respective kernel uevents, which mostly boils down to executing
+ * all matching udev rules. With this flag set, on return from
+ * krbd_map() systemd-udevd may still be poking at the device: it
+ * may still be open with tools such as blkid and various ioctls to
+ * be run against it, none of the persistent symlinks to the device
+ * node may be there, etc. udev used to be responsible for creating
+ * the device node as well, but that has been handled by devtmpfs in
+ * the kernel for many years now, so the device node (as returned
+ * through @pdevnode) is guaranteed to be there.
+ *
+ * If set, krbd_map() and krbd_unmap*() can be invoked from any
+ * network namespace that is owned by the initial user namespace
+ * (which is a formality because things like loading kernel modules
+ * and creating block devices are not namespaced and require global
+ * privileges, i.e. capabilities in the initial user namespace).
+ * Otherwise, krbd_map() and krbd_unmap*() must be invoked from
+ * the initial network namespace.
+ *
+ * If set, krbd_unmap*() doesn't attempt to settle the udev queue
+ * before retrying unmap for the last time. Some EBUSY errors due
+ * to systemd-udevd poking at the device at the time krbd_unmap*()
+ * is invoked that are otherwise covered by the retry logic may be
+ * returned.
+ */
+#define KRBD_CTX_F_NOUDEV (1U << 0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct krbd_ctx;
+
+int krbd_create_from_context(rados_config_t cct, uint32_t flags,
+ struct krbd_ctx **pctx);
+void krbd_destroy(struct krbd_ctx *ctx);
+
+int krbd_map(struct krbd_ctx *ctx,
+ const char *pool_name,
+ const char *nspace_name,
+ const char *image_name,
+ const char *snap_name,
+ const char *options,
+ char **pdevnode);
+int krbd_is_mapped(struct krbd_ctx *ctx,
+ const char *pool_name,
+ const char *nspace_name,
+ const char *image_name,
+ const char *snap_name,
+ char **pdevnode);
+
+int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
+ const char *options);
+int krbd_unmap_by_spec(struct krbd_ctx *ctx,
+ const char *pool_name,
+ const char *nspace_name,
+ const char *image_name,
+ const char *snap_name,
+ const char *options);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+namespace ceph {
+ class Formatter;
+}
+
+int krbd_showmapped(struct krbd_ctx *ctx, ceph::Formatter *f);
+
+#endif /* __cplusplus */
+
+#endif /* CEPH_KRBD_H */
diff --git a/src/include/libcephsqlite.h b/src/include/libcephsqlite.h
new file mode 100644
index 000000000..d81cc55e8
--- /dev/null
+++ b/src/include/libcephsqlite.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License version 2.1, as published by
+ * the Free Software Foundation. See file COPYING.
+ *
+ */
+
+#ifndef LIBCEPHSQLITE_H
+#define LIBCEPHSQLITE_H
+
+/* This loadable extension does not generally require using this header. It is
+ * here to allow controlling which version of the library is linked in. See
+ * also sqlite3_cephsqlite_init below. Additionally, you may specify which
+ * CephContext to use rather than the library instantiating its own and using
+ * whatever the default credential is.
+ */
+
+#include <sqlite3.h>
+
+#ifdef _WIN32
+# define LIBCEPHSQLITE_API __declspec(dllexport)
+#else
+# define LIBCEPHSQLITE_API [[gnu::visibility("default")]]
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* This is the SQLite entry point when loaded as a dynamic library. You also
+ * need to ensure SQLite calls this method when using libcephsqlite as a static
+ * library or a dynamic library linked at compile time. For the latter case,
+ * you can do this by:
+ *
+ * sqlite3_auto_extension((void (*)())sqlite3_cephsqlite_init);
+ * sqlite3* db = nullptr;
+ * int rc = sqlite3_open_v2(":memory:", &db, SQLITE_OPEN_READWRITE, nullptr);
+ * if (rc == SQLITE_OK) {
+ * sqlite3_close(db);
+ * } else {
+ * // failure
+ * }
+ *
+ * The throwaway database created (name == ":memory:") is a memory database
+ * opened so that SQLite runs the libcephsqlite initialization routine to
+ * register the VFS. After that's done, the VFS is available for a future
+ * database open with the VFS set to "ceph":
+ *
+ * sqlite3_open_v2("foo:bar/baz.db", &db, SQLITE_OPEN_READWRITE, "ceph");
+ *
+ * You MUST do this before calling any other libcephsqlite routine so that
+ * sqlite3 can pass its API routines to the libcephsqlite extension.
+ */
+
+LIBCEPHSQLITE_API int sqlite3_cephsqlite_init(sqlite3* db, char** err, const sqlite3_api_routines* api);
+
+/* If you prefer to have libcephsqlite use a CephContext managed by your
+ * application, use this routine to set that. libcephsqlite can only have one
+ * context globally.
+ */
+
+LIBCEPHSQLITE_API int cephsqlite_setcct(class CephContext* cct, char** ident);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/linux_fiemap.h b/src/include/linux_fiemap.h
new file mode 100644
index 000000000..36046b5cc
--- /dev/null
+++ b/src/include/linux_fiemap.h
@@ -0,0 +1,73 @@
+/*
+ * FS_IOC_FIEMAP ioctl infrastructure.
+ *
+ * Some portions copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Authors: Mark Fasheh <mfasheh@suse.com>
+ * Kalpak Shah <kalpak.shah@sun.com>
+ * Andreas Dilger <adilger@sun.com>
+ */
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__) /* was spelled "__FreeBSD_", which is never defined, so this branch was dead */
+#include <sys/types.h>
+#endif
+
+#include "include/int_types.h"
+
+struct fiemap_extent {
+ __u64 fe_logical; /* logical offset in bytes for the start of
+ * the extent from the beginning of the file */
+ __u64 fe_physical; /* physical offset in bytes for the start
+ * of the extent from the beginning of the disk */
+ __u64 fe_length; /* length in bytes for this extent */
+ __u64 fe_reserved64[2];
+ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
+ __u32 fe_reserved[3];
+};
+
+struct fiemap {
+ __u64 fm_start; /* logical offset (inclusive) at
+ * which to start mapping (in) */
+ __u64 fm_length; /* logical length of mapping which
+ * userspace wants (in) */
+ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
+ __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+ __u32 fm_extent_count; /* size of fm_extents array (in) */
+ __u32 fm_reserved;
+ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET (~0ULL)
+
+#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
+
+#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
+ * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read
+ * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs.
+ * Sets EXTENT_NO_BYPASS. */
+#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be
+ * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but
+ * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively
+ * support extents. Result
+ * merged for efficiency. */
+#define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other
+ * files. */
+
+#endif /* _LINUX_FIEMAP_H */
diff --git a/src/include/lru.h b/src/include/lru.h
new file mode 100644
index 000000000..3f5069ee3
--- /dev/null
+++ b/src/include/lru.h
@@ -0,0 +1,241 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_LRU_H
+#define CEPH_LRU_H
+
+#include <math.h>
+#include <stdint.h>
+
+#include "common/config.h"
+#include "xlist.h"
+
+class LRUObject { // base class for objects tracked by an LRU; carries its own list link and pin flag
+public:
+ LRUObject() : lru_link(this) {} // starts unpinned and not attached to any LRU
+ virtual ~LRUObject(); // detaches from the owning LRU, if any (defined after class LRU)
+
+ // pin/unpin item in cache (both also keep the owning LRU's num_pinned in sync)
+ void lru_pin();
+ void lru_unpin();
+ bool lru_is_expireable() const { return !lru_pinned; } // true unless the object is pinned
+
+ friend class LRU; // LRU reads and writes lru, lru_link and lru_pinned directly
+private:
+ class LRU *lru{}; // LRU this object is currently inserted in; null when in none
+ xlist<LRUObject *>::item lru_link; // link used to place this object on one of the LRU's lists
+ bool lru_pinned = false;
+};
+
+class LRU { // LRU of LRUObjects kept on three lists: top, bottom and pintail
+public:
+ uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); } // items on all three lists
+ uint64_t lru_get_top() const { return top.size(); }
+ uint64_t lru_get_bot() const{ return bottom.size(); }
+ uint64_t lru_get_pintail() const { return pintail.size(); }
+ uint64_t lru_get_num_pinned() const { return num_pinned; } // pinned items, whichever list they are on
+
+ void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); } // clamps f to [0, 1]; used by the next adjust()
+
+ void lru_clear() { // detach every item from every list; the objects themselves are not destroyed
+ while (!top.empty()) {
+ lru_remove(top.front());
+ }
+ while (!bottom.empty()) {
+ lru_remove(bottom.front());
+ }
+ while (!pintail.empty()) {
+ lru_remove(pintail.front());
+ }
+ ceph_assert(num_pinned == 0); // lru_remove() decremented the count for each pinned item
+ }
+
+ // insert at top of lru (o must not belong to any LRU yet; asserted)
+ void lru_insert_top(LRUObject *o) {
+ ceph_assert(!o->lru);
+ o->lru = this;
+ top.push_front(&o->lru_link);
+ if (o->lru_pinned) num_pinned++; // an object may already be pinned when it is inserted
+ adjust();
+ }
+
+ // insert at mid point in lru (i.e. at the head of the bottom list)
+ void lru_insert_mid(LRUObject *o) {
+ ceph_assert(!o->lru);
+ o->lru = this;
+ bottom.push_front(&o->lru_link);
+ if (o->lru_pinned) num_pinned++;
+ adjust();
+ }
+
+ // insert at bottom of lru (i.e. at the tail of the bottom list)
+ void lru_insert_bot(LRUObject *o) {
+ ceph_assert(!o->lru);
+ o->lru = this;
+ bottom.push_back(&o->lru_link);
+ if (o->lru_pinned) num_pinned++;
+ adjust();
+ }
+
+ // remove an item; does nothing for an item that is in no LRU. Always returns o.
+ LRUObject *lru_remove(LRUObject *o) {
+ if (!o->lru) return o;
+ auto list = o->lru_link.get_list();
+ ceph_assert(list == &top || list == &bottom || list == &pintail); // must be on one of *our* lists
+ o->lru_link.remove_myself();
+ if (o->lru_pinned) num_pinned--;
+ o->lru = nullptr;
+ adjust();
+ return o;
+ }
+
+ // touch item -- move to head of lru (inserting it first if it is in no LRU); always returns true
+ bool lru_touch(LRUObject *o) {
+ if (!o->lru) {
+ lru_insert_top(o);
+ } else {
+ ceph_assert(o->lru == this);
+ auto list = o->lru_link.get_list();
+ ceph_assert(list == &top || list == &bottom || list == &pintail);
+ top.push_front(&o->lru_link); // NOTE(review): relies on xlist push_* unlinking the item from its current list first - confirm in xlist.h
+ adjust();
+ }
+ return true;
+ }
+
+ // touch item -- move to midpoint (unless already higher); returns false, changing nothing, if the item is in top
+ bool lru_midtouch(LRUObject *o) {
+ if (!o->lru) {
+ lru_insert_mid(o);
+ } else {
+ ceph_assert(o->lru == this);
+ auto list = o->lru_link.get_list();
+ ceph_assert(list == &top || list == &bottom || list == &pintail);
+ if (list == &top) return false;
+ bottom.push_front(&o->lru_link);
+ adjust();
+ }
+ return true;
+ }
+
+ // touch item -- move to bottom (tail of the bottom list); always returns true
+ bool lru_bottouch(LRUObject *o) {
+ if (!o->lru) {
+ lru_insert_bot(o);
+ } else {
+ ceph_assert(o->lru == this);
+ auto list = o->lru_link.get_list();
+ ceph_assert(list == &top || list == &bottom || list == &pintail);
+ bottom.push_back(&o->lru_link);
+ adjust();
+ }
+ return true;
+ }
+
+ void lru_touch_entire_pintail() {
+ // promote entire pintail to the top lru
+ while (pintail.size() > 0) { // terminates only because each push_back below takes one item off pintail
+ top.push_back(&pintail.front()->lru_link);
+ adjust();
+ }
+ }
+
+ // expire -- expire a single item (lru_get_next_expire only selects the candidate; lru_expire also removes it)
+ LRUObject *lru_get_next_expire() {
+ adjust();
+ // look through tail of bot
+ while (bottom.size()) {
+ LRUObject *p = bottom.back();
+ if (!p->lru_pinned) return p;
+
+ // move to pintail (pinned items are set aside so the scan can continue)
+ pintail.push_front(&p->lru_link);
+ }
+
+ // ok, try head then
+ while (top.size()) {
+ LRUObject *p = top.back();
+ if (!p->lru_pinned) return p;
+
+ // move to pintail
+ pintail.push_front(&p->lru_link);
+ }
+
+ // no luck! (top and bottom are empty now: every item was pinned, or there were none)
+ return NULL;
+ }
+
+ LRUObject *lru_expire() { // detach and return the next expirable item, or NULL when there is none
+ LRUObject *p = lru_get_next_expire();
+ if (p)
+ return lru_remove(p);
+ return NULL;
+ }
+
+ void lru_status() { // currently a no-op: its only statement is commented out
+ //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl;
+ }
+
+protected:
+ // adjust top/bot balance, as necessary (moves items across the top/bottom boundary until top holds topwant items)
+ void adjust() {
+ uint64_t toplen = top.size();
+ uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned)); // midpoint fraction of the unpinned items, truncated
+ /* move items from below midpoint (bottom) to top: move midpoint forward */
+ for (uint64_t i = toplen; i < topwant; i++) {
+ top.push_back(&bottom.front()->lru_link);
+ }
+ /* or: move items from above midpoint (top) to bottom: move midpoint backwards */
+ for (uint64_t i = toplen; i > topwant; i--) {
+ bottom.push_front(&top.back()->lru_link);
+ }
+ }
+
+ uint64_t num_pinned = 0; // pinned items currently inserted in this LRU
+ double midpoint = 0.6; // default: top is sized to 60% of the unpinned items
+
+ friend class LRUObject; // lru_pin()/lru_unpin() update num_pinned and look at pintail directly
+private:
+ using LRUList = xlist<LRUObject*>;
+ LRUList top, bottom, pintail;
+};
+
+// Destructor: a dying object must not stay linked into an LRU list.
+inline LRUObject::~LRUObject() {
+  if (lru == nullptr) return;   // not inserted anywhere, nothing to undo
+  lru->lru_remove(this);
+}
+
+// Pin this object; the owning LRU's pinned count grows only on the unpinned -> pinned transition.
+inline void LRUObject::lru_pin() {
+  const bool tracked = (lru != nullptr);
+  if (tracked && !lru_pinned) ++lru->num_pinned;
+  lru_pinned = true;
+}
+
+// Drop the pin: fix up the owning LRU's pinned count and, if the object had
+// been set aside on the pintail, hand it back to the bottom list.
+inline void LRUObject::lru_unpin() {
+  const bool was_counted = (lru != nullptr) && lru_pinned;
+  if (was_counted) {
+    lru->num_pinned -= 1;
+    const bool parked = (lru_link.get_list() == &lru->pintail);
+    if (parked) lru->lru_bottouch(this);  // pintail -> bot
+  }
+  lru_pinned = false;
+}
+
+#endif
diff --git a/src/include/mempool.h b/src/include/mempool.h
new file mode 100644
index 000000000..fe84f3b8f
--- /dev/null
+++ b/src/include/mempool.h
@@ -0,0 +1,548 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef _CEPH_INCLUDE_MEMPOOL_H
+#define _CEPH_INCLUDE_MEMPOOL_H
+
+#include <cstddef>
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <list>
+#include <mutex>
+#include <typeinfo>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+
+#include "common/Formatter.h"
+#include "common/ceph_atomic.h"
+#include "include/ceph_assert.h"
+#include "include/compact_map.h"
+#include "include/compact_set.h"
+#include "include/compat.h"
+
+
+/*
+
+Memory Pools
+============
+
+A memory pool is a method for accounting the consumption of memory of
+a set of containers.
+
+Memory pools are statically declared (see pool_index_t).
+
+Each memory pool tracks the number of bytes and items it contains.
+
+Allocators can be declared and associated with a type so that they are
+tracked independently of the pool total. This additional accounting
+is optional and only incurs an overhead if the debugging is enabled at
+runtime. This allows developers to see what types are consuming the
+pool resources.
+
+
+Declaring
+---------
+
+Using memory pools is very easy.
+
+To create a new memory pool, simply add a new name into the list of
+memory pools that's defined in "DEFINE_MEMORY_POOLS_HELPER". That's
+it. :)
+
+For each memory pool that's created a C++ namespace is also
+automatically created (name is same as in DEFINE_MEMORY_POOLS_HELPER).
+That namespace contains a set of common STL containers that are predefined
+with the appropriate allocators.
+
+Thus for mempool "osd" we have automatically available to us:
+
+ mempool::osd::map
+ mempool::osd::multimap
+ mempool::osd::set
+ mempool::osd::multiset
+ mempool::osd::list
+ mempool::osd::vector
+ mempool::osd::unordered_map
+
+
+Putting objects in a mempool
+----------------------------
+
+In order to use a memory pool with a particular type, a few additional
+declarations are needed.
+
+For a class:
+
+ struct Foo {
+ MEMPOOL_CLASS_HELPERS();
+ ...
+ };
+
+Then, in an appropriate .cc file,
+
+ MEMPOOL_DEFINE_OBJECT_FACTORY(Foo, foo, osd);
+
+The second argument can generally be identical to the first, except
+when the type contains a nested scope. For example, for
+BlueStore::Onode, we need to do
+
+ MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+ bluestore_meta);
+
+(This is just because we need to name some static variables and we
+can't use :: in a variable name.)
+
+XXX Note: the new operator hard-codes the allocation size to the size of the
+object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot
+incorporate mempools into a base class without also defining a helper/factory
+for the child class as well (as the base class is usually smaller than the
+child class).
+
+In order to use the STL containers, simply use the namespaced variant
+of the container type. For example,
+
+ mempool::osd::vector<int> myvec;
+
+Introspection
+-------------
+
+The simplest way to interrogate the process is with
+
+ Formatter *f = ...
+ mempool::dump(f);
+
+This will dump information about *all* memory pools. When debug mode
+is enabled, the runtime complexity of dump is O(num_shards *
+num_types). When debug mode is disabled it is O(num_shards).
+
+You can also interrogate a specific pool programmatically with
+
+ size_t bytes = mempool::unittest_2::allocated_bytes();
+ size_t items = mempool::unittest_2::allocated_items();
+
+The runtime complexity is O(num_shards).
+
+Note that you cannot easily query per-type, primarily because debug
+mode is optional and you should not rely on that information being
+available.
+
+*/
+
+namespace mempool {
+
+// --------------------------------------------------------------
+// define memory pools: DEFINE_MEMORY_POOLS_HELPER(f) applies the macro f to every pool name, in the order listed
+
+#define DEFINE_MEMORY_POOLS_HELPER(f) \
+ f(bloom_filter) \
+ f(bluestore_alloc) \
+ f(bluestore_cache_data) \
+ f(bluestore_cache_onode) \
+ f(bluestore_cache_meta) \
+ f(bluestore_cache_other) \
+ f(bluestore_Buffer) \
+ f(bluestore_Extent) \
+ f(bluestore_Blob) \
+ f(bluestore_SharedBlob) \
+ f(bluestore_inline_bl) \
+ f(bluestore_fsck) \
+ f(bluestore_txc) \
+ f(bluestore_writing_deferred) \
+ f(bluestore_writing) \
+ f(bluefs) \
+ f(bluefs_file_reader) \
+ f(bluefs_file_writer) \
+ f(buffer_anon) \
+ f(buffer_meta) \
+ f(osd) \
+ f(osd_mapbl) \
+ f(osd_pglog) \
+ f(osdmap) \
+ f(osdmap_mapping) \
+ f(pgmap) \
+ f(mds_co) \
+ f(unittest_1) \
+ f(unittest_2)
+
+
+// give them integer ids: each pool <name> becomes the enumerator mempool_<name>, numbered from 0 in list order
+#define P(x) mempool_##x,
+enum pool_index_t {
+ DEFINE_MEMORY_POOLS_HELPER(P)
+ num_pools // Must be last. (its value is then the number of pools)
+};
+#undef P
+
+extern bool debug_mode;
+extern void set_debug_mode(bool d);
+
+// --------------------------------------------------------------
+class pool_t;
+
+// we shard pool stats across many shard_t's to reduce the amount
+// of cacheline ping pong.
+enum {
+ num_shard_bits = 5 // log2 of the number of shards
+};
+enum {
+ num_shards = 1 << num_shard_bits // 32 shards in each pool_t
+};
+
+// align shard to a cacheline (128 bytes is used as the line size here)
+struct shard_t {
+ ceph::atomic<size_t> bytes = {0}; // unsigned: one shard can wrap below zero when memory is freed on a different shard than it was allocated on; presumably only the sum over all shards is meaningful - confirm
+ ceph::atomic<size_t> items = {0};
+ char __padding[128 - sizeof(ceph::atomic<size_t>)*2]; // pads the two counters out to 128 bytes
+} __attribute__ ((aligned (128)));
+
+static_assert(sizeof(shard_t) == 128, "shard_t should be cacheline-sized");
+
+// Item/byte totals, either for a whole pool or for one type within it.
+struct stats_t {
+  ssize_t items = 0;
+  ssize_t bytes = 0;
+  // Emit both counters as integer fields named "items" and "bytes".
+  void dump(ceph::Formatter *f) const {
+    f->dump_int("items", this->items);
+    f->dump_int("bytes", this->bytes);
+  }
+  // Field-by-field accumulation of rhs into *this.
+  stats_t& operator+=(const stats_t& rhs) {
+    this->items += rhs.items; this->bytes += rhs.bytes; return *this;
+  }
+};
+
+pool_t& get_pool(pool_index_t ix);
+const char *get_pool_name(pool_index_t ix);
+
+struct type_t { // per-type accounting record stored in pool_t::type_map
+ const char *type_name; // set from std::type_info::name() in pool_t::get_type()
+ size_t item_size; // the size passed to pool_t::get_type() (pool_allocator passes sizeof(T))
+ ceph::atomic<ssize_t> items = {0}; // signed
+};
+
+struct type_info_hash { // hash functor taking a std::type_info
+ std::size_t operator()(const std::type_info& k) const {
+ return k.hash_code(); // simply forwards to the standard per-type hash
+ }
+};
+
+// One memory pool: sharded byte/item counters plus a per-type table.
+class pool_t {
+  shard_t shard[num_shards];
+
+  mutable std::mutex lock;  // guards type_map only
+  std::unordered_map<const char *, type_t> type_map;
+
+public:
+  // Totals consumed by this pool; O(<num_shards>).
+  size_t allocated_bytes() const;
+  size_t allocated_items() const;
+
+  void adjust_count(ssize_t items, ssize_t bytes);
+
+  // Shard index for the calling thread, derived from pthread_self(). Dirt cheap, see:
+  // https://fossies.org/dox/glibc-2.32/pthread__self_8c_source.html
+  static size_t pick_a_shard_int() {
+    const size_t shard_mask = (1 << num_shard_bits) - 1;
+    const size_t self = (size_t)pthread_self();
+    return (self >> CEPH_PAGE_SHIFT) & shard_mask;
+  }
+
+  // Counter shard the calling thread should update.
+  shard_t* pick_a_shard() {
+    return &shard[pick_a_shard_int()];
+  }
+
+  // Return the record keyed by ti.name(), creating and filling it in on first use.
+  type_t *get_type(const std::type_info& ti, size_t size) {
+    std::lock_guard<std::mutex> guard(lock);
+    const auto known = type_map.find(ti.name());
+    if (known == type_map.end()) {
+      type_t &fresh = type_map[ti.name()];
+      fresh.type_name = ti.name();
+      fresh.item_size = size;
+      return &fresh;
+    }
+    return &known->second;
+  }
+
+  // get pool stats. by_type is not populated if !debug
+  void get_stats(stats_t *total,
+                 std::map<std::string, stats_t> *by_type) const;
+
+  void dump(ceph::Formatter *f, stats_t *ptotal=0) const;
+};
+
+void dump(ceph::Formatter *f);
+
+
+// STL allocator for use with containers. All actual state
+// is stored in the statically declared pool_t (see get_pool()), which saves us from
+// passing the allocator to container constructors.
+
+// Allocator that charges every (de)allocation to memory pool `pool_ix`.
+template<pool_index_t pool_ix, typename T>
+class pool_allocator {
+  pool_t *pool;            // pool charged by this allocator; set by init()
+  type_t *type = nullptr;  // per-type record; null unless init() registered T
+
+  // Byte size of n items; throws instead of letting sizeof(T) * n wrap around.
+  static size_t bytes_for(size_t n) {
+    if (n > static_cast<size_t>(-1) / sizeof(T))
+      throw std::bad_alloc();
+    return sizeof(T) * n;
+  }
+
+  // Add n items / total bytes to the calling thread's shard and to the type record.
+  void charge(size_t n, size_t total) {
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes += total;
+    shard->items += n;
+    if (type) type->items += n;
+  }
+
+  // Inverse of charge(); shared by both deallocation paths.
+  void credit(size_t n, size_t total) {
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes -= total;
+    shard->items -= n;
+    if (type) type->items -= n;
+  }
+
+public:
+  typedef pool_allocator<pool_ix, T> allocator_type;
+  typedef T value_type;
+  typedef value_type *pointer;
+  typedef const value_type * const_pointer;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+
+  template<typename U> struct rebind {
+    typedef pool_allocator<pool_ix,U> other;
+  };
+
+  // Bind to the pool; look up T's type record only in debug mode or when forced.
+  void init(bool force_register) {
+    pool = &get_pool(pool_ix);
+    if (debug_mode || force_register) {
+      type = pool->get_type(typeid(T), sizeof(T));
+    }
+  }
+
+  pool_allocator(bool force_register=false) {
+    init(force_register);
+  }
+  template<typename U>
+  pool_allocator(const pool_allocator<pool_ix,U>&) {
+    init(false);
+  }
+
+  // Allocate raw storage for n items (p is an unused hint).  The pool is charged
+  // only after new[] succeeded, so a failed allocation leaves the stats untouched.
+  T* allocate(size_t n, void *p = nullptr) {
+    const size_t total = bytes_for(n);
+    T* r = reinterpret_cast<T*>(new char[total]);
+    charge(n, total);
+    return r;
+  }
+
+  // Release storage obtained from allocate(n) and credit the pool.
+  void deallocate(T* p, size_t n) {
+    credit(n, sizeof(T) * n);
+    delete[] reinterpret_cast<char*>(p);
+  }
+
+  // Like allocate(), but aligned to `align` through posix_memalign(); throws
+  // std::bad_alloc on failure, again without charging the pool.
+  T* allocate_aligned(size_t n, size_t align, void *p = nullptr) {
+    const size_t total = bytes_for(n);
+    char *ptr;
+    int rc = ::posix_memalign((void**)(void*)&ptr, align, total);
+    if (rc)
+      throw std::bad_alloc();
+    charge(n, total);
+    return reinterpret_cast<T*>(ptr);
+  }
+
+  // Counterpart of allocate_aligned(): frees through aligned_free().
+  void deallocate_aligned(T* p, size_t n) {
+    credit(n, sizeof(T) * n);
+    aligned_free(p);
+  }
+
+  void destroy(T* p) { p->~T(); }
+  template<class U> void destroy(U *p) { p->~U(); }
+
+  void construct(T* p, const T& val) { ::new ((void *)p) T(val); }
+  template<class U, class... Args> void construct(U* p,Args&&... args) {
+    ::new((void *)p) U(std::forward<Args>(args)...);
+  }
+
+  // Any two instances compare equal: storage from one may be released by another.
+  bool operator==(const pool_allocator&) const { return true; }
+  bool operator!=(const pool_allocator&) const { return false; }
+};
+
+
+// Namespace mempool: P(x) opens namespace x holding the pool id, container aliases bound to that pool's allocator, and allocated_bytes()/allocated_items() helpers
+
+#define P(x) \
+ namespace x { \
+ static const mempool::pool_index_t id = mempool::mempool_##x; \
+ template<typename v> \
+ using pool_allocator = mempool::pool_allocator<id,v>; \
+ \
+ using string = std::basic_string<char,std::char_traits<char>, \
+ pool_allocator<char>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using map = std::map<k, v, cmp, \
+ pool_allocator<std::pair<const k,v>>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using compact_map = compact_map<k, v, cmp, \
+ pool_allocator<std::pair<const k,v>>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using compact_multimap = compact_multimap<k, v, cmp, \
+ pool_allocator<std::pair<const k,v>>>; \
+ \
+ template<typename k, typename cmp = std::less<k> > \
+ using compact_set = compact_set<k, cmp, pool_allocator<k>>; \
+ \
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using multimap = std::multimap<k,v,cmp, \
+ pool_allocator<std::pair<const k, \
+ v>>>; \
+ \
+ template<typename k, typename cmp = std::less<k> > \
+ using set = std::set<k,cmp,pool_allocator<k>>; \
+ \
+ template<typename k, typename cmp = std::less<k> > \
+ using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \
+ \
+ template<typename k, typename v, typename cmp = std::less<k> > \
+ using flat_map = boost::container::flat_map<k,v,cmp, \
+ pool_allocator<std::pair<k,v>>>; \
+ \
+ template<typename v> \
+ using list = std::list<v,pool_allocator<v>>; \
+ \
+ template<typename v> \
+ using vector = std::vector<v,pool_allocator<v>>; \
+ \
+ template<typename k, typename v, \
+ typename h=std::hash<k>, \
+ typename eq = std::equal_to<k>> \
+ using unordered_map = \
+ std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\
+ \
+ inline size_t allocated_bytes() { \
+ return mempool::get_pool(id).allocated_bytes(); \
+ } \
+ inline size_t allocated_items() { \
+ return mempool::get_pool(id).allocated_items(); \
+ } \
+ };
+
+DEFINE_MEMORY_POOLS_HELPER(P)
+
+#undef P
+
+};
+
+// the elements allocated by mempool are in the same memory space as the ones
+// allocated by the default allocator, so compare them in an efficient way:
+// libstdc++'s std::equal is specialized to use memcmp if T is an integer or
+// pointer type. this is good enough for our use case. use
+// std::is_trivially_copyable<T> to expand the support to more types if
+// necessary.
+template<typename T, mempool::pool_index_t pool_index>
+bool operator==(const std::vector<T, std::allocator<T>>& lhs,
+                const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
+{
+  if (lhs.size() != rhs.size()) return false;              // different lengths never match
+  return std::equal(lhs.begin(), lhs.end(), rhs.begin());  // element-wise comparison
+}
+
+template<typename T, mempool::pool_index_t pool_index>
+bool operator!=(const std::vector<T, std::allocator<T>>& lhs,
+                const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
+{ // plain negation of the mixed-allocator operator==
+  return (lhs == rhs) ? false : true;
+}
+
+template<typename T, mempool::pool_index_t pool_index>
+bool operator==(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
+                const std::vector<T, std::allocator<T>>& rhs)
+{ // operands swapped: same size check, then element-wise compare starting from rhs
+  return rhs.size() == lhs.size() && std::equal(rhs.begin(), rhs.end(), lhs.begin());
+}
+
+template<typename T, mempool::pool_index_t pool_index>
+bool operator!=(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
+                const std::vector<T, std::allocator<T>>& rhs)
+{ // true exactly when the mixed-allocator operator== reports a mismatch
+  return (lhs == rhs) == false;
+}
+
+// Use this for any type that is contained by a container (unless it
+// is a class you defined; see below). Declares the extern allocator mempool::<pool>::alloc_<factoryname> for obj.
+#define MEMPOOL_DECLARE_FACTORY(obj, factoryname, pool) \
+ namespace mempool { \
+ namespace pool { \
+ extern pool_allocator<obj> alloc_##factoryname; \
+ } \
+ }
+
+#define MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) /* defines mempool::<pool>::alloc_<factoryname>; {true} is the force_register constructor argument */ \
+ namespace mempool { \
+ namespace pool { \
+ pool_allocator<obj> alloc_##factoryname = {true}; \
+ } \
+ }
+
+// Use this for each class that belongs to a mempool. For example,
+//
+// class T {
+// MEMPOOL_CLASS_HELPERS();
+// ...
+// };
+// It declares class-scope operator new/delete (defined by MEMPOOL_DEFINE_OBJECT_FACTORY); the array forms abort.
+#define MEMPOOL_CLASS_HELPERS() \
+ void *operator new(size_t size); \
+ void *operator new[](size_t size) noexcept { \
+ ceph_abort_msg("no array new"); \
+ return nullptr; } \
+ void operator delete(void *); \
+ void operator delete[](void *) { ceph_abort_msg("no array delete"); }
+
+
+// Use this in some particular .cc file to match each class with a
+// MEMPOOL_CLASS_HELPERS(). Note: operator new ignores its size argument and always allocates exactly one obj.
+#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool) \
+ MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \
+ void *obj::operator new(size_t size) { \
+ return mempool::pool::alloc_##factoryname.allocate(1); \
+ } \
+ void obj::operator delete(void *p) { \
+ return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1); \
+ }
+
+#endif
diff --git a/src/include/msgr.h b/src/include/msgr.h
new file mode 100644
index 000000000..eedb95dd0
--- /dev/null
+++ b/src/include/msgr.h
@@ -0,0 +1,247 @@
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+#ifndef __KERNEL__
+#include <sys/socket.h> // for struct sockaddr_storage
+#endif
+
+#include "include/int_types.h"
+
+/* See comment in ceph_fs.h. */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT_LEGACY 6789 /* legacy default monitor port */
+#define CEPH_MON_PORT_IANA 3300 /* IANA monitor port */
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+
+
+/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2_PREFIX "ceph v2\n"
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+
+#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) /* defines CEPH_MSGR2_FEATURE_<name> (1 << bit) and CEPH_MSGR2_FEATUREMASK_<name> (that bit OR'ed with the incarnation mask); bit is parenthesized so an expression argument cannot change precedence */ \
+	const static uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << (bit)); \
+	const static uint64_t CEPH_MSGR2_FEATUREMASK_##name = \
+			((1ULL << (bit)) | CEPH_MSGR2_INCARNATION_##incarnation);
+
+#define HAVE_MSGR2_FEATURE(x, name) \
+ (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) /* nonzero iff every bit of the feature's mask is set in x */
+
+DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1
+
+#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+#define CEPH_MSGR2_REQUIRED_FEATURES (0ull)
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns a negative value, zero, or a positive value.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{	/* result is <0, 0 or >0 as a is behind, equal to or ahead of b, modulo 2^32 */
+	return (__s32)(a - b);	/* subtract as unsigned (wraps, well defined), then reinterpret: avoids signed-overflow UB */
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+ __u8 type; /* CEPH_ENTITY_TYPE_* */
+ __le64 num; /* presumably the instance number, i.e. the N in names like 'mds0' or 'osd3' - confirm */
+} __attribute__ ((packed)); /* packed: no padding between type and num */
+
+#define CEPH_ENTITY_TYPE_MON 0x01
+#define CEPH_ENTITY_TYPE_MDS 0x02
+#define CEPH_ENTITY_TYPE_OSD 0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_MGR 0x10
+#define CEPH_ENTITY_TYPE_AUTH 0x20
+
+#define CEPH_ENTITY_TYPE_ANY 0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+ __le32 type; /* address type; its values are not defined in this part of the header */
+ __le32 nonce; /* unique id for process (e.g. pid) */
+ struct sockaddr_storage in_addr; /* socket address (type from <sys/socket.h>, included above for non-kernel builds) */
+} __attribute__ ((packed));
+
+struct ceph_entity_inst { /* pairs an entity name with an entity address */
+ struct ceph_entity_name name;
+ struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 7 /* message */
+#define CEPH_MSGR_TAG_ACK 8 /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
+#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2 14 /* presumably the keepalive variant that is answered with KEEPALIVE2_ACK - confirm */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive reply */
+#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* ceph v2 doing server challenge */
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+ __le64 features; /* supported feature bits */
+ __le32 host_type; /* CEPH_ENTITY_TYPE_* */
+ __le32 global_seq; /* count connections initiated by this host */
+ __le32 connect_seq; /* count connections initiated in this session */
+ __le32 protocol_version;
+ __le32 authorizer_protocol;
+ __le32 authorizer_len; /* presumably the byte length of authorizer data sent after this struct - confirm */
+ __u8 flags; /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed)); /* packed: members are laid out back to back with no padding */
+
+struct ceph_msg_connect_reply {
+ __u8 tag; /* presumably one of the CEPH_MSGR_TAG_* values above - confirm */
+ __le64 features; /* feature bits for this session */
+ __le32 global_seq;
+ __le32 connect_seq;
+ __le32 protocol_version;
+ __le32 authorizer_len;
+ __u8 flags; /* presumably CEPH_MSG_CONNECT_*, as in ceph_msg_connect - confirm */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old { /* differs from ceph_msg_header below in carrying src/orig_src entity instances and a 32-bit reserved field */
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_inst src, orig_src;
+ __le32 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority (see CEPH_MSG_PRIO_* below) */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_name src; /* name only; the old header carried full ceph_entity_inst values here */
+
+ /* oldest code we think can decode this. unknown if zero. */
+ __le16 compat_version;
+ __le16 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header2 { /* compared with ceph_msg_header: no payload lengths, src or crc; adds data_pre_padding_len, ack_seq and flags */
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 data_pre_padding_len;
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ __le64 ack_seq; /* presumably the highest received seq being acknowledged - confirm */
+ __u8 flags;
+ /* oldest code we think can decode this. unknown if zero. */
+ __le16 compat_version;
+ __le16 reserved;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ * ceph_msg_footer_old does not support digital signatures on messages PLR
+ */
+
+struct ceph_msg_footer_old {
+ __le32 front_crc, middle_crc, data_crc; /* presumably one crc per payload section (front/middle/data, cf. the *_len header fields) - confirm */
+ __u8 flags; /* presumably CEPH_MSG_FOOTER_* bits (defined below) - confirm */
+} __attribute__ ((packed));
+
+struct ceph_msg_footer { /* ceph_msg_footer_old plus the 64-bit signature field */
+ __le32 front_crc, middle_crc, data_crc;
+ // sig holds the 64 bits of the digital signature for the message PLR
+ __le64 sig;
+ __u8 flags; /* presumably CEPH_MSG_FOOTER_* bits (defined below) - confirm */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/neorados/RADOS.hpp b/src/include/neorados/RADOS.hpp
new file mode 100644
index 000000000..244442bcf
--- /dev/null
+++ b/src/include/neorados/RADOS.hpp
@@ -0,0 +1,1152 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef NEORADOS_RADOS_HPP
+#define NEORADOS_RADOS_HPP
+
+#include <cstddef>
+#include <memory>
+#include <tuple>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <variant>
+
+#include <boost/asio.hpp>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/uuid/uuid.hpp>
+
+#include <boost/system/error_code.hpp>
+
+// Backport of std::expected, which was standardized in C++23 (not C++20).
+
+#include "include/expected.hpp"
+
+// Had better be in C++20. Why is this not in Boost?
+
+#include "include/function2.hpp"
+
+// Things broken out so we can decode them in Objecter.
+
+#include "include/neorados/RADOS_Decodable.hpp"
+
+// Needed for type erasure and template support. We can't really avoid
+// it.
+
+#include "common/async/completion.h"
+
+// These are needed for RGW, but in general as a 'shiny new interface'
+// we should try to use forward declarations and provide standard alternatives.
+
+#include "include/common_fwd.h"
+
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+
+#include "common/ceph_time.h"
+
+// Early declarations: Object and IOContext name their std::hash
+// specializations as friends, so both the classes and the
+// specializations have to be declared before the class definitions.
+namespace neorados {
+// Makes the `sv` literal suffix usable inside this namespace.
+using namespace std::literals;
+
+class IOContext;
+class Object;
+} // namespace neorados
+
+namespace std {
+template<> struct hash<neorados::IOContext>;
+template<> struct hash<neorados::Object>;
+} // namespace std
+
+namespace neorados {
+// Implementation type, left incomplete in this header; RADOS is
+// constructed from a std::unique_ptr to it.
+namespace detail { class Client; }
+
+// Declared ahead of its definition so the classes below can befriend it.
+class RADOS;
+
+// Handle naming an object. It exists mostly so that repeated operations
+// on the same object don't each have to pay for the string copy needed
+// to construct an object_t.
+class Object final {
+  friend RADOS;
+  friend std::hash<Object>;
+
+public:
+  Object();
+  ~Object();
+
+  // Converting constructors from string-like values (deliberately not
+  // marked explicit in this interface).
+  Object(const char* s);
+  Object(std::string_view s);
+  Object(const std::string& s);
+  Object(std::string&& s);
+
+  // Copy and move support.
+  Object(const Object& o);
+  Object(Object&& o);
+  Object& operator=(const Object& o);
+  Object& operator=(Object&& o);
+
+  // NOTE(review): presumably a view of the stored name whose lifetime is
+  // bound to this Object -- confirm in the implementation.
+  operator std::string_view() const;
+
+  friend std::ostream& operator<<(std::ostream& m, const Object& o);
+
+  // Equality and ordering.
+  friend bool operator==(const Object& lhs, const Object& rhs);
+  friend bool operator!=(const Object& lhs, const Object& rhs);
+  friend bool operator<(const Object& lhs, const Object& rhs);
+  friend bool operator<=(const Object& lhs, const Object& rhs);
+  friend bool operator>(const Object& lhs, const Object& rhs);
+  friend bool operator>=(const Object& lhs, const Object& rhs);
+
+private:
+  // Raw in-place storage (4 * 8 bytes) for the implementation object,
+  // whose type is not visible in this header.
+  static constexpr std::size_t impl_size = 4 * 8;
+  std::aligned_storage_t<impl_size> impl;
+};
+
+// Not the same thing as librados::IoCtx, although it gathers some of the
+// same metadata. Operations tend to come in groups against one pool or
+// namespace, so keeping that metadata together avoids redoing lookups
+// and string copies for every operation.
+class IOContext final {
+  friend RADOS;
+  friend std::hash<IOContext>;
+
+public:
+  IOContext();
+  explicit IOContext(std::int64_t pool);
+  IOContext(std::int64_t _pool, std::string_view _ns);
+  IOContext(std::int64_t _pool, std::string&& _ns);
+  ~IOContext();
+
+  // Copy and move support.
+  IOContext(const IOContext& rhs);
+  IOContext(IOContext&& rhs);
+  IOContext& operator=(const IOContext& rhs);
+  IOContext& operator=(IOContext&& rhs);
+
+  // Every property below is a const getter plus same-named setter
+  // overload(s); optional properties also have a clear_*() member.
+
+  // Pool id.
+  std::int64_t pool() const;
+  void pool(std::int64_t _pool);
+
+  // Namespace.
+  std::string_view ns() const;
+  void ns(std::string_view _ns);
+  void ns(std::string&& _ns);
+
+  // Optional key.
+  std::optional<std::string_view> key() const;
+  void key(std::string_view _key);
+  void key(std::string&& _key);
+  void clear_key();
+
+  // Optional hash.
+  std::optional<std::int64_t> hash() const;
+  void hash(std::int64_t _hash);
+  void clear_hash();
+
+  // Optional snapshot id used for reads.
+  std::optional<std::uint64_t> read_snap() const;
+  void read_snap(std::optional<std::uint64_t> _snapid);
+
+  // Optional snapshot context used for writes, as a (std::uint64_t,
+  // vector of std::uint64_t) pair. It is passed and returned by value:
+  // snapid_t is a separate class type rather than an alias, so the
+  // vector can't actually be move-constructed into place.
+  std::optional<
+    std::pair<std::uint64_t,
+              std::vector<std::uint64_t>>> write_snap_context() const;
+  void write_snap_context(std::optional<
+                          std::pair<std::uint64_t,
+                                    std::vector<std::uint64_t>>> snapc);
+
+  // full_try flag.
+  bool full_try() const;
+  void full_try(bool _full_try);
+
+  friend std::ostream& operator<<(std::ostream& m, const IOContext& o);
+
+  // Equality and ordering.
+  friend bool operator==(const IOContext& lhs, const IOContext& rhs);
+  friend bool operator!=(const IOContext& lhs, const IOContext& rhs);
+  friend bool operator<(const IOContext& lhs, const IOContext& rhs);
+  friend bool operator<=(const IOContext& lhs, const IOContext& rhs);
+  friend bool operator>(const IOContext& lhs, const IOContext& rhs);
+  friend bool operator>=(const IOContext& lhs, const IOContext& rhs);
+
+private:
+  // Raw in-place storage (16 * 8 bytes) for the implementation object,
+  // whose type is not visible in this header.
+  static constexpr std::size_t impl_size = 16 * 8;
+  std::aligned_storage_t<impl_size> impl;
+};
+
+// One-character string holding the byte 0x01.
+// NOTE(review): presumably a reserved namespace value meaning "every
+// namespace" (inferred from the name) -- confirm.
+inline constexpr std::string_view all_nspaces{"\001"sv};
+
+// Comparison selector taken by Op::cmpxattr(). The numeric values are
+// spelled out explicitly and start at 1.
+enum class cmpxattr_op : std::uint8_t {
+  eq  = 1,  // equal
+  ne  = 2,  // not equal
+  gt  = 3,  // greater than
+  gte = 4,  // greater than or equal
+  lt  = 5,  // less than
+  lte = 6,  // less than or equal
+};
+
+// Hint flags accepted by WriteOp::set_alloc_hint(). Each enumerator is a
+// distinct single bit.
+namespace alloc_hint {
+enum alloc_hint_t {
+  sequential_write = 1 << 0,
+  random_write     = 1 << 1,
+  sequential_read  = 1 << 2,
+  random_read      = 1 << 3,
+  append_only      = 1 << 4,
+  immutable        = 1 << 5,
+  shortlived       = 1 << 6,
+  longlived        = 1 << 7,
+  compressible     = 1 << 8,
+  incompressible   = 1 << 9
+};
+} // namespace alloc_hint
+
+// Common base of ReadOp and WriteOp: the operations and flags that are
+// declared for both kinds. Move-only; it can only be constructed through
+// a derived class because the default constructor is protected.
+class Op {
+  friend RADOS;
+
+public:
+  // Completion signature/type used when an operation is executed.
+  using Signature = void(boost::system::error_code);
+  using Completion = ceph::async::Completion<Signature>;
+
+  Op(const Op&) = delete;
+  Op& operator=(const Op&) = delete;
+  Op(Op&&);
+  Op& operator=(Op&&);
+  ~Op();
+
+  void set_excl();
+  void set_failok();
+  void set_fadvise_random();
+  void set_fadvise_sequential();
+  void set_fadvise_willneed();
+  void set_fadvise_dontneed();
+  void set_fadvise_nocache();
+
+  // Comparisons and assertions.
+  void cmpext(std::uint64_t off, ceph::buffer::list&& cmp_bl, std::size_t* s);
+  void cmpxattr(std::string_view name, cmpxattr_op op,
+                const ceph::buffer::list& val);
+  void cmpxattr(std::string_view name, cmpxattr_op op, std::uint64_t val);
+  void assert_version(std::uint64_t ver);
+  void assert_exists();
+  void cmp_omap(const boost::container::flat_map<
+                  std::string,
+                  std::pair<ceph::buffer::list, int>>& assertions);
+
+  // Call `method` of class `cls` with input `inbl`. The overloads differ
+  // only in how the outcome is delivered: an output buffer plus optional
+  // error_code pointer, a callback (with or without an extra int), or
+  // just an optional error_code pointer.
+  void exec(std::string_view cls, std::string_view method,
+            const ceph::buffer::list& inbl,
+            ceph::buffer::list* out,
+            boost::system::error_code* ec = nullptr);
+  void exec(std::string_view cls, std::string_view method,
+            const ceph::buffer::list& inbl,
+            fu2::unique_function<void(boost::system::error_code,
+                                      const ceph::buffer::list&) &&> f);
+  void exec(std::string_view cls, std::string_view method,
+            const ceph::buffer::list& inbl,
+            fu2::unique_function<void(boost::system::error_code, int,
+                                      const ceph::buffer::list&) &&> f);
+  void exec(std::string_view cls, std::string_view method,
+            const ceph::buffer::list& inbl,
+            boost::system::error_code* ec = nullptr);
+
+  // Flags that apply to all ops in the operation vector
+  void balance_reads();
+  void localize_reads();
+  void order_reads_writes();
+  void ignore_cache();
+  void skiprwlocks();
+  void ignore_overlay();
+  void full_try();
+  void full_force();
+  void ignore_redirect();
+  void ordersnap();
+  void returnvec();
+
+  std::size_t size() const;
+
+  friend std::ostream& operator<<(std::ostream& m, const Op& o);
+
+protected:
+  Op();
+
+  // Raw in-place storage (85 * 8 bytes) for the implementation object,
+  // whose type is not visible in this header.
+  static constexpr std::size_t impl_size = 85 * 8;
+  std::aligned_storage_t<impl_size> impl;
+};
+
+// Move-only builder for a compound read operation.
+//
+// This class is /not/ thread-safe. Wrap it in something that locks it if
+// you need to share one.
+//
+// Results are delivered through the pointer arguments; every `ec`
+// parameter is optional and defaults to nullptr.
+class ReadOp final : public Op {
+  friend RADOS;
+
+public:
+  ReadOp() = default;
+
+  ReadOp(const ReadOp&) = delete;
+  ReadOp& operator=(const ReadOp&) = delete;
+
+  ReadOp(ReadOp&&) = default;
+  ReadOp& operator=(ReadOp&&) = default;
+
+  // Object data.
+  // NOTE(review): read() takes its offset as size_t, while sparse_read()
+  // and the WriteOp members take uint64_t offsets -- confirm that this
+  // is intended where size_t is narrower than 64 bits.
+  void read(size_t off, uint64_t len, ceph::buffer::list* out,
+            boost::system::error_code* ec = nullptr);
+  void sparse_read(uint64_t off, uint64_t len,
+                   ceph::buffer::list* out,
+                   std::vector<std::pair<std::uint64_t, std::uint64_t>>* extents,
+                   boost::system::error_code* ec = nullptr);
+  void stat(std::uint64_t* size, ceph::real_time* mtime,
+            boost::system::error_code* ec = nullptr);
+
+  // Extended attributes.
+  void get_xattr(std::string_view name, ceph::buffer::list* out,
+                 boost::system::error_code* ec = nullptr);
+  void get_xattrs(boost::container::flat_map<std::string,
+                                             ceph::buffer::list>* kv,
+                  boost::system::error_code* ec = nullptr);
+
+  // Omap.
+  void get_omap_header(ceph::buffer::list*,
+                       boost::system::error_code* ec = nullptr);
+  void get_omap_keys(std::optional<std::string_view> start_after,
+                     std::uint64_t max_return,
+                     boost::container::flat_set<std::string>* keys,
+                     bool* truncated,
+                     boost::system::error_code* ec = nullptr);
+  void get_omap_vals(std::optional<std::string_view> start_after,
+                     std::optional<std::string_view> filter_prefix,
+                     uint64_t max_return,
+                     boost::container::flat_map<std::string,
+                                                ceph::buffer::list>* kv,
+                     bool* truncated,
+                     boost::system::error_code* ec = nullptr);
+  void get_omap_vals_by_keys(const boost::container::flat_set<std::string>& keys,
+                             boost::container::flat_map<std::string,
+                                                        ceph::buffer::list>* kv,
+                             boost::system::error_code* ec = nullptr);
+
+  // Watchers and snapshots.
+  void list_watchers(std::vector<struct ObjWatcher>* watchers,
+                     boost::system::error_code* ec = nullptr);
+  void list_snaps(struct SnapSet* snaps,
+                  boost::system::error_code* ec = nullptr);
+};
+
+// Move-only builder for a compound write operation. Buffers are taken
+// as rvalue references.
+class WriteOp final : public Op {
+  friend RADOS;
+
+public:
+  WriteOp() = default;
+
+  WriteOp(const WriteOp&) = delete;
+  WriteOp& operator=(const WriteOp&) = delete;
+
+  WriteOp(WriteOp&&) = default;
+  WriteOp& operator=(WriteOp&&) = default;
+
+  void set_mtime(ceph::real_time t);
+
+  // Object lifetime and data.
+  void create(bool exclusive);
+  void write(uint64_t off, ceph::buffer::list&& bl);
+  void write_full(ceph::buffer::list&& bl);
+  void writesame(std::uint64_t off, std::uint64_t write_len,
+                 ceph::buffer::list&& bl);
+  void append(ceph::buffer::list&& bl);
+  void remove();
+  void truncate(uint64_t off);
+  void zero(uint64_t off, uint64_t len);
+
+  // Extended attributes.
+  void rmxattr(std::string_view name);
+  void setxattr(std::string_view name,
+                ceph::buffer::list&& bl);
+
+  void rollback(uint64_t snapid);
+
+  // Omap.
+  void set_omap(const boost::container::flat_map<std::string,
+                                                 ceph::buffer::list>& map);
+  void set_omap_header(ceph::buffer::list&& bl);
+  void clear_omap();
+  void rm_omap_keys(const boost::container::flat_set<std::string>& to_rm);
+
+  // `flags` takes values from the alloc_hint namespace.
+  void set_alloc_hint(uint64_t expected_object_size,
+                      uint64_t expected_write_size,
+                      alloc_hint::alloc_hint_t flags);
+};
+
+
+// Usage figures delivered to the completion of RADOS::statfs() (see
+// StatFSSig).
+// NOTE(review): the kb* members are presumably total / used / available
+// space in kilobytes, and num_objects an object count (inferred from the
+// member names) -- confirm against the implementation.
+struct FSStats {
+ uint64_t kb;
+ uint64_t kb_used;
+ uint64_t kb_avail;
+ uint64_t num_objects;
+};
+
+// Per-pool statistics, delivered by RADOS::stat_pools() as the mapped
+// type of a flat_map keyed by std::string (see PoolStatSig).
+//
+// From librados.h, maybe move into a common file. But I want to see
+// if we need/want to amend/add/remove anything first.
+struct PoolStats {
+ /// space used in bytes
+ uint64_t num_bytes;
+ /// space used in KB
+ uint64_t num_kb;
+ /// number of objects in the pool
+ uint64_t num_objects;
+ /// number of clones of objects
+ uint64_t num_object_clones;
+ /// num_objects * num_replicas
+ uint64_t num_object_copies;
+ /// number of objects missing on primary
+ uint64_t num_objects_missing_on_primary;
+ /// number of objects found on no OSDs
+ uint64_t num_objects_unfound;
+ /// number of objects replicated fewer times than they should be
+ /// (but found on at least one OSD)
+ uint64_t num_objects_degraded;
+ /// number of objects read
+ uint64_t num_rd;
+ /// objects read in KB
+ uint64_t num_rd_kb;
+ /// number of objects written
+ uint64_t num_wr;
+ /// objects written in KB
+ uint64_t num_wr_kb;
+ /// bytes originally provided by user
+ uint64_t num_user_bytes;
+ /// bytes passed to compression
+ /// (NOTE(review): presumably the pre-compression size -- confirm)
+ uint64_t compressed_bytes_orig;
+ /// bytes resulting after compression
+ uint64_t compressed_bytes;
+ /// bytes allocated at storage
+ uint64_t compressed_bytes_alloc;
+};
+
+// Placement group, for PG commands; passed by value to
+// RADOS::pg_command().
+// NOTE(review): `pool` is unsigned here, whereas IOContext and the RADOS
+// pool functions use std::int64_t pool ids -- confirm this is intended.
+struct PG {
+ uint64_t pool;
+ uint32_t seed;
+};
+
+// Position used to delimit RADOS::enumerate_objects() ranges. Cursors
+// are copyable, movable, totally ordered, and convertible to and from a
+// string.
+class Cursor final {
+public:
+  // Named positions.
+  static Cursor begin();
+  static Cursor end();
+
+  Cursor();
+  ~Cursor();
+
+  Cursor(const Cursor&);
+  Cursor(Cursor&&);
+  Cursor& operator=(const Cursor&);
+  Cursor& operator=(Cursor&&);
+
+  // String round trip; from_str() yields an optional, so it can report
+  // that no Cursor was produced.
+  std::string to_str() const;
+  static std::optional<Cursor> from_str(const std::string& s);
+
+  // Equality and ordering.
+  friend bool operator==(const Cursor& lhs, const Cursor& rhs);
+  friend bool operator!=(const Cursor& lhs, const Cursor& rhs);
+  friend bool operator<(const Cursor& lhs, const Cursor& rhs);
+  friend bool operator<=(const Cursor& lhs, const Cursor& rhs);
+  friend bool operator>(const Cursor& lhs, const Cursor& rhs);
+  friend bool operator>=(const Cursor& lhs, const Cursor& rhs);
+
+private:
+  friend RADOS;
+
+  // Tag type selecting the private constructor below.
+  struct end_magic_t {};
+  Cursor(end_magic_t);
+  Cursor(void*);
+
+  // Raw in-place storage (16 * 8 bytes) for the implementation object,
+  // whose type is not visible in this header.
+  static constexpr std::size_t impl_size = 16 * 8;
+  std::aligned_storage_t<impl_size> impl;
+};
+
+class RADOS final
+{
+public:
+ // Compile-time version triple of this interface; currently {0, 0, 1}.
+ static constexpr std::tuple<uint32_t, uint32_t, uint32_t> version() {
+   using Version = std::tuple<uint32_t, uint32_t, uint32_t>;
+   return Version{0, 0, 1};
+ }
+
+ using BuildSig = void(boost::system::error_code, RADOS);
+ using BuildComp = ceph::async::Completion<BuildSig>;
+ // Collects settings and then asynchronously builds a RADOS handle.
+ // Every setter returns *this, so calls can be chained.
+ class Builder {
+   std::optional<std::string> conf_files;
+   std::optional<std::string> cluster;
+   std::optional<std::string> name;
+   std::vector<std::pair<std::string, std::string>> configs;
+   bool no_default_conf = false;
+   bool no_mon_conf = false;
+
+ public:
+   Builder() = default;
+
+   Builder& add_conf_file(std::string_view v);
+
+   // Store a copy of the cluster name.
+   Builder& set_cluster(std::string_view c) {
+     cluster = std::string(c);
+     return *this;
+   }
+
+   // Store a copy of the name.
+   Builder& set_name(std::string_view n) {
+     name = std::string(n);
+     return *this;
+   }
+
+   // Raise the no_default_conf flag.
+   Builder& set_no_default_conf() {
+     no_default_conf = true;
+     return *this;
+   }
+
+   // Raise the no_mon_conf flag.
+   Builder& set_no_mon_conf() {
+     no_mon_conf = true;
+     return *this;
+   }
+
+   // Append an (option, value) pair; earlier entries are kept.
+   Builder& set_conf_option(std::string_view opt, std::string_view val) {
+     configs.emplace_back(std::string(opt), std::string(val));
+     return *this;
+   }
+
+   // Start the build on ioctx's executor. `token` is an Asio completion
+   // token for BuildSig, i.e. (boost::system::error_code, RADOS); the
+   // return value is whatever that token's async_result yields.
+   template<typename CompletionToken>
+   auto build(boost::asio::io_context& ioctx, CompletionToken&& token) {
+     boost::asio::async_completion<CompletionToken, BuildSig> init(token);
+     build(ioctx,
+           BuildComp::create(ioctx.get_executor(),
+                             std::move(init.completion_handler)));
+     return init.result.get();
+   }
+
+ private:
+   // Non-template overload that the public build() hands off to.
+   void build(boost::asio::io_context& ioctx,
+              std::unique_ptr<BuildComp> c);
+ };
+
+
+ template<typename CompletionToken>
+ static auto make_with_cct(CephContext* cct,
+ boost::asio::io_context& ioctx,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, BuildSig> init(token);
+ make_with_cct(cct, ioctx,
+ BuildComp::create(ioctx.get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ static RADOS make_with_librados(librados::Rados& rados);
+
+ RADOS(const RADOS&) = delete;
+ RADOS& operator =(const RADOS&) = delete;
+
+ RADOS(RADOS&&);
+ RADOS& operator =(RADOS&&);
+
+ ~RADOS();
+
+ CephContext* cct();
+
+ using executor_type = boost::asio::io_context::executor_type;
+ executor_type get_executor() const;
+ boost::asio::io_context& get_io_context();
+
+ template<typename CompletionToken>
+ auto execute(const Object& o, const IOContext& ioc, ReadOp&& op,
+ ceph::buffer::list* bl,
+ CompletionToken&& token, uint64_t* objver = nullptr,
+ const blkin_trace_info* trace_info = nullptr) {
+ boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+ execute(o, ioc, std::move(op), bl,
+ ReadOp::Completion::create(get_executor(),
+ std::move(init.completion_handler)),
+ objver, trace_info);
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto execute(const Object& o, const IOContext& ioc, WriteOp&& op,
+ CompletionToken&& token, uint64_t* objver = nullptr,
+ const blkin_trace_info* trace_info = nullptr) {
+ boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+ execute(o, ioc, std::move(op),
+ Op::Completion::create(get_executor(),
+ std::move(init.completion_handler)),
+ objver, trace_info);
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto execute(const Object& o, std::int64_t pool,
+ ReadOp&& op,
+ ceph::buffer::list* bl,
+ CompletionToken&& token,
+ std::optional<std::string_view> ns = {},
+ std::optional<std::string_view> key = {},
+ uint64_t* objver = nullptr) {
+ boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+ execute(o, pool, std::move(op), bl,
+ ReadOp::Completion::create(get_executor(),
+ std::move(init.completion_handler)),
+ ns, key, objver);
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto execute(const Object& o, std::int64_t pool, WriteOp&& op,
+ CompletionToken&& token,
+ std::optional<std::string_view> ns = {},
+ std::optional<std::string_view> key = {},
+ uint64_t* objver = nullptr) {
+ boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+ execute(o, pool, std::move(op),
+ Op::Completion::create(get_executor(),
+ std::move(init.completion_handler)),
+ ns, key, objver);
+ return init.result.get();
+ }
+
+ boost::uuids::uuid get_fsid() const noexcept;
+
+ using LookupPoolSig = void(boost::system::error_code,
+ std::int64_t);
+ using LookupPoolComp = ceph::async::Completion<LookupPoolSig>;
+ template<typename CompletionToken>
+ auto lookup_pool(std::string_view name,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, LookupPoolSig> init(token);
+ lookup_pool(name,
+ LookupPoolComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ std::optional<uint64_t> get_pool_alignment(int64_t pool_id);
+
+ using LSPoolsSig = void(std::vector<std::pair<std::int64_t, std::string>>);
+ using LSPoolsComp = ceph::async::Completion<LSPoolsSig>;
+ template<typename CompletionToken>
+ auto list_pools(CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, LSPoolsSig> init(token);
+ list_pools(LSPoolsComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+
+
+ using SimpleOpSig = void(boost::system::error_code);
+ using SimpleOpComp = ceph::async::Completion<SimpleOpSig>;
+ template<typename CompletionToken>
+ auto create_pool_snap(int64_t pool, std::string_view snapName,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ create_pool_snap(pool, snapName,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ using SMSnapSig = void(boost::system::error_code, std::uint64_t);
+ using SMSnapComp = ceph::async::Completion<SMSnapSig>;
+ template<typename CompletionToken>
+ auto allocate_selfmanaged_snap(int64_t pool,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SMSnapSig> init(token);
+ allocate_selfmanaged_snap(pool,
+ SMSnapComp::create(
+ get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto delete_pool_snap(int64_t pool, std::string_view snapName,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ delete_pool_snap(pool, snapName,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ // Asynchronously delete a self-managed snapshot of `pool`.
+ //
+ // pool:  id of the pool.
+ // snap:  id of the snapshot. It is a std::uint64_t, matching both the
+ //        private overload this forwards to and the id type produced by
+ //        allocate_selfmanaged_snap(). The previous std::string_view
+ //        parameter could not be converted to that type, so the call
+ //        below could only resolve back to this template and never
+ //        instantiated successfully.
+ // token: Asio completion token for SimpleOpSig, i.e.
+ //        (boost::system::error_code).
+ //
+ // Returns whatever the completion token's async_result yields.
+ template<typename CompletionToken>
+ auto delete_selfmanaged_snap(int64_t pool, std::uint64_t snap,
+                              CompletionToken&& token) {
+   boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+   delete_selfmanaged_snap(pool, snap,
+                           SimpleOpComp::create(
+                             get_executor(),
+                             std::move(init.completion_handler)));
+   return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto create_pool(std::string_view name, std::optional<int> crush_rule,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ create_pool(name, crush_rule,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto delete_pool(std::string_view name,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ delete_pool(name,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto delete_pool(int64_t pool,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ delete_pool(pool,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ using PoolStatSig = void(boost::system::error_code,
+ boost::container::flat_map<std::string,
+ PoolStats>, bool);
+ using PoolStatComp = ceph::async::Completion<PoolStatSig>;
+ template<typename CompletionToken>
+ auto stat_pools(const std::vector<std::string>& pools,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, PoolStatSig> init(token);
+ stat_pools(pools,
+ PoolStatComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ using StatFSSig = void(boost::system::error_code,
+ FSStats);
+ using StatFSComp = ceph::async::Completion<StatFSSig>;
+ // Asynchronously fetch usage statistics.
+ //
+ // pool:  optional pool id. NOTE(review): an empty optional presumably
+ //        requests cluster-wide figures -- confirm.
+ // token: Asio completion token for StatFSSig, i.e.
+ //        (boost::system::error_code, FSStats).
+ //
+ // Returns whatever the completion token's async_result yields.
+ template<typename CompletionToken>
+ auto statfs(std::optional<int64_t> pool,
+             CompletionToken&& token) {
+   boost::asio::async_completion<CompletionToken, StatFSSig> init(token);
+   // Forward to the private stat_fs() member. This class declares no
+   // function called ceph_statfs, so the previous call could not be
+   // resolved once the template was instantiated.
+   stat_fs(pool, StatFSComp::create(get_executor(),
+                                    std::move(init.completion_handler)));
+   return init.result.get();
+ }
+
+ using WatchCB = fu2::unique_function<void(boost::system::error_code,
+ uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ ceph::buffer::list&& bl)>;
+
+ using WatchSig = void(boost::system::error_code ec,
+ uint64_t cookie);
+ using WatchComp = ceph::async::Completion<WatchSig>;
+ template<typename CompletionToken>
+ auto watch(const Object& o, const IOContext& ioc,
+ std::optional<std::chrono::seconds> timeout,
+ WatchCB&& cb, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, WatchSig> init(token);
+ watch(o, ioc, timeout, std::move(cb),
+ WatchComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto watch(const Object& o, std::int64_t pool,
+ std::optional<std::chrono::seconds> timeout,
+ WatchCB&& cb, CompletionToken&& token,
+ std::optional<std::string_view> ns = {},
+ std::optional<std::string_view> key = {}) {
+ boost::asio::async_completion<CompletionToken, WatchSig> init(token);
+ watch(o, pool, timeout, std::move(cb),
+ WatchComp::create(get_executor(),
+ std::move(init.completion_handler)),
+ ns, key);
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto notify_ack(const Object& o,
+ const IOContext& ioc,
+ uint64_t notify_id,
+ uint64_t cookie,
+ ceph::buffer::list&& bl,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ notify_ack(o, ioc, notify_id, cookie, std::move(bl),
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ // Asynchronously acknowledge a notification on object `o`, addressing
+ // the object by pool id and optional namespace/key instead of an
+ // IOContext.
+ //
+ // notify_id, cookie: the values received by the watch callback
+ //                    (see WatchCB).
+ // bl:                buffer sent along with the acknowledgement.
+ // token:             Asio completion token for SimpleOpSig, i.e.
+ //                    (boost::system::error_code).
+ //
+ // The completion signature has to be SimpleOpSig because the completion
+ // created below is a SimpleOpComp, exactly as in the IOContext
+ // overload. Declaring it as WatchSig promised the handler a cookie
+ // argument that is never supplied.
+ //
+ // Returns whatever the completion token's async_result yields.
+ template<typename CompletionToken>
+ auto notify_ack(const Object& o,
+                 std::int64_t pool,
+                 uint64_t notify_id,
+                 uint64_t cookie,
+                 ceph::buffer::list&& bl,
+                 CompletionToken&& token,
+                 std::optional<std::string_view> ns = {},
+                 std::optional<std::string_view> key = {}) {
+   boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+   notify_ack(o, pool, notify_id, cookie, std::move(bl),
+              SimpleOpComp::create(get_executor(),
+                                   std::move(init.completion_handler)),
+              ns, key);
+   return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto unwatch(uint64_t cookie, const IOContext& ioc,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ unwatch(cookie, ioc,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto unwatch(uint64_t cookie, std::int64_t pool,
+ CompletionToken&& token,
+ std::optional<std::string_view> ns = {},
+ std::optional<std::string_view> key = {}) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ unwatch(cookie, pool,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)),
+ ns, key);
+ return init.result.get();
+ }
+
+ // This is one of those places where having to force everything into
+ // a .cc file is really infuriating. If we had modules, that would
+ // let us separate out the implementation details without
+ // sacrificing all the benefits of templates.
+ using VoidOpSig = void();
+ using VoidOpComp = ceph::async::Completion<VoidOpSig>;
+ template<typename CompletionToken>
+ auto flush_watch(CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, VoidOpSig> init(token);
+ flush_watch(VoidOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ using NotifySig = void(boost::system::error_code, ceph::buffer::list);
+ using NotifyComp = ceph::async::Completion<NotifySig>;
+ template<typename CompletionToken>
+ auto notify(const Object& oid, const IOContext& ioc, ceph::buffer::list&& bl,
+ std::optional<std::chrono::milliseconds> timeout,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, NotifySig> init(token);
+ notify(oid, ioc, std::move(bl), timeout,
+ NotifyComp::create(get_executor(),
+ std::move(init.completion_handler)));
+
+ return init.result.get();
+ }
+
+ // Asynchronously send a notification on object `oid`, addressing the
+ // object by pool id and optional namespace/key instead of an IOContext.
+ //
+ // bl:      payload; it is consumed (moved from).
+ // timeout: optional timeout in milliseconds.
+ // token:   Asio completion token for NotifySig, i.e.
+ //          (boost::system::error_code, ceph::buffer::list).
+ //
+ // Returns whatever the completion token's async_result yields.
+ template<typename CompletionToken>
+ auto notify(const Object& oid, std::int64_t pool, ceph::buffer::list&& bl,
+             std::optional<std::chrono::milliseconds> timeout,
+             CompletionToken&& token,
+             std::optional<std::string_view> ns = {},
+             std::optional<std::string_view> key = {}) {
+   boost::asio::async_completion<CompletionToken, NotifySig> init(token);
+   // `bl` is a named rvalue reference and therefore an lvalue here; the
+   // private overload takes ceph::buffer::list&&, so it must be moved
+   // explicitly, as the IOContext overload already does.
+   notify(oid, pool, std::move(bl), timeout,
+          NotifyComp::create(get_executor(),
+                             std::move(init.completion_handler)),
+          ns, key);
+
+   return init.result.get();
+ }
+
+ // The versions with pointers are fine for coroutines, but
+ // extraordinarily unappealing for callback-oriented programming.
+ using EnumerateSig = void(boost::system::error_code,
+ std::vector<Entry>,
+ Cursor);
+ using EnumerateComp = ceph::async::Completion<EnumerateSig>;
+ template<typename CompletionToken>
+ auto enumerate_objects(const IOContext& ioc, const Cursor& begin,
+ const Cursor& end, const std::uint32_t max,
+ const ceph::buffer::list& filter,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, EnumerateSig> init(token);
+ enumerate_objects(ioc, begin, end, max, filter,
+ EnumerateComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto enumerate_objects(std::int64_t pool, const Cursor& begin,
+ const Cursor& end, const std::uint32_t max,
+ const ceph::buffer::list& filter,
+ CompletionToken&& token,
+ std::optional<std::string_view> ns = {},
+ std::optional<std::string_view> key = {}) {
+ boost::asio::async_completion<CompletionToken, EnumerateSig> init(token);
+ enumerate_objects(pool, begin, end, max, filter,
+ EnumerateComp::create(get_executor(),
+ std::move(init.completion_handler)),
+ ns, key);
+ return init.result.get();
+ }
+
+ using CommandSig = void(boost::system::error_code,
+ std::string, ceph::buffer::list);
+ using CommandComp = ceph::async::Completion<CommandSig>;
+ template<typename CompletionToken>
+ auto osd_command(int osd, std::vector<std::string>&& cmd,
+ ceph::buffer::list&& in, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+ osd_command(osd, std::move(cmd), std::move(in),
+ CommandComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+ template<typename CompletionToken>
+ auto pg_command(PG pg, std::vector<std::string>&& cmd,
+ ceph::buffer::list&& in, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+ pg_command(pg, std::move(cmd), std::move(in),
+ CommandComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto mon_command(std::vector<std::string> command,
+ const ceph::buffer::list& bl,
+ std::string* outs, ceph::buffer::list* outbl,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ mon_command(command, bl, outs, outbl,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto enable_application(std::string_view pool, std::string_view app_name,
+ bool force, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ enable_application(pool, app_name, force,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto blocklist_add(std::string_view client_address,
+ std::optional<std::chrono::seconds> expire,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ blocklist_add(client_address, expire,
+ SimpleOpComp::create(get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ template<typename CompletionToken>
+ auto wait_for_latest_osd_map(CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+ wait_for_latest_osd_map(
+ SimpleOpComp::create(get_executor(), std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ uint64_t instance_id() const;
+
+private:
+
+ RADOS();
+
+ friend Builder;
+
+ RADOS(std::unique_ptr<detail::Client> impl);
+ static void make_with_cct(CephContext* cct,
+ boost::asio::io_context& ioctx,
+ std::unique_ptr<BuildComp> c);
+
+ void execute(const Object& o, const IOContext& ioc, ReadOp&& op,
+ ceph::buffer::list* bl, std::unique_ptr<Op::Completion> c,
+ uint64_t* objver, const blkin_trace_info* trace_info);
+
+ void execute(const Object& o, const IOContext& ioc, WriteOp&& op,
+ std::unique_ptr<Op::Completion> c, uint64_t* objver,
+ const blkin_trace_info* trace_info);
+
+ void execute(const Object& o, std::int64_t pool, ReadOp&& op,
+ ceph::buffer::list* bl, std::unique_ptr<Op::Completion> c,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key,
+ uint64_t* objver);
+
+ void execute(const Object& o, std::int64_t pool, WriteOp&& op,
+ std::unique_ptr<Op::Completion> c,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key,
+ uint64_t* objver);
+
+ void lookup_pool(std::string_view name, std::unique_ptr<LookupPoolComp> c);
+ void list_pools(std::unique_ptr<LSPoolsComp> c);
+ void create_pool_snap(int64_t pool, std::string_view snapName,
+ std::unique_ptr<SimpleOpComp> c);
+ void allocate_selfmanaged_snap(int64_t pool, std::unique_ptr<SMSnapComp> c);
+ void delete_pool_snap(int64_t pool, std::string_view snapName,
+ std::unique_ptr<SimpleOpComp> c);
+ void delete_selfmanaged_snap(int64_t pool, std::uint64_t snap,
+ std::unique_ptr<SimpleOpComp> c);
+ void create_pool(std::string_view name, std::optional<int> crush_rule,
+ std::unique_ptr<SimpleOpComp> c);
+ void delete_pool(std::string_view name,
+ std::unique_ptr<SimpleOpComp> c);
+ void delete_pool(int64_t pool,
+ std::unique_ptr<SimpleOpComp> c);
+ void stat_pools(const std::vector<std::string>& pools,
+ std::unique_ptr<PoolStatComp> c);
+ void stat_fs(std::optional<std::int64_t> pool,
+ std::unique_ptr<StatFSComp> c);
+
+ // Watch/notify declarations. Most operations come in two overloads: one
+ // addressing the object through an IOContext, one through a raw pool id with
+ // optional namespace (ns) and key. Completions are handed over as unique_ptr.
+ void watch(const Object& o, const IOContext& ioc,
+ std::optional<std::chrono::seconds> timeout,
+ WatchCB&& cb, std::unique_ptr<WatchComp> c);
+ void watch(const Object& o, std::int64_t pool,
+ std::optional<std::chrono::seconds> timeout,
+ WatchCB&& cb, std::unique_ptr<WatchComp> c,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key);
+ // Synchronous by signature: returns either a ceph::timespan or an error_code
+ // for the watch identified by cookie.
+ tl::expected<ceph::timespan, boost::system::error_code>
+ watch_check(uint64_t cookie);
+ void notify_ack(const Object& o,
+ const IOContext& _ioc,
+ uint64_t notify_id,
+ uint64_t cookie,
+ ceph::buffer::list&& bl,
+ std::unique_ptr<SimpleOpComp>);
+ void notify_ack(const Object& o,
+ std::int64_t pool,
+ uint64_t notify_id,
+ uint64_t cookie,
+ ceph::buffer::list&& bl,
+ std::unique_ptr<SimpleOpComp>,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key);
+ void unwatch(uint64_t cookie, const IOContext& ioc,
+ std::unique_ptr<SimpleOpComp>);
+ void unwatch(uint64_t cookie, std::int64_t pool,
+ std::unique_ptr<SimpleOpComp>,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key);
+ // Note the timeout unit differs from watch(): milliseconds here, seconds there.
+ void notify(const Object& oid, const IOContext& ioctx,
+ ceph::buffer::list&& bl,
+ std::optional<std::chrono::milliseconds> timeout,
+ std::unique_ptr<NotifyComp> c);
+ void notify(const Object& oid, std::int64_t pool,
+ ceph::buffer::list&& bl,
+ std::optional<std::chrono::milliseconds> timeout,
+ std::unique_ptr<NotifyComp> c,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key);
+ void flush_watch(std::unique_ptr<VoidOpComp>);
+
+ // Object enumeration declarations. All overloads take a [begin, end) style
+ // pair of Cursors, a maximum count and a filter buffer.
+ // NOTE(review): whether `end` is exclusive is not visible here -- confirm.
+ //
+ // The first two overloads write through the out-pointers `ls` and `cursor`
+ // and take a SimpleOpComp completion.
+ void enumerate_objects(const IOContext& ioc, const Cursor& begin,
+ const Cursor& end, const std::uint32_t max,
+ const ceph::buffer::list& filter,
+ std::vector<Entry>* ls,
+ Cursor* cursor,
+ std::unique_ptr<SimpleOpComp> c);
+ void enumerate_objects(std::int64_t pool, const Cursor& begin,
+ const Cursor& end, const std::uint32_t max,
+ const ceph::buffer::list& filter,
+ std::vector<Entry>* ls,
+ Cursor* cursor,
+ std::unique_ptr<SimpleOpComp> c,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key);
+ // The next two overloads have no out-pointers and take an EnumerateComp
+ // completion instead.
+ void enumerate_objects(const IOContext& ioc, const Cursor& begin,
+ const Cursor& end, const std::uint32_t max,
+ const ceph::buffer::list& filter,
+ std::unique_ptr<EnumerateComp> c);
+ void enumerate_objects(std::int64_t pool, const Cursor& begin,
+ const Cursor& end, const std::uint32_t max,
+ const ceph::buffer::list& filter,
+ std::unique_ptr<EnumerateComp> c,
+ std::optional<std::string_view> ns,
+ std::optional<std::string_view> key);
+ // Command declarations addressed to an OSD number or a PG; the command
+ // vector and the input buffer are taken by rvalue reference.
+ void osd_command(int osd, std::vector<std::string>&& cmd,
+ ceph::buffer::list&& in, std::unique_ptr<CommandComp> c);
+ void pg_command(PG pg, std::vector<std::string>&& cmd,
+ ceph::buffer::list&& in, std::unique_ptr<CommandComp> c);
+
+ // Unlike osd_command/pg_command, the command vector is taken by value and the
+ // input buffer by const reference; results go to the out-pointers `outs` and
+ // `outbl`.
+ void mon_command(std::vector<std::string> command,
+ const ceph::buffer::list& bl,
+ std::string* outs, ceph::buffer::list* outbl,
+ std::unique_ptr<SimpleOpComp> c);
+
+ // Takes the pool by name, an application name and a force flag.
+ void enable_application(std::string_view pool, std::string_view app_name,
+ bool force, std::unique_ptr<SimpleOpComp> c);
+
+ // `expire` is optional; NOTE(review): the behavior when it is empty is not
+ // visible here -- confirm.
+ void blocklist_add(std::string_view client_address,
+ std::optional<std::chrono::seconds> expire,
+ std::unique_ptr<SimpleOpComp> c);
+
+ void wait_for_latest_osd_map(std::unique_ptr<SimpleOpComp> c);
+
+ // Proxy object to provide access to low-level RADOS messaging clients
+ // (solely owned by this handle through the unique_ptr).
+ std::unique_ptr<detail::Client> impl;
+};
+
+// Error values specific to this library. Numbering starts at 1, so no
+// enumerator has the value 0 (invalid_snapcontext is implicitly 2).
+enum class errc {
+ pool_dne = 1,
+ invalid_snapcontext
+};
+
+// Accessor for the error category that the make_error_code() and
+// make_error_condition() helpers below pair with errc values. Declared here,
+// defined elsewhere.
+const boost::system::error_category& error_category() noexcept;
+}
+
+namespace boost::system {
+// Trait specializations for neorados::errc: it is flagged as an error-code
+// enum (value = true) and explicitly not as an error-condition enum
+// (value = false). Per Boost.System, the first is what enables implicit
+// conversion of an errc value to boost::system::error_code.
+template<>
+struct is_error_code_enum<::neorados::errc> {
+ static const bool value = true;
+};
+
+template<>
+struct is_error_condition_enum<::neorados::errc> {
+ static const bool value = false;
+};
+}
+
+namespace neorados {
+/// Pairs the integer value of @p e with this library's error category to
+/// form a boost::system::error_code.
+inline boost::system::error_code make_error_code(errc e) noexcept {
+  const int raw = static_cast<int>(e);
+  return boost::system::error_code(raw, error_category());
+}
+
+/// Pairs the integer value of @p e with this library's error category to
+/// form a boost::system::error_condition.
+inline boost::system::error_condition make_error_condition(errc e) noexcept {
+  const int raw = static_cast<int>(e);
+  return boost::system::error_condition(raw, error_category());
+}
+}
+
+namespace std {
+// std::hash specializations for Object and IOContext. Only the call operators
+// are declared here; they have no body in this header.
+template<>
+struct hash<neorados::Object> {
+ size_t operator ()(const neorados::Object& r) const;
+};
+template<>
+struct hash<neorados::IOContext> {
+ size_t operator ()(const neorados::IOContext& r) const;
+};
+} // namespace std
+
+#endif // NEORADOS_RADOS_HPP
diff --git a/src/include/neorados/RADOS_Decodable.hpp b/src/include/neorados/RADOS_Decodable.hpp
new file mode 100644
index 000000000..9654a8489
--- /dev/null
+++ b/src/include/neorados/RADOS_Decodable.hpp
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef NEORADOS_RADOS_DECODABLE_HPP
+#define NEORADOS_RADOS_DECODABLE_HPP
+
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+namespace neorados {
+/// Names an object by namespace, object id and locator.
+///
+/// Comparisons are lexicographic over (nspace, oid, locator), in that order.
+struct Entry {
+  std::string nspace;
+  std::string oid;
+  std::string locator;
+
+  Entry() = default;
+  /// Sink constructor: each argument is taken by value and moved into place,
+  /// so callers passing temporaries pay no string copy.
+  Entry(std::string nspace, std::string oid, std::string locator) :
+    nspace(std::move(nspace)), oid(std::move(oid)),
+    locator(std::move(locator)) {}
+};
+
+// Both operands are taken by const reference: taking the right-hand side by
+// value would copy three strings on every comparison.
+inline bool operator ==(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) ==
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator !=(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) !=
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator <(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) <
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator <=(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) <=
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator >=(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) >=
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator >(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) >
+    std::tie(r.nspace, r.oid, r.locator);
+}
+
+/// Prints "[nspace/]oid[@locator]"; the bracketed parts are omitted when the
+/// corresponding field is empty.
+inline std::ostream& operator <<(std::ostream& out, const Entry& entry) {
+  if (!entry.nspace.empty())
+    out << entry.nspace << '/';
+  out << entry.oid;
+  if (!entry.locator.empty())
+    out << '@' << entry.locator;
+  return out;
+}
+
+/// Per-clone record; all scalar fields start at zero and the vectors start
+/// empty.
+struct CloneInfo {
+  uint64_t cloneid{0};
+  std::vector<uint64_t> snaps; // ascending
+  std::vector<std::pair<uint64_t, uint64_t>> overlap; // with next newest
+  uint64_t size{0};
+  CloneInfo() = default;
+};
+
+/// A list of clones plus a sequence number; default state is no clones and
+/// seq == 0.
+struct SnapSet {
+  std::vector<CloneInfo> clones; // ascending
+  std::uint64_t seq{0}; // newest snapid seen by the object
+  SnapSet() = default;
+};
+
+/// Describes one watcher of an object.
+///
+/// The scalar members carry default initializers so that a default-constructed
+/// ObjWatcher never exposes indeterminate values. The struct remains an
+/// aggregate, so brace initialization with explicit values still works.
+struct ObjWatcher {
+  /// Address of the Watcher
+  std::string addr;
+  /// Watcher ID
+  std::int64_t watcher_id = 0;
+  /// Cookie
+  std::uint64_t cookie = 0;
+  /// Timeout in Seconds
+  std::uint32_t timeout_seconds = 0;
+};
+}
+
+namespace std {
+/// Hash for neorados::Entry: combines the std::string hashes of nspace, oid
+/// and locator with shifts and XOR.
+template<>
+struct hash<::neorados::Entry> {
+  // Taken by const reference; by-value would copy three strings per hash.
+  std::size_t operator ()(const ::neorados::Entry& e) const {
+    hash<std::string> h;
+    return (h(e.nspace) << 2) ^ (h(e.oid) << 1) ^ h(e.locator);
+  }
+};
+}
+
+#endif // NEORADOS_RADOS_DECODABLE_HPP
diff --git a/src/include/neorados/buffer_fwd.h b/src/include/neorados/buffer_fwd.h
new file mode 120000
index 000000000..bd1f6f1b0
--- /dev/null
+++ b/src/include/neorados/buffer_fwd.h
@@ -0,0 +1 @@
+../buffer_fwd.h \ No newline at end of file
diff --git a/src/include/neorados/completion.h b/src/include/neorados/completion.h
new file mode 120000
index 000000000..100678fc2
--- /dev/null
+++ b/src/include/neorados/completion.h
@@ -0,0 +1 @@
+../../common/async/completion.h \ No newline at end of file
diff --git a/src/include/object.h b/src/include/object.h
new file mode 100644
index 000000000..96951e74d
--- /dev/null
+++ b/src/include/object.h
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECT_H
+#define CEPH_OBJECT_H
+
+#include <cstdint>
+#include <cstdio>
+#include <iomanip>
+#include <iosfwd>
+#include <string>
+#include <string>
+#include <string_view>
+
+#include "include/rados.h"
+#include "include/unordered_map.h"
+
+#include "hash.h"
+#include "encoding.h"
+#include "ceph_hash.h"
+#include "cmp.h"
+
+using namespace std;
+
+/// Thin wrapper around the object's name string. Implicitly constructible
+/// from C strings, std::string (copy or move) and std::string_view.
+struct object_t {
+  std::string name;
+
+  object_t() {}
+  // cppcheck-suppress noExplicitConstructor
+  object_t(const char *s) : name{s} {}
+  // cppcheck-suppress noExplicitConstructor
+  object_t(const std::string& s) : name{s} {}
+  object_t(std::string&& s) : name{std::move(s)} {}
+  object_t(std::string_view s) : name{s} {}
+
+  /// Exchanges the names of the two objects.
+  void swap(object_t& o) {
+    o.name.swap(name);
+  }
+  /// Resets the name to the empty string.
+  void clear() {
+    name.clear();
+  }
+
+  /// Appends the encoded name to @p bl.
+  void encode(ceph::buffer::list &bl) const {
+    using ceph::encode;
+    encode(name, bl);
+  }
+  /// Reads the name from @p bl.
+  void decode(ceph::buffer::list::const_iterator &bl) {
+    using ceph::decode;
+    decode(name, bl);
+  }
+};
+WRITE_CLASS_ENCODER(object_t)
+
+// Relational operators for object_t: plain string ordering on the name.
+inline bool operator==(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) == 0;
+}
+inline bool operator!=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) != 0;
+}
+inline bool operator>(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) > 0;
+}
+inline bool operator<(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) < 0;
+}
+inline bool operator>=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) >= 0;
+}
+inline bool operator<=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) <= 0;
+}
+/// Streams the bare name.
+inline std::ostream& operator<<(std::ostream& out, const object_t& o) {
+  out << o.name;
+  return out;
+}
+
+namespace std {
+/// Hashes an object_t by feeding the bytes of its name to
+/// ceph_str_hash_linux().
+template<> struct hash<object_t> {
+  size_t operator()(const object_t& r) const {
+    //static hash<string> H;
+    //return H(r.name);
+    const std::string& key = r.name;
+    return ceph_str_hash_linux(key.c_str(), key.length());
+  }
+};
+} // namespace std
+
+
+/// Object name derived from an (ino, bno) pair, rendered as
+/// "<ino hex>.<bno hex, at least 8 digits>".
+struct file_object_t {
+  uint64_t ino, bno;
+  // 16 hex digits + '.' + 16 hex digits + NUL = 34 bytes at most.
+  // mutable so that the const c_str() can fill it lazily.
+  mutable char buf[34];
+
+  file_object_t(uint64_t i=0, uint64_t b=0) : ino(i), bno(b) {
+    buf[0] = 0;  // empty buffer marks "not formatted yet"
+  }
+
+  /// Returns the formatted name. The string is produced on the first call and
+  /// cached in buf; later changes to ino/bno are not reflected.
+  const char *c_str() const {
+    if (!buf[0])
+      snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)bno);
+    return buf;
+  }
+
+  /// Converts to an object_t holding the formatted name. const-qualified
+  /// (c_str() is const too) so that const file_object_t values convert as well.
+  operator object_t() const {
+    return object_t(c_str());
+  }
+};
+
+
+// ---------------------------
+// snaps
+
+/// Snapshot id: a uint64_t wrapper that converts implicitly in both directions.
+struct snapid_t {
+  uint64_t val;
+  // cppcheck-suppress noExplicitConstructor
+  snapid_t(uint64_t v=0) : val{v} {}
+  /// Adds @p o to this id; returns a copy of the updated value.
+  snapid_t operator+=(snapid_t o) {
+    val = val + o.val;
+    return *this;
+  }
+  /// Pre-increment; returns a copy of the updated value.
+  snapid_t operator++() {
+    val = val + 1;
+    return *this;
+  }
+  operator uint64_t() const { return val; }
+};
+
+// Free encode/decode for snapid_t: both simply forward the wrapped 64-bit
+// value. The using-declaration brings ceph::encode / ceph::decode into scope
+// before the unqualified call.
+inline void encode(snapid_t i, ceph::buffer::list &bl) {
+ using ceph::encode;
+ encode(i.val, bl);
+}
+inline void decode(snapid_t &i, ceph::buffer::list::const_iterator &p) {
+ using ceph::decode;
+ decode(i.val, p);
+}
+
+// denc_traits specialization for snapid_t. The four flags are compile-time
+// constants, and all three operations delegate to denc() on the wrapped
+// value.
+// NOTE(review): the exact meaning of each flag is defined by the denc
+// framework (denc.h), which is not visible here.
+template<>
+struct denc_traits<snapid_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const snapid_t& o, size_t& p) {
+ denc(o.val, p);
+ }
+ static void encode(const snapid_t &o, ceph::buffer::list::contiguous_appender& p) {
+ denc(o.val, p);
+ }
+ static void decode(snapid_t& o, ceph::buffer::ptr::const_iterator &p) {
+ denc(o.val, p);
+ }
+};
+
+/// Prints "head" for CEPH_NOSNAP, "snapdir" for CEPH_SNAPDIR, and the value in
+/// hexadecimal otherwise. The stream is switched back to decimal after the
+/// hexadecimal output.
+inline std::ostream& operator<<(std::ostream& out, const snapid_t& s) {
+  if (s == CEPH_NOSNAP) {
+    out << "head";
+    return out;
+  }
+  if (s == CEPH_SNAPDIR) {
+    out << "snapdir";
+    return out;
+  }
+  out << std::hex << s.val << std::dec;
+  return out;
+}
+
+
+/// An object name paired with a snapshot id.
+struct sobject_t {
+  object_t oid;
+  snapid_t snap;
+
+  sobject_t() : snap(0) {}
+  /// Sink constructor: @p o is taken by value and moved into place, so the
+  /// name string is not copied a second time.
+  sobject_t(object_t o, snapid_t s) : oid(std::move(o)), snap(s) {}
+
+  /// Exchanges both the object name and the snapshot id with @p o.
+  void swap(sobject_t& o) {
+    oid.swap(o.oid);
+    std::swap(snap, o.snap);
+  }
+
+  /// Encodes oid followed by snap.
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(oid, bl);
+    encode(snap, bl);
+  }
+  /// Decodes oid followed by snap (same order as encode()).
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    using ceph::decode;
+    decode(oid, bl);
+    decode(snap, bl);
+  }
+};
+WRITE_CLASS_ENCODER(sobject_t)
+
+// Relational operators for sobject_t: order by oid first, and by snap only
+// when the oids are equal.
+inline bool operator==(const sobject_t &l, const sobject_t &r) {
+  if (!(l.oid == r.oid))
+    return false;
+  return l.snap == r.snap;
+}
+inline bool operator!=(const sobject_t &l, const sobject_t &r) {
+  if (l.oid != r.oid)
+    return true;
+  return l.snap != r.snap;
+}
+inline bool operator>(const sobject_t &l, const sobject_t &r) {
+  if (l.oid > r.oid)
+    return true;
+  return l.oid == r.oid && l.snap > r.snap;
+}
+inline bool operator<(const sobject_t &l, const sobject_t &r) {
+  if (l.oid < r.oid)
+    return true;
+  return l.oid == r.oid && l.snap < r.snap;
+}
+inline bool operator>=(const sobject_t &l, const sobject_t &r) {
+  if (l.oid > r.oid)
+    return true;
+  return l.oid == r.oid && l.snap >= r.snap;
+}
+inline bool operator<=(const sobject_t &l, const sobject_t &r) {
+  if (l.oid < r.oid)
+    return true;
+  return l.oid == r.oid && l.snap <= r.snap;
+}
+/// Prints "<oid>/<snap>".
+inline std::ostream& operator<<(std::ostream& out, const sobject_t &o) {
+  out << o.oid << "/" << o.snap;
+  return out;
+}
+namespace std {
+/// Hash for sobject_t: XOR of the object_t hash of oid and the rjhash of snap.
+template<> struct hash<sobject_t> {
+  size_t operator()(const sobject_t &r) const {
+    static hash<object_t> oid_hasher;
+    static rjhash<uint64_t> snap_hasher;
+    const size_t oid_part = oid_hasher(r.oid);
+    return oid_part ^ snap_hasher(r.snap);
+  }
+};
+} // namespace std
+
+#endif
diff --git a/src/include/on_exit.h b/src/include/on_exit.h
new file mode 100644
index 000000000..c412ab33e
--- /dev/null
+++ b/src/include/on_exit.h
@@ -0,0 +1,49 @@
+#ifndef CEPH_ON_EXIT_H
+#define CEPH_ON_EXIT_H
+
+#include <pthread.h>
+#include <vector>
+
+#include "include/ceph_assert.h"
+/*
+ * Create a static instance at the file level to get callbacks called when the
+ * process exits via main() or exit().
+ */
+
+/**
+ * Holds a list of (function, argument) callbacks and invokes them, in
+ * registration order, from its destructor.
+ *
+ * The callbacks run while the internal mutex is held, so a callback must not
+ * call add_callback() on the same manager.
+ */
+class OnExitManager {
+ public:
+  typedef void (*callback_t)(void *arg);
+
+  OnExitManager() {
+    int ret = pthread_mutex_init(&lock_, NULL);
+    ceph_assert(ret == 0);
+  }
+
+  ~OnExitManager() {
+    pthread_mutex_lock(&lock_);
+    for (const cb& entry : funcs_) {
+      entry.func(entry.arg);
+    }
+    funcs_.clear();
+    pthread_mutex_unlock(&lock_);
+    // Pair the pthread_mutex_init() in the constructor; the mutex must be
+    // unlocked before it is destroyed.
+    pthread_mutex_destroy(&lock_);
+  }
+
+  /// Registers @p func to be called with @p arg when this object is destroyed.
+  void add_callback(callback_t func, void *arg) {
+    pthread_mutex_lock(&lock_);
+    struct cb callback = { func, arg };
+    funcs_.push_back(callback);
+    pthread_mutex_unlock(&lock_);
+  }
+
+ private:
+  // One registered callback and the argument to pass to it.
+  struct cb {
+    callback_t func;
+    void *arg;
+  };
+
+  std::vector<struct cb> funcs_;
+  pthread_mutex_t lock_;
+};
+
+#endif
diff --git a/src/include/page.h b/src/include/page.h
new file mode 100644
index 000000000..db6e20585
--- /dev/null
+++ b/src/include/page.h
@@ -0,0 +1,18 @@
+#ifndef CEPH_PAGE_H
+#define CEPH_PAGE_H
+
+namespace ceph {
+ // these are in common/page.cc
+ // Declarations only (extern); the CEPH_PAGE_SIZE / CEPH_PAGE_MASK /
+ // CEPH_PAGE_SHIFT macros at the end of this header expand to these names.
+ // NOTE(review): those macros sit after the include guard's #endif, so they
+ // are re-defined (identically) on every inclusion -- confirm this is intended.
+ extern unsigned _page_size;
+ extern unsigned long _page_mask;
+ extern unsigned _page_shift;
+}
+
+#endif
+
+
+#define CEPH_PAGE_SIZE ceph::_page_size
+#define CEPH_PAGE_MASK ceph::_page_mask
+#define CEPH_PAGE_SHIFT ceph::_page_shift
+
+
diff --git a/src/include/rados.h b/src/include/rados.h
new file mode 100644
index 000000000..148465771
--- /dev/null
+++ b/src/include/rados.h
@@ -0,0 +1,696 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include <string.h>
+#include <stdbool.h>
+#include "msgr.h"
+
+/* See comment in ceph_fs.h. */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * fs id
+ */
+/* 16 raw bytes identifying a file system. */
+struct ceph_fsid {
+	unsigned char fsid[16];
+};
+
+/*
+ * Byte-wise comparison of two fsids. Returns zero when they are equal and a
+ * negative or positive value otherwise (memcmp semantics).
+ */
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{
+	/* the struct consists of the byte array only, so this covers all of it */
+	return memcmp(a->fsid, b->fsid, sizeof(a->fsid));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+/*
+ * Seconds / nanoseconds pair held in two __le32 fields; packed, so the
+ * compiler inserts no padding. (Outside the kernel, __le32 is ceph_le32 per
+ * the #defines near the top of this header.)
+ */
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+/* Packed: two __le16 fields followed by one __le32 field, with no padding. */
+struct ceph_pg {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg pool types
+ *
+ * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values. They are
+ * duplicated here only for CrushCompiler's benefit.
+ */
+#define CEPH_PG_TYPE_REPLICATED 1
+/* #define CEPH_PG_TYPE_RAID4 2 never implemented */
+#define CEPH_PG_TYPE_ERASURE 3
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ *
+ * ** This function is released to the public domain by the author. **
+ */
+/*
+ * Maps x into [0, b): x masked with bmask when that already falls below b,
+ * otherwise x masked with the next smaller mask (bmask >> 1).
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	const int masked = x & bmask;
+
+	return masked < b ? masked : (x & (bmask >> 1));
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+/* Packed: an embedded ceph_pg followed by one __le32 field, no padding. */
+struct ceph_object_layout {
+ struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+/*
+ * Packed: a __le32 epoch directly followed by a __le64 version, so the
+ * version field is not naturally aligned.
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS (1<<0)
+#define CEPH_OSD_UP (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
+#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
+#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */
+#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */
+#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */
+#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */
+#define CEPH_OSD_NOUP (1<<8) /* osd can not be marked up */
+#define CEPH_OSD_NODOWN (1<<9) /* osd can not be marked down */
+#define CEPH_OSD_NOIN (1<<10) /* osd can not be marked in */
+#define CEPH_OSD_NOOUT (1<<11) /* osd can not be marked out */
+#define CEPH_OSD_STOP (1<<12) /* osd has been stopped by admin */
+
+extern const char *ceph_osd_state_name(int s);
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), deprecated since mimic*/
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), deprecated since mimic */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
+#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */
+#define CEPH_OSDMAP_NOSNAPTRIM (1<<21) /* disable snap trimming */
+#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */
+
+/* these are hidden in 'ceph status' view */
+#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS | \
+ CEPH_OSDMAP_RECOVERY_DELETES | \
+ CEPH_OSDMAP_SORTBITWISE | \
+ CEPH_OSDMAP_PURGED_SNAPDIRS | \
+ CEPH_OSDMAP_PGLOG_HARDLIMIT)
+#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS)
+
+/*
+ * major ceph release numbers
+ */
+#define CEPH_RELEASE_ARGONAUT 1
+#define CEPH_RELEASE_BOBTAIL 2
+#define CEPH_RELEASE_CUTTLEFISH 3
+#define CEPH_RELEASE_DUMPLING 4
+#define CEPH_RELEASE_EMPEROR 5
+#define CEPH_RELEASE_FIREFLY 6
+#define CEPH_RELEASE_GIANT 7
+#define CEPH_RELEASE_HAMMER 8
+#define CEPH_RELEASE_INFERNALIS 9
+#define CEPH_RELEASE_JEWEL 10
+#define CEPH_RELEASE_KRAKEN 11
+#define CEPH_RELEASE_LUMINOUS 12
+#define CEPH_RELEASE_MIMIC 13
+#define CEPH_RELEASE_NAUTILUS 14
+#define CEPH_RELEASE_OCTOPUS 15
+#define CEPH_RELEASE_PACIFIC 16
+#define CEPH_RELEASE_MAX 17 /* highest + 1 */
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
+
+/*
+ * osd ops
+ *
+ * WARNING: do not use these op codes directly. Use the helpers
+ * defined below instead. In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+// LEAVE UNUSED 0x0600 used to be multiobject ops
+
+#define __CEPH_OSD_OP1(mode, nr) \
+ (CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+ (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f) \
+ /** data **/ \
+ /* read */ \
+ f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \
+ f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \
+ f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \
+ f(CHECKSUM, __CEPH_OSD_OP(RD, DATA, 31), "checksum") \
+ \
+ /* fancy read */ \
+ f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \
+ f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \
+ \
+ f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \
+ f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \
+ \
+ /* versioning */ \
+ f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \
+ \
+ f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \
+ \
+ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \
+ \
+ /* sync */ \
+ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \
+ \
+ /* write */ \
+ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \
+ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \
+ f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \
+ f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \
+ f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \
+ \
+ /* fancy write */ \
+ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
+ f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \
+ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
+ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
+ \
+ f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \
+ f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \
+ f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \
+ \
+ f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \
+ f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \
+ \
+ f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \
+ \
+ /* omap */ \
+ f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \
+ f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \
+ f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \
+ f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+ f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \
+ f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \
+ f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \
+ f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \
+ f(OMAPRMKEYRANGE, __CEPH_OSD_OP(WR, DATA, 44), "omap-rm-key-range") \
+ f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \
+ \
+ /* tiering */ \
+ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
+ f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \
+ /* was copy-get-classic */ \
+ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
+ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
+ f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \
+ f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \
+ f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \
+ f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+ \
+ /* convert tmap to omap */ \
+ f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \
+ \
+ /* hints */ \
+ f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \
+ \
+ /* cache pin/unpin */ \
+ f(CACHE_PIN, __CEPH_OSD_OP(WR, DATA, 36), "cache-pin") \
+ f(CACHE_UNPIN, __CEPH_OSD_OP(WR, DATA, 37), "cache-unpin") \
+ \
+ /* ESX/SCSI */ \
+ f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \
+ f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \
+ \
+ /* Extensible */ \
+ f(SET_REDIRECT, __CEPH_OSD_OP(WR, DATA, 39), "set-redirect") \
+ f(SET_CHUNK, __CEPH_OSD_OP(CACHE, DATA, 40), "set-chunk") \
+ f(TIER_PROMOTE, __CEPH_OSD_OP(WR, DATA, 41), "tier-promote") \
+ f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42), "unset-manifest") \
+ f(TIER_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 43), "tier-flush") \
+ f(TIER_EVICT, __CEPH_OSD_OP(CACHE, DATA, 44), "tier-evict") \
+ \
+ /** attrs **/ \
+ /* read */ \
+ f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \
+ f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \
+ f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \
+ \
+ /* write */ \
+ f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \
+ f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \
+ f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \
+ f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \
+ \
+ /** subop **/ \
+ f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \
+ f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \
+ f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \
+ f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \
+ f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \
+ f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \
+ f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \
+ /* 8 used to be scrub-stop */ \
+ f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \
+ \
+ /** exec **/ \
+ /* note: the RD bit here is wrong; see special-case below in helper */ \
+ f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \
+ \
+ /** pg **/ \
+ f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \
+ f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \
+ f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \
+ f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") \
+ f(PGNLS, __CEPH_OSD_OP(RD, PG, 5), "pgnls") \
+ f(PGNLS_FILTER, __CEPH_OSD_OP(RD, PG, 6), "pgnls-filter") \
+ f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls")
+
+/*
+ * Expands the __CEPH_FORALL_OSD_OPS table into one enumerator per row:
+ * CEPH_OSD_OP_<op> = <opcode>. The third column (the name string) is not used
+ * by this expansion. The helper macro is undefined again right after use.
+ */
+enum {
+#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY
+};
+
+/*
+ * Type predicates: each returns 1 when the type field of op (the bits under
+ * CEPH_OSD_OP_TYPE) equals the respective CEPH_OSD_OP_TYPE_* value, else 0.
+ */
+static inline int ceph_osd_op_type_data(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_PG;
+}
+
+/* 1 when the whole mode field of op equals CEPH_OSD_OP_MODE_SUB, else 0. */
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	const int mode_bits = op & CEPH_OSD_OP_MODE;
+
+	return mode_bits == CEPH_OSD_OP_MODE_SUB;
+}
+/*
+ * 1 when the RD mode bit is set, except for CEPH_OSD_OP_CALL, which is never
+ * reported as a read here; 0 otherwise.
+ */
+static inline int ceph_osd_op_mode_read(int op)
+{
+	if (op == CEPH_OSD_OP_CALL)
+		return 0;
+	return (op & CEPH_OSD_OP_MODE_RD) != 0;
+}
+/* Non-zero (the raw WR bit) when the WR mode bit is set, else 0. */
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	const int wr_bit = op & CEPH_OSD_OP_MODE_WR;
+
+	return wr_bit;
+}
+/* Non-zero (the raw CACHE bit) when the CACHE mode bit is set, else 0. */
+static inline int ceph_osd_op_mode_cache(int op)
+{
+	const int cache_bit = op & CEPH_OSD_OP_MODE_CACHE;
+
+	return cache_bit;
+}
+/*
+ * True exactly for the opcodes listed below; false for every other value.
+ */
+static inline bool ceph_osd_op_uses_extent(int op)
+{
+	return op == CEPH_OSD_OP_READ ||
+	       op == CEPH_OSD_OP_MAPEXT ||
+	       op == CEPH_OSD_OP_MASKTRUNC ||
+	       op == CEPH_OSD_OP_SPARSE_READ ||
+	       op == CEPH_OSD_OP_SYNC_READ ||
+	       op == CEPH_OSD_OP_WRITE ||
+	       op == CEPH_OSD_OP_WRITEFULL ||
+	       op == CEPH_OSD_OP_TRUNCATE ||
+	       op == CEPH_OSD_OP_ZERO ||
+	       op == CEPH_OSD_OP_APPEND ||
+	       op == CEPH_OSD_OP_TRIMTRUNC ||
+	       op == CEPH_OSD_OP_CMPEXT;
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * and objclass.h. Any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
+#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
+
+extern const char *ceph_osd_op_name(int op);
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+/* Request flag bits: every enumerator is a distinct single bit (0x1 .. 0x4000000). */
+enum {
+ CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
+ CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
+ CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
+ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
+ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
+ CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
+ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
+ CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000, /* ignore pool overlay */
+ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000, /* map snap direct to clone id
+ */
+ CEPH_OSD_FLAG_ENFORCE_SNAPC =0x100000, /* use snapc provided even if
+ pool uses pool snaps */
+ CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
+ CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
+ CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
+ CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
+ CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */
+ CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */
+};
+
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object create */
+ CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */
+ CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */
+ CEPH_OSD_OP_FLAG_WITH_REFERENCE = 0x80, /* need reference counting */
+ CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */
+};
+
+#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLOCKLISTED 108 /* blocklisted */
+#define EBLACKLISTED 108 /* deprecated */
+
+/* xattr comparison */
+enum {
+ CEPH_OSD_CMPXATTR_OP_EQ = 1,
+ CEPH_OSD_CMPXATTR_OP_NE = 2,
+ CEPH_OSD_CMPXATTR_OP_GT = 3,
+ CEPH_OSD_CMPXATTR_OP_GTE = 4,
+ CEPH_OSD_CMPXATTR_OP_LT = 5,
+ CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+ CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+ CEPH_OSD_CMPXATTR_MODE_U64 = 2
+};
+
+enum {
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+ * cloneid */
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* use provided truncate_{seq,size} (copy-from2 only) */
+};
+
+#define CEPH_OSD_COPY_FROM_FLAGS \
+ (CEPH_OSD_COPY_FROM_FLAG_FLUSH | \
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | \
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | \
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | \
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED | \
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)
+
+enum {
+ CEPH_OSD_TMAP2OMAP_NULLOK = 1,
+};
+
+enum {
+ CEPH_OSD_WATCH_OP_UNWATCH = 0,
+ CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+ /* note: use only ODD ids to prevent pre-giant code from
+ interpreting the op as UNWATCH */
+ CEPH_OSD_WATCH_OP_WATCH = 3,
+ CEPH_OSD_WATCH_OP_RECONNECT = 5,
+ CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+enum {
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0,
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1,
+ CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C = 2
+};
+
+const char *ceph_osd_watch_op_name(int o);
+
+enum {
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+const char *ceph_osd_alloc_hint_flag_name(int f);
+
+enum {
+ CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+ CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
+const char *ceph_osd_backoff_op_name(int op);
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_OP_FLAG_* */
+ union {
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 count;
+ __le32 start_epoch; /* for the pgls sequence */
+ } __attribute__ ((packed)) pgls;
+ struct {
+ __le64 snapid;
+ } __attribute__ ((packed)) snap;
+ struct {
+ __le64 cookie;
+ __le64 ver; /* no longer used */
+ __u8 op; /* CEPH_OSD_WATCH_OP_* */
+ __u32 gen; /* registration generation */
+ __u32 timeout; /* connection timeout */
+ } __attribute__ ((packed)) watch;
+ struct {
+ __le64 cookie;
+ } __attribute__ ((packed)) notify;
+ struct {
+ __le64 unused;
+ __le64 ver;
+ } __attribute__ ((packed)) assert_ver;
+ struct {
+ __le64 offset, length;
+ __le64 src_offset;
+ } __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 max; /* max data in reply */
+ } __attribute__ ((packed)) copy_get;
+ struct {
+ __le64 snapid;
+ __le64 src_version;
+ __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+ /*
+ * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+ * for src object, flags for dest object are in
+ * ceph_osd_op::flags.
+ */
+ __le32 src_fadvise_flags;
+ } __attribute__ ((packed)) copy_from;
+ struct {
+ struct ceph_timespec stamp;
+ } __attribute__ ((packed)) hit_set_get;
+ struct {
+ __u8 flags;
+ } __attribute__ ((packed)) tmap2omap;
+ struct {
+ __le64 expected_object_size;
+ __le64 expected_write_size;
+ __le32 flags; /* CEPH_OSD_ALLOC_HINT_FLAG_* */
+ } __attribute__ ((packed)) alloc_hint;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le64 data_length;
+ } __attribute__ ((packed)) writesame;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le32 chunk_size;
+ __u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */
+ } __attribute__ ((packed)) checksum;
+ } __attribute__ ((packed));
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * Check the compatibility of struct ceph_osd_op
+ * (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) +
+ * sizeof(ceph_osd_op::flags) +
+ * sizeof(ceph_osd_op::extent) +
+ * sizeof(ceph_osd_op::payload_len))
+ */
+#ifdef __cplusplus
+static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4),
+ "sizeof(ceph_osd_op) breaks the compatibility");
+#endif
+
+struct ceph_osd_reply_head {
+ __le32 client_inc; /* client incarnation */
+ __le32 flags;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+ struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+ __le32 result; /* result code */
+
+ __le32 object_len; /* length of object name */
+ __le32 num_ops;
+ struct ceph_osd_op ops[0]; /* ops[], object */
+} __attribute__ ((packed));
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
new file mode 120000
index 000000000..51fc03be1
--- /dev/null
+++ b/src/include/rados/buffer.h
@@ -0,0 +1 @@
+../buffer.h \ No newline at end of file
diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h
new file mode 120000
index 000000000..bd1f6f1b0
--- /dev/null
+++ b/src/include/rados/buffer_fwd.h
@@ -0,0 +1 @@
+../buffer_fwd.h \ No newline at end of file
diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h
new file mode 120000
index 000000000..19ef4317e
--- /dev/null
+++ b/src/include/rados/crc32c.h
@@ -0,0 +1 @@
+../crc32c.h \ No newline at end of file
diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h
new file mode 120000
index 000000000..48f0d4436
--- /dev/null
+++ b/src/include/rados/inline_memory.h
@@ -0,0 +1 @@
+../inline_memory.h \ No newline at end of file
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
new file mode 100644
index 000000000..d9e39eceb
--- /dev/null
+++ b/src/include/rados/librados.h
@@ -0,0 +1,4135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRADOS_H
+#define CEPH_LIBRADOS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <unistd.h>
+#include <string.h>
+#include "rados_types.h"
+
+#include <sys/time.h>
+
+#ifndef CEPH_OSD_TMAP_SET
+/* These are also defined in rados.h and objclass.h. Keep them in sync! */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c'
+#define CEPH_OSD_TMAP_RM 'r'
+#endif
+
+#define LIBRADOS_VER_MAJOR 3
+#define LIBRADOS_VER_MINOR 0
+#define LIBRADOS_VER_EXTRA 0
+
+#define LIBRADOS_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* pack major.minor.extra; args parenthesized so expression arguments expand safely */
+
+#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA)
+
+#define LIBRADOS_SUPPORTS_WATCH 1
+#define LIBRADOS_SUPPORTS_SERVICES 1
+#define LIBRADOS_SUPPORTS_GETADDRS 1
+#define LIBRADOS_SUPPORTS_APP_METADATA 1
+
+/* RADOS lock flags
+ * They are also defined in cls_lock_types.h. Keep them in sync!
+ */
+#define LIBRADOS_LOCK_FLAG_RENEW (1u<<0)
+#define LIBRADOS_LOCK_FLAG_MAY_RENEW LIBRADOS_LOCK_FLAG_RENEW
+#define LIBRADOS_LOCK_FLAG_MUST_RENEW (1u<<1)
+
+/*
+ * Constants for rados_write_op_create().
+ */
+#define LIBRADOS_CREATE_EXCLUSIVE 1
+#define LIBRADOS_CREATE_IDEMPOTENT 0
+
+/*
+ * Flags that can be set on a per-op basis via
+ * rados_read_op_set_flags() and rados_write_op_set_flags().
+ */
+enum {
+ // fail a create operation if the object already exists
+ LIBRADOS_OP_FLAG_EXCL = 0x1,
+ // allow the transaction to succeed even if the flagged op fails
+ LIBRADOS_OP_FLAG_FAILOK = 0x2,
+ // indicate read/write op random
+ LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4,
+ // indicate read/write op sequential
+ LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8,
+ // indicate read/write data will be accessed in the near future (by someone)
+ LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10,
+ // indicate read/write data will not be accessed in the near future (by anyone)
+ LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20,
+ // indicate read/write data will not be accessed again (by *this* client)
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40,
+ // optionally support FUA (force unit access) on write requests
+ LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80,
+};
+
+#define CEPH_RADOS_API
+
+/**
+ * @name xattr comparison operations
+ * Operators for comparing xattrs on objects, and aborting the
+ * rados_read_op or rados_write_op transaction if the comparison
+ * fails.
+ *
+ * @{
+ */
+enum {
+ LIBRADOS_CMPXATTR_OP_EQ = 1,
+ LIBRADOS_CMPXATTR_OP_NE = 2,
+ LIBRADOS_CMPXATTR_OP_GT = 3,
+ LIBRADOS_CMPXATTR_OP_GTE = 4,
+ LIBRADOS_CMPXATTR_OP_LT = 5,
+ LIBRADOS_CMPXATTR_OP_LTE = 6
+};
+/** @} */
+
+/**
+ * @name Operation Flags
+ * Flags for rados_read_op_operate(), rados_write_op_operate(),
+ * rados_aio_read_op_operate(), and rados_aio_write_op_operate().
+ * See librados.hpp for details.
+ * @{
+ */
+enum {
+ LIBRADOS_OPERATION_NOFLAG = 0,
+ LIBRADOS_OPERATION_BALANCE_READS = 1,
+ LIBRADOS_OPERATION_LOCALIZE_READS = 2,
+ LIBRADOS_OPERATION_ORDER_READS_WRITES = 4,
+ LIBRADOS_OPERATION_IGNORE_CACHE = 8,
+ LIBRADOS_OPERATION_SKIPRWLOCKS = 16,
+ LIBRADOS_OPERATION_IGNORE_OVERLAY = 32,
+ /* send requests to cluster despite the cluster or pool being marked
+ full; ops will either succeed (e.g., delete) or return EDQUOT or
+ ENOSPC. */
+ LIBRADOS_OPERATION_FULL_TRY = 64,
+ /*
+ * Mainly for delete op
+ */
+ LIBRADOS_OPERATION_FULL_FORCE = 128,
+ LIBRADOS_OPERATION_IGNORE_REDIRECT = 256,
+ LIBRADOS_OPERATION_ORDERSNAP = 512,
+ /* enable/allow >0 return values and payloads on write/update */
+ LIBRADOS_OPERATION_RETURNVEC = 1024,
+};
+/** @} */
+
+/**
+ * @name Alloc hint flags
+ * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2()
+ * indicating future IO patterns.
+ * @{
+ */
+enum {
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+/** @} */
+
+typedef enum {
+ LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0,
+ LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1,
+ LIBRADOS_CHECKSUM_TYPE_CRC32C = 2
+} rados_checksum_type_t;
+
+/*
+ * snap id constants
+ */
+#define LIBRADOS_SNAP_HEAD ((uint64_t)(-2))
+#define LIBRADOS_SNAP_DIR ((uint64_t)(-1))
+
+/**
+ * @typedef rados_t
+ *
+ * A handle for interacting with a RADOS cluster. It encapsulates all
+ * RADOS client configuration, including username, key for
+ * authentication, logging, and debugging. Talking to different clusters
+ * -- or to the same cluster with different users -- requires
+ * different cluster handles.
+ */
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif //VOIDPTR_RADOS_T
+
+/**
+ * @typedef rados_config_t
+ *
+ * A handle for the ceph configuration context for the rados_t cluster
+ * instance. This can be used to share configuration context/state
+ * (e.g., logging configuration) between librados instance.
+ *
+ * @warning The config context does not have independent reference
+ * counting. As such, a rados_config_t handle retrieved from a given
+ * rados_t is only valid as long as that rados_t.
+ */
+typedef void *rados_config_t;
+
+/**
+ * @typedef rados_ioctx_t
+ *
+ * An io context encapsulates a few settings for all I/O operations
+ * done on it:
+ * - pool - set when the io context is created (see rados_ioctx_create())
+ * - snapshot context for writes (see
+ * rados_ioctx_selfmanaged_snap_set_write_ctx())
+ * - snapshot id to read from (see rados_ioctx_snap_set_read())
+ * - object locator for all single-object operations (see
+ * rados_ioctx_locator_set_key())
+ * - namespace for all single-object operations (see
+ * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES
+ * before rados_nobjects_list_open() will list all objects in all
+ * namespaces.
+ *
+ * @warning Changing any of these settings is not thread-safe -
+ * librados users must synchronize any of these changes on their own,
+ * or use separate io contexts for each thread
+ */
+typedef void *rados_ioctx_t;
+
+/**
+ * @typedef rados_list_ctx_t
+ *
+ * An iterator for listing the objects in a pool.
+ * Used with rados_nobjects_list_open(),
+ * rados_nobjects_list_next(), rados_nobjects_list_next2(), and
+ * rados_nobjects_list_close().
+ */
+typedef void *rados_list_ctx_t;
+
+/**
+ * @typedef rados_object_list_cursor
+ *
+ * The cursor used with rados_enumerate_objects
+ * and accompanying methods.
+ */
+typedef void * rados_object_list_cursor;
+
+/**
+ * @struct rados_object_list_item
+ *
+ * The item populated by rados_object_list in
+ * the results array.
+ */
+typedef struct {
+
+ /// oid length
+ size_t oid_length;
+ /// name of the object
+ char *oid;
+ /// namespace length
+ size_t nspace_length;
+ /// the object namespace
+ char *nspace;
+ /// locator length
+ size_t locator_length;
+ /// object locator
+ char *locator;
+} rados_object_list_item;
+
+/**
+ * @typedef rados_snap_t
+ * The id of a snapshot.
+ */
+typedef uint64_t rados_snap_t;
+
+/**
+ * @typedef rados_xattrs_iter_t
+ * An iterator for listing extended attributes on an object.
+ * Used with rados_getxattrs(), rados_getxattrs_next(), and
+ * rados_getxattrs_end().
+ */
+typedef void *rados_xattrs_iter_t;
+
+/**
+ * @typedef rados_omap_iter_t
+ * An iterator for listing omap key/value pairs on an object.
+ * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and
+ * rados_omap_get_end().
+ */
+typedef void *rados_omap_iter_t;
+
+/**
+ * @struct rados_pool_stat_t
+ * Usage information for a pool.
+ */
+struct rados_pool_stat_t {
+ /// space used in bytes
+ uint64_t num_bytes;
+ /// space used in KB
+ uint64_t num_kb;
+ /// number of objects in the pool
+ uint64_t num_objects;
+ /// number of clones of objects
+ uint64_t num_object_clones;
+ /// num_objects * num_replicas
+ uint64_t num_object_copies;
+ /// number of objects missing on primary
+ uint64_t num_objects_missing_on_primary;
+ /// number of objects found on no OSDs
+ uint64_t num_objects_unfound;
+ /// number of objects replicated fewer times than they should be
+ /// (but found on at least one OSD)
+ uint64_t num_objects_degraded;
+ /// number of objects read
+ uint64_t num_rd;
+ /// objects read in KB
+ uint64_t num_rd_kb;
+ /// number of objects written
+ uint64_t num_wr;
+ /// objects written in KB
+ uint64_t num_wr_kb;
+ /// bytes originally provided by user
+ uint64_t num_user_bytes;
+ /// bytes passed compression
+ uint64_t compressed_bytes_orig;
+ /// bytes resulted after compression
+ uint64_t compressed_bytes;
+ /// bytes allocated at storage
+ uint64_t compressed_bytes_alloc;
+};
+
+/**
+ * @struct rados_cluster_stat_t
+ * Cluster-wide usage information
+ */
+struct rados_cluster_stat_t {
+ /// total device size
+ uint64_t kb;
+ /// total used
+ uint64_t kb_used;
+ /// total available/free
+ uint64_t kb_avail;
+ /// number of objects
+ uint64_t num_objects;
+};
+
+/**
+ * @typedef rados_write_op_t
+ *
+ * An object write operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_write_op() rados_release_write_op()
+ * - Extended attribute manipulation: rados_write_op_cmpxattr(),
+ * rados_write_op_setxattr(),
+ * rados_write_op_rmxattr()
+ * - Object map key/value pairs: rados_write_op_omap_set(),
+ * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(),
+ * rados_write_op_omap_cmp()
+ * - Object properties: rados_write_op_assert_exists(),
+ * rados_write_op_assert_version()
+ * - Creating objects: rados_write_op_create()
+ * - IO on objects: rados_write_op_append(), rados_write_op_write(),
+ * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove(),
+ * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext()
+ * - Hints: rados_write_op_set_alloc_hint()
+ * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate()
+ */
+typedef void *rados_write_op_t;
+
+/**
+ * @typedef rados_read_op_t
+ *
+ * An object read operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_read_op() rados_release_read_op()
+ * - Extended attribute manipulation: rados_read_op_cmpxattr(),
+ * rados_read_op_getxattr(), rados_read_op_getxattrs()
+ * - Object map key/value pairs: rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(),
+ * rados_read_op_omap_cmp()
+ * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(),
+ * rados_read_op_assert_version()
+ * - IO on objects: rados_read_op_read(), rados_read_op_checksum(),
+ * rados_read_op_cmpext()
+ * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf()
+ * - Request properties: rados_read_op_set_flags()
+ * - Performing the operation: rados_read_op_operate(),
+ * rados_aio_read_op_operate()
+ */
+typedef void *rados_read_op_t;
+
+/**
+ * @typedef rados_completion_t
+ * Represents the state of an asynchronous operation - it contains the
+ * return value once the operation completes, and can be used to block
+ * until the operation is complete or safe.
+ */
+typedef void *rados_completion_t;
+
+/**
+ * @struct blkin_trace_info
+ * blkin trace information for Zipkin tracing
+ */
+struct blkin_trace_info;
+
+/**
+ * Get the version of librados.
+ *
+ * The version number is major.minor.extra. Note that this is
+ * unrelated to the Ceph version number.
+ *
+ * TODO: define version semantics, i.e.:
+ * - incrementing major is for backwards-incompatible changes
+ * - incrementing minor is for backwards-compatible changes
+ * - incrementing extra is for bug fixes
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param extra where to store the extra version number
+ */
+CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra);
+
+/**
+ * @name Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using librados.
+ *
+ * @{
+ */
+
+/**
+ * Create a handle for communicating with a RADOS cluster.
+ *
+ * Ceph environment variables are read when this is called, so if
+ * $CEPH_ARGS specifies everything you need to connect, no further
+ * configuration is necessary.
+ *
+ * @param cluster where to store the handle
+ * @param id the user to connect as (i.e. admin, not client.admin)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id);
+
+/**
+ * Extended version of rados_create.
+ *
+ * Like rados_create, but
+ * 1) don't assume 'client\.'+id; allow full specification of name
+ * 2) allow specification of cluster name
+ * 3) flags for future expansion
+ */
+CEPH_RADOS_API int rados_create2(rados_t *pcluster,
+ const char *const clustername,
+ const char * const name, uint64_t flags);
+
+/**
+ * Initialize a cluster handle from an existing configuration.
+ *
+ * Share configuration state with another rados_t instance.
+ *
+ * @param cluster where to store the handle
+ * @param cct the existing configuration to use
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create_with_context(rados_t *cluster,
+ rados_config_t cct);
+
+/**
+ * Ping the monitor with ID mon_id, storing the resulting reply in
+ * outstr (if specified) and its size in outstrlen.
+ *
+ * The result buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param mon_id [in] ID of the monitor to ping
+ * @param outstr [out] double pointer with the resulting reply
+ * @param outstrlen [out] pointer with the size of the reply in outstr
+ */
+CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id,
+ char **outstr, size_t *outstrlen);
+
+/**
+ * Connect to the cluster.
+ *
+ * @note BUG: Before calling this, calling a function that communicates with the
+ * cluster will crash.
+ *
+ * @pre The cluster handle is configured with at least a monitor
+ * address. If cephx is enabled, a client name and secret must also be
+ * set.
+ *
+ * @post If this succeeds, any function in librados may be used
+ *
+ * @param cluster The cluster to connect to.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_connect(rados_t cluster);
+
+/**
+ * Disconnects from the cluster.
+ *
+ * For clean up, this is only necessary after rados_connect() has
+ * succeeded.
+ *
+ * @warning This does not guarantee any asynchronous writes have
+ * completed. To do that, you must call rados_aio_flush() on all open
+ * io contexts.
+ *
+ * @warning We implicitly call rados_watch_flush() on shutdown. If
+ * there are watches being used, this should be done explicitly before
+ * destroying the relevant IoCtx. We do it here as a safety measure.
+ *
+ * @post the cluster handle cannot be used again
+ *
+ * @param cluster the cluster to shutdown
+ */
+CEPH_RADOS_API void rados_shutdown(rados_t cluster);
+
+/** @} init */
+
+/**
+ * @name Configuration
+ * These functions read and update Ceph configuration for a cluster
+ * handle. Any configuration changes must be done before connecting to
+ * the cluster.
+ *
+ * Options that librados users might want to set include:
+ * - mon_host
+ * - auth_supported
+ * - key, keyfile, or keyring when using cephx
+ * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog
+ * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms
+ *
+ * See docs.ceph.com for information about available configuration options.
+ *
+ * @{
+ */
+
+/**
+ * Configure the cluster handle using a Ceph config file
+ *
+ * If path is NULL, the default locations are searched, and the first
+ * found is used. The locations are:
+ * - $CEPH_CONF (environment variable)
+ * - /etc/ceph/ceph.conf
+ * - ~/.ceph/config
+ * - ceph.conf (in the current working directory)
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param path path to a Ceph configuration file
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path);
+
+/**
+ * Configure the cluster handle with command line arguments
+ *
+ * argv can contain any common Ceph command line option, including any
+ * configuration parameter prefixed by '--' and replacing spaces with
+ * dashes or underscores. For example, the following options are equivalent:
+ * - --mon-host 10.0.0.1:6789
+ * - --mon_host 10.0.0.1:6789
+ * - -m 10.0.0.1:6789
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc,
+ const char **argv);
+
+
+/**
+ * Configure the cluster handle with command line arguments, returning
+ * any remainders. Same as rados_conf_parse_argv(), except for the extra
+ * remargv argument, which holds the returned unrecognized arguments.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @param remargv char* array for returned unrecognized arguments
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc,
+ const char **argv,
+ const char **remargv);
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cluster cluster handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var);
+
+/**
+ * Set a configuration option
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param option option to set
+ * @param value value of the option
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when the option is not a Ceph configuration option
+ */
+CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option,
+ const char *value);
+
+/**
+ * Get the value of a configuration option
+ *
+ * @param cluster configuration to read
+ * @param option which option to read
+ * @param buf where to write the configuration value
+ * @param len the size of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENAMETOOLONG if the buffer is too short to contain the
+ * requested value
+ */
+CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option,
+ char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * Read usage info about the cluster
+ *
+ * This tells you total space, space used, space available, and number
+ * of objects. These are not updated immediately when data is written,
+ * they are eventually consistent.
+ *
+ * @param cluster cluster to query
+ * @param result where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cluster_stat(rados_t cluster,
+ struct rados_cluster_stat_t *result);
+
+/**
+ * Get the fsid of the cluster as a hexadecimal string.
+ *
+ * The fsid is a unique id of an entire Ceph cluster.
+ *
+ * @param cluster where to get the fsid
+ * @param buf where to write the fsid
+ * @param len the size of buf in bytes (should be 37)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short to contain the
+ * fsid
+ */
+CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len);
+
+/**
+ * Get/wait for the most recent osdmap
+ *
+ * @param cluster cluster handle
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
+
+/**
+ * @name Pools
+ *
+ * RADOS pools are separate namespaces for objects. Pools may have
+ * different crush rules associated with them, so they could have
+ * differing replication levels or placement strategies. RADOS
+ * permissions are also tied to pools - users can have different read,
+ * write, and execute permissions on a per-pool basis.
+ *
+ * @{
+ */
+
+/**
+ * List pools
+ *
+ * Gets a list of pool names as NULL-terminated strings. The pool
+ * names will be placed in the supplied buffer one after another.
+ * After the last pool name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the pool name entries we need, we will fill
+ * as much as we can.
+ *
+ * Buf may be null to determine the buffer size needed to list all pools.
+ *
+ * @param cluster cluster handle
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all pools
+ */
+CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len);
+
+/**
+ * List inconsistent placement groups of the given pool
+ *
+ * Gets a list of inconsistent placement groups as NULL-terminated strings.
+ * The placement group names will be placed in the supplied buffer one after
+ * another. After the last name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the placement group entries we need, we will
+ * fill as much as we can.
+ *
+ * @param cluster cluster handle
+ * @param pool pool ID
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all inconsistent placement groups
+ */
+CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool,
+ char *buf, size_t len);
+
+/**
+ * Get a configuration handle for a rados cluster handle
+ *
+ * This handle is valid only as long as the cluster handle is valid.
+ *
+ * @param cluster cluster handle
+ * @returns config handle for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster);
+
+/**
+ * Get a global id for current instance
+ *
+ * This id is a unique representation of current connection to the cluster
+ *
+ * @param cluster cluster handle
+ * @returns instance global id
+ */
+CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster);
+
+/**
+ * Gets the minimum compatible OSD version
+ *
+ * @param cluster cluster handle
+ * @param require_osd_release [out] minimum compatible OSD version
+ * based upon the current features
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster,
+ int8_t* require_osd_release);
+
+/**
+ * Gets the minimum compatible client version
+ *
+ * @param cluster cluster handle
+ * @param min_compat_client [out] minimum compatible client version
+ * based upon the current features
+ * @param require_min_compat_client [out] required minimum client version
+ * based upon explicit setting
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster,
+ int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+/**
+ * Create an io context
+ *
+ * The io context allows you to perform operations within a particular
+ * pool. For more details see rados_ioctx_t.
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name name of the pool
+ * @param ioctx where to store the io context
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name,
+ rados_ioctx_t *ioctx);
+CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id,
+ rados_ioctx_t *ioctx);
+
+/**
+ * The opposite of rados_ioctx_create
+ *
+ * This just tells librados that you no longer need to use the io context.
+ * It may not be freed immediately if there are pending asynchronous
+ * requests on it, but you should not use an io context again after
+ * calling this function on it.
+ *
+ * @warning This does not guarantee any asynchronous
+ * writes have completed. You must call rados_aio_flush()
+ * on the io context before destroying it to do that.
+ *
+ * @warning If this ioctx is used by rados_watch, the caller needs to
+ * be sure that all registered watches are disconnected via
+ * rados_unwatch() and that rados_watch_flush() is called. This
+ * ensures that a racing watch callback does not make use of a
+ * destroyed ioctx.
+ *
+ * @param io the io context to dispose of
+ */
+CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io);
+
+/**
+ * Get configuration handle for a pool handle
+ *
+ * @param io pool handle
+ * @returns rados_config_t for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io);
+
+/**
+ * Get the cluster handle used by this rados_ioctx_t
+ * Note that this is a weak reference, and should not
+ * be destroyed via rados_shutdown().
+ *
+ * @param io the io context
+ * @returns the cluster handle for this io context
+ */
+CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io);
+
+/**
+ * Get pool usage statistics
+ *
+ * Fills in a rados_pool_stat_t after querying the cluster.
+ *
+ * @param io determines which pool to query
+ * @param stats where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io,
+ struct rados_pool_stat_t *stats);
+
+/**
+ * Get the id of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name which pool to look up
+ * @returns id of the pool
+ * @returns -ENOENT if the pool is not found
+ */
+CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster,
+ const char *pool_name);
+
+/**
+ * Get the name of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param id the id of the pool
+ * @param buf where to store the pool name
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id,
+ char *buf, size_t maxlen);
+
+/**
+ * Create a pool with default settings
+ *
+ * The default crush rule is rule 0.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name);
+
+/**
+ * Create a pool owned by a specific auid.
+ *
+ * DEPRECATED: auid support has been removed, and this call will be removed in a future
+ * release.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid)
+ __attribute__((deprecated));
+
+/**
+ * Create a pool with a specific CRUSH rule
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster,
+ const char *pool_name,
+ uint8_t crush_rule_num);
+
+/**
+ * Create a pool with a specific CRUSH rule and auid
+ *
+ * DEPRECATED: auid support has been removed and this call will be removed
+ * in a future release.
+ *
+ * This is a combination of rados_pool_create_with_crush_rule() and
+ * rados_pool_create_with_auid().
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid,
+ uint8_t crush_rule_num)
+ __attribute__((deprecated));
+
+/**
+ * Returns the pool that is the base tier for this pool.
+ *
+ * The return value is the ID of the pool that should be used to read from/write to.
+ * If tiering is not set up for the pool, returns \c pool.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool ID of the pool to query
+ * @param base_tier [out] base tier, or \c pool if tiering is not configured
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool,
+ int64_t* base_tier);
+
+/**
+ * Delete a pool and all data inside it
+ *
+ * The pool is removed from the cluster immediately,
+ * but the actual data is deleted in the background.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool_name which pool to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name);
+
+/**
+ * Attempt to change an io context's associated auid "owner"
+ *
+ * DEPRECATED: auid support has been removed and this call has no effect.
+ *
+ * Requires that you have write permission on both the current and new
+ * auid.
+ *
+ * @param io reference to the pool to change.
+ * @param auid the auid you wish the io to have.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid)
+ __attribute__((deprecated));
+
+
+/**
+ * Get the auid of a pool
+ *
+ * DEPRECATED: auid support has been removed and this call always reports
+ * CEPH_AUTH_UID_DEFAULT (-1).
+ *
+ * @param io pool to query
+ * @param auid where to store the auid
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid)
+ __attribute__((deprecated));
+
+/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Test whether the specified pool requires alignment or not.
+ *
+ * @param io pool to query
+ * @param req [out] set to 1 if alignment is required, 0 if not
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io,
+ int *req);
+
+/* deprecated, use rados_ioctx_pool_required_alignment2 instead */
+CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Get the alignment flavor of a pool
+ *
+ * @param io pool to query
+ * @param alignment where to store the alignment flavor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io,
+ uint64_t *alignment);
+
+/**
+ * Get the pool id of the io context
+ *
+ * @param io the io context to query
+ * @returns the id of the pool the io context uses
+ */
+CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io);
+
+/**
+ * Get the pool name of the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} pools */
+
+/**
+ * @name Object Locators
+ *
+ * @{
+ */
+
+/**
+ * Set the key for mapping objects to pgs within an io context.
+ *
+ * The key is used instead of the object name to determine which
+ * placement groups an object is put in. This affects all subsequent
+ * operations of the io context - until a different locator key is
+ * set, all objects in this io context will be placed in the same pg.
+ *
+ * @param io the io context to change
+ * @param key the key to use as the object locator, or NULL to discard
+ * any previously set key
+ */
+CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io,
+ const char *key);
+
+/**
+ * Set the namespace for objects within an io context
+ *
+ * The namespace specification further refines a pool into different
+ * domains. The mapping of objects to pgs is also based on this
+ * value.
+ *
+ * @param io the io context to change
+ * @param nspace the name to use as the namespace, or NULL to use the
+ * default namespace
+ */
+CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io,
+ const char *nspace);
+
+/**
+ * Get the namespace for objects within the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} obj_loc */
+
+/**
+ * @name Listing Objects
+ * @{
+ */
+/**
+ * Start listing objects in a pool
+ *
+ * @param io the pool to list from
+ * @param ctx the handle to store list context in
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io,
+ rados_list_ctx_t *ctx);
+
+/**
+ * Return hash position of iterator, rounded to the current PG
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @returns current hash position, rounded to the current pg
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx);
+
+/**
+ * Reposition object iterator to a different hash position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param pos hash position to move to
+ * @returns actual (rounded) position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek(rados_list_ctx_t ctx,
+ uint32_t pos);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor position to move to
+ * @returns rounded position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor cursor);
+
+/**
+ * Get a cursor for the current position of the object iterator
+ *
+ * The returned handle must be released with rados_object_list_cursor_free().
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor where to store cursor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor *cursor);
+
+/**
+ * Get the next object name and locator in the pool
+ *
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace);
+
+/**
+ * Get the next object name, locator and their sizes in the pool
+ *
+ * The sizes allow listing objects with \0 (the NUL character)
+ * in e.g. *entry. It is unusual to see such object names, but a bug
+ * in a client has raised the need to handle them as well.
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @param entry_size where to store the size of name of the entry
+ * @param key_size where to store the size of object locator (set to NULL to ignore)
+ * @param nspace_size where to store the size of object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace,
+ size_t *entry_size,
+ size_t *key_size,
+ size_t *nspace_size);
+
+/**
+ * Close the object listing handle.
+ *
+ * This should be called when the handle is no longer needed.
+ * The handle should not be used after it has been closed.
+ *
+ * @param ctx the handle to close
+ */
+CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx);
+
+/**
+ * Get cursor handle pointing to the *beginning* of a pool.
+ *
+ * This is an opaque handle pointing to the start of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin(
+ rados_ioctx_t io);
+
+/**
+ * Get cursor handle pointing to the *end* of a pool.
+ *
+ * This is an opaque handle pointing to the end of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io);
+
+/**
+ * Check if a cursor has reached the end of a pool
+ *
+ * @param io ioctx
+ * @param cur cursor
+ * @returns 1 if the cursor has reached the end of the pool, 0 otherwise
+ */
+CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Release a cursor
+ *
+ * Release a cursor. The handle may not be used after this point.
+ *
+ * @param io ioctx
+ * @param cur cursor
+ */
+CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Compare two cursor positions
+ *
+ * Compare two cursors, and indicate whether the first cursor precedes,
+ * matches, or follows the second.
+ *
+ * @param io ioctx
+ * @param lhs first cursor
+ * @param rhs second cursor
+ * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs
+ */
+CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io,
+ rados_object_list_cursor lhs, rados_object_list_cursor rhs);
+
+/**
+ * @return the number of items set in the results array
+ */
+CEPH_RADOS_API int rados_object_list(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t result_size,
+ const char *filter_buf,
+ const size_t filter_buf_len,
+ rados_object_list_item *results,
+ rados_object_list_cursor *next);
+
+CEPH_RADOS_API void rados_object_list_free(
+ const size_t result_size,
+ rados_object_list_item *results);
+
+/**
+ * Obtain cursors delineating a subset of a range. Use this
+ * when you want to split up the work of iterating over the
+ * global namespace. Expected use case is when you are iterating
+ * in parallel, with `m` workers, and each worker taking an id `n`.
+ *
+ * @param io ioctx
+ * @param start start of the range to be sliced up (inclusive)
+ * @param finish end of the range to be sliced up (exclusive)
+ * @param n which of the m chunks you would like to get cursors for
+ * @param m how many chunks to divide start-finish into
+ * @param split_start cursor populated with start of the subrange (inclusive)
+ * @param split_finish cursor populated with end of the subrange (exclusive)
+ */
+CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t n,
+ const size_t m,
+ rados_object_list_cursor *split_start,
+ rados_object_list_cursor *split_finish);
+
+
+/** @} Listing Objects */
+
+/**
+ * @name Snapshots
+ *
+ * RADOS snapshots are based upon sequence numbers that form a
+ * snapshot context. They are pool-specific. The snapshot context
+ * consists of the current snapshot sequence number for a pool, and an
+ * array of sequence numbers at which snapshots were taken, in
+ * descending order. Whenever a snapshot is created or deleted, the
+ * snapshot sequence number for the pool is increased. To add a new
+ * snapshot, the new snapshot sequence number must be increased and
+ * added to the snapshot context.
+ *
+ * There are two ways to manage these snapshot contexts:
+ * -# within the RADOS cluster
+ * These are called pool snapshots, and store the snapshot context
+ * in the OSDMap. These represent a snapshot of all the objects in
+ * a pool.
+ * -# within the RADOS clients
+ * These are called self-managed snapshots, and push the
+ * responsibility for keeping track of the snapshot context to the
+ * clients. For every write, the client must send the snapshot
+ * context. In librados, this is accomplished with
+ * rados_selfmanaged_snap_set_write_ctx(). These are more
+ * difficult to manage, but are restricted to specific objects
+ * instead of applying to an entire pool.
+ *
+ * @{
+ */
+
+/**
+ * Create a pool-wide snapshot
+ *
+ * @param io the pool to snapshot
+ * @param snapname the name of the snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Delete a pool snapshot
+ *
+ * @param io the pool to delete the snapshot from
+ * @param snapname which snapshot to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Rollback an object to a pool snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapname which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname);
+
+/**
+ * @warning Deprecated: Use rados_ioctx_snap_rollback() instead
+ */
+CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname)
+ __attribute__((deprecated));
+
+/**
+ * Set the snapshot from which reads are performed.
+ *
+ * Subsequent reads will return data as it was at the time of that
+ * snapshot.
+ *
+ * @param io the io context to change
+ * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no
+ * snapshot (i.e. normal operation)
+ */
+CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io,
+ rados_snap_t snap);
+
+/**
+ * Allocate an ID for a self-managed snapshot
+ *
+ * Get a unique ID to put in the snapshot context to create a
+ * snapshot. A clone of an object is not created until a write with
+ * the new snapshot context is completed.
+ *
+ * @param io the pool in which the snapshot will exist
+ * @param snapid where to store the newly allocated snapshot ID
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid,
+ rados_completion_t completion);
+
+/**
+ * Remove a self-managed snapshot
+ *
+ * This increases the snapshot sequence number, which will cause
+ * snapshots to be removed lazily.
+ *
+ * @param io the pool in which the snapshot exists
+ * @param snapid the ID of the snapshot to remove
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid,
+ rados_completion_t completion);
+
+/**
+ * Rollback an object to a self-managed snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapid which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io,
+ const char *oid,
+ rados_snap_t snapid);
+
+/**
+ * Set the snapshot context for use when writing to objects
+ *
+ * This is stored in the io context, and applies to all future writes.
+ *
+ * @param io the io context to change
+ * @param seq the newest snapshot sequence number for the pool
+ * @param snaps array of snapshots sorted by descending id
+ * @param num_snaps how many snapshots are in the snaps array
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snaps are not in descending order
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+ rados_snap_t seq,
+ rados_snap_t *snaps,
+ int num_snaps);
+
+/**
+ * List all the ids of pool snapshots
+ *
+ * If the output array does not have enough space to fit all the
+ * snapshots, -ERANGE is returned and the caller should retry with a
+ * larger array.
+ *
+ * @param io the pool to read from
+ * @param snaps where to store the results
+ * @param maxlen the number of rados_snap_t that fit in the snaps array
+ * @returns number of snapshots on success, negative error code on failure
+ * @returns -ERANGE is returned if the snaps array is too short
+ */
+CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps,
+ int maxlen);
+
+/**
+ * Get the id of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param name the snapshot to find
+ * @param id where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name,
+ rados_snap_t *id);
+
+/**
+ * Get the name of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param id the snapshot to find
+ * @param name where to store the result
+ * @param maxlen the size of the name array
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the name array is too small
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id,
+ char *name, int maxlen);
+
+/**
+ * Find when a pool snapshot occurred
+ *
+ * @param io the pool the snapshot was taken in
+ * @param id the snapshot to lookup
+ * @param t where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id,
+ time_t *t);
+
+/** @} Snapshots */
+
+/**
+ * @name Synchronous I/O
+ * Writes are replicated to a number of OSDs based on the
+ * configuration of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_aio_wait_for_complete(). For greater data safety, use the
+ * asynchronous functions and rados_aio_wait_for_safe().
+ *
+ * @{
+ */
+
+/**
+ * Return the version of the last object read or written to.
+ *
+ * This exposes the internal version number of the last object read or
+ * written via this io context
+ *
+ * @param io the io context to check
+ * @returns last read or written object version
+ */
+CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object, starting at
+ * offset *off*. The value of *len* must be <= UINT_MAX/2.
+ *
+ * @note This will never return a positive value not equal to len.
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Write the same *data_len* bytes from *buf* multiple times into the
+ * *oid* object. *write_len* bytes are written in total, which must be
+ * a multiple of *data_len*. The value of *write_len* and *data_len*
+ * must be <= UINT_MAX/2.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Append *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf,
+ size_t len, uint64_t off);
+
+/**
+ * Compute checksum from object data
+ *
+ * The io context determines the snapshot to checksum, if any was set
+ * by rados_ioctx_snap_set_read(). The length of the init_value and
+ * resulting checksum are dependent upon the checksum type:
+ *
+ * XXHASH64: le64
+ * XXHASH32: le32
+ * CRC32C: le32
+ *
+ * The checksum result is encoded in the following manner:
+ *
+ * le32 num_checksum_chunks
+ * {
+ * leXX checksum for chunk (where XX = appropriate size for the checksum type)
+ * } * num_checksum_chunks
+ *
+ * @param io the context in which to perform the checksum
+ * @param oid the name of the object to checksum
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param len the number of bytes to checksum
+ * @param off the offset to start checksumming in the object
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result
+ * @param checksum_len the number of bytes available for the result
+ * @return negative error code on failure
+ */
+CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid,
+ rados_checksum_type_t type,
+ const char *init_value, size_t init_value_len,
+ size_t len, uint64_t off, size_t chunk_size,
+ char *pchecksum, size_t checksum_len);
+
+/**
+ * Delete an object
+ *
+ * @note This does not delete any snapshots of the object.
+ *
+ * @param io the pool to delete the object from
+ * @param oid the name of the object to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @param io the context in which to truncate
+ * @param oid the name of the object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid,
+ uint64_t size);
+
+/**
+ * Compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o name of the object
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o,
+ const char *cmp_buf, size_t cmp_len,
+ uint64_t off);
+
+/**
+ * @name Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name, const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Get the next omap key/value pair on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key is
+ * null-terminated, and val has length len. If the end of the list has
+ * been reached, key and val are NULL, and len is 0. key and val will
+ * not be accessible after rados_omap_get_end() is called on iter, so
+ * if they are needed after that they should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *len);
+
+/**
+ * Get the next omap key/value pair on the object. Note that it's
+ * perfectly safe to mix calls to rados_omap_get_next and
+ * rados_omap_get_next2.
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key has length
+ * key_len and val has length val_len. If the end of the list has
+ * been reached, key and val are NULL, and key_len and val_len are 0.
+ * key and val will not be accessible after rados_omap_get_end()
+ * is called on iter, so if they are needed after that they
+ * should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param key_len where to store the number of bytes in key
+ * @param val_len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *key_len,
+ size_t *val_len);
+
+/**
+ * Return number of elements in the iterator
+ *
+ * @param iter the iterator of which to return the size
+ * @returns the number of elements in the iterator
+ */
+CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter);
+
+/**
+ * Close the omap iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter);
+
+/**
+ * Get object stats (size/mtime)
+ *
+ * TODO: when are these set, and by whom? can they be out of date?
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize,
+ time_t *pmtime);
+/**
+ * Execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories.
+ *
+ * @param io the context in which to call the method
+ * @param oid the object to call the method on
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns for methods that return data: the length of the output, or
+ * -ERANGE if buf does not have enough space to store it. For
+ * methods that don't return data, the return value is
+ * method-specific.
+ */
+CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len, char *buf,
+ size_t out_len);
+
+
+/** @} Synchronous I/O */
+
+/**
+ * @name Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_callback_t
+ * Callbacks for asynchronous operations take two parameters:
+ * - cb the completion that has finished
+ * - arg application defined data made available to the callback function
+ */
+typedef void (*rados_callback_t)(rados_completion_t cb, void *arg);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * TODO: more complete documentation of this elsewhere (in the RADOS docs?)
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_completion_t *pc);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete callback corresponds to operation being acked.
+ *
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is committed
+ * on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion2(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_completion_t *pc);
+
+/**
+ * Block until an operation completes
+ *
+ * This means it is in memory on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c);
+
+/**
+ * Block until an operation is safe
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c)
+ __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c);
+
+/**
+ * Block until an operation completes and callback completes
+ *
+ * This means it is in memory on all replicas and can be read.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c);
+
+/**
+ * Block until an operation is safe and callback has completed
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c)
+ __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation and callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe and has the callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c);
+
+/**
+ * Get the return value of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns return value of the operation
+ */
+CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c);
+
+/**
+ * Get the internal object version of the target of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns version number of the asynchronous operation's target
+ */
+CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c);
+
+/**
+ * Release a completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c completion to release
+ */
+CEPH_RADOS_API void rados_aio_release(rados_completion_t c);
+
+/**
+ * Write data to an object asynchronously
+ *
+ * Queues the write and returns. The return value of the completion
+ * will be 0 on success, negative error code on failure.
+ *
+ * @param io the context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Asynchronously append data to an object
+ *
+ * Queues the append and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the append is safe and complete
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write an entire object
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ * Queues the write_full and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write_full is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write the same buffer multiple times
+ *
+ * Queues the writesame and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the writesame is safe and complete
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @note only the 'complete' callback of the completion will be called.
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param completion what to do when the read is complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ char *buf, size_t len, uint64_t off);
+
+/**
+ * Block until all pending writes in an io context are safe
+ *
+ * This is not equivalent to calling rados_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @note BUG: always returns 0, should be void or accept a timeout
+ *
+ * @param io the context to flush
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io);
+
+
+/**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * rados_aio_flush().
+ *
+ * @param io the context to flush
+ * @param completion what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io,
+ rados_completion_t completion);
+
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param completion what to do when the stat is complete
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ uint64_t *psize, time_t *pmtime);
+
+/**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o the name of the object to compare with
+ * @param completion what to do when the comparison is complete
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off);
+
+/**
+ * Cancel async operation
+ *
+ * @param io ioctx
+ * @param completion completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param o name of the object
+ * @param completion what to do when the exec completes
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len,
+ char *buf, size_t out_len);
+
+/** @} Asynchronous I/O */
+
+/**
+ * @name Asynchronous Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Asynchronously get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param completion what to do when the getxattr completes
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Asynchronously set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param completion what to do when the setxattr completes
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param completion what to do when the rmxattr completes
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name);
+
+/**
+ * Asynchronously start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param completion what to do when the getxattrs completes
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ rados_xattrs_iter_t *iter);
+
+/** @} Asynchronous Xattrs */
+
+/**
+ * @name Watch/Notify
+ *
+ * Watch/notify is a protocol to help communicate among clients. It
+ * can be used to synchronize client state. All that's needed is a
+ * well-known object name (for example, rbd uses the header object of
+ * an image).
+ *
+ * Watchers register an interest in an object, and receive all
+ * notifies on that object. A notify attempts to communicate with all
+ * clients watching an object, and blocks on the notifier until each
+ * client responds or a timeout is reached.
+ *
+ * See rados_watch() and rados_notify() for more details.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_watchcb_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param opcode undefined
+ * @param ver version of the watched object
+ * @param arg application-specific data
+ *
+ * @note BUG: opcode is an internal detail that shouldn't be exposed
+ * @note BUG: ver is unused
+ */
+typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg);
+
+/**
+ * @typedef rados_watchcb2_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param arg opaque user-defined value provided to rados_watch2()
+ * @param notify_id an id for this notify event
+ * @param handle the watcher handle we are notifying
+ * @param notifier_id the unique client id for the notifier
+ * @param data payload from the notifier
+ * @param data_len length of payload buffer
+ */
+typedef void (*rados_watchcb2_t)(void *arg,
+ uint64_t notify_id,
+ uint64_t handle,
+ uint64_t notifier_id,
+ void *data,
+ size_t data_len);
+
+/**
+ * @typedef rados_watcherrcb_t
+ *
+ * Callback activated when we encounter an error with the watch session.
+ * This can happen when the location of the objects moves within the
+ * cluster and we fail to register our watch with the new object location,
+ * or when our connection with the object OSD is otherwise interrupted and
+ * we may have missed notify events.
+ *
+ * @param pre opaque user-defined value provided to rados_watch2()
+ * @param cookie id of the affected watch (presumably the cookie stored by
+ * rados_watch2(); TODO confirm against the implementation)
+ * @param err error code
+ */
+ typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @note BUG: librados should provide a way for watchers to notice connection resets
+ * @note BUG: the ver parameter does not work, and -ERANGE will never be returned
+ * (See URL tracker.ceph.com/issues/2592)
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param ver expected version of the object
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param arg application defined data to pass when watchcb is called
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the version of the object is greater than ver
+ */
+CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver,
+ uint64_t *cookie,
+ rados_watchcb_t watchcb, void *arg)
+ __attribute__((deprecated));
+
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to the
+ * primary OSD for a watched object, the watch will be removed after
+ * a timeout configured with osd_client_watch_timeout.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the watch will be kept after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after the number of seconds configured in the timeout parameter.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the watch will be kept after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Check on the status of a watch
+ *
+ * Return the number of milliseconds since the watch was last confirmed.
+ * Or, if there has been an error, return that.
+ *
+ * If there is an error, the watch is no longer valid, and should be
+ * destroyed with rados_unwatch2(). If the user is still interested
+ * in the object, a new watch should be created with rados_watch2().
+ *
+ * @param io the pool the object is in
+ * @param cookie the watch handle
+ * @returns ms since last confirmed on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the watched object (ignored)
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie)
+ __attribute__((deprecated));
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Asynchronously unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie,
+ rados_completion_t completion);
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * @note BUG: the timeout is not changeable via the C API
+ * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param ver obsolete - just pass zero
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
+ const char *buf, int buf_len)
+ __attribute__((deprecated));
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * The reply buffer is optional. If specified, the client will get
+ * back an encoded buffer that includes the ids of the clients that
+ * acknowledged the notify as well as their notify ack payloads (if
+ * any). Clients that timed out are not included. Even clients that
+ * do not include a notify ack payload are included in the list but
+ * have a 0-length payload associated with them. The format:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ * Note: There may be multiple instances of the same gid if there are
+ * multiple watchers registered via the same client.
+ *
+ * Note: The buffer must be released with rados_buffer_free() when the
+ * user is done with it.
+ *
+ * Note: Since the result buffer includes clients that time out, it
+ * will be set even when rados_notify2() returns an error code (like
+ * -ETIMEDOUT).
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param o the name of the object
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @param timeout_ms notify timeout (in ms)
+ * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free)
+ * @param reply_buffer_len pointer to size of reply buffer
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms, char **reply_buffer,
+ size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms,
+ char **reply_buffer, size_t *reply_buffer_len);
+
+/**
+ * Decode a notify response
+ *
+ * Decode a notify response (from rados_aio_notify() call) into acks and
+ * timeout arrays.
+ *
+ * @param reply_buffer buffer from rados_aio_notify() call
+ * @param reply_buffer_len reply_buffer length
+ * @param acks pointer to struct notify_ack_t pointer
+ * @param nr_acks pointer to ack count
+ * @param timeouts pointer to notify_timeout_t pointer
+ * @param nr_timeouts pointer to timeout count
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_decode_notify_response(char *reply_buffer, size_t reply_buffer_len,
+ struct notify_ack_t **acks, size_t *nr_acks,
+ struct notify_timeout_t **timeouts, size_t *nr_timeouts);
+
+/**
+ * Free notify allocated buffer
+ *
+ * Release memory allocated by rados_decode_notify_response() call
+ *
+ * @param acks notify_ack_t struct (from rados_decode_notify_response())
+ * @param nr_acks ack count
+ * @param timeouts notify_timeout_t struct (from rados_decode_notify_response())
+ */
+CEPH_RADOS_API void rados_free_notify_response(struct notify_ack_t *acks, size_t nr_acks,
+ struct notify_timeout_t *timeouts);
+
+/**
+ * Acknowledge receipt of a notify
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param notify_id the notify_id we got on the watchcb2_t callback
+ * @param cookie the watcher handle
+ * @param buf payload to return to notifier (optional)
+ * @param buf_len payload length
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o,
+ uint64_t notify_id, uint64_t cookie,
+ const char *buf, int buf_len);
+
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call will block until all pending watch/notify callbacks have
+ * been executed and the queue is empty. It should usually be called
+ * after shutting down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ */
+CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call is non-blocking; the completion will be called once
+ * all pending watch/notify callbacks have been executed and
+ * the queue is empty. It should usually be called after shutting
+ * down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ * @param completion what to do when operation has been attempted
+ */
+CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion);
+
+/** @} Watch/Notify */
+
+/**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o);
+
+/**
+ * Unpin an object in the cache tier
+ *
+ * After an object is unpinned in the cache tier, it can be flushed out
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o);
+
+/**
+ * @name Hints
+ *
+ * @{
+ */
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/** @} Hints */
+
+/**
+ * @name Object Operations
+ *
+ * A single rados operation can do multiple operations on one object
+ * atomically. The whole operation will succeed or fail, and no partial
+ * results will be visible.
+ *
+ * Operations may be either reads, which can return data, or writes,
+ * which cannot. The effects of writes are applied and visible all at
+ * once, so an operation that sets an xattr and then checks its value
+ * will not see the updated value.
+ *
+ * @{
+ */
+
+/**
+ * Create a new rados_write_op_t write operation. This will store all actions
+ * to be performed atomically. You must call rados_release_write_op when you are
+ * finished with it.
+ *
+ * @note the ownership of a write operation is passed to the function
+ * performing the operation, so the same instance of @c rados_write_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_write_op_t rados_create_write_op(void);
+
+/**
+ * Free a rados_write_op_t, must be called when you're done with it.
+ * @param write_op operation to deallocate, created with rados_create_write_op
+ */
+CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op);
+
+/**
+ * Set flags for the last operation added to this write_op.
+ * At least one op must have been added to the write_op.
+ * @param write_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op,
+ int flags);
+
+/**
+ * Ensure that the object exists before writing
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before writing. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_write_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_write_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param write_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param write_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that given xattr satisfies comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Set an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr
+ * @param value buffer to set xattr to
+ * @param value_len length of buffer to set xattr to
+ */
+CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op,
+ const char *name,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Remove an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to remove
+ */
+CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op,
+ const char *name);
+
+/**
+ * Create the object
+ * @param write_op operation to add this action to
+ * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE or
+ LIBRADOS_CREATE_IDEMPOTENT;
+ * an exclusive create will error if the object already exists.
+ * @param category category string (DEPRECATED, HAS NO EFFECT)
+ */
+CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op,
+ int exclusive,
+ const char* category);
+
+/**
+ * Write to offset
+ * @param write_op operation to add this action to
+ * @param offset offset to write to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len,
+ uint64_t offset);
+
+/**
+ * Write whole object, atomically replacing it.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+
+/**
+ * Write the same buffer multiple times
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param data_len length of buffer
+ * @param write_len total number of bytes to write, as a multiple of @c data_len
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op,
+ const char *buffer,
+ size_t data_len,
+ size_t write_len,
+ uint64_t offset);
+
+/**
+ * Append to end of object.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+/**
+ * Remove object
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op);
+
+/**
+ * Truncate an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to truncate to
+ */
+CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op,
+ uint64_t offset);
+
+/**
+ * Zero part of an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to zero
+ * @param len length to zero
+ */
+CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op,
+ uint64_t offset,
+ uint64_t len);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * @param write_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ int *prval);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *lens,
+ size_t num);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param key_lens array of lengths corresponding to each key
+ * @param val_lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *key_lens,
+ const size_t *val_lens,
+ size_t num);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to remove
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op,
+ char const* const* keys,
+ size_t keys_len);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of char arrays representing keys to remove
+ * @param key_lens array of size_t values representing length of each key
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op,
+ char const* const* keys,
+ const size_t* key_lens,
+ size_t keys_len);
+
+
+/**
+ * Remove key/value pairs from an object whose keys are in the range
+ * [key_begin, key_end)
+ *
+ * @param write_op operation to add this action to
+ * @param key_begin the lower bound of the key range to remove
+ * @param key_begin_len length of key_begin
+ * @param key_end the upper bound of the key range to remove
+ * @param key_end_len length of key_end
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_range2(rados_write_op_t write_op,
+ const char *key_begin,
+ size_t key_begin_len,
+ const char *key_end,
+ size_t key_end_len);
+
+/**
+ * Remove all key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to (as a struct timespec), NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+
+CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ struct timespec *mtime,
+ int flags);
+
+/**
+ * Perform a write operation asynchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+
+/**
+ * Create a new rados_read_op_t read operation. This will store all
+ * actions to be performed atomically. You must call
+ * rados_release_read_op when you are finished with it (after it
+ * completes, or you decide not to send it in the first place).
+ *
+ * @note the ownership of a read operation is passed to the function
+ * performing the operation, so the same instance of @c rados_read_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_read_op_t rados_create_read_op(void);
+
+/**
+ * Free a rados_read_op_t, must be called when you're done with it.
+ * @param read_op operation to deallocate, created with rados_create_read_op
+ */
+CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op);
+
+/**
+ * Set flags for the last operation added to this read_op.
+ * At least one op must have been added to the read_op.
+ * @param read_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags);
+
+/**
+ * Ensure that the object exists before reading
+ * @param read_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before reading. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_read_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_read_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param read_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that an xattr satisfies a comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param read_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @param read_op operation to add this action to
+ * @param iter where to store the iterator
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op,
+ rados_xattrs_iter_t *iter,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Get object size and mtime
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
+ uint64_t *psize,
+ time_t *pmtime,
+ int *prval);
+
+/**
+ * Read bytes from offset into buffer.
+ *
+ * bytes_read will be filled with the number of bytes read if successful.
+ * A short read can only occur if the read reaches the end of the
+ * object.
+ *
+ * @param read_op operation to add this action to
+ * @param offset offset to read from
+ * @param len length of buffer
+ * @param buffer where to put the data
+ * @param bytes_read where to store the number of bytes read by this action
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op,
+ uint64_t offset,
+ size_t len,
+ char *buffer,
+ size_t *bytes_read,
+ int *prval);
+
+/**
+ * Compute checksum from object data
+ *
+ * @param read_op operation to add this action to
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param offset the offset to start checksumming in the object
+ * @param len the number of bytes to checksum
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result for this action
+ * @param checksum_len the number of bytes available for the result
+ * @param prval where to store the return value for this action
+ */
+CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op,
+ rados_checksum_type_t type,
+ const char *init_value,
+ size_t init_value_len,
+ uint64_t offset, size_t len,
+ size_t chunk_size, char *pchecksum,
+ size_t checksum_len, int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * The output buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf where to put librados-allocated output buffer
+ * @param out_len where to store the length of out_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char **out_buf,
+ size_t *out_len,
+ int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * If the output buffer is too small, prval will
+ * be set to -ERANGE and used_len will be 0.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf user-provided buffer to read into
+ * @param out_len length of out_buf in bytes
+ * @param used_len where to store the number of bytes read into out_buf
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char *out_buf,
+ size_t out_len,
+ size_t *used_len,
+ int *prval);
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to null-terminated keys to get
+ * @param keys_len the number of strings in keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t keys_len,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to keys to get
+ * @param num_keys the number of strings in keys
+ * @param key_lens array of size_t's describing each key len (in bytes)
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t num_keys,
+ const size_t* key_lens,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Perform a read operation synchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ const char *oid,
+ int flags);
+
+/**
+ * Perform a read operation asynchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ int flags);
+
+/** @} Object Operations */
+
+/**
+ * Take an exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid,
+ const char * name, const char * cookie,
+ const char * desc,
+ struct timeval * duration,
+ uint8_t flags);
+
+/**
+ * Take a shared lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag The tag of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o,
+ const char * name, const char * cookie,
+ const char * tag, const char * desc,
+ struct timeval * duration, uint8_t flags);
+
+/**
+ * Release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie);
+
+/**
+ * Asynchronously release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie,
+ rados_completion_t completion);
+
+/**
+ * List clients that have locked the named object lock and information about
+ * the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the object lock
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o,
+ const char *name, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len);
+
+/**
+ * Releases a shared or exclusive lock on an object, which was taken by the
+ * specified client.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param client the client currently holding the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ * @returns -EINVAL if the client cannot be parsed
+ */
+CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o,
+ const char *name, const char *client,
+ const char *cookie);
+
+/**
+ * Blocklists the specified client from the OSDs
+ *
+ * @param cluster cluster handle
+ * @param client_address client address
+ * @param expire_seconds number of seconds to blocklist (0 for default)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_blocklist_add(rados_t cluster,
+ char *client_address,
+ uint32_t expire_seconds);
+CEPH_RADOS_API int rados_blacklist_add(rados_t cluster,
+ char *client_address,
+ uint32_t expire_seconds)
+ __attribute__((deprecated));
+
+/**
+ * Gets addresses of the RADOS session, suitable for blocklisting.
+ *
+ * @param cluster cluster handle
+ * @param addrs the output string.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs);
+
+/**
+ * Set or clear the "full try" setting on an ioctx.
+ *
+ * NOTE(review): presumably the per-ioctx counterpart of
+ * LIBRADOS_OPERATION_FULL_TRY (send requests even though the cluster or
+ * pool is marked full) -- confirm against the implementation.
+ *
+ * The osdmap variants are marked deprecated; the pool variants are not.
+ *
+ * @param io the ioctx to change
+ */
+CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+CEPH_RADOS_API void rados_set_pool_full_try(rados_ioctx_t io);
+
+CEPH_RADOS_API void rados_unset_pool_full_try(rados_ioctx_t io);
+
+/**
+ * Enable an application on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param force 0 if only single application per pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io,
+ const char *app_name, int force);
+
+/**
+ * List all enabled applications
+ *
+ * If the provided buffer is too short, the required length is filled in and
+ * -ERANGE is returned. Otherwise, the buffer is filled with the application
+ * names, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param values buffer in which to store application names
+ * @param values_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short
+ */
+CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values,
+ size_t *values_len);
+
+/**
+ * Get application metadata value from pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value result buffer
+ * @param value_len maximum len of value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io,
+ const char *app_name,
+ const char *key, char *value,
+ size_t *value_len);
+
+/**
+ * Set application metadata on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value metadata value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io,
+ const char *app_name,
+ const char *key,
+ const char *value);
+
+/**
+ * Remove application metadata from a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io,
+ const char *app_name,
+ const char *key);
+
+/**
+ * List all metadata key/value pairs associated with an application.
+ *
+ * This iterates over all metadata; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are filled
+ * in and -ERANGE is returned. Otherwise, the buffers are filled with
+ * the keys and values of the metadata, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param keys buffer in which to store key names
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store values
+ * @param vals_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io,
+ const char *app_name,
+ char *keys, size_t *key_len,
+ char *values,
+ size_t *vals_len);
+
+/**
+ * @name Mon/OSD/PG Commands
+ *
+ * These interfaces send commands relating to the monitor, OSD, or PGs.
+ *
+ * @{
+ */
+
+/**
+ * Send monitor command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send ceph-mgr command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send ceph-mgr tell command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name mgr name to target
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command_target(
+ rados_t cluster,
+ const char *name,
+ const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send monitor command to a specific monitor.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name target monitor's name
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * free a rados-allocated buffer
+ *
+ * Release memory allocated by librados calls like rados_mon_command().
+ *
+ * @param buf buffer pointer
+ */
+CEPH_RADOS_API void rados_buffer_free(char *buf);
+
+/**
+ * Send a command to a specific OSD.
+ *
+ * NOTE(review): the command, input and output parameters mirror those of
+ * rados_mon_command(); presumably the same JSON command format, return
+ * convention (0 on success, negative error code on failure) and
+ * rados_buffer_free() ownership rules apply -- confirm.
+ *
+ * @param cluster cluster handle
+ * @param osdid id of the OSD to target
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ */
+CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * Send a command to a specific placement group.
+ *
+ * NOTE(review): the command, input and output parameters mirror those of
+ * rados_mon_command(); presumably the same JSON command format, return
+ * convention (0 on success, negative error code on failure) and
+ * rados_buffer_free() ownership rules apply -- confirm.
+ *
+ * @param cluster cluster handle
+ * @param pgstr the target placement group, given as a string
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ */
+CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/*
+ * Repeated declaration of rados_mgr_command(); the prototype is identical to
+ * the documented declaration earlier in this header ("Send ceph-mgr
+ * command").
+ */
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback_t)(void *arg,
+ const char *line,
+ const char *who,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback2_t)(void *arg,
+ const char *line,
+ const char *channel,
+ const char *who,
+ const char *name,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+/**
+ * Register a callback for cluster log messages (see the "Monitor cluster
+ * log" comment blocks above the callback typedefs for details).
+ *
+ * rados_monitor_log() takes a rados_log_callback_t; rados_monitor_log2()
+ * takes a rados_log_callback2_t, whose signature additionally carries the
+ * channel and name fields.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message, or NULL to deregister
+ * any previously registered callback
+ * @param arg void argument to pass to cb
+ * @returns 0 on success, negative code on error
+ */
+CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level,
+ rados_log_callback_t cb, void *arg);
+CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level,
+ rados_log_callback2_t cb, void *arg);
+
+
+/**
+ * register daemon instance for a service
+ *
+ * Register us as a daemon providing a particular service. We identify
+ * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname').
+ * The metadata is a map of keys and values with arbitrary static metadata
+ * for this instance. The encoding is a series of NUL-terminated strings,
+ * alternating key names and values, terminating with an empty key name.
+ * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}.
+ *
+ * For the lifetime of the librados instance, regular beacons will be sent
+ * to the cluster to maintain our registration in the service map.
+ *
+ * @param cluster handle
+ * @param service service name
+ * @param daemon daemon instance name
+ * @param metadata_dict static daemon metadata dict
+ */
+CEPH_RADOS_API int rados_service_register(
+ rados_t cluster,
+ const char *service,
+ const char *daemon,
+ const char *metadata_dict);
+
+/**
+ * update daemon status
+ *
+ * Update our mutable status information in the service map.
+ *
+ * The status dict is encoded the same way the daemon metadata is encoded
+ * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is
+ * {foo=bar,this=that}.
+ *
+ * @param cluster rados cluster handle
+ * @param status_dict status dict
+ */
+CEPH_RADOS_API int rados_service_update_status(
+ rados_t cluster,
+ const char *status_dict);
+
+/** @} Mon/OSD/PG commands */
+
+/*
+ * These methods are no longer supported and return -ENOTSUP where possible.
+ */
+CEPH_RADOS_API int rados_objects_list_open(
+ rados_ioctx_t io,
+ rados_list_ctx_t *ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_seek(
+ rados_list_ctx_t ctx,
+ uint32_t pos) __attribute__((deprecated));
+CEPH_RADOS_API int rados_objects_list_next(
+ rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key) __attribute__((deprecated));
+CEPH_RADOS_API void rados_objects_list_close(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
new file mode 100644
index 000000000..76fe6dfd0
--- /dev/null
+++ b/src/include/rados/librados.hpp
@@ -0,0 +1,1556 @@
+#ifndef __LIBRADOS_HPP
+#define __LIBRADOS_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <utility>
+#include "buffer.h"
+
+#include "librados.h"
+#include "librados_fwd.hpp"
+#include "rados_types.hpp"
+
+namespace libradosstriper
+{
+ class RadosStriper;
+}
+
+namespace neorados { class RADOS; }
+
+namespace librados {
+
+using ceph::bufferlist;
+
+struct AioCompletionImpl;
+struct IoCtxImpl;
+struct ListObjectImpl;
+class NObjectIteratorImpl;
+struct ObjListCtx;
+class ObjectOperationImpl;
+struct PlacementGroupImpl;
+struct PoolAsyncCompletionImpl;
+
+typedef struct rados_cluster_stat_t cluster_stat_t;
+typedef struct rados_pool_stat_t pool_stat_t;
+
+typedef void *list_ctx_t;
+typedef uint64_t auid_t;
+typedef void *config_t;
+
+/// Describes one locker of an object lock by its client, cookie and address
+/// strings (NOTE(review): presumably the C++ counterpart of the values
+/// returned by rados_list_lockers() -- confirm).
+typedef struct {
+ std::string client;
+ std::string cookie;
+ std::string address;
+} locker_t;
+
+typedef std::map<std::string, pool_stat_t> stats_map;
+
+typedef void *completion_t;
+typedef void (*callback_t)(completion_t cb, void *arg);
+
+inline namespace v14_2_0 {
+
+ class IoCtx;
+ class RadosClient;
+
+ /**
+ * Value type yielded by NObjectIterator.
+ *
+ * Identifies an object through three read-only accessors (namespace, oid
+ * and locator). The state is held behind an opaque ListObjectImpl pointer.
+ */
+ class CEPH_RADOS_API ListObject
+ {
+ public:
+ const std::string& get_nspace() const;
+ const std::string& get_oid() const;
+ const std::string& get_locator() const;
+
+ ListObject();
+ ~ListObject();
+ ListObject( const ListObject&);
+ ListObject& operator=(const ListObject& rhs);
+ private:
+ // Construction from an implementation object is private; it is reachable
+ // only by the friends declared below.
+ ListObject(ListObjectImpl *impl);
+
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& out, const ListObject& lop);
+
+ ListObjectImpl *impl;
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop);
+
+ class CEPH_RADOS_API NObjectIterator;
+
+ /**
+ * C++ wrapper around a rados_object_list_cursor (stored in c_cursor).
+ *
+ * Cursors can be copied, ordered and compared for equality, converted to
+ * and from strings, and are accepted/returned by NObjectIterator::seek()
+ * and NObjectIterator::get_cursor().
+ */
+ class CEPH_RADOS_API ObjectCursor
+ {
+ public:
+ ObjectCursor();
+ ObjectCursor(const ObjectCursor &rhs);
+ explicit ObjectCursor(rados_object_list_cursor c);
+ ~ObjectCursor();
+ ObjectCursor& operator=(const ObjectCursor& rhs);
+ bool operator<(const ObjectCursor &rhs) const;
+ bool operator==(const ObjectCursor &rhs) const;
+ void set(rados_object_list_cursor c);
+
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ /// String form of the cursor.
+ std::string to_str() const;
+ /// Load the cursor from a string (NOTE(review): the bool result presumably
+ /// reports whether parsing succeeded -- confirm).
+ bool from_str(const std::string& s);
+
+ protected:
+ rados_object_list_cursor c_cursor;
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ /**
+ * Forward iterator whose value type is ListObject.
+ *
+ * A default-constructed iterator has a null impl pointer. Incrementing and
+ * seeking report errors by throwing exceptions.
+ */
+ class CEPH_RADOS_API NObjectIterator : public std::iterator <std::forward_iterator_tag, ListObject> {
+ public:
+ /// NOTE(review): presumably the past-the-end sentinel -- confirm.
+ static const NObjectIterator __EndObjectIterator;
+ NObjectIterator(): impl(NULL) {}
+ ~NObjectIterator();
+ NObjectIterator(const NObjectIterator &rhs);
+ NObjectIterator& operator=(const NObjectIterator& rhs);
+
+ bool operator==(const NObjectIterator& rhs) const;
+ bool operator!=(const NObjectIterator& rhs) const;
+ const ListObject& operator*() const;
+ const ListObject* operator->() const;
+ NObjectIterator &operator++(); ///< Preincrement; errors are thrown as exceptions
+ NObjectIterator operator++(int); ///< Postincrement; errors are thrown as exceptions
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+
+ /// get current hash position of the iterator, rounded to the current pg
+ uint32_t get_pg_hash_position() const;
+
+ /// move the iterator to a given hash position. this may (will!) be rounded
+ /// to the nearest pg. errors are thrown as exceptions
+ uint32_t seek(uint32_t pos);
+
+ /// move the iterator to a given cursor position. errors are thrown as exceptions
+ uint32_t seek(const ObjectCursor& cursor);
+
+ /// get current cursor position
+ ObjectCursor get_cursor();
+
+ /**
+ * Configure PGLS filter to be applied OSD-side (requires caller
+ * to know/understand the format expected by the OSD)
+ */
+ void set_filter(const bufferlist &bl);
+
+ private:
+ // Construction from a listing context is private; IoCtx and
+ // NObjectIteratorImpl are friends.
+ NObjectIterator(ObjListCtx *ctx_);
+ void get_next();
+ NObjectIteratorImpl *impl;
+ };
+
+ /// Plain aggregate naming an object by oid, namespace and locator; all
+ /// three members are public strings.
+ class CEPH_RADOS_API ObjectItem
+ {
+ public:
+ std::string oid;
+ std::string nspace;
+ std::string locator;
+ };
+
+ /// DEPRECATED; do not use
+ class CEPH_RADOS_API WatchCtx {
+ public:
+ virtual ~WatchCtx();
+ virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0;
+ };
+
+ class CEPH_RADOS_API WatchCtx2 {
+ public:
+ virtual ~WatchCtx2();
+ /**
+ * Callback activated when we receive a notify event.
+ *
+ * @param notify_id unique id for this notify event
+ * @param cookie the watcher we are notifying
+ * @param notifier_id the unique client id of the notifier
+ * @param bl opaque notify payload (from the notifier)
+ */
+ virtual void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) = 0;
+
+ /**
+ * Callback activated when we encounter an error with the watch.
+ *
+ * Errors we may see:
+ * -ENOTCONN : our watch was disconnected
+ * -ETIMEDOUT : our watch is still valid, but we may have missed
+ * a notify event.
+ *
+ * @param cookie the watcher with the problem
+ * @param err error
+ */
+ virtual void handle_error(uint64_t cookie, int err) = 0;
+ };
+
+ /**
+ * Completion handle for an asynchronous operation; wraps the
+ * AioCompletionImpl pointer passed to the constructor (stored in pc).
+ *
+ * The *_safe* members and get_version() are marked deprecated.
+ * NOTE(review): the *_complete* members and get_version64() look like the
+ * intended replacements -- confirm.
+ */
+ struct CEPH_RADOS_API AioCompletion {
+ AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {}
+ ~AioCompletion();
+ int set_complete_callback(void *cb_arg, callback_t cb);
+ int set_safe_callback(void *cb_arg, callback_t cb)
+ __attribute__ ((deprecated));
+ int wait_for_complete();
+ int wait_for_safe() __attribute__ ((deprecated));
+ int wait_for_complete_and_cb();
+ int wait_for_safe_and_cb() __attribute__ ((deprecated));
+ bool is_complete();
+ bool is_safe() __attribute__ ((deprecated));
+ bool is_complete_and_cb();
+ bool is_safe_and_cb() __attribute__ ((deprecated));
+ int get_return_value();
+ int get_version() __attribute__ ((deprecated));
+ uint64_t get_version64();
+ void release();
+ AioCompletionImpl *pc;
+ };
+
+ /**
+ * Completion handle wrapping the PoolAsyncCompletionImpl pointer passed to
+ * the constructor (stored in pc). Offers a callback setter, wait(),
+ * is_complete(), the operation's return value, and release().
+ */
+ struct CEPH_RADOS_API PoolAsyncCompletion {
+ PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {}
+ ~PoolAsyncCompletion();
+ int set_callback(void *cb_arg, callback_t cb);
+ int wait();
+ bool is_complete();
+ int get_return_value();
+ void release();
+ PoolAsyncCompletionImpl *pc;
+ };
+
+ /**
+ * These are per-op flags which may be different among
+ * ops added to an ObjectOperation.
+ */
+ enum ObjectOperationFlags {
+ OP_EXCL = LIBRADOS_OP_FLAG_EXCL,
+ OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK,
+ OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM,
+ OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL,
+ OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED,
+ OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+ OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE,
+ };
+
+ /**
+ * Abstract callback interface accepted by the ObjectOperation::exec()
+ * overload that takes an ObjectOperationCompletion pointer.
+ *
+ * Implementations override handle_completion(), which receives an int
+ * result code and an output bufferlist.
+ */
+ class CEPH_RADOS_API ObjectOperationCompletion {
+ public:
+ virtual ~ObjectOperationCompletion() {}
+ virtual void handle_completion(int r, bufferlist& outbl) = 0;
+ };
+
+ /**
+ * These flags apply to the ObjectOperation as a whole.
+ *
+ * BALANCE_READS and LOCALIZE_READS should only be used
+ * when reading from data you're certain won't change,
+ * like a snapshot, or where eventual consistency is ok.
+ *
+ * ORDER_READS_WRITES will order reads the same way writes are
+ * ordered (e.g., waiting for degraded objects). In particular, it
+ * will make a write followed by a read sequence be preserved.
+ *
+ * IGNORE_CACHE will skip the caching logic on the OSD that normally
+ * handles promotion of objects between tiers. This allows an operation
+ * to operate (or read) the cached (or uncached) object, even if it is
+ * not coherent.
+ *
+ * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and
+ * process the op directly on the destination pool. This is useful
+ * for CACHE_FLUSH and CACHE_EVICT operations.
+ */
+ enum ObjectOperationGlobalFlags {
+ OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG,
+ OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS,
+ OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS,
+ OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES,
+ OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE,
+ OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS,
+ OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY,
+ // send requests to cluster despite the cluster or pool being
+ // marked full; ops will either succeed (e.g., delete) or return
+ // EDQUOT or ENOSPC
+ OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY,
+ // mainly for delete
+ OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE,
+ OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT,
+ OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP,
+ // enable/allow return value and per-op return code/buffers
+ OPERATION_RETURNVEC = LIBRADOS_OPERATION_RETURNVEC,
+ };
+
+ /*
+ * Alloc hint flags for the alloc_hint operation.
+ */
+ enum AllocHintFlags {
+ ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ ALLOC_HINT_FLAG_LONGLIVED = 128,
+ ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+ };
+
+ /*
+ * ObjectOperation : compound object operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectOperation
+ {
+ public:
+ ObjectOperation();
+ virtual ~ObjectOperation();
+
+ // Copying is disabled; the class is move-only.
+ ObjectOperation(const ObjectOperation&) = delete;
+ ObjectOperation& operator=(const ObjectOperation&) = delete;
+
+ /**
+ * Move constructor.
+ * \warning A moved-from ObjectOperation is invalid and may not be used for
+ * any purpose. Using one is a hard contract violation and will
+ * kill your program.
+ */
+ ObjectOperation(ObjectOperation&&);
+ ObjectOperation& operator =(ObjectOperation&&);
+
+ size_t size();
+ void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated));
+ // flags holds ObjectOperationFlags value(s), passed as an int
+ void set_op_flags2(int flags);
+
+ void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval);
+ void cmpxattr(const char *name, uint8_t op, const bufferlist& val);
+ void cmpxattr(const char *name, uint8_t op, uint64_t v);
+ void exec(const char *cls, const char *method, bufferlist& inbl);
+ void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval);
+ void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion);
+ /**
+ * Guard operation with a check that object version == ver
+ *
+ * @param ver [in] version to check
+ */
+ void assert_version(uint64_t ver);
+
+ /**
+ * Guard operation with a check that the object already exists
+ */
+ void assert_exists();
+
+ /**
+ * Guard operation with comparisons against the values stored at the
+ * specified omap keys
+ *
+ * @param assertions [in] comparison assertions
+ * @param prval [out] place error code in prval upon completion
+ *
+ * assertions has the form of mappings from keys to (comparison rval, assertion)
+ * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ].
+ *
+ * That is, to assert that the value at key "foo" is greater than "bar":
+ *
+ * ObjectReadOperation op;
+ * int r;
+ * map<string, pair<bufferlist, int> > assertions;
+ * bufferlist bar(string("bar"));
+ * assertions["foo"] = make_pair(bar, CEPH_OSD_CMPXATTR_OP_GT);
+ * op.omap_cmp(assertions, &r);
+ */
+ void omap_cmp(
+ const std::map<std::string, std::pair<bufferlist, int> > &assertions,
+ int *prval);
+
+ protected:
+ // Opaque implementation object; IoCtx and Rados are friends.
+ ObjectOperationImpl* impl;
+ friend class IoCtx;
+ friend class Rados;
+ };
+
+ /*
+ * ObjectWriteOperation : compound object write operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ *
+ * The protected `unused` pointer is set to NULL by the default
+ * constructor.
+ * NOTE(review): it looks like a placeholder kept for layout
+ * compatibility -- confirm before removing it.
+ */
+ class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation
+ {
+ protected:
+ time_t *unused;
+ public:
+ ObjectWriteOperation() : unused(NULL) {}
+ ~ObjectWriteOperation() override {}
+
+ ObjectWriteOperation(ObjectWriteOperation&&) = default;
+ ObjectWriteOperation& operator =(ObjectWriteOperation&&) = default;
+
+ void mtime(time_t *pt);
+ void mtime2(struct timespec *pts);
+
+ void create(bool exclusive);
+ void create(bool exclusive,
+ const std::string& category); ///< NOTE: category is unused
+
+ void write(uint64_t off, const bufferlist& bl);
+ void write_full(const bufferlist& bl);
+ void writesame(uint64_t off, uint64_t write_len,
+ const bufferlist& bl);
+ void append(const bufferlist& bl);
+ void remove();
+ void truncate(uint64_t off);
+ void zero(uint64_t off, uint64_t len);
+ void rmxattr(const char *name);
+ void setxattr(const char *name, const bufferlist& bl);
+ void setxattr(const char *name, const bufferlist&& bl);
+ void tmap_update(const bufferlist& cmdbl);
+ void tmap_put(const bufferlist& bl);
+ void selfmanaged_snap_rollback(uint64_t snapid);
+
+ /**
+ * Rollback an object to the specified snapshot id
+ *
+ * Used with pool snapshots
+ *
+ * @param snapid [in] id of the snapshot to roll back to
+ */
+ void snap_rollback(uint64_t snapid);
+
+ /**
+ * set keys and values according to map
+ *
+ * @param map [in] keys and values to set
+ */
+ void omap_set(const std::map<std::string, bufferlist> &map);
+
+ /**
+ * set header
+ *
+ * @param bl [in] header to set
+ */
+ void omap_set_header(const bufferlist &bl);
+
+ /**
+ * Clears omap contents
+ */
+ void omap_clear();
+
+ /**
+ * Clears keys in to_rm
+ *
+ * @param to_rm [in] keys to remove
+ */
+ void omap_rm_keys(const std::set<std::string> &to_rm);
+
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress).
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ * @param src_fadvise_flags the fadvise flags for source object
+ */
+ void copy_from(const std::string& src, const IoCtx& src_ioctx,
+ uint64_t src_version, uint32_t src_fadvise_flags);
+
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress). Instead of
+ * copying truncate_seq and truncate_size from the source object it receives
+ * these values as parameters.
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ * @param truncate_seq truncate sequence for the destination object
+ * @param truncate_size truncate size for the destination object
+ * @param src_fadvise_flags the fadvise flags for source object
+ */
+ void copy_from2(const std::string& src, const IoCtx& src_ioctx,
+ uint64_t src_version, uint32_t truncate_seq,
+ uint64_t truncate_size, uint32_t src_fadvise_flags);
+
+ /**
+ * undirty an object
+ *
+ * Clear an object's dirty flag
+ */
+ void undirty();
+
+ /**
+ * Set allocation hint for an object
+ *
+ * This comment covers both set_alloc_hint() and set_alloc_hint2(); only
+ * the latter takes the flags argument.
+ *
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hint flags (set_alloc_hint2 only).
+ * NOTE(review): presumably a combination of AllocHintFlags
+ * values -- confirm against the implementation.
+ */
+ void set_alloc_hint(uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ void set_alloc_hint2(uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * NOTE: both cache_pin() and cache_unpin() are declared void, so they
+ * return nothing to the caller here.
+ */
+ void cache_pin();
+ void cache_unpin();
+
+ /**
+ * Extensible tier
+ *
+ * Set redirect target
+ *
+ * The flag argument of set_redirect() defaults to 0 when omitted.
+ */
+ void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx,
+ uint64_t tgt_version, int flag = 0);
+ void tier_promote();
+ void unset_manifest();
+
+ friend class IoCtx;
+ };
+
+ /*
+ * ObjectReadOperation : compound object operation that returns values
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ *
+ * Most methods take output pointers (e.g. pbl, out_vals) plus a prval
+ * pointer that is documented below as receiving the per-operation error
+ * code upon completion.
+ */
+ class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation
+ {
+ public:
+ ObjectReadOperation() {}
+ ~ObjectReadOperation() override {}
+
+ ObjectReadOperation(ObjectReadOperation&&) = default;
+ ObjectReadOperation& operator =(ObjectReadOperation&&) = default;
+
+ void stat(uint64_t *psize, time_t *pmtime, int *prval);
+ void stat2(uint64_t *psize, struct timespec *pts, int *prval);
+ void getxattr(const char *name, bufferlist *pbl, int *prval);
+ void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval);
+ void read(size_t off, uint64_t len, bufferlist *pbl, int *prval);
+ void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl,
+ uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl,
+ int *prval);
+
+ /**
+ * see aio_sparse_read()
+ */
+ void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m,
+ bufferlist *data_bl, int *prval);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * Deprecated: use omap_get_vals2() instead.
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore,
+ int *prval);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * Deprecated: use omap_get_vals2() instead.
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore,
+ int *prval);
+
+
+ /**
+ * omap_get_keys: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * Deprecated: use omap_get_keys2() instead.
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_keys2: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys2(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore,
+ int *prval);
+
+ /**
+ * omap_get_header: get header from object omap
+ *
+ * @param header [out] place header here upon completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_header(bufferlist *header, int *prval);
+
+ /**
+ * get key/value pairs for specified keys
+ *
+ * @param keys [in] keys to get
+ * @param map [out] place key/value pairs found here on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals_by_keys(const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *map,
+ int *prval);
+
+ /**
+ * list_watchers: get the list of watchers of the object
+ *
+ * @param out_watchers [out] place returned values in out_watchers on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval);
+
+ /**
+ * list snapshot clones associated with a logical object
+ *
+ * This will include a record for each version of the object,
+ * including the "HEAD" (which will have a cloneid of SNAP_HEAD).
+ * Each clone includes a vector of snap ids for which it is
+ * defined to exist.
+ *
+ * NOTE: this operation must be submitted from an IoCtx with a
+ * read snapid of SNAP_DIR for reliable results.
+ *
+ * @param out_snaps [out] pointer to resulting snap_set_t
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_snaps(snap_set_t *out_snaps, int *prval);
+
+ /**
+ * query dirty state of an object
+ *
+ * @param isdirty [out] pointer to resulting bool
+ * @param prval [out] place error code in prval upon completion
+ */
+ void is_dirty(bool *isdirty, int *prval);
+
+ /**
+ * flush a cache tier object to backing tier; will block racing
+ * updates.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_flush();
+
+ /**
+ * Flush a cache tier object to backing tier; will EAGAIN if we race
+ * with an update. Must be used with the SKIPRWLOCKS flag.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_try_flush();
+
+ /**
+ * evict a clean cache tier object
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promote on the OSD (that is then evicted).
+ */
+ void cache_evict();
+
+ /**
+ * Extensible tier
+ *
+ * set_chunk: make a chunk that points a part of the source object at the
+ * target object
+ *
+ * @param src_offset [in] source offset to indicate the start position of
+ * a chunk in the source object
+ * @param src_length [in] source length to set the length of the chunk
+ * @param tgt_ioctx [in] NOTE(review): presumably the IoCtx in which the
+ * target object lives -- confirm against the implementation
+ * @param tgt_oid [in] target object's id to set a chunk
+ * @param tgt_offset [in] the start position of the target object
+ * @param flag [in] flag for the source object (defaults to 0)
+ *
+ */
+ void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx,
+ std::string tgt_oid, uint64_t tgt_offset, int flag = 0);
+ /**
+ * flush a manifest tier object to backing tier; will block racing
+ * updates.
+ */
+ void tier_flush();
+ /**
+ * evict a manifest tier object to backing tier; will block racing
+ * updates.
+ */
+ void tier_evict();
+ };
+
+ /* IoCtx : This is a context in which we can perform I/O.
+ * It includes a Pool.
+ *
+ * Typical use (error checking omitted):
+ *
+ * IoCtx p;
+ * rados.ioctx_create("my_pool", p);
+ * p.stat("my_object", &size, &mtime);
+ * ... etc ...
+ *
+ * NOTE: be sure to call watch_flush() prior to destroying any IoCtx
+ * that is used for watch events to ensure that racing callbacks
+ * have completed.
+ */
+ class CEPH_RADOS_API IoCtx
+ {
+ public:
+ IoCtx();
+ static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool);
+ IoCtx(const IoCtx& rhs);
+ IoCtx& operator=(const IoCtx& rhs);
+ IoCtx(IoCtx&& rhs) noexcept;
+ IoCtx& operator=(IoCtx&& rhs) noexcept;
+
+ ~IoCtx();
+
+ bool is_valid() const;
+
+ // Close our pool handle
+ void close();
+
+ // deep copy
+ void dup(const IoCtx& rhs);
+
+ // set pool auid
+ int set_auid(uint64_t auid_)
+ __attribute__ ((deprecated));
+
+ // set pool auid
+ int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+
+ // get pool auid
+ int get_auid(uint64_t *auid_)
+ __attribute__ ((deprecated));
+
+ uint64_t get_instance_id() const;
+
+ std::string get_pool_name();
+
+ bool pool_requires_alignment();
+ int pool_requires_alignment2(bool * req);
+ uint64_t pool_required_alignment();
+ int pool_required_alignment2(uint64_t * alignment);
+
+ // create an object
+ int create(const std::string& oid, bool exclusive);
+ int create(const std::string& oid, bool exclusive,
+ const std::string& category); ///< category is unused
+
+ /**
+ * write bytes to an object at a specified offset
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ /**
+ * append bytes to an object
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int append(const std::string& oid, bufferlist& bl, size_t len);
+ /**
+ * replace object contents with provided data
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write_full(const std::string& oid, bufferlist& bl);
+ int writesame(const std::string& oid, bufferlist& bl,
+ size_t write_len, uint64_t off);
+ int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ int checksum(const std::string& o, rados_checksum_type_t type,
+ const bufferlist &init_value_bl, size_t len, uint64_t off,
+ size_t chunk_size, bufferlist *pbl);
+ int remove(const std::string& oid);
+ int remove(const std::string& oid, int flags);
+ int trunc(const std::string& oid, uint64_t size);
+ int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m);
+ int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl);
+ int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off);
+ int getxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset);
+ int setxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int rmxattr(const std::string& oid, const char *name);
+ int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts);
+ int exec(const std::string& oid, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist& outbl);
+ /**
+ * modify object tmap based on encoded update sequence
+ *
+ * NOTE: this call steals the contents of @param cmdbl
+ */
+ int tmap_update(const std::string& oid, bufferlist& cmdbl);
+
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_keys(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys);
+ int omap_get_keys2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore);
+ int omap_get_header(const std::string& oid,
+ bufferlist *bl);
+ int omap_get_vals_by_keys(const std::string& oid,
+ const std::set<std::string>& keys,
+ std::map<std::string, bufferlist> *vals);
+ int omap_set(const std::string& oid,
+ const std::map<std::string, bufferlist>& map);
+ int omap_set_header(const std::string& oid,
+ const bufferlist& bl);
+ int omap_clear(const std::string& oid);
+ int omap_rm_keys(const std::string& oid,
+ const std::set<std::string>& keys);
+
+ void snap_set_read(snap_t seq);
+ int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps);
+
+ // Create a snapshot with a given name
+ int snap_create(const char *snapname);
+
+ // Look up a snapshot by name.
+ // Returns 0 on success; error code otherwise
+ int snap_lookup(const char *snapname, snap_t *snap);
+
+ // Gets a timestamp for a snap
+ int snap_get_stamp(snap_t snapid, time_t *t);
+
+ // Gets the name of a snap
+ int snap_get_name(snap_t snapid, std::string *s);
+
+ // Remove a snapshot from this pool
+ int snap_remove(const char *snapname);
+
+ int snap_list(std::vector<snap_t> *snaps);
+
+ int snap_rollback(const std::string& oid, const char *snapname);
+
+ // Deprecated name kept for backward compatibility - same as snap_rollback()
+ int rollback(const std::string& oid, const char *snapname)
+ __attribute__ ((deprecated));
+
+ int selfmanaged_snap_create(uint64_t *snapid);
+ void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c);
+
+ int selfmanaged_snap_remove(uint64_t snapid);
+ void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c);
+
+ int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid);
+
+ // Advisory locking on rados objects.
+ int lock_exclusive(const std::string &oid, const std::string &name,
+ const std::string &cookie,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int lock_shared(const std::string &oid, const std::string &name,
+ const std::string &cookie, const std::string &tag,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie);
+
+ int break_lock(const std::string &oid, const std::string &name,
+ const std::string &client, const std::string &cookie);
+
+ int list_lockers(const std::string &oid, const std::string &name,
+ int *exclusive,
+ std::string *tag,
+ std::list<librados::locker_t> *lockers);
+
+
+ /// Start enumerating objects for a pool. Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from a hash position.
+ /// Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(uint32_t start_hash_position,
+ const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from cursor. Errors are
+ /// thrown as exceptions.
+ NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
+ const bufferlist &filter=bufferlist());
+ /// Iterator indicating the end of a pool
+ const NObjectIterator& nobjects_end() const;
+
+ /// Get cursor for pool beginning
+ ObjectCursor object_list_begin();
+
+ /// Get cursor for pool end
+ ObjectCursor object_list_end();
+
+ /// Check whether a cursor is at the end of a pool
+ bool object_list_is_end(const ObjectCursor &oc);
+
+ /// List some objects between two cursors
+ int object_list(const ObjectCursor &start, const ObjectCursor &finish,
+ const size_t result_count,
+ const bufferlist &filter,
+ std::vector<ObjectItem> *result,
+ ObjectCursor *next);
+
+ /// Generate cursors that include the N out of Mth slice of the pool
+ void object_list_slice(
+ const ObjectCursor start,
+ const ObjectCursor finish,
+ const size_t n,
+ const size_t m,
+ ObjectCursor *split_start,
+ ObjectCursor *split_finish);
+
+ /**
+ * List available hit set objects
+ *
+ * @param hash [in] hash position to query
+ * @param c [in] completion
+ * @param pls [out] list of available intervals
+ */
+ int hit_set_list(uint32_t hash, AioCompletion *c,
+ std::list< std::pair<time_t, time_t> > *pls);
+
+ /**
+ * Retrieve hit set for a given hash, and time
+ *
+ * @param hash [in] hash position
+ * @param c [in] completion
+ * @param stamp [in] time interval that falls within the hit set's interval
+ * @param pbl [out] buffer to store the result in
+ */
+ int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp,
+ bufferlist *pbl);
+
+ uint64_t get_last_version();
+
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off);
+ /**
+ * Asynchronously read from an object at a particular snapshot
+ *
+ * This is the same as normal aio_read, except that it chooses
+ * the snapshot to read from from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param pbl where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid);
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off);
+ /**
+ * Asynchronously read existing extents from an object at a
+ * particular snapshot
+ *
+ * This is the same as normal aio_sparse_read, except that it chooses
+ * the snapshot to read from from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * m will be filled in with a map of extents in the object,
+ * mapping offsets to lengths (in bytes) within the range
+ * requested. The data for all of the extents are stored
+ * back-to-back in offset order in data_bl.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param m where to store the map of extents
+ * @param data_bl where to store the data
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off, uint64_t snapid);
+ /**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param off object byte offset at which to start the comparison
+ * @param cmp_bl buffer containing bytes to be compared with object contents
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+ int aio_cmpext(const std::string& oid,
+ librados::AioCompletion *c,
+ uint64_t off,
+ bufferlist& cmp_bl);
+ int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len, uint64_t off);
+ int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len);
+ int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl);
+ int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t write_len, uint64_t off);
+
+ /**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param oid the name of the object
+ * @param c what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than SNAP_HEAD
+ */
+ int aio_remove(const std::string& oid, AioCompletion *c);
+ int aio_remove(const std::string& oid, AioCompletion *c, int flags);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * aio_flush().
+ *
+ * @param c what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush_async(AioCompletion *c);
+ int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset);
+ int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name);
+ int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * Cancel aio operation
+ *
+ * @param c completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_cancel(AioCompletion *c);
+
+ int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist *outbl);
+
+ /*
+ * asynchronous version of unlock
+ */
+ int aio_unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie, AioCompletion *c);
+
+ // compound object operations
+ int operate(const std::string& oid, ObjectWriteOperation *op);
+ int operate(const std::string& oid, ObjectWriteOperation *op, int flags);
+ int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl);
+ int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl, int flags);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags);
+ /**
+ * Schedule an async write operation with explicit snapshot parameters
+ *
+ * This is the same as the first aio_operate(), except that it
+ * gets the snapshot context from its arguments instead of the
+ * IoCtx internal state.
+ *
+ * @param oid the object to operate on
+ * @param c what to do when the operation is complete and safe
+ * @param op which operations to perform
+ * @param seq latest selfmanaged snapshot sequence number for this object
+ * @param snaps currently existing selfmanaged snapshot ids for this object
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps, int flags,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, bufferlist *pbl);
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, snap_t snapid, int flags,
+ bufferlist *pbl)
+ __attribute__ ((deprecated));
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl, const blkin_trace_info *trace_info);
+
+ // watch/notify
+ int watch2(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int watch3(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int unwatch2(uint64_t handle);
+ int aio_unwatch(uint64_t handle, AioCompletion *c);
+ /**
+ * Send a notify event to watchers
+ *
+ * Upon completion the pbl bufferlist reply payload will be
+ * encoded like so:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ *
+ */
+ int notify2(const std::string& o, ///< object
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+ int aio_notify(const std::string& o, ///< object
+ AioCompletion *c, ///< completion when notify completes
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+ /*
+ * Decode a notify response into acks and timeout vectors.
+ */
+ void decode_notify_response(bufferlist &bl,
+ std::vector<librados::notify_ack_t> *acks,
+ std::vector<librados::notify_timeout_t> *timeouts);
+
+ int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers);
+ int list_snaps(const std::string& o, snap_set_t *out_snaps);
+ void set_notify_timeout(uint32_t timeout);
+
+ /// acknowledge a notify we received.
+ void notify_ack(const std::string& o, ///< watched object
+ uint64_t notify_id, ///< notify id
+ uint64_t cookie, ///< our watch handle
+ bufferlist& bl); ///< optional reply payload
+
+ /***
+ * check on watch validity
+ *
+ * Check if a watch is valid. If so, return the number of
+ * milliseconds since we last confirmed its liveness. If there is
+ * a known error, return it.
+ *
+ * If there is an error, the watch is no longer valid, and should
+ * be destroyed with unwatch2(). If the user is still interested in
+ * the object, a new watch should be created with watch2().
+ *
+ * @param cookie watch handle
+ * @returns ms since last confirmed valid, or error
+ */
+ int watch_check(uint64_t cookie);
+
+ // old, deprecated versions
+ int watch(const std::string& o, uint64_t ver, uint64_t *cookie,
+ librados::WatchCtx *ctx) __attribute__ ((deprecated));
+ int notify(const std::string& o, uint64_t ver, bufferlist& bl)
+ __attribute__ ((deprecated));
+ int unwatch(const std::string& o, uint64_t cookie)
+ __attribute__ ((deprecated));
+
+ /**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it
+ * was submitted with a OP_FAILOK flag set) and is not guaranteed
+ * to do anything on the backend.
+ *
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+ int set_alloc_hint(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ int set_alloc_hint2(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ // assert version for next sync operations
+ void set_assert_version(uint64_t ver);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * @param o the name of the object
+ * @returns 0 on success, negative error code on failure
+ */
+ int cache_pin(const std::string& o);
+ int cache_unpin(const std::string& o);
+
+ std::string get_pool_name() const;
+
+ void locator_set_key(const std::string& key);
+ void set_namespace(const std::string& nspace);
+ std::string get_namespace() const;
+
+ int64_t get_id();
+
+ // deprecated versions
+ uint32_t get_object_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+ uint32_t get_object_pg_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+
+ int get_object_hash_position2(const std::string& oid, uint32_t *hash_position);
+ int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position);
+
+ config_t cct();
+
+ void set_osdmap_full_try()
+ __attribute__ ((deprecated));
+ void unset_osdmap_full_try()
+ __attribute__ ((deprecated));
+
+ bool get_pool_full_try();
+ void set_pool_full_try();
+ void unset_pool_full_try();
+
+ int application_enable(const std::string& app_name, bool force);
+ int application_enable_async(const std::string& app_name,
+ bool force, PoolAsyncCompletion *c);
+ int application_list(std::set<std::string> *app_names);
+ int application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string *value);
+ int application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value);
+ int application_metadata_remove(const std::string& app_name,
+ const std::string &key);
+ int application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values);
+
+ private:
+ /* You can only get IoCtx instances from Rados */
+ IoCtx(IoCtxImpl *io_ctx_impl_);
+
+ friend class Rados; // Only Rados can use our private constructor to create IoCtxes.
+ friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl
+ friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl
+ friend class ObjectReadOperation; // set_chunk needs to see our IoCtxImpl
+
+ IoCtxImpl *io_ctx_impl;
+ };
+
+ struct CEPH_RADOS_API PlacementGroup { // placement-group handle; its state is owned through the unique_ptr member below
+ PlacementGroup();
+ PlacementGroup(const PlacementGroup&); // declared explicitly: the unique_ptr member would make an implicit copy constructor deleted
+ ~PlacementGroup();
+ bool parse(const char*); // NOTE(review): presumably parses a textual PG id and reports success - confirm in the implementation
+ std::unique_ptr<PlacementGroupImpl> impl;
+ };
+
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&);
+
+ class CEPH_RADOS_API Rados // declares the cluster-level API: init/connect/shutdown, configuration, pools, commands and IoCtx creation
+ {
+ public:
+ static void version(int *major, int *minor, int *extra);
+
+ Rados();
+ explicit Rados(IoCtx& ioctx);
+ ~Rados();
+ static void from_rados_t(rados_t cluster, Rados &rados);
+
+ int init(const char * const id);
+ int init2(const char * const name, const char * const clustername,
+ uint64_t flags);
+ int init_with_context(config_t cct_);
+ config_t cct();
+ int connect();
+ void shutdown();
+ int watch_flush();
+ int aio_watch_flush(AioCompletion*);
+ int conf_read_file(const char * const path) const;
+ int conf_parse_argv(int argc, const char ** argv) const;
+ int conf_parse_argv_remainder(int argc, const char ** argv,
+ const char ** remargv) const;
+ int conf_parse_env(const char *env) const;
+ int conf_set(const char *option, const char *value);
+ int conf_get(const char *option, std::string &val);
+
+ int service_daemon_register(
+ const std::string& service, ///< service name (e.g., 'rgw')
+ const std::string& name, ///< daemon name (e.g., 'gwfoo')
+ const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
+ int service_daemon_update_status(
+ std::map<std::string,std::string>&& status);
+
+ int pool_create(const char *name);
+ int pool_create(const char *name, uint64_t auid)
+ __attribute__ ((deprecated));
+ int pool_create(const char *name, uint64_t auid, uint8_t crush_rule)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule(const char *name, uint8_t crush_rule);
+ int pool_create_async(const char *name, PoolAsyncCompletion *c);
+ int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c);
+ int pool_get_base_tier(int64_t pool, int64_t* base_tier);
+ int pool_delete(const char *name);
+ int pool_delete_async(const char *name, PoolAsyncCompletion *c);
+ int64_t pool_lookup(const char *name);
+ int pool_reverse_lookup(int64_t id, std::string *name);
+
+ uint64_t get_instance_id();
+
+ int get_min_compatible_osd(int8_t* require_osd_release);
+ int get_min_compatible_client(int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+ int mon_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int mgr_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int osd_command(int osdid, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+
+ int ioctx_create(const char *name, IoCtx &pioctx);
+ int ioctx_create2(int64_t pool_id, IoCtx &pioctx);
+
+ // Features useful for test cases
+ void test_blocklist_self(bool set);
+
+ /* pool info */
+ int pool_list(std::list<std::string>& v);
+ int pool_list2(std::list<std::pair<int64_t, std::string> >& v);
+ int get_pool_stats(std::list<std::string>& v,
+ stats_map& result);
+ /// deprecated; use simpler form. categories no longer supported.
+ int get_pool_stats(std::list<std::string>& v,
+ std::map<std::string, stats_map>& stats);
+ /// deprecated; categories no longer supported
+ int get_pool_stats(std::list<std::string>& v,
+ std::string& category,
+ std::map<std::string, stats_map>& stats);
+ /// check if pool has selfmanaged snaps
+ bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname);
+
+ int cluster_stat(cluster_stat_t& result);
+ int cluster_fsid(std::string *fsid);
+
+ /**
+ * List inconsistent placement groups in the given pool
+ *
+ * @param pool_id the pool id
+ * @param pgs [out] the inconsistent PGs
+ */
+ int get_inconsistent_pgs(int64_t pool_id,
+ std::vector<PlacementGroup>* pgs);
+ /**
+ * List the inconsistent objects found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c objects
+ * @param max_return the max number of the returned @c objects
+ * @param c what to do when the operation is complete and safe
+ * @param objects [out] the objects where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_objects(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_obj_t>* objects,
+ uint32_t* interval);
+ /**
+ * List the inconsistent snapsets found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c objects
+ * @param max_return the max number of the returned @c objects
+ * @param c what to do when the operation is complete and safe
+ * @param snapset [out] the snapsets where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_snapsets(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_snapset_t>* snapset,
+ uint32_t* interval);
+
+ /// get/wait for the most recent osdmap
+ int wait_for_latest_osdmap();
+
+ int blocklist_add(const std::string& client_address,
+ uint32_t expire_seconds);
+
+ std::string get_addrs() const;
+
+ /*
+ * pool aio
+ *
+ * It is up to the caller to release the completion handler, even if the pool_create_async()
+ * and/or pool_delete_async() fails and does not send the async request
+ */
+ static PoolAsyncCompletion *pool_async_create_completion();
+
+ // -- aio --
+ static AioCompletion *aio_create_completion();
+ static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete,
+ callback_t cb_safe)
+ __attribute__ ((deprecated));
+ static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete);
+
+ friend std::ostream& operator<<(std::ostream &oss, const Rados& r);
+ private:
+ friend class neorados::RADOS;
+
+ // We don't allow assignment or copying: both operations are declared private here
+ Rados(const Rados& rhs);
+ const Rados& operator=(const Rados& rhs);
+ RadosClient *client;
+ };
+
+} // namespace v14_2_0
+} // namespace librados
+
+#endif
+
diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp
new file mode 100644
index 000000000..396f3a838
--- /dev/null
+++ b/src/include/rados/librados_fwd.hpp
@@ -0,0 +1,34 @@
+#ifndef __LIBRADOS_FWD_HPP
+#define __LIBRADOS_FWD_HPP
+
+struct blkin_trace_info;
+
+namespace libradosstriper {
+
+class RadosStriper;
+
+} // namespace libradosstriper
+
+namespace librados {
+inline namespace v14_2_0 {
+
+// Forward declarations of the public librados C++ types, for headers that
+// only need the names.
+class AioCompletion;
+class IoCtx;
+class ListObject;
+class NObjectIterator;
+class ObjectCursor;
+class ObjectItem;
+class ObjectOperation;
+class ObjectOperationCompletion;
+class ObjectReadOperation;
+class ObjectWriteOperation;
+// PlacementGroup is defined with `struct` in librados.hpp, so it is
+// forward-declared with the same class-key; a class/struct mismatch triggers
+// mismatched-tag diagnostics on some compilers.
+struct PlacementGroup;
+class PoolAsyncCompletion;
+class Rados;
+class WatchCtx;
+class WatchCtx2;
+
+} // inline namespace v14_2_0
+} // namespace librados
+
+#endif // __LIBRADOS_FWD_HPP
diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h
new file mode 100644
index 000000000..c20e96bed
--- /dev/null
+++ b/src/include/rados/librgw.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_LIBRGW_H
+#define CEPH_LIBRGW_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_VER_MAJOR 1
+#define LIBRGW_VER_MINOR 1
+#define LIBRGW_VER_EXTRA 0
+
+/* Pack a version triple into one integer: (maj << 16) + (min << 8) + extra.
+ * Each parameter is parenthesized so that expression arguments such as
+ * `a | b` or `c ? x : y` are evaluated as a unit before shifting/adding. */
+#define LIBRGW_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+/* The version code of this header, built from the LIBRGW_VER_* values above. */
+#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA)
+
+typedef void* librgw_t;
+int librgw_create(librgw_t *rgw, int argc, char **argv);
+void librgw_shutdown(librgw_t rgw);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LIBRGW_H */
diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h
new file mode 100644
index 000000000..80ae69d25
--- /dev/null
+++ b/src/include/rados/objclass.h
@@ -0,0 +1,177 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+
+#ifdef __cplusplus
+
+#include "buffer.h"
+
+extern "C" {
+#endif
+
+#define CEPH_CLS_API [[gnu::visibility("default")]]
+
+#define CLS_VER(maj,min) \
+int __cls_ver__## maj ## _ ##min = 0; \
+int __cls_ver_maj = maj; \
+int __cls_ver_min = min;
+
+#define CLS_NAME(name) \
+int __cls_name__## name = 0; \
+const char *__cls_name = #name;
+
+#define CLS_INIT(name) \
+CEPH_CLS_API void __cls_init()
+
+#define CLS_METHOD_RD 0x1 /// method executes read operations
+#define CLS_METHOD_WR 0x2 /// method executes write operations
+#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier
+
+#define CLS_LOG(level, fmt, ...) \
+ cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__)
+
+/**
+ * Initialize a class.
+ */
+void __cls_init();
+
+/**
+ * @typedef cls_handle_t
+ *
+ * A handle for interacting with the object class.
+ */
+typedef void *cls_handle_t;
+
+/**
+ * @typedef cls_method_handle_t
+ *
+ * A handle for interacting with the method of the object class.
+ */
+typedef void *cls_method_handle_t;
+
+/**
+ * @typedef cls_method_context_t
+ *
+ * A context for the method of the object class.
+ */
+typedef void* cls_method_context_t;
+
+/*class utils*/
+extern int cls_log(int level, const char *format, ...)
+ __attribute__((__format__(printf, 2, 3)));
+
+/* class registration api */
+extern int cls_register(const char *name, cls_handle_t *handle);
+
+#ifdef __cplusplus
+}
+
+/**
+ * @typedef cls_method_cxx_call_t
+ *
+ */
+typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
+ class ceph::buffer::list *inbl, class ceph::buffer::list *outbl);
+
+/**
+ * Register a method.
+ *
+ * @param hclass
+ * @param method
+ * @param flags
+ * @param class_call
+ * @param handle
+ */
+extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
+ cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
+
+/**
+ * Create an object.
+ *
+ * @param hctx
+ * @param exclusive
+ */
+extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
+
+/**
+ * Remove an object.
+ *
+ * @param hctx
+ */
+extern int cls_cxx_remove(cls_method_context_t hctx);
+
+/**
+ * Check on the status of an object.
+ *
+ * @param hctx
+ * @param size
+ * @param mtime
+ */
+extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
+
+/**
+ * Read contents of an object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Write to the object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Get xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param outbl
+ */
+extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *outbl);
+
+/**
+ * Set xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param inbl
+ */
+extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *inbl);
+
+/**
+ * Get value corresponding to a key from the map.
+ *
+ * @param hctx
+ * @param key
+ * @param outbl
+ */
+extern int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *outbl);
+
+/**
+ * Set value corresponding to a key in the map.
+ *
+ * @param hctx
+ * @param key
+ * @param inbl
+ */
+extern int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *inbl);
+
+#endif
+
+#endif
diff --git a/src/include/rados/page.h b/src/include/rados/page.h
new file mode 120000
index 000000000..cf983e838
--- /dev/null
+++ b/src/include/rados/page.h
@@ -0,0 +1 @@
+../page.h \ No newline at end of file
diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h
new file mode 100644
index 000000000..d308341ec
--- /dev/null
+++ b/src/include/rados/rados_types.h
@@ -0,0 +1,41 @@
+#ifndef CEPH_RADOS_TYPES_H
+#define CEPH_RADOS_TYPES_H
+
+#include <stdint.h>
+
+/**
+ * @struct obj_watch_t
+ * One item from list_watchers
+ */
+struct obj_watch_t {
+ /// Address of the watcher, held in a fixed 256-byte character buffer
+ char addr[256];
+ /// Watcher ID
+ int64_t watcher_id;
+ /// Cookie (NOTE(review): presumably the handle the watch was registered with - confirm)
+ uint64_t cookie;
+ /// Timeout in seconds
+ uint32_t timeout_seconds;
+};
+
+struct notify_ack_t { /* plain-C record describing one notify acknowledgement */
+ uint64_t notifier_id;
+ uint64_t cookie;
+ char *payload; /* NOTE(review): ownership of this buffer is not visible here - confirm who frees it */
+ uint64_t payload_len; /* NOTE(review): presumably the byte length of payload - confirm */
+};
+
+struct notify_timeout_t { /* plain-C record: same id/cookie pair as notify_ack_t, without a payload */
+ uint64_t notifier_id;
+ uint64_t cookie;
+};
+
+/**
+ *
+ * Pass as nspace argument to rados_ioctx_set_namespace()
+ * before calling rados_nobjects_list_open() to return
+ * all objects in all namespaces.
+ */
+#define LIBRADOS_ALL_NSPACES "\001"
+
+#endif
diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp
new file mode 100644
index 000000000..84023579b
--- /dev/null
+++ b/src/include/rados/rados_types.hpp
@@ -0,0 +1,341 @@
+#ifndef CEPH_RADOS_TYPES_HPP
+#define CEPH_RADOS_TYPES_HPP
+
+#include <map>
+#include <utility>
+#include <vector>
+#include <stdint.h>
+#include <string>
+
+#include "buffer.h"
+#include "rados_types.h"
+
+namespace librados {
+
+typedef uint64_t snap_t;
+
+enum {
+ SNAP_HEAD = (uint64_t)(-2), // the cast wraps -2 to 2^64 - 2
+ SNAP_DIR = (uint64_t)(-1) // the cast wraps -1 to 2^64 - 1
+};
+
+// Information about one clone: its id, the snaps it holds, its overlap with
+// the next newest clone, and its size.
+struct clone_info_t {
+  // A fresh instance has a zero clone id, zero size and empty vectors.
+  clone_info_t()
+    : cloneid{0},
+      size{0}
+  {}
+
+  snap_t cloneid;
+  std::vector<snap_t> snaps; // ascending
+  std::vector< std::pair<uint64_t,uint64_t> > overlap; // with next newest
+  uint64_t size;
+};
+
+// The clones of an object plus the newest snap id the object has seen.
+struct snap_set_t {
+  // A fresh instance has no clones and a zero sequence number.
+  snap_set_t()
+    : seq{0}
+  {}
+
+  std::vector<clone_info_t> clones; // ascending
+  snap_t seq; // newest snapid seen by the object
+};
+
+// Identifies an object by name, namespace, locator and snap id.
+struct object_id_t {
+  // Empty strings and snap 0.
+  object_id_t() = default;
+
+  // Copies every component from the arguments.
+  object_id_t(const std::string& name, const std::string& nspace,
+              const std::string& locator, snap_t snap)
+    : name(name), nspace(nspace), locator(locator), snap(snap) {}
+
+  std::string name;
+  std::string nspace;
+  std::string locator;
+  snap_t snap = 0;
+};
+
+struct err_t {
+ enum : uint64_t {
+ SHARD_MISSING = 1 << 1,
+ SHARD_STAT_ERR = 1 << 2,
+ SHARD_READ_ERR = 1 << 3,
+ DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old
+ DATA_DIGEST_MISMATCH_INFO = 1 << 9,
+ OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old
+ OMAP_DIGEST_MISMATCH_INFO = 1 << 10,
+ SIZE_MISMATCH_OI = 1 << 11, // Old
+ SIZE_MISMATCH_INFO = 1 << 11,
+ SHARD_EC_HASH_MISMATCH = 1 << 12,
+ SHARD_EC_SIZE_MISMATCH = 1 << 13,
+ OI_ATTR_MISSING = 1 << 14, // Old
+ INFO_MISSING = 1 << 14,
+ OI_ATTR_CORRUPTED = 1 << 15, // Old
+ INFO_CORRUPTED = 1 << 15,
+ SS_ATTR_MISSING = 1 << 16, // Old
+ SNAPSET_MISSING = 1 << 16,
+ SS_ATTR_CORRUPTED = 1 << 17, // Old
+ SNAPSET_CORRUPTED = 1 << 17,
+ OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old
+ OBJ_SIZE_INFO_MISMATCH = 1 << 18,
+ HINFO_MISSING = 1 << 19,
+ HINFO_CORRUPTED = 1 << 20
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED;
+ static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
+ bool has_shard_missing() const {
+ return errors & SHARD_MISSING;
+ }
+ bool has_stat_error() const {
+ return errors & SHARD_STAT_ERR;
+ }
+ bool has_read_error() const {
+ return errors & SHARD_READ_ERR;
+ }
+ bool has_data_digest_mismatch_oi() const { // Compatibility
+ return errors & DATA_DIGEST_MISMATCH_OI;
+ }
+ bool has_data_digest_mismatch_info() const {
+ return errors & DATA_DIGEST_MISMATCH_INFO;
+ }
+ bool has_omap_digest_mismatch_oi() const { // Compatibility
+ return errors & OMAP_DIGEST_MISMATCH_OI;
+ }
+ bool has_omap_digest_mismatch_info() const {
+ return errors & OMAP_DIGEST_MISMATCH_INFO;
+ }
+ bool has_size_mismatch_oi() const { // Compatibility
+ return errors & SIZE_MISMATCH_OI;
+ }
+ bool has_size_mismatch_info() const {
+ return errors & SIZE_MISMATCH_INFO;
+ }
+ bool has_ec_hash_error() const {
+ return errors & SHARD_EC_HASH_MISMATCH;
+ }
+ bool has_ec_size_error() const {
+ return errors & SHARD_EC_SIZE_MISMATCH;
+ }
+ bool has_oi_attr_missing() const { // Compatibility
+ return errors & OI_ATTR_MISSING;
+ }
+ bool has_info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool has_oi_attr_corrupted() const { // Compatibility
+ return errors & OI_ATTR_CORRUPTED;
+ }
+ bool has_info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool has_ss_attr_missing() const { // Compatibility
+ return errors & SS_ATTR_MISSING;
+ }
+ bool has_snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool has_ss_attr_corrupted() const { // Compatibility
+ return errors & SS_ATTR_CORRUPTED;
+ }
+ bool has_snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_obj_size_oi_mismatch() const { // Compatibility
+ return errors & OBJ_SIZE_OI_MISMATCH;
+ }
+ bool has_obj_size_info_mismatch() const {
+ return errors & OBJ_SIZE_INFO_MISMATCH;
+ }
+ bool has_hinfo_missing() const {
+ return errors & HINFO_MISSING;
+ }
+ bool has_hinfo_corrupted() const {
+ return errors & HINFO_CORRUPTED;
+ }
+};
+
+struct shard_info_t : err_t { // inherits the error flags and predicates of err_t and adds per-shard data
+ std::map<std::string, ceph::bufferlist> attrs; // buffers keyed by string (NOTE(review): presumably attribute name -> value - confirm)
+ uint64_t size = -1; // -1 converts to the maximum uint64_t value (NOTE(review): presumably an "unknown size" sentinel - confirm)
+ bool omap_digest_present = false;
+ uint32_t omap_digest = 0;
+ bool data_digest_present = false;
+ uint32_t data_digest = 0;
+ bool selected_oi = false;
+ bool primary = false;
+};
+
+// An (osd, shard) pair; ordered by operator< below.
+struct osd_shard_t {
+  int32_t osd;
+  int8_t shard;
+};
+
+// Lexicographic ordering: compare the osd field first and use the shard
+// field only to break ties.
+inline bool operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) {
+  if (lhs.osd != rhs.osd) {
+    return lhs.osd < rhs.osd;
+  }
+  return lhs.shard < rhs.shard;
+}
+
+struct obj_err_t {
+ enum : uint64_t {
+ OBJECT_INFO_INCONSISTENCY = 1 << 1,
+ // XXX: Can an older rados binary work if these bits stay the same?
+ DATA_DIGEST_MISMATCH = 1 << 4,
+ OMAP_DIGEST_MISMATCH = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ ATTR_VALUE_MISMATCH = 1 << 7,
+ ATTR_NAME_MISMATCH = 1 << 8,
+ SNAPSET_INCONSISTENCY = 1 << 9,
+ HINFO_INCONSISTENCY = 1 << 10,
+ SIZE_TOO_LARGE = 1 << 11,
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH
+ |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE;
+ static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH;
+ bool has_object_info_inconsistency() const {
+ return errors & OBJECT_INFO_INCONSISTENCY;
+ }
+ bool has_data_digest_mismatch() const {
+ return errors & DATA_DIGEST_MISMATCH;
+ }
+ bool has_omap_digest_mismatch() const {
+ return errors & OMAP_DIGEST_MISMATCH;
+ }
+ bool has_size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool has_attr_value_mismatch() const {
+ return errors & ATTR_VALUE_MISMATCH;
+ }
+ bool has_attr_name_mismatch() const {
+ return errors & ATTR_NAME_MISMATCH;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_snapset_inconsistency() const {
+ return errors & SNAPSET_INCONSISTENCY;
+ }
+ bool has_hinfo_inconsistency() const {
+ return errors & HINFO_INCONSISTENCY;
+ }
+ bool has_size_too_large() const {
+ return errors & SIZE_TOO_LARGE;
+ }
+};
+
+struct inconsistent_obj_t : obj_err_t {
+ inconsistent_obj_t() = default;
+ inconsistent_obj_t(const object_id_t& object)
+ : object{object}, version(0)
+ {}
+ object_id_t object;
+ uint64_t version; // XXX: Redundant with object info attr
+ std::map<osd_shard_t, shard_info_t> shards;
+ err_t union_shards;
+};
+
+struct inconsistent_snapset_t {
+ inconsistent_snapset_t() = default;
+ inconsistent_snapset_t(const object_id_t& head)
+ : object{head}
+ {}
+ enum {
+ SNAPSET_MISSING = 1 << 0,
+ SNAPSET_CORRUPTED = 1 << 1,
+ CLONE_MISSING = 1 << 2,
+ SNAP_ERROR = 1 << 3,
+ HEAD_MISMATCH = 1 << 4, // Unused
+ HEADLESS_CLONE = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ OI_MISSING = 1 << 7, // Old
+ INFO_MISSING = 1 << 7,
+ OI_CORRUPTED = 1 << 8, // Old
+ INFO_CORRUPTED = 1 << 8,
+ EXTRA_CLONES = 1 << 9,
+ };
+ uint64_t errors = 0;
+ object_id_t object;
+ // Extra clones
+ std::vector<snap_t> clones;
+ std::vector<snap_t> missing;
+ ceph::bufferlist ss_bl;
+
+ bool ss_attr_missing() const { // Compatibility
+ return errors & SNAPSET_MISSING;
+ }
+ bool snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool ss_attr_corrupted() const { // Compatibility
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool clone_missing() const {
+ return errors & CLONE_MISSING;
+ }
+ bool snapset_mismatch() const { // Compatibility
+ return errors & SNAP_ERROR;
+ }
+ bool snapset_error() const {
+ return errors & SNAP_ERROR;
+ }
+ bool head_mismatch() const { // Compatibility
+ return false;
+ }
+ bool headless() const {
+ return errors & HEADLESS_CLONE;
+ }
+ bool size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool oi_attr_missing() const { // Compatibility
+ return errors & OI_MISSING;
+ }
+ bool info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool oi_attr_corrupted() const { // Compatibility
+ return errors & OI_CORRUPTED;
+ }
+ bool info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool extra_clones() const {
+ return errors & EXTRA_CLONES;
+ }
+};
+
+/**
+ * @var all_nspaces
+ * Pass as nspace argument to IoCtx::set_namespace()
+ * before calling nobjects_begin() to iterate
+ * through all objects in all namespaces.
+ */
+const std::string all_nspaces(LIBRADOS_ALL_NSPACES);
+
+struct notify_ack_t { // C++ record for one notify acknowledgement; the payload is carried as a bufferlist
+ uint64_t notifier_id;
+ uint64_t cookie;
+ ceph::bufferlist payload_bl;
+};
+
+struct notify_timeout_t { // C++ record: same id/cookie pair as notify_ack_t, without a payload
+ uint64_t notifier_id;
+ uint64_t cookie;
+};
+}
+#endif
diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h
new file mode 100644
index 000000000..eb2d6dc4d
--- /dev/null
+++ b/src/include/rados/rgw_file.h
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * convert RGW commands to file commands
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef RADOS_RGW_FILE_H
+#define RADOS_RGW_FILE_H
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "librgw.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_FILE_VER_MAJOR 1
+#define LIBRGW_FILE_VER_MINOR 2
+#define LIBRGW_FILE_VER_EXTRA 0
+
+#define LIBRGW_FILE_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* packs maj/min/extra into one integer; each argument is parenthesized so expression arguments expand with the intended precedence */
+#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
+
+/*
+ * object types
+ */
+enum rgw_fh_type {
+ RGW_FS_TYPE_NIL = 0,
+ RGW_FS_TYPE_FILE,
+ RGW_FS_TYPE_DIRECTORY,
+ RGW_FS_TYPE_SYMBOLIC_LINK,
+};
+
+/*
+ * dynamically allocated handle to support nfs handle
+ */
+
+/* content-addressable hash */
+struct rgw_fh_hk {
+ uint64_t bucket;
+ uint64_t object;
+};
+
+struct rgw_file_handle
+{
+ /* content-addressable hash */
+ struct rgw_fh_hk fh_hk;
+ void *fh_private; /* librgw private data */
+ /* object type */
+ enum rgw_fh_type fh_type;
+};
+
+struct rgw_fs
+{
+ librgw_t rgw;
+ void *fs_private;
+ struct rgw_file_handle* root_fh;
+};
+
+
+/* XXX mount info hypothetical--emulate Unix, support at least
+ * UUID-length fsid */
+struct rgw_statvfs {
+ uint64_t f_bsize; /* file system block size */
+ uint64_t f_frsize; /* fragment size */
+ uint64_t f_blocks; /* size of fs in f_frsize units */
+ uint64_t f_bfree; /* # free blocks */
+ uint64_t f_bavail; /* # free blocks for unprivileged users */
+ uint64_t f_files; /* # inodes */
+ uint64_t f_ffree; /* # free inodes */
+ uint64_t f_favail; /* # free inodes for unprivileged users */
+ uint64_t f_fsid[2]; /* file system ID */
+ uint64_t f_flag; /* mount flags */
+ uint64_t f_namemax; /* maximum filename length */
+};
+
+
+void rgwfile_version(int *major, int *minor, int *extra);
+
+/*
+ lookup object by name (POSIX style)
+*/
+#define RGW_LOOKUP_FLAG_NONE 0x0000
+#define RGW_LOOKUP_FLAG_CREATE 0x0001
+#define RGW_LOOKUP_FLAG_RCB 0x0002 /* readdir callback hint */
+#define RGW_LOOKUP_FLAG_DIR 0x0004
+#define RGW_LOOKUP_FLAG_FILE 0x0008
+
+#define RGW_LOOKUP_TYPE_FLAGS \
+ (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE)
+
+int rgw_lookup(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *path,
+ struct rgw_file_handle **fh,
+ struct stat *st, uint32_t mask, uint32_t flags);
+
+/*
+ lookup object by handle (NFS style)
+*/
+int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ * release file handle
+ */
+#define RGW_FH_RELE_FLAG_NONE 0x0000
+
+int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ attach rgw namespace
+*/
+#define RGW_MOUNT_FLAG_NONE 0x0000
+
+int rgw_mount(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, const char *root, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+/*
+ register invalidate callbacks
+*/
+#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000
+
+typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk);
+
+int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb,
+ void *arg, uint32_t flags);
+
+/*
+ detach rgw namespace
+*/
+#define RGW_UMOUNT_FLAG_NONE 0x0000
+
+int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags);
+
+
+/*
+ get filesystem attributes
+*/
+#define RGW_STATFS_FLAG_NONE 0x0000
+
+int rgw_statfs(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ struct rgw_statvfs *vfs_st,
+ uint32_t flags);
+
+
+/* XXX (get|set)attr mask bits */
+#define RGW_SETATTR_MODE 1
+#define RGW_SETATTR_UID 2
+#define RGW_SETATTR_GID 4
+#define RGW_SETATTR_MTIME 8
+#define RGW_SETATTR_ATIME 16
+#define RGW_SETATTR_SIZE 32
+#define RGW_SETATTR_CTIME 64
+
+/*
+ create file
+*/
+#define RGW_CREATE_FLAG_NONE 0x0000
+
+int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a symbolic link
+ */
+#define RGW_CREATELINK_FLAG_NONE 0x0000
+int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, const char *link_path, struct stat *st,
+ uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a new directory
+*/
+#define RGW_MKDIR_FLAG_NONE 0x0000
+
+int rgw_mkdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ rename object
+*/
+#define RGW_RENAME_FLAG_NONE 0x0000
+
+int rgw_rename(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *olddir, const char* old_name,
+ struct rgw_file_handle *newdir, const char* new_name,
+ uint32_t flags);
+
+/*
+ remove file or directory
+*/
+#define RGW_UNLINK_FLAG_NONE 0x0000
+
+int rgw_unlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char* path,
+ uint32_t flags);
+
+/*
+ read directory content
+*/
+typedef bool (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset,
+ struct stat *st, uint32_t mask,
+ uint32_t flags);
+
+#define RGW_READDIR_FLAG_NONE 0x0000
+#define RGW_READDIR_FLAG_DOTDOT 0x0001 /* send dot names */
+
+int rgw_readdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, uint64_t *offset,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *name,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* project offset of dirent name */
+#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
+
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, int64_t *offset,
+ uint32_t flags);
+
+/*
+ get unix attributes for object
+*/
+#define RGW_GETATTR_FLAG_NONE 0x0000
+
+int rgw_getattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t flags);
+
+/*
+ set unix attributes for object
+*/
+#define RGW_SETATTR_FLAG_NONE 0x0000
+
+int rgw_setattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t mask, uint32_t flags);
+
+/*
+ truncate file
+*/
+#define RGW_TRUNCATE_FLAG_NONE 0x0000
+
+int rgw_truncate(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t size,
+ uint32_t flags);
+
+/*
+ open file
+*/
+#define RGW_OPEN_FLAG_NONE 0x0000
+#define RGW_OPEN_FLAG_CREATE 0x0001
+#define RGW_OPEN_FLAG_V3 0x0002 /* ops have v3 semantics */
+#define RGW_OPEN_FLAG_STATELESS 0x0002 /* alias it */
+
+int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ uint32_t posix_flags, uint32_t flags);
+
+/*
+ close file
+*/
+
+#define RGW_CLOSE_FLAG_NONE 0x0000
+#define RGW_CLOSE_FLAG_RELE 0x0001
+
+int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ read data from file
+*/
+#define RGW_READ_FLAG_NONE 0x0000
+
+int rgw_read(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ read symbolic link
+*/
+#define RGW_READLINK_FLAG_NONE 0x0000
+
+int rgw_readlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ write data to file
+*/
+#define RGW_WRITE_FLAG_NONE 0x0000
+
+int rgw_write(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_written, void *buffer,
+ uint32_t flags);
+
+#define RGW_UIO_NONE 0x0000
+#define RGW_UIO_GIFT 0x0001
+#define RGW_UIO_FREE 0x0002
+#define RGW_UIO_BUFQ 0x0004
+
+struct rgw_uio;
+typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t);
+
+/* buffer vector descriptors */
+struct rgw_vio {
+ void *vio_p1;
+ void *vio_u1;
+ void *vio_base;
+ int32_t vio_len;
+};
+
+struct rgw_uio {
+ rgw_uio_release uio_rele;
+ void *uio_p1;
+ void *uio_u1;
+ uint64_t uio_offset;
+ uint64_t uio_resid;
+ uint32_t uio_cnt;
+ uint32_t uio_flags;
+ struct rgw_vio *uio_vio; /* appended vectors */
+};
+
+typedef struct rgw_uio rgw_uio;
+
+int rgw_readv(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+int rgw_writev(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+/*
+ sync written data
+*/
+#define RGW_FSYNC_FLAG_NONE 0x0000
+
+int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ NFS commit operation
+*/
+
+#define RGW_COMMIT_FLAG_NONE 0x0000
+
+int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint64_t offset, uint64_t length, uint32_t flags);
+
+/*
+ extended attributes
+ */
+typedef struct rgw_xattrstr
+{
+ char *val;
+ uint32_t len;
+} rgw_xattrstr;
+
+typedef struct rgw_xattr
+{
+ rgw_xattrstr key;
+ rgw_xattrstr val;
+} rgw_xattr;
+
+typedef struct rgw_xattrlist
+{
+ rgw_xattr *xattrs;
+ uint32_t xattr_cnt;
+} rgw_xattrlist;
+
+#define RGW_GETXATTR_FLAG_NONE 0x0000
+
+typedef int (*rgw_getxattr_cb)(rgw_xattrlist *attrs, void *arg,
+ uint32_t flags);
+
+int rgw_getxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, rgw_getxattr_cb cb, void *cb_arg,
+ uint32_t flags);
+
+#define RGW_LSXATTR_FLAG_NONE 0x0000
+#define RGW_LSXATTR_FLAG_STOP 0x0001
+
+int rgw_lsxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrstr *filter_prefix /* unimplemented for now */,
+ rgw_getxattr_cb cb, void *cb_arg, uint32_t flags);
+
+#define RGW_SETXATTR_FLAG_NONE 0x0000
+
+int rgw_setxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags);
+
+#define RGW_RMXATTR_FLAG_NONE 0x0000
+
+int rgw_rmxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RADOS_RGW_FILE_H */
diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h
new file mode 100644
index 000000000..7eb33596c
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.h
@@ -0,0 +1,610 @@
+#ifndef CEPH_LIBRADOSSTRIPER_H
+#define CEPH_LIBRADOSSTRIPER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include "../rados/librados.h"
+
+#define LIBRADOSSTRIPER_VER_MAJOR 0
+#define LIBRADOSSTRIPER_VER_MINOR 0
+#define LIBRADOSSTRIPER_VER_EXTRA 0
+
+#define LIBRADOSSTRIPER_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* packs maj/min/extra into one integer; each argument is parenthesized so expression arguments expand with the intended precedence */
+
+#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA)
+
+/**
+ * @typedef rados_striper_t
+ *
+ * A handle for interacting with striped objects in a RADOS cluster.
+ */
+typedef void *rados_striper_t;
+
+/**
+ * @defgroup libradosstriper_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libradosstriper.
+ *
+ * @{
+ */
+
+/**
+ * Creates a rados striper using the given io context
+ * Striper has initially default object layout.
+ * See rados_striper_set_object_layout_*() to change this
+ *
+ * @param ioctx the rados context to use
+ * @param striper where to store the rados striper
+ * @returns 0 on success, negative error code on failure
+ */
+ int rados_striper_create(rados_ioctx_t ioctx,
+ rados_striper_t *striper);
+
+/**
+ * Destroys a rados striper
+ *
+ * @param striper the striper to destroy
+ */
+void rados_striper_destroy(rados_striper_t striper);
+
+/**
+ * Sets the object layout's stripe unit of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_unit the stripe_unit value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper,
+ unsigned int stripe_unit);
+
+/**
+ * Sets the object layout's stripe count of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_count the stripe_count value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_count(rados_striper_t striper,
+ unsigned int stripe_count);
+
+/**
+ * Sets the object layout's object_size of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param object_size the object_size value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_object_size(rados_striper_t striper,
+ unsigned int object_size);
+
+/** @} init */
+
+/**
+ * @defgroup libradosstriper_h_synch_io Synchronous I/O
+ * Writes are striped to several rados objects which are then
+ * replicated to a number of OSDs based on the configuration
+ * of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_striper_ioctx_wait_for_complete().
+ *
+ * @{
+ */
+
+/**
+ * Synchronously write data to a striped object at the specified offset
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_write(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously write an entire striped object
+ *
+ * The striped object is filled with the provided data. If the striped object exists,
+ * it is truncated and then written.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_write_full(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Append data to a striped object
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_append(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Synchronously read data from a striped object at the specified offset
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+int rados_striper_read(rados_striper_t striper,
+ const char *soid,
+ char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_remove(rados_striper_t striper,
+ const char* soid);
+
+/**
+ * Resize a striped object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @note the truncation is not fully atomic. The metadata part is,
+ * so the behavior will be atomic from user point of view when
+ * the object size is reduced. However, in case of failure, old data
+ * may stay around, hidden. They may reappear if the object size is
+ * later grown, instead of the expected 0s. When growing the
+ * object and in case of failure, the new 0 data may not be
+ * fully created. This can lead to ENOENT errors when
+ * writing/reading the missing parts.
+ * @note the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ * @param striper the striper in which the truncation will occur
+ * @param soid the name of the striped object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_trunc(rados_striper_t striper, const char *soid, uint64_t size);
+
+/** @} Synchronous I/O */
+
+/**
+ * @defgroup libradosstriper_h_xattrs Xattrs
+ * Extended attributes are stored as extended attributes on the
+ * first rados regular object of the striped object.
+ * Thus, they have the same limitations as the underlying
+ * rados extended attributes.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the getxattr will occur
+ * @param oid name of the striped object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+int rados_striper_getxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ char *buf,
+ size_t len);
+
+/**
+ * Set an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the setxattr will occur
+ * @param oid name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_setxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from a striped object.
+ *
+ * @param striper the striper in which the rmxattr will occur
+ * @param oid name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_rmxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on a striped object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param striper the striper in which the getxattrs will occur
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs(rados_striper_t striper,
+ const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the striped object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name,
+ const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+void rados_striper_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Synchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_stat(rados_striper_t striper,
+ const char* soid,
+ uint64_t *psize,
+ time_t *pmtime);
+
+/**
+ * @defgroup libradosstriper_h_asynch_io Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_striper_multi_completion_t
+ * Represents the state of a set of asynchronous operations
+ * it contains the aggregated return value once the operations complete
+ * and can be used to block until all operations are complete and/or safe.
+ */
+typedef void *rados_striper_multi_completion_t;
+
+/**
+ * Constructs a multi completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+int rados_striper_multi_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_striper_multi_completion_t *pc);
+
+/**
+ * Block until all operations complete
+ *
+ * This means data is in memory on all replicas.
+ *
+ * @param c operations to wait for
+ * @note this function is declared void and returns no value
+ */
+void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * @note this function is declared void and returns no value
+ */
+void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations complete and callback completes
+ *
+ * This means data is in memory on all replicas and can be read.
+ *
+ * @param c operations to wait for
+ * @note this function is declared void and returns no value
+ */
+void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe and callback has completed
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * @note this function is declared void and returns no value
+ */
+void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation and callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe and has the callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Get the return value of a multi asynchronous operation
+ *
+ * The return value is set when all operations are complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operations to inspect
+ * @returns aggregated return value of the operations
+ */
+int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c);
+
+/**
+ * Release a multi asynchronous IO completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c multi completion to release
+ */
+void rados_striper_multi_aio_release(rados_striper_multi_completion_t c);
+
+/**
+ * Asynchronously write data to a striped object at the specified offset
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously appends data to a striped object
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_append(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously fills an object with the provided data.
+ * If the object exists, it is truncated and then written.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write_full(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously read data from a striped object at the specified offset
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the read is safe and complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_read(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ char *buf,
+ const size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, negative error code on failure
+ */
+
+int rados_striper_aio_remove(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion);
+
+/**
+ * Block until all pending writes in a striper are safe
+ *
+ * This is not equivalent to calling rados_striper_multi_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @param striper the striper in which the flush will occur
+ * @note this function is declared void and returns no value
+*/
+void rados_striper_aio_flush(rados_striper_t striper);
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param completion what to do when the stats is complete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_stat(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion,
+ uint64_t *psize,
+ time_t *pmtime);
+
+/** @} Asynchronous I/O */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp
new file mode 100644
index 000000000..fb790b0d7
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.hpp
@@ -0,0 +1,241 @@
+#ifndef __LIBRADOSSTRIPER_HPP
+#define __LIBRADOSSTRIPER_HPP
+
+#include <string.h>
+#include <string>
+#include <map>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+
+#include "libradosstriper.h"
+
+namespace libradosstriper
+{
+ struct RadosStriperImpl;
+ struct MultiAioCompletionImpl;
+
+ /*
+ * Completion object for multiple asynchronous IO.
+ * It allows several "requests" to be handled internally.
+ *
+ * Every *_safe* member below is marked deprecated and is declared
+ * right after a non-deprecated *_complete* counterpart.
+ */
+ struct MultiAioCompletion {
+ // wraps the given implementation pointer (stored in pc)
+ MultiAioCompletion(MultiAioCompletionImpl *pc_) : pc(pc_) {}
+ ~MultiAioCompletion();
+ // callback registration
+ int set_complete_callback(void *cb_arg, librados::callback_t cb);
+ int set_safe_callback(void *cb_arg, librados::callback_t cb) __attribute__ ((deprecated));
+ // blocking waits
+ void wait_for_complete();
+ void wait_for_safe() __attribute__ ((deprecated));
+ void wait_for_complete_and_cb();
+ void wait_for_safe_and_cb() __attribute__ ((deprecated));
+ // state queries
+ bool is_complete();
+ bool is_safe() __attribute__ ((deprecated));
+ bool is_complete_and_cb();
+ bool is_safe_and_cb() __attribute__ ((deprecated));
+ int get_return_value();
+ void release();
+ // implementation object; defined outside this header
+ MultiAioCompletionImpl *pc;
+ };
+
+ /* RadosStriper : This class allows to perform read/writes on striped objects
+ *
+ * Typical use (error checking omitted):
+ *
+ * RadosStriper rs;
+ * RadosStriper::striper_create(ioctx, &rs); // ioctx is a librados::IoCtx
+ * bufferlist bl;
+ * ... put data in bl ...
+ * rs.write(object_name, bl, len, offset);
+ * bufferlist bl2;
+ * rs.read(object_name, &bl2, len, offset);
+ * ...
+ */
+ class RadosStriper
+ {
+ public:
+
+ /*
+ * constructor
+ */
+ RadosStriper();
+
+ /*
+ * builds the C counterpart of a RadosStriper
+ */
+ static void to_rados_striper_t(RadosStriper &striper,
+ rados_striper_t *s);
+
+ /*
+ * copy constructor
+ */
+ RadosStriper(const RadosStriper& rs);
+
+ /*
+ * operator=
+ */
+ RadosStriper& operator=(const RadosStriper& rs);
+
+ /*
+ * destructor
+ * Internally calling close() if an object is currently opened
+ */
+ ~RadosStriper();
+
+ /*
+ * create method
+ */
+ static int striper_create(librados::IoCtx& ioctx,
+ RadosStriper *striper);
+
+ /*
+ * set object layout's stripe unit
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_unit(unsigned int stripe_unit);
+
+ /*
+ * set object layout's stripe count
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_count(unsigned int stripe_count);
+
+ /*
+ * set object layout's object size
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_object_size(unsigned int object_size);
+
+ /**
+ * Get the value of an extended attribute on a striped object
+ */
+ int getxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Set the value of an extended attribute on a striped object
+ */
+ int setxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Delete an extended attribute from a striped object
+ */
+ int rmxattr(const std::string& oid, const char *name);
+
+ /**
+ * Start iterating over xattrs on a striped object.
+ */
+ int getxattrs(const std::string& oid,
+ std::map<std::string, ceph::bufferlist>& attrset);
+
+ /**
+ * synchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write(const std::string& soid, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * synchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write_full(const std::string& soid, const ceph::bufferlist& bl);
+
+ /**
+ * synchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int append(const std::string& soid, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * asynchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl);
+
+ /**
+ * asynchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * synchronously read from the striped object at the specified offset.
+ */
+ int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously read from the striped object at the specified offset.
+ */
+ int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off);
+
+ /**
+ * synchronously get striped object stats (size/mtime)
+ */
+ int stat(const std::string& soid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * asynchronously get striped object stats (size/mtime)
+ */
+ int aio_stat(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, struct timespec *pts);
+
+ /**
+ * deletes a striped object.
+ * There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ */
+ int remove(const std::string& soid);
+ int remove(const std::string& soid, int flags);
+
+ /**
+ * asynchronous remove of striped objects
+ * See synchronous version for comments on (lack of) atomicity
+ */
+ int aio_remove(const std::string& soid, librados::AioCompletion *c);
+ int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags);
+
+ /**
+ * Resizes a striped object
+ * the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ */
+ int trunc(const std::string& oid, uint64_t size);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * creation of multi aio completion objects
+ */
+ static MultiAioCompletion *multi_aio_create_completion();
+ static MultiAioCompletion *multi_aio_create_completion(void *cb_arg,
+ librados::callback_t cb_complete,
+ librados::callback_t cb_safe);
+
+ private:
+ RadosStriperImpl *rados_striper_impl;
+
+ };
+
+}
+
+#endif
diff --git a/src/include/random.h b/src/include/random.h
new file mode 100644
index 000000000..f2e3e37bc
--- /dev/null
+++ b/src/include/random.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+*/
+
+#ifndef CEPH_RANDOM_H
+#define CEPH_RANDOM_H 1
+
+#include <limits>
+#include <mutex>
+#include <random>
+#include <type_traits>
+#include <boost/optional.hpp>
+
+// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85494
+#ifdef __MINGW32__
+#include <boost/random/random_device.hpp>
+
+using random_device_t = boost::random::random_device;
+#else
+using random_device_t = std::random_device;
+#endif
+
+// Basic random number facility (see N3551 for inspiration):
+namespace ceph::util {
+
+inline namespace version_1_0_3 {
+
+namespace detail {
+
+// Whichever of T0/T1 occupies more storage; T0 wins a tie.
+template <typename T0, typename T1>
+using larger_of = std::conditional_t<(sizeof(T0) >= sizeof(T1)), T0, T1>;
+
+// Holds when both types are floating point or both are integral, so
+// that floating point and integer arguments are never mixed:
+template <typename NumberT0, typename NumberT1>
+using has_compatible_numeric_types =
+  std::disjunction<
+    std::conjunction<std::is_floating_point<NumberT0>,
+                     std::is_floating_point<NumberT1>>,
+    std::conjunction<std::is_integral<NumberT0>,
+                     std::is_integral<NumberT1>>>;
+
+// The larger of two compatible numeric types; drops out of overload
+// resolution (via enable_if) when the two types are not compatible:
+template <typename NumberT0, typename NumberT1>
+using select_number_t =
+  std::enable_if_t<
+    detail::has_compatible_numeric_types<NumberT0, NumberT1>::value,
+    detail::larger_of<NumberT0, NumberT1>>;
+
+} // namespace detail
+
+namespace detail {
+
+// Maps a number type to its default distribution: the uniform integer
+// distribution for integral types, the uniform real one otherwise.
+template <typename NumberT, bool IsIntegral>
+struct select_distribution;
+
+template <typename NumberT>
+struct select_distribution<NumberT, true>
+{
+  using type = std::uniform_int_distribution<NumberT>;
+};
+
+template <typename NumberT>
+struct select_distribution<NumberT, false>
+{
+  using type = std::uniform_real_distribution<NumberT>;
+};
+
+template <typename NumberT>
+using default_distribution =
+  typename select_distribution<NumberT, std::is_integral_v<NumberT>>::type;
+
+} // namespace detail
+
+namespace detail {
+
+template <typename EngineT>
+EngineT& engine();
+
+// Seed a caller-owned engine with an explicit value, holding m meanwhile.
+template <typename MutexT, typename EngineT,
+          typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT seed, MutexT& m, EngineT& e)
+{
+  std::lock_guard<MutexT> guard(m);
+  e.seed(seed);
+}
+
+// Seed a caller-owned engine from the random device, holding m meanwhile.
+template <typename MutexT, typename EngineT>
+void randomize_rng(MutexT& m, EngineT& e)
+{
+  random_device_t device;
+
+  std::lock_guard<MutexT> guard(m);
+  e.seed(device());
+}
+
+// Seed the calling thread's engine with an explicit value.
+template <typename EngineT = std::default_random_engine,
+          typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT n)
+{
+  detail::engine<EngineT>().seed(n);
+}
+
+// Seed the calling thread's engine from the random device.
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+  random_device_t device;
+  detail::engine<EngineT>().seed(device());
+}
+
+// Per-thread engine of the requested type, created and seeded on first use.
+template <typename EngineT>
+EngineT& engine()
+{
+  thread_local boost::optional<EngineT> rng_engine;
+
+  if (rng_engine) {
+    return *rng_engine;
+  }
+
+  // The optional is populated before seeding because randomize_rng()
+  // calls back into engine<EngineT>() to reach the new instance.
+  rng_engine.emplace(EngineT());
+  randomize_rng<EngineT>();
+
+  return *rng_engine;
+}
+
+} // namespace detail
+
+namespace detail {
+
+// Draw one value between min and max from a caller-owned engine (no locking).
+template <typename NumberT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+                               EngineT& e)
+{
+  using param_type = typename DistributionT::param_type;
+
+  DistributionT dist { min, max };
+  return dist(e, param_type { min, max });
+}
+
+// As above, but the engine is only touched while m is held.
+template <typename NumberT,
+          typename MutexT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+                               MutexT& m, EngineT& e)
+{
+  using param_type = typename DistributionT::param_type;
+
+  DistributionT dist { min, max };
+
+  std::lock_guard<MutexT> guard(m);
+  return dist(e, param_type { min, max });
+}
+
+// Draw one value between min and max from the calling thread's engine.
+template <typename NumberT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max)
+{
+  EngineT& thread_engine = detail::engine<EngineT>();
+  return detail::generate_random_number<NumberT, DistributionT, EngineT>
+    (min, max, thread_engine);
+}
+
+// Draw one value between 0 and the largest NumberT, under lock.
+template <typename MutexT,
+          typename EngineT,
+          typename NumberT = int,
+          typename DistributionT = detail::default_distribution<NumberT>>
+NumberT generate_random_number(MutexT& m, EngineT& e)
+{
+  const NumberT upper = std::numeric_limits<NumberT>::max();
+  return detail::generate_random_number<NumberT, MutexT, DistributionT, EngineT>
+    (0, upper, m, e);
+}
+
+// Draw one value between 0 and max, under lock.
+template <typename NumberT, typename MutexT, typename EngineT>
+NumberT generate_random_number(const NumberT max, MutexT& m, EngineT& e)
+{
+  return generate_random_number<NumberT>(0, max, m, e);
+}
+
+} // namespace detail
+
+// Re-seed the calling thread's engine of type EngineT; forwards to
+// detail::randomize_rng<EngineT>(), which seeds from the random device.
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+ detail::randomize_rng<EngineT>();
+}
+
+// Generate a number between 0 and std::numeric_limits<NumberT>::max()
+// using the calling thread's engine. NumberT defaults to int.
+template <typename NumberT = int,
+ typename DistributionT = detail::default_distribution<NumberT>,
+ typename EngineT = std::default_random_engine>
+NumberT generate_random_number()
+{
+ return detail::generate_random_number<NumberT, DistributionT, EngineT>
+ (0, std::numeric_limits<NumberT>::max());
+}
+
+// Generate a number between min and max using the calling thread's
+// std::default_random_engine. min and max may have different types as
+// long as both are integral or both are floating point; both are cast
+// to the larger of the two types, which is also the result type.
+template <typename NumberT0, typename NumberT1,
+ typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+ >
+NumberT generate_random_number(const NumberT0 min, const NumberT1 max)
+{
+ return detail::generate_random_number<NumberT,
+ detail::default_distribution<NumberT>,
+ std::default_random_engine>
+ (static_cast<NumberT>(min), static_cast<NumberT>(max));
+}
+
+// Generate a number between min and max from a caller-supplied engine e
+// with an explicitly chosen distribution.
+// NOTE(review): the parameters are declared as NumberT rather than
+// NumberT0/NumberT1, so NumberT0 and NumberT1 cannot be deduced from the
+// call and all leading template arguments must be spelled out; the
+// static_casts below are then no-ops. Confirm whether the parameters
+// were meant to be (const NumberT0 min, const NumberT1 max).
+template <typename NumberT0, typename NumberT1,
+ typename DistributionT,
+ typename EngineT,
+ typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+ >
+NumberT generate_random_number(const NumberT min, const NumberT max,
+ EngineT& e)
+{
+ return detail::generate_random_number<NumberT,
+ DistributionT,
+ EngineT>(static_cast<NumberT>(min), static_cast<NumberT>(max), e);
+}
+
+// Generate a number between 0 and max; forwards to the two-argument
+// (min, max) form with min = 0.
+template <typename NumberT>
+NumberT generate_random_number(const NumberT max)
+{
+ return generate_random_number<NumberT>(0, max);
+}
+
+// Function object:
+//
+// Owns a mutex, a random device and a std::default_random_engine. Every
+// draw and every re-seed takes the mutex, so one instance can be shared.
+// Copying is disabled; a moved-to generator takes over the source's
+// engine state and gets a fresh mutex and random device.
+template <typename NumberT>
+class random_number_generator final
+{
+  std::mutex l;
+  random_device_t rd;
+  std::default_random_engine e;
+
+  using seed_type = typename decltype(e)::result_type;
+
+ public:
+  using number_type = NumberT;
+  using random_engine_type = decltype(e);
+  using random_device_type = decltype(rd);
+
+ public:
+  random_device_type& random_device() noexcept { return rd; }
+  random_engine_type& random_engine() noexcept { return e; }
+
+ public:
+  // Seed the engine from the random device.
+  random_number_generator() {
+    detail::randomize_rng(l, e);
+  }
+
+  // Seed the engine with a fixed value (reproducible sequences).
+  explicit random_number_generator(const seed_type seed) {
+    detail::randomize_rng(seed, l, e);
+  }
+
+  random_number_generator(random_number_generator&& rhs)
+    : e(std::move(rhs.e))
+  {}
+
+ public:
+  random_number_generator(const random_number_generator&) = delete;
+  random_number_generator& operator=(const random_number_generator&) = delete;
+
+ public:
+  // Draw between 0 and the largest NumberT.
+  //
+  // This used to call detail::generate_random_number(l, e), whose
+  // number type defaults to int, so the class's own NumberT was ignored
+  // and e.g. a 64-bit generator could never exceed INT_MAX. Going
+  // through the (min, max) form keeps NumberT in charge of the range,
+  // matching the free function generate_random_number<NumberT>().
+  NumberT operator()() {
+    return (*this)(static_cast<NumberT>(0),
+                   std::numeric_limits<NumberT>::max());
+  }
+
+  // Draw between 0 and max.
+  NumberT operator()(const NumberT max) {
+    return detail::generate_random_number<NumberT>(max, l, e);
+  }
+
+  // Draw between min and max.
+  NumberT operator()(const NumberT min, const NumberT max) {
+    return detail::generate_random_number<NumberT>(min, max, l, e);
+  }
+
+ public:
+  // Re-seed the engine with a fixed value.
+  void seed(const seed_type n) {
+    detail::randomize_rng(n, l, e);
+  }
+};
+
+// Deduction guide: lets `random_number_generator rng(x);` deduce NumberT
+// from the type of x.
+// NOTE(review): the only single-argument constructor of the class takes
+// a seed, so the value named `max` here ends up as the seed - confirm
+// that this is the intent.
+template <typename NumberT>
+random_number_generator(const NumberT max) -> random_number_generator<NumberT>;
+
+} // inline namespace version_*
+
+} // namespace ceph::util
+
+#endif
diff --git a/src/include/rangeset.h b/src/include/rangeset.h
new file mode 100644
index 000000000..e7e3d047c
--- /dev/null
+++ b/src/include/rangeset.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_RANGESET_H
+#define CEPH_RANGESET_H
+
+/*
+ *
+ * my first container with iterator! it's pretty ugly.
+ *
+ */
+
+#include <map>
+
+//typedef int T;
+
+// Storage for a set of disjoint inclusive ranges, keyed by range start.
+template <class T>
+struct _rangeset_base {
+  std::map<T,T> ranges;  // pair(first,last) (inclusive, e.g. [first,last])
+
+  typedef typename std::map<T,T>::iterator mapit;
+
+  // get iterator for range including val. or ranges.end().
+  //
+  // The only range that can contain val is the last one starting at or
+  // before val, i.e. the entry just before upper_bound(val). The old
+  // lower_bound-based code never stepped back to it (a stray
+  // "return it--;"), so a value inside a range that was followed by
+  // another range was reported as missing.
+  mapit get_range_for(T val) {
+    mapit it = ranges.upper_bound(val);
+    if (it == ranges.begin())
+      return ranges.end();  // empty, or every range starts after val
+    --it;
+    if (it->second >= val)
+      return it;
+    return ranges.end();
+  }
+
+};
+
+
+// Input iterator that walks every individual value covered by a map of
+// inclusive [first,last] ranges.
+//
+// The iterator refers to the map it was created from; it does not copy
+// it. (It used to keep a private copy and compare an iterator of the
+// original map against the copy's end(), which never matches.)
+template <class T>
+class rangeset_iterator :
+  public std::iterator<std::input_iterator_tag, T>
+{
+  typedef std::map<T,T> map_type;
+
+  map_type *ranges = nullptr;        // map being walked; null when default-constructed
+  typename map_type::iterator it;    // range currently being walked
+  T current = T();                   // value inside *it; meaningless at end
+
+  bool at_end() const {
+    return !ranges || it == ranges->end();
+  }
+
+public:
+  // cons
+  rangeset_iterator() {}
+
+  rangeset_iterator(typename map_type::iterator& start, map_type& r)
+    : ranges(&r), it(start) {
+    if (it != r.end())
+      current = it->first;
+  }
+
+  // Two end iterators are equal whatever stale value `current` holds;
+  // otherwise both the range and the position inside it must match.
+  bool operator==(rangeset_iterator<T> rit) const {
+    const bool lhs_end = at_end();
+    const bool rhs_end = rit.at_end();
+    if (lhs_end || rhs_end)
+      return lhs_end && rhs_end;
+    return it == rit.it && current == rit.current;
+  }
+  bool operator!=(rangeset_iterator<T> rit) const {
+    return !(*this == rit);
+  }
+
+  T& operator*() {
+    return current;
+  }
+
+  // pre-increment: step to the next value, moving on to the next range
+  // once the current one is exhausted. A no-op at end.
+  rangeset_iterator<T>& operator++() {
+    if (at_end())
+      return *this;
+    if (current < it->second) {
+      ++current;
+    } else {
+      ++it;
+      if (it != ranges->end())
+        current = it->first;
+    }
+    return *this;
+  }
+
+  // post-increment: advances, and returns the position held before.
+  rangeset_iterator<T> operator++(int) {
+    rangeset_iterator<T> previous(*this);
+    ++*this;
+    return previous;
+  }
+};
+
+
+// A set of values stored compactly as disjoint inclusive ranges.
+// insert()/erase() work on single values and merge or split ranges as
+// needed; map_*() expose the underlying first->last map.
+template <class T>
+class rangeset
+{
+  typedef typename std::map<T,T>::iterator map_iterator;
+
+  _rangeset_base<T> theset;
+  inodeno_t _size;  // number of individual values held (not number of ranges)
+                    // NOTE(review): inodeno_t is declared elsewhere; confirm
+                    // a project type is wanted in this generic template
+
+public:
+  rangeset() { _size = 0; }
+  typedef rangeset_iterator<T> iterator;
+
+  // iteration over individual values
+  iterator begin() {
+    map_iterator it = theset.ranges.begin();
+    return iterator(it, theset.ranges);
+  }
+
+  iterator end() {
+    map_iterator it = theset.ranges.end();
+    return iterator(it, theset.ranges);
+  }
+
+  // access to the underlying ranges
+  map_iterator map_begin() {
+    return theset.ranges.begin();
+  }
+  map_iterator map_end() {
+    return theset.ranges.end();
+  }
+  int map_size() {
+    return theset.ranges.size();
+  }
+
+  // add the inclusive range [v1,v2] as-is: no overlap check, no merging
+  void map_insert(T v1, T v2) {
+    theset.ranges.insert(std::pair<T,T>(v1,v2));
+    _size += v2 - v1+1;
+  }
+
+
+  // ...
+  bool contains(T val) {
+    if (theset.get_range_for(val) == theset.ranges.end()) return false;
+    ceph_assert(!empty());
+    return true;
+  }
+
+  // add a single value that must not be present yet
+  void insert(T val) {
+    ceph_assert(!contains(val));
+
+    map_iterator left = theset.get_range_for(val-1);
+    map_iterator right = theset.get_range_for(val+1);
+    const bool has_left = left != theset.ranges.end();
+    const bool has_right = right != theset.ranges.end();
+
+    if (has_left && has_right) {
+      // join!
+      left->second = right->second;
+      theset.ranges.erase(right);
+    } else if (has_left) {
+      // add to left range
+      left->second = val;
+    } else if (has_right) {
+      // add to right range: the key changes, so re-insert under val
+      const T last = right->second;
+      theset.ranges.erase(right);
+      theset.ranges.insert(std::pair<T,T>(val, last));
+    } else {
+      // new range
+      theset.ranges.insert(std::pair<T,T>(val,val));
+    }
+    _size++;
+  }
+
+  // number of individual values held
+  // (this used to call itself and never return)
+  unsigned size() {
+    return _size;
+  }
+
+  bool empty() {
+    if (theset.ranges.empty()) {
+      ceph_assert(_size == 0);
+      return true;
+    }
+    ceph_assert(_size>0);
+    return false;
+  }
+
+
+  // smallest value held; the set must not be empty
+  T first() {
+    ceph_assert(!empty());
+    map_iterator it = theset.ranges.begin();
+    return it->first;
+  }
+
+  // remove a single value that must be present
+  void erase(T val) {
+    ceph_assert(contains(val));
+    map_iterator it = theset.get_range_for(val);
+    ceph_assert(it != theset.ranges.end());
+
+    const T range_first = it->first;
+    const T range_last = it->second;
+
+    if (val == range_first && val == range_last) {
+      // entire range
+      theset.ranges.erase(it);
+    } else if (val == range_first) {
+      // beginning: the key changes, so re-insert under val+1
+      theset.ranges.erase(it);
+      theset.ranges.insert(std::pair<T,T>(val+1, range_last));
+    } else if (val == range_last) {
+      // end
+      it->second = val-1;
+    } else {
+      // middle split: shrink the existing node to the left half and add
+      // the right half. (Inserting the left half under the existing key
+      // was a no-op, and the node was then erased, losing that half.)
+      it->second = val-1;
+      theset.ranges.insert(std::pair<T,T>(val+1, range_last));
+    }
+    _size--;
+  }
+
+  // print every range as " first-last", one per line
+  void dump() {
+    for (map_iterator it = theset.ranges.begin();
+         it != theset.ranges.end();
+         ++it) {
+      std::cout << " " << it->first << "-" << it->second << std::endl;
+    }
+  }
+
+};
+
+
+#endif
diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h
new file mode 100644
index 000000000..31c73b38f
--- /dev/null
+++ b/src/include/rbd/features.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_FEATURES_H
+#define CEPH_RBD_FEATURES_H
+
+#define RBD_FEATURE_LAYERING (1ULL<<0)
+#define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
+#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP (1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF (1ULL<<4)
+#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5)
+#define RBD_FEATURE_JOURNALING (1ULL<<6)
+#define RBD_FEATURE_DATA_POOL (1ULL<<7)
+#define RBD_FEATURE_OPERATIONS (1ULL<<8)
+#define RBD_FEATURE_MIGRATING (1ULL<<9)
+#define RBD_FEATURE_NON_PRIMARY (1ULL<<10)
+#define RBD_FEATURE_DIRTY_CACHE (1ULL<<11)
+
+#define RBD_FEATURES_DEFAULT (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN)
+
+#define RBD_FEATURE_NAME_LAYERING "layering"
+#define RBD_FEATURE_NAME_STRIPINGV2 "striping"
+#define RBD_FEATURE_NAME_EXCLUSIVE_LOCK "exclusive-lock"
+#define RBD_FEATURE_NAME_OBJECT_MAP "object-map"
+#define RBD_FEATURE_NAME_FAST_DIFF "fast-diff"
+#define RBD_FEATURE_NAME_DEEP_FLATTEN "deep-flatten"
+#define RBD_FEATURE_NAME_JOURNALING "journaling"
+#define RBD_FEATURE_NAME_DATA_POOL "data-pool"
+#define RBD_FEATURE_NAME_OPERATIONS "operations"
+#define RBD_FEATURE_NAME_MIGRATING "migrating"
+#define RBD_FEATURE_NAME_NON_PRIMARY "non-primary"
+#define RBD_FEATURE_NAME_DIRTY_CACHE "dirty-cache"
+
+/// features that make an image inaccessible for read or write by
+/// clients that don't understand them
+#define RBD_FEATURES_INCOMPATIBLE (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that make an image unwritable by clients that don't understand them
+#define RBD_FEATURES_RW_INCOMPATIBLE (RBD_FEATURES_INCOMPATIBLE | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING | \
+ RBD_FEATURE_NON_PRIMARY)
+
+#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING | \
+ RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that may be dynamically enabled or disabled
+#define RBD_FEATURES_MUTABLE (RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+#define RBD_FEATURES_MUTABLE_INTERNAL (RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that may be dynamically disabled
+#define RBD_FEATURES_DISABLE_ONLY (RBD_FEATURE_DEEP_FLATTEN)
+
+/// features that only work when used with a single client
+/// using the image for writes
+#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that will be implicitly enabled
+#define RBD_FEATURES_IMPLICIT_ENABLE (RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING | \
+ RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that cannot be controlled by the user
+#define RBD_FEATURES_INTERNAL (RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING)
+
+#define RBD_OPERATION_FEATURE_CLONE_PARENT (1ULL<<0)
+#define RBD_OPERATION_FEATURE_CLONE_CHILD (1ULL<<1)
+#define RBD_OPERATION_FEATURE_GROUP (1ULL<<2)
+#define RBD_OPERATION_FEATURE_SNAP_TRASH (1ULL<<3)
+
+#define RBD_OPERATION_FEATURE_NAME_CLONE_PARENT "clone-parent"
+#define RBD_OPERATION_FEATURE_NAME_CLONE_CHILD "clone-child"
+#define RBD_OPERATION_FEATURE_NAME_GROUP "group"
+#define RBD_OPERATION_FEATURE_NAME_SNAP_TRASH "snap-trash"
+
+/// all valid operation features
+#define RBD_OPERATION_FEATURES_ALL (RBD_OPERATION_FEATURE_CLONE_PARENT | \
+ RBD_OPERATION_FEATURE_CLONE_CHILD | \
+ RBD_OPERATION_FEATURE_GROUP | \
+ RBD_OPERATION_FEATURE_SNAP_TRASH)
+
+#endif
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
new file mode 100644
index 000000000..8618e6596
--- /dev/null
+++ b/src/include/rbd/librbd.h
@@ -0,0 +1,1491 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRBD_H
+#define CEPH_LIBRBD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <stdbool.h>
+#include <string.h>
+#include <sys/uio.h>
+#include "../rados/librados.h"
+#include "features.h"
+
+#define LIBRBD_VER_MAJOR 1
+#define LIBRBD_VER_MINOR 16
+#define LIBRBD_VER_EXTRA 0
+
+/* Pack a version triple into one integer: major in bits 16 and up, minor in
+ * bits 8-15, extra in bits 0-7. Every argument is parenthesized so that
+ * expression arguments (e.g. `a | b`, `c ? x : y`) expand correctly. */
+#define LIBRBD_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+
+#define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
+
+#define LIBRBD_SUPPORTS_AIO_FLUSH 1
+#define LIBRBD_SUPPORTS_AIO_OPEN 1
+#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE 1
+#define LIBRBD_SUPPORTS_LOCKING 1
+#define LIBRBD_SUPPORTS_INVALIDATE 1
+#define LIBRBD_SUPPORTS_IOVEC 1
+#define LIBRBD_SUPPORTS_WATCH 0
+#define LIBRBD_SUPPORTS_WRITESAME 1
+#define LIBRBD_SUPPORTS_WRITE_ZEROES 1
+#define LIBRBD_SUPPORTS_ENCRYPTION 1
+
+#if __GNUC__ >= 4
+ #define CEPH_RBD_API __attribute__ ((visibility ("default")))
+ #define CEPH_RBD_DEPRECATED __attribute__((deprecated))
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#else
+ #define CEPH_RBD_API
+ #define CEPH_RBD_DEPRECATED
+#endif
+
+#define RBD_FLAG_OBJECT_MAP_INVALID (1<<0)
+#define RBD_FLAG_FAST_DIFF_INVALID (1<<1)
+
+#define RBD_MIRROR_IMAGE_STATUS_LOCAL_MIRROR_UUID ""
+
+typedef void *rbd_image_t;
+typedef void *rbd_image_options_t;
+typedef void *rbd_pool_stats_t;
+
+typedef void *rbd_completion_t;
+typedef void (*rbd_callback_t)(rbd_completion_t cb, void *arg);
+
+typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr);
+
+typedef void (*rbd_update_callback_t)(void *arg);
+
+typedef enum {
+ RBD_SNAP_NAMESPACE_TYPE_USER = 0,
+ RBD_SNAP_NAMESPACE_TYPE_GROUP = 1,
+ RBD_SNAP_NAMESPACE_TYPE_TRASH = 2,
+ RBD_SNAP_NAMESPACE_TYPE_MIRROR = 3,
+} rbd_snap_namespace_type_t;
+
+typedef struct {
+ char *id;
+ char *name;
+} rbd_image_spec_t;
+
+typedef struct {
+ int64_t pool_id;
+ char *pool_name;
+ char *pool_namespace;
+ char *image_id;
+ char *image_name;
+ bool trash;
+} rbd_linked_image_spec_t;
+
+typedef struct {
+ uint64_t id;
+ rbd_snap_namespace_type_t namespace_type;
+ char *name;
+} rbd_snap_spec_t;
+
+typedef struct {
+ uint64_t id;
+ uint64_t size;
+ const char *name;
+} rbd_snap_info_t;
+
+typedef struct {
+ const char *pool_name;
+ const char *image_name;
+ const char *image_id;
+ bool trash;
+} rbd_child_info_t;
+
+#define RBD_MAX_IMAGE_NAME_SIZE 96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+#define RBD_SNAP_CREATE_SKIP_QUIESCE (1 << 0)
+#define RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR (1 << 1)
+
+#define RBD_SNAP_REMOVE_UNPROTECT (1 << 0)
+#define RBD_SNAP_REMOVE_FLATTEN (1 << 1)
+#define RBD_SNAP_REMOVE_FORCE (RBD_SNAP_REMOVE_UNPROTECT | RBD_SNAP_REMOVE_FLATTEN)
+
+/**
+ * These types are used in set_image_notification to indicate the type of event
+ * socket passed in.
+ */
+enum {
+ EVENT_TYPE_PIPE = 1,
+ EVENT_TYPE_EVENTFD = 2
+};
+
+typedef struct {
+ uint64_t size;
+ uint64_t obj_size;
+ uint64_t num_objs;
+ int order;
+ char block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE]; /* deprecated */
+ int64_t parent_pool; /* deprecated */
+ char parent_name[RBD_MAX_IMAGE_NAME_SIZE]; /* deprecated */
+} rbd_image_info_t;
+
+typedef enum {
+ RBD_MIRROR_MODE_DISABLED, /* mirroring is disabled */
+ RBD_MIRROR_MODE_IMAGE, /* mirroring enabled on a per-image basis */
+ RBD_MIRROR_MODE_POOL /* mirroring enabled on all journaled images */
+} rbd_mirror_mode_t;
+
+typedef enum {
+ RBD_MIRROR_PEER_DIRECTION_RX = 0,
+ RBD_MIRROR_PEER_DIRECTION_TX = 1,
+ RBD_MIRROR_PEER_DIRECTION_RX_TX = 2
+} rbd_mirror_peer_direction_t;
+
+typedef struct {
+ char *uuid;
+ char *cluster_name;
+ char *client_name;
+} rbd_mirror_peer_t CEPH_RBD_DEPRECATED;
+
+typedef struct {
+ char *uuid;
+ rbd_mirror_peer_direction_t direction;
+ char *site_name;
+ char *mirror_uuid;
+ char *client_name;
+ time_t last_seen;
+} rbd_mirror_peer_site_t;
+
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_MON_HOST "mon_host"
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_KEY "key"
+
+typedef enum {
+ RBD_MIRROR_IMAGE_MODE_JOURNAL = 0,
+ RBD_MIRROR_IMAGE_MODE_SNAPSHOT = 1,
+} rbd_mirror_image_mode_t;
+
+typedef enum {
+ RBD_MIRROR_IMAGE_DISABLING = 0,
+ RBD_MIRROR_IMAGE_ENABLED = 1,
+ RBD_MIRROR_IMAGE_DISABLED = 2
+} rbd_mirror_image_state_t;
+
+typedef struct {
+ char *global_id;
+ rbd_mirror_image_state_t state;
+ bool primary;
+} rbd_mirror_image_info_t;
+
+typedef enum {
+ MIRROR_IMAGE_STATUS_STATE_UNKNOWN = 0,
+ MIRROR_IMAGE_STATUS_STATE_ERROR = 1,
+ MIRROR_IMAGE_STATUS_STATE_SYNCING = 2,
+ MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY = 3,
+ MIRROR_IMAGE_STATUS_STATE_REPLAYING = 4,
+ MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY = 5,
+ MIRROR_IMAGE_STATUS_STATE_STOPPED = 6,
+} rbd_mirror_image_status_state_t;
+
+typedef struct {
+ char *name;
+ rbd_mirror_image_info_t info;
+ rbd_mirror_image_status_state_t state;
+ char *description;
+ time_t last_update;
+ bool up;
+} rbd_mirror_image_status_t CEPH_RBD_DEPRECATED;
+
+typedef struct {
+ char *mirror_uuid;
+ rbd_mirror_image_status_state_t state;
+ char *description;
+ time_t last_update;
+ bool up;
+} rbd_mirror_image_site_status_t;
+
+typedef struct {
+ char *name;
+ rbd_mirror_image_info_t info;
+ uint32_t site_statuses_count;
+ rbd_mirror_image_site_status_t *site_statuses;
+} rbd_mirror_image_global_status_t;
+
+typedef enum {
+ RBD_GROUP_IMAGE_STATE_ATTACHED,
+ RBD_GROUP_IMAGE_STATE_INCOMPLETE
+} rbd_group_image_state_t;
+
+typedef struct {
+ char *name;
+ int64_t pool;
+ rbd_group_image_state_t state;
+} rbd_group_image_info_t;
+
+typedef struct {
+ char *name;
+ int64_t pool;
+} rbd_group_info_t;
+
+typedef enum {
+ RBD_GROUP_SNAP_STATE_INCOMPLETE,
+ RBD_GROUP_SNAP_STATE_COMPLETE
+} rbd_group_snap_state_t;
+
+typedef struct {
+ char *name;
+ rbd_group_snap_state_t state;
+} rbd_group_snap_info_t;
+
+typedef struct {
+ int64_t group_pool;
+ char *group_name;
+ char *group_snap_name;
+} rbd_snap_group_namespace_t;
+
+typedef enum {
+ RBD_SNAP_MIRROR_STATE_PRIMARY,
+ RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED,
+ RBD_SNAP_MIRROR_STATE_NON_PRIMARY,
+ RBD_SNAP_MIRROR_STATE_NON_PRIMARY_DEMOTED
+} rbd_snap_mirror_state_t;
+
+typedef struct {
+ rbd_snap_mirror_state_t state;
+ size_t mirror_peer_uuids_count;
+ char *mirror_peer_uuids;
+ bool complete;
+ char *primary_mirror_uuid;
+ uint64_t primary_snap_id;
+ uint64_t last_copied_object_number;
+} rbd_snap_mirror_namespace_t;
+
+typedef enum {
+ RBD_LOCK_MODE_EXCLUSIVE = 0,
+ RBD_LOCK_MODE_SHARED = 1,
+} rbd_lock_mode_t;
+
+CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra);
+
+/* image options (NOTE(review): presumably the optname values accepted by the rbd_image_options_*() calls -- confirm) */
+enum { /* anonymous enum; values are explicit and contiguous from 0 to 13 */
+ RBD_IMAGE_OPTION_FORMAT = 0,
+ RBD_IMAGE_OPTION_FEATURES = 1,
+ RBD_IMAGE_OPTION_ORDER = 2,
+ RBD_IMAGE_OPTION_STRIPE_UNIT = 3,
+ RBD_IMAGE_OPTION_STRIPE_COUNT = 4,
+ RBD_IMAGE_OPTION_JOURNAL_ORDER = 5,
+ RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6,
+ RBD_IMAGE_OPTION_JOURNAL_POOL = 7,
+ RBD_IMAGE_OPTION_FEATURES_SET = 8,
+ RBD_IMAGE_OPTION_FEATURES_CLEAR = 9,
+ RBD_IMAGE_OPTION_DATA_POOL = 10,
+ RBD_IMAGE_OPTION_FLATTEN = 11,
+ RBD_IMAGE_OPTION_CLONE_FORMAT = 12,
+ RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE = 13,
+};
+
+typedef enum { /* type of rbd_trash_image_info_t.source */
+ RBD_TRASH_IMAGE_SOURCE_USER = 0,
+ RBD_TRASH_IMAGE_SOURCE_MIRRORING = 1,
+ RBD_TRASH_IMAGE_SOURCE_MIGRATION = 2,
+ RBD_TRASH_IMAGE_SOURCE_REMOVING = 3,
+ RBD_TRASH_IMAGE_SOURCE_USER_PARENT = 4,
+} rbd_trash_image_source_t;
+
+typedef struct { /* passed to rbd_trash_get()/rbd_trash_list() and their rbd_trash_*_cleanup() counterparts */
+ char *id;
+ char *name;
+ rbd_trash_image_source_t source; /* RBD_TRASH_IMAGE_SOURCE_* */
+ time_t deletion_time;
+ time_t deferment_end_time;
+} rbd_trash_image_info_t;
+
+typedef struct { /* NOTE(review): no consumer of this type is visible in this part of the header */
+ char *addr;
+ int64_t id;
+ uint64_t cookie;
+} rbd_image_watcher_t;
+
+typedef enum { /* type of rbd_image_migration_status_t.state */
+ RBD_IMAGE_MIGRATION_STATE_UNKNOWN = -1, /* the only negative enumerator */
+ RBD_IMAGE_MIGRATION_STATE_ERROR = 0,
+ RBD_IMAGE_MIGRATION_STATE_PREPARING = 1,
+ RBD_IMAGE_MIGRATION_STATE_PREPARED = 2,
+ RBD_IMAGE_MIGRATION_STATE_EXECUTING = 3,
+ RBD_IMAGE_MIGRATION_STATE_EXECUTED = 4,
+ RBD_IMAGE_MIGRATION_STATE_ABORTING = 5,
+} rbd_image_migration_state_t;
+
+typedef struct { /* passed, with a status_size argument, to rbd_migration_status(); also taken by rbd_migration_status_cleanup() */
+ int64_t source_pool_id;
+ char *source_pool_namespace;
+ char *source_image_name;
+ char *source_image_id;
+ int64_t dest_pool_id;
+ char *dest_pool_namespace;
+ char *dest_image_name;
+ char *dest_image_id;
+ rbd_image_migration_state_t state; /* RBD_IMAGE_MIGRATION_STATE_* */
+ char *state_description;
+} rbd_image_migration_status_t;
+
+typedef enum { /* type of rbd_config_option_t.source */
+ RBD_CONFIG_SOURCE_CONFIG = 0,
+ RBD_CONFIG_SOURCE_POOL = 1,
+ RBD_CONFIG_SOURCE_IMAGE = 2,
+} rbd_config_source_t;
+
+typedef struct { /* element type used by rbd_config_pool_list() and rbd_config_pool_list_cleanup() */
+ char *name;
+ char *value;
+ rbd_config_source_t source; /* RBD_CONFIG_SOURCE_* */
+} rbd_config_option_t;
+
+typedef enum { /* no explicit values, so the enumerators are numbered 0..7 in declaration order */
+ RBD_POOL_STAT_OPTION_IMAGES,
+ RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS,
+ RBD_POOL_STAT_OPTION_TRASH_IMAGES,
+ RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS
+} rbd_pool_stat_option_t; /* NOTE(review): no consumer of this type is visible in this part of the header */
+
+/* rbd_write_zeroes / rbd_aio_write_zeroes flags (bit-mask values; only bit 0 is defined here) */
+enum {
+ RBD_WRITE_ZEROES_FLAG_THICK_PROVISION = (1U<<0), /* fully allocated zeroed extent */
+};
+
+typedef enum { /* format selector passed to rbd_encryption_format() and rbd_encryption_load() */
+ RBD_ENCRYPTION_FORMAT_LUKS1 = 0,
+ RBD_ENCRYPTION_FORMAT_LUKS2 = 1
+} rbd_encryption_format_t;
+
+typedef enum { /* type of the alg field in the luks1/luks2 format option structs */
+ RBD_ENCRYPTION_ALGORITHM_AES128 = 0,
+ RBD_ENCRYPTION_ALGORITHM_AES256 = 1
+} rbd_encryption_algorithm_t;
+
+typedef void *rbd_encryption_options_t; /* untyped pointer; passed with an opts_size argument to rbd_encryption_format()/rbd_encryption_load() */
+
+typedef struct { /* NOTE(review): presumably the options payload for RBD_ENCRYPTION_FORMAT_LUKS1 -- confirm */
+ rbd_encryption_algorithm_t alg; /* RBD_ENCRYPTION_ALGORITHM_* */
+ const char* passphrase;
+ size_t passphrase_size; /* NOTE(review): presumably the byte length of passphrase -- confirm */
+} rbd_encryption_luks1_format_options_t;
+
+typedef struct { /* same field layout as rbd_encryption_luks1_format_options_t; NOTE(review): presumably for RBD_ENCRYPTION_FORMAT_LUKS2 -- confirm */
+ rbd_encryption_algorithm_t alg; /* RBD_ENCRYPTION_ALGORITHM_* */
+ const char* passphrase;
+ size_t passphrase_size; /* NOTE(review): presumably the byte length of passphrase -- confirm */
+} rbd_encryption_luks2_format_options_t;
+
+CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts);
+CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts,
+ int optname, const char* optval);
+CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts,
+ int optname, uint64_t optval);
+CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts,
+ int optname, char* optval,
+ size_t maxlen);
+CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts,
+ int optname, uint64_t* optval);
+CEPH_RBD_API int rbd_image_options_is_set(rbd_image_options_t opts,
+ int optname, bool* is_set);
+CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname);
+CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts);
+
+/* helpers */
+CEPH_RBD_API void rbd_image_spec_cleanup(rbd_image_spec_t *image);
+CEPH_RBD_API void rbd_image_spec_list_cleanup(rbd_image_spec_t *images,
+ size_t num_images);
+CEPH_RBD_API void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image);
+CEPH_RBD_API void rbd_linked_image_spec_list_cleanup(
+ rbd_linked_image_spec_t *images, size_t num_images);
+CEPH_RBD_API void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap);
+
+/* images */
+CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size)
+ CEPH_RBD_DEPRECATED; /* rbd_list2() below is the variant not marked deprecated */
+CEPH_RBD_API int rbd_list2(rados_ioctx_t io, rbd_image_spec_t* images,
+ size_t *max_images);
+
+CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size,
+ int *order);
+CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size,
+ uint64_t features, int *order);
+/**
+ * create new rbd image
+ *
+ * The stripe_unit must be a factor of the object size (1 << order).
+ * The stripe_count can be one (no intra-object striping) or greater
+ * than one. The RBD_FEATURE_STRIPINGV2 must be specified if the
+ * stripe_unit != the object size and the stripe_count is != 1.
+ *
+ * @param io ioctx
+ * @param name image name
+ * @param size image size in bytes
+ * @param features initial feature bits
+ * @param order object/block size, as a power of two (object size == 1 << order)
+ * @param stripe_unit stripe unit size, in bytes.
+ * @param stripe_count number of objects to stripe over before looping
+ * @return 0 on success, or negative error code
+ */
+CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size,
+ uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count);
+CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size,
+ rbd_image_options_t opts);
+CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order);
+CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order,
+ uint64_t stripe_unit, int stripe_count);
+CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, rbd_image_options_t c_opts);
+CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name);
+CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
+ const char *destname);
+
+CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name,
+ uint64_t delay);
+CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id,
+ rbd_trash_image_info_t *info);
+CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info);
+CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io,
+ rbd_trash_image_info_t *trash_entries,
+ size_t *num_entries);
+CEPH_RBD_API void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries,
+ size_t num_entries);
+CEPH_RBD_API int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, float threshold);
+CEPH_RBD_API int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts,
+ float threshold, librbd_progress_fn_t cb,
+ void* cbdata);
+CEPH_RBD_API int rbd_trash_remove(rados_ioctx_t io, const char *id, bool force);
+CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io,
+ const char *id,
+ bool force,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id,
+ const char *name);
+
+/* migration */
+CEPH_RBD_API int rbd_migration_prepare(rados_ioctx_t ioctx,
+ const char *image_name,
+ rados_ioctx_t dest_ioctx,
+ const char *dest_image_name,
+ rbd_image_options_t opts);
+CEPH_RBD_API int rbd_migration_prepare_import(
+ const char *source_spec, rados_ioctx_t dest_ioctx,
+ const char *dest_image_name, rbd_image_options_t opts);
+CEPH_RBD_API int rbd_migration_execute(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_execute_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_abort(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_abort_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_commit(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_commit_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_status(rados_ioctx_t ioctx,
+ const char *image_name,
+ rbd_image_migration_status_t *status,
+ size_t status_size);
+CEPH_RBD_API void rbd_migration_status_cleanup(
+ rbd_image_migration_status_t *status);
+
+/* pool mirroring */
+CEPH_RBD_API int rbd_mirror_site_name_get(rados_t cluster,
+ char *name, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_site_name_set(rados_t cluster,
+ const char *name);
+
+CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx,
+ rbd_mirror_mode_t *mirror_mode);
+CEPH_RBD_API int rbd_mirror_mode_set(rados_ioctx_t io_ctx,
+ rbd_mirror_mode_t mirror_mode);
+
+CEPH_RBD_API int rbd_mirror_uuid_get(rados_ioctx_t io_ctx,
+ char *uuid, size_t *max_len);
+
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_create(
+ rados_ioctx_t io_ctx, char *token, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_import(
+ rados_ioctx_t io_ctx, rbd_mirror_peer_direction_t direction,
+ const char *token);
+
+CEPH_RBD_API int rbd_mirror_peer_site_add(
+ rados_ioctx_t io_ctx, char *uuid, size_t uuid_max_length,
+ rbd_mirror_peer_direction_t direction, const char *site_name,
+ const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_site_set_name(
+ rados_ioctx_t io_ctx, const char *uuid, const char *site_name);
+CEPH_RBD_API int rbd_mirror_peer_site_set_client_name(
+ rados_ioctx_t io_ctx, const char *uuid, const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_site_set_direction(
+ rados_ioctx_t io_ctx, const char *uuid,
+ rbd_mirror_peer_direction_t direction);
+CEPH_RBD_API int rbd_mirror_peer_site_remove(
+ rados_ioctx_t io_ctx, const char *uuid);
+CEPH_RBD_API int rbd_mirror_peer_site_list(
+ rados_ioctx_t io_ctx, rbd_mirror_peer_site_t *peers, int *max_peers);
+CEPH_RBD_API void rbd_mirror_peer_site_list_cleanup(
+ rbd_mirror_peer_site_t *peers, int max_peers);
+CEPH_RBD_API int rbd_mirror_peer_site_get_attributes(
+ rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+ char *values, size_t *max_value_len, size_t *key_value_count);
+CEPH_RBD_API int rbd_mirror_peer_site_set_attributes(
+ rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+ size_t key_value_count);
+
+CEPH_RBD_API int rbd_mirror_image_global_status_list(
+ rados_ioctx_t io_ctx, const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_global_status_t *images, size_t *len);
+CEPH_RBD_API void rbd_mirror_image_global_status_list_cleanup(
+ char **image_ids, rbd_mirror_image_global_status_t *images, size_t len);
+
+/* rbd_mirror_peer_ commands are deprecated to rbd_mirror_peer_site_
+ * equivalents */
+CEPH_RBD_API int rbd_mirror_peer_add(
+ rados_ioctx_t io_ctx, char *uuid, size_t uuid_max_length,
+ const char *cluster_name, const char *client_name)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_remove(
+ rados_ioctx_t io_ctx, const char *uuid)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_list(
+ rados_ioctx_t io_ctx, rbd_mirror_peer_t *peers, int *max_peers)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API void rbd_mirror_peer_list_cleanup(
+ rbd_mirror_peer_t *peers, int max_peers)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_set_client(
+ rados_ioctx_t io_ctx, const char *uuid, const char *client_name)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_set_cluster(
+ rados_ioctx_t io_ctx, const char *uuid, const char *cluster_name)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_get_attributes(
+ rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+ char *values, size_t *max_value_len, size_t *key_value_count)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_set_attributes(
+ rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+ size_t key_value_count)
+ CEPH_RBD_DEPRECATED;
+
+/* rbd_mirror_image_status_list_ commands are deprecated in favor of
+ * rbd_mirror_image_global_status_list_ commands */
+
+CEPH_RBD_API int rbd_mirror_image_status_list(
+ rados_ioctx_t io_ctx, const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_status_t *images, size_t *len)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API void rbd_mirror_image_status_list_cleanup(
+ char **image_ids, rbd_mirror_image_status_t *images, size_t len)
+ CEPH_RBD_DEPRECATED;
+
+CEPH_RBD_API int rbd_mirror_image_status_summary(
+ rados_ioctx_t io_ctx, rbd_mirror_image_status_state_t *states, int *counts,
+ size_t *maxlen);
+
+CEPH_RBD_API int rbd_mirror_image_instance_id_list(rados_ioctx_t io_ctx,
+ const char *start_id,
+ size_t max, char **image_ids,
+ char **instance_ids,
+ size_t *len);
+CEPH_RBD_API void rbd_mirror_image_instance_id_list_cleanup(char **image_ids,
+ char **instance_ids,
+ size_t len);
+CEPH_RBD_API int rbd_mirror_image_info_list(
+ rados_ioctx_t io_ctx, rbd_mirror_image_mode_t *mode_filter,
+ const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_mode_t *mode_entries,
+ rbd_mirror_image_info_t *info_entries, size_t *num_entries);
+CEPH_RBD_API void rbd_mirror_image_info_list_cleanup(
+ char **image_ids, rbd_mirror_image_info_t *info_entries,
+ size_t num_entries);
+
+/* pool metadata */
+CEPH_RBD_API int rbd_pool_metadata_get(rados_ioctx_t io_ctx, const char *key,
+ char *value, size_t *val_len);
+CEPH_RBD_API int rbd_pool_metadata_set(rados_ioctx_t io_ctx, const char *key,
+ const char *value);
+CEPH_RBD_API int rbd_pool_metadata_remove(rados_ioctx_t io_ctx,
+ const char *key);
+CEPH_RBD_API int rbd_pool_metadata_list(rados_ioctx_t io_ctx, const char *start,
+ uint64_t max, char *keys,
+ size_t *key_len, char *values,
+ size_t *vals_len);
+
+CEPH_RBD_API int rbd_config_pool_list(rados_ioctx_t io_ctx,
+ rbd_config_option_t *options,
+ int *max_options);
+CEPH_RBD_API void rbd_config_pool_list_cleanup(rbd_config_option_t *options,
+ int max_options);
+
+CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name);
+
+CEPH_RBD_API int rbd_aio_open(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+
+/**
+ * Open an image in read-only mode.
+ *
+ * This is intended for use by clients that cannot write to a block
+ * device due to cephx restrictions. There will be no watch
+ * established on the header object, since a watch is a write. This
+ * means the metadata reported about this image (parents, snapshots,
+ * size, etc.) may become stale. This should not be used for
+ * long-running operations, unless you can be sure that one of these
+ * properties changing is safe.
+ *
+ * Attempting to write to a read-only image will return -EROFS.
+ *
+ * @param io ioctx to determine the pool the image is in
+ * @param name image name
+ * @param image where to store newly opened image handle
+ * @param snap_name name of snapshot to open at, or NULL for no snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_open_read_only(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id_read_only(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_aio_open_read_only(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id_read_only(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_features_to_string(uint64_t features, char *str_features,
+ size_t *size);
+CEPH_RBD_API int rbd_features_from_string(const char *str_features, uint64_t *features);
+CEPH_RBD_API int rbd_close(rbd_image_t image);
+CEPH_RBD_API int rbd_aio_close(rbd_image_t image, rbd_completion_t c);
+CEPH_RBD_API int rbd_resize(rbd_image_t image, uint64_t size);
+CEPH_RBD_API int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_resize_with_progress(rbd_image_t image, uint64_t size,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info,
+ size_t infosize);
+CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old);
+CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size);
+CEPH_RBD_API int rbd_get_features(rbd_image_t image, uint64_t *features);
+CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features,
+ uint8_t enabled);
+CEPH_RBD_API int rbd_get_op_features(rbd_image_t image, uint64_t *op_features);
+CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit);
+CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image,
+ uint64_t *stripe_count);
+
+CEPH_RBD_API int rbd_get_create_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_access_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_modify_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_get_overlap(rbd_image_t image, uint64_t *overlap);
+CEPH_RBD_API int rbd_get_name(rbd_image_t image, char *name, size_t *name_len);
+CEPH_RBD_API int rbd_get_id(rbd_image_t image, char *id, size_t id_len);
+CEPH_RBD_API int rbd_get_block_name_prefix(rbd_image_t image,
+ char *prefix, size_t prefix_len);
+CEPH_RBD_API int64_t rbd_get_data_pool_id(rbd_image_t image);
+
+CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image,
+ char *parent_poolname, size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_snapname,
+ size_t psnapnamelen)
+ CEPH_RBD_DEPRECATED; /* rbd_get_parent() below is the variant not marked deprecated */
+CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image,
+ char *parent_poolname,
+ size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_id, size_t pidlen,
+ char *parent_snapname,
+ size_t psnapnamelen)
+ CEPH_RBD_DEPRECATED; /* same as rbd_get_parent_info() plus a parent_id/pidlen buffer pair */
+CEPH_RBD_API int rbd_get_parent(rbd_image_t image,
+ rbd_linked_image_spec_t *parent_image,
+ rbd_snap_spec_t *parent_snap);
+
+CEPH_RBD_API int rbd_get_migration_source_spec(rbd_image_t image,
+ char* source_spec,
+ size_t* max_len);
+
+CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags);
+CEPH_RBD_API int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info,
+ size_t group_info_size);
+CEPH_RBD_API int rbd_set_image_notification(rbd_image_t image, int fd, int type);
+
+/* exclusive lock feature */
+CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner);
+CEPH_RBD_API int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode);
+CEPH_RBD_API int rbd_lock_release(rbd_image_t image);
+CEPH_RBD_API int rbd_lock_get_owners(rbd_image_t image,
+ rbd_lock_mode_t *lock_mode,
+ char **lock_owners,
+ size_t *max_lock_owners);
+CEPH_RBD_API void rbd_lock_get_owners_cleanup(char **lock_owners,
+ size_t lock_owner_count);
+CEPH_RBD_API int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode,
+ const char *lock_owner);
+
+/* object map feature */
+CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image,
+ librbd_progress_fn_t cb, void *cbdata);
+
+CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx,
+ const char *destname);
+CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest);
+CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname, rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_copy4(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname, rbd_image_options_t dest_opts,
+ size_t sparse_size);
+CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image,
+ rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress4(rbd_image_t image,
+ rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb, void *cbdata,
+ size_t sparse_size);
+
+/* deep copy */
+CEPH_RBD_API int rbd_deep_copy(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname,
+ rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_deep_copy_with_progress(rbd_image_t image,
+ rados_ioctx_t dest_io_ctx,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+/* encryption */
+CEPH_RBD_API int rbd_encryption_format(rbd_image_t image,
+ rbd_encryption_format_t format,
+ rbd_encryption_options_t opts,
+ size_t opts_size);
+CEPH_RBD_API int rbd_encryption_load(rbd_image_t image,
+ rbd_encryption_format_t format,
+ rbd_encryption_options_t opts,
+ size_t opts_size);
+
+/* snapshots */
+CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+ int *max_snaps);
+CEPH_RBD_API void rbd_snap_list_end(rbd_snap_info_t *snaps);
+CEPH_RBD_API int rbd_snap_exists(rbd_image_t image, const char *snapname, bool *exists);
+CEPH_RBD_API int rbd_snap_create(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_create2(rbd_image_t image, const char *snap_name,
+ uint32_t flags, librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_remove(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_remove2(rbd_image_t image, const char *snap_name,
+ uint32_t flags, librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id);
+CEPH_RBD_API int rbd_snap_rollback(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image,
+ const char *snapname,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname,
+ const char* dstsnapsname);
+/**
+ * Prevent a snapshot from being deleted until it is unprotected.
+ *
+ * @param snap_name which snapshot to protect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if snap is already protected
+ */
+CEPH_RBD_API int rbd_snap_protect(rbd_image_t image, const char *snap_name);
+/**
+ * Allow a snapshot to be deleted.
+ *
+ * @param snap_name which snapshot to unprotect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snap is not protected
+ */
+CEPH_RBD_API int rbd_snap_unprotect(rbd_image_t image, const char *snap_name);
+/**
+ * Determine whether a snapshot is protected.
+ *
+ * @param snap_name which snapshot to query
+ * @param is_protected where to store the result (0 or 1)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+ int *is_protected);
+/**
+ * Get the current snapshot limit for an image. If no limit is set,
+ * UINT64_MAX is returned.
+ *
+ * @param limit pointer where the limit will be stored on success
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit);
+
+/**
+ * Set a limit for the number of snapshots that may be taken of an image.
+ *
+ * @param limit the maximum number of snapshots allowed in the future.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_set_limit(rbd_image_t image, uint64_t limit);
+
+/**
+ * Get the timestamp of a snapshot for an image.
+ *
+ * @param snap_id the snap id of a snapshot of input image.
+ * @param timestamp the timestamp of input snapshot.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_snap_set(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id);
+CEPH_RBD_API int rbd_snap_get_name(rbd_image_t image, uint64_t snap_id, char *snapname, size_t *name_len);
+CEPH_RBD_API int rbd_snap_get_id(rbd_image_t image, const char *snapname, uint64_t *snap_id);
+
+CEPH_RBD_API int rbd_snap_get_namespace_type(rbd_image_t image,
+ uint64_t snap_id,
+ rbd_snap_namespace_type_t *namespace_type);
+CEPH_RBD_API int rbd_snap_get_group_namespace(rbd_image_t image,
+ uint64_t snap_id,
+ rbd_snap_group_namespace_t *group_snap,
+ size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap,
+ size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_get_trash_namespace(rbd_image_t image,
+ uint64_t snap_id,
+ char* original_name,
+ size_t max_length);
+CEPH_RBD_API int rbd_snap_get_mirror_namespace(
+ rbd_image_t image, uint64_t snap_id,
+ rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size);
+CEPH_RBD_API int rbd_snap_mirror_namespace_cleanup(
+ rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size);
+
+CEPH_RBD_API int rbd_flatten(rbd_image_t image);
+
+CEPH_RBD_API int rbd_flatten_with_progress(rbd_image_t image,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+CEPH_RBD_API int rbd_sparsify(rbd_image_t image, size_t sparse_size);
+
+CEPH_RBD_API int rbd_sparsify_with_progress(rbd_image_t image,
+ size_t sparse_size,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+/**
+ * List all images that are cloned from the image at the
+ * snapshot that is set via rbd_snap_set().
+ *
+ * This iterates over all pools, so it should be run by a user with
+ * read access to all of them. pools_len and images_len are filled in
+ * with the number of bytes put into the pools and images buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the pool and image names
+ * of the children, with a '\0' after each.
+ *
+ * @param image which image (and implicitly snapshot) to list clones of
+ * @param pools buffer in which to store pool names
+ * @param pools_len number of bytes in pools buffer
+ * @param images buffer in which to store image names
+ * @param images_len number of bytes in images buffer
+ * @returns number of children on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API ssize_t rbd_list_children(rbd_image_t image, char *pools,
+ size_t *pools_len, char *images,
+ size_t *images_len)
+ CEPH_RBD_DEPRECATED; /* rbd_list_children3(), declared after this group, is not marked deprecated */
+CEPH_RBD_API int rbd_list_children2(rbd_image_t image,
+ rbd_child_info_t *children,
+ int *max_children)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API void rbd_list_child_cleanup(rbd_child_info_t *child)
+ CEPH_RBD_DEPRECATED; /* takes a single rbd_child_info_t */
+CEPH_RBD_API void rbd_list_children_cleanup(rbd_child_info_t *children,
+ size_t num_children)
+ CEPH_RBD_DEPRECATED; /* takes an rbd_child_info_t pointer plus a count */
+
+CEPH_RBD_API int rbd_list_children3(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images);
+
+CEPH_RBD_API int rbd_list_descendants(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images);
+
+/**
+ * @defgroup librbd_h_locking Advisory Locking
+ *
+ * An rbd image may be locked exclusively, or shared, to facilitate
+ * e.g. live migration where the image may be open in two places at once.
+ * These locks are intended to guard against more than one client
+ * writing to an image without coordination. They don't need to
+ * be used for snapshots, since snapshots are read-only.
+ *
+ * Currently locks only guard against locks being acquired.
+ * They do not prevent anything else.
+ *
+ * A locker is identified by the internal rados client id of the
+ * holder and a user-defined cookie. This (client id, cookie) pair
+ * must be unique for each locker.
+ *
+ * A shared lock also has a user-defined tag associated with it. Each
+ * additional shared lock must specify the same tag or lock
+ * acquisition will fail. This can be used by e.g. groups of hosts
+ * using a clustered filesystem on top of an rbd image to make sure
+ * they're accessing the correct image.
+ *
+ * @{
+ */
+/**
+ * List clients that have locked the image and information about the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the image
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RBD_API ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len);
+
+/**
+ * Take an exclusive lock on the image.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_exclusive(rbd_image_t image, const char *cookie);
+
+/**
+ * Take a shared lock on the image.
+ *
+ * Other clients may also take a shared lock, as long as they use the
+ * same tag.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag user-defined identifier for this shared use of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_shared(rbd_image_t image, const char *cookie,
+ const char *tag);
+
+/**
+ * Release a shared or exclusive lock on the image.
+ *
+ * @param image the image to unlock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_unlock(rbd_image_t image, const char *cookie);
+
+/**
+ * Release a shared or exclusive lock that was taken by the specified client.
+ *
+ * @param image the image to unlock
+ * @param client the entity holding the lock (as given by rbd_list_lockers())
+ * @param cookie user-defined identifier for the instance of the lock to break
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_break_lock(rbd_image_t image, const char *client,
+ const char *cookie);
+
+/** @} locking */
+
+/* I/O */
+CEPH_RBD_API ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf, int op_flags);
+/* DEPRECATED; use rbd_read_iterate2 */
+CEPH_RBD_API int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg);
+
+/**
+ * iterate read over an image
+ *
+ * Reads each region of the image and calls the callback. If the
+ * buffer pointer passed to the callback is NULL, the given extent is
+ * defined to be zeros (a hole). Normally the granularity for the
+ * callback is the image stripe size.
+ *
+ * @param image image to read
+ * @param ofs offset to start from
+ * @param len bytes of source image to cover
+ * @param cb callback for each region
+ * @returns 0 on success, error otherwise
+ */
+CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg);
+/**
+ * Get the difference between two versions of an image.
+ *
+ * This will return the differences between two versions of an image
+ * via a callback, which gets the offset and length and a flag
+ * indicating whether the extent exists (1), or is known/defined to
+ * be zeros (a hole, 0). If the source snapshot name is NULL, we
+ * interpret that as the beginning of time and return all allocated
+ * regions of the image. The end version is whatever is currently
+ * selected for the image handle (either a snapshot or the writeable
+ * head).
+ *
+ * @param fromsnapname start snapshot name, or NULL
+ * @param ofs start offset
+ * @param len length in bytes of the region to report on
+ * @param include_parent 1 if full history diff should include parent (rbd_diff_iterate2 only)
+ * @param whole_object 1 if diff extents should cover whole object (rbd_diff_iterate2 only)
+ * @param cb callback to call for each allocated region
+ * @param arg argument to pass to the callback
+ * @returns 0 on success, or negative error code on error
+ */
+CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image,
+ const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg);
+CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image,
+ const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ uint8_t include_parent, uint8_t whole_object,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg);
+CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, int op_flags);
+CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
+CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, size_t data_len,
+ int op_flags);
+CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs,
+ size_t len, int zero_flags,
+ int op_flags);
+CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
+ size_t len, const char *cmp_buf,
+ const char *buf,
+ uint64_t *mismatch_off,
+ int op_flags);
+
+CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c);
+
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c,
+ int op_flags);
+CEPH_RBD_API int rbd_aio_writev(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_readv(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, size_t data_len,
+ rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off,
+ size_t len, rbd_completion_t c,
+ int zero_flags, int op_flags);
+CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
+ uint64_t off, size_t len,
+ const char *cmp_buf,
+ const char *buf,
+ rbd_completion_t c,
+ uint64_t *mismatch_off,
+ int op_flags);
+
+CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
+ rbd_callback_t complete_cb,
+ rbd_completion_t *c);
+CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c);
+CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c);
+CEPH_RBD_API void *rbd_aio_get_arg(rbd_completion_t c);
+CEPH_RBD_API void rbd_aio_release(rbd_completion_t c);
+CEPH_RBD_API int rbd_flush(rbd_image_t image);
+/**
+ * Start a flush if caching is enabled. Get a callback when
+ * the currently pending writes are on disk.
+ *
+ * @param image the image to flush writes to
+ * @param c what to call when flushing is complete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
+
+/**
+ * Drop any cached data for an image
+ *
+ * @param image the image to invalidate cached data for
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image);
+
+CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp);
+
+CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len);
+CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value);
+CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key);
+/**
+ * List all metadata associated with this image.
+ *
+ * This iterates over all metadata; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the keys and values
+ * of the image, with a '\0' after each.
+ *
+ * @param image which image to list metadata of
+ * @param start which key to begin listing after
+ * (use the empty string to start at the beginning)
+ * @param max the maximum number of key/value pairs to list (0 means no limit)
+ * @param keys buffer in which to store the keys
+ * @param key_len number of bytes in the keys buffer
+ * @param values buffer in which to store the values
+ * @param vals_len number of bytes in the values buffer
+ * @returns non-negative on success (TODO confirm exact value), negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+ char *keys, size_t *key_len, char *values, size_t *vals_len);
+
+// RBD image mirroring support functions
+CEPH_RBD_API int rbd_mirror_image_enable(rbd_image_t image) CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_image_enable2(rbd_image_t image,
+ rbd_mirror_image_mode_t mode);
+CEPH_RBD_API int rbd_mirror_image_disable(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_promote(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_demote(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_resync(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_create_snapshot(rbd_image_t image,
+ uint64_t *snap_id);
+CEPH_RBD_API int rbd_mirror_image_create_snapshot2(rbd_image_t image,
+ uint32_t flags,
+ uint64_t *snap_id);
+CEPH_RBD_API int rbd_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *mirror_image_info,
+ size_t info_size);
+CEPH_RBD_API void rbd_mirror_image_get_info_cleanup(
+ rbd_mirror_image_info_t *mirror_image_info);
+CEPH_RBD_API int rbd_mirror_image_get_mode(rbd_image_t image,
+ rbd_mirror_image_mode_t *mode);
+
+CEPH_RBD_API int rbd_mirror_image_get_global_status(
+ rbd_image_t image,
+ rbd_mirror_image_global_status_t *mirror_image_global_status,
+ size_t status_size);
+CEPH_RBD_API void rbd_mirror_image_global_status_cleanup(
+ rbd_mirror_image_global_status_t *mirror_image_global_status);
+
+CEPH_RBD_API int rbd_mirror_image_get_status(
+ rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status,
+ size_t status_size)
+ CEPH_RBD_DEPRECATED;
+
+CEPH_RBD_API int rbd_mirror_image_get_instance_id(rbd_image_t image,
+ char *instance_id,
+ size_t *id_max_length);
+CEPH_RBD_API int rbd_aio_mirror_image_promote(rbd_image_t image, bool force,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_demote(rbd_image_t image,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *mirror_image_info,
+ size_t info_size,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_mode(rbd_image_t image,
+ rbd_mirror_image_mode_t *mode,
+ rbd_completion_t c);
+
+CEPH_RBD_API int rbd_aio_mirror_image_get_global_status(
+ rbd_image_t image,
+ rbd_mirror_image_global_status_t *mirror_global_image_status,
+ size_t status_size, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_status(
+ rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status,
+ size_t status_size, rbd_completion_t c)
+ CEPH_RBD_DEPRECATED;
+
+CEPH_RBD_API int rbd_aio_mirror_image_create_snapshot(rbd_image_t image,
+ uint32_t flags,
+ uint64_t *snap_id,
+ rbd_completion_t c);
+
+// RBD groups support functions
+CEPH_RBD_API int rbd_group_create(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_remove(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_list(rados_ioctx_t p, char *names, size_t *size);
+CEPH_RBD_API int rbd_group_rename(rados_ioctx_t p, const char *src_name,
+ const char *dest_name);
+CEPH_RBD_API int rbd_group_info_cleanup(rbd_group_info_t *group_info,
+ size_t group_info_size);
+
+/**
+ * Register an image metadata change watcher.
+ *
+ * @param image the image to watch
+ * @param handle where to store the internal id assigned to this watch
+ * @param watch_cb what to do when a notify is received on this image
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_watch(rbd_image_t image, uint64_t *handle,
+ rbd_update_callback_t watch_cb, void *arg);
+
+/**
+ * Unregister an image watcher.
+ *
+ * @param image the image to unwatch
+ * @param handle which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_unwatch(rbd_image_t image, uint64_t handle);
+
+/**
+ * List any watchers of an image.
+ *
+ * Watchers will be allocated and stored in the passed watchers array. If there
+ * are more watchers than max_watchers, -ERANGE will be returned and the number
+ * of watchers will be stored in max_watchers.
+ *
+ * The caller should call rbd_watchers_list_cleanup when finished with the list
+ * of watchers.
+ *
+ * @param image the image to list watchers for.
+ * @param watchers an array to store watchers in.
+ * @param max_watchers capacity of the watchers array.
+ * @returns 0 on success, negative error code on failure.
+ * @returns -ERANGE if there are too many watchers for the passed array.
+ * @returns the number of watchers in max_watchers.
+ */
+CEPH_RBD_API int rbd_watchers_list(rbd_image_t image,
+ rbd_image_watcher_t *watchers,
+ size_t *max_watchers);
+
+CEPH_RBD_API void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers,
+ size_t num_watchers);
+
+CEPH_RBD_API int rbd_config_image_list(rbd_image_t image,
+ rbd_config_option_t *options,
+ int *max_options);
+CEPH_RBD_API void rbd_config_image_list_cleanup(rbd_config_option_t *options,
+ int max_options);
+
+CEPH_RBD_API int rbd_group_image_add(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove_by_id(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_id);
+CEPH_RBD_API int rbd_group_image_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t *num_entries);
+CEPH_RBD_API int rbd_group_image_list_cleanup(rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t num_entries);
+
+CEPH_RBD_API int rbd_group_snap_create(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_create2(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name,
+ uint32_t flags);
+CEPH_RBD_API int rbd_group_snap_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rename(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *old_snap_name,
+ const char *new_snap_name);
+CEPH_RBD_API int rbd_group_snap_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t *num_entries);
+CEPH_RBD_API int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t num_entries);
+CEPH_RBD_API int rbd_group_snap_rollback(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+CEPH_RBD_API int rbd_namespace_create(rados_ioctx_t io,
+ const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_remove(rados_ioctx_t io,
+ const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_list(rados_ioctx_t io, char *namespace_names,
+ size_t *size);
+CEPH_RBD_API int rbd_namespace_exists(rados_ioctx_t io,
+ const char *namespace_name,
+ bool *exists);
+
+CEPH_RBD_API int rbd_pool_init(rados_ioctx_t io, bool force);
+
+CEPH_RBD_API void rbd_pool_stats_create(rbd_pool_stats_t *stats);
+CEPH_RBD_API void rbd_pool_stats_destroy(rbd_pool_stats_t stats);
+CEPH_RBD_API int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats,
+ int stat_option,
+ uint64_t* stat_val);
+CEPH_RBD_API int rbd_pool_stats_get(rados_ioctx_t io, rbd_pool_stats_t stats);
+
+/**
+ * Register a quiesce/unquiesce watcher.
+ *
+ * @param image the image to watch
+ * @param quiesce_cb what to do when librbd wants to quiesce
+ * @param unquiesce_cb what to do when librbd wants to unquiesce
+ * @param arg opaque value to pass to the callbacks
+ * @param handle where to store the internal id assigned to this watch
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_quiesce_watch(rbd_image_t image,
+ rbd_update_callback_t quiesce_cb,
+ rbd_update_callback_t unquiesce_cb,
+ void *arg, uint64_t *handle);
+
+/**
+ * Notify that a quiesce is complete.
+ *
+ * @param image the image to notify
+ * @param handle which watch is complete (as stored by rbd_quiesce_watch())
+ * @param r the return code
+ */
+CEPH_RBD_API void rbd_quiesce_complete(rbd_image_t image, uint64_t handle,
+ int r);
+
+/**
+ * Unregister a quiesce/unquiesce watcher.
+ *
+ * @param image the image to unwatch
+ * @param handle which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_quiesce_unwatch(rbd_image_t image, uint64_t handle);
+
+#if __GNUC__ >= 4
+ #pragma GCC diagnostic pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LIBRBD_H */
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
new file mode 100644
index 000000000..f1ddc2965
--- /dev/null
+++ b/src/include/rbd/librbd.hpp
@@ -0,0 +1,842 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __LIBRBD_HPP
+#define __LIBRBD_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <vector>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+#include "librbd.h"
+
+#if __GNUC__ >= 4
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+namespace librbd {
+
+ using librados::IoCtx;
+
+ class Image;
+ class ImageOptions;
+ class PoolStats;
+ typedef void *image_ctx_t;
+ typedef void *completion_t;
+ typedef void (*callback_t)(completion_t cb, void *arg);
+
+ typedef struct {
+ std::string id;
+ std::string name;
+ } image_spec_t;
+
+ typedef struct {
+ int64_t pool_id;
+ std::string pool_name;
+ std::string pool_namespace;
+ std::string image_id;
+ std::string image_name;
+ bool trash;
+ } linked_image_spec_t;
+
+ typedef rbd_snap_namespace_type_t snap_namespace_type_t;
+
+ typedef struct {
+ uint64_t id;
+ snap_namespace_type_t namespace_type;
+ std::string name;
+ } snap_spec_t;
+
+ typedef struct {
+ uint64_t id;
+ uint64_t size;
+ std::string name;
+ } snap_info_t;
+
+ typedef struct {
+ int64_t group_pool;
+ std::string group_name;
+ std::string group_snap_name;
+ } snap_group_namespace_t;
+
+ typedef rbd_snap_mirror_state_t snap_mirror_state_t;
+
+ typedef struct {
+ snap_mirror_state_t state;
+ std::set<std::string> mirror_peer_uuids;
+ bool complete;
+ std::string primary_mirror_uuid;
+ uint64_t primary_snap_id;
+ uint64_t last_copied_object_number;
+ } snap_mirror_namespace_t;
+
+ typedef struct {
+ std::string client;
+ std::string cookie;
+ std::string address;
+ } locker_t;
+
+ typedef rbd_mirror_peer_direction_t mirror_peer_direction_t;
+
+ typedef struct {
+ std::string uuid;
+ std::string cluster_name;
+ std::string client_name;
+ } mirror_peer_t CEPH_RBD_DEPRECATED;
+
+ typedef struct {
+ std::string uuid;
+ mirror_peer_direction_t direction;
+ std::string site_name;
+ std::string mirror_uuid;
+ std::string client_name;
+ time_t last_seen;
+ } mirror_peer_site_t;
+
+ typedef rbd_mirror_image_mode_t mirror_image_mode_t;
+ typedef rbd_mirror_image_state_t mirror_image_state_t;
+
+ typedef struct {
+ std::string global_id;
+ mirror_image_state_t state;
+ bool primary;
+ } mirror_image_info_t;
+
+ typedef rbd_mirror_image_status_state_t mirror_image_status_state_t;
+
+ typedef struct {
+ std::string name;
+ mirror_image_info_t info;
+ mirror_image_status_state_t state;
+ std::string description;
+ time_t last_update;
+ bool up;
+ } mirror_image_status_t CEPH_RBD_DEPRECATED;
+
+ typedef struct {
+ std::string mirror_uuid;
+ mirror_image_status_state_t state;
+ std::string description;
+ time_t last_update;
+ bool up;
+ } mirror_image_site_status_t;
+
+ typedef struct {
+ std::string name;
+ mirror_image_info_t info;
+ std::vector<mirror_image_site_status_t> site_statuses;
+ } mirror_image_global_status_t;
+
+ typedef rbd_group_image_state_t group_image_state_t;
+
+ typedef struct {
+ std::string name;
+ int64_t pool;
+ group_image_state_t state;
+ } group_image_info_t;
+
+ typedef struct {
+ std::string name;
+ int64_t pool;
+ } group_info_t;
+
+ typedef rbd_group_snap_state_t group_snap_state_t;
+
+ typedef struct {
+ std::string name;
+ group_snap_state_t state;
+ } group_snap_info_t;
+
+ typedef rbd_image_info_t image_info_t;
+
+ class CEPH_RBD_API ProgressContext
+ {
+ public:
+ virtual ~ProgressContext();
+ virtual int update_progress(uint64_t offset, uint64_t total) = 0;
+ };
+
+ typedef struct {
+ std::string id;
+ std::string name;
+ rbd_trash_image_source_t source;
+ time_t deletion_time;
+ time_t deferment_end_time;
+ } trash_image_info_t;
+
+ typedef struct {
+ std::string pool_name;
+ std::string image_name;
+ std::string image_id;
+ bool trash;
+ } child_info_t;
+
+ typedef struct {
+ std::string addr;
+ int64_t id;
+ uint64_t cookie;
+ } image_watcher_t;
+
+ typedef rbd_image_migration_state_t image_migration_state_t;
+
+ typedef struct {
+ int64_t source_pool_id;
+ std::string source_pool_namespace;
+ std::string source_image_name;
+ std::string source_image_id;
+ int64_t dest_pool_id;
+ std::string dest_pool_namespace;
+ std::string dest_image_name;
+ std::string dest_image_id;
+ image_migration_state_t state;
+ std::string state_description;
+ } image_migration_status_t;
+
+ typedef rbd_config_source_t config_source_t;
+
+ typedef struct {
+ std::string name;
+ std::string value;
+ config_source_t source;
+ } config_option_t;
+
+ typedef rbd_encryption_format_t encryption_format_t;
+ typedef rbd_encryption_algorithm_t encryption_algorithm_t;
+ typedef rbd_encryption_options_t encryption_options_t;
+
+ typedef struct {
+ encryption_algorithm_t alg;
+ std::string passphrase;
+ } encryption_luks1_format_options_t;
+
+ typedef struct {
+ encryption_algorithm_t alg;
+ std::string passphrase;
+ } encryption_luks2_format_options_t;
+
+class CEPH_RBD_API RBD
+{
+public:
+ RBD();
+ ~RBD();
+
+ // This must be dynamically allocated with new, and
+ // must be released with release().
+ // Do not use delete.
+ struct AioCompletion {
+ void *pc;
+ AioCompletion(void *cb_arg, callback_t complete_cb);
+ bool is_complete();
+ int wait_for_complete();
+ ssize_t get_return_value();
+ void *get_arg();
+ void release();
+ };
+
+ void version(int *major, int *minor, int *extra);
+
+ int open(IoCtx& io_ctx, Image& image, const char *name);
+ int open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname);
+ int open_by_id(IoCtx& io_ctx, Image& image, const char *id);
+ int open_by_id(IoCtx& io_ctx, Image& image, const char *id, const char *snapname);
+ int aio_open(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname, RBD::AioCompletion *c);
+ int aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname, RBD::AioCompletion *c);
+ // see librbd.h
+ int open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname);
+ int open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname);
+ int aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname, RBD::AioCompletion *c);
+ int aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname, RBD::AioCompletion *c);
+ int features_to_string(uint64_t features, std::string *str_features);
+ int features_from_string(const std::string str_features, uint64_t *features);
+
+ int list(IoCtx& io_ctx, std::vector<std::string>& names)
+ CEPH_RBD_DEPRECATED;
+ int list2(IoCtx& io_ctx, std::vector<image_spec_t>* images);
+
+ int create(IoCtx& io_ctx, const char *name, uint64_t size, int *order);
+ int create2(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order);
+ int create3(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count);
+ int create4(IoCtx& io_ctx, const char *name, uint64_t size,
+ ImageOptions& opts);
+ int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order);
+ int clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order, uint64_t stripe_unit, int stripe_count);
+ int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, ImageOptions& opts);
+ int remove(IoCtx& io_ctx, const char *name);
+ int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx);
+ int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
+
+ int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay);
+ int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info);
+ int trash_list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries);
+ int trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold);
+ int trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, float threshold,
+ ProgressContext &pctx);
+ int trash_remove(IoCtx &io_ctx, const char *image_id, bool force);
+ int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id,
+ bool force, ProgressContext &pctx);
+ int trash_restore(IoCtx &io_ctx, const char *id, const char *name);
+
+ // Migration
+ int migration_prepare(IoCtx& io_ctx, const char *image_name,
+ IoCtx& dest_io_ctx, const char *dest_image_name,
+ ImageOptions& opts);
+ int migration_prepare_import(const char *source_spec, IoCtx& dest_io_ctx,
+ const char *dest_image_name, ImageOptions& opts);
+ int migration_execute(IoCtx& io_ctx, const char *image_name);
+ int migration_execute_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_abort(IoCtx& io_ctx, const char *image_name);
+ int migration_abort_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_commit(IoCtx& io_ctx, const char *image_name);
+ int migration_commit_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_status(IoCtx& io_ctx, const char *image_name,
+ image_migration_status_t *status, size_t status_size);
+
+ // RBD pool mirroring support functions
+ int mirror_site_name_get(librados::Rados& rados, std::string* site_name);
+ int mirror_site_name_set(librados::Rados& rados,
+ const std::string& site_name);
+
+ int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode);
+ int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode);
+
+ int mirror_uuid_get(IoCtx& io_ctx, std::string* mirror_uuid);
+
+ int mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token);
+ int mirror_peer_bootstrap_import(IoCtx& io_ctx,
+ mirror_peer_direction_t direction,
+ const std::string &token);
+
+ int mirror_peer_site_add(IoCtx& io_ctx, std::string *uuid,
+ mirror_peer_direction_t direction,
+ const std::string &site_name,
+ const std::string &client_name);
+ int mirror_peer_site_set_name(IoCtx& io_ctx, const std::string& uuid,
+ const std::string &site_name);
+ int mirror_peer_site_set_client_name(IoCtx& io_ctx, const std::string& uuid,
+ const std::string &client_name);
+ int mirror_peer_site_set_direction(IoCtx& io_ctx, const std::string& uuid,
+ mirror_peer_direction_t direction);
+ int mirror_peer_site_remove(IoCtx& io_ctx, const std::string& uuid);
+ int mirror_peer_site_list(IoCtx& io_ctx,
+ std::vector<mirror_peer_site_t> *peers);
+ int mirror_peer_site_get_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ std::map<std::string, std::string> *key_vals);
+ int mirror_peer_site_set_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ const std::map<std::string, std::string>& key_vals);
+
+ int mirror_image_global_status_list(
+ IoCtx& io_ctx, const std::string &start_id, size_t max,
+ std::map<std::string, mirror_image_global_status_t> *images);
+ int mirror_image_status_summary(IoCtx& io_ctx,
+ std::map<mirror_image_status_state_t, int> *states);
+ int mirror_image_instance_id_list(IoCtx& io_ctx, const std::string &start_id,
+ size_t max, std::map<std::string, std::string> *sevice_ids);
+ int mirror_image_info_list(IoCtx& io_ctx, mirror_image_mode_t *mode_filter,
+ const std::string &start_id, size_t max,
+ std::map<std::string, std::pair<mirror_image_mode_t,
+ mirror_image_info_t>> *entries);
+
+ /// mirror_peer_ commands are deprecated in favor of their mirror_peer_site_ equivalents
+ int mirror_peer_add(IoCtx& io_ctx, std::string *uuid,
+ const std::string &cluster_name,
+ const std::string &client_name)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &client_name)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &cluster_name)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_get_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ std::map<std::string, std::string> *key_vals)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_set_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ const std::map<std::string, std::string>& key_vals)
+ CEPH_RBD_DEPRECATED;
+
+ /// mirror_image_status_list command is deprecated in favor of
+ /// mirror_image_global_status_list
+
+ int mirror_image_status_list(
+ IoCtx& io_ctx, const std::string &start_id, size_t max,
+ std::map<std::string, mirror_image_status_t> *images)
+ CEPH_RBD_DEPRECATED;
+
+ // RBD groups support functions
+ int group_create(IoCtx& io_ctx, const char *group_name);
+ int group_remove(IoCtx& io_ctx, const char *group_name);
+ int group_list(IoCtx& io_ctx, std::vector<std::string> *names);
+ int group_rename(IoCtx& io_ctx, const char *src_group_name,
+ const char *dest_group_name);
+
+ int group_image_add(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_name);
+ int group_image_remove(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_name);
+ int group_image_remove_by_id(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_id);
+ int group_image_list(IoCtx& io_ctx, const char *group_name,
+ std::vector<group_image_info_t> *images,
+ size_t group_image_info_size);
+
+ int group_snap_create(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_create2(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name, uint32_t flags);
+ int group_snap_remove(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_rename(IoCtx& group_ioctx, const char *group_name,
+ const char *old_snap_name, const char *new_snap_name);
+ int group_snap_list(IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_snap_info_t> *snaps,
+ size_t group_snap_info_size);
+ int group_snap_rollback(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_rollback_with_progress(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name,
+ ProgressContext& pctx);
+
+ int namespace_create(IoCtx& ioctx, const char *namespace_name);
+ int namespace_remove(IoCtx& ioctx, const char *namespace_name);
+ int namespace_list(IoCtx& io_ctx, std::vector<std::string>* namespace_names);
+ int namespace_exists(IoCtx& io_ctx, const char *namespace_name, bool *exists);
+
+ int pool_init(IoCtx& io_ctx, bool force);
+ int pool_stats_get(IoCtx& io_ctx, PoolStats *pool_stats);
+
+ int pool_metadata_get(IoCtx &io_ctx, const std::string &key,
+ std::string *value);
+ int pool_metadata_set(IoCtx &io_ctx, const std::string &key,
+ const std::string &value);
+ int pool_metadata_remove(IoCtx &io_ctx, const std::string &key);
+ int pool_metadata_list(IoCtx &io_ctx, const std::string &start, uint64_t max,
+ std::map<std::string, ceph::bufferlist> *pairs);
+
+ int config_list(IoCtx& io_ctx, std::vector<config_option_t> *options);
+
+private:
+ /* We don't allow assignment or copying */
+ RBD(const RBD& rhs);
+ const RBD& operator=(const RBD& rhs);
+};
+
+class CEPH_RBD_API ImageOptions {
+public:
+ ImageOptions();
+ ImageOptions(rbd_image_options_t opts);
+ ImageOptions(const ImageOptions &imgopts);
+ ~ImageOptions();
+
+ int set(int optname, const std::string& optval);
+ int set(int optname, uint64_t optval);
+ int get(int optname, std::string* optval) const;
+ int get(int optname, uint64_t* optval) const;
+ int is_set(int optname, bool* is_set);
+ int unset(int optname);
+ void clear();
+ bool empty() const;
+
+private:
+ friend class RBD;
+ friend class Image;
+
+ rbd_image_options_t opts;
+};
+
+class CEPH_RBD_API PoolStats {
+public:
+ PoolStats();
+ ~PoolStats();
+
+ PoolStats(const PoolStats&) = delete;
+ PoolStats& operator=(const PoolStats&) = delete;
+
+ int add(rbd_pool_stat_option_t option, uint64_t* opt_val);
+
+private:
+ friend class RBD;
+
+ rbd_pool_stats_t pool_stats;
+};
+
+class CEPH_RBD_API UpdateWatchCtx {
+public:
+ virtual ~UpdateWatchCtx() {}
+ /**
+ * Callback activated when we receive a notify event.
+ */
+ virtual void handle_notify() = 0;
+};
+
+class CEPH_RBD_API QuiesceWatchCtx {
+public:
+ virtual ~QuiesceWatchCtx() {}
+ /**
+ * Callback activated when we want to quiesce.
+ */
+ virtual void handle_quiesce() = 0;
+
+ /**
+ * Callback activated when we want to unquiesce.
+ */
+ virtual void handle_unquiesce() = 0;
+};
+
+class CEPH_RBD_API Image
+{
+public:
+ Image();
+ ~Image();
+
+ int close();
+ int aio_close(RBD::AioCompletion *c);
+
+ int resize(uint64_t size);
+ int resize2(uint64_t size, bool allow_shrink, ProgressContext& pctx);
+ int resize_with_progress(uint64_t size, ProgressContext& pctx);
+ int stat(image_info_t &info, size_t infosize);
+ int get_name(std::string *name);
+ int get_id(std::string *id);
+ std::string get_block_name_prefix();
+ int64_t get_data_pool_id();
+ int parent_info(std::string *parent_poolname, std::string *parent_name,
+ std::string *parent_snapname)
+ CEPH_RBD_DEPRECATED;
+ int parent_info2(std::string *parent_poolname, std::string *parent_name,
+ std::string *parent_id, std::string *parent_snapname)
+ CEPH_RBD_DEPRECATED;
+ int get_parent(linked_image_spec_t *parent_image, snap_spec_t *parent_snap);
+
+ int get_migration_source_spec(std::string* source_spec);
+
+ int old_format(uint8_t *old);
+ int size(uint64_t *size);
+ int get_group(group_info_t *group_info, size_t group_info_size);
+ int features(uint64_t *features);
+ int update_features(uint64_t features, bool enabled);
+ int get_op_features(uint64_t *op_features);
+ int overlap(uint64_t *overlap);
+ int get_flags(uint64_t *flags);
+ int set_image_notification(int fd, int type);
+
+ /* exclusive lock feature */
+ int is_exclusive_lock_owner(bool *is_owner);
+ int lock_acquire(rbd_lock_mode_t lock_mode);
+ int lock_release();
+ int lock_get_owners(rbd_lock_mode_t *lock_mode,
+ std::list<std::string> *lock_owners);
+ int lock_break(rbd_lock_mode_t lock_mode, const std::string &lock_owner);
+
+ /* object map feature */
+ int rebuild_object_map(ProgressContext &prog_ctx);
+
+ int check_object_map(ProgressContext &prog_ctx);
+
+ int copy(IoCtx& dest_io_ctx, const char *destname);
+ int copy2(Image& dest);
+ int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+ int copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts,
+ size_t sparse_size);
+ int copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ ProgressContext &prog_ctx);
+ int copy_with_progress2(Image& dest, ProgressContext &prog_ctx);
+ int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx);
+ int copy_with_progress4(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx,
+ size_t sparse_size);
+
+ /* deep copy */
+ int deep_copy(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+ int deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx);
+
+ /* encryption */
+ int encryption_format(encryption_format_t format, encryption_options_t opts,
+ size_t opts_size);
+ int encryption_load(encryption_format_t format, encryption_options_t opts,
+ size_t opts_size);
+
+ /* striping */
+ uint64_t get_stripe_unit() const;
+ uint64_t get_stripe_count() const;
+
+ int get_create_timestamp(struct timespec *timestamp);
+ int get_access_timestamp(struct timespec *timestamp);
+ int get_modify_timestamp(struct timespec *timestamp);
+
+ int flatten();
+ int flatten_with_progress(ProgressContext &prog_ctx);
+
+ int sparsify(size_t sparse_size);
+ int sparsify_with_progress(size_t sparse_size, ProgressContext &prog_ctx);
+ /**
+ * Returns a pair of poolname, imagename for each clone
+ * of this image at the currently set snapshot.
+ */
+ int list_children(std::set<std::pair<std::string, std::string> > *children)
+ CEPH_RBD_DEPRECATED;
+ /**
+ * Returns a structure of poolname, imagename, imageid and trash flag
+ * for each clone of this image at the currently set snapshot.
+ */
+ int list_children2(std::vector<librbd::child_info_t> *children)
+ CEPH_RBD_DEPRECATED;
+ int list_children3(std::vector<linked_image_spec_t> *images);
+ int list_descendants(std::vector<linked_image_spec_t> *images);
+
+ /* advisory locking (see librbd.h for details) */
+ int list_lockers(std::list<locker_t> *lockers,
+ bool *exclusive, std::string *tag);
+ int lock_exclusive(const std::string& cookie);
+ int lock_shared(const std::string& cookie, const std::string& tag);
+ int unlock(const std::string& cookie);
+ int break_lock(const std::string& client, const std::string& cookie);
+
+ /* snapshots */
+ int snap_list(std::vector<snap_info_t>& snaps);
+ /* DEPRECATED; use snap_exists2 */
+ bool snap_exists(const char *snapname) CEPH_RBD_DEPRECATED;
+ int snap_exists2(const char *snapname, bool *exists);
+ int snap_create(const char *snapname);
+ int snap_create2(const char *snapname, uint32_t flags, ProgressContext& pctx);
+ int snap_remove(const char *snapname);
+ int snap_remove2(const char *snapname, uint32_t flags, ProgressContext& pctx);
+ int snap_remove_by_id(uint64_t snap_id);
+ int snap_rollback(const char *snap_name);
+ int snap_rollback_with_progress(const char *snap_name, ProgressContext& pctx);
+ int snap_protect(const char *snap_name);
+ int snap_unprotect(const char *snap_name);
+ int snap_is_protected(const char *snap_name, bool *is_protected);
+ int snap_set(const char *snap_name);
+ int snap_set_by_id(uint64_t snap_id);
+ int snap_get_name(uint64_t snap_id, std::string *snap_name);
+ int snap_get_id(const std::string snap_name, uint64_t *snap_id);
+ int snap_rename(const char *srcname, const char *dstname);
+ int snap_get_limit(uint64_t *limit);
+ int snap_set_limit(uint64_t limit);
+ int snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp);
+ int snap_get_namespace_type(uint64_t snap_id,
+ snap_namespace_type_t *namespace_type);
+ int snap_get_group_namespace(uint64_t snap_id,
+ snap_group_namespace_t *group_namespace,
+ size_t snap_group_namespace_size);
+ int snap_get_trash_namespace(uint64_t snap_id, std::string* original_name);
+ int snap_get_mirror_namespace(
+ uint64_t snap_id, snap_mirror_namespace_t *mirror_namespace,
+ size_t snap_mirror_namespace_size);
+
+ /* I/O */
+ ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ ssize_t read2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+ int64_t read_iterate(uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+ int read_iterate2(uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+ /**
+ * get difference between two versions of an image
+ *
+ * This will return the differences between two versions of an image
+ * via a callback, which gets the offset and length and a flag
+ * indicating whether the extent exists (1), or is known/defined to
+ * be zeros (a hole, 0). If the source snapshot name is NULL, we
+ * interpret that as the beginning of time and return all allocated
+ * regions of the image. The end version is whatever is currently
+ * selected for the image handle (either a snapshot or the writeable
+ * head).
+ *
+ * @param fromsnapname start snapshot name, or NULL
+ * @param ofs start offset
+ * @param len len in bytes of region to report on
+ * @param include_parent true if full history diff should include parent
+ * @param whole_object true if diff extents should cover whole object
+ * @param cb callback to call for each allocated region
+ * @param arg argument to pass to the callback
+ * @returns 0 on success, or negative error code on error
+ */
+ int diff_iterate(const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *), void *arg);
+ int diff_iterate2(const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ bool include_parent, bool whole_object,
+ int (*cb)(uint64_t, size_t, int, void *), void *arg);
+
+ ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+
+ int discard(uint64_t ofs, uint64_t len);
+ ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
+ ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags);
+
+ ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
+ ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
+
+ int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+
+ int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
+ int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+ int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
+ int zero_flags, int op_flags);
+
+ int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
+ ceph::bufferlist& bl, RBD::AioCompletion *c,
+ uint64_t *mismatch_off, int op_flags);
+
+ /**
+ * read async from image
+ *
+ * The target bufferlist is populated with references to buffers
+ * that contain the data for the given extent of the image.
+ *
+ * NOTE: If caching is enabled, the bufferlist will directly
+ * reference buffers in the cache to avoid an unnecessary data copy.
+ * As a result, if the user intends to modify the buffer contents
+ * directly, they should make a copy first (unconditionally, or when
+ * the reference count on the underlying buffer is more than 1).
+ *
+ * @param off offset in image
+ * @param len length of read
+ * @param bl bufferlist to read into
+ * @param c aio completion to notify when read is complete
+ */
+ int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+
+ int flush();
+ /**
+ * Start a flush if caching is enabled. Get a callback when
+ * the currently pending writes are on disk.
+ *
+ * @param image the image to flush writes to
+ * @param c what to call when flushing is complete
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush(RBD::AioCompletion *c);
+
+ /**
+ * Drop any cached data for this image
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int invalidate_cache();
+
+ int poll_io_events(RBD::AioCompletion **comps, int numcomp);
+
+ int metadata_get(const std::string &key, std::string *value);
+ int metadata_set(const std::string &key, const std::string &value);
+ int metadata_remove(const std::string &key);
+ /**
+ * Returns key/value metadata pairs for this image
+ */
+ int metadata_list(const std::string &start, uint64_t max, std::map<std::string, ceph::bufferlist> *pairs);
+
+ // RBD image mirroring support functions
+ int mirror_image_enable() CEPH_RBD_DEPRECATED;
+ int mirror_image_enable2(mirror_image_mode_t mode);
+ int mirror_image_disable(bool force);
+ int mirror_image_promote(bool force);
+ int mirror_image_demote();
+ int mirror_image_resync();
+ int mirror_image_create_snapshot(uint64_t *snap_id);
+ int mirror_image_create_snapshot2(uint32_t flags, uint64_t *snap_id);
+ int mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size);
+ int mirror_image_get_mode(mirror_image_mode_t *mode);
+ int mirror_image_get_global_status(
+ mirror_image_global_status_t *mirror_image_global_status,
+ size_t status_size);
+ int mirror_image_get_status(
+ mirror_image_status_t *mirror_image_status, size_t status_size)
+ CEPH_RBD_DEPRECATED;
+ int mirror_image_get_instance_id(std::string *instance_id);
+ int aio_mirror_image_promote(bool force, RBD::AioCompletion *c);
+ int aio_mirror_image_demote(RBD::AioCompletion *c);
+ int aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size, RBD::AioCompletion *c);
+ int aio_mirror_image_get_mode(mirror_image_mode_t *mode,
+ RBD::AioCompletion *c);
+ int aio_mirror_image_get_global_status(
+ mirror_image_global_status_t *mirror_image_global_status,
+ size_t status_size, RBD::AioCompletion *c);
+ int aio_mirror_image_get_status(
+ mirror_image_status_t *mirror_image_status, size_t status_size,
+ RBD::AioCompletion *c)
+ CEPH_RBD_DEPRECATED;
+ int aio_mirror_image_create_snapshot(uint32_t flags, uint64_t *snap_id,
+ RBD::AioCompletion *c);
+
+ int update_watch(UpdateWatchCtx *ctx, uint64_t *handle);
+ int update_unwatch(uint64_t handle);
+
+ int list_watchers(std::list<image_watcher_t> &watchers);
+
+ int config_list(std::vector<config_option_t> *options);
+
+ int quiesce_watch(QuiesceWatchCtx *ctx, uint64_t *handle);
+ int quiesce_unwatch(uint64_t handle);
+ void quiesce_complete(uint64_t handle, int r);
+
+private:
+ friend class RBD;
+
+ Image(const Image& rhs);
+ const Image& operator=(const Image& rhs);
+
+ image_ctx_t ctx;
+};
+
+} // namespace librbd
+
+#if __GNUC__ >= 4
+ #pragma GCC diagnostic pop
+#endif
+
+#endif // __LIBRBD_HPP
diff --git a/src/include/rbd/object_map_types.h b/src/include/rbd/object_map_types.h
new file mode 100644
index 000000000..54852caa8
--- /dev/null
+++ b/src/include/rbd/object_map_types.h
@@ -0,0 +1,13 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_RBD_OBJECT_MAP_TYPES_H
+#define CEPH_RBD_OBJECT_MAP_TYPES_H
+
+#include "include/int_types.h"
+
+static const uint8_t OBJECT_NONEXISTENT = 0;
+static const uint8_t OBJECT_EXISTS = 1;
+static const uint8_t OBJECT_PENDING = 2;
+static const uint8_t OBJECT_EXISTS_CLEAN = 3;
+
+#endif // CEPH_RBD_OBJECT_MAP_TYPES_H
diff --git a/src/include/rbd_types.h b/src/include/rbd_types.h
new file mode 100644
index 000000000..35a1a8bc3
--- /dev/null
+++ b/src/include/rbd_types.h
@@ -0,0 +1,159 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include "include/types.h"
+#include "rbd/features.h"
+
+/* New-style rbd image 'foo' consists of objects
+ * rbd_id.foo - id of image
+ * rbd_header.<id> - image metadata
+ * rbd_object_map.<id> - optional image object map
+ * rbd_data.<id>.00000000
+ * rbd_data.<id>.00000001
+ * ... - data
+ */
+
+#define RBD_HEADER_PREFIX "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX "rbd_object_map."
+#define RBD_DATA_PREFIX "rbd_data."
+#define RBD_ID_PREFIX "rbd_id."
+
+/*
+ * old-style rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * rb.<idhi>.<idlo>.00000000
+ * rb.<idhi>.<idlo>.00000001
+ * ... - data
+ */
+
+#define RBD_SUFFIX ".rbd"
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+#define RBD_NAMESPACE "rbd_namespace"
+#define RBD_TASK "rbd_task"
+
+/*
+ * rbd_children object in each pool contains omap entries
+ * that map parent (poolid, imageid, snapid) to a list of children
+ * (imageids; snapids aren't required because we get all the snapshot
+ * info from a read of the child's header object anyway).
+ *
+ * The clone operation writes a new item to this child list, and rm or
+ * flatten removes an item, and may remove the whole entry if no children
+ * exist after the rm/flatten.
+ *
+ * When attempting to remove a parent, all pools are searched for
+ * rbd_children objects with entries referring to that parent; if any
+ * exist (and those children exist), the parent removal is prevented.
+ */
+#define RBD_CHILDREN "rbd_children"
+#define RBD_LOCK_NAME "rbd_lock"
+
+/**
+ * rbd_mirroring object in each pool contains pool-specific settings
+ * for configuring mirroring.
+ */
+#define RBD_MIRRORING "rbd_mirroring"
+
+/**
+ * rbd_mirror_leader and rbd_mirror_instance.<instance id> objects are used
+ * for pool-level coordination between rbd-mirror daemons.
+ */
+#define RBD_MIRROR_LEADER "rbd_mirror_leader"
+#define RBD_MIRROR_INSTANCE_PREFIX "rbd_mirror_instance."
+
+#define RBD_MAX_OBJ_NAME_SIZE 96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+/**
+ * Maximum string length of the RBD v2 image id (not including
+ * null termination). This limit was derived from the existing
+ * RBD_MAX_BLOCK_NAME_SIZE limit which needs to hold the "rbd_data."
+ * prefix and null termination.
+ */
+#define RBD_MAX_IMAGE_ID_LENGTH 14
+
+/**
+ * Maximum string length of the RBD block object name prefix (not including
+ * null termination).
+ *
+ * v1 format: rb.<max 8-byte high id>.<max 8-byte low id>.<max 8-byte extra>
+ * v2 format: rbd_data.[<max 19-byte pool id>.]<max 14-byte image id>
+ *
+ * Note: new features might require increasing this maximum prefix length.
+ */
+#define RBD_MAX_BLOCK_NAME_PREFIX_LENGTH 43
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_MIGRATE_HEADER_TEXT "<<< Migrating RBD Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+#define RBD_GROUP_INVALID_POOL (-1)
+
+#define RBD_GROUP_HEADER_PREFIX "rbd_group_header."
+
+#define RBD_GROUP_DIRECTORY "rbd_group_directory"
+
+#define RBD_TRASH "rbd_trash"
+
+/**
+ * MON config-key prefix for storing optional remote cluster connectivity
+ * parameters
+ */
+#define RBD_MIRROR_CONFIG_KEY_PREFIX "rbd/mirror/"
+#define RBD_MIRROR_SITE_NAME_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "site_name"
+#define RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "peer_client_id"
+#define RBD_MIRROR_PEER_CONFIG_KEY_PREFIX RBD_MIRROR_CONFIG_KEY_PREFIX "peer/"
+
+struct rbd_info {
+ ceph_le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_obj_snap_ondisk {
+ ceph_le64 id;
+ ceph_le64 image_size;
+} __attribute__((packed));
+
+struct rbd_obj_header_ondisk {
+ char text[40];
+ char block_name[RBD_MAX_BLOCK_NAME_SIZE];
+ char signature[4];
+ char version[8];
+ struct {
+ __u8 order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ __u8 unused;
+ } __attribute__((packed)) options;
+ ceph_le64 image_size;
+ ceph_le64 snap_seq;
+ ceph_le32 snap_count;
+ ceph_le32 reserved;
+ ceph_le64 snap_names_len;
+ struct rbd_obj_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+enum {
+ RBD_PROTECTION_STATUS_UNPROTECTED = 0,
+ RBD_PROTECTION_STATUS_UNPROTECTING = 1,
+ RBD_PROTECTION_STATUS_PROTECTED = 2,
+ RBD_PROTECTION_STATUS_LAST = 3
+};
+
+#endif
diff --git a/src/include/scope_guard.h b/src/include/scope_guard.h
new file mode 100644
index 000000000..878d8c169
--- /dev/null
+++ b/src/include/scope_guard.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef SCOPE_GUARD
+#define SCOPE_GUARD
+
+#include <utility>
+
+template <typename F>
+struct scope_guard {
+ F f;
+ scope_guard() = delete;
+ scope_guard(const scope_guard &) = delete;
+ scope_guard(scope_guard &&) = default;
+ scope_guard & operator=(const scope_guard &) = delete;
+ scope_guard & operator=(scope_guard &&) = default;
+ scope_guard(const F& f) : f(f) {}
+ scope_guard(F &&f) : f(std::move(f)) {}
+ template<typename... Args>
+ scope_guard(std::in_place_t, Args&& ...args) : f(std::forward<Args>(args)...) {}
+ ~scope_guard() {
+ std::move(f)(); // Support at-most-once functions
+ }
+};
+
+template <typename F>
+scope_guard<F> make_scope_guard(F &&f) {
+ return scope_guard<F>(std::forward<F>(f));
+}
+
+template<typename F, typename... Args>
+scope_guard<F> make_scope_guard(std::in_place_type_t<F>, Args&& ...args) {
+ return { std::in_place, std::forward<Args>(args)... };
+}
+
+#endif
diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h
new file mode 100644
index 000000000..14b5efa1d
--- /dev/null
+++ b/src/include/sock_compat.h
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_SOCK_COMPAT_H
+#define CEPH_SOCK_COMPAT_H
+
+#include "include/compat.h"
+#include <sys/socket.h>
+
+/*
+ * This optimization may not be available on all platforms (e.g. OSX).
+ * Apparently a similar approach based on TCP_CORK can be used.
+ */
+#ifndef MSG_MORE
+# define MSG_MORE 0
+#endif
+
+/*
+ * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
+ */
+#ifndef MSG_NOSIGNAL
+# define MSG_NOSIGNAL 0
+# ifdef SO_NOSIGPIPE
+# define CEPH_USE_SO_NOSIGPIPE
+# else
+# define CEPH_USE_SIGPIPE_BLOCKER
+# warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!"
+# endif
+#endif
+
+int socket_cloexec(int domain, int type, int protocol);
+int socketpair_cloexec(int domain, int type, int protocol, int sv[2]);
+int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen);
+
+#endif
diff --git a/src/include/spinlock.h b/src/include/spinlock.h
new file mode 100644
index 000000000..3f12bdc00
--- /dev/null
+++ b/src/include/spinlock.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ * @author Jesse Williamson <jwilliamson@suse.de>
+ *
+*/
+
+#ifndef CEPH_SPINLOCK_HPP
+#define CEPH_SPINLOCK_HPP
+
+#include <atomic>
+
+namespace ceph {
+inline namespace version_1_0 {
+
+class spinlock;
+
+inline void spin_lock(std::atomic_flag& lock);
+inline void spin_unlock(std::atomic_flag& lock);
+inline void spin_lock(ceph::spinlock& lock);
+inline void spin_unlock(ceph::spinlock& lock);
+
+/* A pre-packaged spinlock type modelling BasicLockable: */
+class spinlock final
+{
+ std::atomic_flag af = ATOMIC_FLAG_INIT;
+
+ public:
+ void lock() {
+ ceph::spin_lock(af);
+ }
+
+ void unlock() noexcept {
+ ceph::spin_unlock(af);
+ }
+};
+
+// Free functions:
+inline void spin_lock(std::atomic_flag& lock)
+{
+ while(lock.test_and_set(std::memory_order_acquire))
+ ;
+}
+
+inline void spin_unlock(std::atomic_flag& lock)
+{
+ lock.clear(std::memory_order_release);
+}
+
+inline void spin_lock(std::atomic_flag *lock)
+{
+ spin_lock(*lock);
+}
+
+inline void spin_unlock(std::atomic_flag *lock)
+{
+ spin_unlock(*lock);
+}
+
+inline void spin_lock(ceph::spinlock& lock)
+{
+ lock.lock();
+}
+
+inline void spin_unlock(ceph::spinlock& lock)
+{
+ lock.unlock();
+}
+
+inline void spin_lock(ceph::spinlock *lock)
+{
+ spin_lock(*lock);
+}
+
+inline void spin_unlock(ceph::spinlock *lock)
+{
+ spin_unlock(*lock);
+}
+
+} // inline namespace (version)
+} // namespace ceph
+
+#endif
diff --git a/src/include/stat.h b/src/include/stat.h
new file mode 100644
index 000000000..19398758e
--- /dev/null
+++ b/src/include/stat.h
@@ -0,0 +1,145 @@
+#ifndef CEPH_STAT_H
+#define CEPH_STAT_H
+
+#include <acconfig.h>
+
+#include <sys/stat.h>
+
+/*
+ * Access time-related `struct stat` members.
+ *
+ * Note that for each of the stat member get/set functions below, setting a
+ * high-res value (stat_set_*_nsec) on a platform without high-res support is
+ * a no-op.
+ */
+
+#ifdef HAVE_STAT_ST_MTIM_TV_NSEC
+
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+ return st->st_mtim.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+ st->st_mtim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+ return st->st_atim.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+ st->st_atim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+ return st->st_ctim.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+ st->st_ctim.tv_nsec = nsec;
+}
+
+#elif defined(HAVE_STAT_ST_MTIMESPEC_TV_NSEC)
+
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+ return st->st_mtimespec.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+ st->st_mtimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+ return st->st_atimespec.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+ st->st_atimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+ return st->st_ctimespec.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+ st->st_ctimespec.tv_nsec = nsec;
+}
+
+#else
+
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+ return 0;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+ return 0;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+ return 0;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+#endif
+
+/*
+ * Access second-resolution `struct stat` members.
+ */
+
+static inline uint32_t stat_get_mtime_sec(struct stat *st)
+{
+ return st->st_mtime;
+}
+
+static inline void stat_set_mtime_sec(struct stat *st, uint32_t sec)
+{
+ st->st_mtime = sec;
+}
+
+static inline uint32_t stat_get_atime_sec(struct stat *st)
+{
+ return st->st_atime;
+}
+
+static inline void stat_set_atime_sec(struct stat *st, uint32_t sec)
+{
+ st->st_atime = sec;
+}
+
+static inline uint32_t stat_get_ctime_sec(struct stat *st)
+{
+ return st->st_ctime;
+}
+
+static inline void stat_set_ctime_sec(struct stat *st, uint32_t sec)
+{
+ st->st_ctime = sec;
+}
+
+#endif
diff --git a/src/include/statlite.h b/src/include/statlite.h
new file mode 100644
index 000000000..0ff4b04e7
--- /dev/null
+++ b/src/include/statlite.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_STATLITE_H
+#define CEPH_STATLITE_H
+
+extern "C" {
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include "include/compat.h"
+
+struct statlite {
+ dev_t st_dev; /* device */
+ ino_t st_ino; /* inode */
+ mode_t st_mode; /* protection */
+ nlink_t st_nlink; /* number of hard links */
+ uid_t st_uid; /* user ID of owner */
+ gid_t st_gid; /* group ID of owner */
+ dev_t st_rdev; /* device type (if inode device)*/
+ unsigned long st_litemask; /* bit mask for optional fields */
+ /***************************************************************/
+ /**** Remaining fields are optional according to st_litemask ***/
+ off_t st_size; /* total size, in bytes */
+ blksize_t st_blksize; /* blocksize for filesystem I/O */
+ blkcnt_t st_blocks; /* number of blocks allocated */
+ struct timespec st_atim; /* Time of last access. */
+ struct timespec st_mtim; /* Time of last modification. */
+ struct timespec st_ctim; /* Time of last status change. */
+ //time_t st_atime; /* time of last access */
+ //time_t st_mtime; /* time of last modification */
+ //time_t st_ctime; /* time of last change */
+};
+
+#define S_STATLITE_SIZE 1 /* one bit per optional statlite field */
+#define S_STATLITE_BLKSIZE 2
+#define S_STATLITE_BLOCKS 4
+#define S_STATLITE_ATIME 8
+#define S_STATLITE_MTIME 16
+#define S_STATLITE_CTIME 32
+
+#define S_REQUIRESIZE(m) ((m) | S_STATLITE_SIZE) /* S_REQUIRE*: mask m with the field's bit added */
+#define S_REQUIREBLKSIZE(m) ((m) | S_STATLITE_BLKSIZE) /* (m) is parenthesized so expression arguments expand correctly */
+#define S_REQUIREBLOCKS(m) ((m) | S_STATLITE_BLOCKS)
+#define S_REQUIREATIME(m) ((m) | S_STATLITE_ATIME)
+#define S_REQUIREMTIME(m) ((m) | S_STATLITE_MTIME)
+#define S_REQUIRECTIME(m) ((m) | S_STATLITE_CTIME)
+
+#define S_ISVALIDSIZE(m) ((m) & S_STATLITE_SIZE) /* S_ISVALID*: nonzero iff the field's bit is set in m */
+#define S_ISVALIDBLKSIZE(m) ((m) & S_STATLITE_BLKSIZE)
+#define S_ISVALIDBLOCKS(m) ((m) & S_STATLITE_BLOCKS)
+#define S_ISVALIDATIME(m) ((m) & S_STATLITE_ATIME)
+#define S_ISVALIDMTIME(m) ((m) & S_STATLITE_MTIME)
+#define S_ISVALIDCTIME(m) ((m) & S_STATLITE_CTIME)
+
+
+// readdirplus etc.
+
+struct dirent_plus {
+ struct dirent d_dirent; /* dirent struct for this entry */
+ struct stat d_stat; /* attributes for this entry */
+ int d_stat_err;/* errno for d_stat, or 0 */
+};
+struct dirent_lite {
+ struct dirent d_dirent; /* dirent struct for this entry */
+ struct statlite d_stat; /* attributes for this entry */
+ int d_stat_err;/* errno for d_stat, or 0 */
+};
+
+}
+#endif
diff --git a/src/include/str_list.h b/src/include/str_list.h
new file mode 100644
index 000000000..1ca61099a
--- /dev/null
+++ b/src/include/str_list.h
@@ -0,0 +1,98 @@
+#ifndef CEPH_STRLIST_H
+#define CEPH_STRLIST_H
+
+#include <list>
+#include <set>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace ceph {
+
+/// Tokenize **s** on any character contained in **delims** and hand every
+/// non-empty token to **f** as a std::string_view. The views point into
+/// the caller's buffer and are not null-terminated.
+template <typename Func> // where Func(std::string_view) is a valid call
+void for_each_substr(std::string_view s, const char *delims, Func&& f)
+{
+  while (true) {
+    // skip the run of delimiters in front of the next token
+    const auto start = s.find_first_not_of(delims);
+    if (start == s.npos)
+      return;
+    s.remove_prefix(start);
+
+    // the token extends up to the next delimiter (or the end of input)
+    const auto stop = s.find_first_of(delims);
+    f(s.substr(0, stop));
+    if (stop == s.npos)
+      return;
+    s.remove_prefix(stop);
+  }
+}
+
+} // namespace ceph
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ const char *delims,
+ std::list<std::string>& str_list);
+
+/**
+ * By-value variant of get_str_list(): takes the string and a delimiter set
+ * and hands the result back as a new list instead of filling an
+ * out-parameter.
+ * NOTE(review): the definition is not part of this header; presumably it
+ * splits exactly like the overloads above -- confirm in the implementation.
+ *
+ * @param [in] str String to split
+ * @param [in] delims characters used to split **str**, ";,= \t" by default
+ * @return list holding the pieces of **str**
+**/
+std::list<std::string> get_str_list(const std::string& str,
+ const char *delims = ";,= \t");
+
+/**
+ * Split **str** into a vector of strings, using the ";,= \t" delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+ std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a vector of strings, using the **delims** delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+ const char *delims,
+ std::vector<std::string>& str_vec);
+
+/**
+ * By-value variant of get_str_vec(): takes the string and a delimiter set
+ * and hands the result back as a new vector instead of filling an
+ * out-parameter.
+ * NOTE(review): the definition is not part of this header; presumably it
+ * splits exactly like the overloads above -- confirm in the implementation.
+ *
+ * @param [in] str String to split
+ * @param [in] delims characters used to split **str**, ";,= \t" by default
+ * @return vector holding the pieces of **str**
+**/
+std::vector<std::string> get_str_vec(const std::string& str,
+ const char *delims = ";,= \t");
+
+/**
+ * Return a String containing the vector **v** joined with **sep**
+ *
+ * If **v** is empty, the function returns an empty string
+ * For each element in **v**,
+ * it will concatenate this element and **sep** with result
+ *
+ * @param [in] v Vector to join as a String
+ * @param [in] sep String used to join each element from **v**
+ * @return empty string if **v** is empty or concatenated string
+**/
+inline std::string str_join(const std::vector<std::string>& v, const std::string& sep)
+{
+  // Build the result front to back; the separator goes in front of every
+  // element except the very first, so an empty vector yields "".
+  std::string joined;
+  for (std::size_t idx = 0; idx < v.size(); ++idx) {
+    if (idx != 0)
+      joined += sep;
+    joined += v[idx];
+  }
+  return joined;
+}
+
+#endif
diff --git a/src/include/str_map.h b/src/include/str_map.h
new file mode 100644
index 000000000..6a0370d12
--- /dev/null
+++ b/src/include/str_map.h
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_STRMAP_H
+#define CEPH_STRMAP_H
+
+#define CONST_DELIMS ",;\t\n "
+
+#include <map>
+#include <string>
+#include <sstream>
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read
+ * from it. The format of **str** is either a well formed JSON object
+ * or a custom key[=value] plain text format.
+ *
+ * JSON is tried first. If successfully parsed into a JSON object, it
+ * is copied into **str_map** verbatim. If it is not a JSON object ( a
+ * string, integer etc. ), -EINVAL is returned and **ss** is set to
+ * a human readable error message.
+ *
+ * If **str** is not valid JSON and **fallback_to_plain** is set to true
+ * (default: true) it is assumed to be a string containing white space
+ * separated key=value pairs. A white space is either space, tab or newline.
+ * Function **get_str_map** will be leveraged to parse the plain-text
+ * key/value pairs.
+ *
+ * @param [in] str JSON or plain text key/value pairs
+ * @param [out] ss human readable message on error
+ * @param [out] str_map key/value pairs read from str
+ * @param [in] fallback_to_plain attempt parsing as plain-text if json fails
+ * @return **0** on success or -EINVAL on error.
+ */
+extern int get_json_str_map(
+ const std::string &str,
+ std::ostream &ss,
+ std::map<std::string,std::string> *str_map,
+ bool fallback_to_plain = true);
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read from
+ * it. The format of **str** is a number of custom key[=value] pairs in
+ * plain text format.
+ *
+ * The string will be parsed taking **delims** as field delimiters for
+ * key/values. The value is optional resulting in an empty string when
+ * not provided. For example, using white space as delimiters:
+ *
+ * insert your own=political/ideological statement=here
+ *
+ * will be parsed into:
+ *
+ * { "insert": "",
+ * "your": "",
+ * "own": "political/ideological",
+ * "statement": "here" }
+ *
+ * Alternative delimiters may be provided. For instance, specifying
+ * "white space and slash", for the above statement, would be parsed
+ * into:
+ *
+ * { "insert": "",
+ * "your": "",
+ * "own": "political",
+ * "ideological": "",
+ * "statement": "here" }
+ *
+ * See how adding '/' to the delimiters field will spawn a new key without
+ * a set value.
+ *
+ * Always returns 0, as there is no condition for failure.
+ *
+ * @param [in] str plain text key/value pairs
+ * @param [in] delims field delimiters to be used for parsing str
+ * @param [out] str_map key/value pairs parsed from str
+ * @return **0**
+ */
+extern int get_str_map(
+ const std::string &str,
+ std::map<std::string,std::string> *str_map,
+ const char *delims = CONST_DELIMS);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is not available in **str_map**, and if **def_val** is
+ * not-NULL then returns **def_val**. Otherwise checks if the value of
+ * **key** is an empty string and if so will return **key**.
+ * If the map contains **key**, the function returns the value of **key**.
+ *
+ * @param[in] str_map Map to obtain **key** from
+ * @param[in] key The key to search for in the map
+ * @param[in] def_val The value to return in case **key** is not present
+ */
+extern std::string get_str_map_value(
+ const std::map<std::string,std::string> &str_map,
+ const std::string &key,
+ const std::string *def_val = NULL);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is available in **str_map** returns the value of **key**.
+ *
+ * If **key** is not available in **str_map**, and if **fallback_key**
+ * is not-NULL and available in **str_map**, then returns the value
+ * of **fallback_key**.
+ *
+ * Otherwise returns an empty string.
+ *
+ * @param[in] str_map Map to obtain **key** or **fallback_key** from
+ * @param[in] key Key to obtain the value of from **str_map**
+ * @param[in] fallback_key Key to fallback to if **key** is not present
+ * in **str_map**
+ */
+extern std::string get_str_map_key(
+ const std::map<std::string,std::string> &str_map,
+ const std::string &key,
+ const std::string *fallback_key = NULL);
+
+
+// This function's only purpose is to check whether a given map has only
+// ONE key with an empty value (which would mean that 'get_str_map()' read
+// a map in the form of 'VALUE', without any KEY/VALUE pairs) and, in such
+// event, to assign said 'VALUE' to a given 'def_key', such that we end up
+// with a map of the form "m = { 'def_key' : 'VALUE' }" instead of the
+// original "m = { 'VALUE' : '' }".
+int get_conf_str_map_helper(
+ const std::string &str,
+ std::ostringstream &oss,
+ std::map<std::string,std::string> *m,
+ const std::string &def_key);
+
+#endif
diff --git a/src/include/stringify.h b/src/include/stringify.h
new file mode 100644
index 000000000..1b2a130c9
--- /dev/null
+++ b/src/include/stringify.h
@@ -0,0 +1,33 @@
+#ifndef __CEPH_STRINGIFY_H
+#define __CEPH_STRINGIFY_H
+
+#include <string>
+#include <sstream>
+
+#include "include/types.h"
+
+/**
+ * Render **a** as text through its stream insertion operator.
+ *
+ * @param a value to format; `std::ostream << a` must be valid
+ * @return the characters that inserting **a** produced
+ */
+template<typename T>
+inline std::string stringify(const T& a) {
+#if defined(__GNUC__) && !(defined(__clang__) || defined(__INTEL_COMPILER))
+  // One stream per thread (and per T, since this is a template) that is
+  // reused by every call.
+  static __thread std::ostringstream ss;
+  ss.str("");
+  // str("") only empties the buffer.  Error flags left behind by an earlier
+  // failed insertion would turn every later insertion into a no-op, so the
+  // state has to be reset as well.
+  ss.clear();
+#else
+  std::ostringstream ss;
+#endif
+  ss << a;
+  return ss.str();
+}
+
+// Concatenate the elements of [begin, end) into a T, inserting **t** between
+// them.  The separator is only added once the accumulated result is
+// non-empty, so empty elements at the front do not produce a separator.
+template <class T, class A>
+T joinify(const A &begin, const A &end, const T &t)
+{
+  T joined;
+  A cursor = begin;
+  while (cursor != end) {
+    if (!joined.empty())
+      joined.append(t);
+    joined.append(*cursor);
+    ++cursor;
+  }
+  return joined;
+}
+
+#endif
diff --git a/src/include/timegm.h b/src/include/timegm.h
new file mode 100644
index 000000000..fb970432d
--- /dev/null
+++ b/src/include/timegm.h
@@ -0,0 +1,79 @@
+// (C) Copyright Howard Hinnant
+// (C) Copyright 2010-2011 Vicente J. Botet Escriba
+// Use, modification and distribution are subject to the Boost Software License,
+// Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt).
+
+//===-------------------------- locale ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// This code was adapted by Vicente from Howard Hinnant's experimental work
+// on chrono i/o to Boost and some functions from libc++/locale to emulate the missing time_get::get()
+
+#ifndef BOOST_CHRONO_IO_TIME_POINT_IO_H
+#define BOOST_CHRONO_IO_TIME_POINT_IO_H
+
+#include <time.h>
+
+// Gregorian leap-year test: 1 for leap years, 0 otherwise.
+static int32_t is_leap(int32_t year) {
+  const bool leap = (year % 400 == 0) || (year % 100 != 0 && year % 4 == 0);
+  return leap ? 1 : 0;
+}
+
+// Number of days in all the years that precede **year**, counted from year 1.
+static int32_t days_from_0(int32_t year) {
+  const int32_t prev = year - 1;
+  return 365 * prev + (prev / 400) - (prev / 100) + (prev / 4);
+}
+
+// Days between 1970-01-01 and January 1st of **year** (negative before 1970).
+static int32_t days_from_1970(int32_t year) {
+  static const int epoch_offset = days_from_0(1970);
+  return days_from_0(year) - epoch_offset;
+}
+
+// Zero-based day of the year for the given date; **month** is 1..12.
+static int32_t days_from_1jan(int32_t year,int32_t month,int32_t day) {
+  // cumulative days in front of each month: row 0 common year, row 1 leap year
+  static const int32_t days_before_month[2][12] =
+  {
+    { 0,31,59,90,120,151,181,212,243,273,304,334},
+    { 0,31,60,91,121,152,182,213,244,274,305,335}
+  };
+
+  const int32_t row = is_leap(year);
+  return days_before_month[row][month-1] + day - 1;
+}
+
+// Convert a broken-down UTC time to seconds since the Unix epoch.
+// tm_mon outside 0..11 is folded into the year; tm_mday, tm_hour, tm_min and
+// tm_sec are taken as given and simply added up.
+static time_t internal_timegm(tm const *t) {
+  int year = t->tm_year + 1900;
+  int month = t->tm_mon;
+  if (month < 0) {
+    // borrow whole years until the month is back in 0..11
+    const int borrow = (11 - month) / 12;
+    year -= borrow;
+    month += 12 * borrow;
+  } else if (month > 11) {
+    year += month / 12;
+    month %= 12;
+  }
+
+  const int month_1based = month + 1;
+  const int yday = days_from_1jan(year, month_1based, t->tm_mday);
+  const int days_since_epoch = days_from_1970(year) + yday;
+
+  const time_t seconds_in_day = 3600 * 24;
+  time_t result = seconds_in_day * days_since_epoch;
+  result += 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec;
+  return result;
+}
+
+#endif
diff --git a/src/include/types.h b/src/include/types.h
new file mode 100644
index 000000000..60d1fb305
--- /dev/null
+++ b/src/include/types.h
@@ -0,0 +1,626 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_TYPES_H
+#define CEPH_TYPES_H
+
+// this is needed for ceph_fs to compile in userland
+#include "int_types.h"
+#include "byteorder.h"
+
+#include "uuid.h"
+
+#include <netinet/in.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "rbd_types.h"
+
+#ifdef __cplusplus
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_*
+#endif
+#endif
+
+extern "C" {
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "statlite.h"
+}
+
+#include <string>
+#include <list>
+#include <set>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+#include <map>
+#include <vector>
+#include <optional>
+#include <ostream>
+#include <iomanip>
+#include <tuple>
+
+
+#include "include/unordered_map.h"
+
+#include "object.h"
+#include "intarith.h"
+
+#include "acconfig.h"
+
+#include "assert.h"
+
+// DARWIN compatibility
+#ifdef __APPLE__
+typedef long long loff_t;
+typedef long long off64_t;
+#define O_DIRECT 00040000
+#endif
+
+// FreeBSD compatibility
+#ifdef __FreeBSD__
+typedef off_t loff_t;
+typedef off_t off64_t;
+#endif
+
+#if defined(__sun) || defined(_AIX)
+typedef off_t loff_t;
+#endif
+
+
+// -- io helpers --
+
+// Forward declare all the I/O helpers so strict ADL can find them in
+// the case of containers of containers. I'm tempted to abstract this
+// stuff using template templates like I did for denc.
+
+// Forward declarations of the container printers defined further down, so
+// that a printer for an outer container can already see the printer of the
+// inner one.  Every template head has to match its definition exactly,
+// otherwise the declaration introduces a different, never-defined overload.
+namespace std {
+template<class A, class B>
+inline std::ostream& operator<<(std::ostream&out, const std::pair<A,B>& v);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v);
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v);
+// std::deque has no comparator: an extra, non-deducible template parameter
+// here would make this declaration unusable and unrelated to the definition.
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v);
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t);
+template<typename T>
+inline std::ostream& operator<<(std::ostream& out, const std::optional<T> &t);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m);
+}
+
+// Forward declarations of the boost printers defined further down.  Each one
+// lives in the namespace of the type it prints and repeats the template head
+// of its definition.
+namespace boost {
+namespace tuples {
+// The definition handles three-element boost tuples and sits in
+// boost::tuples, the namespace the tuple class itself belongs to.
+template<typename A, typename B, typename C>
+inline std::ostream& operator<<(std::ostream& out, const boost::tuples::tuple<A, B, C> &t);
+}
+
+namespace container {
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& m);
+}
+}
+
+namespace std {
+// Print a pair as "first,second" with no surrounding brackets.
+template<class A, class B>
+inline std::ostream& operator<<(std::ostream& out, const std::pair<A,B>& v) {
+  out << v.first;
+  out << ",";
+  out << v.second;
+  return out;
+}
+
+// Print a vector as "[e0,e1,...]"; an empty vector prints "[]".
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v) {
+  out << "[";
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (it != v.begin())
+      out << ",";
+    out << *it;
+  }
+  out << "]";
+  return out;
+}
+
+// Print a boost small_vector exactly like a std::vector: "[e0,e1,...]".
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v) {
+  out << "[";
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (it != v.begin())
+      out << ",";
+    out << *it;
+  }
+  out << "]";
+  return out;
+}
+
+// Print a deque as "<e0,e1,...>"; an empty deque prints "<>".
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v) {
+  bool first = true;
+  out << "<";
+  for (const auto& item : v) {
+    if (!first)
+      out << ",";
+    out << item;
+    first = false;
+  }
+  out << ">";
+  return out;
+}
+
+// Print the elements of a std::tuple separated by commas, without any
+// surrounding brackets; an empty tuple prints nothing.
+// Implemented with std::apply and a fold expression, so nothing beyond
+// <tuple> is required.
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t) {
+  std::apply([&out](const auto&... elems) {
+    bool first = true;
+    // a comma goes in front of every element except the first one
+    [[maybe_unused]] auto emit = [&out, &first](const auto& e) {
+      if (!first)
+        out << ",";
+      out << e;
+      first = false;
+    };
+    (emit(elems), ...);
+  }, t);
+  return out;
+}
+
+// Mimics boost::optional: an empty optional prints "--", an engaged one
+// prints a single space followed by the contained value.
+template<typename T>
+inline std::ostream& operator<<(std::ostream& out, const std::optional<T> &t) {
+  if (t.has_value()) {
+    out << ' ' << *t;
+    return out;
+  }
+  out << "--";
+  return out;
+}
+
+// Print a list as "e0,e1,..." without brackets; an empty list prints nothing.
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist) {
+  bool first = true;
+  for (const auto& item : ilist) {
+    if (!first)
+      out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+// Print a set as "e0,e1,..." in iteration order, without brackets.
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset) {
+  bool first = true;
+  for (const auto& item : iset) {
+    if (!first)
+      out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+// Print a multiset as "e0,e1,..." in iteration order, duplicates included.
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset) {
+  bool first = true;
+  for (const auto& item : iset) {
+    if (!first)
+      out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+// Print a map as "{k0=v0,k1=v1,...}"; an empty map prints "{}".
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m)
+{
+  bool first = true;
+  out << "{";
+  for (const auto& entry : m) {
+    if (!first)
+      out << ",";
+    out << entry.first << "=" << entry.second;
+    first = false;
+  }
+  out << "}";
+  return out;
+}
+
+// Print a multimap as "{{k0=v0,k1=v1,...}}"; an empty one prints "{{}}".
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m)
+{
+  bool first = true;
+  out << "{{";
+  for (const auto& entry : m) {
+    if (!first)
+      out << ",";
+    out << entry.first << "=" << entry.second;
+    first = false;
+  }
+  out << "}}";
+  return out;
+}
+
+} // namespace std
+
+namespace boost {
+namespace tuples {
+// Print a three-element boost tuple as "a,b,c".
+template<typename A, typename B, typename C>
+inline std::ostream& operator<<(std::ostream& out, const boost::tuples::tuple<A, B, C> &t) {
+  out << boost::get<0>(t);
+  out << ",";
+  out << boost::get<1>(t);
+  out << ",";
+  out << boost::get<2>(t);
+  return out;
+}
+}
+namespace container {
+// Print a flat_set as "e0,e1,..." without brackets.
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset) {
+  bool first = true;
+  for (const auto& item : iset) {
+    if (!first)
+      out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+// Print a flat_map as "k0=v0,k1=v1,..." without brackets.
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& m) {
+  bool first = true;
+  for (const auto& entry : m) {
+    if (!first)
+      out << ",";
+    out << entry.first << "=" << entry.second;
+    first = false;
+  }
+  return out;
+}
+}
+} // namespace boost
+
+
+
+/*
+ * comparators for stl containers
+ */
+// for ceph::unordered_map:
+// ceph::unordered_map<const char*, long, hash<const char*>, eqstr> vals;
+struct eqstr
+{
+  // Equality predicate on C strings: compares the characters, not the
+  // pointers.
+  bool operator()(const char* s1, const char* s2) const
+  {
+    const int order = strcmp(s1, s2);
+    return order == 0;
+  }
+};
+
+// for set, map
+struct ltstr
+{
+  // Strict "less than" on C strings, using the ordering of strcmp().
+  bool operator()(const char* s1, const char* s2) const
+  {
+    const int order = strcmp(s1, s2);
+    return order < 0;
+  }
+};
+
+
+namespace ceph {
+ class Formatter;
+}
+
+#include "encoding.h"
+
+WRITE_RAW_ENCODER(ceph_fsid)
+WRITE_RAW_ENCODER(ceph_file_layout)
+WRITE_RAW_ENCODER(ceph_dir_layout)
+WRITE_RAW_ENCODER(ceph_mds_session_head)
+WRITE_RAW_ENCODER(ceph_mds_request_head_legacy)
+WRITE_RAW_ENCODER(ceph_mds_request_head)
+WRITE_RAW_ENCODER(ceph_mds_request_release)
+WRITE_RAW_ENCODER(ceph_filelock)
+WRITE_RAW_ENCODER(ceph_mds_caps_head)
+WRITE_RAW_ENCODER(ceph_mds_caps_export_body)
+WRITE_RAW_ENCODER(ceph_mds_caps_non_export_body)
+WRITE_RAW_ENCODER(ceph_mds_cap_peer)
+WRITE_RAW_ENCODER(ceph_mds_cap_release)
+WRITE_RAW_ENCODER(ceph_mds_cap_item)
+WRITE_RAW_ENCODER(ceph_mds_lease)
+WRITE_RAW_ENCODER(ceph_mds_snap_head)
+WRITE_RAW_ENCODER(ceph_mds_snap_realm)
+WRITE_RAW_ENCODER(ceph_mds_reply_head)
+WRITE_RAW_ENCODER(ceph_mds_reply_cap)
+WRITE_RAW_ENCODER(ceph_mds_cap_reconnect)
+WRITE_RAW_ENCODER(ceph_mds_snaprealm_reconnect)
+WRITE_RAW_ENCODER(ceph_frag_tree_split)
+WRITE_RAW_ENCODER(ceph_osd_reply_head)
+WRITE_RAW_ENCODER(ceph_osd_op)
+WRITE_RAW_ENCODER(ceph_msg_header)
+WRITE_RAW_ENCODER(ceph_msg_footer)
+WRITE_RAW_ENCODER(ceph_msg_footer_old)
+WRITE_RAW_ENCODER(ceph_mon_subscribe_item)
+
+WRITE_RAW_ENCODER(ceph_mon_statfs)
+WRITE_RAW_ENCODER(ceph_mon_statfs_reply)
+
+// ----------------------
+// some basic types
+
+// NOTE: these must match ceph_fs.h typedefs
+typedef uint64_t ceph_tid_t; // transaction id
+typedef uint64_t version_t;
+typedef __u32 epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years)
+
+// --------------------------------------
+// identify individual mount clients by 64bit value
+
+// Thin wrapper around one signed 64-bit value that identifies a mount client.
+struct client_t {
+ int64_t v; // the raw identifier
+
+ // Not explicit, so plain integers convert to client_t (presumably on
+ // purpose, given the suppression below).  Without an argument the
+ // identifier is -2.
+ // cppcheck-suppress noExplicitConstructor
+ client_t(int64_t _v = -2) : v(_v) {}
+
+ // Append the raw identifier to bl.
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ encode(v, bl);
+ }
+ // Read the raw identifier back from bl.
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ decode(v, bl);
+ }
+};
+WRITE_CLASS_ENCODER(client_t)
+
+// Comparisons between two client_t values order them by the raw identifier.
+// Everything except == and < is expressed through those two.
+static inline bool operator==(const client_t& l, const client_t& r) {
+  return l.v == r.v;
+}
+static inline bool operator!=(const client_t& l, const client_t& r) {
+  return !(l == r);
+}
+static inline bool operator<(const client_t& l, const client_t& r) {
+  return l.v < r.v;
+}
+static inline bool operator<=(const client_t& l, const client_t& r) {
+  return !(r < l);
+}
+static inline bool operator>(const client_t& l, const client_t& r) {
+  return r < l;
+}
+static inline bool operator>=(const client_t& l, const client_t& r) {
+  return !(l < r);
+}
+
+// Comparisons of a client_t against a bare 64-bit integer.
+static inline bool operator>=(const client_t& l, int64_t o) {
+  return !(l.v < o);
+}
+static inline bool operator<(const client_t& l, int64_t o) {
+  return l.v < o;
+}
+
+// A client_t prints as its raw identifier.
+inline std::ostream& operator<<(std::ostream& out, const client_t& c) {
+  out << c.v;
+  return out;
+}
+
+
+
+// --
+
+namespace {
+/*
+ * Shared back end of the si_u_t and byte_u_t printers.
+ *
+ * @param out   stream that receives the text
+ * @param v     the unscaled value
+ * @param n     integer part of v after scaling down to the chosen unit
+ * @param index position of the chosen unit; 0 means the value is unscaled
+ * @param mult  scale factor that belongs to the chosen unit
+ * @param u     unit suffix appended behind the number
+ * @return out
+ */
+inline std::ostream& format_u(std::ostream& out, const uint64_t v, const uint64_t n,
+                              const int index, const uint64_t mult, const char* u)
+{
+  char buffer[32];
+
+  if (index == 0 || (v % mult) == 0) {
+    // Unscaled values and even multiples of the unit are always displayed
+    // without any decimal fraction.  n is unsigned, hence PRIu64.
+    (void) snprintf(buffer, sizeof(buffer), "%" PRIu64 "%s", n, u);
+  } else {
+    // We want to choose a precision that reflects the best choice
+    // for fitting in 5 characters.  This can get rather tricky when
+    // we have numbers that are very close to an order of magnitude.
+    // For example, when displaying 10239 (which is really 9.999K),
+    // we want only a single place of precision for 10.0K.  We could
+    // develop some complex heuristics for this, but it's much
+    // easier just to try each combination in turn.
+    for (int i = 2; i >= 0; i--) {
+      if (snprintf(buffer, sizeof(buffer), "%.*f%s", i,
+                   static_cast<double>(v) / mult, u) <= 7)
+        break;
+    }
+  }
+
+  return out << buffer;
+}
+}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * decimal unit prefix (the classic SI units). No actual unit will be added.
+ */
+struct si_u_t {
+  uint64_t v;
+  explicit si_u_t(uint64_t _v) : v(_v) {};
+};
+
+// Scale the value down by powers of 1000 and print it with the matching
+// SI prefix.
+inline std::ostream& operator<<(std::ostream& out, const si_u_t& b)
+{
+  const char* u[] = {"", "k", "M", "G", "T", "P", "E"};
+  // Highest valid position in u[]; scaling must stop there so that u[index]
+  // can never read past the end of the table.
+  constexpr int last_unit = static_cast<int>(sizeof(u) / sizeof(u[0])) - 1;
+
+  uint64_t n = b.v;
+  int index = 0;
+  uint64_t mult = 1;
+
+  while (n >= 1000 && index < last_unit) {
+    n /= 1000;
+    index++;
+    mult *= 1000;
+  }
+
+  return format_u(out, b.v, n, index, mult, u[index]);
+}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * binary unit prefix (IEC units). Since binary unit prefixes are to be used for
+ * "multiples of units in data processing, data transmission, and digital
+ * information" (so bits and bytes) and so far bits are not printed, the unit
+ * "B" for "byte" is added besides the multiplier.
+ */
+struct byte_u_t {
+  uint64_t v;
+  explicit byte_u_t(uint64_t _v) : v(_v) {};
+};
+
+// Scale the value down by powers of 1024 and print it with the matching
+// IEC prefix followed by "B".
+inline std::ostream& operator<<(std::ostream& out, const byte_u_t& b)
+{
+  const char* u[] = {" B", " KiB", " MiB", " GiB", " TiB", " PiB", " EiB"};
+  // Highest valid position in u[]; see the SI printer.
+  constexpr int last_unit = static_cast<int>(sizeof(u) / sizeof(u[0])) - 1;
+
+  uint64_t n = b.v;
+  int index = 0;
+
+  while (n >= 1024 && index < last_unit) {
+    n /= 1024;
+    index++;
+  }
+
+  return format_u(out, b.v, n, index, 1ULL << (10 * index), u[index]);
+}
+
+// Print the start value of a subscription; a '+' is appended unless the
+// CEPH_SUBSCRIBE_ONETIME flag is set.
+inline std::ostream& operator<<(std::ostream& out, const ceph_mon_subscribe_item& i)
+{
+  const char* suffix = (i.flags & CEPH_SUBSCRIBE_ONETIME) ? "" : "+";
+  return out << i.start
+             << suffix;
+}
+
+// Wrapper that selects the weight-specific stream formatting below.
+struct weightf_t {
+  float v;
+  // cppcheck-suppress noExplicitConstructor
+  weightf_t(float _v) : v(_v) {}
+};
+
+// Print a weight: "-" for values below -0.01, "0" for values below 0.000001,
+// otherwise the value in fixed notation with five decimals.  The stream's
+// format flags and precision are put back afterwards, so printing a weight
+// does not alter how later floating-point values are formatted.
+inline std::ostream& operator<<(std::ostream& out, const weightf_t& w)
+{
+  if (w.v < -0.01F) {
+    return out << "-";
+  } else if (w.v < 0.000001F) {
+    return out << "0";
+  } else {
+    const std::ios_base::fmtflags saved_flags = out.flags();
+    const std::streamsize saved_precision = out.precision();
+    out << std::fixed << std::setprecision(5) << w.v;
+    // std::fixed is sticky; restore the flags as well as the precision.
+    out.flags(saved_flags);
+    out.precision(saved_precision);
+    return out;
+  }
+}
+
+// Shard identifier stored as a signed 8-bit value.
+struct shard_id_t {
+ int8_t id; // the raw shard number
+
+ // A default-constructed shard_id_t holds 0; building one from a number
+ // requires an explicit conversion.
+ shard_id_t() : id(0) {}
+ explicit shard_id_t(int8_t _id) : id(_id) {}
+
+ // Implicit conversion back to the raw number.
+ operator int8_t() const { return id; }
+
+ // Declared here only; the value is defined outside this header.
+ const static shard_id_t NO_SHARD;
+
+ // Append the raw number to bl.
+ void encode(ceph::buffer::list &bl) const {
+ using ceph::encode;
+ encode(id, bl);
+ }
+ // Read the raw number back from bl.
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ using ceph::decode;
+ decode(id, bl);
+ }
+};
+// NOTE(review): these macros come from other headers; presumably they
+// generate the free encode/decode functions and the comparison operators
+// on `id` -- confirm against their definitions.
+WRITE_CLASS_ENCODER(shard_id_t)
+WRITE_EQ_OPERATORS_1(shard_id_t, id)
+WRITE_CMP_OPERATORS_1(shard_id_t, id)
+// Stream output is declared here and defined elsewhere.
+std::ostream &operator<<(std::ostream &lhs, const shard_id_t &rhs);
+
+#if defined(__sun) || defined(_AIX) || defined(__APPLE__) || \
+ defined(__FreeBSD__) || defined(_WIN32)
+extern "C" {
+__s32 ceph_to_hostos_errno(__s32 e);
+__s32 hostos_to_ceph_errno(__s32 e);
+}
+#else
+#define ceph_to_hostos_errno(e) (e)
+#define hostos_to_ceph_errno(e) (e)
+#endif
+
+// 32-bit error code that is translated between the host's errno numbering
+// and the numbering used in encoded form whenever it is encoded or decoded.
+struct errorcode32_t {
+ int32_t code; // error code in host numbering
+
+ errorcode32_t() : code(0) {}
+ // cppcheck-suppress noExplicitConstructor
+ errorcode32_t(int32_t i) : code(i) {}
+
+ // Behaves like a plain int in expressions.
+ operator int() const { return code; }
+ // Address-of is overloaded to yield a pointer to the stored code rather
+ // than to the wrapper object.
+ int* operator&() { return &code; }
+ // Comparisons against a plain int.  These members are not const-qualified,
+ // so they only take part in overload resolution for non-const objects.
+ int operator==(int i) { return code == i; }
+ int operator>(int i) { return code > i; }
+ int operator>=(int i) { return code >= i; }
+ int operator<(int i) { return code < i; }
+ int operator<=(int i) { return code <= i; }
+
+ // Encode the code after mapping it with hostos_to_ceph_errno().
+ void encode(ceph::buffer::list &bl) const {
+ using ceph::encode;
+ __s32 newcode = hostos_to_ceph_errno(code);
+ encode(newcode, bl);
+ }
+ // Decode the code and map it back with ceph_to_hostos_errno().
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ using ceph::decode;
+ decode(code, bl);
+ code = ceph_to_hostos_errno(code);
+ }
+};
+WRITE_CLASS_ENCODER(errorcode32_t)
+WRITE_EQ_OPERATORS_1(errorcode32_t, code)
+WRITE_CMP_OPERATORS_1(errorcode32_t, code)
+
+// Digest value made of S raw bytes, with hex rendering, equality comparison
+// and encode/decode support.
+template <uint8_t S>
+struct sha_digest_t {
+  constexpr static uint32_t SIZE = S;
+  // TODO: we might consider std::array in the future. Avoiding it for now
+  // as sha_digest_t is a part of our public API.
+  unsigned char v[S] = {0};
+
+  // Lower-case hexadecimal rendering of the digest, two characters per byte.
+  std::string to_str() const {
+    static const char digits[] = "0123456789abcdef";
+    std::string hex;
+    hex.reserve(S * 2);
+    for (size_t i = 0; i < S; i++) {
+      hex.push_back(digits[v[i] >> 4]);
+      hex.push_back(digits[v[i] & 0x0f]);
+    }
+    return hex;
+  }
+  // Copies SIZE bytes from _v, which therefore has to point to at least
+  // that many readable bytes.
+  sha_digest_t(const unsigned char *_v) { memcpy(v, _v, SIZE); };
+  sha_digest_t() {}
+
+  // Two digests are equal when all SIZE bytes match.
+  bool operator==(const sha_digest_t& r) const {
+    return ::memcmp(v, r.v, SIZE) == 0;
+  }
+  bool operator!=(const sha_digest_t& r) const {
+    return !(*this == r);
+  }
+
+  void encode(ceph::buffer::list &bl) const {
+    // copy to avoid reinterpret_cast, is_pod and other nasty things
+    using ceph::encode;
+    std::array<unsigned char, SIZE> staging;
+    memcpy(staging.data(), v, SIZE);
+    encode(staging, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator &bl) {
+    // mirror image of encode(): decode into a std::array, then copy over
+    using ceph::decode;
+    std::array<unsigned char, SIZE> staging;
+    decode(staging, bl);
+    memcpy(v, staging.data(), SIZE);
+  }
+};
+
+// A digest prints as its hexadecimal string form.
+template<uint8_t S>
+inline std::ostream &operator<<(std::ostream &out, const sha_digest_t<S> &b) {
+  out << b.to_str();
+  return out;
+}
+
+using sha1_digest_t = sha_digest_t<20>;
+WRITE_CLASS_ENCODER(sha1_digest_t)
+
+using sha256_digest_t = sha_digest_t<32>;
+WRITE_CLASS_ENCODER(sha256_digest_t)
+
+using sha512_digest_t = sha_digest_t<64>;
+
+using md5_digest_t = sha_digest_t<16>;
+WRITE_CLASS_ENCODER(md5_digest_t)
+
+
+#endif
diff --git a/src/include/unordered_map.h b/src/include/unordered_map.h
new file mode 100644
index 000000000..aee5f5a76
--- /dev/null
+++ b/src/include/unordered_map.h
@@ -0,0 +1,11 @@
+#ifndef CEPH_UNORDERED_MAP_H
+#define CEPH_UNORDERED_MAP_H
+
+#include <unordered_map>
+
+// Make the standard hash maps available as ceph::unordered_map and
+// ceph::unordered_multimap.
+namespace ceph {
+ using std::unordered_map;
+ using std::unordered_multimap;
+}
+
+#endif
diff --git a/src/include/unordered_set.h b/src/include/unordered_set.h
new file mode 100644
index 000000000..e30e1799e
--- /dev/null
+++ b/src/include/unordered_set.h
@@ -0,0 +1,10 @@
+#ifndef CEPH_UNORDERED_SET_H
+#define CEPH_UNORDERED_SET_H
+
+#include <unordered_set>
+
+// Make the standard hash set available as ceph::unordered_set.
+namespace ceph {
+ using std::unordered_set;
+}
+
+#endif
diff --git a/src/include/uses_allocator.h b/src/include/uses_allocator.h
new file mode 100644
index 000000000..35cdbd709
--- /dev/null
+++ b/src/include/uses_allocator.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+// Derived from:
+/* uses_allocator.h -*-C++-*-
+ *
+ * Copyright (C) 2016 Pablo Halpern <phalpern@halpernwightsoftware.com>
+ * Distributed under the Boost Software License - Version 1.0
+ */
+// Downloaded from https://github.com/phalpern/uses-allocator.git
+
+#pragma once
+
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace ceph {
+
+namespace internal {
+// Construct a T from the elements of tuple `src`: one std::get per index in
+// the sequence, with the tuple's value category forwarded to each element.
+template <class T, class Tuple, std::size_t... Is>
+T make_from_tuple_imp(Tuple&& src, std::index_sequence<Is...>)
+{
+  return T(std::get<Is>(std::forward<Tuple>(src))...);
+}
+} // namespace internal
+
+// Construct a T whose constructor arguments are the elements of
+// `args_tuple`, taken in order.
+template<class T, class Tuple>
+T make_from_tuple(Tuple&& args_tuple)
+{
+  using namespace internal;
+  constexpr std::size_t arity = std::tuple_size_v<std::decay_t<Tuple>>;
+  return make_from_tuple_imp<T>(std::forward<Tuple>(args_tuple),
+                                std::make_index_sequence<arity>{});
+}
+
+////////////////////////////////////////////////////////////////////////
+
+// Forward declaration: the std::pair overloads in namespace internal call this before it is defined.
+template <class T, class Alloc, class... Args>
+auto uses_allocator_construction_args(const Alloc& a, Args&&... args);
+
+namespace internal {
+
+// True exactly when std::uses_allocator<T, A> is true.
+template <class T, class A>
+struct has_allocator : std::uses_allocator<T, A> { };
+
+// Specialization of `has_allocator` for `std::pair`: true when it is true
+// for at least one of the two member types.
+template <class T1, class T2, class A>
+struct has_allocator<std::pair<T1, T2>, A>
+  : std::bool_constant<has_allocator<T1, A>::value ||
+                       has_allocator<T2, A>::value>
+{
+};
+
+template <bool V> using boolean_constant = std::integral_constant<bool, V>;  // shorthand for a bool-valued integral_constant
+
+// Trait: true only for specializations of std::pair.
+template <class T> struct is_pair : std::bool_constant<false> { };
+
+template <class T1, class T2>
+struct is_pair<std::pair<T1, T2>> : std::bool_constant<true> { };
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` and ctor arguments `Args`.
+// This overload handles types for which `has_allocator<T, Alloc>` is false.
+template <class T, class Unused1, class Unused2, class Alloc, class... Args>
+auto uses_allocator_args_imp(Unused1 /* is_pair */,
+                             std::false_type /* has_allocator */,
+                             Unused2 /* uses prefix allocator arg */,
+                             const Alloc& /* ignored */,
+                             Args&&... ctor_args)
+{
+  // The allocator plays no part: the result is a tuple of references to
+  // the constructor arguments, exactly as passed in.
+  return std::tuple<Args&&...>(std::forward<Args>(ctor_args)...);
+}
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` and ctor arguments `Args`.
+// This overload handles non-pair `T` for which `has_allocator<T, Alloc>` is
+// true and constructor `T(allocator_arg_t, a, args...)` is valid.
+template <class T, class Alloc, class... Args>
+auto uses_allocator_args_imp(std::false_type /* is_pair */,
+                             std::true_type /* has_allocator */,
+                             std::true_type /* uses prefix allocator arg */,
+                             const Alloc& a,
+                             Args&&... args)
+{
+  // Allocator added to front of argument list, after `allocator_arg`.
+  using prefixed_args_t =
+    std::tuple<std::allocator_arg_t, const Alloc&, Args&&...>;
+  return prefixed_args_t(std::allocator_arg, a, std::forward<Args>(args)...);
+}
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` and ctor arguments `Args`.
+// This overload handles non-pair `T` for which `has_allocator<T, Alloc>` is
+// true and constructor `T(allocator_arg_t, a, args...)` is NOT valid.
+// This function will produce invalid results unless `T(args..., a)` is valid.
+template <class T, class Alloc, class... Args>
+auto uses_allocator_args_imp(std::false_type /* is_pair */,
+                             std::true_type /* has_allocator */,
+                             std::false_type /* prefix allocator arg */,
+                             const Alloc& a,
+                             Args&&... args)
+{
+  // Allocator added to end of argument list
+  using suffixed_args_t = std::tuple<Args&&..., const Alloc&>;
+  return suffixed_args_t(std::forward<Args>(args)..., a);
+}
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` of a `std::pair` built via `piecewise_construct`.
+// Selected when `T` is a pair and `has_allocator<T, Alloc>` is true. Each of
+// the argument tuples `x` and `y` is expanded and passed, together with `a`,
+// through `uses_allocator_construction_args` for T1 and T2 respectively.
+template <class T, class Alloc, class Tuple1, class Tuple2>
+auto uses_allocator_args_imp(std::true_type /* is_pair */,
+                             std::true_type /* has_allocator */,
+                             std::false_type /* prefix allocator arg */,
+                             const Alloc& a,
+                             std::piecewise_construct_t,
+                             Tuple1&& x, Tuple2&& y)
+{
+  using T1 = typename T::first_type;
+  using T2 = typename T::second_type;
+
+  return std::make_tuple(
+    std::piecewise_construct,
+    std::apply([&a](auto&&... args1) -> auto {
+        return uses_allocator_construction_args<T1>(
+          a, std::forward<decltype(args1)>(args1)...);
+      }, std::forward<Tuple1>(x)),
+    std::apply([&a](auto&&... args2) -> auto {
+        return uses_allocator_construction_args<T2>(
+          a, std::forward<decltype(args2)>(args2)...);
+      }, std::forward<Tuple2>(y))
+    );
+}
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` of a `std::pair` given no constructor arguments.
+// Selected when `T` is a pair and `has_allocator<T, Alloc>` is true.
+// Delegates to the piecewise_construct form with two empty tuples, i.e.
+// neither member receives any caller-supplied constructor argument.
+template <class T, class Alloc>
+auto uses_allocator_args_imp(std::true_type /* is_pair */,
+                             std::true_type /* has_allocator */,
+                             std::false_type /* prefix allocator arg */,
+                             const Alloc& a)
+{
+  // using T1 = typename T::first_type;
+  // using T2 = typename T::second_type;
+
+  // return std::make_tuple(
+  //   piecewise_construct,
+  //   uses_allocator_construction_args<T1>(a),
+  //   uses_allocator_construction_args<T2>(a));
+  return uses_allocator_construction_args<T>(a, std::piecewise_construct,
+                                             std::tuple<>{}, std::tuple<>{});
+}
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` of a `std::pair` from a const lvalue pair `arg`.
+// Selected when `T` is a pair and `has_allocator<T, Alloc>` is true.
+// Delegates to the piecewise_construct form, passing `arg.first` and
+// `arg.second` by reference as the sole argument for each member.
+template <class T, class Alloc, class U1, class U2>
+auto uses_allocator_args_imp(std::true_type /* is_pair */,
+                             std::true_type /* has_allocator */,
+                             std::false_type /* prefix allocator arg */,
+                             const Alloc& a,
+                             const std::pair<U1, U2>& arg)
+{
+  // using T1 = typename T::first_type;
+  // using T2 = typename T::second_type;
+
+  // return std::make_tuple(
+  //   piecewise_construct,
+  //   uses_allocator_construction_args<T1>(a, arg.first),
+  //   uses_allocator_construction_args<T2>(a, arg.second));
+  return uses_allocator_construction_args<T>(a, std::piecewise_construct,
+                                             std::forward_as_tuple(arg.first),
+                                             std::forward_as_tuple(arg.second));
+}
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` of a `std::pair` from an rvalue pair `arg`.
+// Selected when `T` is a pair and `has_allocator<T, Alloc>` is true.
+// Delegates to the piecewise_construct form, forwarding `arg.first` as U1
+// and `arg.second` as U2 as the sole argument for each member.
+template <class T, class Alloc, class U1, class U2>
+auto uses_allocator_args_imp(std::true_type /* is_pair */,
+                             std::true_type /* has_allocator */,
+                             std::false_type /* prefix allocator arg */,
+                             const Alloc& a,
+                             std::pair<U1, U2>&& arg)
+{
+  // using T1 = typename T::first_type;
+  // using T2 = typename T::second_type;
+
+  // return std::make_tuple(
+  //   piecewise_construct,
+  //   uses_allocator_construction_args<T1>(a, forward<U1>(arg.first)),
+  //   uses_allocator_construction_args<T2>(a, forward<U2>(arg.second)));
+  return uses_allocator_construction_args<T>(a, std::piecewise_construct,
+                                             std::forward_as_tuple(std::forward<U1>(arg.first)),
+                                             std::forward_as_tuple(std::forward<U2>(arg.second)));
+}
+
+// Return a tuple of arguments appropriate for uses-allocator construction
+// with allocator `Alloc` of a `std::pair` from two separate arguments.
+// Selected when `T` is a pair and `has_allocator<T, Alloc>` is true.
+// Delegates to the piecewise_construct form, forwarding `arg1` as the sole
+// argument for the first member and `arg2` for the second.
+template <class T, class Alloc, class U1, class U2>
+auto uses_allocator_args_imp(std::true_type /* is_pair */,
+                             std::true_type /* has_allocator */,
+                             std::false_type /* prefix allocator arg */,
+                             const Alloc& a,
+                             U1&& arg1, U2&& arg2)
+{
+  // using T1 = typename T::first_type;
+  // using T2 = typename T::second_type;
+
+  // return std::make_tuple(
+  //   piecewise_construct,
+  //   uses_allocator_construction_args<T1>(a, forward<U1>(arg1)),
+  //   uses_allocator_construction_args<T2>(a, forward<U2>(arg2)));
+  return uses_allocator_construction_args<T>(
+    a, std::piecewise_construct,
+    std::forward_as_tuple(std::forward<U1>(arg1)),
+    std::forward_as_tuple(std::forward<U2>(arg2)));
+}
+
+} // close namespace internal
+
+// Compute the constructor-argument tuple for uses-allocator construction of
+// a `T` from `args` with allocator `a`. The work is dispatched on three
+// compile-time tags: whether `T` is a std::pair, whether `T` uses `Alloc`,
+// and whether `T(allocator_arg_t, Alloc, Args...)` is constructible.
+template <class T, class Alloc, class... Args>
+auto uses_allocator_construction_args(const Alloc& a, Args&&... args)
+{
+  using namespace internal;
+  using pair_tag = is_pair<T>;
+  using alloc_tag = has_allocator<T, Alloc>;
+  using prefix_tag = std::is_constructible<T, std::allocator_arg_t,
+                                           Alloc, Args...>;
+  return uses_allocator_args_imp<T>(pair_tag(), alloc_tag(), prefix_tag(),
+                                    a, std::forward<Args>(args)...);
+}
+
+// Return a T built by uses-allocator construction from `args` and `a`.
+template <class T, class Alloc, class... Args>
+T make_obj_using_allocator(const Alloc& a, Args&&... args)
+{
+  auto ctor_args =
+    uses_allocator_construction_args<T>(a, std::forward<Args>(args)...);
+  return make_from_tuple<T>(std::move(ctor_args));
+}
+
+// Placement-construct a T at `p` by uses-allocator construction from `args`
+// and `a`; returns the pointer produced by the placement new.
+template <class T, class Alloc, class... Args>
+T* uninitialized_construct_using_allocator(T* p,
+                                           const Alloc& a,
+                                           Args&&... args)
+{
+  auto construct_at_p = [p](auto&&... ctor_args) {
+    return ::new(static_cast<void*>(p))
+      T(std::forward<decltype(ctor_args)>(ctor_args)...);
+  };
+  return std::apply(construct_at_p,
+                    uses_allocator_construction_args<T>(
+                      a, std::forward<Args>(args)...));
+}
+
+} // namespace ceph
diff --git a/src/include/util.h b/src/include/util.h
new file mode 100644
index 000000000..acad4a52c
--- /dev/null
+++ b/src/include/util.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#ifndef CEPH_UTIL_H
+#define CEPH_UTIL_H
+
+#include "common/Formatter.h"
+#include "include/types.h"
+
+std::string bytes2str(uint64_t count);
+
+// Byte counters (total / used / available) plus an availability percentage,
+// with Formatter dumping and versioned encode/decode support.
+struct ceph_data_stats
+{
+  uint64_t byte_total;
+  uint64_t byte_used;
+  uint64_t byte_avail;
+  int avail_percent;
+
+  // Every field starts at zero.
+  ceph_data_stats()
+    : byte_total(0), byte_used(0), byte_avail(0), avail_percent(0)
+  { }
+
+  // Emit the four fields as integers named "total", "used", "avail" and
+  // "avail_percent". The formatter pointer is asserted to be non-null.
+  void dump(ceph::Formatter *f) const {
+    ceph_assert(f != nullptr);
+    f->dump_int("total", byte_total);
+    f->dump_int("used", byte_used);
+    f->dump_int("avail", byte_avail);
+    f->dump_int("avail_percent", avail_percent);
+  }
+
+  // Encode the four fields, in declaration order, inside an
+  // ENCODE_START(1, 1, ...) / ENCODE_FINISH envelope.
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(byte_total, bl);
+    encode(byte_used, bl);
+    encode(byte_avail, bl);
+    encode(avail_percent, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Decode the four fields in the same order that encode() wrote them.
+  void decode(ceph::buffer::list::const_iterator &p) {
+    DECODE_START(1, p);
+    decode(byte_total, p);
+    decode(byte_used, p);
+    decode(byte_avail, p);
+    decode(avail_percent, p);
+    DECODE_FINISH(p);
+  }
+
+  // Append two heap-allocated samples to ls: an all-zero instance and a
+  // 1 MiB store that is half used.
+  static void generate_test_instances(std::list<ceph_data_stats*>& ls) {
+    ls.push_back(new ceph_data_stats);
+
+    ls.push_back(new ceph_data_stats);
+    ceph_data_stats& half_full = *ls.back();
+    half_full.byte_total = 1024*1024;
+    half_full.byte_used = 512*1024;
+    half_full.byte_avail = 512*1024;
+    half_full.avail_percent = 50;
+  }
+};
+using ceph_data_stats_t = ceph_data_stats;
+WRITE_CLASS_ENCODER(ceph_data_stats)
+
+int get_fs_stats(ceph_data_stats_t &stats, const char *path);
+
+/// get memory limit for the current cgroup
+int get_cgroup_memory_limit(uint64_t *limit);
+
+/// collect info from @p uname(2), @p /proc/meminfo and @p /proc/cpuinfo
+void collect_sys_info(std::map<std::string, std::string> *m, CephContext *cct);
+
+#ifdef _WIN32
+/// Retrieve the actual Windows version, regardless of the app manifest.
+int get_windows_version(POSVERSIONINFOEXW ver);
+#endif
+
+/// dump service ids grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service id hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(ceph::Formatter* f,
+ const std::map<std::string, std::list<int> >& services,
+ const char* type);
+/// dump service names grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service name hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(ceph::Formatter* f, const std::map<std::string,
+ std::list<std::string> >& services, const char* type);
+
+std::string cleanbin(ceph::buffer::list &bl, bool &b64, bool show = false);
+std::string cleanbin(std::string &str);
+
+namespace ceph::util {
+
+// Returns true if s compares equal to at least one of xs; with no
+// candidates at all the result is false.
+template <typename ...XS>
+bool match_str(const std::string& s, const XS& ...xs)
+{
+  return (false || ... || (s == xs));
+}
+
+} // namespace ceph::util
+#endif /* CEPH_UTIL_H */
diff --git a/src/include/utime.cc b/src/include/utime.cc
new file mode 100644
index 000000000..2252a1ca4
--- /dev/null
+++ b/src/include/utime.cc
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "utime.h"
+#include "common/Formatter.h"
+
+// Write the time to the formatter as two integers, "seconds" and
+// "nanoseconds".
+void utime_t::dump(ceph::Formatter *f) const
+{
+  const auto& ts = tv;
+  f->dump_int("seconds", ts.tv_sec);
+  f->dump_int("nanoseconds", ts.tv_nsec);
+}
+
+// Append three heap-allocated samples to o: a zero time, one whose seconds
+// field holds the largest 32-bit value, and one whose nanoseconds field
+// holds the largest 32-bit value.
+void utime_t::generate_test_instances(std::list<utime_t*>& o)
+{
+  // 2^32 - 1. The shift is done on an unsigned long long because
+  // `1L << 32` is undefined wherever long is only 32 bits wide.
+  const __u32 u32_max = static_cast<__u32>((1ULL << 32) - 1);
+
+  o.push_back(new utime_t());
+  o.push_back(new utime_t());
+  o.back()->tv.tv_sec = u32_max;
+  o.push_back(new utime_t());
+  o.back()->tv.tv_nsec = u32_max;
+}
diff --git a/src/include/utime.h b/src/include/utime.h
new file mode 100644
index 000000000..512149db0
--- /dev/null
+++ b/src/include/utime.h
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_UTIME_H
+#define CEPH_UTIME_H
+
+#include <math.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+
+#if defined(WITH_SEASTAR)
+#include <seastar/core/lowres_clock.hh>
+#endif
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/timegm.h"
+#include "common/strtol.h"
+#include "common/ceph_time.h"
+#include "common/safe_io.h"
+#include "common/SubProcess.h"
+#include "include/denc.h"
+
+
+// --------
+// utime_t
+
+// Saturating narrowing: values above the largest uint32_t collapse to it.
+inline __u32 cap_to_u32_max(__u64 t) {
+  const __u64 limit = (__u64)std::numeric_limits<uint32_t>::max();
+  return std::min(t, limit);
+}
+/* WARNING: If you add a member to utime_t, make sure the encode/decode functions
+ * still work. On little-endian machines they copy the object's raw bytes, so
+ * there must be no padding on either 32-bit or 64-bit machines.
+ * You should also update the padding_check function.
+ */
+class utime_t {
+public:
+ struct {
+ __u32 tv_sec, tv_nsec;
+ } tv;
+
+ public:
+ // True when both the seconds and the nanoseconds fields are zero.
+ bool is_zero() const {
+   return !(tv.tv_sec != 0 || tv.tv_nsec != 0);
+ }
+
+ // Carry whole seconds out of tv_nsec so that it ends up strictly below
+ // one second. The seconds sum is formed in 64 bits and saturates at the
+ // largest 32-bit value instead of wrapping around.
+ void normalize() {
+   constexpr __u32 nsec_per_sec = 1000000000u;
+   // >= rather than >: exactly 1e9 ns is a full second and must carry too.
+   if (tv.tv_nsec >= nsec_per_sec) {
+     // widen before adding so the sum cannot wrap before it is capped,
+     // even where unsigned long is only 32 bits
+     tv.tv_sec = cap_to_u32_max((__u64)tv.tv_sec + tv.tv_nsec / nsec_per_sec);
+     tv.tv_nsec %= nsec_per_sec;
+   }
+ }
+
+ // constructors
+ // Zero time.
+ utime_t() {
+   tv.tv_sec = 0;
+   tv.tv_nsec = 0;
+ }
+ // From seconds and nanoseconds; normalize() runs on the result.
+ utime_t(time_t s, int n) {
+   tv.tv_sec = s;
+   tv.tv_nsec = n;
+   normalize();
+ }
+ // From a ceph_timespec (fields copied by decode_timeval).
+ utime_t(const struct ceph_timespec &v) {
+   decode_timeval(&v);
+ }
+ // From a struct timespec; the fields are copied without normalization.
+ utime_t(const struct timespec v)
+ {
+   // NOTE: this is used by ceph_clock_now() so should be kept
+   // as thin as possible.
+   tv.tv_nsec = v.tv_nsec;
+   tv.tv_sec = v.tv_sec;
+ }
+ // conversion from ceph::real_time/coarse_real_time: enabled only for clocks where ceph::converts_to_timespec_v<Clock> holds
+ template <typename Clock, typename std::enable_if_t<
+             ceph::converts_to_timespec_v<Clock>>* = nullptr>
+ explicit utime_t(const std::chrono::time_point<Clock>& t)
+   : utime_t(Clock::to_timespec(t)) {} // forward to timespec ctor
+
+ // From a duration: split into whole seconds and the sub-second remainder
+ // in nanoseconds. Each part is clamped so that it is never below zero.
+ template<class Rep, class Period>
+ explicit utime_t(const std::chrono::duration<Rep, Period>& dur) {
+   using common_t = std::common_type_t<Rep, int>;
+   const auto whole_secs =
+     std::chrono::duration_cast<std::chrono::seconds>(dur).count();
+   const auto sub_nsecs =
+     (std::chrono::duration_cast<std::chrono::nanoseconds>(dur) %
+      std::chrono::seconds(1)).count();
+   tv.tv_sec = std::max<common_t>(whole_secs, 0);
+   tv.tv_nsec = std::max<common_t>(sub_nsecs, 0);
+ }
+#if defined(WITH_SEASTAR)
+ // From a seastar low-resolution system clock time point: whole seconds
+ // since the epoch plus the sub-second remainder in nanoseconds.
+ explicit utime_t(const seastar::lowres_system_clock::time_point& t) {
+   const auto since_epoch = t.time_since_epoch();
+   tv.tv_sec =
+     std::chrono::duration_cast<std::chrono::seconds>(since_epoch).count();
+   tv.tv_nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(
+     since_epoch % std::chrono::seconds(1)).count();
+ }
+ // Back to a seastar low-resolution system clock time point.
+ explicit operator seastar::lowres_system_clock::time_point() const noexcept {
+   using clock_t = seastar::lowres_system_clock;
+   const auto since_epoch =
+     std::chrono::seconds{tv.tv_sec} + std::chrono::nanoseconds{tv.tv_nsec};
+   return clock_t::time_point{
+     std::chrono::duration_cast<clock_t::duration>(since_epoch)};
+ }
+#endif
+
+ // From a struct timeval: microseconds are scaled to nanoseconds.
+ utime_t(const struct timeval &v) {
+   tv.tv_sec = v.tv_sec;
+   tv.tv_nsec = v.tv_usec*1000;
+ }
+ // Same conversion, taking the timeval through a pointer.
+ utime_t(const struct timeval *v) {
+   tv.tv_sec = v->tv_sec;
+   tv.tv_nsec = v->tv_usec*1000;
+ }
+ // Store seconds and nanoseconds into *ts.
+ void to_timespec(struct timespec *ts) const {
+   struct timespec& out = *ts;
+   out.tv_sec = tv.tv_sec;
+   out.tv_nsec = tv.tv_nsec;
+ }
+ // Set from seconds expressed as a double: the truncated value becomes the
+ // seconds field, and what is left over, scaled by 1e9, the nanoseconds.
+ void set_from_double(double d) {
+   const double whole = trunc(d);
+   tv.tv_sec = (__u32)whole;
+   // the remainder is measured against the stored (32-bit) seconds value
+   const double frac = d - (double)tv.tv_sec;
+   tv.tv_nsec = (__u32)(frac * 1000000000.0);
+ }
+
+ // Convert to ceph::real_time by way of a ceph_timespec.
+ ceph::real_time to_real_time() const {
+   ceph_timespec wire_ts;
+   encode_timeval(&wire_ts);
+   return ceph::real_clock::from_ceph_timespec(wire_ts);
+ }
+
+ // accessors (read-only views of the two stored fields)
+ time_t sec() const { return tv.tv_sec; }  // whole seconds
+ long usec() const { return tv.tv_nsec/1000; }  // sub-second part in microseconds (integer division)
+ int nsec() const { return tv.tv_nsec; }  // sub-second part in nanoseconds
+
+ // ref accessors/modifiers: mutable references to the raw fields; nothing here normalizes after a write
+ __u32& sec_ref() { return tv.tv_sec; }
+ __u32& nsec_ref() { return tv.tv_nsec; }
+
+ // Total value expressed in nanoseconds.
+ uint64_t to_nsec() const {
+   const uint64_t secs_as_nsec = (uint64_t)tv.tv_sec * 1000000000ull;
+   return (uint64_t)tv.tv_nsec + secs_as_nsec;
+ }
+ // Total value expressed in milliseconds; the sub-millisecond part is dropped.
+ uint64_t to_msec() const {
+   const uint64_t secs_as_msec = (uint64_t)tv.tv_sec * 1000ull;
+   return (uint64_t)tv.tv_nsec / 1000000ull + secs_as_msec;
+ }
+
+ // Store into *v, scaling nanoseconds down to microseconds.
+ void copy_to_timeval(struct timeval *v) const {
+   struct timeval& out = *v;
+   out.tv_sec = tv.tv_sec;
+   out.tv_usec = tv.tv_nsec/1000;
+ }
+ // Load from *v, scaling microseconds up to nanoseconds.
+ void set_from_timeval(const struct timeval *v) {
+   const struct timeval& in = *v;
+   tv.tv_sec = in.tv_sec;
+   tv.tv_nsec = in.tv_usec*1000;
+ }
+ // Compile-time check that utime_t is exactly as large as its two fields,
+ // i.e. that the compiler inserted no padding.
+ void padding_check() {
+   static_assert(sizeof(tv.tv_sec) + sizeof(tv.tv_nsec) == sizeof(utime_t),
+                 "utime_t have padding");
+ }
+ // Append seconds then nanoseconds to bl. On little-endian builds the
+ // object's own bytes are appended directly; otherwise each field goes
+ // through ceph::encode.
+ void encode(ceph::buffer::list &bl) const {
+#if defined(CEPH_LITTLE_ENDIAN)
+   bl.append(reinterpret_cast<const char *>(this), 2 * sizeof(__u32));
+#else
+   using ceph::encode;
+   encode(tv.tv_sec, bl);
+   encode(tv.tv_nsec, bl);
+#endif
+ }
+ // Read seconds then nanoseconds from p, mirroring encode().
+ void decode(ceph::buffer::list::const_iterator &p) {
+#if defined(CEPH_LITTLE_ENDIAN)
+   p.copy(2 * sizeof(__u32), reinterpret_cast<char *>(this));
+#else
+   using ceph::decode;
+   decode(tv.tv_sec, p);
+   decode(tv.tv_nsec, p);
+#endif
+ }
+
+ DENC(utime_t, v, p) {  // denc description of the layout: seconds first, then nanoseconds
+   denc(v.tv.tv_sec, p);
+   denc(v.tv.tv_nsec, p);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<utime_t*>& o);
+
+ // Copy both fields into the ceph_timespec *t.
+ void encode_timeval(struct ceph_timespec *t) const {
+   struct ceph_timespec& out = *t;
+   out.tv_sec = tv.tv_sec;
+   out.tv_nsec = tv.tv_nsec;
+ }
+ // Copy both fields out of the ceph_timespec *t.
+ void decode_timeval(const struct ceph_timespec *t) {
+   const struct ceph_timespec& in = *t;
+   tv.tv_sec = in.tv_sec;
+   tv.tv_nsec = in.tv_nsec;
+ }
+
+ // Start of the minute containing this time, evaluated in local time.
+ // Despite the name this only ever truncates: seconds are zeroed and the
+ // sub-second part is dropped.
+ utime_t round_to_minute() {
+   time_t secs = sec();
+   struct tm local_bdt;
+   localtime_r(&secs, &local_bdt);
+   local_bdt.tm_sec = 0;
+   secs = mktime(&local_bdt);
+   return utime_t(secs, 0);
+ }
+
+ // Start of the hour containing this time, evaluated in local time.
+ // Despite the name this only ever truncates: minutes and seconds are
+ // zeroed and the sub-second part is dropped.
+ utime_t round_to_hour() {
+   time_t secs = sec();
+   struct tm local_bdt;
+   localtime_r(&secs, &local_bdt);
+   local_bdt.tm_min = 0;
+   local_bdt.tm_sec = 0;
+   secs = mktime(&local_bdt);
+   return utime_t(secs, 0);
+ }
+
+ // Start of the day containing this time, evaluated in local time.
+ // Despite the name this only ever truncates: hours, minutes and seconds
+ // are zeroed and the sub-second part is dropped.
+ utime_t round_to_day() {
+   time_t secs = sec();
+   struct tm local_bdt;
+   localtime_r(&secs, &local_bdt);
+   local_bdt.tm_hour = 0;
+   local_bdt.tm_min = 0;
+   local_bdt.tm_sec = 0;
+   secs = mktime(&local_bdt);
+   return utime_t(secs, 0);
+ }
+
+ // cast to double: seconds plus the nanosecond part as a fraction
+ operator double() const {
+   const long double frac = (double)nsec() / 1000000000.0L;
+   return (double)sec() + frac;
+ }
+ // cast to ceph_timespec: both fields copied through sec() and nsec()
+ operator ceph_timespec() const {
+   ceph_timespec out;
+   out.tv_sec = sec();
+   out.tv_nsec = nsec();
+   return out;
+ }
+
+ // Call nanosleep() once with this value as the requested interval; the
+ // return value and any remaining time are not inspected.
+ void sleep() const {
+   struct timespec interval;
+   to_timespec(&interval);
+   nanosleep(&interval, nullptr);
+ }
+
+ // output in UTC; legacy_form puts ' ' instead of 'T' between date and time. Returns out.
+ std::ostream& gmtime(std::ostream& out, bool legacy_form=false) const {
+   out.setf(std::ios::right);
+   char oldfill = out.fill();
+   out.fill('0');
+   if (sec() < ((time_t)(60*60*24*365*10))) {
+     // raw seconds. this looks like a relative time. (below ten 365-day years; printed as sec.usec)
+     out << (long)sec() << "." << std::setw(6) << usec();
+   } else {
+     // this looks like an absolute time.
+     // conform to http://en.wikipedia.org/wiki/ISO_8601
+     struct tm bdt;
+     time_t tt = sec();
+     gmtime_r(&tt, &bdt);
+     out << std::setw(4) << (bdt.tm_year+1900)  // full four-digit year
+         << '-' << std::setw(2) << (bdt.tm_mon+1)
+         << '-' << std::setw(2) << bdt.tm_mday;
+     if (legacy_form) {
+       out << ' ';
+     } else {
+       out << 'T';
+     }
+     out << std::setw(2) << bdt.tm_hour
+         << ':' << std::setw(2) << bdt.tm_min
+         << ':' << std::setw(2) << bdt.tm_sec;
+     out << "." << std::setw(6) << usec();
+     out << "Z";
+   }
+   out.fill(oldfill);  // the fill character is restored; the `right` flag is simply cleared below
+   out.unsetf(std::ios::right);
+   return out;
+ }
+
+ // output in UTC with 9 fractional digits (nanoseconds) for absolute times; always uses 'T'. Returns out.
+ std::ostream& gmtime_nsec(std::ostream& out) const {
+   out.setf(std::ios::right);
+   char oldfill = out.fill();
+   out.fill('0');
+   if (sec() < ((time_t)(60*60*24*365*10))) {
+     // raw seconds. this looks like a relative time. (still printed with 6-digit microseconds here)
+     out << (long)sec() << "." << std::setw(6) << usec();
+   } else {
+     // this looks like an absolute time.
+     // conform to http://en.wikipedia.org/wiki/ISO_8601
+     struct tm bdt;
+     time_t tt = sec();
+     gmtime_r(&tt, &bdt);
+     out << std::setw(4) << (bdt.tm_year+1900)  // full four-digit year
+         << '-' << std::setw(2) << (bdt.tm_mon+1)
+         << '-' << std::setw(2) << bdt.tm_mday
+         << 'T'
+         << std::setw(2) << bdt.tm_hour
+         << ':' << std::setw(2) << bdt.tm_min
+         << ':' << std::setw(2) << bdt.tm_sec;
+     out << "." << std::setw(9) << nsec();
+     out << "Z";
+   }
+   out.fill(oldfill);
+   out.unsetf(std::ios::right);
+   return out;
+ }
+
+ // Write this time to out. Values below ten 365-day years are treated as
+ // relative and printed as "sec.usec"; anything else is broken down in UTC
+ // and printed in asctime_r() form without the trailing newline. If
+ // asctime_r() fails, nothing is written for the absolute case.
+ // Returns out.
+ std::ostream& asctime(std::ostream& out) const {
+   out.setf(std::ios::right);
+   char oldfill = out.fill();
+   out.fill('0');
+   if (sec() < ((time_t)(60*60*24*365*10))) {
+     // raw seconds. this looks like a relative time.
+     out << (long)sec() << "." << std::setw(6) << usec();
+   } else {
+     // this looks like an absolute time.
+     struct tm bdt;
+     time_t tt = sec();
+     gmtime_r(&tt, &bdt);
+
+     char buf[128];
+     // asctime_r() returns NULL on failure and then leaves buf unspecified,
+     // so only touch buf when the call succeeded.
+     if (asctime_r(&bdt, buf) != NULL) {
+       size_t len = strlen(buf);
+       // guard len so an empty string can never index buf[-1]
+       if (len > 0 && buf[len - 1] == '\n')
+         buf[len - 1] = '\0';
+       out << buf;
+     }
+   }
+   out.fill(oldfill);
+   out.unsetf(std::ios::right);
+   return out;
+ }
+ // output in local time; legacy_form puts ' ' instead of 'T' and omits the trailing "%z" zone offset. Returns out.
+ std::ostream& localtime(std::ostream& out, bool legacy_form=false) const {
+   out.setf(std::ios::right);
+   char oldfill = out.fill();
+   out.fill('0');
+   if (sec() < ((time_t)(60*60*24*365*10))) {
+     // raw seconds. this looks like a relative time. (below ten 365-day years; printed as sec.usec)
+     out << (long)sec() << "." << std::setw(6) << usec();
+   } else {
+     // this looks like an absolute time.
+     // conform to http://en.wikipedia.org/wiki/ISO_8601
+     struct tm bdt;
+     time_t tt = sec();
+     localtime_r(&tt, &bdt);
+     out << std::setw(4) << (bdt.tm_year+1900)  // full four-digit year
+         << '-' << std::setw(2) << (bdt.tm_mon+1)
+         << '-' << std::setw(2) << bdt.tm_mday;
+     if (legacy_form) {
+       out << ' ';
+     } else {
+       out << 'T';
+     }
+     out << std::setw(2) << bdt.tm_hour
+         << ':' << std::setw(2) << bdt.tm_min
+         << ':' << std::setw(2) << bdt.tm_sec;
+     out << "." << std::setw(6) << usec();
+     if (!legacy_form) {
+       char buf[32] = { 0 };  // zero-filled, so buf is an empty string if strftime writes nothing
+       strftime(buf, sizeof(buf), "%z", &bdt);
+       out << buf;
+     }
+   }
+   out.fill(oldfill);
+   out.unsetf(std::ios::right);
+   return out;
+ }
+
+ // Run `/bin/date -d <date_str> "+%s %N"` and parse its output ("seconds
+ // nanoseconds") into *result.
+ // @param date_str  date expression handed to date's -d option
+ // @param result    receives the parsed time on success
+ // @return 0 on success; the negative value from spawn() if the process
+ //         could not be started; -EINVAL if date failed, produced no
+ //         output, or its output could not be parsed as two integers
+ static int invoke_date(const std::string& date_str, utime_t *result) {
+   char buf[256];
+
+   SubProcess bin_date("/bin/date", SubProcess::CLOSE, SubProcess::PIPE,
+                       SubProcess::KEEP);
+   bin_date.add_cmd_args("-d", date_str.c_str(), "+%s %N", NULL);
+
+   int r = bin_date.spawn();
+   if (r < 0) return r;
+
+   ssize_t n = safe_read(bin_date.get_stdout(), buf, sizeof(buf));
+
+   r = bin_date.join();
+   if (r || n <= 0) return -EINVAL;
+
+   // Only the first n bytes of buf were filled in and they carry no NUL
+   // terminator, so hand the stream an explicitly sized string rather
+   // than treating buf as a C string.
+   std::istringstream iss(std::string(buf, n));
+
+   uint64_t epoch = 0, nsec = 0;
+   if (!(iss >> epoch >> nsec)) {
+     return -EINVAL;
+   }
+
+   *result = utime_t(epoch, nsec);
+
+   return 0;
+ }
+
+
+ // Parse a date/time string. Two input shapes are accepted:
+ //  - "YYYY-MM-DD", optionally followed by ' ' or 'T' and "HH:MM:SS",
+ //    an optional ".fraction" and an optional +/- zone offset;
+ //  - "<sec>.<usec>" given as two decimal integers.
+ // @param date      text to parse
+ // @param epoch     if non-null, receives the seconds since the epoch with
+ //                  the zone offset subtracted
+ // @param nsec      if non-null, receives the sub-second part in nanoseconds
+ // @param out_date  if non-null, receives the parsed date as "%Y-%m-%d"
+ // @param out_time  if non-null, receives the parsed time as "%H:%M:%S"
+ // @return 0 on success, -EINVAL if the text cannot be parsed
+ static int parse_date(const std::string& date, uint64_t *epoch, uint64_t *nsec,
+                       std::string *out_date=nullptr,
+                       std::string *out_time=nullptr) {
+   struct tm tm;
+   memset(&tm, 0, sizeof(tm));
+
+   if (nsec)
+     *nsec = 0;
+
+   const char *p = strptime(date.c_str(), "%Y-%m-%d", &tm);
+   if (p) {
+     if (*p == ' ' || *p == 'T') {
+       p++;
+       // strptime doesn't understand fractional/decimal seconds, and
+       // it also only takes format chars or literals, so we have to
+       // get creative.
+       char fmt[32] = {0};
+       strncpy(fmt, p, sizeof(fmt) - 1);
+       // overwrite "HH:MM:SS" with "%H:%M:%S"; fmt[5] keeps the input's
+       // own separator character
+       fmt[0] = '%';
+       fmt[1] = 'H';
+       fmt[2] = ':';
+       fmt[3] = '%';
+       fmt[4] = 'M';
+       fmt[6] = '%';
+       fmt[7] = 'S';
+       const char *subsec = 0;
+       char *q = fmt + 8;
+       if (*q == '.') {
+         ++q;
+         subsec = p + 9;
+         q = fmt + 9;
+         // cast: isdigit() is only defined for unsigned char values and EOF
+         while (*q && isdigit((unsigned char)*q)) {
+           ++q;
+         }
+       }
+       // look for tz...
+       if (*q == '-' || *q == '+') {
+         *q = '%';
+         *(q+1) = 'z';
+         *(q+2) = 0;
+       }
+       p = strptime(p, fmt, &tm);
+       if (!p) {
+         return -EINVAL;
+       }
+       if (nsec && subsec) {
+         unsigned i;
+         char buf[10]; /* 9 digit + null termination */
+         for (i = 0; (i < sizeof(buf) - 1) && isdigit((unsigned char)*subsec); ++i, ++subsec) {
+           buf[i] = *subsec;
+         }
+         // right-pad with zeros so the digits read as nanoseconds
+         for (; i < sizeof(buf) - 1; ++i) {
+           buf[i] = '0';
+         }
+         buf[i] = '\0';
+         std::string err;
+         *nsec = (uint64_t)strict_strtol(buf, 10, &err);
+         if (!err.empty()) {
+           return -EINVAL;
+         }
+       }
+     }
+   } else {
+     int sec, usec;
+     int r = sscanf(date.c_str(), "%d.%d", &sec, &usec);
+     if (r != 2) {
+       return -EINVAL;
+     }
+
+     time_t tt = sec;
+     gmtime_r(&tt, &tm);
+
+     if (nsec) {
+       *nsec = (uint64_t)usec * 1000;
+     }
+   }
+
+   #ifndef _WIN32
+   // apply the tm_gmtoff manually below, since none of mktime,
+   // gmtime, and localtime seem to do it. zero it out here just in
+   // case some other libc *does* apply it. :(
+   auto gmtoff = tm.tm_gmtoff;
+   tm.tm_gmtoff = 0;
+   #else
+   auto gmtoff = _timezone;
+   #endif /* _WIN32 */
+
+   time_t t = internal_timegm(&tm);
+   // epoch is optional: both the store and the offset adjustment must stay
+   // behind the null check
+   if (epoch) {
+     *epoch = (uint64_t)t;
+     *epoch -= gmtoff;
+   }
+
+   if (out_date) {
+     char buf[32];
+     strftime(buf, sizeof(buf), "%Y-%m-%d", &tm);
+     *out_date = buf;
+   }
+   if (out_time) {
+     char buf[32];
+     strftime(buf, sizeof(buf), "%H:%M:%S", &tm);
+     *out_time = buf;
+   }
+
+   return 0;
+ }
+
+ // Replace *this with the time parsed from s by parse_date(). Returns false
+ // and leaves *this untouched when parsing fails.
+ bool parse(const std::string& s) {
+   uint64_t epoch, nsec;
+   if (parse_date(s, &epoch, &nsec) < 0)
+     return false;
+   *this = utime_t(epoch, nsec);
+   return true;
+ }
+};
+WRITE_CLASS_ENCODER(utime_t)  // registers the encode()/decode() members
+WRITE_CLASS_DENC(utime_t)  // registers the DENC() description
+
+// arithmetic operators
+// Sum of two times. Seconds are added in 64 bits and saturate at the
+// largest 32-bit value; the utime_t constructor normalizes the nanoseconds.
+inline utime_t operator+(const utime_t& l, const utime_t& r) {
+  const __u64 sec_sum = (__u64)l.sec() + r.sec();
+  const int nsec_sum = l.nsec() + r.nsec();
+  return utime_t(cap_to_u32_max(sec_sum), nsec_sum);
+}
+// Add r to l in place: seconds saturate at the largest 32-bit value and the
+// result is normalized.
+inline utime_t& operator+=(utime_t& l, const utime_t& r) {
+  const __u64 sec_sum = (__u64)l.sec() + r.sec();
+  l.sec_ref() = cap_to_u32_max(sec_sum);
+  l.nsec_ref() += r.nsec();
+  l.normalize();
+  return l;
+}
+// Add f seconds to l in place: the truncated part goes to the seconds field
+// (saturating at the largest 32-bit value), the rest to nanoseconds, and
+// the result is normalized.
+inline utime_t& operator+=(utime_t& l, double f) {
+  const double whole = trunc(f);
+  const double frac_ns = (f - whole) * 1000000000.0;
+  l.sec_ref() = cap_to_u32_max(l.sec() + (__u64)whole);
+  l.nsec_ref() += (long)frac_ns;
+  l.normalize();
+  return l;
+}
+
+// Difference l - r, borrowing one second when l has fewer nanoseconds
+// than r.
+inline utime_t operator-(const utime_t& l, const utime_t& r) {
+  const bool borrow = l.nsec() < r.nsec();
+  return utime_t(l.sec() - r.sec() - (borrow ? 1 : 0),
+                 l.nsec() - r.nsec() + (borrow ? 1000000000 : 0));
+}
+// Subtract r from l in place, borrowing one second when l has fewer
+// nanoseconds than r.
+inline utime_t& operator-=(utime_t& l, const utime_t& r) {
+  l.sec_ref() -= r.sec();
+  if (l.nsec() < r.nsec()) {
+    l.nsec_ref() += 1000000000L - r.nsec();
+    l.sec_ref()--;
+  } else {
+    l.nsec_ref() -= r.nsec();
+  }
+  return l;
+}
+// Subtract f seconds from l in place.
+// The truncated part of f is taken from the seconds field and the rest from
+// the nanoseconds field. A second is borrowed only when the nanoseconds
+// would go negative, and carried when they reach a full second (which
+// happens for negative f). Borrowing unconditionally and relying on a
+// saturating re-normalization would turn e.g. 0.7 - 0.5 into a value near
+// the 32-bit maximum.
+// NOTE(review): the seconds subtraction itself still wraps if f is larger
+// than l, as it always did.
+inline utime_t& operator-=(utime_t& l, double f) {
+  double fs = trunc(f);
+  double ns = (f - fs) * 1000000000.0;
+  l.sec_ref() -= (long)fs;
+  // signed 64-bit so that both a borrow (< 0) and a carry (>= 1s) are
+  // representable regardless of the width of long
+  int64_t nsec = (int64_t)l.nsec_ref() - (int64_t)ns;
+  if (nsec < 0) {
+    nsec += 1000000000;
+    l.sec_ref()--;
+  } else if (nsec >= 1000000000) {
+    l.sec_ref() = cap_to_u32_max((__u64)l.sec_ref() + (__u64)(nsec / 1000000000));
+    nsec %= 1000000000;
+  }
+  l.nsec_ref() = (__u32)nsec;
+  return l;
+}
+
+
+// comparators
+// Ordering is by seconds first, with nanoseconds breaking ties.
+inline bool operator>(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return a.sec() > b.sec();
+  return a.nsec() > b.nsec();
+}
+inline bool operator<=(const utime_t& a, const utime_t& b)
+{
+  return !(a > b);
+}
+inline bool operator<(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return a.sec() < b.sec();
+  return a.nsec() < b.nsec();
+}
+inline bool operator>=(const utime_t& a, const utime_t& b)
+{
+  return !(a < b);
+}
+
+// Equality requires both fields to match.
+inline bool operator==(const utime_t& a, const utime_t& b)
+{
+  return !(a.sec() != b.sec() || a.nsec() != b.nsec());
+}
+inline bool operator!=(const utime_t& a, const utime_t& b)
+{
+  return !(a.sec() == b.sec() && a.nsec() == b.nsec());
+}
+
+
+// output
+
+// ostream
+// Stream a utime_t using its localtime() formatting with default arguments.
+inline std::ostream& operator<<(std::ostream& out, const utime_t& t)
+{
+  std::ostream& same_stream = t.localtime(out);
+  return same_stream;
+}
+
+// Convert age to a ceph::timespan (nanoseconds plus seconds) and return
+// what ceph::timespan_str() produces for it.
+inline std::string utimespan_str(const utime_t& age) {
+  const auto whole = std::chrono::seconds(age.sec());
+  const auto frac = ceph::timespan(age.nsec());
+  return ceph::timespan_str(frac + whole);
+}
+
+#endif
diff --git a/src/include/uuid.cc b/src/include/uuid.cc
new file mode 100644
index 000000000..106fc1db5
--- /dev/null
+++ b/src/include/uuid.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "uuid.h"
+#include "common/Formatter.h"
+
+// Write the UUID's text form to the formatter under the key "uuid".
+void uuid_d::dump(ceph::Formatter *f) const
+{
+  const std::string text = to_string();
+  f->dump_stream("uuid") << text;
+}
+
+// Append four heap-allocated samples to o, each parsed from a different
+// textual spelling (braced, wide literal, undashed std::string,
+// std::wstring).
+void uuid_d::generate_test_instances(std::list<uuid_d*>& o)
+{
+  // these are sourced from examples at
+  // https://www.boost.org/doc/libs/1_62_0/libs/uuid/uuid.html#Synopsis_generators
+  boost::uuids::string_generator gen;
+
+  // push a fresh instance first, then hand it back to be filled in
+  auto append_blank = [&o]() -> uuid_d& {
+    o.push_back(new uuid_d());
+    return *o.back();
+  };
+
+  uuid_d& braced = append_blank();
+  braced.uuid = gen("{01234567-89ab-cdef-0123-456789abcdef}");
+  uuid_d& wide_literal = append_blank();
+  wide_literal.uuid = gen(L"01234567-89ab-cdef-0123-456789abcdef");
+  uuid_d& undashed = append_blank();
+  undashed.uuid = gen(std::string("0123456789abcdef0123456789abcdef"));
+  uuid_d& wide_string = append_blank();
+  wide_string.uuid = gen(std::wstring(L"01234567-89ab-cdef-0123-456789abcdef"));
+}
diff --git a/src/include/uuid.h b/src/include/uuid.h
new file mode 100644
index 000000000..cc735025a
--- /dev/null
+++ b/src/include/uuid.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef _CEPH_UUID_H
+#define _CEPH_UUID_H
+
+/*
+ * Thin C++ wrapper around boost::uuids::uuid.
+ */
+
+#include "encoding.h"
+#include "random.h"
+
+#include <ostream>
+#include <random>
+
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+namespace ceph {
+ class Formatter;
+}
+
+/*
+ * Value type wrapping a single boost::uuids::uuid, with helpers for
+ * parsing, printing and denc encoding/decoding of the raw value.
+ */
+struct uuid_d {
+  boost::uuids::uuid uuid;
+
+  // Start out as the nil UUID.
+  uuid_d() {
+    boost::uuids::nil_generator gen;
+    uuid = gen();
+  }
+
+  // True when this is the nil UUID.
+  bool is_zero() const {
+    return uuid.is_nil();
+  }
+
+  // Replace the value with a freshly generated random UUID, seeded from
+  // random_device_t.
+  void generate_random() {
+    random_device_t rng;
+    boost::uuids::basic_random_generator gen(rng);
+    uuid = gen();
+  }
+
+  // Parse the textual UUID in `s`.  Returns true on success.  Returns false,
+  // leaving the stored value untouched, when `s` is null or the generator
+  // rejects the text.
+  bool parse(const char *s) {
+    if (s == nullptr) {
+      // the string generator would dereference the pointer
+      return false;
+    }
+    try {
+      boost::uuids::string_generator gen;
+      uuid = gen(s);
+      return true;
+    } catch (const std::runtime_error&) {
+      return false;
+    }
+  }
+
+  // Copy the textual form into `s`.  Exactly 37 bytes are written (the text
+  // plus its terminating NUL), so `s` must have room for at least that many.
+  void print(char *s) const {
+    memcpy(s, boost::uuids::to_string(uuid).c_str(), 37);
+  }
+
+  // Textual form as produced by boost::uuids::to_string().
+  std::string to_string() const {
+    return boost::uuids::to_string(uuid);
+  }
+
+  // Pointer to the raw bytes of the UUID.
+  const char *bytes() const {
+    return (const char*)uuid.data;
+  }
+
+  // denc: append the raw sizeof(uuid) bytes.
+  void encode(::ceph::buffer::list::contiguous_appender& p) const {
+    p.append(reinterpret_cast<const char *>(&uuid), sizeof(uuid));
+  }
+
+  // denc: upper bound on the encoded size.
+  void bound_encode(size_t& p) const {
+    p += sizeof(uuid);
+  }
+
+  // denc: overwrite this object with the next sizeof(*this) bytes from `p`;
+  // asserts that at least that many bytes remain.
+  void decode(::ceph::buffer::ptr::const_iterator& p) {
+    assert((p.get_end() - p.get_pos()) >= (int)sizeof(*this));
+    memcpy((char *)this, p.get_pos_add(sizeof(*this)), sizeof(*this));
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<uuid_d*>& o);
+};
+WRITE_CLASS_DENC_BOUNDED(uuid_d)
+
+// Stream insertion: print the UUID into a 37-byte scratch buffer and write
+// that out as a C string.
+inline std::ostream& operator<<(std::ostream& out, const uuid_d& u) {
+  char text[37];
+  u.print(text);
+  out << text;
+  return out;
+}
+
+// Equality is decided by the wrapped boost uuid values.
+inline bool operator==(const uuid_d& l, const uuid_d& r) {
+  const bool same = (l.uuid == r.uuid);
+  return same;
+}
+// Inequality is decided by the wrapped boost uuid values.
+inline bool operator!=(const uuid_d& l, const uuid_d& r) {
+  const bool differ = (l.uuid != r.uuid);
+  return differ;
+}
+// Ordering for use in sorted containers.  The boost uuids are compared
+// directly; this yields the same order as comparing the lower-case hex text
+// forms (bytes are printed in order, high nibble first, and '0'-'9' sort
+// before 'a'-'f') without building two temporary strings per comparison.
+inline bool operator<(const uuid_d& l, const uuid_d& r) {
+  return l.uuid < r.uuid;
+}
+
+
+#endif
diff --git a/src/include/win32/arpa/inet.h b/src/include/win32/arpa/inet.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/arpa/inet.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/fs_compat.h b/src/include/win32/fs_compat.h
new file mode 100644
index 000000000..c3405e670
--- /dev/null
+++ b/src/include/win32/fs_compat.h
@@ -0,0 +1,36 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2021 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// These definitions allow handling information coming from Ceph; the values
+// should not be passed to Windows functions.
+
+// File-type value for symbolic links in a mode word.
+#define S_IFLNK 0120000
+
+// Test whether the file-type bits of mode `m` equal `TYPE`.  Both arguments
+// are parenthesized so that expression arguments (e.g. `a | b`) expand with
+// the intended precedence.
+#define S_ISTYPE(m, TYPE) (((m) & S_IFMT) == (TYPE))
+#define S_ISLNK(m) S_ISTYPE(m, S_IFLNK)
+// Set-user-ID, set-group-ID and sticky mode bits.
+#define S_ISUID 04000
+#define S_ISGID 02000
+#define S_ISVTX 01000
+
+// flock()-style lock operation flags.
+// NOTE(review): the values look like the Linux ones -- confirm they must
+// match what Ceph sends.
+#define LOCK_SH 1
+#define LOCK_EX 2
+#define LOCK_NB 4
+#define LOCK_UN 8
+#define LOCK_MAND 32
+#define LOCK_READ 64
+#define LOCK_WRITE 128
+// LOCK_READ | LOCK_WRITE (64 + 128)
+#define LOCK_RW 192
+
+// Flags in the style of the POSIX *at() family of calls.
+#define AT_SYMLINK_NOFOLLOW 0x100
+#define AT_REMOVEDIR 0x200
+
+// NOTE(review): presumably the limit on symlinks followed during a path
+// walk -- confirm against the users of this macro.
+#define MAXSYMLINKS 65000
diff --git a/src/include/win32/ifaddrs.h b/src/include/win32/ifaddrs.h
new file mode 100644
index 000000000..45e1a362c
--- /dev/null
+++ b/src/include/win32/ifaddrs.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2002-2016 Free Software Foundation, Inc.
+ * Copyright (C) 2019 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef IFADDRS_H
+#define IFADDRS_H
+
+#include "winsock_compat.h"
+#include <ifdef.h>
+
+// List node handed out by getifaddrs() below.  The first five members carry
+// the names used by the POSIX-style ifaddrs structure; the remaining ones
+// are additions specific to this header.
+struct ifaddrs {
+ struct ifaddrs *ifa_next; /* Next item in list */
+ char *ifa_name; /* Name of interface */
+ unsigned int ifa_flags; /* Flags from SIOCGIFFLAGS */
+ struct sockaddr *ifa_addr; /* Address of interface */
+ struct sockaddr *ifa_netmask; /* Netmask of interface */
+
+ // NOTE(review): presumably the storage that ifa_addr / ifa_netmask point
+ // into -- confirm against the getifaddrs() implementation.
+ struct sockaddr_storage in_addrs;
+ struct sockaddr_storage in_netmasks;
+
+ // NOTE(review): presumably the adapter name and its link speed; the units
+ // of `speed` are not visible here -- confirm.
+ char ad_name[IF_MAX_STRING_SIZE];
+ size_t speed;
+};
+
+// Declarations only; the definitions are not part of this header.
+// NOTE(review): presumably getifaddrs() stores the head of a list in *ifap
+// and freeifaddrs() releases such a list -- confirm against the
+// implementation.
+int getifaddrs(struct ifaddrs **ifap);
+void freeifaddrs(struct ifaddrs *ifa);
+
+#endif
diff --git a/src/include/win32/netdb.h b/src/include/win32/netdb.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/netdb.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/netinet/in.h b/src/include/win32/netinet/in.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/netinet/in.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/netinet/ip.h b/src/include/win32/netinet/ip.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/include/win32/netinet/ip.h
diff --git a/src/include/win32/netinet/tcp.h b/src/include/win32/netinet/tcp.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/include/win32/netinet/tcp.h
diff --git a/src/include/win32/poll.h b/src/include/win32/poll.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/poll.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/sys/errno.h b/src/include/win32/sys/errno.h
new file mode 100644
index 000000000..339f4fc10
--- /dev/null
+++ b/src/include/win32/sys/errno.h
@@ -0,0 +1 @@
+#include <errno.h>
diff --git a/src/include/win32/sys/select.h b/src/include/win32/sys/select.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/include/win32/sys/select.h
diff --git a/src/include/win32/sys/socket.h b/src/include/win32/sys/socket.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/sys/socket.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/sys/statvfs.h b/src/include/win32/sys/statvfs.h
new file mode 100644
index 000000000..73a892b88
--- /dev/null
+++ b/src/include/win32/sys/statvfs.h
@@ -0,0 +1,36 @@
+#ifndef _SYS_STATVFS_H
+#define _SYS_STATVFS_H 1
+
+// Unsigned 64-bit counter types for filesystem file and block counts.
+typedef unsigned __int64 fsfilcnt64_t;
+typedef unsigned __int64 fsblkcnt64_t;
+typedef unsigned __int64 fsblkcnt_t;
+
+// Filesystem statistics record.  The member names follow the POSIX
+// `struct statvfs`; block and file counters use the 64-bit types above.
+struct statvfs
+{
+ unsigned long int f_bsize;
+ unsigned long int f_frsize;
+ fsblkcnt64_t f_blocks;
+ fsblkcnt64_t f_bfree;
+ fsblkcnt64_t f_bavail;
+ fsfilcnt64_t f_files;
+ fsfilcnt64_t f_ffree;
+ fsfilcnt64_t f_favail;
+ unsigned long int f_fsid;
+ unsigned long int f_flag;
+ unsigned long int f_namemax;
+ int __f_spare[6];
+};
+// Record-lock description.  The member names follow the POSIX
+// `struct flock`; l_type is expected to hold one of the F_*LCK values
+// defined below.
+struct flock {
+ short l_type;
+ short l_whence;
+ off_t l_start;
+ off_t l_len;
+ pid_t l_pid;
+};
+
+// Lock types for flock::l_type and the F_SETLK command number.
+// NOTE(review): the values look like the Linux ones -- confirm they must
+// match what Ceph uses.
+#define F_RDLCK 0
+#define F_WRLCK 1
+#define F_UNLCK 2
+#define F_SETLK 6
+
+#endif /* _SYS_STATVFS_H */
diff --git a/src/include/win32/sys/uio.h b/src/include/win32/sys/uio.h
new file mode 100644
index 000000000..15e95be7f
--- /dev/null
+++ b/src/include/win32/sys/uio.h
@@ -0,0 +1 @@
+#include "include/compat.h"
diff --git a/src/include/win32/sys/un.h b/src/include/win32/sys/un.h
new file mode 100644
index 000000000..d08940b2c
--- /dev/null
+++ b/src/include/win32/sys/un.h
@@ -0,0 +1 @@
+#include "include/win32/winsock_compat.h"
diff --git a/src/include/win32/syslog.h b/src/include/win32/syslog.h
new file mode 100644
index 000000000..28389e0b9
--- /dev/null
+++ b/src/include/win32/syslog.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2013, 2015 Cloudbase Solutions Srl
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef SYSLOG_H
+#define SYSLOG_H 1
+
+/* Priority levels: the low three bits of a priority value (see LOG_PRIMASK). */
+#define LOG_EMERG 0 /* system is unusable */
+#define LOG_ALERT 1 /* action must be taken immediately */
+#define LOG_CRIT 2 /* critical conditions */
+#define LOG_ERR 3 /* error conditions */
+#define LOG_WARNING 4 /* warning conditions */
+#define LOG_NOTICE 5 /* normal but significant condition */
+#define LOG_INFO 6 /* informational */
+#define LOG_DEBUG 7 /* debug-level messages */
+
+/* Facility codes: shifted left by 3 so they sit above the priority bits. */
+#define LOG_KERN (0<<3) /* kernel messages */
+#define LOG_USER (1<<3) /* user-level messages */
+#define LOG_MAIL (2<<3) /* mail system */
+#define LOG_DAEMON (3<<3) /* system daemons */
+#define LOG_AUTH (4<<3) /* security/authorization messages */
+#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */
+#define LOG_LPR (6<<3) /* line printer subsystem */
+#define LOG_NEWS (7<<3) /* network news subsystem */
+#define LOG_UUCP (8<<3) /* UUCP subsystem */
+#define LOG_CRON (9<<3) /* clock daemon */
+#define LOG_AUTHPRIV (10<<3) /* security/authorization messages */
+#define LOG_FTP (11<<3) /* FTP daemon */
+
+/* Facility codes reserved for local use. */
+#define LOG_LOCAL0 (16<<3) /* reserved for local use */
+#define LOG_LOCAL1 (17<<3) /* reserved for local use */
+#define LOG_LOCAL2 (18<<3) /* reserved for local use */
+#define LOG_LOCAL3 (19<<3) /* reserved for local use */
+#define LOG_LOCAL4 (20<<3) /* reserved for local use */
+#define LOG_LOCAL5 (21<<3) /* reserved for local use */
+#define LOG_LOCAL6 (22<<3) /* reserved for local use */
+#define LOG_LOCAL7 (23<<3) /* reserved for local use */
+
+#define LOG_PRIMASK 0x07 /* mask to extract priority part (internal) */
+ /* extract priority */
+#define LOG_PRI(p) ((p) & LOG_PRIMASK)
+
+
+/* Stub: accepts the usual openlog() arguments and does nothing with them. */
+static inline void
+openlog(const char *ident, int option, int facility)
+{
+  (void)ident;
+  (void)option;
+  (void)facility;
+}
+
+/*
+ * printf-style logging entry point taking a priority and a format string.
+ * Declaration only; the definition is not part of this header.
+ */
+void
+syslog(int priority, const char *format, ...);
+
+#endif /* syslog.h */
diff --git a/src/include/win32/win32_errno.h b/src/include/win32/win32_errno.h
new file mode 100644
index 000000000..c842b250f
--- /dev/null
+++ b/src/include/win32/win32_errno.h
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// We're going to preserve the error numbers defined by the Windows SDK but not
+// by Mingw headers. For others, we're going to use numbers starting at 256 to
+// avoid unintended overlaps.
+
+#ifndef WIN32_ERRNO_H
+#define WIN32_ERRNO_H 1
+
+#include <errno.h>
+
+#include "include/int_types.h"
+
+#ifndef EBADMSG
+#define EBADMSG 104
+#endif
+
+#ifndef ENODATA
+#define ENODATA 120
+#endif
+
+#ifndef ENOLINK
+#define ENOLINK 121
+#endif
+
+#ifndef ENOMSG
+#define ENOMSG 122
+#endif
+
+#ifndef ENOTRECOVERABLE
+#define ENOTRECOVERABLE 127
+#endif
+
+#ifndef ETIME
+#define ETIME 137
+#endif
+
+#ifndef ETXTBSY
+#define ETXTBSY 139
+#endif
+
+#ifndef ENODATA
+#define ENODATA 120
+#endif
+
+// Error numbers from 256 upwards (see the note at the top of this header).
+// EBADE is defined only when the toolchain headers did not already provide
+// it; the others are defined unconditionally.
+#define ESTALE 256
+#define EREMOTEIO 257
+
+#ifndef EBADE
+#define EBADE 258
+#endif
+
+#define EUCLEAN 259
+#define EREMCHG 260
+#define EKEYREJECTED 261
+#define EREMOTE 262
+
+// Not used at the moment. Full coverage ensures that remote errors will be
+// converted and handled properly.
+// The values continue the sequence above; codes wrapped in #ifndef are
+// defined only when the toolchain headers did not already provide them.
+#define EADV 263
+#define EBADFD 264
+#define EBADR 265
+#define EBADRQC 266
+#define EBADSLT 267
+#define EBFONT 268
+#define ECHRNG 269
+#define ECOMM 270
+#define EDOTDOT 271
+#define EHOSTDOWN 272
+#define EHWPOISON 273
+// Defined by Boost.
+#ifndef EIDRM
+#define EIDRM 274
+#endif
+#define EISNAM 275
+#define EKEYEXPIRED 276
+#define EKEYREVOKED 277
+#define EL2HLT 278
+#define EL2NSYNC 279
+#define EL3HLT 280
+#define EL3RST 281
+#define ELIBACC 282
+#define ELIBBAD 283
+#define ELIBEXEC 284
+#define ELIBMAX 285
+#define ELIBSCN 286
+#define ELNRNG 287
+#define EMEDIUMTYPE 288
+#define EMULTIHOP 289
+#define ENAVAIL 290
+#define ENOANO 291
+#define ENOCSI 292
+#define ENOKEY 293
+#define ENOMEDIUM 294
+#define ENONET 295
+#define ENOPKG 296
+#ifndef ENOSR
+#define ENOSR 297
+#endif
+#ifndef ENOSTR
+#define ENOSTR 298
+#endif
+#define ENOTNAM 299
+#define ENOTUNIQ 300
+#define EPFNOSUPPORT 301
+#define ERFKILL 302
+#define ESOCKTNOSUPPORT 303
+#define ESRMNT 304
+#define ESTRPIPE 305
+#define ETOOMANYREFS 306
+#define EUNATCH 307
+#define EUSERS 308
+#define EXFULL 309
+#define ENOTBLK 310
+
+#ifndef EDQUOT
+#define EDQUOT 311
+#endif
+
+#define ESHUTDOWN 312
+
+// Error-code conversion helpers, declared with C linkage so the header can
+// be used from both C and C++.  Declarations only; the definitions are not
+// part of this header.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// NOTE(review): judging by the names these map a Winsock (WSAE*) error to an
+// errno value and an errno value to an NTSTATUS code -- confirm against the
+// implementation.
+__s32 wsae_to_errno(__s32 r);
+__u32 errno_to_ntstatus(__s32 r);
+// NOTE(review): "ntsatus" looks like a misspelling of "ntstatus"; it is left
+// as is because renaming would change the exported symbol.
+__u32 cephfs_errno_to_ntsatus(int cephfs_errno);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // WIN32_ERRNO_H
diff --git a/src/include/win32/winsock_compat.h b/src/include/win32/winsock_compat.h
new file mode 100644
index 000000000..990cc4823
--- /dev/null
+++ b/src/include/win32/winsock_compat.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2019 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WINSOCK_COMPAT_H
+#define WINSOCK_COMPAT_H 1
+
+#include "winsock_wrapper.h"
+
+// Route poll() to WSAPoll unless `poll` is already defined as a macro.
+#ifndef poll
+#define poll WSAPoll
+#endif
+
+// afunix.h is available starting with Windows SDK 17063. Still, it wasn't
+// picked up by mingw yet, for which reason we're going to define sockaddr_un
+// here.
+// Fallback AF_UNIX definitions, used only when _AFUNIX_ is not defined
+// (i.e. afunix.h was not included).
+#ifndef _AFUNIX_
+#define UNIX_PATH_MAX 108
+
+typedef struct sockaddr_un
+{
+ ADDRESS_FAMILY sun_family; /* AF_UNIX */
+ char sun_path[UNIX_PATH_MAX]; /* pathname */
+} SOCKADDR_UN, *PSOCKADDR_UN;
+
+#define SIO_AF_UNIX_GETPEERPID _WSAIOR(IOC_VENDOR, 256)
+#endif /* _AFUNIX_ */
+
+#endif /* WINSOCK_COMPAT_H */
diff --git a/src/include/win32/winsock_wrapper.h b/src/include/win32/winsock_wrapper.h
new file mode 100644
index 000000000..1bb951a9d
--- /dev/null
+++ b/src/include/win32/winsock_wrapper.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2020 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WINSOCK_WRAPPER_H
+#define WINSOCK_WRAPPER_H 1
+
+#ifdef __cplusplus
+// Boost complains if winsock2.h (or windows.h) is included before asio.hpp.
+#include <boost/asio.hpp>
+#endif
+
+#include <winsock2.h>
+#include <ws2ipdef.h>
+#include <ws2tcpip.h>
+
+#endif /* WINSOCK_WRAPPER_H */
diff --git a/src/include/xlist.h b/src/include/xlist.h
new file mode 100644
index 000000000..733a318a9
--- /dev/null
+++ b/src/include/xlist.h
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XLIST_H
+#define CEPH_XLIST_H
+
+#include <iterator>
+#include <cstdlib>
+#include <ostream>
+
+#include "include/ceph_assert.h"
+
+/*
+ * Doubly-linked list of caller-provided nodes (xlist<T>::item).
+ *
+ * The list only links and unlinks nodes; it never allocates or frees them.
+ * Each item records which list it is currently on, so an item is on at most
+ * one list at a time and pushing it onto a list first removes it from
+ * whatever list it was on.  Both the item destructor and the list destructor
+ * assert that nothing is still linked.
+ */
+template<typename T>
+class xlist {
+public:
+  class item {
+  public:
+    item(T i) : _item(i) {}
+    ~item() {
+      // an item must be unlinked before it is destroyed
+      ceph_assert(!is_on_list());
+    }
+
+    item(const item& other) = delete;
+    item(item&& other) = delete;
+    const item& operator= (const item& right) = delete;
+    item& operator= (item&& right) = delete;
+
+    // List this item is currently linked on, or nullptr.
+    xlist* get_list() { return _list; }
+    bool is_on_list() const { return _list != nullptr; }
+
+    // Unlink from the current list, if any.  Returns whether it was linked.
+    bool remove_myself() {
+      if (_list) {
+        _list->remove(this);
+        ceph_assert(_list == nullptr);
+        return true;
+      }
+      return false;
+    }
+
+    // Move to the head / tail of the list this item is on; asserts that it
+    // is on a list.
+    void move_to_front() {
+      ceph_assert(_list);
+      _list->push_front(this);
+    }
+    void move_to_back() {
+      ceph_assert(_list);
+      _list->push_back(this);
+    }
+
+  private:
+    friend xlist;
+    T _item;
+    item *_prev = nullptr, *_next = nullptr;
+    xlist *_list = nullptr;
+  };
+
+  typedef item* value_type;
+  typedef item* const_reference;
+
+private:
+  item *_front, *_back;
+  size_t _size;
+
+public:
+  // Copies the head/tail pointers and the size only; the items themselves
+  // are not touched, so their back-pointers keep referring to `other`.
+  // NOTE(review): presumably only meant for copying empty lists -- confirm
+  // against the callers.
+  xlist(const xlist& other) {
+    _front = other._front;
+    _back = other._back;
+    _size = other._size;
+  }
+
+  xlist() : _front(nullptr), _back(nullptr), _size(0) {}
+  ~xlist() {
+    // the list must be empty when it is destroyed
+    ceph_assert(_size == 0);
+    ceph_assert(_front == nullptr);
+    ceph_assert(_back == nullptr);
+  }
+
+  size_t size() const {
+    ceph_assert((bool)_front == (bool)_size);
+    return _size;
+  }
+  bool empty() const {
+    ceph_assert((bool)_front == (bool)_size);
+    return _front == nullptr;
+  }
+
+  // Unlink every item.
+  void clear() {
+    while (_front)
+      remove(_front);
+    ceph_assert((bool)_front == (bool)_size);
+  }
+
+  // Link `i` at the head, unlinking it from its current list first.
+  void push_front(item *i) {
+    if (i->_list)
+      i->_list->remove(i);
+
+    i->_list = this;
+    i->_next = _front;
+    i->_prev = nullptr;
+    if (_front)
+      _front->_prev = i;
+    else
+      _back = i;
+    _front = i;
+    _size++;
+  }
+  // Link `i` at the tail, unlinking it from its current list first.
+  void push_back(item *i) {
+    if (i->_list)
+      i->_list->remove(i);
+
+    i->_list = this;
+    i->_next = nullptr;
+    i->_prev = _back;
+    if (_back)
+      _back->_next = i;
+    else
+      _front = i;
+    _back = i;
+    _size++;
+  }
+  // Unlink `i`; asserts that it is on this list.
+  void remove(item *i) {
+    ceph_assert(i->_list == this);
+
+    if (i->_prev)
+      i->_prev->_next = i->_next;
+    else
+      _front = i->_next;
+    if (i->_next)
+      i->_next->_prev = i->_prev;
+    else
+      _back = i->_prev;
+    _size--;
+
+    i->_list = nullptr;
+    i->_next = i->_prev = nullptr;
+    ceph_assert((bool)_front == (bool)_size);
+  }
+
+  // Value stored in the first / last item.  The list must not be empty:
+  // these dereference the head / tail pointer without checking it.
+  T front() { return static_cast<T>(_front->_item); }
+  const T front() const { return static_cast<const T>(_front->_item); }
+
+  T back() { return static_cast<T>(_back->_item); }
+  const T back() const { return static_cast<const T>(_back->_item); }
+
+  void pop_front() {
+    ceph_assert(!empty());
+    remove(_front);
+  }
+  void pop_back() {
+    ceph_assert(!empty());
+    remove(_back);
+  }
+
+  // Forward iterator yielding the stored T values by value.
+  class iterator: std::iterator<std::forward_iterator_tag, T> {
+  private:
+    item *cur;
+  public:
+    iterator(item *i = nullptr) : cur(i) {}
+    T operator*() { return static_cast<T>(cur->_item); }
+    iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur->_list);
+      cur = cur->_next;
+      return *this;
+    }
+    bool end() const { return cur == nullptr; }
+    bool operator==(const iterator& rhs) const {
+      return cur == rhs.cur;
+    }
+    bool operator!=(const iterator& rhs) const {
+      return cur != rhs.cur;
+    }
+  };
+
+  iterator begin() { return iterator(_front); }
+  iterator end() { return iterator(nullptr); }
+
+  // Read-only counterpart of iterator.
+  class const_iterator: std::iterator<std::forward_iterator_tag, T> {
+  private:
+    item *cur;
+  public:
+    const_iterator(item *i = nullptr) : cur(i) {}
+    const T operator*() const { return static_cast<const T>(cur->_item); }
+    const_iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur->_list);
+      cur = cur->_next;
+      return *this;
+    }
+    bool end() const { return cur == nullptr; }
+    // The right-hand side is taken by const reference so that comparing
+    // against a temporary, e.g. `it != list.end()`, compiles.
+    bool operator==(const const_iterator& rhs) const {
+      return cur == rhs.cur;
+    }
+    bool operator!=(const const_iterator& rhs) const {
+      return cur != rhs.cur;
+    }
+  };
+
+  const_iterator begin() const { return const_iterator(_front); }
+  const_iterator end() const { return const_iterator(nullptr); }
+
+  // Print the pointees of all elements, separated by ", ".
+  friend std::ostream &operator<<(std::ostream &oss, const xlist<T> &list) {
+    bool first = true;
+    for (const auto &entry : list) {
+      if (!first) {
+        oss << ", ";
+      }
+      oss << *entry; /* entry should be a pointer */
+      first = false;
+    }
+    return oss;
+  }
+};
+
+
+#endif